sched: mix tasks and groups

author Dhaval Giani <dhaval@linux.vnet.ibm.com>

Sat, 19 Apr 2008 17:44:59 +0000 (19:44 +0200)

committer Ingo Molnar <mingo@elte.hu>

Sat, 19 Apr 2008 17:44:59 +0000 (19:44 +0200)
author Dhaval Giani <dhaval@linux.vnet.ibm.com>
Sat, 19 Apr 2008 17:44:59 +0000 (19:44 +0200)
committer Ingo Molnar <mingo@elte.hu>
Sat, 19 Apr 2008 17:44:59 +0000 (19:44 +0200)
diff --git a/kernel/sched.c b/kernel/sched.c

index 62830ea..1b7399d 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -273,6 +273,7 @@ struct task_group {
         struct list_head list;
  };
  
+#ifdef CONFIG_USER_SCHED
  #ifdef CONFIG_FAIR_GROUP_SCHED
  /* Default task group's sched entity on each cpu */
  static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
@@ -284,6 +285,7 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
  static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
  static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
  #endif
+#endif
  
  /* task_group_lock serializes add/remove of task groups and also changes to
   * a task group's cpu shares.
@@ -7447,6 +7449,10 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
                 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
  
         tg->se[cpu] = se;
+       /* se could be NULL for init_task_group */
+       if (!se)
+               return;
+
         se->cfs_rq = &rq->cfs;
         se->my_q = cfs_rq;
         se->load.weight = tg->shares;
@@ -7469,6 +7475,9 @@ static void init_tg_rt_entry(struct rq *rq, struct task_group *tg,
                 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
  
         tg->rt_se[cpu] = rt_se;
+       if (!rt_se)
+               return;
+
         rt_se->rt_rq = &rq->rt;
         rt_se->my_q = rt_rq;
         rt_se->parent = NULL;
@@ -7539,18 +7548,56 @@ void __init sched_init(void)
  #ifdef CONFIG_FAIR_GROUP_SCHED
                 init_task_group.shares = init_task_group_load;
                 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
+#ifdef CONFIG_CGROUP_SCHED
+               /*
+                * How much cpu bandwidth does init_task_group get?
+                *
+                * In case of task-groups formed thr' the cgroup filesystem, it
+                * gets 100% of the cpu resources in the system. This overall
+                * system cpu resource is divided among the tasks of
+                * init_task_group and its child task-groups in a fair manner,
+                * based on each entity's (task or task-group's) weight
+                * (se->load.weight).
+                *
+                * In other words, if init_task_group has 10 tasks (of weight
+                * 1024) and two child groups A0 and A1 (of weight 1024 each),
+                * then A0's share of the cpu resource is:
+                *
+                *      A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
+                *
+                * We achieve this by letting init_task_group's tasks sit
+                * directly in rq->cfs (i.e init_task_group->se[] = NULL).
+                */
+               init_tg_cfs_entry(rq, &init_task_group, &rq->cfs, NULL, i, 1);
+#elif defined CONFIG_USER_SCHED
+               /*
+                * In case of task-groups formed thr' the user id of tasks,
+                * init_task_group represents tasks belonging to root user.
+                * Hence it forms a sibling of all subsequent groups formed.
+                * In this case, init_task_group gets only a fraction of overall
+                * system cpu resource, based on the weight assigned to root
+                * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
+                * by letting tasks of init_task_group sit in a separate cfs_rq
+                * (init_cfs_rq) and having one entity represent this group of
+                * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
+                */
                 init_tg_cfs_entry(rq, &init_task_group,
                                 &per_cpu(init_cfs_rq, i),
                                 &per_cpu(init_sched_entity, i), i, 1);
  
  #endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+               rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
  #ifdef CONFIG_RT_GROUP_SCHED
                 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
+#ifdef CONFIG_CGROUP_SCHED
+               init_tg_rt_entry(rq, &init_task_group, &rq->rt, NULL, i, 1);
+#elif defined CONFIG_USER_SCHED
                 init_tg_rt_entry(rq, &init_task_group,
                                 &per_cpu(init_rt_rq, i),
                                 &per_cpu(init_sched_rt_entity, i), i, 1);
-#else
-               rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
+#endif
  #endif
  
                 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c

index 022e036..3dde0f0 100644 (file)
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1133,6 +1133,17 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
         return 0;
  }
  
+/* Return se's depth in the hierarchy: one per for_each_sched_entity() step. */
+static inline int depth_se(struct sched_entity *se)
+{
+       int depth = 0;
+
+       for_each_sched_entity(se)
+               depth++;
+
+       return depth;
+}
+
  /*
   * Preempt the current task with a newly woken task if needed:
   */
@@ -1141,6 +1152,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
         struct task_struct *curr = rq->curr;
         struct cfs_rq *cfs_rq = task_cfs_rq(curr);
         struct sched_entity *se = &curr->se, *pse = &p->se;
+       int se_depth, pse_depth;
  
         if (unlikely(rt_prio(p->prio))) {
                 update_rq_clock(rq);
@@ -1165,6 +1177,27 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
         if (!sched_feat(WAKEUP_PREEMPT))
                 return;
  
+       /*
+        * A preemption test can be made between sibling entities that are in
+        * the same cfs_rq, i.e. that have a common parent. Walk up the
+        * hierarchy of both tasks until we find their ancestors that are
+        * siblings under a common parent.
+        */
+
+       /* First walk up until both entities are at same depth */
+       se_depth = depth_se(se);
+       pse_depth = depth_se(pse);
+
+       while (se_depth > pse_depth) {
+               se_depth--;
+               se = parent_entity(se);
+       }
+
+       while (pse_depth > se_depth) {
+               pse_depth--;
+               pse = parent_entity(pse);
+       }
+
         while (!is_same_group(se, pse)) {
                 se = parent_entity(se);
                 pse = parent_entity(pse);
@@ -1223,13 +1256,22 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
  static struct task_struct *
  __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
  {
-       struct task_struct *p;
+       struct task_struct *p = NULL;
+       struct sched_entity *se;
  
         if (!curr)
                 return NULL;
  
-       p = rb_entry(curr, struct task_struct, se.run_node);
-       cfs_rq->rb_load_balance_curr = rb_next(curr);
+       /* Skip over entities that are not tasks */
+       do {
+               se = rb_entry(curr, struct sched_entity, run_node);
+               curr = rb_next(curr);
+       } while (curr && !entity_is_task(se));
+
+       cfs_rq->rb_load_balance_curr = curr;
+
+       if (entity_is_task(se))
+               p = task_of(se);
  
         return p;
  }
@@ -1489,9 +1531,6 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
  {
         struct cfs_rq *cfs_rq;
  
-#ifdef CONFIG_FAIR_GROUP_SCHED
-       print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs);
-#endif
         rcu_read_lock();
         for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
                 print_cfs_rq(m, cpu, cfs_rq);
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c

index 8ff8245..201a693 100644 (file)
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -374,11 +374,15 @@ static void update_curr_rt(struct rq *rq)
         curr->se.exec_start = rq->clock;
         cpuacct_charge(curr, delta_exec);
  
-       spin_lock(&rt_rq->rt_runtime_lock);
-       rt_rq->rt_time += delta_exec;
-       if (sched_rt_runtime_exceeded(rt_rq))
-               resched_task(curr);
-       spin_unlock(&rt_rq->rt_runtime_lock);
+       for_each_sched_rt_entity(rt_se) {
+               rt_rq = rt_rq_of_se(rt_se);
+
+               spin_lock(&rt_rq->rt_runtime_lock);
+               rt_rq->rt_time += delta_exec;
+               if (sched_rt_runtime_exceeded(rt_rq))
+                       resched_task(curr);
+               spin_unlock(&rt_rq->rt_runtime_lock);
+       }
  }
  
  static inline
@@ -477,7 +481,6 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
   * entries, we must remove entries top - down.
   *
   * XXX: O(1/2 h^2) because we can only walk up, not down the chain.
- *      doesn't matter much for now, as h=2 for GROUP_SCHED.
   */
  static void dequeue_rt_stack(struct task_struct *p)
  {
author	Dhaval Giani <dhaval@linux.vnet.ibm.com>
	Sat, 19 Apr 2008 17:44:59 +0000 (19:44 +0200)
committer	Ingo Molnar <mingo@elte.hu>
	Sat, 19 Apr 2008 17:44:59 +0000 (19:44 +0200)
kernel/sched.c		patch \| blob \| history
kernel/sched_fair.c		patch \| blob \| history
kernel/sched_rt.c		patch \| blob \| history