Merge branch 'tracing-v28-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...

[safe/jmp/linux-2.6] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index 927c930..d906f72 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -71,6 +71,7 @@
  #include <linux/debugfs.h>
  #include <linux/ctype.h>
  #include <linux/ftrace.h>
+#include <trace/sched.h>
  
  #include <asm/tlb.h>
  #include <asm/irq_regs.h>
@@ -201,14 +202,19 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
         hrtimer_init(&rt_b->rt_period_timer,
                         CLOCK_MONOTONIC, HRTIMER_MODE_REL);
         rt_b->rt_period_timer.function = sched_rt_period_timer;
-       rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+       rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
+}
+
+static inline int rt_bandwidth_enabled(void)
+{
+       return sysctl_sched_rt_runtime >= 0;
  }
  
  static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
  {
         ktime_t now;
  
-       if (rt_b->rt_runtime == RUNTIME_INF)
+       if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
                 return;
  
         if (hrtimer_active(&rt_b->rt_period_timer))
@@ -298,9 +304,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
  static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
  static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
  #endif /* CONFIG_RT_GROUP_SCHED */
-#else /* !CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_USER_SCHED */
  #define root_task_group init_task_group
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* CONFIG_USER_SCHED */
  
  /* task_group_lock serializes add/remove of task groups and also changes to
   * a task group's cpu shares.
@@ -1087,7 +1093,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
         return NOTIFY_DONE;
  }
  
-static void init_hrtick(void)
+static __init void init_hrtick(void)
  {
         hotcpu_notifier(hotplug_hrtick, 0);
  }
@@ -1119,7 +1125,7 @@ static void init_rq_hrtick(struct rq *rq)
  
         hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
         rq->hrtick_timer.function = hrtick;
-       rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+       rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
  }
  #else  /* CONFIG_SCHED_HRTICK */
  static inline void hrtick_clear(struct rq *rq)
@@ -1380,38 +1386,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
         update_load_sub(&rq->load, load);
  }
  
-#ifdef CONFIG_SMP
-static unsigned long source_load(int cpu, int type);
-static unsigned long target_load(int cpu, int type);
-static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-
-static unsigned long cpu_avg_load_per_task(int cpu)
-{
-       struct rq *rq = cpu_rq(cpu);
-
-       if (rq->nr_running)
-               rq->avg_load_per_task = rq->load.weight / rq->nr_running;
-
-       return rq->avg_load_per_task;
-}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
+#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
+typedef int (*tg_visitor)(struct task_group *, void *);
  
  /*
   * Iterate the full tree, calling @down when first entering a node and @up when
   * leaving it for the final time.
   */
-static void
-walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
+static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
  {
         struct task_group *parent, *child;
+       int ret;
  
         rcu_read_lock();
         parent = &root_task_group;
  down:
-       (*down)(parent, cpu, sd);
+       ret = (*down)(parent, data);
+       if (ret)
+               goto out_unlock;
         list_for_each_entry_rcu(child, &parent->children, siblings) {
                 parent = child;
                 goto down;
@@ -1419,15 +1411,43 @@ down:
  up:
                 continue;
         }
-       (*up)(parent, cpu, sd);
+       ret = (*up)(parent, data);
+       if (ret)
+               goto out_unlock;
  
         child = parent;
         parent = parent->parent;
         if (parent)
                 goto up;
+out_unlock:
         rcu_read_unlock();
+
+       return ret;
  }
  
+static int tg_nop(struct task_group *tg, void *data)
+{
+       return 0;
+}
+#endif
+
+#ifdef CONFIG_SMP
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+
+static unsigned long cpu_avg_load_per_task(int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+
+       if (rq->nr_running)
+               rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+
+       return rq->avg_load_per_task;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
  static void __set_se_shares(struct sched_entity *se, unsigned long shares);
  
  /*
@@ -1486,11 +1506,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
   * This needs to be done in a bottom-up fashion because the rq weight of a
   * parent group depends on the shares of its child groups.
   */
-static void
-tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
+static int tg_shares_up(struct task_group *tg, void *data)
  {
         unsigned long rq_weight = 0;
         unsigned long shares = 0;
+       struct sched_domain *sd = data;
         int i;
  
         for_each_cpu_mask(i, sd->span) {
@@ -1515,6 +1535,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
                 __update_group_shares_cpu(tg, i, shares, rq_weight);
                 spin_unlock_irqrestore(&rq->lock, flags);
         }
+
+       return 0;
  }
  
  /*
@@ -1522,10 +1544,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
   * This needs to be done in a top-down fashion because the load of a child
   * group is a fraction of its parents load.
   */
-static void
-tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+static int tg_load_down(struct task_group *tg, void *data)
  {
         unsigned long load;
+       long cpu = (long)data;
  
         if (!tg->parent) {
                 load = cpu_rq(cpu)->load.weight;
@@ -1536,11 +1558,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
         }
  
         tg->cfs_rq[cpu]->h_load = load;
-}
  
-static void
-tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
+       return 0;
  }
  
  static void update_shares(struct sched_domain *sd)
@@ -1550,7 +1569,7 @@ static void update_shares(struct sched_domain *sd)
  
         if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
                 sd->last_update = now;
-               walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
+               walk_tg_tree(tg_nop, tg_shares_up, sd);
         }
  }
  
@@ -1561,9 +1580,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
         spin_lock(&rq->lock);
  }
  
-static void update_h_load(int cpu)
+static void update_h_load(long cpu)
  {
-       walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
+       walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
  }
  
  #else
@@ -1918,6 +1937,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
                  * just go back and repeat.
                  */
                 rq = task_rq_lock(p, &flags);
+               trace_sched_wait_task(rq, p);
                 running = task_running(rq, p);
                 on_rq = p->se.on_rq;
                 ncsw = 0;
@@ -2279,9 +2299,7 @@ out_activate:
         success = 1;
  
  out_running:
-       trace_mark(kernel_sched_wakeup,
-               "pid %d state %ld ## rq %p task %p rq->curr %p",
-               p->pid, p->state, rq, p, rq->curr);
+       trace_sched_wakeup(rq, p);
         check_preempt_curr(rq, p, sync);
  
         p->state = TASK_RUNNING;
@@ -2414,9 +2432,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
                 p->sched_class->task_new(rq, p);
                 inc_nr_running(rq);
         }
-       trace_mark(kernel_sched_wakeup_new,
-               "pid %d state %ld ## rq %p task %p rq->curr %p",
-               p->pid, p->state, rq, p, rq->curr);
+       trace_sched_wakeup_new(rq, p);
         check_preempt_curr(rq, p, 0);
  #ifdef CONFIG_SMP
         if (p->sched_class->task_wake_up)
@@ -2589,11 +2605,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
         struct mm_struct *mm, *oldmm;
  
         prepare_task_switch(rq, prev, next);
-       trace_mark(kernel_sched_schedule,
-               "prev_pid %d next_pid %d prev_state %ld "
-               "## rq %p prev %p next %p",
-               prev->pid, next->pid, prev->state,
-               rq, prev, next);
+       trace_sched_switch(rq, prev, next);
         mm = next->mm;
         oldmm = prev->active_mm;
         /*
@@ -2833,6 +2845,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
             || unlikely(!cpu_active(dest_cpu)))
                 goto out;
  
+       trace_sched_migrate_task(rq, p, dest_cpu);
         /* force the process onto the specified CPU */
         if (migrate_task(p, dest_cpu, &req)) {
                 /* Need to wait for migration thread (might exit: take ref). */
@@ -4034,23 +4047,26 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
  EXPORT_PER_CPU_SYMBOL(kstat);
  
  /*
- * Return p->sum_exec_runtime plus any more ns on the sched_clock
- * that have not yet been banked in case the task is currently running.
+ * Return any ns on the sched_clock that have not yet been banked in
+ * @p in case that task is currently running.
   */
-unsigned long long task_sched_runtime(struct task_struct *p)
+unsigned long long task_delta_exec(struct task_struct *p)
  {
         unsigned long flags;
-       u64 ns, delta_exec;
         struct rq *rq;
+       u64 ns = 0;
  
         rq = task_rq_lock(p, &flags);
-       ns = p->se.sum_exec_runtime;
+
         if (task_current(rq, p)) {
+               u64 delta_exec;
+
                 update_rq_clock(rq);
                 delta_exec = rq->clock - p->se.exec_start;
                 if ((s64)delta_exec > 0)
-                       ns += delta_exec;
+                       ns = delta_exec;
         }
+
         task_rq_unlock(rq, &flags);
  
         return ns;
@@ -4067,6 +4083,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
         cputime64_t tmp;
  
         p->utime = cputime_add(p->utime, cputime);
+       account_group_user_time(p, cputime);
  
         /* Add user time to cpustat. */
         tmp = cputime_to_cputime64(cputime);
@@ -4091,6 +4108,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime)
         tmp = cputime_to_cputime64(cputime);
  
         p->utime = cputime_add(p->utime, cputime);
+       account_group_user_time(p, cputime);
         p->gtime = cputime_add(p->gtime, cputime);
  
         cpustat->user = cputime64_add(cpustat->user, tmp);
@@ -4126,6 +4144,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
         }
  
         p->stime = cputime_add(p->stime, cputime);
+       account_group_system_time(p, cputime);
  
         /* Add system time to cpustat. */
         tmp = cputime_to_cputime64(cputime);
@@ -4167,6 +4186,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
  
         if (p == rq->idle) {
                 p->stime = cputime_add(p->stime, steal);
+               account_group_system_time(p, steal);
                 if (atomic_read(&rq->nr_iowait) > 0)
                         cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
                 else
@@ -5171,7 +5191,8 @@ recheck:
                  * Do not allow realtime tasks into groups that have no runtime
                  * assigned.
                  */
-               if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
+               if (rt_bandwidth_enabled() && rt_policy(policy) &&
+                               task_group(p)->rt_bandwidth.rt_runtime == 0)
                         return -EPERM;
  #endif
  
@@ -6332,7 +6353,7 @@ set_table_entry(struct ctl_table *entry,
  static struct ctl_table *
  sd_alloc_ctl_domain_table(struct sched_domain *sd)
  {
-       struct ctl_table *table = sd_alloc_ctl_entry(12);
+       struct ctl_table *table = sd_alloc_ctl_entry(13);
  
         if (table == NULL)
                 return NULL;
@@ -6360,7 +6381,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
                 sizeof(int), 0644, proc_dointvec_minmax);
         set_table_entry(&table[10], "flags", &sd->flags,
                 sizeof(int), 0644, proc_dointvec_minmax);
-       /* &table[11] is terminator */
+       set_table_entry(&table[11], "name", sd->name,
+               CORENAME_MAX_SIZE, 0444, proc_dostring);
+       /* &table[12] is terminator */
  
         return table;
  }
@@ -7244,13 +7267,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
   * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
   */
  
+#ifdef CONFIG_SCHED_DEBUG
+# define SD_INIT_NAME(sd, type)                sd->name = #type
+#else
+# define SD_INIT_NAME(sd, type)                do { } while (0)
+#endif
+
  #define        SD_INIT(sd, type)       sd_init_##type(sd)
+
  #define SD_INIT_FUNC(type)     \
  static noinline void sd_init_##type(struct sched_domain *sd)   \
  {                                                              \
         memset(sd, 0, sizeof(*sd));                             \
         *sd = SD_##type##_INIT;                                 \
         sd->level = SD_LV_##type;                               \
+       SD_INIT_NAME(sd, type);                                 \
  }
  
  SD_INIT_FUNC(CPU)
@@ -8808,73 +8839,95 @@ static DEFINE_MUTEX(rt_constraints_mutex);
  static unsigned long to_ratio(u64 period, u64 runtime)
  {
         if (runtime == RUNTIME_INF)
-               return 1ULL << 16;
+               return 1ULL << 20;
  
-       return div64_u64(runtime << 16, period);
+       return div64_u64(runtime << 20, period);
  }
  
-#ifdef CONFIG_CGROUP_SCHED
-static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+/* Must be called with tasklist_lock held */
+static inline int tg_has_rt_tasks(struct task_group *tg)
+{
+       struct task_struct *g, *p;
+
+       do_each_thread(g, p) {
+               if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
+                       return 1;
+       } while_each_thread(g, p);
+
+       return 0;
+}
+
+struct rt_schedulable_data {
+       struct task_group *tg;
+       u64 rt_period;
+       u64 rt_runtime;
+};
+
+static int tg_schedulable(struct task_group *tg, void *data)
  {
-       struct task_group *tgi, *parent = tg->parent;
-       unsigned long total = 0;
+       struct rt_schedulable_data *d = data;
+       struct task_group *child;
+       unsigned long total, sum = 0;
+       u64 period, runtime;
  
-       if (!parent) {
-               if (global_rt_period() < period)
-                       return 0;
+       period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+       runtime = tg->rt_bandwidth.rt_runtime;
  
-               return to_ratio(period, runtime) <
-                       to_ratio(global_rt_period(), global_rt_runtime());
+       if (tg == d->tg) {
+               period = d->rt_period;
+               runtime = d->rt_runtime;
         }
  
-       if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
-               return 0;
+       /*
+        * Cannot have more runtime than the period.
+        */
+       if (runtime > period && runtime != RUNTIME_INF)
+               return -EINVAL;
  
-       rcu_read_lock();
-       list_for_each_entry_rcu(tgi, &parent->children, siblings) {
-               if (tgi == tg)
-                       continue;
+       /*
+        * Ensure we don't starve existing RT tasks.
+        */
+       if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+               return -EBUSY;
  
-               total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-                               tgi->rt_bandwidth.rt_runtime);
-       }
-       rcu_read_unlock();
+       total = to_ratio(period, runtime);
  
-       return total + to_ratio(period, runtime) <=
-               to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
-                               parent->rt_bandwidth.rt_runtime);
-}
-#elif defined CONFIG_USER_SCHED
-static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
-{
-       struct task_group *tgi;
-       unsigned long total = 0;
-       unsigned long global_ratio =
-               to_ratio(global_rt_period(), global_rt_runtime());
+       /*
+        * Nobody can have more than the global setting allows.
+        */
+       if (total > to_ratio(global_rt_period(), global_rt_runtime()))
+               return -EINVAL;
  
-       rcu_read_lock();
-       list_for_each_entry_rcu(tgi, &task_groups, list) {
-               if (tgi == tg)
-                       continue;
+       /*
+        * The sum of our children's runtime should not exceed our own.
+        */
+       list_for_each_entry_rcu(child, &tg->children, siblings) {
+               period = ktime_to_ns(child->rt_bandwidth.rt_period);
+               runtime = child->rt_bandwidth.rt_runtime;
  
-               total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-                               tgi->rt_bandwidth.rt_runtime);
+               if (child == d->tg) {
+                       period = d->rt_period;
+                       runtime = d->rt_runtime;
+               }
+
+               sum += to_ratio(period, runtime);
         }
-       rcu_read_unlock();
  
-       return total + to_ratio(period, runtime) < global_ratio;
+       if (sum > total)
+               return -EINVAL;
+
+       return 0;
  }
-#endif
  
-/* Must be called with tasklist_lock held */
-static inline int tg_has_rt_tasks(struct task_group *tg)
+static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
  {
-       struct task_struct *g, *p;
-       do_each_thread(g, p) {
-               if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
-                       return 1;
-       } while_each_thread(g, p);
-       return 0;
+       struct rt_schedulable_data data = {
+               .tg = tg,
+               .rt_period = period,
+               .rt_runtime = runtime,
+       };
+
+       return walk_tg_tree(tg_schedulable, tg_nop, &data);
  }
  
  static int tg_set_bandwidth(struct task_group *tg,
@@ -8884,14 +8937,9 @@ static int tg_set_bandwidth(struct task_group *tg,
  
         mutex_lock(&rt_constraints_mutex);
         read_lock(&tasklist_lock);
-       if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
-               err = -EBUSY;
-               goto unlock;
-       }
-       if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
-               err = -EINVAL;
+       err = __rt_schedulable(tg, rt_period, rt_runtime);
+       if (err)
                 goto unlock;
-       }
  
         spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
         tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
@@ -8960,16 +9008,25 @@ long sched_group_rt_period(struct task_group *tg)
  
  static int sched_rt_global_constraints(void)
  {
-       struct task_group *tg = &root_task_group;
-       u64 rt_runtime, rt_period;
+       u64 runtime, period;
         int ret = 0;
  
-       rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
-       rt_runtime = tg->rt_bandwidth.rt_runtime;
+       if (sysctl_sched_rt_period <= 0)
+               return -EINVAL;
+
+       runtime = global_rt_runtime();
+       period = global_rt_period();
+
+       /*
+        * Sanity check on the sysctl variables.
+        */
+       if (runtime > period && runtime != RUNTIME_INF)
+               return -EINVAL;
  
         mutex_lock(&rt_constraints_mutex);
-       if (!__rt_schedulable(tg, rt_period, rt_runtime))
-               ret = -EINVAL;
+       read_lock(&tasklist_lock);
+       ret = __rt_schedulable(NULL, 0, 0);
+       read_unlock(&tasklist_lock);
         mutex_unlock(&rt_constraints_mutex);
  
         return ret;
@@ -8980,6 +9037,9 @@ static int sched_rt_global_constraints(void)
         unsigned long flags;
         int i;
  
+       if (sysctl_sched_rt_period <= 0)
+               return -EINVAL;
+
         spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
         for_each_possible_cpu(i) {
                 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
@@ -9040,7 +9100,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
  
         if (!cgrp->parent) {
                 /* This is early initialization for the top cgroup */
-               init_task_group.css.cgroup = cgrp;
                 return &init_task_group.css;
         }
  
@@ -9049,9 +9108,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
         if (IS_ERR(tg))
                 return ERR_PTR(-ENOMEM);
  
-       /* Bind the cgroup to task_group object we just created */
-       tg->css.cgroup = cgrp;
-
         return &tg->css;
  }