tracing/ftrace: use preempt_enable_no_resched_notrace in ring_buffer_time_stamp()

[safe/jmp/linux-2.6] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index e1299de..d377097 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,6 +55,7 @@
  #include <linux/cpuset.h>
  #include <linux/percpu.h>
  #include <linux/kthread.h>
+#include <linux/proc_fs.h>
  #include <linux/seq_file.h>
  #include <linux/sysctl.h>
  #include <linux/syscalls.h>
@@ -71,6 +72,7 @@
  #include <linux/debugfs.h>
  #include <linux/ctype.h>
  #include <linux/ftrace.h>
+#include <trace/sched.h>
  
  #include <asm/tlb.h>
  #include <asm/irq_regs.h>
@@ -116,6 +118,12 @@
   */
  #define RUNTIME_INF    ((u64)~0ULL)
  
+DEFINE_TRACE(sched_wait_task);
+DEFINE_TRACE(sched_wakeup);
+DEFINE_TRACE(sched_wakeup_new);
+DEFINE_TRACE(sched_switch);
+DEFINE_TRACE(sched_migrate_task);
+
  #ifdef CONFIG_SMP
  /*
   * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
@@ -201,7 +209,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
         hrtimer_init(&rt_b->rt_period_timer,
                         CLOCK_MONOTONIC, HRTIMER_MODE_REL);
         rt_b->rt_period_timer.function = sched_rt_period_timer;
-       rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+       rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
  }
  
  static inline int rt_bandwidth_enabled(void)
@@ -226,9 +234,8 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
  
                 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
                 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
-               hrtimer_start(&rt_b->rt_period_timer,
-                             rt_b->rt_period_timer.expires,
-                             HRTIMER_MODE_ABS);
+               hrtimer_start_expires(&rt_b->rt_period_timer,
+                               HRTIMER_MODE_ABS);
         }
         spin_unlock(&rt_b->rt_runtime_lock);
  }
@@ -385,7 +392,6 @@ struct cfs_rq {
  
         u64 exec_clock;
         u64 min_vruntime;
-       u64 pair_start;
  
         struct rb_root tasks_timeline;
         struct rb_node *rb_leftmost;
@@ -397,9 +403,9 @@ struct cfs_rq {
          * 'curr' points to currently running entity on this cfs_rq.
          * It is set to NULL otherwise (i.e when none are currently running).
          */
-       struct sched_entity *curr, *next;
+       struct sched_entity *curr, *next, *last;
  
-       unsigned long nr_spread_over;
+       unsigned int nr_spread_over;
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
         struct rq *rq;  /* cpu runqueue to which this cfs_rq is attached */
@@ -818,6 +824,13 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
  unsigned int sysctl_sched_shares_ratelimit = 250000;
  
  /*
+ * Inject some fuzziness into changing the per-cpu group shares;
+ * this avoids remote rq-locks at the expense of fairness.
+ * default: 4
+ */
+unsigned int sysctl_sched_shares_thresh = 4;
+
+/*
   * period over which we measure -rt task cpu usage in us.
   * default: 1s
   */
@@ -962,6 +975,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
         }
  }
  
+void task_rq_unlock_wait(struct task_struct *p)
+{
+       struct rq *rq = task_rq(p);
+
+       smp_mb(); /* spin-unlock-wait is not a full memory barrier */
+       spin_unlock_wait(&rq->lock);
+}
+
  static void __task_rq_unlock(struct rq *rq)
         __releases(rq->lock)
  {
@@ -1063,7 +1084,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
         struct hrtimer *timer = &rq->hrtick_timer;
         ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
  
-       timer->expires = time;
+       hrtimer_set_expires(timer, time);
  
         if (rq == this_rq()) {
                 hrtimer_restart(timer);
@@ -1124,7 +1145,7 @@ static void init_rq_hrtick(struct rq *rq)
  
         hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
         rq->hrtick_timer.function = hrtick;
-       rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+       rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
  }
  #else  /* CONFIG_SCHED_HRTICK */
  static inline void hrtick_clear(struct rq *rq)
@@ -1438,9 +1459,12 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
  static unsigned long cpu_avg_load_per_task(int cpu)
  {
         struct rq *rq = cpu_rq(cpu);
+       unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
  
-       if (rq->nr_running)
-               rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+       if (nr_running)
+               rq->avg_load_per_task = rq->load.weight / nr_running;
+       else
+               rq->avg_load_per_task = 0;
  
         return rq->avg_load_per_task;
  }
@@ -1453,8 +1477,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
   * Calculate and set the cpu's group shares.
   */
  static void
-__update_group_shares_cpu(struct task_group *tg, int cpu,
-                         unsigned long sd_shares, unsigned long sd_rq_weight)
+update_group_shares_cpu(struct task_group *tg, int cpu,
+                       unsigned long sd_shares, unsigned long sd_rq_weight)
  {
         int boost = 0;
         unsigned long shares;
@@ -1485,19 +1509,23 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
          *
          */
         shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+       shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
  
-       /*
-        * record the actual number of shares, not the boosted amount.
-        */
-       tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-       tg->cfs_rq[cpu]->rq_weight = rq_weight;
+       if (abs(shares - tg->se[cpu]->load.weight) >
+                       sysctl_sched_shares_thresh) {
+               struct rq *rq = cpu_rq(cpu);
+               unsigned long flags;
  
-       if (shares < MIN_SHARES)
-               shares = MIN_SHARES;
-       else if (shares > MAX_SHARES)
-               shares = MAX_SHARES;
+               spin_lock_irqsave(&rq->lock, flags);
+               /*
+                * record the actual number of shares, not the boosted amount.
+                */
+               tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
+               tg->cfs_rq[cpu]->rq_weight = rq_weight;
  
-       __set_se_shares(tg->se[cpu], shares);
+               __set_se_shares(tg->se[cpu], shares);
+               spin_unlock_irqrestore(&rq->lock, flags);
+       }
  }
  
  /*
@@ -1526,14 +1554,8 @@ static int tg_shares_up(struct task_group *tg, void *data)
         if (!rq_weight)
                 rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
  
-       for_each_cpu_mask(i, sd->span) {
-               struct rq *rq = cpu_rq(i);
-               unsigned long flags;
-
-               spin_lock_irqsave(&rq->lock, flags);
-               __update_group_shares_cpu(tg, i, shares, rq_weight);
-               spin_unlock_irqrestore(&rq->lock, flags);
-       }
+       for_each_cpu_mask(i, sd->span)
+               update_group_shares_cpu(tg, i, shares, rq_weight);
  
         return 0;
  }
@@ -1800,7 +1822,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
         /*
          * Buddy candidates are cache hot:
          */
-       if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next))
+       if (sched_feat(CACHE_HOT_BUDDY) &&
+                       (&p->se == cfs_rq_of(&p->se)->next ||
+                        &p->se == cfs_rq_of(&p->se)->last))
                 return 1;
  
         if (p->sched_class != &fair_sched_class)
@@ -1827,6 +1851,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
  
         clock_offset = old_rq->clock - new_rq->clock;
  
+       trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+
  #ifdef CONFIG_SCHEDSTATS
         if (p->se.wait_start)
                 p->se.wait_start -= clock_offset;
@@ -1936,6 +1962,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
                  * just go back and repeat.
                  */
                 rq = task_rq_lock(p, &flags);
+               trace_sched_wait_task(rq, p);
                 running = task_running(rq, p);
                 on_rq = p->se.on_rq;
                 ncsw = 0;
@@ -2297,9 +2324,7 @@ out_activate:
         success = 1;
  
  out_running:
-       trace_mark(kernel_sched_wakeup,
-               "pid %d state %ld ## rq %p task %p rq->curr %p",
-               p->pid, p->state, rq, p, rq->curr);
+       trace_sched_wakeup(rq, p);
         check_preempt_curr(rq, p, sync);
  
         p->state = TASK_RUNNING;
@@ -2432,9 +2457,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
                 p->sched_class->task_new(rq, p);
                 inc_nr_running(rq);
         }
-       trace_mark(kernel_sched_wakeup_new,
-               "pid %d state %ld ## rq %p task %p rq->curr %p",
-               p->pid, p->state, rq, p, rq->curr);
+       trace_sched_wakeup_new(rq, p);
         check_preempt_curr(rq, p, 0);
  #ifdef CONFIG_SMP
         if (p->sched_class->task_wake_up)
@@ -2607,11 +2630,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
         struct mm_struct *mm, *oldmm;
  
         prepare_task_switch(rq, prev, next);
-       trace_mark(kernel_sched_schedule,
-               "prev_pid %d next_pid %d prev_state %ld "
-               "## rq %p prev %p next %p",
-               prev->pid, next->pid, prev->state,
-               rq, prev, next);
+       trace_sched_switch(rq, prev, next);
         mm = next->mm;
         oldmm = prev->active_mm;
         /*
@@ -3344,7 +3363,7 @@ small_imbalance:
                 } else
                         this_load_per_task = cpu_avg_load_per_task(this_cpu);
  
-               if (max_load - this_load + 2*busiest_load_per_task >=
+               if (max_load - this_load + busiest_load_per_task >=
                                         busiest_load_per_task * imbn) {
                         *imbalance = busiest_load_per_task;
                         return busiest;
@@ -4052,23 +4071,26 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
  EXPORT_PER_CPU_SYMBOL(kstat);
  
  /*
- * Return p->sum_exec_runtime plus any more ns on the sched_clock
- * that have not yet been banked in case the task is currently running.
+ * Return any ns on the sched_clock that have not yet been banked in
+ * @p in case that task is currently running.
   */
-unsigned long long task_sched_runtime(struct task_struct *p)
+unsigned long long task_delta_exec(struct task_struct *p)
  {
         unsigned long flags;
-       u64 ns, delta_exec;
         struct rq *rq;
+       u64 ns = 0;
  
         rq = task_rq_lock(p, &flags);
-       ns = p->se.sum_exec_runtime;
+
         if (task_current(rq, p)) {
+               u64 delta_exec;
+
                 update_rq_clock(rq);
                 delta_exec = rq->clock - p->se.exec_start;
                 if ((s64)delta_exec > 0)
-                       ns += delta_exec;
+                       ns = delta_exec;
         }
+
         task_rq_unlock(rq, &flags);
  
         return ns;
@@ -4085,6 +4107,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
         cputime64_t tmp;
  
         p->utime = cputime_add(p->utime, cputime);
+       account_group_user_time(p, cputime);
  
         /* Add user time to cpustat. */
         tmp = cputime_to_cputime64(cputime);
@@ -4109,6 +4132,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime)
         tmp = cputime_to_cputime64(cputime);
  
         p->utime = cputime_add(p->utime, cputime);
+       account_group_user_time(p, cputime);
         p->gtime = cputime_add(p->gtime, cputime);
  
         cpustat->user = cputime64_add(cpustat->user, tmp);
@@ -4144,6 +4168,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
         }
  
         p->stime = cputime_add(p->stime, cputime);
+       account_group_system_time(p, cputime);
  
         /* Add system time to cpustat. */
         tmp = cputime_to_cputime64(cputime);
@@ -4185,6 +4210,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
  
         if (p == rq->idle) {
                 p->stime = cputime_add(p->stime, steal);
+               account_group_system_time(p, steal);
                 if (atomic_read(&rq->nr_iowait) > 0)
                         cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
                 else
@@ -4441,12 +4467,8 @@ need_resched_nonpreemptible:
         if (sched_feat(HRTICK))
                 hrtick_clear(rq);
  
-       /*
-        * Do the rq-clock update outside the rq lock:
-        */
-       local_irq_disable();
+       spin_lock_irq(&rq->lock);
         update_rq_clock(rq);
-       spin_lock(&rq->lock);
         clear_tsk_need_resched(prev);
  
         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -5856,6 +5878,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
         struct rq *rq = cpu_rq(cpu);
         unsigned long flags;
  
+       spin_lock_irqsave(&rq->lock, flags);
+
         __sched_fork(idle);
         idle->se.exec_start = sched_clock();
  
@@ -5863,7 +5887,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
         idle->cpus_allowed = cpumask_of_cpu(cpu);
         __set_task_cpu(idle, cpu);
  
-       spin_lock_irqsave(&rq->lock, flags);
         rq->curr = rq->idle = idle;
  #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
         idle->oncpu = 1;
@@ -5880,6 +5903,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
          * The idle tasks have their own, simple scheduling class:
          */
         idle->sched_class = &idle_sched_class;
+       ftrace_graph_init_task(idle);
  }
  
  /*
@@ -6351,7 +6375,7 @@ set_table_entry(struct ctl_table *entry,
  static struct ctl_table *
  sd_alloc_ctl_domain_table(struct sched_domain *sd)
  {
-       struct ctl_table *table = sd_alloc_ctl_entry(12);
+       struct ctl_table *table = sd_alloc_ctl_entry(13);
  
         if (table == NULL)
                 return NULL;
@@ -6379,7 +6403,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
                 sizeof(int), 0644, proc_dointvec_minmax);
         set_table_entry(&table[10], "flags", &sd->flags,
                 sizeof(int), 0644, proc_dointvec_minmax);
-       /* &table[11] is terminator */
+       set_table_entry(&table[11], "name", sd->name,
+               CORENAME_MAX_SIZE, 0444, proc_dostring);
+       /* &table[12] is terminator */
  
         return table;
  }
@@ -6871,15 +6897,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
         struct sched_domain *tmp;
  
         /* Remove the sched domains which do not contribute to scheduling. */
-       for (tmp = sd; tmp; tmp = tmp->parent) {
+       for (tmp = sd; tmp; ) {
                 struct sched_domain *parent = tmp->parent;
                 if (!parent)
                         break;
+
                 if (sd_parent_degenerate(tmp, parent)) {
                         tmp->parent = parent->parent;
                         if (parent->parent)
                                 parent->parent->child = tmp;
-               }
+               } else
+                       tmp = tmp->parent;
         }
  
         if (sd && sd_degenerate(sd)) {
@@ -7263,13 +7291,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
   * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
   */
  
+#ifdef CONFIG_SCHED_DEBUG
+# define SD_INIT_NAME(sd, type)                sd->name = #type
+#else
+# define SD_INIT_NAME(sd, type)                do { } while (0)
+#endif
+
  #define        SD_INIT(sd, type)       sd_init_##type(sd)
+
  #define SD_INIT_FUNC(type)     \
  static noinline void sd_init_##type(struct sched_domain *sd)   \
  {                                                              \
         memset(sd, 0, sizeof(*sd));                             \
         *sd = SD_##type##_INIT;                                 \
         sd->level = SD_LV_##type;                               \
+       SD_INIT_NAME(sd, type);                                 \
  }
  
  SD_INIT_FUNC(CPU)
@@ -7660,6 +7696,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
  error:
         free_sched_groups(cpu_map, tmpmask);
         SCHED_CPUMASK_FREE((void *)allmasks);
+       kfree(rd);
         return -ENOMEM;
  #endif
  }
@@ -7761,13 +7798,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
   *
   * The passed in 'doms_new' should be kmalloc'd. This routine takes
   * ownership of it and will kfree it when done with it. If the caller
- * failed the kmalloc call, then it can pass in doms_new == NULL,
- * and partition_sched_domains() will fallback to the single partition
- * 'fallback_doms', it also forces the domains to be rebuilt.
+ * failed the kmalloc call, then it can pass in doms_new == NULL &&
+ * ndoms_new == 1, and partition_sched_domains() will fall back to
+ * the single partition 'fallback_doms'; this also forces the domains
+ * to be rebuilt.
   *
- * If doms_new==NULL it will be replaced with cpu_online_map.
- * ndoms_new==0 is a special case for destroying existing domains.
- * It will not create the default domain.
+ * If doms_new == NULL it will be replaced with cpu_online_map.
+ * ndoms_new == 0 is a special case for destroying existing domains,
+ * and it will not create the default domain.
   *
   * Call with hotplug lock held
   */
@@ -9088,7 +9126,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
  
         if (!cgrp->parent) {
                 /* This is early initialization for the top cgroup */
-               init_task_group.css.cgroup = cgrp;
                 return &init_task_group.css;
         }
  
@@ -9097,9 +9134,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
         if (IS_ERR(tg))
                 return ERR_PTR(-ENOMEM);
  
-       /* Bind the cgroup to task_group object we just created */
-       tg->css.cgroup = cgrp;
-
         return &tg->css;
  }