kill-the-bkl/reiserfs: fix reiserfs lock to cpu_add_remove_lock dependency

[safe/jmp/linux-2.6] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index 55a10b8..1b59e26 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,6 +39,7 @@
  #include <linux/completion.h>
  #include <linux/kernel_stat.h>
  #include <linux/debug_locks.h>
+#include <linux/perf_counter.h>
  #include <linux/security.h>
  #include <linux/notifier.h>
  #include <linux/profile.h>
@@ -68,17 +69,18 @@
  #include <linux/pagemap.h>
  #include <linux/hrtimer.h>
  #include <linux/tick.h>
-#include <linux/bootmem.h>
  #include <linux/debugfs.h>
  #include <linux/ctype.h>
  #include <linux/ftrace.h>
-#include <trace/sched.h>
  
  #include <asm/tlb.h>
  #include <asm/irq_regs.h>
  
  #include "sched_cpupri.h"
  
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+
  /*
   * Convert user-nice values [ -20 ... 0 ... 19 ]
   * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -118,12 +120,6 @@
   */
  #define RUNTIME_INF    ((u64)~0ULL)
  
-DEFINE_TRACE(sched_wait_task);
-DEFINE_TRACE(sched_wakeup);
-DEFINE_TRACE(sched_wakeup_new);
-DEFINE_TRACE(sched_switch);
-DEFINE_TRACE(sched_migrate_task);
-
  #ifdef CONFIG_SMP
  
  static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -231,13 +227,20 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
  
         spin_lock(&rt_b->rt_runtime_lock);
         for (;;) {
+               unsigned long delta;
+               ktime_t soft, hard;
+
                 if (hrtimer_active(&rt_b->rt_period_timer))
                         break;
  
                 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
                 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
-               hrtimer_start_expires(&rt_b->rt_period_timer,
-                               HRTIMER_MODE_ABS);
+
+               soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
+               hard = hrtimer_get_expires(&rt_b->rt_period_timer);
+               delta = ktime_to_ns(ktime_sub(hard, soft));
+               __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
+                               HRTIMER_MODE_ABS_PINNED, 0);
         }
         spin_unlock(&rt_b->rt_runtime_lock);
  }
@@ -490,6 +493,7 @@ struct rt_rq {
  #endif
  #ifdef CONFIG_SMP
         unsigned long rt_nr_migratory;
+       unsigned long rt_nr_total;
         int overloaded;
         struct plist_head pushable_tasks;
  #endif
@@ -577,6 +581,7 @@ struct rq {
         struct load_weight load;
         unsigned long nr_load_updates;
         u64 nr_switches;
+       u64 nr_migrations_in;
  
         struct cfs_rq cfs;
         struct rt_rq rt;
@@ -623,6 +628,10 @@ struct rq {
         struct list_head migration_queue;
  #endif
  
+       /* calc_load related fields */
+       unsigned long calc_load_update;
+       long calc_load_active;
+
  #ifdef CONFIG_SCHED_HRTICK
  #ifdef CONFIG_SMP
         int hrtick_csd_pending;
@@ -685,7 +694,7 @@ static inline int cpu_of(struct rq *rq)
  #define task_rq(p)             cpu_rq(task_cpu(p))
  #define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
  
-static inline void update_rq_clock(struct rq *rq)
+inline void update_rq_clock(struct rq *rq)
  {
         rq->clock = sched_clock_cpu(cpu_of(rq));
  }
@@ -1146,7 +1155,8 @@ static __init void init_hrtick(void)
   */
  static void hrtick_start(struct rq *rq, u64 delay)
  {
-       hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
+       __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
+                       HRTIMER_MODE_REL_PINNED, 0);
  }
  
  static inline void init_hrtick(void)
@@ -1410,10 +1420,22 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
                    struct rq_iterator *iterator);
  #endif
  
+/* Time spent by the tasks of the cpu accounting group executing in ... */
+enum cpuacct_stat_index {
+       CPUACCT_STAT_USER,      /* ... user mode */
+       CPUACCT_STAT_SYSTEM,    /* ... kernel mode */
+
+       CPUACCT_STAT_NSTATS,    /* number of stat indices above */
+};
+
  #ifdef CONFIG_CGROUP_CPUACCT
  static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+static void cpuacct_update_stats(struct task_struct *tsk,
+               enum cpuacct_stat_index idx, cputime_t val);
  #else
  static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
+static inline void cpuacct_update_stats(struct task_struct *tsk,
+               enum cpuacct_stat_index idx, cputime_t val) {}
  #endif
  
  static inline void inc_cpu_load(struct rq *rq, unsigned long load)
@@ -1708,6 +1730,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
  }
  #endif
  
+static void calc_load_account_active(struct rq *this_rq);
+
  #include "sched_stats.h"
  #include "sched_idletask.c"
  #include "sched_fair.c"
@@ -1938,7 +1962,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
  
         clock_offset = old_rq->clock - new_rq->clock;
  
-       trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+       trace_sched_migrate_task(p, new_cpu);
  
  #ifdef CONFIG_SCHEDSTATS
         if (p->se.wait_start)
@@ -1947,12 +1971,17 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
                 p->se.sleep_start -= clock_offset;
         if (p->se.block_start)
                 p->se.block_start -= clock_offset;
+#endif
         if (old_cpu != new_cpu) {
-               schedstat_inc(p, se.nr_migrations);
+               p->se.nr_migrations++;
+               new_rq->nr_migrations_in++;
+#ifdef CONFIG_SCHEDSTATS
                 if (task_hot(p, old_rq->clock, NULL))
                         schedstat_inc(p, se.nr_forced2_migrations);
-       }
  #endif
+               perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS,
+                                    1, 1, NULL, 0);
+       }
         p->se.vruntime -= old_cfsrq->min_vruntime -
                                          new_cfsrq->min_vruntime;
  
@@ -1995,6 +2024,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
  }
  
  /*
+ * wait_task_context_switch -  wait for a thread to complete at least one
+ *                             context switch.
+ *
+ * @p must not be current.
+ */
+void wait_task_context_switch(struct task_struct *p)
+{
+       unsigned long nvcsw, nivcsw, flags;
+       int running;
+       struct rq *rq;
+
+       nvcsw   = p->nvcsw;     /* switch counts sampled before spinning */
+       nivcsw  = p->nivcsw;
+       for (;;) {
+               /*
+                * The runqueue is assigned before the actual context
+                * switch. We need to take the runqueue lock.
+                *
+                * We could check initially without the lock but it is
+                * very likely that we need to take the lock in every
+                * iteration.
+                */
+               rq = task_rq_lock(p, &flags);
+               running = task_running(rq, p);
+               task_rq_unlock(rq, &flags);     /* lock only covers the sample */
+
+               if (likely(!running))
+                       break;
+               /*
+                * The switch count is incremented before the actual
+                * context switch. We thus wait for two switches to be
+                * sure at least one completed.
+                */
+               if ((p->nvcsw - nvcsw) > 1)
+                       break;
+               if ((p->nivcsw - nivcsw) > 1)
+                       break;
+
+               cpu_relax();
+       }
+}
+
+/*
   * wait_task_inactive - wait for a thread to unschedule.
   *
   * If @match_state is nonzero, it's the @p->state value just checked and
@@ -2122,6 +2194,7 @@ void kick_process(struct task_struct *p)
                 smp_send_reschedule(cpu);
         preempt_enable();
  }
+EXPORT_SYMBOL_GPL(kick_process);
  
  /*
   * Return a low guess at the load of a migration-source cpu weighted
@@ -2304,6 +2377,27 @@ static int sched_balance_self(int cpu, int flag)
  
  #endif /* CONFIG_SMP */
  
+/**
+ * task_oncpu_function_call - call a function on the cpu on which a task runs
+ * @p:         the task to evaluate
+ * @func:      the function to be called
+ * @info:      the function call argument
+ *
+ * Calls @func only while the task is currently running, and does nothing
+ * otherwise. On the current CPU this just calls the function directly.
+ */
+void task_oncpu_function_call(struct task_struct *p,
+                             void (*func) (void *info), void *info)
+{
+       int cpu;
+
+       preempt_disable();
+       cpu = task_cpu(p);      /* read the cpu before testing task_curr() */
+       if (task_curr(p))
+               smp_call_function_single(cpu, func, info, 1);
+       preempt_enable();
+}
+
  /***
   * try_to_wake_up - wake up a thread
   * @p: the to-be-woken-up thread
@@ -2438,6 +2532,17 @@ out:
         return success;
  }
  
+/**
+ * wake_up_process - Wake up a specific process
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes.  Returns 1 if the process was woken up, 0 if it was already
+ * running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
  int wake_up_process(struct task_struct *p)
  {
         return try_to_wake_up(p, TASK_ALL, 0);
@@ -2460,21 +2565,44 @@ static void __sched_fork(struct task_struct *p)
         p->se.exec_start                = 0;
         p->se.sum_exec_runtime          = 0;
         p->se.prev_sum_exec_runtime     = 0;
+       p->se.nr_migrations             = 0;
         p->se.last_wakeup               = 0;
         p->se.avg_overlap               = 0;
         p->se.start_runtime             = 0;
         p->se.avg_wakeup                = sysctl_sched_wakeup_granularity;
  
  #ifdef CONFIG_SCHEDSTATS
-       p->se.wait_start                = 0;
-       p->se.sum_sleep_runtime         = 0;
-       p->se.sleep_start               = 0;
-       p->se.block_start               = 0;
-       p->se.sleep_max                 = 0;
-       p->se.block_max                 = 0;
-       p->se.exec_max                  = 0;
-       p->se.slice_max                 = 0;
-       p->se.wait_max                  = 0;
+       p->se.wait_start                        = 0;
+       p->se.wait_max                          = 0;
+       p->se.wait_count                        = 0;
+       p->se.wait_sum                          = 0;
+
+       p->se.sleep_start                       = 0;
+       p->se.sleep_max                         = 0;
+       p->se.sum_sleep_runtime                 = 0;
+
+       p->se.block_start                       = 0;
+       p->se.block_max                         = 0;
+       p->se.exec_max                          = 0;
+       p->se.slice_max                         = 0;
+
+       p->se.nr_migrations_cold                = 0;
+       p->se.nr_failed_migrations_affine       = 0;
+       p->se.nr_failed_migrations_running      = 0;
+       p->se.nr_failed_migrations_hot          = 0;
+       p->se.nr_forced_migrations              = 0;
+       p->se.nr_forced2_migrations             = 0;
+
+       p->se.nr_wakeups                        = 0;
+       p->se.nr_wakeups_sync                   = 0;
+       p->se.nr_wakeups_migrate                = 0;
+       p->se.nr_wakeups_local                  = 0;
+       p->se.nr_wakeups_remote                 = 0;
+       p->se.nr_wakeups_affine                 = 0;
+       p->se.nr_wakeups_affine_attempts        = 0;
+       p->se.nr_wakeups_passive                = 0;
+       p->se.nr_wakeups_idle                   = 0;
+
  #endif
  
         INIT_LIST_HEAD(&p->rt.run_list);
@@ -2690,6 +2818,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
          */
         prev_state = prev->state;
         finish_arch_switch(prev);
+       perf_counter_task_sched_in(current, cpu_of(rq));
         finish_lock_switch(rq, prev);
  #ifdef CONFIG_SMP
         if (post_schedule)
@@ -2746,7 +2875,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
          * combine the page table reload and the switch backend into
          * one hypercall.
          */
-       arch_enter_lazy_cpu_mode();
+       arch_start_context_switch(prev);
  
         if (unlikely(!mm)) {
                 next->active_mm = oldmm;
@@ -2836,19 +2965,81 @@ unsigned long nr_iowait(void)
         return sum;
  }
  
-unsigned long nr_active(void)
+/* Variables and functions for calc_load */
+static atomic_long_t calc_load_tasks;
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+/**
+ * get_avenrun - copy out the load averages (lockless, estimates at best)
+ * @loads:     destination array of three entries, filled from
+ *             avenrun[0..2]
+ * @offset:    value added to each avenrun[] entry before shifting
+ * @shift:     number of bits each biased entry is shifted left
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+       int slot;
+
+       for (slot = 0; slot < 3; slot++)
+               loads[slot] = (avenrun[slot] + offset) << shift;
+}
+
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
  {
-       unsigned long i, running = 0, uninterruptible = 0;
+       load *= exp;
+       load += active * (FIXED_1 - exp);
+       return load >> FSHIFT;
+}
  
-       for_each_online_cpu(i) {
-               running += cpu_rq(i)->nr_running;
-               uninterruptible += cpu_rq(i)->nr_uninterruptible;
-       }
+/*
+ * calc_global_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(void)
+{
+       unsigned long upd = calc_load_update + 10;
+       long active;
  
-       if (unlikely((long)uninterruptible < 0))
-               uninterruptible = 0;
+       if (time_before(jiffies, upd))
+               return;
  
-       return running + uninterruptible;
+       active = atomic_long_read(&calc_load_tasks);
+       active = active > 0 ? active * FIXED_1 : 0;
+
+       avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+       avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+       avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+       calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * Either called from update_cpu_load() or from a cpu going idle
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+       long now_active, change;
+
+       /* tasks counted as active: nr_running plus nr_uninterruptible */
+       now_active = this_rq->nr_running;
+       now_active += (long) this_rq->nr_uninterruptible;
+       change = now_active - this_rq->calc_load_active;
+       if (!change)
+               return;
+       this_rq->calc_load_active = now_active;
+       atomic_long_add(change, &calc_load_tasks);
+}
+
+/*
+ * Externally visible per-cpu scheduler statistics:
+ * cpu_nr_migrations(cpu) - number of migrations into that cpu
+ */
+u64 cpu_nr_migrations(int cpu)
+{
+       return cpu_rq(cpu)->nr_migrations_in;
  }
  
  /*
@@ -2879,6 +3070,11 @@ static void update_cpu_load(struct rq *this_rq)
                         new_load += scale-1;
                 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
         }
+
+       if (time_after_eq(jiffies, this_rq->calc_load_update)) {
+               this_rq->calc_load_update += LOAD_FREQ;
+               calc_load_account_active(this_rq);
+       }
  }
  
  #ifdef CONFIG_SMP
@@ -4220,10 +4416,131 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
  static struct {
         atomic_t load_balancer;
         cpumask_var_t cpu_mask;
+       cpumask_var_t ilb_grp_nohz_mask;
  } nohz ____cacheline_aligned = {
         .load_balancer = ATOMIC_INIT(-1),
  };
  
+int get_nohz_load_balancer(void)
+{
+       return atomic_read(&nohz.load_balancer);        /* ilb owner cpu, -1 if none */
+}
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Find the lowest sched_domain of a cpu with a flag set.
+ * @cpu:       The cpu whose sched_domains are walked with
+ *             for_each_domain().
+ * @flag:      The flag bit(s) tested against each domain's
+ *             ->flags.
+ *
+ * Returns the first domain reached by the walk whose flags intersect @flag.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+       struct sched_domain *dom;
+
+       for_each_domain(cpu, dom) {
+               if (dom && (dom->flags & flag))
+                       return dom;
+       }
+       return dom;
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu:       The cpu whose domains we're iterating over.
+ * @sd:                struct sched_domain pointer used as the loop cursor;
+ *             it holds the current domain on each iteration.
+ * @flag:      The flag to filter the sched_domains to be iterated.
+ *
+ * Starts at lowest_flag_domain(@cpu, @flag) and follows ->parent, stopping
+ * at the first domain that is NULL or does not have the 'flag' set.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+       for (sd = lowest_flag_domain(cpu, flag); \
+               (sd && (sd->flags & flag)); sd = sd->parent)
+
+/**
+ * is_semi_idle_group - Tests whether a sched_group is semi-idle.
+ * @ilb_group: group to be checked for semi-idleness
+ *
+ * Returns:    1 if the group is semi-idle, 0 otherwise.
+ *
+ * A sched_group is semi-idle if it has at least one idle CPU and at least
+ * one non-idle CPU. As a side effect, nohz.ilb_grp_nohz_mask is overwritten
+ * with the cpus of @ilb_group that are also set in nohz.cpu_mask.
+ */
+static inline int is_semi_idle_group(struct sched_group *ilb_group)
+{
+       /*
+        * Collect the group's cpus that are also in nohz.cpu_mask
+        * (the idle ones) into the scratch mask.
+        */
+       cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+                                       sched_group_cpus(ilb_group));
+
+       /* No idle cpu in the group: it cannot be semi-idle. */
+       if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+               return 0;
+
+       /* Semi-idle only if at least one cpu of the group is busy too. */
+       return !cpumask_equal(nohz.ilb_grp_nohz_mask,
+                             sched_group_cpus(ilb_group));
+}
+/**
+ * find_new_ilb - Finds the optimum idle load balancer for nomination.
+ * @cpu:       The cpu which is nominating a new idle_load_balancer.
+ *
+ * Returns:    The id of the idle load balancer if one exists,
+ *             else a value >= nr_cpu_ids.
+ *
+ * This algorithm picks the idle load balancer such that it belongs to a
+ * semi-idle powersavings sched_domain. The idea is to try to avoid using
+ * completely idle packages/cores just for the purpose of idle load balancing
+ * when there are other idle cpus which are better suited for that job.
+ */
+static int find_new_ilb(int cpu)
+{
+       struct sched_domain *dom;
+       struct sched_group *grp;
+
+       /*
+        * Semi-idle package selection only applies when power-aware
+        * load balancing is enabled; otherwise take the first nohz cpu.
+        */
+       if (!sched_smt_power_savings && !sched_mc_power_savings)
+               return cpumask_first(nohz.cpu_mask);
+
+       /*
+        * With fewer than two idle CPUs there is nothing to choose
+        * between, so don't walk the sched_domain hierarchy at all.
+        */
+       if (cpumask_weight(nohz.cpu_mask) < 2)
+               return cpumask_first(nohz.cpu_mask);
+
+       for_each_flag_domain(cpu, dom, SD_POWERSAVINGS_BALANCE) {
+               /* visit every group until we are back at dom->groups */
+               grp = dom->groups;
+               do {
+                       /* a semi-idle group leaves its idle cpus in the mask */
+                       if (is_semi_idle_group(grp))
+                               return cpumask_first(nohz.ilb_grp_nohz_mask);
+
+                       grp = grp->next;
+               } while (grp != dom->groups);
+       }
+
+       /* no semi-idle group found: fall back to the first nohz cpu */
+       return cpumask_first(nohz.cpu_mask);
+}
+#else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+static inline int find_new_ilb(int cpu)
+{
+       return cpumask_first(nohz.cpu_mask);    /* @cpu is unused here */
+}
+#endif
+
  /*
   * This routine will try to nominate the ilb (idle load balancing)
   * owner among the cpus whose ticks are stopped. ilb owner will do the idle
@@ -4278,8 +4595,24 @@ int select_nohz_load_balancer(int stop_tick)
                         /* make me the ilb owner */
                         if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
                                 return 1;
-               } else if (atomic_read(&nohz.load_balancer) == cpu)
+               } else if (atomic_read(&nohz.load_balancer) == cpu) {
+                       int new_ilb;
+
+                       if (!(sched_smt_power_savings ||
+                                               sched_mc_power_savings))
+                               return 1;
+                       /*
+                        * Check to see if there is a more power-efficient
+                        * ilb.
+                        */
+                       new_ilb = find_new_ilb(cpu);
+                       if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
+                               atomic_set(&nohz.load_balancer, -1);
+                               resched_cpu(new_ilb);
+                               return 0;
+                       }
                         return 1;
+               }
         } else {
                 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
                         return 0;
@@ -4448,15 +4781,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
                 }
  
                 if (atomic_read(&nohz.load_balancer) == -1) {
-                       /*
-                        * simple selection for now: Nominate the
-                        * first cpu in the nohz list to be the next
-                        * ilb owner.
-                        *
-                        * TBD: Traverse the sched domains and nominate
-                        * the nearest cpu in the nohz.cpu_mask.
-                        */
-                       int ilb = cpumask_first(nohz.cpu_mask);
+                       int ilb = find_new_ilb(cpu);
  
                         if (ilb < nr_cpu_ids)
                                 resched_cpu(ilb);
@@ -4503,9 +4828,25 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
  EXPORT_PER_CPU_SYMBOL(kstat);
  
  /*
- * Return any ns on the sched_clock that have not yet been banked in
+ * Return any ns on the sched_clock that have not yet been accounted in
   * @p in case that task is currently running.
+ *
+ * Called with task_rq_lock() held on @rq.
   */
+static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
+{
+       s64 pending;
+
+       /* only a task that is current on @rq has unaccounted runtime */
+       if (!task_current(rq, p))
+               return 0;
+
+       update_rq_clock(rq);
+       pending = rq->clock - p->se.exec_start;
+
+       return pending < 0 ? 0 : pending;
+}
+
  unsigned long long task_delta_exec(struct task_struct *p)
  {
         unsigned long flags;
@@ -4513,16 +4854,49 @@ unsigned long long task_delta_exec(struct task_struct *p)
         u64 ns = 0;
  
         rq = task_rq_lock(p, &flags);
+       ns = do_task_delta_exec(p, rq);
+       task_rq_unlock(rq, &flags);
  
-       if (task_current(rq, p)) {
-               u64 delta_exec;
+       return ns;
+}
  
-               update_rq_clock(rq);
-               delta_exec = rq->clock - p->se.exec_start;
-               if ((s64)delta_exec > 0)
-                       ns = delta_exec;
-       }
+/*
+ * Return the runtime accounted to the task so far.
+ * If the task is currently running, the pending runtime that has not
+ * been accounted yet is added on top.
+ */
+unsigned long long task_sched_runtime(struct task_struct *p)
+{
+       struct rq *rq;
+       unsigned long flags;
+       u64 total;
+
+       rq = task_rq_lock(p, &flags);
+       total = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
+       task_rq_unlock(rq, &flags);
+
+       return total;
+}
+
+/*
+ * Return sum_exec_runtime for the thread group.
+ * In case the task is currently running, return the sum plus current's
+ * pending runtime that have not been accounted yet.
+ *
+ * Note that the thread group might have other running tasks as well,
+ * so the return value does not include the pending runtime that those
+ * other running tasks might have.
+ */
+unsigned long long thread_group_sched_runtime(struct task_struct *p)
+{
+       struct task_cputime totals;
+       unsigned long flags;
+       struct rq *rq;
+       u64 ns;
  
+       rq = task_rq_lock(p, &flags);
+       thread_group_cputime(p, &totals);
+       ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
         task_rq_unlock(rq, &flags);
  
         return ns;
@@ -4551,6 +4925,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
                 cpustat->nice = cputime64_add(cpustat->nice, tmp);
         else
                 cpustat->user = cputime64_add(cpustat->user, tmp);
+
+       cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
         /* Account for user time used */
         acct_update_integrals(p);
  }
@@ -4612,6 +4988,8 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
         else
                 cpustat->system = cputime64_add(cpustat->system, tmp);
  
+       cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+
         /* Account for system time used */
         acct_update_integrals(p);
  }
@@ -4659,7 +5037,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
  
         if (user_tick)
                 account_user_time(p, one_jiffy, one_jiffy_scaled);
-       else if (p != rq->idle)
+       else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
                 account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
                                     one_jiffy_scaled);
         else
@@ -4767,16 +5145,15 @@ void scheduler_tick(void)
         curr->sched_class->task_tick(rq, curr, 0);
         spin_unlock(&rq->lock);
  
+       perf_counter_task_tick(curr, cpu);
+
  #ifdef CONFIG_SMP
         rq->idle_at_tick = idle_cpu(cpu);
         trigger_load_balance(rq, cpu);
  #endif
  }
  
-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
-                               defined(CONFIG_PREEMPT_TRACER))
-
-static inline unsigned long get_parent_ip(unsigned long addr)
+notrace unsigned long get_parent_ip(unsigned long addr)
  {
         if (in_lock_functions(addr)) {
                 addr = CALLER_ADDR2;
@@ -4786,6 +5163,9 @@ static inline unsigned long get_parent_ip(unsigned long addr)
         return addr;
  }
  
+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+                               defined(CONFIG_PREEMPT_TRACER))
+
  void __kprobes add_preempt_count(int val)
  {
  #ifdef CONFIG_DEBUG_PREEMPT
@@ -4934,13 +5314,15 @@ pick_next_task(struct rq *rq)
  /*
   * schedule() is the main scheduler function.
   */
-asmlinkage void __sched __schedule(void)
+asmlinkage void __sched schedule(void)
  {
         struct task_struct *prev, *next;
         unsigned long *switch_count;
         struct rq *rq;
         int cpu;
  
+need_resched:
+       preempt_disable();
         cpu = smp_processor_id();
         rq = cpu_rq(cpu);
         rcu_qsctr_inc(cpu);
@@ -4980,6 +5362,7 @@ need_resched_nonpreemptible:
  
         if (likely(prev != next)) {
                 sched_info_switch(prev, next);
+               perf_counter_task_sched_out(prev, next, cpu);
  
                 rq->nr_switches++;
                 rq->curr = next;
@@ -4997,15 +5380,9 @@ need_resched_nonpreemptible:
  
         if (unlikely(reacquire_kernel_lock(current) < 0))
                 goto need_resched_nonpreemptible;
-}
  
-asmlinkage void __sched schedule(void)
-{
-need_resched:
-       preempt_disable();
-       __schedule();
         preempt_enable_no_resched();
-       if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+       if (need_resched())
                 goto need_resched;
  }
  EXPORT_SYMBOL(schedule);
@@ -5148,7 +5525,7 @@ EXPORT_SYMBOL(default_wake_function);
   * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
   * zero in this (rare) case, and we handle it by continuing to scan the queue.
   */
-void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
                         int nr_exclusive, int sync, void *key)
  {
         wait_queue_t *curr, *next;
@@ -5168,6 +5545,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
   * @mode: which threads
   * @nr_exclusive: how many wake-one or wake-many threads to wake up
   * @key: is directly passed to the wakeup function
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
   */
  void __wake_up(wait_queue_head_t *q, unsigned int mode,
                         int nr_exclusive, void *key)
@@ -5206,6 +5586,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
   * with each other. This can prevent needless bouncing between CPUs.
   *
   * On UP it can prevent extra preemption.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
   */
  void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
                         int nr_exclusive, void *key)
@@ -5242,6 +5625,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync);        /* For internal use only */
   * awakened in the same order in which they were queued.
   *
   * See also complete_all(), wait_for_completion() and related routines.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
   */
  void complete(struct completion *x)
  {
@@ -5259,6 +5645,9 @@ EXPORT_SYMBOL(complete);
   * @x:  holds the state of this particular completion
   *
   * This will wake up all threads waiting on this particular completion event.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
   */
  void complete_all(struct completion *x)
  {
@@ -6175,6 +6564,11 @@ SYSCALL_DEFINE0(sched_yield)
         return 0;
  }
  
+static inline int should_resched(void)
+{
+       return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
+}
+
  static void __cond_resched(void)
  {
  #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -6194,8 +6588,7 @@ static void __cond_resched(void)
  
  int __sched _cond_resched(void)
  {
-       if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
-                                       system_state == SYSTEM_RUNNING) {
+       if (should_resched()) {
                 __cond_resched();
                 return 1;
         }
@@ -6213,12 +6606,12 @@ EXPORT_SYMBOL(_cond_resched);
   */
  int cond_resched_lock(spinlock_t *lock)
  {
-       int resched = need_resched() && system_state == SYSTEM_RUNNING;
+       int resched = should_resched();
         int ret = 0;
  
         if (spin_needbreak(lock) || resched) {
                 spin_unlock(lock);
-               if (resched && need_resched())
+               if (resched)
                         __cond_resched();
                 else
                         cpu_relax();
@@ -6233,7 +6626,7 @@ int __sched cond_resched_softirq(void)
  {
         BUG_ON(!in_softirq());
  
-       if (need_resched() && system_state == SYSTEM_RUNNING) {
+       if (should_resched()) {
                 local_bh_enable();
                 __cond_resched();
                 local_bh_disable();
@@ -6417,8 +6810,9 @@ void sched_show_task(struct task_struct *p)
  #ifdef CONFIG_DEBUG_STACK_USAGE
         free = stack_not_used(p);
  #endif
-       printk(KERN_CONT "%5lu %5d %6d\n", free,
-               task_pid_nr(p), task_pid_nr(p->real_parent));
+       printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+               task_pid_nr(p), task_pid_nr(p->real_parent),
+               (unsigned long)task_thread_info(p)->flags);
  
         show_stack(p, NULL);
  }
@@ -6679,7 +7073,7 @@ static int migration_thread(void *data)
  
                 if (cpu_is_offline(cpu)) {
                         spin_unlock_irq(&rq->lock);
-                       goto wait_to_die;
+                       break;
                 }
  
                 if (rq->active_balance) {
@@ -6705,16 +7099,7 @@ static int migration_thread(void *data)
                 complete(&req->done);
         }
         __set_current_state(TASK_RUNNING);
-       return 0;
  
-wait_to_die:
-       /* Wait for kthread_stop */
-       set_current_state(TASK_INTERRUPTIBLE);
-       while (!kthread_should_stop()) {
-               schedule();
-               set_current_state(TASK_INTERRUPTIBLE);
-       }
-       __set_current_state(TASK_RUNNING);
         return 0;
  }
  
@@ -6897,6 +7282,15 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
  
         }
  }
+
+/*
+ * remove the tasks which were accounted by rq from calc_load_tasks.
+ */
+static void calc_global_load_remove(struct rq *rq)
+{
+       atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+       rq->calc_load_active = 0;
+}
  #endif /* CONFIG_HOTPLUG_CPU */
  
  #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7120,7 +7514,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 rq = task_rq_lock(p, &flags);
                 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
                 task_rq_unlock(rq, &flags);
+               get_task_struct(p);
                 cpu_rq(cpu)->migration_thread = p;
+               rq->calc_load_update = calc_load_update;
                 break;
  
         case CPU_ONLINE:
@@ -7148,6 +7544,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 kthread_bind(cpu_rq(cpu)->migration_thread,
                              cpumask_any(cpu_online_mask));
                 kthread_stop(cpu_rq(cpu)->migration_thread);
+               put_task_struct(cpu_rq(cpu)->migration_thread);
                 cpu_rq(cpu)->migration_thread = NULL;
                 break;
  
@@ -7157,6 +7554,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 migrate_live_tasks(cpu);
                 rq = cpu_rq(cpu);
                 kthread_stop(rq->migration_thread);
+               put_task_struct(rq->migration_thread);
                 rq->migration_thread = NULL;
                 /* Idle task back to normal (off runqueue, low prio) */
                 spin_lock_irq(&rq->lock);
@@ -7170,7 +7568,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 cpuset_unlock();
                 migrate_nr_uninterruptible(rq);
                 BUG_ON(rq->nr_running != 0);
-
+               calc_global_load_remove(rq);
                 /*
                  * No need to migrate the tasks: it was best-effort if
                  * they didn't take sched_hotcpu_mutex. Just wake up
@@ -7206,8 +7604,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
         return NOTIFY_OK;
  }
  
-/* Register at highest priority so that task migration (migrate_all_tasks)
- * happens before everything else.
+/*
+ * Register at high priority so that task migration (migrate_all_tasks)
+ * happens before everything else.  This has to be lower priority than
+ * the notifier in the perf_counter subsystem, though.
   */
  static struct notifier_block __cpuinitdata migration_notifier = {
         .notifier_call = migration_call,
@@ -7294,7 +7694,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
  
                 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
+
                 printk(KERN_CONT " %s", str);
+               if (group->__cpu_power != SCHED_LOAD_SCALE) {
+                       printk(KERN_CONT " (__cpu_power = %d)",
+                               group->__cpu_power);
+               }
  
                 group = group->next;
         } while (group != sd->groups);
@@ -7445,26 +7850,23 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
                 free_rootdomain(old_rd);
  }
  
-static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
+static int init_rootdomain(struct root_domain *rd, bool bootmem)
  {
+       gfp_t gfp = GFP_KERNEL;
+
         memset(rd, 0, sizeof(*rd));
  
-       if (bootmem) {
-               alloc_bootmem_cpumask_var(&def_root_domain.span);
-               alloc_bootmem_cpumask_var(&def_root_domain.online);
-               alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
-               cpupri_init(&rd->cpupri, true);
-               return 0;
-       }
+       if (bootmem)
+               gfp = GFP_NOWAIT;
  
-       if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+       if (!alloc_cpumask_var(&rd->span, gfp))
                 goto out;
-       if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+       if (!alloc_cpumask_var(&rd->online, gfp))
                 goto free_span;
-       if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+       if (!alloc_cpumask_var(&rd->rto_mask, gfp))
                 goto free_online;
  
-       if (cpupri_init(&rd->cpupri, false) != 0)
+       if (cpupri_init(&rd->cpupri, bootmem) != 0)
                 goto free_rto_mask;
         return 0;
  
@@ -7675,8 +8077,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
  
  /*
   * The cpus mask in sched_group and sched_domain hangs off the end.
- * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
- * for nr_cpu_ids < CONFIG_NR_CPUS.
+ *
+ * ( See the comments in include/linux/sched.h:struct sched_group
+ *   and struct sched_domain. )
   */
  struct static_sched_group {
         struct sched_group sg;
@@ -7797,7 +8200,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
                         struct sched_domain *sd;
  
                         sd = &per_cpu(phys_domains, j).sd;
-                       if (j != cpumask_first(sched_group_cpus(sd->groups))) {
+                       if (j != group_first_cpu(sd->groups)) {
                                 /*
                                  * Only add "power" once for each
                                  * physical package.
@@ -7875,7 +8278,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
  
         WARN_ON(!sd || !sd->groups);
  
-       if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
+       if (cpu != group_first_cpu(sd->groups))
                 return;
  
         child = sd->child;
@@ -8653,6 +9056,8 @@ void __init sched_init_smp(void)
  }
  #endif /* CONFIG_SMP */
  
+const_debug unsigned int sysctl_timer_migration = 1;
+
  int in_sched_functions(unsigned long addr)
  {
         return in_lock_functions(addr) ||
@@ -8692,7 +9097,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
  #ifdef CONFIG_SMP
         rt_rq->rt_nr_migratory = 0;
         rt_rq->overloaded = 0;
-       plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
+       plist_head_init(&rt_rq->pushable_tasks, &rq->lock);
  #endif
  
         rt_rq->rt_time = 0;
@@ -8787,7 +9192,7 @@ void __init sched_init(void)
          * we use alloc_bootmem().
          */
         if (alloc_size) {
-               ptr = (unsigned long)alloc_bootmem(alloc_size);
+               ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
                 init_task_group.se = (struct sched_entity **)ptr;
@@ -8860,6 +9265,8 @@ void __init sched_init(void)
                 rq = cpu_rq(i);
                 spin_lock_init(&rq->lock);
                 rq->nr_running = 0;
+               rq->calc_load_active = 0;
+               rq->calc_load_update = jiffies + LOAD_FREQ;
                 init_cfs_rq(&rq->cfs, rq);
                 init_rt_rq(&rq->rt, rq);
  #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8880,7 +9287,7 @@ void __init sched_init(void)
                  * 1024) and two child groups A0 and A1 (of weight 1024 each),
                  * then A0's share of the cpu resource is:
                  *
-                *      A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
+                *      A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
                  *
                  * We achieve this by letting init_task_group's tasks sit
                  * directly in rq->cfs (i.e init_task_group->se[] = NULL).
@@ -8967,20 +9374,26 @@ void __init sched_init(void)
          * when this runqueue becomes "idle".
          */
         init_idle(current, smp_processor_id());
+
+       calc_load_update = jiffies + LOAD_FREQ;
+
         /*
          * During early bootup we pretend to be a normal task:
          */
         current->sched_class = &fair_sched_class;
  
         /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
-       alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+       alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
  #ifdef CONFIG_SMP
  #ifdef CONFIG_NO_HZ
-       alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+       alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
+       alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
  #endif
-       alloc_bootmem_cpumask_var(&cpu_isolated_map);
+       alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
  #endif /* SMP */
  
+       perf_counter_init();
+
         scheduler_running = 1;
  }
  
@@ -9722,6 +10135,13 @@ static int sched_rt_global_constraints(void)
         if (sysctl_sched_rt_period <= 0)
                 return -EINVAL;
  
+       /*
+        * There are always some RT tasks in the root group
+        * -- migration, kstopmachine etc.
+        */
+       if (sysctl_sched_rt_runtime == 0)
+               return -EBUSY;
+
         spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
         for_each_possible_cpu(i) {
                 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
@@ -9917,6 +10337,7 @@ struct cpuacct {
         struct cgroup_subsys_state css;
         /* cpuusage holds pointer to a u64-type object on every cpu */
         u64 *cpuusage;
+       struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
         struct cpuacct *parent;
  };
  
@@ -9941,20 +10362,32 @@ static struct cgroup_subsys_state *cpuacct_create(
         struct cgroup_subsys *ss, struct cgroup *cgrp)
  {
         struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+       int i;
  
         if (!ca)
-               return ERR_PTR(-ENOMEM);
+               goto out;
  
         ca->cpuusage = alloc_percpu(u64);
-       if (!ca->cpuusage) {
-               kfree(ca);
-               return ERR_PTR(-ENOMEM);
-       }
+       if (!ca->cpuusage)
+               goto out_free_ca;
+
+       for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+               if (percpu_counter_init(&ca->cpustat[i], 0))
+                       goto out_free_counters;
  
         if (cgrp->parent)
                 ca->parent = cgroup_ca(cgrp->parent);
  
         return &ca->css;
+
+out_free_counters:
+       while (--i >= 0)
+               percpu_counter_destroy(&ca->cpustat[i]);
+       free_percpu(ca->cpuusage);
+out_free_ca:
+       kfree(ca);
+out:
+       return ERR_PTR(-ENOMEM);
  }
  
  /* destroy an existing cpu accounting group */
@@ -9962,7 +10395,10 @@ static void
  cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
  {
         struct cpuacct *ca = cgroup_ca(cgrp);
+       int i;
  
+       for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+               percpu_counter_destroy(&ca->cpustat[i]);
         free_percpu(ca->cpuusage);
         kfree(ca);
  }
@@ -10049,6 +10485,25 @@ static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
         return 0;
  }
  
+static const char *cpuacct_stat_desc[] = {
+       [CPUACCT_STAT_USER] = "user",
+       [CPUACCT_STAT_SYSTEM] = "system",
+};
+
+static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
+               struct cgroup_map_cb *cb)
+{
+       struct cpuacct *ca = cgroup_ca(cgrp);
+       int i;
+
+       for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
+               s64 val = percpu_counter_read(&ca->cpustat[i]);
+               val = cputime64_to_clock_t(val);
+               cb->fill(cb, cpuacct_stat_desc[i], val);
+       }
+       return 0;
+}
+
  static struct cftype files[] = {
         {
                 .name = "usage",
@@ -10059,7 +10514,10 @@ static struct cftype files[] = {
                 .name = "usage_percpu",
                 .read_seq_string = cpuacct_percpu_seq_read,
         },
-
+       {
+               .name = "stat",
+               .read_map = cpuacct_stats_show,
+       },
  };
  
  static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -10081,12 +10539,38 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
                 return;
  
         cpu = task_cpu(tsk);
+
+       rcu_read_lock();
+
         ca = task_ca(tsk);
  
         for (; ca; ca = ca->parent) {
                 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
                 *cpuusage += cputime;
         }
+
+       rcu_read_unlock();
+}
+
+/*
+ * Charge the system/user time to the task's accounting group.
+ */
+static void cpuacct_update_stats(struct task_struct *tsk,
+               enum cpuacct_stat_index idx, cputime_t val)
+{
+       struct cpuacct *ca;
+
+       if (unlikely(!cpuacct_subsys.active))
+               return;
+
+       rcu_read_lock();
+       ca = task_ca(tsk);
+
+       do {
+               percpu_counter_add(&ca->cpustat[idx], val);
+               ca = ca->parent;
+       } while (ca);
+       rcu_read_unlock();
  }
  
  struct cgroup_subsys cpuacct_subsys = {