Merge branch 'core-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...

[safe/jmp/linux-2.6] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index ec715f9..5724508 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -231,13 +231,20 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
  
         spin_lock(&rt_b->rt_runtime_lock);
         for (;;) {
+               unsigned long delta;
+               ktime_t soft, hard;
+
                 if (hrtimer_active(&rt_b->rt_period_timer))
                         break;
  
                 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
                 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
-               hrtimer_start_expires(&rt_b->rt_period_timer,
-                               HRTIMER_MODE_ABS);
+
+               soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
+               hard = hrtimer_get_expires(&rt_b->rt_period_timer);
+               delta = ktime_to_ns(ktime_sub(hard, soft));
+               __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
+                               HRTIMER_MODE_ABS, 0);
         }
         spin_unlock(&rt_b->rt_runtime_lock);
  }
@@ -1110,7 +1117,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
         if (rq == this_rq()) {
                 hrtimer_restart(timer);
         } else if (!rq->hrtick_csd_pending) {
-               __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
+               __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
                 rq->hrtick_csd_pending = 1;
         }
  }
@@ -1146,7 +1153,8 @@ static __init void init_hrtick(void)
   */
  static void hrtick_start(struct rq *rq, u64 delay)
  {
-       hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
+       __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
+                       HRTIMER_MODE_REL, 0);
  }
  
  static inline void init_hrtick(void)
@@ -1410,10 +1418,22 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
                    struct rq_iterator *iterator);
  #endif
  
+/* Time spent by the tasks of the cpu accounting group executing in ... */
+enum cpuacct_stat_index {
+       CPUACCT_STAT_USER,      /* ... user mode */
+       CPUACCT_STAT_SYSTEM,    /* ... kernel mode */
+
+       CPUACCT_STAT_NSTATS,
+};
+
  #ifdef CONFIG_CGROUP_CPUACCT
  static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+static void cpuacct_update_stats(struct task_struct *tsk,
+               enum cpuacct_stat_index idx, cputime_t val);
  #else
  static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
+static inline void cpuacct_update_stats(struct task_struct *tsk,
+               enum cpuacct_stat_index idx, cputime_t val) {}
  #endif
  
  static inline void inc_cpu_load(struct rq *rq, unsigned long load)
@@ -3190,7 +3210,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
         return 0;
  }
  /********** Helpers for find_busiest_group ************************/
-/**
+/*
   * sd_lb_stats - Structure to store the statistics of a sched_domain
   *             during load balancing.
   */
@@ -3222,7 +3242,7 @@ struct sd_lb_stats {
  #endif
  };
  
-/**
+/*
   * sg_lb_stats - stats of a sched_group required for load_balancing
   */
  struct sg_lb_stats {
@@ -3270,6 +3290,152 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
  }
  
  
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * init_sd_power_savings_stats - Initialize power savings statistics for
+ * the given sched_domain, during load balancing.
+ *
+ * @sd: Sched domain whose power-savings statistics are to be initialized.
+ * @sds: Variable containing the statistics for sd.
+ * @idle: Idle status of the CPU at which we're performing load-balancing.
+ */
+static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+       struct sd_lb_stats *sds, enum cpu_idle_type idle)
+{
+       /*
+        * Busy processors will not participate in power savings
+        * balance.
+        */
+       if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+               sds->power_savings_balance = 0;
+       else {
+               sds->power_savings_balance = 1;
+               sds->min_nr_running = ULONG_MAX;
+               sds->leader_nr_running = 0;
+       }
+}
+
+/**
+ * update_sd_power_savings_stats - Update the power saving stats for a
+ * sched_domain while performing load balancing.
+ *
+ * @group: sched_group belonging to the sched_domain under consideration.
+ * @sds: Variable containing the statistics of the sched_domain
+ * @local_group: Does group contain the CPU for which we're performing
+ *             load balancing ?
+ * @sgs: Variable containing the statistics of the group.
+ */
+static inline void update_sd_power_savings_stats(struct sched_group *group,
+       struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+{
+
+       if (!sds->power_savings_balance)
+               return;
+
+       /*
+        * If the local group is idle or completely loaded
+        * no need to do power savings balance at this domain
+        */
+       if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
+                               !sds->this_nr_running))
+               sds->power_savings_balance = 0;
+
+       /*
+        * If a group is already running at full capacity or idle,
+        * don't include that group in power savings calculations
+        */
+       if (!sds->power_savings_balance ||
+               sgs->sum_nr_running >= sgs->group_capacity ||
+               !sgs->sum_nr_running)
+               return;
+
+       /*
+        * Calculate the group which has the least non-idle load.
+        * This is the group from where we need to pick up the load
+        * for saving power
+        */
+       if ((sgs->sum_nr_running < sds->min_nr_running) ||
+           (sgs->sum_nr_running == sds->min_nr_running &&
+            group_first_cpu(group) > group_first_cpu(sds->group_min))) {
+               sds->group_min = group;
+               sds->min_nr_running = sgs->sum_nr_running;
+               sds->min_load_per_task = sgs->sum_weighted_load /
+                                               sgs->sum_nr_running;
+       }
+
+       /*
+        * Calculate the group which is almost near its
+        * capacity but still has some space to pick up some load
+        * from other group and save more power
+        */
+       if (sgs->sum_nr_running > sgs->group_capacity - 1)
+               return;
+
+       if (sgs->sum_nr_running > sds->leader_nr_running ||
+           (sgs->sum_nr_running == sds->leader_nr_running &&
+            group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
+               sds->group_leader = group;
+               sds->leader_nr_running = sgs->sum_nr_running;
+       }
+}
+
+/**
+ * check_power_save_busiest_group - see if there is potential for some power-savings balance
+ * @sds: Variable containing the statistics of the sched_domain
+ *     under consideration.
+ * @this_cpu: Cpu at which we're currently performing load-balancing.
+ * @imbalance: Variable to store the imbalance.
+ *
+ * Description:
+ * Check if we have potential to perform some power-savings balance.
+ * If yes, set the busiest group to be the least loaded group in the
+ * sched_domain, so that it's CPUs can be put to idle.
+ *
+ * Returns 1 if there is potential to perform power-savings balance.
+ * Else returns 0.
+ */
+static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+                                       int this_cpu, unsigned long *imbalance)
+{
+       if (!sds->power_savings_balance)
+               return 0;
+
+       if (sds->this != sds->group_leader ||
+                       sds->group_leader == sds->group_min)
+               return 0;
+
+       *imbalance = sds->min_load_per_task;
+       sds->busiest = sds->group_min;
+
+       if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
+               cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
+                       group_first_cpu(sds->group_leader);
+       }
+
+       return 1;
+
+}
+#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+       struct sd_lb_stats *sds, enum cpu_idle_type idle)
+{
+       return;
+}
+
+static inline void update_sd_power_savings_stats(struct sched_group *group,
+       struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+{
+       return;
+}
+
+static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+                                       int this_cpu, unsigned long *imbalance)
+{
+       return 0;
+}
+#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+
+
  /**
   * update_sg_lb_stats - Update sched_group's statistics for load balancing.
   * @group: sched_group whose statistics are to be updated.
@@ -3385,10 +3551,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
         struct sg_lb_stats sgs;
         int load_idx;
  
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-       sds->power_savings_balance = 1;
-       sds->min_nr_running = ULONG_MAX;
-#endif
+       init_sd_power_savings_stats(sd, sds, idle);
         load_idx = get_sd_load_idx(sd, idle);
  
         do {
@@ -3421,75 +3584,143 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
                         sds->group_imb = sgs.group_imb;
                 }
  
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-               /*
-                * Busy processors will not participate in power savings
-                * balance.
-                */
-               if (idle == CPU_NOT_IDLE ||
-                               !(sd->flags & SD_POWERSAVINGS_BALANCE))
-                       goto group_next;
+               update_sd_power_savings_stats(group, sds, local_group, &sgs);
+               group = group->next;
+       } while (group != sd->groups);
  
-               /*
-                * If the local group is idle or completely loaded
-                * no need to do power savings balance at this domain
-                */
-               if (local_group &&
-                       (sds->this_nr_running >= sgs.group_capacity ||
-                       !sds->this_nr_running))
-                       sds->power_savings_balance = 0;
+}
  
-               /*
-                * If a group is already running at full capacity or idle,
-                * don't include that group in power savings calculations
-                */
-               if (!sds->power_savings_balance ||
-                       sgs.sum_nr_running >= sgs.group_capacity ||
-                       !sgs.sum_nr_running)
-                       goto group_next;
+/**
+ * fix_small_imbalance - Calculate the minor imbalance that exists
+ *                     amongst the groups of a sched_domain, during
+ *                     load balancing.
+ * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
+ * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ * @imbalance: Variable to store the imbalance.
+ */
+static inline void fix_small_imbalance(struct sd_lb_stats *sds,
+                               int this_cpu, unsigned long *imbalance)
+{
+       unsigned long tmp, pwr_now = 0, pwr_move = 0;
+       unsigned int imbn = 2;
+
+       if (sds->this_nr_running) {
+               sds->this_load_per_task /= sds->this_nr_running;
+               if (sds->busiest_load_per_task >
+                               sds->this_load_per_task)
+                       imbn = 1;
+       } else
+               sds->this_load_per_task =
+                       cpu_avg_load_per_task(this_cpu);
  
-               /*
-                * Calculate the group which has the least non-idle load.
-                * This is the group from where we need to pick up the load
-                * for saving power
-                */
-               if ((sgs.sum_nr_running < sds->min_nr_running) ||
-                   (sgs.sum_nr_running == sds->min_nr_running &&
-                    group_first_cpu(group) >
-                       group_first_cpu(sds->group_min))) {
-                       sds->group_min = group;
-                       sds->min_nr_running = sgs.sum_nr_running;
-                       sds->min_load_per_task = sgs.sum_weighted_load /
-                                               sgs.sum_nr_running;
-               }
+       if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
+                       sds->busiest_load_per_task * imbn) {
+               *imbalance = sds->busiest_load_per_task;
+               return;
+       }
  
-               /*
-                * Calculate the group which is almost near its
-                * capacity but still has some space to pick up some load
-                * from other group and save more power
-                */
-               if (sgs.sum_nr_running > sgs.group_capacity - 1)
-                       goto group_next;
-
-               if (sgs.sum_nr_running > sds->leader_nr_running ||
-                   (sgs.sum_nr_running == sds->leader_nr_running &&
-                    group_first_cpu(group) <
-                       group_first_cpu(sds->group_leader))) {
-                       sds->group_leader = group;
-                       sds->leader_nr_running = sgs.sum_nr_running;
-               }
-group_next:
-#endif
-               group = group->next;
-       } while (group != sd->groups);
+       /*
+        * OK, we don't have enough imbalance to justify moving tasks,
+        * however we may be able to increase total CPU power used by
+        * moving them.
+        */
+
+       pwr_now += sds->busiest->__cpu_power *
+                       min(sds->busiest_load_per_task, sds->max_load);
+       pwr_now += sds->this->__cpu_power *
+                       min(sds->this_load_per_task, sds->this_load);
+       pwr_now /= SCHED_LOAD_SCALE;
+
+       /* Amount of load we'd subtract */
+       tmp = sg_div_cpu_power(sds->busiest,
+                       sds->busiest_load_per_task * SCHED_LOAD_SCALE);
+       if (sds->max_load > tmp)
+               pwr_move += sds->busiest->__cpu_power *
+                       min(sds->busiest_load_per_task, sds->max_load - tmp);
+
+       /* Amount of load we'd add */
+       if (sds->max_load * sds->busiest->__cpu_power <
+               sds->busiest_load_per_task * SCHED_LOAD_SCALE)
+               tmp = sg_div_cpu_power(sds->this,
+                       sds->max_load * sds->busiest->__cpu_power);
+       else
+               tmp = sg_div_cpu_power(sds->this,
+                       sds->busiest_load_per_task * SCHED_LOAD_SCALE);
+       pwr_move += sds->this->__cpu_power *
+                       min(sds->this_load_per_task, sds->this_load + tmp);
+       pwr_move /= SCHED_LOAD_SCALE;
+
+       /* Move if we gain throughput */
+       if (pwr_move > pwr_now)
+               *imbalance = sds->busiest_load_per_task;
+}
+
+/**
+ * calculate_imbalance - Calculate the amount of imbalance present within the
+ *                      groups of a given sched_domain during load balance.
+ * @sds: statistics of the sched_domain whose imbalance is to be calculated.
+ * @this_cpu: Cpu for which currently load balance is being performed.
+ * @imbalance: The variable to store the imbalance.
+ */
+static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
+               unsigned long *imbalance)
+{
+       unsigned long max_pull;
+       /*
+        * In the presence of smp nice balancing, certain scenarios can have
+        * max load less than avg load(as we skip the groups at or below
+        * its cpu_power, while calculating max_load..)
+        */
+       if (sds->max_load < sds->avg_load) {
+               *imbalance = 0;
+               return fix_small_imbalance(sds, this_cpu, imbalance);
+       }
+
+       /* Don't want to pull so many tasks that a group would go idle */
+       max_pull = min(sds->max_load - sds->avg_load,
+                       sds->max_load - sds->busiest_load_per_task);
+
+       /* How much load to actually move to equalise the imbalance */
+       *imbalance = min(max_pull * sds->busiest->__cpu_power,
+               (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
+                       / SCHED_LOAD_SCALE;
+
+       /*
+        * if *imbalance is less than the average load per runnable task
+        * there is no gaurantee that any tasks will be moved so we'll have
+        * a think about bumping its value to force at least one task to be
+        * moved
+        */
+       if (*imbalance < sds->busiest_load_per_task)
+               return fix_small_imbalance(sds, this_cpu, imbalance);
  
  }
  /******* find_busiest_group() helpers end here *********************/
  
-/*
- * find_busiest_group finds and returns the busiest CPU group within the
- * domain. It calculates and returns the amount of weighted load which
- * should be moved to restore balance via the imbalance parameter.
+/**
+ * find_busiest_group - Returns the busiest group within the sched_domain
+ * if there is an imbalance. If there isn't an imbalance, and
+ * the user has opted for power-savings, it returns a group whose
+ * CPUs can be put to idle by rebalancing those tasks elsewhere, if
+ * such a group exists.
+ *
+ * Also calculates the amount of weighted load which should be moved
+ * to restore balance.
+ *
+ * @sd: The sched_domain whose busiest group is to be returned.
+ * @this_cpu: The cpu for which load balancing is currently being performed.
+ * @imbalance: Variable which stores amount of weighted load which should
+ *             be moved to restore balance/put a group to idle.
+ * @idle: The idle status of this_cpu.
+ * @sd_idle: The idleness of sd
+ * @cpus: The set of CPUs under consideration for load-balancing.
+ * @balance: Pointer to a variable indicating if this_cpu
+ *     is the appropriate cpu to perform load balancing at this_level.
+ *
+ * Returns:    - the busiest group if imbalance exists.
+ *             - If no imbalance and user has opted for power-savings balance,
+ *                return the least loaded group whose CPUs can be
+ *                put to idle by rebalancing its tasks onto our group.
   */
  static struct sched_group *
  find_busiest_group(struct sched_domain *sd, int this_cpu,
@@ -3497,7 +3728,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                    int *sd_idle, const struct cpumask *cpus, int *balance)
  {
         struct sd_lb_stats sds;
-       unsigned long max_pull;
  
         memset(&sds, 0, sizeof(sds));
  
@@ -3508,17 +3738,31 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
         update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
                                         balance, &sds);
  
+       /* Cases where imbalance does not exist from POV of this_cpu */
+       /* 1) this_cpu is not the appropriate cpu to perform load balancing
+        *    at this level.
+        * 2) There is no busy sibling group to pull from.
+        * 3) This group is the busiest group.
+        * 4) This group is more busy than the avg busieness at this
+        *    sched_domain.
+        * 5) The imbalance is within the specified limit.
+        * 6) Any rebalance would lead to ping-pong
+        */
         if (balance && !(*balance))
                 goto ret;
  
-       if (!sds.busiest || sds.this_load >= sds.max_load
-               || sds.busiest_nr_running == 0)
+       if (!sds.busiest || sds.busiest_nr_running == 0)
+               goto out_balanced;
+
+       if (sds.this_load >= sds.max_load)
                 goto out_balanced;
  
         sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
  
-       if (sds.this_load >= sds.avg_load ||
-                       100*sds.max_load <= sd->imbalance_pct * sds.this_load)
+       if (sds.this_load >= sds.avg_load)
+               goto out_balanced;
+
+       if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
                 goto out_balanced;
  
         sds.busiest_load_per_task /= sds.busiest_nr_running;
@@ -3540,110 +3784,17 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
         if (sds.max_load <= sds.busiest_load_per_task)
                 goto out_balanced;
  
-       /*
-        * In the presence of smp nice balancing, certain scenarios can have
-        * max load less than avg load(as we skip the groups at or below
-        * its cpu_power, while calculating max_load..)
-        */
-       if (sds.max_load < sds.avg_load) {
-               *imbalance = 0;
-               goto small_imbalance;
-       }
-
-       /* Don't want to pull so many tasks that a group would go idle */
-       max_pull = min(sds.max_load - sds.avg_load,
-                       sds.max_load - sds.busiest_load_per_task);
-
-       /* How much load to actually move to equalise the imbalance */
-       *imbalance = min(max_pull * sds.busiest->__cpu_power,
-                       (sds.avg_load - sds.this_load) * sds.this->__cpu_power)
-                       / SCHED_LOAD_SCALE;
-
-       /*
-        * if *imbalance is less than the average load per runnable task
-        * there is no gaurantee that any tasks will be moved so we'll have
-        * a think about bumping its value to force at least one task to be
-        * moved
-        */
-       if (*imbalance < sds.busiest_load_per_task) {
-               unsigned long tmp, pwr_now, pwr_move;
-               unsigned int imbn;
-
-small_imbalance:
-               pwr_move = pwr_now = 0;
-               imbn = 2;
-               if (sds.this_nr_running) {
-                       sds.this_load_per_task /= sds.this_nr_running;
-                       if (sds.busiest_load_per_task >
-                                       sds.this_load_per_task)
-                               imbn = 1;
-               } else
-                       sds.this_load_per_task =
-                               cpu_avg_load_per_task(this_cpu);
-
-               if (sds.max_load - sds.this_load +
-                       sds.busiest_load_per_task >=
-                               sds.busiest_load_per_task * imbn) {
-                       *imbalance = sds.busiest_load_per_task;
-                       return sds.busiest;
-               }
-
-               /*
-                * OK, we don't have enough imbalance to justify moving tasks,
-                * however we may be able to increase total CPU power used by
-                * moving them.
-                */
-
-               pwr_now += sds.busiest->__cpu_power *
-                               min(sds.busiest_load_per_task, sds.max_load);
-               pwr_now += sds.this->__cpu_power *
-                               min(sds.this_load_per_task, sds.this_load);
-               pwr_now /= SCHED_LOAD_SCALE;
-
-               /* Amount of load we'd subtract */
-               tmp = sg_div_cpu_power(sds.busiest,
-                               sds.busiest_load_per_task * SCHED_LOAD_SCALE);
-               if (sds.max_load > tmp)
-                       pwr_move += sds.busiest->__cpu_power *
-                               min(sds.busiest_load_per_task,
-                                               sds.max_load - tmp);
-
-               /* Amount of load we'd add */
-               if (sds.max_load * sds.busiest->__cpu_power <
-                               sds.busiest_load_per_task * SCHED_LOAD_SCALE)
-                       tmp = sg_div_cpu_power(sds.this,
-                               sds.max_load * sds.busiest->__cpu_power);
-               else
-                       tmp = sg_div_cpu_power(sds.this,
-                               sds.busiest_load_per_task * SCHED_LOAD_SCALE);
-               pwr_move += sds.this->__cpu_power *
-                               min(sds.this_load_per_task,
-                                       sds.this_load + tmp);
-               pwr_move /= SCHED_LOAD_SCALE;
-
-               /* Move if we gain throughput */
-               if (pwr_move > pwr_now)
-                       *imbalance = sds.busiest_load_per_task;
-       }
-
+       /* Looks like there is an imbalance. Compute it */
+       calculate_imbalance(&sds, this_cpu, imbalance);
         return sds.busiest;
  
  out_balanced:
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-       if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
-               goto ret;
-
-       if (sds.this != sds.group_leader || sds.group_leader == sds.group_min)
-               goto ret;
-
-       *imbalance = sds.min_load_per_task;
-       if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
-               cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
-                       group_first_cpu(sds.group_leader);
-       }
-       return sds.group_min;
-
-#endif
+       /*
+        * There is no obvious imbalance. But check if we can do some balancing
+        * to save power.
+        */
+       if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
+               return sds.busiest;
  ret:
         *imbalance = 0;
         return NULL;
@@ -3687,19 +3838,23 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
   */
  #define MAX_PINNED_INTERVAL    512
  
+/* Working cpumask for load_balance and load_balance_newidle. */
+static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+
  /*
   * Check this_cpu to ensure it is balanced within domain. Attempt to move
   * tasks if there is an imbalance.
   */
  static int load_balance(int this_cpu, struct rq *this_rq,
                         struct sched_domain *sd, enum cpu_idle_type idle,
-                       int *balance, struct cpumask *cpus)
+                       int *balance)
  {
         int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
         struct sched_group *group;
         unsigned long imbalance;
         struct rq *busiest;
         unsigned long flags;
+       struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
  
         cpumask_setall(cpus);
  
@@ -3854,8 +4009,7 @@ out:
   * this_rq is locked.
   */
  static int
-load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
-                       struct cpumask *cpus)
+load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
  {
         struct sched_group *group;
         struct rq *busiest = NULL;
@@ -3863,6 +4017,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
         int ld_moved = 0;
         int sd_idle = 0;
         int all_pinned = 0;
+       struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
  
         cpumask_setall(cpus);
  
@@ -4003,10 +4158,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
         struct sched_domain *sd;
         int pulled_task = 0;
         unsigned long next_balance = jiffies + HZ;
-       cpumask_var_t tmpmask;
-
-       if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
-               return;
  
         for_each_domain(this_cpu, sd) {
                 unsigned long interval;
@@ -4017,7 +4168,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
                 if (sd->flags & SD_BALANCE_NEWIDLE)
                         /* If we've pulled tasks over stop searching: */
                         pulled_task = load_balance_newidle(this_cpu, this_rq,
-                                                          sd, tmpmask);
+                                                          sd);
  
                 interval = msecs_to_jiffies(sd->balance_interval);
                 if (time_after(next_balance, sd->last_balance + interval))
@@ -4032,7 +4183,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
                  */
                 this_rq->next_balance = next_balance;
         }
-       free_cpumask_var(tmpmask);
  }
  
  /*
@@ -4182,11 +4332,6 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
         unsigned long next_balance = jiffies + 60*HZ;
         int update_next_balance = 0;
         int need_serialize;
-       cpumask_var_t tmp;
-
-       /* Fails alloc?  Rebalancing probably not a priority right now. */
-       if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
-               return;
  
         for_each_domain(cpu, sd) {
                 if (!(sd->flags & SD_LOAD_BALANCE))
@@ -4211,7 +4356,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
                 }
  
                 if (time_after_eq(jiffies, sd->last_balance + interval)) {
-                       if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
+                       if (load_balance(cpu, rq, sd, idle, &balance)) {
                                 /*
                                  * We've pulled tasks over so either we're no
                                  * longer idle, or one of our SMT siblings is
@@ -4245,8 +4390,6 @@ out:
          */
         if (likely(update_next_balance))
                 rq->next_balance = next_balance;
-
-       free_cpumask_var(tmp);
  }
  
  /*
@@ -4380,9 +4523,25 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
  EXPORT_PER_CPU_SYMBOL(kstat);
  
  /*
- * Return any ns on the sched_clock that have not yet been banked in
+ * Return any ns on the sched_clock that have not yet been accounted in
   * @p in case that task is currently running.
+ *
+ * Called with task_rq_lock() held on @rq.
   */
+static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
+{
+       u64 ns = 0;
+
+       if (task_current(rq, p)) {
+               update_rq_clock(rq);
+               ns = rq->clock - p->se.exec_start;
+               if ((s64)ns < 0)
+                       ns = 0;
+       }
+
+       return ns;
+}
+
  unsigned long long task_delta_exec(struct task_struct *p)
  {
         unsigned long flags;
@@ -4390,16 +4549,49 @@ unsigned long long task_delta_exec(struct task_struct *p)
         u64 ns = 0;
  
         rq = task_rq_lock(p, &flags);
+       ns = do_task_delta_exec(p, rq);
+       task_rq_unlock(rq, &flags);
  
-       if (task_current(rq, p)) {
-               u64 delta_exec;
+       return ns;
+}
  
-               update_rq_clock(rq);
-               delta_exec = rq->clock - p->se.exec_start;
-               if ((s64)delta_exec > 0)
-                       ns = delta_exec;
-       }
+/*
+ * Return accounted runtime for the task.
+ * In case the task is currently running, return the runtime plus current's
+ * pending runtime that have not been accounted yet.
+ */
+unsigned long long task_sched_runtime(struct task_struct *p)
+{
+       unsigned long flags;
+       struct rq *rq;
+       u64 ns = 0;
  
+       rq = task_rq_lock(p, &flags);
+       ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
+       task_rq_unlock(rq, &flags);
+
+       return ns;
+}
+
+/*
+ * Return sum_exec_runtime for the thread group.
+ * In case the task is currently running, return the sum plus current's
+ * pending runtime that have not been accounted yet.
+ *
+ * Note that the thread group might have other running tasks as well,
+ * so the return value not includes other pending runtime that other
+ * running tasks might have.
+ */
+unsigned long long thread_group_sched_runtime(struct task_struct *p)
+{
+       struct task_cputime totals;
+       unsigned long flags;
+       struct rq *rq;
+       u64 ns;
+
+       rq = task_rq_lock(p, &flags);
+       thread_group_cputime(p, &totals);
+       ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
         task_rq_unlock(rq, &flags);
  
         return ns;
@@ -4428,6 +4620,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
                 cpustat->nice = cputime64_add(cpustat->nice, tmp);
         else
                 cpustat->user = cputime64_add(cpustat->user, tmp);
+
+       cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
         /* Account for user time used */
         acct_update_integrals(p);
  }
@@ -4489,6 +4683,8 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
         else
                 cpustat->system = cputime64_add(cpustat->system, tmp);
  
+       cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+
         /* Account for system time used */
         acct_update_integrals(p);
  }
@@ -4650,10 +4846,7 @@ void scheduler_tick(void)
  #endif
  }
  
-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
-                               defined(CONFIG_PREEMPT_TRACER))
-
-static inline unsigned long get_parent_ip(unsigned long addr)
+unsigned long get_parent_ip(unsigned long addr)
  {
         if (in_lock_functions(addr)) {
                 addr = CALLER_ADDR2;
@@ -4663,6 +4856,9 @@ static inline unsigned long get_parent_ip(unsigned long addr)
         return addr;
  }
  
+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+                               defined(CONFIG_PREEMPT_TRACER))
+
  void __kprobes add_preempt_count(int val)
  {
  #ifdef CONFIG_DEBUG_PREEMPT
@@ -4811,15 +5007,13 @@ pick_next_task(struct rq *rq)
  /*
   * schedule() is the main scheduler function.
   */
-asmlinkage void __sched schedule(void)
+asmlinkage void __sched __schedule(void)
  {
         struct task_struct *prev, *next;
         unsigned long *switch_count;
         struct rq *rq;
         int cpu;
  
-need_resched:
-       preempt_disable();
         cpu = smp_processor_id();
         rq = cpu_rq(cpu);
         rcu_qsctr_inc(cpu);
@@ -4876,13 +5070,80 @@ need_resched_nonpreemptible:
  
         if (unlikely(reacquire_kernel_lock(current) < 0))
                 goto need_resched_nonpreemptible;
+}
  
+asmlinkage void __sched schedule(void)
+{
+need_resched:
+       preempt_disable();
+       __schedule();
         preempt_enable_no_resched();
         if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
                 goto need_resched;
  }
  EXPORT_SYMBOL(schedule);
  
+#ifdef CONFIG_SMP
+/*
+ * Look out! "owner" is an entirely speculative pointer
+ * access and not reliable.
+ */
+int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
+{
+       unsigned int cpu;
+       struct rq *rq;
+
+       if (!sched_feat(OWNER_SPIN))
+               return 0;
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+       /*
+        * Need to access the cpu field knowing that
+        * DEBUG_PAGEALLOC could have unmapped it if
+        * the mutex owner just released it and exited.
+        */
+       if (probe_kernel_address(&owner->cpu, cpu))
+               goto out;
+#else
+       cpu = owner->cpu;
+#endif
+
+       /*
+        * Even if the access succeeded (likely case),
+        * the cpu field may no longer be valid.
+        */
+       if (cpu >= nr_cpumask_bits)
+               goto out;
+
+       /*
+        * We need to validate that we can do a
+        * get_cpu() and that we have the percpu area.
+        */
+       if (!cpu_online(cpu))
+               goto out;
+
+       rq = cpu_rq(cpu);
+
+       for (;;) {
+               /*
+                * Owner changed, break to re-assess state.
+                */
+               if (lock->owner != owner)
+                       break;
+
+               /*
+                * Is that owner really running on that cpu?
+                */
+               if (task_thread_info(rq->curr) != owner || need_resched())
+                       return 0;
+
+               cpu_relax();
+       }
+out:
+       return 1;
+}
+#endif
+
  #ifdef CONFIG_PREEMPT
  /*
   * this is the entry point to schedule() from in-kernel preemption
@@ -5000,11 +5261,17 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
         __wake_up_common(q, mode, 1, 0, NULL);
  }
  
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+{
+       __wake_up_common(q, mode, 1, 0, key);
+}
+
  /**
- * __wake_up_sync - wake up threads blocked on a waitqueue.
+ * __wake_up_sync_key - wake up threads blocked on a waitqueue.
   * @q: the waitqueue
   * @mode: which threads
   * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: opaque value to be passed to wakeup targets
   *
   * The sync wakeup differs that the waker knows that it will schedule
   * away soon, so while the target thread will be woken up, it will not
@@ -5013,8 +5280,8 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
   *
   * On UP it can prevent extra preemption.
   */
-void
-__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
+                       int nr_exclusive, void *key)
  {
         unsigned long flags;
         int sync = 1;
@@ -5026,9 +5293,18 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
                 sync = 0;
  
         spin_lock_irqsave(&q->lock, flags);
-       __wake_up_common(q, mode, nr_exclusive, sync, NULL);
+       __wake_up_common(q, mode, nr_exclusive, sync, key);
         spin_unlock_irqrestore(&q->lock, flags);
  }
+EXPORT_SYMBOL_GPL(__wake_up_sync_key);
+
+/*
+ * __wake_up_sync - see __wake_up_sync_key()
+ */
+void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+{
+       __wake_up_sync_key(q, mode, nr_exclusive, NULL);
+}
  EXPORT_SYMBOL_GPL(__wake_up_sync);     /* For internal use only */
  
  /**
@@ -6212,12 +6488,7 @@ void sched_show_task(struct task_struct *p)
                 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
  #endif
  #ifdef CONFIG_DEBUG_STACK_USAGE
-       {
-               unsigned long *n = end_of_stack(p);
-               while (!*n)
-                       n++;
-               free = (unsigned long)n - (unsigned long)end_of_stack(p);
-       }
+       free = stack_not_used(p);
  #endif
         printk(KERN_CONT "%5lu %5d %6d\n", free,
                 task_pid_nr(p), task_pid_nr(p->real_parent));
@@ -7096,7 +7367,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
  
                 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
-               printk(KERN_CONT " %s", str);
+               printk(KERN_CONT " %s (__cpu_power = %d)", str,
+                                               group->__cpu_power);
  
                 group = group->next;
         } while (group != sd->groups);
@@ -7522,7 +7794,7 @@ cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
  {
         int group;
  
-       cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+       cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
         group = cpumask_first(mask);
         if (sg)
                 *sg = &per_cpu(sched_group_core, group).sg;
@@ -7551,7 +7823,7 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
         cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
         group = cpumask_first(mask);
  #elif defined(CONFIG_SCHED_SMT)
-       cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+       cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
         group = cpumask_first(mask);
  #else
         group = cpu;
@@ -7894,7 +8166,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
                 SD_INIT(sd, SIBLING);
                 set_domain_attribute(sd, attr);
                 cpumask_and(sched_domain_span(sd),
-                           &per_cpu(cpu_sibling_map, i), cpu_map);
+                           topology_thread_cpumask(i), cpu_map);
                 sd->parent = p;
                 p->child = sd;
                 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7905,7 +8177,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
         /* Set up CPU (sibling) groups */
         for_each_cpu(i, cpu_map) {
                 cpumask_and(this_sibling_map,
-                           &per_cpu(cpu_sibling_map, i), cpu_map);
+                           topology_thread_cpumask(i), cpu_map);
                 if (i != cpumask_first(this_sibling_map))
                         continue;
  
@@ -8581,6 +8853,9 @@ void __init sched_init(void)
  #ifdef CONFIG_USER_SCHED
         alloc_size *= 2;
  #endif
+#ifdef CONFIG_CPUMASK_OFFSTACK
+       alloc_size += num_possible_cpus() * cpumask_size();
+#endif
         /*
          * As sched_init() is called before page_alloc is setup,
          * we use alloc_bootmem().
@@ -8618,6 +8893,12 @@ void __init sched_init(void)
                 ptr += nr_cpu_ids * sizeof(void **);
  #endif /* CONFIG_USER_SCHED */
  #endif /* CONFIG_RT_GROUP_SCHED */
+#ifdef CONFIG_CPUMASK_OFFSTACK
+               for_each_possible_cpu(i) {
+                       per_cpu(load_balance_tmpmask, i) = (void *)ptr;
+                       ptr += cpumask_size();
+               }
+#endif /* CONFIG_CPUMASK_OFFSTACK */
         }
  
  #ifdef CONFIG_SMP
@@ -9710,6 +9991,7 @@ struct cpuacct {
         struct cgroup_subsys_state css;
         /* cpuusage holds pointer to a u64-type object on every cpu */
         u64 *cpuusage;
+       struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
         struct cpuacct *parent;
  };
  
@@ -9734,20 +10016,32 @@ static struct cgroup_subsys_state *cpuacct_create(
         struct cgroup_subsys *ss, struct cgroup *cgrp)
  {
         struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+       int i;
  
         if (!ca)
-               return ERR_PTR(-ENOMEM);
+               goto out;
  
         ca->cpuusage = alloc_percpu(u64);
-       if (!ca->cpuusage) {
-               kfree(ca);
-               return ERR_PTR(-ENOMEM);
-       }
+       if (!ca->cpuusage)
+               goto out_free_ca;
+
+       for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+               if (percpu_counter_init(&ca->cpustat[i], 0))
+                       goto out_free_counters;
  
         if (cgrp->parent)
                 ca->parent = cgroup_ca(cgrp->parent);
  
         return &ca->css;
+
+out_free_counters:
+       while (--i >= 0)
+               percpu_counter_destroy(&ca->cpustat[i]);
+       free_percpu(ca->cpuusage);
+out_free_ca:
+       kfree(ca);
+out:
+       return ERR_PTR(-ENOMEM);
  }
  
  /* destroy an existing cpu accounting group */
@@ -9755,14 +10049,17 @@ static void
  cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
  {
         struct cpuacct *ca = cgroup_ca(cgrp);
+       int i;
  
+       for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+               percpu_counter_destroy(&ca->cpustat[i]);
         free_percpu(ca->cpuusage);
         kfree(ca);
  }
  
  static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
  {
-       u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+       u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
         u64 data;
  
  #ifndef CONFIG_64BIT
@@ -9781,7 +10078,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
  
  static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
  {
-       u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+       u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
  
  #ifndef CONFIG_64BIT
         /*
@@ -9842,6 +10139,25 @@ static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
         return 0;
  }
  
+static const char *cpuacct_stat_desc[] = {
+       [CPUACCT_STAT_USER] = "user",
+       [CPUACCT_STAT_SYSTEM] = "system",
+};
+
+static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
+               struct cgroup_map_cb *cb)
+{
+       struct cpuacct *ca = cgroup_ca(cgrp);
+       int i;
+
+       for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
+               s64 val = percpu_counter_read(&ca->cpustat[i]);
+               val = cputime64_to_clock_t(val);
+               cb->fill(cb, cpuacct_stat_desc[i], val);
+       }
+       return 0;
+}
+
  static struct cftype files[] = {
         {
                 .name = "usage",
@@ -9852,7 +10168,10 @@ static struct cftype files[] = {
                 .name = "usage_percpu",
                 .read_seq_string = cpuacct_percpu_seq_read,
         },
-
+       {
+               .name = "stat",
+               .read_map = cpuacct_stats_show,
+       },
  };
  
  static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -9874,12 +10193,38 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
                 return;
  
         cpu = task_cpu(tsk);
+
+       rcu_read_lock();
+
         ca = task_ca(tsk);
  
         for (; ca; ca = ca->parent) {
-               u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+               u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
                 *cpuusage += cputime;
         }
+
+       rcu_read_unlock();
+}
+
+/*
+ * Charge the system/user time to the task's accounting group.
+ */
+static void cpuacct_update_stats(struct task_struct *tsk,
+               enum cpuacct_stat_index idx, cputime_t val)
+{
+       struct cpuacct *ca;
+
+       if (unlikely(!cpuacct_subsys.active))
+               return;
+
+       rcu_read_lock();
+       ca = task_ca(tsk);
+
+       do {
+               percpu_counter_add(&ca->cpustat[idx], val);
+               ca = ca->parent;
+       } while (ca);
+       rcu_read_unlock();
  }
  
  struct cgroup_subsys cpuacct_subsys = {