Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/sameo/mfd-2.6

[safe/jmp/linux-2.6] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index 1f9c6d9..fd05861 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -298,7 +298,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
  
  #ifdef CONFIG_RT_GROUP_SCHED
  static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
  #endif /* CONFIG_RT_GROUP_SCHED */
  #else /* !CONFIG_USER_SCHED */
  #define root_task_group init_task_group
@@ -814,6 +814,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
   * default: 0.25ms
   */
  unsigned int sysctl_sched_shares_ratelimit = 250000;
+unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
  
  /*
   * Inject some fuzzyness into changing the per-cpu group shares
@@ -1614,7 +1615,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
   */
  static int tg_shares_up(struct task_group *tg, void *data)
  {
-       unsigned long weight, rq_weight = 0, shares = 0;
+       unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
         unsigned long *usd_rq_weight;
         struct sched_domain *sd = data;
         unsigned long flags;
@@ -1630,6 +1631,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
                 weight = tg->cfs_rq[i]->load.weight;
                 usd_rq_weight[i] = weight;
  
+               rq_weight += weight;
                 /*
                  * If there are currently no tasks on the cpu pretend there
                  * is one of average load so that when a new task gets to
@@ -1638,10 +1640,13 @@ static int tg_shares_up(struct task_group *tg, void *data)
                 if (!weight)
                         weight = NICE_0_LOAD;
  
-               rq_weight += weight;
+               sum_weight += weight;
                 shares += tg->cfs_rq[i]->shares;
         }
  
+       if (!rq_weight)
+               rq_weight = sum_weight;
+
         if ((!shares && rq_weight) || shares > tg->shares)
                 shares = tg->shares;
  
@@ -1810,6 +1815,22 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
  #endif
  
  static void calc_load_account_active(struct rq *this_rq);
+static void update_sysctl(void);
+static int get_update_sysctl_factor(void);
+
+static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+       set_task_rq(p, cpu);
+#ifdef CONFIG_SMP
+       /*
+        * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
+        * successfuly executed on another CPU. We must ensure that updates of
+        * per-task data have been completed by this moment.
+        */
+       smp_wmb();
+       task_thread_info(p)->cpu = cpu;
+#endif
+}
  
  #include "sched_stats.h"
  #include "sched_idletask.c"
@@ -1967,20 +1988,6 @@ inline int task_curr(const struct task_struct *p)
         return cpu_curr(task_cpu(p)) == p;
  }
  
-static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
-{
-       set_task_rq(p, cpu);
-#ifdef CONFIG_SMP
-       /*
-        * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
-        * successfuly executed on another CPU. We must ensure that updates of
-        * per-task data have been completed by this moment.
-        */
-       smp_wmb();
-       task_thread_info(p)->cpu = cpu;
-#endif
-}
-
  static inline void check_class_changed(struct rq *rq, struct task_struct *p,
                                        const struct sched_class *prev_class,
                                        int oldprio, int running)
@@ -2060,7 +2067,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
  void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
  {
         int old_cpu = task_cpu(p);
-       struct rq *old_rq = cpu_rq(old_cpu);
         struct cfs_rq *old_cfsrq = task_cfs_rq(p),
                       *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
  
@@ -2068,10 +2074,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
  
         if (old_cpu != new_cpu) {
                 p->se.nr_migrations++;
-#ifdef CONFIG_SCHEDSTATS
-               if (task_hot(p, old_rq->clock, NULL))
-                       schedstat_inc(p, se.nr_forced2_migrations);
-#endif
                 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
                                      1, 1, NULL, 0);
         }
@@ -2371,17 +2373,14 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
         if (task_contributes_to_load(p))
                 rq->nr_uninterruptible--;
         p->state = TASK_WAKING;
-       task_rq_unlock(rq, &flags);
+       __task_rq_unlock(rq);
  
         cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
-       if (cpu != orig_cpu) {
-               local_irq_save(flags);
-               rq = cpu_rq(cpu);
-               update_rq_clock(rq);
+       if (cpu != orig_cpu)
                 set_task_cpu(p, cpu);
-               local_irq_restore(flags);
-       }
-       rq = task_rq_lock(p, &flags);
+
+       rq = __task_rq_lock(p);
+       update_rq_clock(rq);
  
         WARN_ON(p->state != TASK_WAKING);
         cpu = task_cpu(p);
@@ -2496,7 +2495,6 @@ static void __sched_fork(struct task_struct *p)
         p->se.avg_overlap               = 0;
         p->se.start_runtime             = 0;
         p->se.avg_wakeup                = sysctl_sched_wakeup_granularity;
-       p->se.avg_running               = 0;
  
  #ifdef CONFIG_SCHEDSTATS
         p->se.wait_start                        = 0;
@@ -2518,7 +2516,6 @@ static void __sched_fork(struct task_struct *p)
         p->se.nr_failed_migrations_running      = 0;
         p->se.nr_failed_migrations_hot          = 0;
         p->se.nr_forced_migrations              = 0;
-       p->se.nr_forced2_migrations             = 0;
  
         p->se.nr_wakeups                        = 0;
         p->se.nr_wakeups_sync                   = 0;
@@ -2555,7 +2552,6 @@ static void __sched_fork(struct task_struct *p)
  void sched_fork(struct task_struct *p, int clone_flags)
  {
         int cpu = get_cpu();
-       unsigned long flags;
  
         __sched_fork(p);
  
@@ -2589,13 +2585,13 @@ void sched_fork(struct task_struct *p, int clone_flags)
         if (!rt_prio(p->prio))
                 p->sched_class = &fair_sched_class;
  
+       if (p->sched_class->task_fork)
+               p->sched_class->task_fork(p);
+
  #ifdef CONFIG_SMP
         cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
  #endif
-       local_irq_save(flags);
-       update_rq_clock(cpu_rq(cpu));
         set_task_cpu(p, cpu);
-       local_irq_restore(flags);
  
  #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
         if (likely(sched_info_on()))
@@ -2628,17 +2624,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
         rq = task_rq_lock(p, &flags);
         BUG_ON(p->state != TASK_RUNNING);
         update_rq_clock(rq);
-
-       if (!p->sched_class->task_new || !current->se.on_rq) {
-               activate_task(rq, p, 0);
-       } else {
-               /*
-                * Let the scheduling class do new task startup
-                * management (if any):
-                */
-               p->sched_class->task_new(rq, p);
-               inc_nr_running(rq);
-       }
+       activate_task(rq, p, 0);
         trace_sched_wakeup_new(rq, p, 1);
         check_preempt_curr(rq, p, WF_FORK);
  #ifdef CONFIG_SMP
@@ -3169,10 +3155,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
         deactivate_task(src_rq, p, 0);
         set_task_cpu(p, this_cpu);
         activate_task(this_rq, p, 0);
-       /*
-        * Note that idle threads have a prio of MAX_PRIO, for this test
-        * to be always true for them.
-        */
         check_preempt_curr(this_rq, p, 0);
  }
  
@@ -5393,13 +5375,14 @@ static inline void schedule_debug(struct task_struct *prev)
  #endif
  }
  
-static void put_prev_task(struct rq *rq, struct task_struct *p)
+static void put_prev_task(struct rq *rq, struct task_struct *prev)
  {
-       u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime;
+       if (prev->state == TASK_RUNNING) {
+               u64 runtime = prev->se.sum_exec_runtime;
  
-       update_avg(&p->se.avg_running, runtime);
+               runtime -= prev->se.prev_sum_exec_runtime;
+               runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
  
-       if (p->state == TASK_RUNNING) {
                 /*
                  * In order to avoid avg_overlap growing stale when we are
                  * indeed overlapping and hence not getting put to sleep, grow
@@ -5409,12 +5392,9 @@ static void put_prev_task(struct rq *rq, struct task_struct *p)
                  * correlates to the amount of cache footprint a task can
                  * build up.
                  */
-               runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
-               update_avg(&p->se.avg_overlap, runtime);
-       } else {
-               update_avg(&p->se.avg_running, 0);
+               update_avg(&prev->se.avg_overlap, runtime);
         }
-       p->sched_class->put_prev_task(rq, p);
+       prev->sched_class->put_prev_task(rq, prev);
  }
  
  /*
@@ -7005,7 +6985,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
         __sched_fork(idle);
         idle->se.exec_start = sched_clock();
  
-       idle->prio = idle->normal_prio = MAX_PRIO;
         cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
         __set_task_cpu(idle, cpu);
  
@@ -7046,22 +7025,43 @@ cpumask_var_t nohz_cpu_mask;
   *
   * This idea comes from the SD scheduler of Con Kolivas:
   */
-static inline void sched_init_granularity(void)
+static int get_update_sysctl_factor(void)
  {
-       unsigned int factor = 1 + ilog2(num_online_cpus());
-       const unsigned long limit = 200000000;
+       unsigned int cpus = min_t(int, num_online_cpus(), 8);
+       unsigned int factor;
+
+       switch (sysctl_sched_tunable_scaling) {
+       case SCHED_TUNABLESCALING_NONE:
+               factor = 1;
+               break;
+       case SCHED_TUNABLESCALING_LINEAR:
+               factor = cpus;
+               break;
+       case SCHED_TUNABLESCALING_LOG:
+       default:
+               factor = 1 + ilog2(cpus);
+               break;
+       }
  
-       sysctl_sched_min_granularity *= factor;
-       if (sysctl_sched_min_granularity > limit)
-               sysctl_sched_min_granularity = limit;
+       return factor;
+}
  
-       sysctl_sched_latency *= factor;
-       if (sysctl_sched_latency > limit)
-               sysctl_sched_latency = limit;
+static void update_sysctl(void)
+{
+       unsigned int factor = get_update_sysctl_factor();
  
-       sysctl_sched_wakeup_granularity *= factor;
+#define SET_SYSCTL(name) \
+       (sysctl_##name = (factor) * normalized_sysctl_##name)
+       SET_SYSCTL(sched_min_granularity);
+       SET_SYSCTL(sched_latency);
+       SET_SYSCTL(sched_wakeup_granularity);
+       SET_SYSCTL(sched_shares_ratelimit);
+#undef SET_SYSCTL
+}
  
-       sysctl_sched_shares_ratelimit *= factor;
+static inline void sched_init_granularity(void)
+{
+       update_sysctl();
  }
  
  #ifdef CONFIG_SMP
@@ -7449,17 +7449,16 @@ static struct ctl_table sd_ctl_dir[] = {
                 .procname       = "sched_domain",
                 .mode           = 0555,
         },
-       {0, },
+       {}
  };
  
  static struct ctl_table sd_ctl_root[] = {
         {
-               .ctl_name       = CTL_KERN,
                 .procname       = "kernel",
                 .mode           = 0555,
                 .child          = sd_ctl_dir,
         },
-       {0, },
+       {}
  };
  
  static struct ctl_table *sd_alloc_ctl_entry(int n)
@@ -7709,7 +7708,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 spin_lock_irq(&rq->lock);
                 update_rq_clock(rq);
                 deactivate_task(rq, rq->idle, 0);
-               rq->idle->static_prio = MAX_PRIO;
                 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
                 rq->idle->sched_class = &idle_sched_class;
                 migrate_dead_tasks(cpu);
@@ -8288,14 +8286,14 @@ enum s_alloc {
   */
  #ifdef CONFIG_SCHED_SMT
  static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
+static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
  
  static int
  cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
                  struct sched_group **sg, struct cpumask *unused)
  {
         if (sg)
-               *sg = &per_cpu(sched_group_cpus, cpu).sg;
+               *sg = &per_cpu(sched_groups, cpu).sg;
         return cpu;
  }
  #endif /* CONFIG_SCHED_SMT */
@@ -9585,7 +9583,7 @@ void __init sched_init(void)
  #elif defined CONFIG_USER_SCHED
                 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
                 init_tg_rt_entry(&init_task_group,
-                               &per_cpu(init_rt_rq, i),
+                               &per_cpu(init_rt_rq_var, i),
                                 &per_cpu(init_sched_rt_entity, i), i, 1,
                                 root_task_group.rt_se[i]);
  #endif
@@ -9850,13 +9848,15 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
                 se = kzalloc_node(sizeof(struct sched_entity),
                                   GFP_KERNEL, cpu_to_node(i));
                 if (!se)
-                       goto err;
+                       goto err_free_rq;
  
                 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
         }
  
         return 1;
  
+ err_free_rq:
+       kfree(cfs_rq);
   err:
         return 0;
  }
@@ -9938,13 +9938,15 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
                 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
                                      GFP_KERNEL, cpu_to_node(i));
                 if (!rt_se)
-                       goto err;
+                       goto err_free_rq;
  
                 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
         }
  
         return 1;
  
+ err_free_rq:
+       kfree(rt_rq);
   err:
         return 0;
  }