sched: make the scheduler converge to the ideal latency
diff --git a/kernel/sched.c b/kernel/sched.c
index 6247e4a..b533d6d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -262,7 +262,8 @@ struct rq {
        s64 clock_max_delta;
 
        unsigned int clock_warps, clock_overflows;
-       unsigned int clock_unstable_events;
+       u64 idle_clock;
+       unsigned int clock_deep_idle_events;
        u64 tick_timestamp;
 
        atomic_t nr_iowait;
@@ -556,18 +557,40 @@ static inline struct rq *this_rq_lock(void)
 }
 
 /*
- * CPU frequency is/was unstable - start new by setting prev_clock_raw:
+ * We are going deep-idle (irqs are disabled):
  */
-void sched_clock_unstable_event(void)
+void sched_clock_idle_sleep_event(void)
 {
-       unsigned long flags;
-       struct rq *rq;
+       struct rq *rq = cpu_rq(smp_processor_id());
 
-       rq = task_rq_lock(current, &flags);
-       rq->prev_clock_raw = sched_clock();
-       rq->clock_unstable_events++;
-       task_rq_unlock(rq, &flags);
+       spin_lock(&rq->lock);
+       __update_rq_clock(rq);
+       spin_unlock(&rq->lock);
+       rq->clock_deep_idle_events++;
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
+
+/*
+ * We just idled delta nanoseconds (called with irqs disabled):
+ */
+void sched_clock_idle_wakeup_event(u64 delta_ns)
+{
+       struct rq *rq = cpu_rq(smp_processor_id());
+       u64 now = sched_clock();
+
+       rq->idle_clock += delta_ns;
+       /*
+        * Override the previous timestamp and ignore all
+        * sched_clock() deltas that occurred while we idled,
+        * and use the PM-provided delta_ns to advance the
+        * rq clock:
+        */
+       spin_lock(&rq->lock);
+       rq->prev_clock_raw = now;
+       rq->clock += delta_ns;
+       spin_unlock(&rq->lock);
 }
+EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
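
The two new events are designed to bracket a deep C-state from the platform
idle code, which can measure the sleep with a timer that keeps counting while
sched_clock()'s underlying counter (typically the TSC) stops. A minimal
caller sketch; read_pm_timer() and do_deep_idle() are hypothetical stand-ins
for the idle driver's own primitives, not part of this patch:

        /* irqs are already disabled on the idle path */
        static void deep_idle_sketch(void)
        {
                u64 t1, t2;

                /* sync rq->clock with sched_clock() while it is still sane */
                sched_clock_idle_sleep_event();

                t1 = read_pm_timer();   /* hypothetical: keeps running in deep idle */
                do_deep_idle();         /* hypothetical: enter C3 or deeper */
                t2 = read_pm_timer();

                /* advance rq->clock by the measured sleep, in nanoseconds */
                sched_clock_idle_wakeup_event(t2 - t1);
        }

On wakeup the rq clock advances by exactly the measured delta, while
prev_clock_raw is overwritten so the bogus sched_clock() deltas accumulated
during the sleep are never folded in.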
 
 /*
  * resched_task - mark a task 'to be rescheduled now'.
@@ -1564,6 +1587,7 @@ static void __sched_fork(struct task_struct *p)
        p->se.wait_start_fair           = 0;
        p->se.exec_start                = 0;
        p->se.sum_exec_runtime          = 0;
+       p->se.prev_sum_exec_runtime     = 0;
        p->se.delta_exec                = 0;
        p->se.delta_fair_run            = 0;
        p->se.delta_fair_sleep          = 0;
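
The new prev_sum_exec_runtime field is the heart of the "converge to the
ideal latency" idea: it snapshots sum_exec_runtime when a task is picked, so
the tick can see how long the task has run in its current slice rather than
over its lifetime. A sketch of the consumer, with sched_slice_ns() as a
hypothetical stand-in for the slice-length computation in sched_fair.c:

        static void tick_preempt_sketch(struct task_struct *p)
        {
                /* runtime accumulated since this task was last picked */
                u64 ran = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime;

                /* consumed its share of the latency period? then preempt */
                if (ran > sched_slice_ns())
                        resched_task(p);
        }

The companion sched_fair.c change (not in this file's diff) resets
prev_sum_exec_runtime to sum_exec_runtime each time an entity is picked to
run, restarting the measurement for the next slice.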
@@ -2157,12 +2181,6 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
        if (task_running(rq, p))
                return 0;
 
-       /*
-        * Aggressive migration if too many balance attempts have failed:
-        */
-       if (sd->nr_balance_failed > sd->cache_nice_tries)
-               return 1;
-
        return 1;
 }
 
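Note that the deleted test had no effect: as the surrounding context shows,
the fall-through also returns 1, so the nr_balance_failed escalation had
already become dead code here and this hunk is purely a cleanup.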
@@ -2494,7 +2512,7 @@ group_next:
         * a think about bumping its value to force at least one task to be
         * moved
         */
-       if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
+       if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task) {
                unsigned long tmp, pwr_now, pwr_move;
                unsigned int imbn;
 
@@ -3020,6 +3038,7 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
        struct sched_domain *sd;
        /* Earliest time when we have to do rebalance again */
        unsigned long next_balance = jiffies + 60*HZ;
+       int update_next_balance = 0;
 
        for_each_domain(cpu, sd) {
                if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3056,8 +3075,10 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
                if (sd->flags & SD_SERIALIZE)
                        spin_unlock(&balancing);
 out:
-               if (time_after(next_balance, sd->last_balance + interval))
+               if (time_after(next_balance, sd->last_balance + interval)) {
                        next_balance = sd->last_balance + interval;
+                       update_next_balance = 1;
+               }
 
                /*
                 * Stop the load balance at this level. There is another
@@ -3067,7 +3088,14 @@ out:
                if (!balance)
                        break;
        }
-       rq->next_balance = next_balance;
+
+       /*
+        * next_balance will be updated only when there is a need.
+        * When the cpu is attached to a null domain, for example, it
+        * will not be updated.
+        */
+       if (likely(update_next_balance))
+               rq->next_balance = next_balance;
 }
 
 /*
@@ -3106,7 +3134,7 @@ static void run_rebalance_domains(struct softirq_action *h)
                        if (need_resched())
                                break;
 
-                       rebalance_domains(balance_cpu, SCHED_IDLE);
+                       rebalance_domains(balance_cpu, CPU_IDLE);
 
                        rq = cpu_rq(balance_cpu);
                        if (time_after(this_rq->next_balance, rq->next_balance))
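
The old argument compiled only because SCHED_IDLE is a bare integer
constant: it names a scheduling policy, while rebalance_domains() takes an
enum cpu_idle_type. The two namespaces as this kernel era defined them
(values from memory, treat as illustrative):

        /* scheduling policy, include/linux/sched.h -- passed by mistake */
        #define SCHED_IDLE              5

        /* what rebalance_domains() actually expects */
        enum cpu_idle_type {
                CPU_IDLE,
                CPU_NOT_IDLE,
                CPU_NEWLY_IDLE,
                CPU_MAX_IDLE_TYPES
        };

Since SCHED_IDLE is not CPU_IDLE (and is >= CPU_MAX_IDLE_TYPES), the nohz
path behaved as if the CPU were busy and could index the per-idle-type
schedstat arrays out of range.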
@@ -4884,14 +4912,18 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
 static inline void sched_init_granularity(void)
 {
        unsigned int factor = 1 + ilog2(num_online_cpus());
-       const unsigned long gran_limit = 100000000;
+       const unsigned long limit = 100000000;
+
+       sysctl_sched_min_granularity *= factor;
+       if (sysctl_sched_min_granularity > limit)
+               sysctl_sched_min_granularity = limit;
 
-       sysctl_sched_granularity *= factor;
-       if (sysctl_sched_granularity > gran_limit)
-               sysctl_sched_granularity = gran_limit;
+       sysctl_sched_latency *= factor;
+       if (sysctl_sched_latency > limit)
+               sysctl_sched_latency = limit;
 
-       sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
-       sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
+       sysctl_sched_runtime_limit = sysctl_sched_latency;
+       sysctl_sched_wakeup_granularity = sysctl_sched_min_granularity / 2;
 }
 
 #ifdef CONFIG_SMP
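
The rescaling is easy to check by hand. On a 4-CPU machine, factor =
1 + ilog2(4) = 3; with this era's defaults (assumed here: sysctl_sched_latency
= 20 ms and sysctl_sched_min_granularity = 2 ms) boot scales them to 60 ms
and 6 ms, both well under the 100 ms (100000000 ns) cap. For the latency knob
the cap first bites at 32 CPUs, where factor 6 would give 120 ms. The runtime
limit now tracks the latency target directly, and the wakeup granularity
derives from the minimum granularity, replacing the old single
sysctl_sched_granularity knob.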
@@ -5234,15 +5266,16 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 static struct ctl_table sd_ctl_dir[] = {
        {
                .procname       = "sched_domain",
-               .mode           = 0755,
+               .mode           = 0555,
        },
        {0,},
 };
 
 static struct ctl_table sd_ctl_root[] = {
        {
+               .ctl_name       = CTL_KERN,
                .procname       = "kernel",
-               .mode           = 0755,
+               .mode           = 0555,
                .child          = sd_ctl_dir,
        },
        {0,},
@@ -5318,7 +5351,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
        for_each_domain(cpu, sd) {
                snprintf(buf, 32, "domain%d", i);
                entry->procname = kstrdup(buf, GFP_KERNEL);
-               entry->mode = 0755;
+               entry->mode = 0555;
                entry->child = sd_alloc_ctl_domain_table(sd);
                entry++;
                i++;
@@ -5338,7 +5371,7 @@ static void init_sched_domain_sysctl(void)
        for (i = 0; i < cpu_num; i++, entry++) {
                snprintf(buf, 32, "cpu%d", i);
                entry->procname = kstrdup(buf, GFP_KERNEL);
-               entry->mode = 0755;
+               entry->mode = 0555;
                entry->child = sd_alloc_ctl_cpu_table(i);
        }
        sd_sysctl_header = register_sysctl_table(sd_ctl_root);
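
The mode changes are the substance of these sysctl hunks: every node touched
here is a directory under /proc/sys/kernel/sched_domain/, and 0755
(rwxr-xr-x) advertised an owner-write bit that nothing implements, while
0555 (r-xr-xr-x) is the conventional mode for sysctl directories. The added
.ctl_name = CTL_KERN identifies the root entry as the standard "kernel"
directory for the binary sysctl(2) interface.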
@@ -6328,7 +6361,7 @@ int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
 }
 
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-int arch_reinit_sched_domains(void)
+static int arch_reinit_sched_domains(void)
 {
        int err;
 
@@ -6357,24 +6390,6 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
        return ret ? ret : count;
 }
 
-int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
-{
-       int err = 0;
-
-#ifdef CONFIG_SCHED_SMT
-       if (smt_capable())
-               err = sysfs_create_file(&cls->kset.kobj,
-                                       &attr_sched_smt_power_savings.attr);
-#endif
-#ifdef CONFIG_SCHED_MC
-       if (!err && mc_capable())
-               err = sysfs_create_file(&cls->kset.kobj,
-                                       &attr_sched_mc_power_savings.attr);
-#endif
-       return err;
-}
-#endif
-
 #ifdef CONFIG_SCHED_MC
 static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
 {
@@ -6385,8 +6400,8 @@ static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
 {
        return sched_power_savings_store(buf, count, 0);
 }
-SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
-           sched_mc_power_savings_store);
+static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
+                  sched_mc_power_savings_store);
 #endif
 
 #ifdef CONFIG_SCHED_SMT
@@ -6399,8 +6414,26 @@ static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
 {
        return sched_power_savings_store(buf, count, 1);
 }
-SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
-           sched_smt_power_savings_store);
+static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
+                  sched_smt_power_savings_store);
+#endif
+
+int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+{
+       int err = 0;
+
+#ifdef CONFIG_SCHED_SMT
+       if (smt_capable())
+               err = sysfs_create_file(&cls->kset.kobj,
+                                       &attr_sched_smt_power_savings.attr);
+#endif
+#ifdef CONFIG_SCHED_MC
+       if (!err && mc_capable())
+               err = sysfs_create_file(&cls->kset.kobj,
+                                       &attr_sched_mc_power_savings.attr);
+#endif
+       return err;
+}
 #endif
 
 /*
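
This last hunk is a pure move: sched_create_sysfs_power_savings_entries()
takes the addresses of attr_sched_smt_power_savings and
attr_sched_mc_power_savings, and those are the very variables that
SYSDEV_ATTR() defines, so the function must follow the attribute definitions
before they can be given static linkage. The macro's shape, quoted from
memory from include/linux/sysdev.h of this era:

        #define SYSDEV_ATTR(_name, _mode, _show, _store)        \
                struct sysdev_attribute attr_##_name =          \
                        _SYSDEV_ATTR(_name, _mode, _show, _store);

So "static SYSDEV_ATTR(sched_mc_power_savings, ...)" defines a file-local
struct sysdev_attribute named attr_sched_mc_power_savings, which the moved
function then hands to sysfs_create_file().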