Merge branch 'linus' into tracing/urgent

[safe/jmp/linux-2.6] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index 14c447a..1b59e26 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,6 +39,7 @@
  #include <linux/completion.h>
  #include <linux/kernel_stat.h>
  #include <linux/debug_locks.h>
+#include <linux/perf_counter.h>
  #include <linux/security.h>
  #include <linux/notifier.h>
  #include <linux/profile.h>
@@ -68,7 +69,6 @@
  #include <linux/pagemap.h>
  #include <linux/hrtimer.h>
  #include <linux/tick.h>
-#include <linux/bootmem.h>
  #include <linux/debugfs.h>
  #include <linux/ctype.h>
  #include <linux/ftrace.h>
@@ -240,7 +240,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
                 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
                 delta = ktime_to_ns(ktime_sub(hard, soft));
                 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
-                               HRTIMER_MODE_ABS, 0);
+                               HRTIMER_MODE_ABS_PINNED, 0);
         }
         spin_unlock(&rt_b->rt_runtime_lock);
  }
@@ -493,6 +493,7 @@ struct rt_rq {
  #endif
  #ifdef CONFIG_SMP
         unsigned long rt_nr_migratory;
+       unsigned long rt_nr_total;
         int overloaded;
         struct plist_head pushable_tasks;
  #endif
@@ -580,6 +581,7 @@ struct rq {
         struct load_weight load;
         unsigned long nr_load_updates;
         u64 nr_switches;
+       u64 nr_migrations_in;
  
         struct cfs_rq cfs;
         struct rt_rq rt;
@@ -692,7 +694,7 @@ static inline int cpu_of(struct rq *rq)
  #define task_rq(p)             cpu_rq(task_cpu(p))
  #define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
  
-static inline void update_rq_clock(struct rq *rq)
+inline void update_rq_clock(struct rq *rq)
  {
         rq->clock = sched_clock_cpu(cpu_of(rq));
  }
@@ -1154,7 +1156,7 @@ static __init void init_hrtick(void)
  static void hrtick_start(struct rq *rq, u64 delay)
  {
         __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
-                       HRTIMER_MODE_REL, 0);
+                       HRTIMER_MODE_REL_PINNED, 0);
  }
  
  static inline void init_hrtick(void)
@@ -1969,12 +1971,17 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
                 p->se.sleep_start -= clock_offset;
         if (p->se.block_start)
                 p->se.block_start -= clock_offset;
+#endif
         if (old_cpu != new_cpu) {
-               schedstat_inc(p, se.nr_migrations);
+               p->se.nr_migrations++;
+               new_rq->nr_migrations_in++;
+#ifdef CONFIG_SCHEDSTATS
                 if (task_hot(p, old_rq->clock, NULL))
                         schedstat_inc(p, se.nr_forced2_migrations);
-       }
  #endif
+               perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS,
+                                    1, 1, NULL, 0);
+       }
         p->se.vruntime -= old_cfsrq->min_vruntime -
                                          new_cfsrq->min_vruntime;
  
@@ -2187,6 +2194,7 @@ void kick_process(struct task_struct *p)
                 smp_send_reschedule(cpu);
         preempt_enable();
  }
+EXPORT_SYMBOL_GPL(kick_process);
  
  /*
   * Return a low guess at the load of a migration-source cpu weighted
@@ -2369,6 +2377,27 @@ static int sched_balance_self(int cpu, int flag)
  
  #endif /* CONFIG_SMP */
  
+/**
+ * task_oncpu_function_call - call a function on the cpu on which a task runs
+ * @p:         the task to evaluate
+ * @func:      the function to be called
+ * @info:      the function call argument
+ *
+ * Calls the function @func only if the task is currently running. This might
+ * be on the current CPU, in which case the function is called directly.
+ */
+void task_oncpu_function_call(struct task_struct *p,
+                             void (*func) (void *info), void *info)
+{
+       int cpu;
+
+       preempt_disable();
+       cpu = task_cpu(p);
+       if (task_curr(p))
+               smp_call_function_single(cpu, func, info, 1);
+       preempt_enable();
+}
+
  /***
   * try_to_wake_up - wake up a thread
   * @p: the to-be-woken-up thread
@@ -2536,21 +2565,44 @@ static void __sched_fork(struct task_struct *p)
         p->se.exec_start                = 0;
         p->se.sum_exec_runtime          = 0;
         p->se.prev_sum_exec_runtime     = 0;
+       p->se.nr_migrations             = 0;
         p->se.last_wakeup               = 0;
         p->se.avg_overlap               = 0;
         p->se.start_runtime             = 0;
         p->se.avg_wakeup                = sysctl_sched_wakeup_granularity;
  
  #ifdef CONFIG_SCHEDSTATS
-       p->se.wait_start                = 0;
-       p->se.sum_sleep_runtime         = 0;
-       p->se.sleep_start               = 0;
-       p->se.block_start               = 0;
-       p->se.sleep_max                 = 0;
-       p->se.block_max                 = 0;
-       p->se.exec_max                  = 0;
-       p->se.slice_max                 = 0;
-       p->se.wait_max                  = 0;
+       p->se.wait_start                        = 0;
+       p->se.wait_max                          = 0;
+       p->se.wait_count                        = 0;
+       p->se.wait_sum                          = 0;
+
+       p->se.sleep_start                       = 0;
+       p->se.sleep_max                         = 0;
+       p->se.sum_sleep_runtime                 = 0;
+
+       p->se.block_start                       = 0;
+       p->se.block_max                         = 0;
+       p->se.exec_max                          = 0;
+       p->se.slice_max                         = 0;
+
+       p->se.nr_migrations_cold                = 0;
+       p->se.nr_failed_migrations_affine       = 0;
+       p->se.nr_failed_migrations_running      = 0;
+       p->se.nr_failed_migrations_hot          = 0;
+       p->se.nr_forced_migrations              = 0;
+       p->se.nr_forced2_migrations             = 0;
+
+       p->se.nr_wakeups                        = 0;
+       p->se.nr_wakeups_sync                   = 0;
+       p->se.nr_wakeups_migrate                = 0;
+       p->se.nr_wakeups_local                  = 0;
+       p->se.nr_wakeups_remote                 = 0;
+       p->se.nr_wakeups_affine                 = 0;
+       p->se.nr_wakeups_affine_attempts        = 0;
+       p->se.nr_wakeups_passive                = 0;
+       p->se.nr_wakeups_idle                   = 0;
+
  #endif
  
         INIT_LIST_HEAD(&p->rt.run_list);
@@ -2766,6 +2818,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
          */
         prev_state = prev->state;
         finish_arch_switch(prev);
+       perf_counter_task_sched_in(current, cpu_of(rq));
         finish_lock_switch(rq, prev);
  #ifdef CONFIG_SMP
         if (post_schedule)
@@ -2981,6 +3034,15 @@ static void calc_load_account_active(struct rq *this_rq)
  }
  
  /*
+ * Externally visible per-cpu scheduler statistics:
+ * cpu_nr_migrations(cpu) - number of migrations into that cpu
+ */
+u64 cpu_nr_migrations(int cpu)
+{
+       return cpu_rq(cpu)->nr_migrations_in;
+}
+
+/*
   * Update rq->cpu_load[] statistics. This function is usually called every
   * scheduler tick (TICK_NSEC).
   */
@@ -4359,6 +4421,11 @@ static struct {
         .load_balancer = ATOMIC_INIT(-1),
  };
  
+int get_nohz_load_balancer(void)
+{
+       return atomic_read(&nohz.load_balancer);
+}
+
  #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
  /**
   * lowest_flag_domain - Return lowest sched_domain containing flag.
@@ -5078,6 +5145,8 @@ void scheduler_tick(void)
         curr->sched_class->task_tick(rq, curr, 0);
         spin_unlock(&rq->lock);
  
+       perf_counter_task_tick(curr, cpu);
+
  #ifdef CONFIG_SMP
         rq->idle_at_tick = idle_cpu(cpu);
         trigger_load_balance(rq, cpu);
@@ -5293,6 +5362,7 @@ need_resched_nonpreemptible:
  
         if (likely(prev != next)) {
                 sched_info_switch(prev, next);
+               perf_counter_task_sched_out(prev, next, cpu);
  
                 rq->nr_switches++;
                 rq->curr = next;
@@ -6494,6 +6564,11 @@ SYSCALL_DEFINE0(sched_yield)
         return 0;
  }
  
+static inline int should_resched(void)
+{
+       return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
+}
+
  static void __cond_resched(void)
  {
  #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -6513,8 +6588,7 @@ static void __cond_resched(void)
  
  int __sched _cond_resched(void)
  {
-       if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
-                                       system_state == SYSTEM_RUNNING) {
+       if (should_resched()) {
                 __cond_resched();
                 return 1;
         }
@@ -6532,12 +6606,12 @@ EXPORT_SYMBOL(_cond_resched);
   */
  int cond_resched_lock(spinlock_t *lock)
  {
-       int resched = need_resched() && system_state == SYSTEM_RUNNING;
+       int resched = should_resched();
         int ret = 0;
  
         if (spin_needbreak(lock) || resched) {
                 spin_unlock(lock);
-               if (resched && need_resched())
+               if (resched)
                         __cond_resched();
                 else
                         cpu_relax();
@@ -6552,7 +6626,7 @@ int __sched cond_resched_softirq(void)
  {
         BUG_ON(!in_softirq());
  
-       if (need_resched() && system_state == SYSTEM_RUNNING) {
+       if (should_resched()) {
                 local_bh_enable();
                 __cond_resched();
                 local_bh_disable();
@@ -6999,7 +7073,7 @@ static int migration_thread(void *data)
  
                 if (cpu_is_offline(cpu)) {
                         spin_unlock_irq(&rq->lock);
-                       goto wait_to_die;
+                       break;
                 }
  
                 if (rq->active_balance) {
@@ -7025,16 +7099,7 @@ static int migration_thread(void *data)
                 complete(&req->done);
         }
         __set_current_state(TASK_RUNNING);
-       return 0;
  
-wait_to_die:
-       /* Wait for kthread_stop */
-       set_current_state(TASK_INTERRUPTIBLE);
-       while (!kthread_should_stop()) {
-               schedule();
-               set_current_state(TASK_INTERRUPTIBLE);
-       }
-       __set_current_state(TASK_RUNNING);
         return 0;
  }
  
@@ -7224,6 +7289,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
  static void calc_global_load_remove(struct rq *rq)
  {
         atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+       rq->calc_load_active = 0;
  }
  #endif /* CONFIG_HOTPLUG_CPU */
  
@@ -7448,7 +7514,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 rq = task_rq_lock(p, &flags);
                 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
                 task_rq_unlock(rq, &flags);
+               get_task_struct(p);
                 cpu_rq(cpu)->migration_thread = p;
+               rq->calc_load_update = calc_load_update;
                 break;
  
         case CPU_ONLINE:
@@ -7459,8 +7527,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 /* Update our root-domain */
                 rq = cpu_rq(cpu);
                 spin_lock_irqsave(&rq->lock, flags);
-               rq->calc_load_update = calc_load_update;
-               rq->calc_load_active = 0;
                 if (rq->rd) {
                         BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
  
@@ -7478,6 +7544,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 kthread_bind(cpu_rq(cpu)->migration_thread,
                              cpumask_any(cpu_online_mask));
                 kthread_stop(cpu_rq(cpu)->migration_thread);
+               put_task_struct(cpu_rq(cpu)->migration_thread);
                 cpu_rq(cpu)->migration_thread = NULL;
                 break;
  
@@ -7487,6 +7554,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 migrate_live_tasks(cpu);
                 rq = cpu_rq(cpu);
                 kthread_stop(rq->migration_thread);
+               put_task_struct(rq->migration_thread);
                 rq->migration_thread = NULL;
                 /* Idle task back to normal (off runqueue, low prio) */
                 spin_lock_irq(&rq->lock);
@@ -7536,8 +7604,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
         return NOTIFY_OK;
  }
  
-/* Register at highest priority so that task migration (migrate_all_tasks)
- * happens before everything else.
+/*
+ * Register at high priority so that task migration (migrate_all_tasks)
+ * happens before everything else.  This has to be lower priority than
+ * the notifier in the perf_counter subsystem, though.
   */
  static struct notifier_block __cpuinitdata migration_notifier = {
         .notifier_call = migration_call,
@@ -7780,26 +7850,23 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
                 free_rootdomain(old_rd);
  }
  
-static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
+static int init_rootdomain(struct root_domain *rd, bool bootmem)
  {
+       gfp_t gfp = GFP_KERNEL;
+
         memset(rd, 0, sizeof(*rd));
  
-       if (bootmem) {
-               alloc_bootmem_cpumask_var(&def_root_domain.span);
-               alloc_bootmem_cpumask_var(&def_root_domain.online);
-               alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
-               cpupri_init(&rd->cpupri, true);
-               return 0;
-       }
+       if (bootmem)
+               gfp = GFP_NOWAIT;
  
-       if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+       if (!alloc_cpumask_var(&rd->span, gfp))
                 goto out;
-       if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+       if (!alloc_cpumask_var(&rd->online, gfp))
                 goto free_span;
-       if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+       if (!alloc_cpumask_var(&rd->rto_mask, gfp))
                 goto free_online;
  
-       if (cpupri_init(&rd->cpupri, false) != 0)
+       if (cpupri_init(&rd->cpupri, bootmem) != 0)
                 goto free_rto_mask;
         return 0;
  
@@ -8989,6 +9056,8 @@ void __init sched_init_smp(void)
  }
  #endif /* CONFIG_SMP */
  
+const_debug unsigned int sysctl_timer_migration = 1;
+
  int in_sched_functions(unsigned long addr)
  {
         return in_lock_functions(addr) ||
@@ -9028,7 +9097,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
  #ifdef CONFIG_SMP
         rt_rq->rt_nr_migratory = 0;
         rt_rq->overloaded = 0;
-       plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
+       plist_head_init(&rt_rq->pushable_tasks, &rq->lock);
  #endif
  
         rt_rq->rt_time = 0;
@@ -9123,7 +9192,7 @@ void __init sched_init(void)
          * we use alloc_bootmem().
          */
         if (alloc_size) {
-               ptr = (unsigned long)alloc_bootmem(alloc_size);
+               ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
                 init_task_group.se = (struct sched_entity **)ptr;
@@ -9218,7 +9287,7 @@ void __init sched_init(void)
                  * 1024) and two child groups A0 and A1 (of weight 1024 each),
                  * then A0's share of the cpu resource is:
                  *
-                *      A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
+                *      A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
                  *
                  * We achieve this by letting init_task_group's tasks sit
                  * directly in rq->cfs (i.e init_task_group->se[] = NULL).
@@ -9314,15 +9383,17 @@ void __init sched_init(void)
         current->sched_class = &fair_sched_class;
  
         /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
-       alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+       alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
  #ifdef CONFIG_SMP
  #ifdef CONFIG_NO_HZ
-       alloc_bootmem_cpumask_var(&nohz.cpu_mask);
-       alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask);
+       alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
+       alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
  #endif
-       alloc_bootmem_cpumask_var(&cpu_isolated_map);
+       alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
  #endif /* SMP */
  
+       perf_counter_init();
+
         scheduler_running = 1;
  }