Merge commit 'v2.6.31-rc6' into core/rcu
diff --git a/kernel/sched.c b/kernel/sched.c
index 228acae..cda8b81 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,6 +39,7 @@
 #include <linux/completion.h>
 #include <linux/kernel_stat.h>
 #include <linux/debug_locks.h>
+#include <linux/perf_counter.h>
 #include <linux/security.h>
 #include <linux/notifier.h>
 #include <linux/profile.h>
 #include <linux/pagemap.h>
 #include <linux/hrtimer.h>
 #include <linux/tick.h>
-#include <linux/bootmem.h>
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
-#include <trace/sched.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
 
 #include "sched_cpupri.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
  */
 #define RUNTIME_INF    ((u64)~0ULL)
 
-DEFINE_TRACE(sched_wait_task);
-DEFINE_TRACE(sched_wakeup);
-DEFINE_TRACE(sched_wakeup_new);
-DEFINE_TRACE(sched_switch);
-DEFINE_TRACE(sched_migrate_task);
-
 #ifdef CONFIG_SMP
 
 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -244,7 +240,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
                hard = hrtimer_get_expires(&rt_b->rt_period_timer);
                delta = ktime_to_ns(ktime_sub(hard, soft));
                __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
-                               HRTIMER_MODE_ABS, 0);
+                               HRTIMER_MODE_ABS_PINNED, 0);
        }
        spin_unlock(&rt_b->rt_runtime_lock);
 }
@@ -497,6 +493,7 @@ struct rt_rq {
 #endif
 #ifdef CONFIG_SMP
        unsigned long rt_nr_migratory;
+       unsigned long rt_nr_total;
        int overloaded;
        struct plist_head pushable_tasks;
 #endif
@@ -584,6 +581,7 @@ struct rq {
        struct load_weight load;
        unsigned long nr_load_updates;
        u64 nr_switches;
+       u64 nr_migrations_in;
 
        struct cfs_rq cfs;
        struct rt_rq rt;
@@ -696,7 +694,7 @@ static inline int cpu_of(struct rq *rq)
 #define task_rq(p)             cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
 
-static inline void update_rq_clock(struct rq *rq)
+inline void update_rq_clock(struct rq *rq)
 {
        rq->clock = sched_clock_cpu(cpu_of(rq));
 }
@@ -1158,7 +1156,7 @@ static __init void init_hrtick(void)
 static void hrtick_start(struct rq *rq, u64 delay)
 {
        __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
-                       HRTIMER_MODE_REL, 0);
+                       HRTIMER_MODE_REL_PINNED, 0);
 }
 
 static inline void init_hrtick(void)
@@ -1964,7 +1962,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
        clock_offset = old_rq->clock - new_rq->clock;
 
-       trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+       trace_sched_migrate_task(p, new_cpu);
 
 #ifdef CONFIG_SCHEDSTATS
        if (p->se.wait_start)
@@ -1973,12 +1971,17 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
                p->se.sleep_start -= clock_offset;
        if (p->se.block_start)
                p->se.block_start -= clock_offset;
+#endif
        if (old_cpu != new_cpu) {
-               schedstat_inc(p, se.nr_migrations);
+               p->se.nr_migrations++;
+               new_rq->nr_migrations_in++;
+#ifdef CONFIG_SCHEDSTATS
                if (task_hot(p, old_rq->clock, NULL))
                        schedstat_inc(p, se.nr_forced2_migrations);
-       }
 #endif
+               perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS,
+                                    1, 1, NULL, 0);
+       }
        p->se.vruntime -= old_cfsrq->min_vruntime -
                                         new_cfsrq->min_vruntime;
 
@@ -2021,6 +2024,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
 }
 
 /*
+ * wait_task_context_switch -  wait for a thread to complete at least one
+ *                             context switch.
+ *
+ * @p must not be current.
+ */
+void wait_task_context_switch(struct task_struct *p)
+{
+       unsigned long nvcsw, nivcsw, flags;
+       int running;
+       struct rq *rq;
+
+       nvcsw   = p->nvcsw;
+       nivcsw  = p->nivcsw;
+       for (;;) {
+               /*
+                * The runqueue is assigned before the actual context
+                * switch. We need to take the runqueue lock.
+                *
+                * We could check initially without the lock but it is
+                * very likely that we need to take the lock in every
+                * iteration.
+                */
+               rq = task_rq_lock(p, &flags);
+               running = task_running(rq, p);
+               task_rq_unlock(rq, &flags);
+
+               if (likely(!running))
+                       break;
+               /*
+                * The switch count is incremented before the actual
+                * context switch. We thus wait for two switches to be
+                * sure at least one completed.
+                */
+               if ((p->nvcsw - nvcsw) > 1)
+                       break;
+               if ((p->nivcsw - nivcsw) > 1)
+                       break;
+
+               cpu_relax();
+       }
+}
+
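
A minimal caller sketch for the helper just added, kept deliberately hypothetical (example_sync_with_switch and its reason for waiting are not part of this patch); it only restates the documented restriction that @p must not be current:

        /*
         * Hypothetical caller: make sure @child has completed at least one
         * context switch before touching state that is only written at
         * switch time.
         */
        static void example_sync_with_switch(struct task_struct *child)
        {
                if (WARN_ON(child == current))
                        return;
                wait_task_context_switch(child);
        }
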
+/*
  * wait_task_inactive - wait for a thread to unschedule.
  *
  * If @match_state is nonzero, it's the @p->state value just checked and
@@ -2148,6 +2194,7 @@ void kick_process(struct task_struct *p)
                smp_send_reschedule(cpu);
        preempt_enable();
 }
+EXPORT_SYMBOL_GPL(kick_process);
 
 /*
  * Return a low guess at the load of a migration-source cpu weighted
@@ -2330,6 +2377,27 @@ static int sched_balance_self(int cpu, int flag)
 
 #endif /* CONFIG_SMP */
 
+/**
+ * task_oncpu_function_call - call a function on the cpu on which a task runs
+ * @p:         the task to evaluate
+ * @func:      the function to be called
+ * @info:      the function call argument
+ *
+ * Calls the function @func when the task is currently running. This might
+ * be on the current CPU, which just calls the function directly
+ */
+void task_oncpu_function_call(struct task_struct *p,
+                             void (*func) (void *info), void *info)
+{
+       int cpu;
+
+       preempt_disable();
+       cpu = task_cpu(p);
+       if (task_curr(p))
+               smp_call_function_single(cpu, func, info, 1);
+       preempt_enable();
+}
+
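
A hedged usage sketch for task_oncpu_function_call(); the callback and caller names (example_remote_func, example_poke_task) are hypothetical and only illustrate the contract above, namely that @func runs on the task's CPU if, and only if, the task is running at that moment:

        /* Runs via IPI on the CPU that is currently executing the task. */
        static void example_remote_func(void *info)
        {
                struct task_struct *p = info;

                printk(KERN_INFO "callback on cpu %d for %s\n",
                       smp_processor_id(), p->comm);
        }

        static void example_poke_task(struct task_struct *p)
        {
                /* Silently does nothing if p is not on a CPU right now. */
                task_oncpu_function_call(p, example_remote_func, p);
        }
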
 /***
  * try_to_wake_up - wake up a thread
  * @p: the to-be-woken-up thread
@@ -2464,6 +2532,17 @@ out:
        return success;
 }
 
+/**
+ * wake_up_process - Wake up a specific process
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes.  Returns 1 if the process was woken up, 0 if it was already
+ * running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
 int wake_up_process(struct task_struct *p)
 {
        return try_to_wake_up(p, TASK_ALL, 0);
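
The barrier remark above is easiest to read against the classic sleep/wakeup idiom, sketched here under assumptions: work_pending and worker are hypothetical, and the waker publishes the condition before calling wake_up_process() so the sleeper cannot miss it:

        static int work_pending;                /* hypothetical condition flag */
        static struct task_struct *worker;      /* hypothetical sleeping thread */

        static void example_waker(void)
        {
                work_pending = 1;               /* publish the condition first */
                wake_up_process(worker);        /* barrier implied only if a task is woken */
        }

        static int example_sleeper(void *unused)
        {
                for (;;) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (work_pending)
                                break;
                        schedule();
                }
                __set_current_state(TASK_RUNNING);
                return 0;
        }
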
@@ -2486,21 +2565,44 @@ static void __sched_fork(struct task_struct *p)
        p->se.exec_start                = 0;
        p->se.sum_exec_runtime          = 0;
        p->se.prev_sum_exec_runtime     = 0;
+       p->se.nr_migrations             = 0;
        p->se.last_wakeup               = 0;
        p->se.avg_overlap               = 0;
        p->se.start_runtime             = 0;
        p->se.avg_wakeup                = sysctl_sched_wakeup_granularity;
 
 #ifdef CONFIG_SCHEDSTATS
-       p->se.wait_start                = 0;
-       p->se.sum_sleep_runtime         = 0;
-       p->se.sleep_start               = 0;
-       p->se.block_start               = 0;
-       p->se.sleep_max                 = 0;
-       p->se.block_max                 = 0;
-       p->se.exec_max                  = 0;
-       p->se.slice_max                 = 0;
-       p->se.wait_max                  = 0;
+       p->se.wait_start                        = 0;
+       p->se.wait_max                          = 0;
+       p->se.wait_count                        = 0;
+       p->se.wait_sum                          = 0;
+
+       p->se.sleep_start                       = 0;
+       p->se.sleep_max                         = 0;
+       p->se.sum_sleep_runtime                 = 0;
+
+       p->se.block_start                       = 0;
+       p->se.block_max                         = 0;
+       p->se.exec_max                          = 0;
+       p->se.slice_max                         = 0;
+
+       p->se.nr_migrations_cold                = 0;
+       p->se.nr_failed_migrations_affine       = 0;
+       p->se.nr_failed_migrations_running      = 0;
+       p->se.nr_failed_migrations_hot          = 0;
+       p->se.nr_forced_migrations              = 0;
+       p->se.nr_forced2_migrations             = 0;
+
+       p->se.nr_wakeups                        = 0;
+       p->se.nr_wakeups_sync                   = 0;
+       p->se.nr_wakeups_migrate                = 0;
+       p->se.nr_wakeups_local                  = 0;
+       p->se.nr_wakeups_remote                 = 0;
+       p->se.nr_wakeups_affine                 = 0;
+       p->se.nr_wakeups_affine_attempts        = 0;
+       p->se.nr_wakeups_passive                = 0;
+       p->se.nr_wakeups_idle                   = 0;
+
 #endif
 
        INIT_LIST_HEAD(&p->rt.run_list);
@@ -2716,6 +2818,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
         */
        prev_state = prev->state;
        finish_arch_switch(prev);
+       perf_counter_task_sched_in(current, cpu_of(rq));
        finish_lock_switch(rq, prev);
 #ifdef CONFIG_SMP
        if (post_schedule)
@@ -2772,7 +2875,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
         * combine the page table reload and the switch backend into
         * one hypercall.
         */
-       arch_enter_lazy_cpu_mode();
+       arch_start_context_switch(prev);
 
        if (unlikely(!mm)) {
                next->active_mm = oldmm;
@@ -2931,6 +3034,15 @@ static void calc_load_account_active(struct rq *this_rq)
 }
 
 /*
+ * Externally visible per-cpu scheduler statistics:
+ * cpu_nr_migrations(cpu) - number of migrations into that cpu
+ */
+u64 cpu_nr_migrations(int cpu)
+{
+       return cpu_rq(cpu)->nr_migrations_in;
+}
+
+/*
  * Update rq->cpu_load[] statistics. This function is usually called every
  * scheduler tick (TICK_NSEC).
  */
@@ -4309,6 +4421,11 @@ static struct {
        .load_balancer = ATOMIC_INIT(-1),
 };
 
+int get_nohz_load_balancer(void)
+{
+       return atomic_read(&nohz.load_balancer);
+}
+
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 /**
  * lowest_flag_domain - Return lowest sched_domain containing flag.
@@ -5028,6 +5145,8 @@ void scheduler_tick(void)
        curr->sched_class->task_tick(rq, curr, 0);
        spin_unlock(&rq->lock);
 
+       perf_counter_task_tick(curr, cpu);
+
 #ifdef CONFIG_SMP
        rq->idle_at_tick = idle_cpu(cpu);
        trigger_load_balance(rq, cpu);
@@ -5243,6 +5362,7 @@ need_resched_nonpreemptible:
 
        if (likely(prev != next)) {
                sched_info_switch(prev, next);
+               perf_counter_task_sched_out(prev, next, cpu);
 
                rq->nr_switches++;
                rq->curr = next;
@@ -5425,6 +5545,9 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
  * @mode: which threads
  * @nr_exclusive: how many wake-one or wake-many threads to wake up
  * @key: is directly passed to the wakeup function
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void __wake_up(wait_queue_head_t *q, unsigned int mode,
                        int nr_exclusive, void *key)
@@ -5463,6 +5586,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
  * with each other. This can prevent needless bouncing between CPUs.
  *
  * On UP it can prevent extra preemption.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
                        int nr_exclusive, void *key)
@@ -5499,6 +5625,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync);        /* For internal use only */
  * awakened in the same order in which they were queued.
  *
  * See also complete_all(), wait_for_completion() and related routines.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void complete(struct completion *x)
 {
@@ -5516,6 +5645,9 @@ EXPORT_SYMBOL(complete);
  * @x:  holds the state of this particular completion
  *
  * This will wake up all threads waiting on this particular completion event.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void complete_all(struct completion *x)
 {
@@ -6432,6 +6564,11 @@ SYSCALL_DEFINE0(sched_yield)
        return 0;
 }
 
+static inline int should_resched(void)
+{
+       return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
+}
+
 static void __cond_resched(void)
 {
 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -6451,8 +6588,7 @@ static void __cond_resched(void)
 
 int __sched _cond_resched(void)
 {
-       if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
-                                       system_state == SYSTEM_RUNNING) {
+       if (should_resched()) {
                __cond_resched();
                return 1;
        }
@@ -6470,12 +6606,12 @@ EXPORT_SYMBOL(_cond_resched);
  */
 int cond_resched_lock(spinlock_t *lock)
 {
-       int resched = need_resched() && system_state == SYSTEM_RUNNING;
+       int resched = should_resched();
        int ret = 0;
 
        if (spin_needbreak(lock) || resched) {
                spin_unlock(lock);
-               if (resched && need_resched())
+               if (resched)
                        __cond_resched();
                else
                        cpu_relax();
@@ -6490,7 +6626,7 @@ int __sched cond_resched_softirq(void)
 {
        BUG_ON(!in_softirq());
 
-       if (need_resched() && system_state == SYSTEM_RUNNING) {
+       if (should_resched()) {
                local_bh_enable();
                __cond_resched();
                local_bh_disable();
@@ -6915,6 +7051,11 @@ fail:
        return ret;
 }
 
+#define RCU_MIGRATION_IDLE     0
+#define RCU_MIGRATION_NEED_QS  1
+#define RCU_MIGRATION_GOT_QS   2
+#define RCU_MIGRATION_MUST_SYNC        3
+
 /*
  * migration_thread - this is a highprio system thread that performs
  * thread migration by bumping thread off CPU then 'pushing' onto
@@ -6922,6 +7063,7 @@ fail:
  */
 static int migration_thread(void *data)
 {
+       int badcpu;
        int cpu = (long)data;
        struct rq *rq;
 
@@ -6937,7 +7079,7 @@ static int migration_thread(void *data)
 
                if (cpu_is_offline(cpu)) {
                        spin_unlock_irq(&rq->lock);
-                       goto wait_to_die;
+                       break;
                }
 
                if (rq->active_balance) {
@@ -6956,23 +7098,23 @@ static int migration_thread(void *data)
                req = list_entry(head->next, struct migration_req, list);
                list_del_init(head->next);
 
-               spin_unlock(&rq->lock);
-               __migrate_task(req->task, cpu, req->dest_cpu);
+               if (req->task != NULL) {
+                       spin_unlock(&rq->lock);
+                       __migrate_task(req->task, cpu, req->dest_cpu);
+               } else if (likely(cpu == (badcpu = smp_processor_id()))) {
+                       req->dest_cpu = RCU_MIGRATION_GOT_QS;
+                       spin_unlock(&rq->lock);
+               } else {
+                       req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
+                       spin_unlock(&rq->lock);
+                       WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
+               }
                local_irq_enable();
 
                complete(&req->done);
        }
        __set_current_state(TASK_RUNNING);
-       return 0;
 
-wait_to_die:
-       /* Wait for kthread_stop */
-       set_current_state(TASK_INTERRUPTIBLE);
-       while (!kthread_should_stop()) {
-               schedule();
-               set_current_state(TASK_INTERRUPTIBLE);
-       }
-       __set_current_state(TASK_RUNNING);
        return 0;
 }
 
@@ -7162,6 +7304,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 static void calc_global_load_remove(struct rq *rq)
 {
        atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+       rq->calc_load_active = 0;
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
@@ -7386,7 +7529,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                rq = task_rq_lock(p, &flags);
                __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
                task_rq_unlock(rq, &flags);
+               get_task_struct(p);
                cpu_rq(cpu)->migration_thread = p;
+               rq->calc_load_update = calc_load_update;
                break;
 
        case CPU_ONLINE:
@@ -7397,8 +7542,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                /* Update our root-domain */
                rq = cpu_rq(cpu);
                spin_lock_irqsave(&rq->lock, flags);
-               rq->calc_load_update = calc_load_update;
-               rq->calc_load_active = 0;
                if (rq->rd) {
                        BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 
@@ -7416,6 +7559,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                kthread_bind(cpu_rq(cpu)->migration_thread,
                             cpumask_any(cpu_online_mask));
                kthread_stop(cpu_rq(cpu)->migration_thread);
+               put_task_struct(cpu_rq(cpu)->migration_thread);
                cpu_rq(cpu)->migration_thread = NULL;
                break;
 
@@ -7425,6 +7569,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                migrate_live_tasks(cpu);
                rq = cpu_rq(cpu);
                kthread_stop(rq->migration_thread);
+               put_task_struct(rq->migration_thread);
                rq->migration_thread = NULL;
                /* Idle task back to normal (off runqueue, low prio) */
                spin_lock_irq(&rq->lock);
@@ -7474,8 +7619,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
        return NOTIFY_OK;
 }
 
-/* Register at highest priority so that task migration (migrate_all_tasks)
- * happens before everything else.
+/*
+ * Register at high priority so that task migration (migrate_all_tasks)
+ * happens before everything else.  This has to be lower priority than
+ * the notifier in the perf_counter subsystem, though.
  */
 static struct notifier_block __cpuinitdata migration_notifier = {
        .notifier_call = migration_call,
@@ -7718,26 +7865,23 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
                free_rootdomain(old_rd);
 }
 
-static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
+static int init_rootdomain(struct root_domain *rd, bool bootmem)
 {
+       gfp_t gfp = GFP_KERNEL;
+
        memset(rd, 0, sizeof(*rd));
 
-       if (bootmem) {
-               alloc_bootmem_cpumask_var(&def_root_domain.span);
-               alloc_bootmem_cpumask_var(&def_root_domain.online);
-               alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
-               cpupri_init(&rd->cpupri, true);
-               return 0;
-       }
+       if (bootmem)
+               gfp = GFP_NOWAIT;
 
-       if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+       if (!alloc_cpumask_var(&rd->span, gfp))
                goto out;
-       if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+       if (!alloc_cpumask_var(&rd->online, gfp))
                goto free_span;
-       if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+       if (!alloc_cpumask_var(&rd->rto_mask, gfp))
                goto free_online;
 
-       if (cpupri_init(&rd->cpupri, false) != 0)
+       if (cpupri_init(&rd->cpupri, bootmem) != 0)
                goto free_rto_mask;
        return 0;
 
@@ -8927,6 +9071,8 @@ void __init sched_init_smp(void)
 }
 #endif /* CONFIG_SMP */
 
+const_debug unsigned int sysctl_timer_migration = 1;
+
 int in_sched_functions(unsigned long addr)
 {
        return in_lock_functions(addr) ||
@@ -8966,7 +9112,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 #ifdef CONFIG_SMP
        rt_rq->rt_nr_migratory = 0;
        rt_rq->overloaded = 0;
-       plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
+       plist_head_init(&rt_rq->pushable_tasks, &rq->lock);
 #endif
 
        rt_rq->rt_time = 0;
@@ -9061,7 +9207,7 @@ void __init sched_init(void)
         * we use alloc_bootmem().
         */
        if (alloc_size) {
-               ptr = (unsigned long)alloc_bootmem(alloc_size);
+               ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
                init_task_group.se = (struct sched_entity **)ptr;
@@ -9156,7 +9302,7 @@ void __init sched_init(void)
                 * 1024) and two child groups A0 and A1 (of weight 1024 each),
                 * then A0's share of the cpu resource is:
                 *
-                *      A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
+                *      A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
                 *
                 * We achieve this by letting init_task_group's tasks sit
                 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
@@ -9252,15 +9398,17 @@ void __init sched_init(void)
        current->sched_class = &fair_sched_class;
 
        /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
-       alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+       alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
-       alloc_bootmem_cpumask_var(&nohz.cpu_mask);
-       alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask);
+       alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
+       alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
 #endif
-       alloc_bootmem_cpumask_var(&cpu_isolated_map);
+       alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 #endif /* SMP */
 
+       perf_counter_init();
+
        scheduler_running = 1;
 }
 
@@ -10448,3 +10596,113 @@ struct cgroup_subsys cpuacct_subsys = {
        .subsys_id = cpuacct_subsys_id,
 };
 #endif /* CONFIG_CGROUP_CPUACCT */
+
+#ifndef CONFIG_SMP
+
+int rcu_expedited_torture_stats(char *page)
+{
+       return 0;
+}
+EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
+
+void synchronize_sched_expedited(void)
+{
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#else /* #ifndef CONFIG_SMP */
+
+static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
+static DEFINE_MUTEX(rcu_sched_expedited_mutex);
+
+#define RCU_EXPEDITED_STATE_POST -2
+#define RCU_EXPEDITED_STATE_IDLE -1
+
+static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
+
+int rcu_expedited_torture_stats(char *page)
+{
+       int cnt = 0;
+       int cpu;
+
+       cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
+       for_each_online_cpu(cpu) {
+                cnt += sprintf(&page[cnt], " %d:%d",
+                               cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
+       }
+       cnt += sprintf(&page[cnt], "\n");
+       return cnt;
+}
+EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
+
+static long synchronize_sched_expedited_count;
+
+/*
+ * Wait for an rcu-sched grace period to elapse, but use "big hammer"
+ * approach to force grace period to end quickly.  This consumes
+ * significant time on all CPUs, and is thus not recommended for
+ * any sort of common-case code.
+ *
+ * Note that it is illegal to call this function while holding any
+ * lock that is acquired by a CPU-hotplug notifier.  Failing to
+ * observe this restriction will result in deadlock.
+ */
+void synchronize_sched_expedited(void)
+{
+       int cpu;
+       unsigned long flags;
+       bool need_full_sync = 0;
+       struct rq *rq;
+       struct migration_req *req;
+       long snap;
+       int trycount = 0;
+
+       smp_mb();  /* ensure prior mod happens before capturing snap. */
+       snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
+       get_online_cpus();
+       while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
+               put_online_cpus();
+               if (trycount++ < 10)
+                       udelay(trycount * num_online_cpus());
+               else {
+                       synchronize_sched();
+                       return;
+               }
+               if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
+                       smp_mb(); /* ensure test happens before caller kfree */
+                       return;
+               }
+               get_online_cpus();
+       }
+       rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
+       for_each_online_cpu(cpu) {
+               rq = cpu_rq(cpu);
+               req = &per_cpu(rcu_migration_req, cpu);
+               init_completion(&req->done);
+               req->task = NULL;
+               req->dest_cpu = RCU_MIGRATION_NEED_QS;
+               spin_lock_irqsave(&rq->lock, flags);
+               list_add(&req->list, &rq->migration_queue);
+               spin_unlock_irqrestore(&rq->lock, flags);
+               wake_up_process(rq->migration_thread);
+       }
+       for_each_online_cpu(cpu) {
+               rcu_expedited_state = cpu;
+               req = &per_cpu(rcu_migration_req, cpu);
+               rq = cpu_rq(cpu);
+               wait_for_completion(&req->done);
+               spin_lock_irqsave(&rq->lock, flags);
+               if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
+                       need_full_sync = 1;
+               req->dest_cpu = RCU_MIGRATION_IDLE;
+               spin_unlock_irqrestore(&rq->lock, flags);
+       }
+       rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
+       mutex_unlock(&rcu_sched_expedited_mutex);
+       put_online_cpus();
+       if (need_full_sync)
+               synchronize_sched();
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#endif /* #else #ifndef CONFIG_SMP */
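
The comment before synchronize_sched_expedited() describes it as a big hammer for update-side code that cannot wait for a normal grace period. A hedged updater sketch follows; example_ptr (read under rcu_read_lock_sched()) and example_lock are hypothetical:

        struct example {
                int data;
        };

        static struct example *example_ptr;     /* readers use rcu_read_lock_sched() */
        static DEFINE_SPINLOCK(example_lock);   /* serializes updaters */

        static void example_replace(struct example *newp)
        {
                struct example *old;

                spin_lock(&example_lock);
                old = example_ptr;
                rcu_assign_pointer(example_ptr, newp);
                spin_unlock(&example_lock);

                /* Force all sched-RCU readers of 'old' to finish quickly. */
                synchronize_sched_expedited();
                kfree(old);
        }

Note that the sketch drops its own lock before the call, and, per the restriction quoted above, the caller must not hold any lock that is also acquired by a CPU-hotplug notifier.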