Merge branch 'cpu_stop' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc...
authorIngo Molnar <mingo@elte.hu>
Sat, 8 May 2010 16:11:19 +0000 (18:11 +0200)
committerIngo Molnar <mingo@elte.hu>
Sat, 8 May 2010 16:11:19 +0000 (18:11 +0200)
13 files changed:
Documentation/RCU/torture.txt
arch/s390/kernel/time.c
drivers/xen/manage.c
include/linux/rcutiny.h
include/linux/rcutree.h
include/linux/stop_machine.h
kernel/Makefile
kernel/cpu.c
kernel/module.c
kernel/rcutorture.c
kernel/sched.c
kernel/sched_fair.c
kernel/stop_machine.c

index 0e50bc2..5d90167 100644 (file)
@@ -182,16 +182,6 @@ Similarly, sched_expedited RCU provides the following:
        sched_expedited-torture: Reader Pipe:  12660320201 95875 0 0 0 0 0 0 0 0 0
        sched_expedited-torture: Reader Batch:  12660424885 0 0 0 0 0 0 0 0 0 0
        sched_expedited-torture: Free-Block Circulation:  1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0
-       state: -1 / 0:0 3:0 4:0
-
-As before, the first four lines are similar to those for RCU.
-The last line shows the task-migration state.  The first number is
--1 if synchronize_sched_expedited() is idle, -2 if in the process of
-posting wakeups to the migration kthreads, and N when waiting on CPU N.
-Each of the colon-separated fields following the "/" is a CPU:state pair.
-Valid states are "0" for idle, "1" for waiting for quiescent state,
-"2" for passed through quiescent state, and "3" when a race with a
-CPU-hotplug event forces use of the synchronize_sched() primitive.
 
 
 USAGE
index d906bf1..a2163c9 100644 (file)
@@ -391,7 +391,6 @@ static void __init time_init_wq(void)
        if (time_sync_wq)
                return;
        time_sync_wq = create_singlethread_workqueue("timesync");
-       stop_machine_create();
 }
 
 /*
index 2ac4440..8943b8c 100644 (file)
@@ -80,12 +80,6 @@ static void do_suspend(void)
 
        shutting_down = SHUTDOWN_SUSPEND;
 
-       err = stop_machine_create();
-       if (err) {
-               printk(KERN_ERR "xen suspend: failed to setup stop_machine %d\n", err);
-               goto out;
-       }
-
 #ifdef CONFIG_PREEMPT
        /* If the kernel is preemptible, we need to freeze all the processes
           to prevent them from being in the middle of a pagetable update
@@ -93,7 +87,7 @@ static void do_suspend(void)
        err = freeze_processes();
        if (err) {
                printk(KERN_ERR "xen suspend: freeze failed %d\n", err);
-               goto out_destroy_sm;
+               goto out;
        }
 #endif
 
@@ -136,12 +130,8 @@ out_resume:
 out_thaw:
 #ifdef CONFIG_PREEMPT
        thaw_processes();
-
-out_destroy_sm:
-#endif
-       stop_machine_destroy();
-
 out:
+#endif
        shutting_down = SHUTDOWN_INVALID;
 }
 #endif /* CONFIG_PM_SLEEP */
index a519587..0006b2d 100644 (file)
@@ -60,8 +60,6 @@ static inline long rcu_batches_completed_bh(void)
        return 0;
 }
 
-extern int rcu_expedited_torture_stats(char *page);
-
 static inline void rcu_force_quiescent_state(void)
 {
 }
index 42cc3a0..24e467e 100644 (file)
@@ -35,7 +35,6 @@ struct notifier_block;
 extern void rcu_sched_qs(int cpu);
 extern void rcu_bh_qs(int cpu);
 extern int rcu_needs_cpu(int cpu);
-extern int rcu_expedited_torture_stats(char *page);
 
 #ifdef CONFIG_TREE_PREEMPT_RCU
 
index baba3a2..6b524a0 100644 (file)
 #ifndef _LINUX_STOP_MACHINE
 #define _LINUX_STOP_MACHINE
-/* "Bogolock": stop the entire machine, disable interrupts.  This is a
-   very heavy lock, which is equivalent to grabbing every spinlock
-   (and more).  So the "read" side to such a lock is anything which
-   disables preeempt. */
+
 #include <linux/cpu.h>
 #include <linux/cpumask.h>
+#include <linux/list.h>
 #include <asm/system.h>
 
+/*
+ * stop_cpu[s]() is simplistic per-cpu maximum priority cpu
+ * monopolization mechanism.  The caller can specify a non-sleeping
+ * function to be executed on a single or multiple cpus preempting all
+ * other processes and monopolizing those cpus until it finishes.
+ *
+ * Resources for this mechanism are preallocated when a cpu is brought
+ * up and requests are guaranteed to be served as long as the target
+ * cpus are online.
+ */
+typedef int (*cpu_stop_fn_t)(void *arg);
+
+#ifdef CONFIG_SMP
+
+struct cpu_stop_work {
+       struct list_head        list;           /* cpu_stopper->works */
+       cpu_stop_fn_t           fn;
+       void                    *arg;
+       struct cpu_stop_done    *done;
+};
+
+int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg);
+void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
+                        struct cpu_stop_work *work_buf);
+int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
+int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
+
+#else  /* CONFIG_SMP */
+
+#include <linux/workqueue.h>
+
+struct cpu_stop_work {
+       struct work_struct      work;
+       cpu_stop_fn_t           fn;
+       void                    *arg;
+};
+
+static inline int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
+{
+       int ret = -ENOENT;
+       preempt_disable();
+       if (cpu == smp_processor_id())
+               ret = fn(arg);
+       preempt_enable();
+       return ret;
+}
+
+static void stop_one_cpu_nowait_workfn(struct work_struct *work)
+{
+       struct cpu_stop_work *stwork =
+               container_of(work, struct cpu_stop_work, work);
+       preempt_disable();
+       stwork->fn(stwork->arg);
+       preempt_enable();
+}
+
+static inline void stop_one_cpu_nowait(unsigned int cpu,
+                                      cpu_stop_fn_t fn, void *arg,
+                                      struct cpu_stop_work *work_buf)
+{
+       if (cpu == smp_processor_id()) {
+               INIT_WORK(&work_buf->work, stop_one_cpu_nowait_workfn);
+               work_buf->fn = fn;
+               work_buf->arg = arg;
+               schedule_work(&work_buf->work);
+       }
+}
+
+static inline int stop_cpus(const struct cpumask *cpumask,
+                           cpu_stop_fn_t fn, void *arg)
+{
+       if (cpumask_test_cpu(raw_smp_processor_id(), cpumask))
+               return stop_one_cpu(raw_smp_processor_id(), fn, arg);
+       return -ENOENT;
+}
+
+static inline int try_stop_cpus(const struct cpumask *cpumask,
+                               cpu_stop_fn_t fn, void *arg)
+{
+       return stop_cpus(cpumask, fn, arg);
+}
+
+#endif /* CONFIG_SMP */
+
+/*
+ * stop_machine "Bogolock": stop the entire machine, disable
+ * interrupts.  This is a very heavy lock, which is equivalent to
+ * grabbing every spinlock (and more).  So the "read" side to such a
+ * lock is anything which disables preeempt.
+ */
 #if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP)
 
 /**
@@ -36,24 +124,7 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
  */
 int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
 
-/**
- * stop_machine_create: create all stop_machine threads
- *
- * Description: This causes all stop_machine threads to be created before
- * stop_machine actually gets called. This can be used by subsystems that
- * need a non failing stop_machine infrastructure.
- */
-int stop_machine_create(void);
-
-/**
- * stop_machine_destroy: destroy all stop_machine threads
- *
- * Description: This causes all stop_machine threads which were created with
- * stop_machine_create to be destroyed again.
- */
-void stop_machine_destroy(void);
-
-#else
+#else   /* CONFIG_STOP_MACHINE && CONFIG_SMP */
 
 static inline int stop_machine(int (*fn)(void *), void *data,
                               const struct cpumask *cpus)
@@ -65,8 +136,5 @@ static inline int stop_machine(int (*fn)(void *), void *data,
        return ret;
 }
 
-static inline int stop_machine_create(void) { return 0; }
-static inline void stop_machine_destroy(void) { }
-
-#endif /* CONFIG_SMP */
-#endif /* _LINUX_STOP_MACHINE */
+#endif /* CONFIG_STOP_MACHINE && CONFIG_SMP */
+#endif /* _LINUX_STOP_MACHINE */
index a987aa1..149e18e 100644 (file)
@@ -68,7 +68,7 @@ obj-$(CONFIG_USER_NS) += user_namespace.o
 obj-$(CONFIG_PID_NS) += pid_namespace.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
-obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
+obj-$(CONFIG_SMP) += stop_machine.o
 obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
index 914aedc..5457775 100644 (file)
@@ -266,9 +266,6 @@ int __ref cpu_down(unsigned int cpu)
 {
        int err;
 
-       err = stop_machine_create();
-       if (err)
-               return err;
        cpu_maps_update_begin();
 
        if (cpu_hotplug_disabled) {
@@ -280,7 +277,6 @@ int __ref cpu_down(unsigned int cpu)
 
 out:
        cpu_maps_update_done();
-       stop_machine_destroy();
        return err;
 }
 EXPORT_SYMBOL(cpu_down);
@@ -361,9 +357,6 @@ int disable_nonboot_cpus(void)
 {
        int cpu, first_cpu, error;
 
-       error = stop_machine_create();
-       if (error)
-               return error;
        cpu_maps_update_begin();
        first_cpu = cpumask_first(cpu_online_mask);
        /*
@@ -394,7 +387,6 @@ int disable_nonboot_cpus(void)
                printk(KERN_ERR "Non-boot CPUs are not disabled\n");
        }
        cpu_maps_update_done();
-       stop_machine_destroy();
        return error;
 }
 
index 1016b75..0838246 100644 (file)
@@ -723,16 +723,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
                return -EFAULT;
        name[MODULE_NAME_LEN-1] = '\0';
 
-       /* Create stop_machine threads since free_module relies on
-        * a non-failing stop_machine call. */
-       ret = stop_machine_create();
-       if (ret)
-               return ret;
-
-       if (mutex_lock_interruptible(&module_mutex) != 0) {
-               ret = -EINTR;
-               goto out_stop;
-       }
+       if (mutex_lock_interruptible(&module_mutex) != 0)
+               return -EINTR;
 
        mod = find_module(name);
        if (!mod) {
@@ -792,8 +784,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
 
  out:
        mutex_unlock(&module_mutex);
-out_stop:
-       stop_machine_destroy();
        return ret;
 }
 
index 58df55b..2b676f3 100644 (file)
@@ -669,7 +669,7 @@ static struct rcu_torture_ops sched_expedited_ops = {
        .sync           = synchronize_sched_expedited,
        .cb_barrier     = NULL,
        .fqs            = rcu_sched_force_quiescent_state,
-       .stats          = rcu_expedited_torture_stats,
+       .stats          = NULL,
        .irq_capable    = 1,
        .name           = "sched_expedited"
 };
index 11ac0eb..39aa9c7 100644 (file)
@@ -55,9 +55,9 @@
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
 #include <linux/percpu.h>
-#include <linux/kthread.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/stop_machine.h>
 #include <linux/sysctl.h>
 #include <linux/syscalls.h>
 #include <linux/times.h>
@@ -539,15 +539,13 @@ struct rq {
        int post_schedule;
        int active_balance;
        int push_cpu;
+       struct cpu_stop_work active_balance_work;
        /* cpu of this runqueue: */
        int cpu;
        int online;
 
        unsigned long avg_load_per_task;
 
-       struct task_struct *migration_thread;
-       struct list_head migration_queue;
-
        u64 rt_avg;
        u64 age_stamp;
        u64 idle_stamp;
@@ -2037,21 +2035,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
        __set_task_cpu(p, new_cpu);
 }
 
-struct migration_req {
-       struct list_head list;
-
+struct migration_arg {
        struct task_struct *task;
        int dest_cpu;
-
-       struct completion done;
 };
 
+static int migration_cpu_stop(void *data);
+
 /*
  * The task's runqueue lock must be held.
  * Returns true if you have to wait for migration thread.
  */
-static int
-migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
+static bool migrate_task(struct task_struct *p, int dest_cpu)
 {
        struct rq *rq = task_rq(p);
 
@@ -2059,15 +2054,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
         * If the task is not on a runqueue (and not running), then
         * the next wake-up will properly place the task.
         */
-       if (!p->se.on_rq && !task_running(rq, p))
-               return 0;
-
-       init_completion(&req->done);
-       req->task = p;
-       req->dest_cpu = dest_cpu;
-       list_add(&req->list, &rq->migration_queue);
-
-       return 1;
+       return p->se.on_rq || task_running(rq, p);
 }
 
 /*
@@ -3110,7 +3097,6 @@ static void update_cpu_load(struct rq *this_rq)
 void sched_exec(void)
 {
        struct task_struct *p = current;
-       struct migration_req req;
        unsigned long flags;
        struct rq *rq;
        int dest_cpu;
@@ -3124,17 +3110,11 @@ void sched_exec(void)
         * select_task_rq() can race against ->cpus_allowed
         */
        if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
-           likely(cpu_active(dest_cpu)) &&
-           migrate_task(p, dest_cpu, &req)) {
-               /* Need to wait for migration thread (might exit: take ref). */
-               struct task_struct *mt = rq->migration_thread;
+           likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
+               struct migration_arg arg = { p, dest_cpu };
 
-               get_task_struct(mt);
                task_rq_unlock(rq, &flags);
-               wake_up_process(mt);
-               put_task_struct(mt);
-               wait_for_completion(&req.done);
-
+               stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
                return;
        }
 unlock:
@@ -5290,17 +5270,15 @@ static inline void sched_init_granularity(void)
 /*
  * This is how migration works:
  *
- * 1) we queue a struct migration_req structure in the source CPU's
- *    runqueue and wake up that CPU's migration thread.
- * 2) we down() the locked semaphore => thread blocks.
- * 3) migration thread wakes up (implicitly it forces the migrated
- *    thread off the CPU)
- * 4) it gets the migration request and checks whether the migrated
- *    task is still in the wrong runqueue.
- * 5) if it's in the wrong runqueue then the migration thread removes
+ * 1) we invoke migration_cpu_stop() on the target CPU using
+ *    stop_one_cpu().
+ * 2) stopper starts to run (implicitly forcing the migrated thread
+ *    off the CPU)
+ * 3) it checks whether the migrated task is still in the wrong runqueue.
+ * 4) if it's in the wrong runqueue then the migration thread removes
  *    it and puts it into the right queue.
- * 6) migration thread up()s the semaphore.
- * 7) we wake up and the migration is done.
+ * 5) stopper completes and stop_one_cpu() returns and the migration
+ *    is done.
  */
 
 /*
@@ -5314,9 +5292,9 @@ static inline void sched_init_granularity(void)
  */
 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 {
-       struct migration_req req;
        unsigned long flags;
        struct rq *rq;
+       unsigned int dest_cpu;
        int ret = 0;
 
        /*
@@ -5354,15 +5332,12 @@ again:
        if (cpumask_test_cpu(task_cpu(p), new_mask))
                goto out;
 
-       if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
+       dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+       if (migrate_task(p, dest_cpu)) {
+               struct migration_arg arg = { p, dest_cpu };
                /* Need help from migration thread: drop lock and wait. */
-               struct task_struct *mt = rq->migration_thread;
-
-               get_task_struct(mt);
                task_rq_unlock(rq, &flags);
-               wake_up_process(mt);
-               put_task_struct(mt);
-               wait_for_completion(&req.done);
+               stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
                tlb_migrate_finish(p->mm);
                return 0;
        }
@@ -5420,70 +5395,22 @@ fail:
        return ret;
 }
 
-#define RCU_MIGRATION_IDLE     0
-#define RCU_MIGRATION_NEED_QS  1
-#define RCU_MIGRATION_GOT_QS   2
-#define RCU_MIGRATION_MUST_SYNC        3
-
 /*
- * migration_thread - this is a highprio system thread that performs
- * thread migration by bumping thread off CPU then 'pushing' onto
- * another runqueue.
+ * migration_cpu_stop - this will be executed by a highprio stopper thread
+ * and performs thread migration by bumping thread off CPU then
+ * 'pushing' onto another runqueue.
  */
-static int migration_thread(void *data)
+static int migration_cpu_stop(void *data)
 {
-       int badcpu;
-       int cpu = (long)data;
-       struct rq *rq;
-
-       rq = cpu_rq(cpu);
-       BUG_ON(rq->migration_thread != current);
-
-       set_current_state(TASK_INTERRUPTIBLE);
-       while (!kthread_should_stop()) {
-               struct migration_req *req;
-               struct list_head *head;
-
-               raw_spin_lock_irq(&rq->lock);
-
-               if (cpu_is_offline(cpu)) {
-                       raw_spin_unlock_irq(&rq->lock);
-                       break;
-               }
-
-               if (rq->active_balance) {
-                       active_load_balance(rq, cpu);
-                       rq->active_balance = 0;
-               }
-
-               head = &rq->migration_queue;
-
-               if (list_empty(head)) {
-                       raw_spin_unlock_irq(&rq->lock);
-                       schedule();
-                       set_current_state(TASK_INTERRUPTIBLE);
-                       continue;
-               }
-               req = list_entry(head->next, struct migration_req, list);
-               list_del_init(head->next);
-
-               if (req->task != NULL) {
-                       raw_spin_unlock(&rq->lock);
-                       __migrate_task(req->task, cpu, req->dest_cpu);
-               } else if (likely(cpu == (badcpu = smp_processor_id()))) {
-                       req->dest_cpu = RCU_MIGRATION_GOT_QS;
-                       raw_spin_unlock(&rq->lock);
-               } else {
-                       req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
-                       raw_spin_unlock(&rq->lock);
-                       WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
-               }
-               local_irq_enable();
-
-               complete(&req->done);
-       }
-       __set_current_state(TASK_RUNNING);
+       struct migration_arg *arg = data;
 
+       /*
+        * The original target cpu might have gone down and we might
+        * be on another cpu but it doesn't matter.
+        */
+       local_irq_disable();
+       __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
+       local_irq_enable();
        return 0;
 }
 
@@ -5850,35 +5777,20 @@ static void set_rq_offline(struct rq *rq)
 static int __cpuinit
 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
-       struct task_struct *p;
        int cpu = (long)hcpu;
        unsigned long flags;
-       struct rq *rq;
+       struct rq *rq = cpu_rq(cpu);
 
        switch (action) {
 
        case CPU_UP_PREPARE:
        case CPU_UP_PREPARE_FROZEN:
-               p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
-               if (IS_ERR(p))
-                       return NOTIFY_BAD;
-               kthread_bind(p, cpu);
-               /* Must be high prio: stop_machine expects to yield to it. */
-               rq = task_rq_lock(p, &flags);
-               __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
-               task_rq_unlock(rq, &flags);
-               get_task_struct(p);
-               cpu_rq(cpu)->migration_thread = p;
                rq->calc_load_update = calc_load_update;
                break;
 
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
-               /* Strictly unnecessary, as first user will wake it. */
-               wake_up_process(cpu_rq(cpu)->migration_thread);
-
                /* Update our root-domain */
-               rq = cpu_rq(cpu);
                raw_spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
                        BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -5889,25 +5801,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                break;
 
 #ifdef CONFIG_HOTPLUG_CPU
-       case CPU_UP_CANCELED:
-       case CPU_UP_CANCELED_FROZEN:
-               if (!cpu_rq(cpu)->migration_thread)
-                       break;
-               /* Unbind it from offline cpu so it can run. Fall thru. */
-               kthread_bind(cpu_rq(cpu)->migration_thread,
-                            cpumask_any(cpu_online_mask));
-               kthread_stop(cpu_rq(cpu)->migration_thread);
-               put_task_struct(cpu_rq(cpu)->migration_thread);
-               cpu_rq(cpu)->migration_thread = NULL;
-               break;
-
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
                migrate_live_tasks(cpu);
-               rq = cpu_rq(cpu);
-               kthread_stop(rq->migration_thread);
-               put_task_struct(rq->migration_thread);
-               rq->migration_thread = NULL;
                /* Idle task back to normal (off runqueue, low prio) */
                raw_spin_lock_irq(&rq->lock);
                deactivate_task(rq, rq->idle, 0);
@@ -5918,29 +5814,11 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                migrate_nr_uninterruptible(rq);
                BUG_ON(rq->nr_running != 0);
                calc_global_load_remove(rq);
-               /*
-                * No need to migrate the tasks: it was best-effort if
-                * they didn't take sched_hotcpu_mutex. Just wake up
-                * the requestors.
-                */
-               raw_spin_lock_irq(&rq->lock);
-               while (!list_empty(&rq->migration_queue)) {
-                       struct migration_req *req;
-
-                       req = list_entry(rq->migration_queue.next,
-                                        struct migration_req, list);
-                       list_del_init(&req->list);
-                       raw_spin_unlock_irq(&rq->lock);
-                       complete(&req->done);
-                       raw_spin_lock_irq(&rq->lock);
-               }
-               raw_spin_unlock_irq(&rq->lock);
                break;
 
        case CPU_DYING:
        case CPU_DYING_FROZEN:
                /* Update our root-domain */
-               rq = cpu_rq(cpu);
                raw_spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
                        BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -7757,10 +7635,8 @@ void __init sched_init(void)
                rq->push_cpu = 0;
                rq->cpu = i;
                rq->online = 0;
-               rq->migration_thread = NULL;
                rq->idle_stamp = 0;
                rq->avg_idle = 2*sysctl_sched_migration_cost;
-               INIT_LIST_HEAD(&rq->migration_queue);
                rq_attach_root(rq, &def_root_domain);
 #endif
                init_rq_hrtick(rq);
@@ -9054,43 +8930,32 @@ struct cgroup_subsys cpuacct_subsys = {
 
 #ifndef CONFIG_SMP
 
-int rcu_expedited_torture_stats(char *page)
-{
-       return 0;
-}
-EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
-
 void synchronize_sched_expedited(void)
 {
+       barrier();
 }
 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
 
 #else /* #ifndef CONFIG_SMP */
 
-static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
-static DEFINE_MUTEX(rcu_sched_expedited_mutex);
-
-#define RCU_EXPEDITED_STATE_POST -2
-#define RCU_EXPEDITED_STATE_IDLE -1
-
-static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
+static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
 
-int rcu_expedited_torture_stats(char *page)
+static int synchronize_sched_expedited_cpu_stop(void *data)
 {
-       int cnt = 0;
-       int cpu;
-
-       cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
-       for_each_online_cpu(cpu) {
-                cnt += sprintf(&page[cnt], " %d:%d",
-                               cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
-       }
-       cnt += sprintf(&page[cnt], "\n");
-       return cnt;
+       /*
+        * There must be a full memory barrier on each affected CPU
+        * between the time that try_stop_cpus() is called and the
+        * time that it returns.
+        *
+        * In the current initial implementation of cpu_stop, the
+        * above condition is already met when the control reaches
+        * this point and the following smp_mb() is not strictly
+        * necessary.  Do smp_mb() anyway for documentation and
+        * robustness against future implementation changes.
+        */
+       smp_mb(); /* See above comment block. */
+       return 0;
 }
-EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
-
-static long synchronize_sched_expedited_count;
 
 /*
  * Wait for an rcu-sched grace period to elapse, but use "big hammer"
@@ -9104,18 +8969,14 @@ static long synchronize_sched_expedited_count;
  */
 void synchronize_sched_expedited(void)
 {
-       int cpu;
-       unsigned long flags;
-       bool need_full_sync = 0;
-       struct rq *rq;
-       struct migration_req *req;
-       long snap;
-       int trycount = 0;
+       int snap, trycount = 0;
 
        smp_mb();  /* ensure prior mod happens before capturing snap. */
-       snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
+       snap = atomic_read(&synchronize_sched_expedited_count) + 1;
        get_online_cpus();
-       while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
+       while (try_stop_cpus(cpu_online_mask,
+                            synchronize_sched_expedited_cpu_stop,
+                            NULL) == -EAGAIN) {
                put_online_cpus();
                if (trycount++ < 10)
                        udelay(trycount * num_online_cpus());
@@ -9123,41 +8984,15 @@ void synchronize_sched_expedited(void)
                        synchronize_sched();
                        return;
                }
-               if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
+               if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
                        smp_mb(); /* ensure test happens before caller kfree */
                        return;
                }
                get_online_cpus();
        }
-       rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
-       for_each_online_cpu(cpu) {
-               rq = cpu_rq(cpu);
-               req = &per_cpu(rcu_migration_req, cpu);
-               init_completion(&req->done);
-               req->task = NULL;
-               req->dest_cpu = RCU_MIGRATION_NEED_QS;
-               raw_spin_lock_irqsave(&rq->lock, flags);
-               list_add(&req->list, &rq->migration_queue);
-               raw_spin_unlock_irqrestore(&rq->lock, flags);
-               wake_up_process(rq->migration_thread);
-       }
-       for_each_online_cpu(cpu) {
-               rcu_expedited_state = cpu;
-               req = &per_cpu(rcu_migration_req, cpu);
-               rq = cpu_rq(cpu);
-               wait_for_completion(&req->done);
-               raw_spin_lock_irqsave(&rq->lock, flags);
-               if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
-                       need_full_sync = 1;
-               req->dest_cpu = RCU_MIGRATION_IDLE;
-               raw_spin_unlock_irqrestore(&rq->lock, flags);
-       }
-       rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
-       synchronize_sched_expedited_count++;
-       mutex_unlock(&rcu_sched_expedited_mutex);
+       atomic_inc(&synchronize_sched_expedited_count);
+       smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
        put_online_cpus();
-       if (need_full_sync)
-               synchronize_sched();
 }
 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
 
index cbd8b8a..217e4a9 100644 (file)
@@ -2798,6 +2798,8 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
        return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
 }
 
+static int active_load_balance_cpu_stop(void *data);
+
 /*
  * Check this_cpu to ensure it is balanced within domain. Attempt to move
  * tasks if there is an imbalance.
@@ -2887,8 +2889,9 @@ redo:
                if (need_active_balance(sd, sd_idle, idle)) {
                        raw_spin_lock_irqsave(&busiest->lock, flags);
 
-                       /* don't kick the migration_thread, if the curr
-                        * task on busiest cpu can't be moved to this_cpu
+                       /* don't kick the active_load_balance_cpu_stop,
+                        * if the curr task on busiest cpu can't be
+                        * moved to this_cpu
                         */
                        if (!cpumask_test_cpu(this_cpu,
                                              &busiest->curr->cpus_allowed)) {
@@ -2898,14 +2901,22 @@ redo:
                                goto out_one_pinned;
                        }
 
+                       /*
+                        * ->active_balance synchronizes accesses to
+                        * ->active_balance_work.  Once set, it's cleared
+                        * only after active load balance is finished.
+                        */
                        if (!busiest->active_balance) {
                                busiest->active_balance = 1;
                                busiest->push_cpu = this_cpu;
                                active_balance = 1;
                        }
                        raw_spin_unlock_irqrestore(&busiest->lock, flags);
+
                        if (active_balance)
-                               wake_up_process(busiest->migration_thread);
+                               stop_one_cpu_nowait(cpu_of(busiest),
+                                       active_load_balance_cpu_stop, busiest,
+                                       &busiest->active_balance_work);
 
                        /*
                         * We've kicked active balancing, reset the failure
@@ -3012,24 +3023,29 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 }
 
 /*
- * active_load_balance is run by migration threads. It pushes running tasks
- * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
- * running on each physical CPU where possible, and avoids physical /
- * logical imbalances.
- *
- * Called with busiest_rq locked.
+ * active_load_balance_cpu_stop is run by cpu stopper. It pushes
+ * running tasks off the busiest CPU onto idle CPUs. It requires at
+ * least 1 task to be running on each physical CPU where possible, and
+ * avoids physical / logical imbalances.
  */
-static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
+static int active_load_balance_cpu_stop(void *data)
 {
+       struct rq *busiest_rq = data;
+       int busiest_cpu = cpu_of(busiest_rq);
        int target_cpu = busiest_rq->push_cpu;
+       struct rq *target_rq = cpu_rq(target_cpu);
        struct sched_domain *sd;
-       struct rq *target_rq;
+
+       raw_spin_lock_irq(&busiest_rq->lock);
+
+       /* make sure the requested cpu hasn't gone down in the meantime */
+       if (unlikely(busiest_cpu != smp_processor_id() ||
+                    !busiest_rq->active_balance))
+               goto out_unlock;
 
        /* Is there any task to move? */
        if (busiest_rq->nr_running <= 1)
-               return;
-
-       target_rq = cpu_rq(target_cpu);
+               goto out_unlock;
 
        /*
         * This condition is "impossible", if it occurs
@@ -3058,6 +3074,10 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
                        schedstat_inc(sd, alb_failed);
        }
        double_unlock_balance(busiest_rq, target_rq);
+out_unlock:
+       busiest_rq->active_balance = 0;
+       raw_spin_unlock_irq(&busiest_rq->lock);
+       return 0;
 }
 
 #ifdef CONFIG_NO_HZ
index 9bb9fb1..ef51d1f 100644 (file)
-/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
- * GPL v2 and any later version.
+/*
+ * kernel/stop_machine.c
+ *
+ * Copyright (C) 2008, 2005    IBM Corporation.
+ * Copyright (C) 2008, 2005    Rusty Russell rusty@rustcorp.com.au
+ * Copyright (C) 2010          SUSE Linux Products GmbH
+ * Copyright (C) 2010          Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2 and any later version.
  */
+#include <linux/completion.h>
 #include <linux/cpu.h>
-#include <linux/err.h>
+#include <linux/init.h>
 #include <linux/kthread.h>
 #include <linux/module.h>
+#include <linux/percpu.h>
 #include <linux/sched.h>
 #include <linux/stop_machine.h>
-#include <linux/syscalls.h>
 #include <linux/interrupt.h>
+#include <linux/kallsyms.h>
 
 #include <asm/atomic.h>
-#include <asm/uaccess.h>
+
+/*
+ * Structure to determine completion condition and record errors.  May
+ * be shared by works on different cpus.
+ */
+struct cpu_stop_done {
+       atomic_t                nr_todo;        /* nr left to execute */
+       bool                    executed;       /* actually executed? */
+       int                     ret;            /* collected return value */
+       struct completion       completion;     /* fired if nr_todo reaches 0 */
+};
+
+/* the actual stopper, one per every possible cpu, enabled on online cpus */
+struct cpu_stopper {
+       spinlock_t              lock;
+       struct list_head        works;          /* list of pending works */
+       struct task_struct      *thread;        /* stopper thread */
+       bool                    enabled;        /* is this stopper enabled? */
+};
+
+static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
+
+static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
+{
+       memset(done, 0, sizeof(*done));
+       atomic_set(&done->nr_todo, nr_todo);
+       init_completion(&done->completion);
+}
+
+/* signal completion unless @done is NULL */
+static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
+{
+       if (done) {
+               if (executed)
+                       done->executed = true;
+               if (atomic_dec_and_test(&done->nr_todo))
+                       complete(&done->completion);
+       }
+}
+
+/* queue @work to @stopper.  if offline, @work is completed immediately */
+static void cpu_stop_queue_work(struct cpu_stopper *stopper,
+                               struct cpu_stop_work *work)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&stopper->lock, flags);
+
+       if (stopper->enabled) {
+               list_add_tail(&work->list, &stopper->works);
+               wake_up_process(stopper->thread);
+       } else
+               cpu_stop_signal_done(work->done, false);
+
+       spin_unlock_irqrestore(&stopper->lock, flags);
+}
+
+/**
+ * stop_one_cpu - stop a cpu
+ * @cpu: cpu to stop
+ * @fn: function to execute
+ * @arg: argument to @fn
+ *
+ * Execute @fn(@arg) on @cpu.  @fn is run in a process context with
+ * the highest priority preempting any task on the cpu and
+ * monopolizing it.  This function returns after the execution is
+ * complete.
+ *
+ * This function doesn't guarantee @cpu stays online till @fn
+ * completes.  If @cpu goes down in the middle, execution may happen
+ * partially or fully on different cpus.  @fn should either be ready
+ * for that or the caller should ensure that @cpu stays online until
+ * this function completes.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * -ENOENT if @fn(@arg) was not executed because @cpu was offline;
+ * otherwise, the return value of @fn.
+ */
+int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
+{
+       struct cpu_stop_done done;
+       struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
+
+       cpu_stop_init_done(&done, 1);
+       cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work);
+       wait_for_completion(&done.completion);
+       return done.executed ? done.ret : -ENOENT;
+}
+
+/**
+ * stop_one_cpu_nowait - stop a cpu but don't wait for completion
+ * @cpu: cpu to stop
+ * @fn: function to execute
+ * @arg: argument to @fn
+ *
+ * Similar to stop_one_cpu() but doesn't wait for completion.  The
+ * caller is responsible for ensuring @work_buf is currently unused
+ * and will remain untouched until stopper starts executing @fn.
+ *
+ * CONTEXT:
+ * Don't care.
+ */
+void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
+                       struct cpu_stop_work *work_buf)
+{
+       *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
+       cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf);
+}
+
+/* static data for stop_cpus */
+static DEFINE_MUTEX(stop_cpus_mutex);
+static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
+
+int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
+{
+       struct cpu_stop_work *work;
+       struct cpu_stop_done done;
+       unsigned int cpu;
+
+       /* initialize works and done */
+       for_each_cpu(cpu, cpumask) {
+               work = &per_cpu(stop_cpus_work, cpu);
+               work->fn = fn;
+               work->arg = arg;
+               work->done = &done;
+       }
+       cpu_stop_init_done(&done, cpumask_weight(cpumask));
+
+       /*
+        * Disable preemption while queueing to avoid getting
+        * preempted by a stopper which might wait for other stoppers
+        * to enter @fn which can lead to deadlock.
+        */
+       preempt_disable();
+       for_each_cpu(cpu, cpumask)
+               cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
+                                   &per_cpu(stop_cpus_work, cpu));
+       preempt_enable();
+
+       wait_for_completion(&done.completion);
+       return done.executed ? done.ret : -ENOENT;
+}
+
+/**
+ * stop_cpus - stop multiple cpus
+ * @cpumask: cpus to stop
+ * @fn: function to execute
+ * @arg: argument to @fn
+ *
+ * Execute @fn(@arg) on online cpus in @cpumask.  On each target cpu,
+ * @fn is run in a process context with the highest priority
+ * preempting any task on the cpu and monopolizing it.  This function
+ * returns after all executions are complete.
+ *
+ * This function doesn't guarantee the cpus in @cpumask stay online
+ * till @fn completes.  If some cpus go down in the middle, execution
+ * on the cpu may happen partially or fully on different cpus.  @fn
+ * should either be ready for that or the caller should ensure that
+ * the cpus stay online until this function completes.
+ *
+ * All stop_cpus() calls are serialized making it safe for @fn to wait
+ * for all cpus to start executing it.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * -ENOENT if @fn(@arg) was not executed at all because all cpus in
+ * @cpumask were offline; otherwise, 0 if all executions of @fn
+ * returned 0, any non zero return value if any returned non zero.
+ */
+int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
+{
+       int ret;
+
+       /* static works are used, process one request at a time */
+       mutex_lock(&stop_cpus_mutex);
+       ret = __stop_cpus(cpumask, fn, arg);
+       mutex_unlock(&stop_cpus_mutex);
+       return ret;
+}
+
+/**
+ * try_stop_cpus - try to stop multiple cpus
+ * @cpumask: cpus to stop
+ * @fn: function to execute
+ * @arg: argument to @fn
+ *
+ * Identical to stop_cpus() except that it fails with -EAGAIN if
+ * someone else is already using the facility.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * -EAGAIN if someone else is already stopping cpus, -ENOENT if
+ * @fn(@arg) was not executed at all because all cpus in @cpumask were
+ * offline; otherwise, 0 if all executions of @fn returned 0, any non
+ * zero return value if any returned non zero.
+ */
+int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
+{
+       int ret;
+
+       /* static works are used, process one request at a time */
+       if (!mutex_trylock(&stop_cpus_mutex))
+               return -EAGAIN;
+       ret = __stop_cpus(cpumask, fn, arg);
+       mutex_unlock(&stop_cpus_mutex);
+       return ret;
+}
+
+static int cpu_stopper_thread(void *data)
+{
+       struct cpu_stopper *stopper = data;
+       struct cpu_stop_work *work;
+       int ret;
+
+repeat:
+       set_current_state(TASK_INTERRUPTIBLE);  /* mb paired w/ kthread_stop */
+
+       if (kthread_should_stop()) {
+               __set_current_state(TASK_RUNNING);
+               return 0;
+       }
+
+       work = NULL;
+       spin_lock_irq(&stopper->lock);
+       if (!list_empty(&stopper->works)) {
+               work = list_first_entry(&stopper->works,
+                                       struct cpu_stop_work, list);
+               list_del_init(&work->list);
+       }
+       spin_unlock_irq(&stopper->lock);
+
+       if (work) {
+               cpu_stop_fn_t fn = work->fn;
+               void *arg = work->arg;
+               struct cpu_stop_done *done = work->done;
+               char ksym_buf[KSYM_NAME_LEN];
+
+               __set_current_state(TASK_RUNNING);
+
+               /* cpu stop callbacks are not allowed to sleep */
+               preempt_disable();
+
+               ret = fn(arg);
+               if (ret)
+                       done->ret = ret;
+
+               /* restore preemption and check it's still balanced */
+               preempt_enable();
+               WARN_ONCE(preempt_count(),
+                         "cpu_stop: %s(%p) leaked preempt count\n",
+                         kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL,
+                                         ksym_buf), arg);
+
+               cpu_stop_signal_done(done, true);
+       } else
+               schedule();
+
+       goto repeat;
+}
+
+/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
+static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
+                                          unsigned long action, void *hcpu)
+{
+       struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+       unsigned int cpu = (unsigned long)hcpu;
+       struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+       struct cpu_stop_work *work;
+       struct task_struct *p;
+
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_UP_PREPARE:
+               BUG_ON(stopper->thread || stopper->enabled ||
+                      !list_empty(&stopper->works));
+               p = kthread_create(cpu_stopper_thread, stopper, "migration/%d",
+                                  cpu);
+               if (IS_ERR(p))
+                       return NOTIFY_BAD;
+               sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
+               get_task_struct(p);
+               stopper->thread = p;
+               break;
+
+       case CPU_ONLINE:
+               kthread_bind(stopper->thread, cpu);
+               /* strictly unnecessary, as first user will wake it */
+               wake_up_process(stopper->thread);
+               /* mark enabled */
+               spin_lock_irq(&stopper->lock);
+               stopper->enabled = true;
+               spin_unlock_irq(&stopper->lock);
+               break;
+
+#ifdef CONFIG_HOTPLUG_CPU
+       case CPU_UP_CANCELED:
+       case CPU_DEAD:
+               /* kill the stopper */
+               kthread_stop(stopper->thread);
+               /* drain remaining works */
+               spin_lock_irq(&stopper->lock);
+               list_for_each_entry(work, &stopper->works, list)
+                       cpu_stop_signal_done(work->done, false);
+               stopper->enabled = false;
+               spin_unlock_irq(&stopper->lock);
+               /* release the stopper */
+               put_task_struct(stopper->thread);
+               stopper->thread = NULL;
+               break;
+#endif
+       }
+
+       return NOTIFY_OK;
+}
+
+/*
+ * Give it a higher priority so that cpu stopper is available to other
+ * cpu notifiers.  It currently shares the same priority as sched
+ * migration_notifier.
+ */
+static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = {
+       .notifier_call  = cpu_stop_cpu_callback,
+       .priority       = 10,
+};
+
+static int __init cpu_stop_init(void)
+{
+       void *bcpu = (void *)(long)smp_processor_id();
+       unsigned int cpu;
+       int err;
+
+       for_each_possible_cpu(cpu) {
+               struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+
+               spin_lock_init(&stopper->lock);
+               INIT_LIST_HEAD(&stopper->works);
+       }
+
+       /* start one for the boot cpu */
+       err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
+                                   bcpu);
+       BUG_ON(err == NOTIFY_BAD);
+       cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
+       register_cpu_notifier(&cpu_stop_cpu_notifier);
+
+       return 0;
+}
+early_initcall(cpu_stop_init);
+
+#ifdef CONFIG_STOP_MACHINE
 
 /* This controls the threads on each CPU. */
 enum stopmachine_state {
@@ -26,174 +390,94 @@ enum stopmachine_state {
        /* Exit */
        STOPMACHINE_EXIT,
 };
-static enum stopmachine_state state;
 
 struct stop_machine_data {
-       int (*fn)(void *);
-       void *data;
-       int fnret;
+       int                     (*fn)(void *);
+       void                    *data;
+       /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
+       unsigned int            num_threads;
+       const struct cpumask    *active_cpus;
+
+       enum stopmachine_state  state;
+       atomic_t                thread_ack;
 };
 
-/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
-static unsigned int num_threads;
-static atomic_t thread_ack;
-static DEFINE_MUTEX(lock);
-/* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */
-static DEFINE_MUTEX(setup_lock);
-/* Users of stop_machine. */
-static int refcount;
-static struct workqueue_struct *stop_machine_wq;
-static struct stop_machine_data active, idle;
-static const struct cpumask *active_cpus;
-static void __percpu *stop_machine_work;
-
-static void set_state(enum stopmachine_state newstate)
+static void set_state(struct stop_machine_data *smdata,
+                     enum stopmachine_state newstate)
 {
        /* Reset ack counter. */
-       atomic_set(&thread_ack, num_threads);
+       atomic_set(&smdata->thread_ack, smdata->num_threads);
        smp_wmb();
-       state = newstate;
+       smdata->state = newstate;
 }
 
 /* Last one to ack a state moves to the next state. */
-static void ack_state(void)
+static void ack_state(struct stop_machine_data *smdata)
 {
-       if (atomic_dec_and_test(&thread_ack))
-               set_state(state + 1);
+       if (atomic_dec_and_test(&smdata->thread_ack))
+               set_state(smdata, smdata->state + 1);
 }
 
-/* This is the actual function which stops the CPU. It runs
- * in the context of a dedicated stopmachine workqueue. */
-static void stop_cpu(struct work_struct *unused)
+/* This is the cpu_stop function which stops the CPU. */
+static int stop_machine_cpu_stop(void *data)
 {
+       struct stop_machine_data *smdata = data;
        enum stopmachine_state curstate = STOPMACHINE_NONE;
-       struct stop_machine_data *smdata = &idle;
-       int cpu = smp_processor_id();
-       int err;
+       int cpu = smp_processor_id(), err = 0;
+       bool is_active;
+
+       if (!smdata->active_cpus)
+               is_active = cpu == cpumask_first(cpu_online_mask);
+       else
+               is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
 
-       if (!active_cpus) {
-               if (cpu == cpumask_first(cpu_online_mask))
-                       smdata = &active;
-       } else {
-               if (cpumask_test_cpu(cpu, active_cpus))
-                       smdata = &active;
-       }
        /* Simple state machine */
        do {
                /* Chill out and ensure we re-read stopmachine_state. */
                cpu_relax();
-               if (state != curstate) {
-                       curstate = state;
+               if (smdata->state != curstate) {
+                       curstate = smdata->state;
                        switch (curstate) {
                        case STOPMACHINE_DISABLE_IRQ:
                                local_irq_disable();
                                hard_irq_disable();
                                break;
                        case STOPMACHINE_RUN:
-                               /* On multiple CPUs only a single error code
-                                * is needed to tell that something failed. */
-                               err = smdata->fn(smdata->data);
-                               if (err)
-                                       smdata->fnret = err;
+                               if (is_active)
+                                       err = smdata->fn(smdata->data);
                                break;
                        default:
                                break;
                        }
-                       ack_state();
+                       ack_state(smdata);
                }
        } while (curstate != STOPMACHINE_EXIT);
 
        local_irq_enable();
+       return err;
 }
 
-/* Callback for CPUs which aren't supposed to do anything. */
-static int chill(void *unused)
-{
-       return 0;
-}
-
-int stop_machine_create(void)
-{
-       mutex_lock(&setup_lock);
-       if (refcount)
-               goto done;
-       stop_machine_wq = create_rt_workqueue("kstop");
-       if (!stop_machine_wq)
-               goto err_out;
-       stop_machine_work = alloc_percpu(struct work_struct);
-       if (!stop_machine_work)
-               goto err_out;
-done:
-       refcount++;
-       mutex_unlock(&setup_lock);
-       return 0;
-
-err_out:
-       if (stop_machine_wq)
-               destroy_workqueue(stop_machine_wq);
-       mutex_unlock(&setup_lock);
-       return -ENOMEM;
-}
-EXPORT_SYMBOL_GPL(stop_machine_create);
-
-void stop_machine_destroy(void)
-{
-       mutex_lock(&setup_lock);
-       refcount--;
-       if (refcount)
-               goto done;
-       destroy_workqueue(stop_machine_wq);
-       free_percpu(stop_machine_work);
-done:
-       mutex_unlock(&setup_lock);
-}
-EXPORT_SYMBOL_GPL(stop_machine_destroy);
-
 int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
 {
-       struct work_struct *sm_work;
-       int i, ret;
-
-       /* Set up initial state. */
-       mutex_lock(&lock);
-       num_threads = num_online_cpus();
-       active_cpus = cpus;
-       active.fn = fn;
-       active.data = data;
-       active.fnret = 0;
-       idle.fn = chill;
-       idle.data = NULL;
-
-       set_state(STOPMACHINE_PREPARE);
-
-       /* Schedule the stop_cpu work on all cpus: hold this CPU so one
-        * doesn't hit this CPU until we're ready. */
-       get_cpu();
-       for_each_online_cpu(i) {
-               sm_work = per_cpu_ptr(stop_machine_work, i);
-               INIT_WORK(sm_work, stop_cpu);
-               queue_work_on(i, stop_machine_wq, sm_work);
-       }
-       /* This will release the thread on our CPU. */
-       put_cpu();
-       flush_workqueue(stop_machine_wq);
-       ret = active.fnret;
-       mutex_unlock(&lock);
-       return ret;
+       struct stop_machine_data smdata = { .fn = fn, .data = data,
+                                           .num_threads = num_online_cpus(),
+                                           .active_cpus = cpus };
+
+       /* Set the initial state and stop all online cpus. */
+       set_state(&smdata, STOPMACHINE_PREPARE);
+       return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
 }
 
 int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
 {
        int ret;
 
-       ret = stop_machine_create();
-       if (ret)
-               return ret;
        /* No CPUs can come up or down during this. */
        get_online_cpus();
        ret = __stop_machine(fn, data, cpus);
        put_online_cpus();
-       stop_machine_destroy();
        return ret;
 }
 EXPORT_SYMBOL_GPL(stop_machine);
+
+#endif /* CONFIG_STOP_MACHINE */