Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6
[safe/jmp/linux-2.6] / drivers / cpufreq / cpufreq.c
index 346906a..47d2ad0 100644 (file)
@@ -4,13 +4,17 @@
  *  Copyright (C) 2001 Russell King
  *            (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
  *
+ *  Oct 2005 - Ashok Raj <ashok.raj@intel.com>
+ *     Added handling for CPU hotplug
+ *  Feb 2006 - Jacob Shin <jacob.shin@amd.com>
+ *     Fix handling for CPU hotplug -- affected CPUs
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  *
  */
 
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/cpu.h>
 #include <linux/completion.h>
+#include <linux/mutex.h>
 
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_CORE, "cpufreq-core", msg)
+#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_CORE, \
+                                               "cpufreq-core", msg)
 
 /**
- * The "cpufreq driver" - the arch- or hardware-dependend low
+ * The "cpufreq driver" - the arch- or hardware-dependent low
  * level driver of CPUFreq support, and its spinlock. This lock
  * also protects the cpufreq_cpu_data array.
  */
-static struct cpufreq_driver           *cpufreq_driver;
-static struct cpufreq_policy   *cpufreq_cpu_data[NR_CPUS];
+static struct cpufreq_driver *cpufreq_driver;
+static DEFINE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_data);
+#ifdef CONFIG_HOTPLUG_CPU
+/* This one keeps track of the previously set governor of a removed CPU */
+static DEFINE_PER_CPU(struct cpufreq_governor *, cpufreq_cpu_governor);
+#endif
 static DEFINE_SPINLOCK(cpufreq_driver_lock);
 
+/*
+ * cpu_policy_rwsem is a per CPU reader-writer semaphore designed to cure
+ * all cpufreq/hotplug/workqueue/etc related lock issues.
+ *
+ * The rules for this semaphore:
+ * - Any routine that wants to read from the policy structure will
+ *   do a down_read on this semaphore.
+ * - Any routine that will write to the policy structure and/or may take away
+ *   the policy altogether (eg. CPU hotplug), will hold this lock in write
+ *   mode before doing so.
+ *
+ * Additional rules:
+ * - All holders of the lock should check to make sure that the CPU they
+ *   are concerned with is online after they get the lock.
+ * - Governor routines that can be called in the cpufreq hotplug path should
+ *   not take this semaphore, because the top-level hotplug notifier handler
+ *   already takes it.
+ *
+ * policy_cpu holds, for each CPU, the CPU whose cpu_policy_rwsem the
+ * lock/unlock helpers below operate on; those helpers BUG() if it is -1.
+ */
+static DEFINE_PER_CPU(int, policy_cpu);
+static DEFINE_PER_CPU(struct rw_semaphore, cpu_policy_rwsem);
+
+/*
+ * lock_policy_rwsem(mode, cpu) - generate lock_policy_rwsem_<mode>(int cpu).
+ *
+ * The generated function looks up policy_cpu for @cpu (BUG() if it is -1)
+ * and takes that CPU's cpu_policy_rwsem with down_<mode>().  If @cpu is
+ * not online once the semaphore is held, the semaphore is released again
+ * and -1 is returned; otherwise 0 is returned with the semaphore held.
+ */
+#define lock_policy_rwsem(mode, cpu)                                   \
+int lock_policy_rwsem_##mode                                           \
+(int cpu)                                                              \
+{                                                                      \
+       int policy_cpu = per_cpu(policy_cpu, cpu);                      \
+       BUG_ON(policy_cpu == -1);                                       \
+       down_##mode(&per_cpu(cpu_policy_rwsem, policy_cpu));            \
+       if (unlikely(!cpu_online(cpu))) {                               \
+               up_##mode(&per_cpu(cpu_policy_rwsem, policy_cpu));      \
+               return -1;                                              \
+       }                                                               \
+                                                                       \
+       return 0;                                                       \
+}
+
+/* Instantiates lock_policy_rwsem_read(). */
+lock_policy_rwsem(read, cpu);
+EXPORT_SYMBOL_GPL(lock_policy_rwsem_read);
+
+/* Instantiates lock_policy_rwsem_write(). */
+lock_policy_rwsem(write, cpu);
+EXPORT_SYMBOL_GPL(lock_policy_rwsem_write);
+
+/*
+ * unlock_policy_rwsem_read - release the policy rwsem held in read mode.
+ *
+ * Looks up policy_cpu for @cpu (BUG() if it is -1) and calls up_read() on
+ * that CPU's cpu_policy_rwsem.  Unlike the lock side, the online state of
+ * @cpu is not checked here.
+ */
+void unlock_policy_rwsem_read(int cpu)
+{
+       int policy_cpu = per_cpu(policy_cpu, cpu);
+       BUG_ON(policy_cpu == -1);
+       up_read(&per_cpu(cpu_policy_rwsem, policy_cpu));
+}
+EXPORT_SYMBOL_GPL(unlock_policy_rwsem_read);
+
+/*
+ * unlock_policy_rwsem_write - release the policy rwsem held in write mode.
+ *
+ * Looks up policy_cpu for @cpu (BUG() if it is -1) and calls up_write() on
+ * that CPU's cpu_policy_rwsem.  Unlike the lock side, the online state of
+ * @cpu is not checked here.
+ */
+void unlock_policy_rwsem_write(int cpu)
+{
+       int policy_cpu = per_cpu(policy_cpu, cpu);
+       BUG_ON(policy_cpu == -1);
+       up_write(&per_cpu(cpu_policy_rwsem, policy_cpu));
+}
+EXPORT_SYMBOL_GPL(unlock_policy_rwsem_write);
+
 
 /* internal prototypes */
-static int __cpufreq_governor(struct cpufreq_policy *policy, unsigned int event);
-static void handle_update(void *data);
-static inline void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci);
+static int __cpufreq_governor(struct cpufreq_policy *policy,
+               unsigned int event);
+static unsigned int __cpufreq_get(unsigned int cpu);
+static void handle_update(struct work_struct *work);
 
 /**
- * Two notifier lists: the "policy" list is involved in the 
- * validation process for a new CPU frequency policy; the 
+ * Two notifier lists: the "policy" list is involved in the
+ * validation process for a new CPU frequency policy; the
  * "transition" list for kernel code that needs to handle
  * changes to devices when the CPU clock speed changes.
  * The mutex locks both lists.
  */
-static struct notifier_block    *cpufreq_policy_notifier_list;
-static struct notifier_block    *cpufreq_transition_notifier_list;
-static DECLARE_RWSEM           (cpufreq_notifier_rwsem);
+static BLOCKING_NOTIFIER_HEAD(cpufreq_policy_notifier_list);
+static struct srcu_notifier_head cpufreq_transition_notifier_list;
 
+/*
+ * Set to true once init_cpufreq_transition_notifier_list() has run, i.e.
+ * once cpufreq_transition_notifier_list has been initialized.
+ */
+static bool init_cpufreq_transition_notifier_list_called;
+/*
+ * Initializes cpufreq_transition_notifier_list with
+ * srcu_init_notifier_head() and records that in
+ * init_cpufreq_transition_notifier_list_called.  Registered through
+ * pure_initcall(); always returns 0.
+ */
+static int __init init_cpufreq_transition_notifier_list(void)
+{
+       srcu_init_notifier_head(&cpufreq_transition_notifier_list);
+       init_cpufreq_transition_notifier_list_called = true;
+       return 0;
+}
+pure_initcall(init_cpufreq_transition_notifier_list);
 
 static LIST_HEAD(cpufreq_governor_list);
-static DECLARE_MUTEX           (cpufreq_governor_sem);
+static DEFINE_MUTEX(cpufreq_governor_mutex);
 
-struct cpufreq_policy * cpufreq_cpu_get(unsigned int cpu)
+struct cpufreq_policy *cpufreq_cpu_get(unsigned int cpu)
 {
        struct cpufreq_policy *data;
        unsigned long flags;
 
-       if (cpu >= NR_CPUS)
+       if (cpu >= nr_cpu_ids)
                goto err_out;
 
        /* get the cpufreq driver */
@@ -75,7 +150,7 @@ struct cpufreq_policy * cpufreq_cpu_get(unsigned int cpu)
 
 
        /* get the CPU */
-       data = cpufreq_cpu_data[cpu];
+       data = per_cpu(cpufreq_cpu_data, cpu);
 
        if (!data)
                goto err_out_put_module;
@@ -83,20 +158,19 @@ struct cpufreq_policy * cpufreq_cpu_get(unsigned int cpu)
        if (!kobject_get(&data->kobj))
                goto err_out_put_module;
 
-
        spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
-
        return data;
 
- err_out_put_module:
+err_out_put_module:
        module_put(cpufreq_driver->owner);
- err_out_unlock:
+err_out_unlock:
        spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
- err_out:
+err_out:
        return NULL;
 }
 EXPORT_SYMBOL_GPL(cpufreq_cpu_get);
 
+
 void cpufreq_cpu_put(struct cpufreq_policy *data)
 {
        kobject_put(&data->kobj);
@@ -125,7 +199,7 @@ static unsigned int debug_ratelimit = 1;
 static unsigned int disable_ratelimit = 1;
 static DEFINE_SPINLOCK(disable_ratelimit_lock);
 
-static inline void cpufreq_debug_enable_ratelimit(void)
+static void cpufreq_debug_enable_ratelimit(void)
 {
        unsigned long flags;
 
@@ -135,7 +209,7 @@ static inline void cpufreq_debug_enable_ratelimit(void)
        spin_unlock_irqrestore(&disable_ratelimit_lock, flags);
 }
 
-static inline void cpufreq_debug_disable_ratelimit(void)
+static void cpufreq_debug_disable_ratelimit(void)
 {
        unsigned long flags;
 
@@ -144,17 +218,19 @@ static inline void cpufreq_debug_disable_ratelimit(void)
        spin_unlock_irqrestore(&disable_ratelimit_lock, flags);
 }
 
-void cpufreq_debug_printk(unsigned int type, const char *prefix, const char *fmt, ...)
+void cpufreq_debug_printk(unsigned int type, const char *prefix,
+                       const char *fmt, ...)
 {
        char s[256];
        va_list args;
        unsigned int len;
        unsigned long flags;
-       
+
        WARN_ON(!prefix);
        if (type & debug) {
                spin_lock_irqsave(&disable_ratelimit_lock, flags);
-               if (!disable_ratelimit && debug_ratelimit && !printk_ratelimit()) {
+               if (!disable_ratelimit && debug_ratelimit
+                                       && !printk_ratelimit()) {
                        spin_unlock_irqrestore(&disable_ratelimit_lock, flags);
                        return;
                }
@@ -175,10 +251,12 @@ EXPORT_SYMBOL(cpufreq_debug_printk);
 
 
 module_param(debug, uint, 0644);
-MODULE_PARM_DESC(debug, "CPUfreq debugging: add 1 to debug core, 2 to debug drivers, and 4 to debug governors.");
+MODULE_PARM_DESC(debug, "CPUfreq debugging: add 1 to debug core,"
+                       " 2 to debug drivers, and 4 to debug governors.");
 
 module_param(debug_ratelimit, uint, 0644);
-MODULE_PARM_DESC(debug_ratelimit, "CPUfreq debugging: set to 0 to disable ratelimiting.");
+MODULE_PARM_DESC(debug_ratelimit, "CPUfreq debugging:"
+                                       " set to 0 to disable ratelimiting.");
 
 #else /* !CONFIG_CPU_FREQ_DEBUG */
 
@@ -197,14 +275,14 @@ static inline void cpufreq_debug_disable_ratelimit(void) { return; }
  *
  * This function alters the system "loops_per_jiffy" for the clock
  * speed change. Note that loops_per_jiffy cannot be updated on SMP
- * systems as each CPU might be scaled differently. So, use the arch 
+ * systems as each CPU might be scaled differently. So, use the arch
  * per-CPU loops_per_jiffy value wherever possible.
  */
 #ifndef CONFIG_SMP
 static unsigned long l_p_j_ref;
 static unsigned int  l_p_j_ref_freq;
 
-static inline void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci)
+static void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci)
 {
        if (ci->flags & CPUFREQ_CONST_LOOPS)
                return;
@@ -212,62 +290,74 @@ static inline void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci)
        if (!l_p_j_ref_freq) {
                l_p_j_ref = loops_per_jiffy;
                l_p_j_ref_freq = ci->old;
-               dprintk("saving %lu as reference value for loops_per_jiffy; freq is %u kHz\n", l_p_j_ref, l_p_j_ref_freq);
+               dprintk("saving %lu as reference value for loops_per_jiffy; "
+                       "freq is %u kHz\n", l_p_j_ref, l_p_j_ref_freq);
        }
        if ((val == CPUFREQ_PRECHANGE  && ci->old < ci->new) ||
            (val == CPUFREQ_POSTCHANGE && ci->old > ci->new) ||
            (val == CPUFREQ_RESUMECHANGE || val == CPUFREQ_SUSPENDCHANGE)) {
-               loops_per_jiffy = cpufreq_scale(l_p_j_ref, l_p_j_ref_freq, ci->new);
-               dprintk("scaling loops_per_jiffy to %lu for frequency %u kHz\n", loops_per_jiffy, ci->new);
+               loops_per_jiffy = cpufreq_scale(l_p_j_ref, l_p_j_ref_freq,
+                                                               ci->new);
+               dprintk("scaling loops_per_jiffy to %lu "
+                       "for frequency %u kHz\n", loops_per_jiffy, ci->new);
        }
 }
 #else
-static inline void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci) { return; }
+static inline void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci)
+{
+       return;
+}
 #endif
 
 
 /**
- * cpufreq_notify_transition - call notifier chain and adjust_jiffies on frequency transition
+ * cpufreq_notify_transition - call notifier chain and adjust_jiffies
+ * on frequency transition.
  *
- * This function calls the transition notifiers and the "adjust_jiffies" function. It is called
- * twice on all CPU frequency changes that have external effects. 
+ * This function calls the transition notifiers and the "adjust_jiffies"
+ * function. It is called twice on all CPU frequency changes that have
+ * external effects.
  */
 void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state)
 {
+       struct cpufreq_policy *policy;
+
        BUG_ON(irqs_disabled());
 
        freqs->flags = cpufreq_driver->flags;
-       dprintk("notification %u of frequency transition to %u kHz\n", state, freqs->new);
+       dprintk("notification %u of frequency transition to %u kHz\n",
+               state, freqs->new);
 
-       down_read(&cpufreq_notifier_rwsem);
+       policy = per_cpu(cpufreq_cpu_data, freqs->cpu);
        switch (state) {
+
        case CPUFREQ_PRECHANGE:
-               /* detect if the driver reported a value as "old frequency" which
-                * is not equal to what the cpufreq core thinks is "old frequency".
+               /* detect if the driver reported a value as "old frequency"
+                * which is not equal to what the cpufreq core thinks is
+                * "old frequency".
                 */
                if (!(cpufreq_driver->flags & CPUFREQ_CONST_LOOPS)) {
-                       if ((likely(cpufreq_cpu_data[freqs->cpu])) &&
-                           (likely(cpufreq_cpu_data[freqs->cpu]->cpu == freqs->cpu)) &&
-                           (likely(cpufreq_cpu_data[freqs->cpu]->cur)) &&
-                           (unlikely(freqs->old != cpufreq_cpu_data[freqs->cpu]->cur)))
-                       {
-                               dprintk(KERN_WARNING "Warning: CPU frequency is %u, "
-                                      "cpufreq assumed %u kHz.\n", freqs->old, cpufreq_cpu_data[freqs->cpu]->cur);
-                               freqs->old = cpufreq_cpu_data[freqs->cpu]->cur;
+                       if ((policy) && (policy->cpu == freqs->cpu) &&
+                           (policy->cur) && (policy->cur != freqs->old)) {
+                               dprintk("Warning: CPU frequency is"
+                                       " %u, cpufreq assumed %u kHz.\n",
+                                       freqs->old, policy->cur);
+                               freqs->old = policy->cur;
                        }
                }
-               notifier_call_chain(&cpufreq_transition_notifier_list, CPUFREQ_PRECHANGE, freqs);
+               srcu_notifier_call_chain(&cpufreq_transition_notifier_list,
+                               CPUFREQ_PRECHANGE, freqs);
                adjust_jiffies(CPUFREQ_PRECHANGE, freqs);
                break;
+
        case CPUFREQ_POSTCHANGE:
                adjust_jiffies(CPUFREQ_POSTCHANGE, freqs);
-               notifier_call_chain(&cpufreq_transition_notifier_list, CPUFREQ_POSTCHANGE, freqs);
-               if ((likely(cpufreq_cpu_data[freqs->cpu])) && 
-                   (likely(cpufreq_cpu_data[freqs->cpu]->cpu == freqs->cpu)))
-                       cpufreq_cpu_data[freqs->cpu]->cur = freqs->new;
+               srcu_notifier_call_chain(&cpufreq_transition_notifier_list,
+                               CPUFREQ_POSTCHANGE, freqs);
+               if (likely(policy) && likely(policy->cpu == freqs->cpu))
+                       policy->cur = freqs->new;
                break;
        }
-       up_read(&cpufreq_notifier_rwsem);
 }
 EXPORT_SYMBOL_GPL(cpufreq_notify_transition);
 
@@ -277,73 +367,105 @@ EXPORT_SYMBOL_GPL(cpufreq_notify_transition);
  *                          SYSFS INTERFACE                          *
  *********************************************************************/
 
+/*
+ * __find_governor - look up a registered governor by name.
+ *
+ * Walks cpufreq_governor_list and returns the first entry whose name
+ * matches @str_governor, compared case-insensitively over at most
+ * CPUFREQ_NAME_LEN characters.  Returns NULL if no entry matches.
+ *
+ * No lock is taken here; cpufreq_parse_governor() calls this with
+ * cpufreq_governor_mutex held.
+ */
+static struct cpufreq_governor *__find_governor(const char *str_governor)
+{
+       struct cpufreq_governor *t;
+
+       list_for_each_entry(t, &cpufreq_governor_list, governor_list)
+               if (!strnicmp(str_governor, t->name, CPUFREQ_NAME_LEN))
+                       return t;
+
+       return NULL;
+}
+
 /**
  * cpufreq_parse_governor - parse a governor string
  */
-static int cpufreq_parse_governor (char *str_governor, unsigned int *policy,
+static int cpufreq_parse_governor(char *str_governor, unsigned int *policy,
                                struct cpufreq_governor **governor)
 {
+       int err = -EINVAL;
+
        if (!cpufreq_driver)
-               return -EINVAL;
+               goto out;
+
        if (cpufreq_driver->setpolicy) {
                if (!strnicmp(str_governor, "performance", CPUFREQ_NAME_LEN)) {
                        *policy = CPUFREQ_POLICY_PERFORMANCE;
-                       return 0;
-               } else if (!strnicmp(str_governor, "powersave", CPUFREQ_NAME_LEN)) {
+                       err = 0;
+               } else if (!strnicmp(str_governor, "powersave",
+                                               CPUFREQ_NAME_LEN)) {
                        *policy = CPUFREQ_POLICY_POWERSAVE;
-                       return 0;
+                       err = 0;
                }
-               return -EINVAL;
-       } else {
+       } else if (cpufreq_driver->target) {
                struct cpufreq_governor *t;
-               down(&cpufreq_governor_sem);
-               if (!cpufreq_driver || !cpufreq_driver->target)
-                       goto out;
-               list_for_each_entry(t, &cpufreq_governor_list, governor_list) {
-                       if (!strnicmp(str_governor,t->name,CPUFREQ_NAME_LEN)) {
-                               *governor = t;
-                               up(&cpufreq_governor_sem);
-                               return 0;
+
+               mutex_lock(&cpufreq_governor_mutex);
+
+               t = __find_governor(str_governor);
+
+               if (t == NULL) {
+                       char *name = kasprintf(GFP_KERNEL, "cpufreq_%s",
+                                                               str_governor);
+
+                       if (name) {
+                               int ret;
+
+                               mutex_unlock(&cpufreq_governor_mutex);
+                               ret = request_module("%s", name);
+                               mutex_lock(&cpufreq_governor_mutex);
+
+                               if (ret == 0)
+                                       t = __find_governor(str_governor);
                        }
+
+                       kfree(name);
                }
-       out:
-               up(&cpufreq_governor_sem);
-       }
-       return -EINVAL;
-}
-EXPORT_SYMBOL_GPL(cpufreq_parse_governor);
 
+               if (t != NULL) {
+                       *governor = t;
+                       err = 0;
+               }
 
-/* drivers/base/cpu.c */
-extern struct sysdev_class cpu_sysdev_class;
+               mutex_unlock(&cpufreq_governor_mutex);
+       }
+out:
+       return err;
+}
 
 
 /**
- * cpufreq_per_cpu_attr_read() / show_##file_name() - print out cpufreq information
+ * cpufreq_per_cpu_attr_read() / show_##file_name() -
+ * print out cpufreq information
  *
  * Write out information from cpufreq_driver->policy[cpu]; object must be
  * "unsigned int".
  */
 
-#define show_one(file_name, object)                                    \
-static ssize_t show_##file_name                                        \
-(struct cpufreq_policy * policy, char *buf)                            \
-{                                                                      \
-       return sprintf (buf, "%u\n", policy->object);                   \
+#define show_one(file_name, object)                    \
+static ssize_t show_##file_name                                \
+(struct cpufreq_policy *policy, char *buf)             \
+{                                                      \
+       return sprintf(buf, "%u\n", policy->object);    \
 }
 
 show_one(cpuinfo_min_freq, cpuinfo.min_freq);
 show_one(cpuinfo_max_freq, cpuinfo.max_freq);
+show_one(cpuinfo_transition_latency, cpuinfo.transition_latency);
 show_one(scaling_min_freq, min);
 show_one(scaling_max_freq, max);
 show_one(scaling_cur_freq, cur);
 
+static int __cpufreq_set_policy(struct cpufreq_policy *data,
+                               struct cpufreq_policy *policy);
+
 /**
  * cpufreq_per_cpu_attr_write() / store_##file_name() - sysfs write access
  */
 #define store_one(file_name, object)                   \
 static ssize_t store_##file_name                                       \
-(struct cpufreq_policy * policy, const char *buf, size_t count)                \
+(struct cpufreq_policy *policy, const char *buf, size_t count)         \
 {                                                                      \
        unsigned int ret = -EINVAL;                                     \
        struct cpufreq_policy new_policy;                               \
@@ -352,24 +474,26 @@ static ssize_t store_##file_name                                  \
        if (ret)                                                        \
                return -EINVAL;                                         \
                                                                        \
-       ret = sscanf (buf, "%u", &new_policy.object);                   \
+       ret = sscanf(buf, "%u", &new_policy.object);                    \
        if (ret != 1)                                                   \
                return -EINVAL;                                         \
                                                                        \
-       ret = cpufreq_set_policy(&new_policy);                          \
+       ret = __cpufreq_set_policy(policy, &new_policy);                \
+       policy->user_policy.object = policy->object;                    \
                                                                        \
        return ret ? ret : count;                                       \
 }
 
-store_one(scaling_min_freq,min);
-store_one(scaling_max_freq,max);
+store_one(scaling_min_freq, min);
+store_one(scaling_max_freq, max);
 
 /**
  * show_cpuinfo_cur_freq - current CPU frequency as detected by hardware
  */
-static ssize_t show_cpuinfo_cur_freq (struct cpufreq_policy * policy, char *buf)
+static ssize_t show_cpuinfo_cur_freq(struct cpufreq_policy *policy,
+                                       char *buf)
 {
-       unsigned int cur_freq = cpufreq_get(policy->cpu);
+       unsigned int cur_freq = __cpufreq_get(policy->cpu);
        if (!cur_freq)
                return sprintf(buf, "<unknown>");
        return sprintf(buf, "%u\n", cur_freq);
@@ -379,14 +503,15 @@ static ssize_t show_cpuinfo_cur_freq (struct cpufreq_policy * policy, char *buf)
 /**
  * show_scaling_governor - show the current policy for the specified CPU
  */
-static ssize_t show_scaling_governor (struct cpufreq_policy * policy, char *buf)
+static ssize_t show_scaling_governor(struct cpufreq_policy *policy, char *buf)
 {
-       if(policy->policy == CPUFREQ_POLICY_POWERSAVE)
+       if (policy->policy == CPUFREQ_POLICY_POWERSAVE)
                return sprintf(buf, "powersave\n");
        else if (policy->policy == CPUFREQ_POLICY_PERFORMANCE)
                return sprintf(buf, "performance\n");
        else if (policy->governor)
-               return scnprintf(buf, CPUFREQ_NAME_LEN, "%s\n", policy->governor->name);
+               return scnprintf(buf, CPUFREQ_NAME_LEN, "%s\n",
+                               policy->governor->name);
        return -EINVAL;
 }
 
@@ -394,8 +519,8 @@ static ssize_t show_scaling_governor (struct cpufreq_policy * policy, char *buf)
 /**
  * store_scaling_governor - store policy for the specified CPU
  */
-static ssize_t store_scaling_governor (struct cpufreq_policy * policy, 
-                                      const char *buf, size_t count) 
+static ssize_t store_scaling_governor(struct cpufreq_policy *policy,
+                                       const char *buf, size_t count)
 {
        unsigned int ret = -EINVAL;
        char    str_governor[16];
@@ -405,22 +530,31 @@ static ssize_t store_scaling_governor (struct cpufreq_policy * policy,
        if (ret)
                return ret;
 
-       ret = sscanf (buf, "%15s", str_governor);
+       ret = sscanf(buf, "%15s", str_governor);
        if (ret != 1)
                return -EINVAL;
 
-       if (cpufreq_parse_governor(str_governor, &new_policy.policy, &new_policy.governor))
+       if (cpufreq_parse_governor(str_governor, &new_policy.policy,
+                                               &new_policy.governor))
                return -EINVAL;
 
-       ret = cpufreq_set_policy(&new_policy);
+       /* Do not use cpufreq_set_policy here or the user_policy.max
+          will be wrongly overridden */
+       ret = __cpufreq_set_policy(policy, &new_policy);
 
-       return ret ? ret : count;
+       policy->user_policy.policy = policy->policy;
+       policy->user_policy.governor = policy->governor;
+
+       if (ret)
+               return ret;
+       else
+               return count;
 }
 
 /**
  * show_scaling_driver - show the cpufreq driver currently loaded
  */
-static ssize_t show_scaling_driver (struct cpufreq_policy * policy, char *buf)
+static ssize_t show_scaling_driver(struct cpufreq_policy *policy, char *buf)
 {
        return scnprintf(buf, CPUFREQ_NAME_LEN, "%s\n", cpufreq_driver->name);
 }
@@ -428,8 +562,8 @@ static ssize_t show_scaling_driver (struct cpufreq_policy * policy, char *buf)
 /**
  * show_scaling_available_governors - show the available CPUfreq governors
  */
-static ssize_t show_scaling_available_governors (struct cpufreq_policy * policy,
-                               char *buf)
+static ssize_t show_scaling_available_governors(struct cpufreq_policy *policy,
+                                               char *buf)
 {
        ssize_t i = 0;
        struct cpufreq_governor *t;
@@ -440,33 +574,76 @@ static ssize_t show_scaling_available_governors (struct cpufreq_policy * policy,
        }
 
        list_for_each_entry(t, &cpufreq_governor_list, governor_list) {
-               if (i >= (ssize_t) ((PAGE_SIZE / sizeof(char)) - (CPUFREQ_NAME_LEN + 2)))
+               if (i >= (ssize_t) ((PAGE_SIZE / sizeof(char))
+                   - (CPUFREQ_NAME_LEN + 2)))
                        goto out;
                i += scnprintf(&buf[i], CPUFREQ_NAME_LEN, "%s ", t->name);
        }
- out:
+out:
        i += sprintf(&buf[i], "\n");
        return i;
 }
-/**
- * show_affected_cpus - show the CPUs affected by each transition
- */
-static ssize_t show_affected_cpus (struct cpufreq_policy * policy, char *buf)
+
+static ssize_t show_cpus(const struct cpumask *mask, char *buf)
 {
        ssize_t i = 0;
        unsigned int cpu;
 
-       for_each_cpu_mask(cpu, policy->cpus) {
+       for_each_cpu(cpu, mask) {
                if (i)
                        i += scnprintf(&buf[i], (PAGE_SIZE - i - 2), " ");
                i += scnprintf(&buf[i], (PAGE_SIZE - i - 2), "%u", cpu);
                if (i >= (PAGE_SIZE - 5))
-                   break;
+                       break;
        }
        i += sprintf(&buf[i], "\n");
        return i;
 }
 
+/**
+ * show_related_cpus - show the CPUs affected by each transition even if
+ * hw coordination is in use
+ *
+ * Prints policy->related_cpus through show_cpus().  If that mask is empty,
+ * policy->cpus is printed instead.  Returns whatever show_cpus() returns.
+ */
+static ssize_t show_related_cpus(struct cpufreq_policy *policy, char *buf)
+{
+       if (cpumask_empty(policy->related_cpus))
+               return show_cpus(policy->cpus, buf);
+       return show_cpus(policy->related_cpus, buf);
+}
+
+/**
+ * show_affected_cpus - show the CPUs affected by each transition
+ *
+ * Prints policy->cpus through show_cpus() and returns its result.
+ */
+static ssize_t show_affected_cpus(struct cpufreq_policy *policy, char *buf)
+{
+       return show_cpus(policy->cpus, buf);
+}
+
+/**
+ * store_scaling_setspeed - sysfs write handler for scaling_setspeed
+ *
+ * Parses an unsigned number from @buf and hands it to the current
+ * governor's store_setspeed() callback.  Returns -EINVAL if the policy has
+ * no governor, the governor has no store_setspeed() callback, or no number
+ * could be parsed from @buf; otherwise returns @count.  Any result of the
+ * store_setspeed() callback is not examined here.
+ */
+static ssize_t store_scaling_setspeed(struct cpufreq_policy *policy,
+                                       const char *buf, size_t count)
+{
+       unsigned int freq = 0;
+       unsigned int ret;
+
+       if (!policy->governor || !policy->governor->store_setspeed)
+               return -EINVAL;
+
+       ret = sscanf(buf, "%u", &freq);
+       if (ret != 1)
+               return -EINVAL;
+
+       policy->governor->store_setspeed(policy, freq);
+
+       return count;
+}
+
+/**
+ * show_scaling_setspeed - sysfs read handler for scaling_setspeed
+ *
+ * If the policy has no governor, or the governor has no show_setspeed()
+ * callback, writes "<unsupported>\n" to @buf.  Otherwise returns the
+ * result of the governor's show_setspeed() callback.
+ */
+static ssize_t show_scaling_setspeed(struct cpufreq_policy *policy, char *buf)
+{
+       if (!policy->governor || !policy->governor->show_setspeed)
+               return sprintf(buf, "<unsupported>\n");
+
+       return policy->governor->show_setspeed(policy, buf);
+}
 
 #define define_one_ro(_name) \
 static struct freq_attr _name = \
@@ -483,59 +660,87 @@ __ATTR(_name, 0644, show_##_name, store_##_name)
 define_one_ro0400(cpuinfo_cur_freq);
 define_one_ro(cpuinfo_min_freq);
 define_one_ro(cpuinfo_max_freq);
+define_one_ro(cpuinfo_transition_latency);
 define_one_ro(scaling_available_governors);
 define_one_ro(scaling_driver);
 define_one_ro(scaling_cur_freq);
+define_one_ro(related_cpus);
 define_one_ro(affected_cpus);
 define_one_rw(scaling_min_freq);
 define_one_rw(scaling_max_freq);
 define_one_rw(scaling_governor);
+define_one_rw(scaling_setspeed);
 
-static struct attribute * default_attrs[] = {
+static struct attribute *default_attrs[] = {
        &cpuinfo_min_freq.attr,
        &cpuinfo_max_freq.attr,
+       &cpuinfo_transition_latency.attr,
        &scaling_min_freq.attr,
        &scaling_max_freq.attr,
        &affected_cpus.attr,
+       &related_cpus.attr,
        &scaling_governor.attr,
        &scaling_driver.attr,
        &scaling_available_governors.attr,
+       &scaling_setspeed.attr,
        NULL
 };
 
-#define to_policy(k) container_of(k,struct cpufreq_policy,kobj)
-#define to_attr(a) container_of(a,struct freq_attr,attr)
+#define to_policy(k) container_of(k, struct cpufreq_policy, kobj)
+#define to_attr(a) container_of(a, struct freq_attr, attr)
 
-static ssize_t show(struct kobject * kobj, struct attribute * attr ,char * buf)
+static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
 {
-       struct cpufreq_policy * policy = to_policy(kobj);
-       struct freq_attr * fattr = to_attr(attr);
-       ssize_t ret;
+       struct cpufreq_policy *policy = to_policy(kobj);
+       struct freq_attr *fattr = to_attr(attr);
+       ssize_t ret = -EINVAL;
        policy = cpufreq_cpu_get(policy->cpu);
        if (!policy)
-               return -EINVAL;
-       ret = fattr->show ? fattr->show(policy,buf) : -EIO;
+               goto no_policy;
+
+       if (lock_policy_rwsem_read(policy->cpu) < 0)
+               goto fail;
+
+       if (fattr->show)
+               ret = fattr->show(policy, buf);
+       else
+               ret = -EIO;
+
+       unlock_policy_rwsem_read(policy->cpu);
+fail:
        cpufreq_cpu_put(policy);
+no_policy:
        return ret;
 }
 
-static ssize_t store(struct kobject * kobj, struct attribute * attr, 
-                    const char * buf, size_t count)
+static ssize_t store(struct kobject *kobj, struct attribute *attr,
+                    const char *buf, size_t count)
 {
-       struct cpufreq_policy * policy = to_policy(kobj);
-       struct freq_attr * fattr = to_attr(attr);
-       ssize_t ret;
+       struct cpufreq_policy *policy = to_policy(kobj);
+       struct freq_attr *fattr = to_attr(attr);
+       ssize_t ret = -EINVAL;
        policy = cpufreq_cpu_get(policy->cpu);
        if (!policy)
-               return -EINVAL;
-       ret = fattr->store ? fattr->store(policy,buf,count) : -EIO;
+               goto no_policy;
+
+       if (lock_policy_rwsem_write(policy->cpu) < 0)
+               goto fail;
+
+       if (fattr->store)
+               ret = fattr->store(policy, buf, count);
+       else
+               ret = -EIO;
+
+       unlock_policy_rwsem_write(policy->cpu);
+fail:
        cpufreq_cpu_put(policy);
+no_policy:
        return ret;
 }
 
-static void cpufreq_sysfs_release(struct kobject * kobj)
+static void cpufreq_sysfs_release(struct kobject *kobj)
 {
-       struct cpufreq_policy * policy = to_policy(kobj);
+       struct cpufreq_policy *policy = to_policy(kobj);
        dprintk("last reference is dropped\n");
        complete(&policy->kobj_unregister);
 }
@@ -555,17 +760,24 @@ static struct kobj_type ktype_cpufreq = {
 /**
  * cpufreq_add_dev - add a CPU device
  *
- * Adds the cpufreq interface for a CPU device. 
+ * Adds the cpufreq interface for a CPU device.
  */
-static int cpufreq_add_dev (struct sys_device * sys_dev)
+static int cpufreq_add_dev(struct sys_device *sys_dev)
 {
        unsigned int cpu = sys_dev->id;
        int ret = 0;
        struct cpufreq_policy new_policy;
        struct cpufreq_policy *policy;
        struct freq_attr **drv_attr;
+       struct sys_device *cpu_sys_dev;
        unsigned long flags;
        unsigned int j;
+#ifdef CONFIG_SMP
+       struct cpufreq_policy *managed_policy;
+#endif
+
+       if (cpu_is_offline(cpu))
+               return 0;
 
        cpufreq_debug_disable_ratelimit();
        dprintk("adding CPU %u\n", cpu);
@@ -575,8 +787,7 @@ static int cpufreq_add_dev (struct sys_device * sys_dev)
         * CPU because it is in the same boat. */
        policy = cpufreq_cpu_get(cpu);
        if (unlikely(policy)) {
-               dprintk("CPU already managed, adding link\n");
-               sysfs_create_link(&sys_dev->kobj, &policy->kobj, "cpufreq");
+               cpufreq_cpu_put(policy);
                cpufreq_debug_enable_ratelimit();
                return 0;
        }
@@ -587,20 +798,35 @@ static int cpufreq_add_dev (struct sys_device * sys_dev)
                goto module_out;
        }
 
-       policy = kmalloc(sizeof(struct cpufreq_policy), GFP_KERNEL);
+       policy = kzalloc(sizeof(struct cpufreq_policy), GFP_KERNEL);
        if (!policy) {
                ret = -ENOMEM;
                goto nomem_out;
        }
-       memset(policy, 0, sizeof(struct cpufreq_policy));
+       if (!alloc_cpumask_var(&policy->cpus, GFP_KERNEL)) {
+               kfree(policy);
+               ret = -ENOMEM;
+               goto nomem_out;
+       }
+       if (!alloc_cpumask_var(&policy->related_cpus, GFP_KERNEL)) {
+               free_cpumask_var(policy->cpus);
+               kfree(policy);
+               ret = -ENOMEM;
+               goto nomem_out;
+       }
 
        policy->cpu = cpu;
-       policy->cpus = cpumask_of_cpu(cpu);
+       cpumask_copy(policy->cpus, cpumask_of(cpu));
+
+       /* Initially set CPU itself as the policy_cpu */
+       per_cpu(policy_cpu, cpu) = cpu;
+       lock_policy_rwsem_write(cpu);
 
-       init_MUTEX_LOCKED(&policy->lock);
        init_completion(&policy->kobj_unregister);
-       INIT_WORK(&policy->update, handle_update, (void *)(long)cpu);
+       INIT_WORK(&policy->update, handle_update);
 
+       /* Set governor before ->init, so that driver could check it */
+       policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
        /* call driver. From then on the cpufreq must be able
         * to accept all calls to ->verify and ->setpolicy for this CPU
         */
@@ -609,59 +835,137 @@ static int cpufreq_add_dev (struct sys_device * sys_dev)
                dprintk("initialization failed\n");
                goto err_out;
        }
+       policy->user_policy.min = policy->min;
+       policy->user_policy.max = policy->max;
+
+       blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
+                                    CPUFREQ_START, policy);
+
+#ifdef CONFIG_SMP
+
+#ifdef CONFIG_HOTPLUG_CPU
+       if (per_cpu(cpufreq_cpu_governor, cpu)) {
+               policy->governor = per_cpu(cpufreq_cpu_governor, cpu);
+               dprintk("Restoring governor %s for cpu %d\n",
+                      policy->governor->name, cpu);
+       }
+#endif
+
+       for_each_cpu(j, policy->cpus) {
+               if (cpu == j)
+                       continue;
+
+               /* Check for existing affected CPUs.
+                * They may not be aware of it due to CPU Hotplug.
+                */
+               managed_policy = cpufreq_cpu_get(j);            /* FIXME: Where is this released?  What about error paths? */
+               if (unlikely(managed_policy)) {
+
+                       /* Set proper policy_cpu */
+                       unlock_policy_rwsem_write(cpu);
+                       per_cpu(policy_cpu, cpu) = managed_policy->cpu;
+
+                       if (lock_policy_rwsem_write(cpu) < 0)
+                               goto err_out_driver_exit;
+
+                       spin_lock_irqsave(&cpufreq_driver_lock, flags);
+                       cpumask_copy(managed_policy->cpus, policy->cpus);
+                       per_cpu(cpufreq_cpu_data, cpu) = managed_policy;
+                       spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
+
+                       dprintk("CPU already managed, adding link\n");
+                       ret = sysfs_create_link(&sys_dev->kobj,
+                                               &managed_policy->kobj,
+                                               "cpufreq");
+                       if (ret)
+                               goto err_out_driver_exit;
 
+                       cpufreq_debug_enable_ratelimit();
+                       ret = 0;
+                       goto err_out_driver_exit; /* call driver->exit() */
+               }
+       }
+#endif
        memcpy(&new_policy, policy, sizeof(struct cpufreq_policy));
 
        /* prepare interface data */
-       policy->kobj.parent = &sys_dev->kobj;
-       policy->kobj.ktype = &ktype_cpufreq;
-       strlcpy(policy->kobj.name, "cpufreq", KOBJ_NAME_LEN);
-
-       ret = kobject_register(&policy->kobj);
+       ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq, &sys_dev->kobj,
+                                  "cpufreq");
        if (ret)
                goto err_out_driver_exit;
 
        /* set up files for this cpu device */
        drv_attr = cpufreq_driver->attr;
        while ((drv_attr) && (*drv_attr)) {
-               sysfs_create_file(&policy->kobj, &((*drv_attr)->attr));
+               ret = sysfs_create_file(&policy->kobj, &((*drv_attr)->attr));
+               if (ret)
+                       goto err_out_driver_exit;
                drv_attr++;
        }
-       if (cpufreq_driver->get)
-               sysfs_create_file(&policy->kobj, &cpuinfo_cur_freq.attr);
-       if (cpufreq_driver->target)
-               sysfs_create_file(&policy->kobj, &scaling_cur_freq.attr);
+       if (cpufreq_driver->get) {
+               ret = sysfs_create_file(&policy->kobj, &cpuinfo_cur_freq.attr);
+               if (ret)
+                       goto err_out_driver_exit;
+       }
+       if (cpufreq_driver->target) {
+               ret = sysfs_create_file(&policy->kobj, &scaling_cur_freq.attr);
+               if (ret)
+                       goto err_out_driver_exit;
+       }
 
        spin_lock_irqsave(&cpufreq_driver_lock, flags);
-       for_each_cpu_mask(j, policy->cpus)
-               cpufreq_cpu_data[j] = policy;
+       for_each_cpu(j, policy->cpus) {
+               per_cpu(cpufreq_cpu_data, j) = policy;
+               per_cpu(policy_cpu, j) = policy->cpu;
+       }
        spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
+
+       /* symlink affected CPUs */
+       for_each_cpu(j, policy->cpus) {
+               if (j == cpu)
+                       continue;
+               if (!cpu_online(j))
+                       continue;
+
+               dprintk("CPU %u already managed, adding link\n", j);
+               cpufreq_cpu_get(cpu);
+               cpu_sys_dev = get_cpu_sysdev(j);
+               ret = sysfs_create_link(&cpu_sys_dev->kobj, &policy->kobj,
+                                       "cpufreq");
+               if (ret)
+                       goto err_out_unregister;
+       }
+
        policy->governor = NULL; /* to assure that the starting sequence is
                                  * run in cpufreq_set_policy */
-       up(&policy->lock);
-       
+
        /* set default policy */
-       
-       ret = cpufreq_set_policy(&new_policy);
+       ret = __cpufreq_set_policy(policy, &new_policy);
+       policy->user_policy.policy = policy->policy;
+       policy->user_policy.governor = policy->governor;
+
        if (ret) {
                dprintk("setting policy failed\n");
                goto err_out_unregister;
        }
 
+       unlock_policy_rwsem_write(cpu);
+
+       kobject_uevent(&policy->kobj, KOBJ_ADD);
        module_put(cpufreq_driver->owner);
        dprintk("initialization complete\n");
        cpufreq_debug_enable_ratelimit();
-       
+
        return 0;
 
 
 err_out_unregister:
        spin_lock_irqsave(&cpufreq_driver_lock, flags);
-       for_each_cpu_mask(j, policy->cpus)
-               cpufreq_cpu_data[j] = NULL;
+       for_each_cpu(j, policy->cpus)
+               per_cpu(cpufreq_cpu_data, j) = NULL;
        spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
 
-       kobject_unregister(&policy->kobj);
+       kobject_put(&policy->kobj);
        wait_for_completion(&policy->kobj_unregister);
 
 err_out_driver_exit:
@@ -669,28 +973,31 @@ err_out_driver_exit:
                cpufreq_driver->exit(policy);
 
 err_out:
+       unlock_policy_rwsem_write(cpu);
        kfree(policy);
 
 nomem_out:
        module_put(cpufreq_driver->owner);
- module_out:
+module_out:
        cpufreq_debug_enable_ratelimit();
        return ret;
 }
 
 
 /**
- * cpufreq_remove_dev - remove a CPU device
+ * __cpufreq_remove_dev - remove a CPU device
  *
  * Removes the cpufreq interface for a CPU device.
+ * Caller should already have policy_rwsem in write mode for this CPU.
+ * This routine releases (unlocks) the rwsem before returning.
  */
-static int cpufreq_remove_dev (struct sys_device * sys_dev)
+static int __cpufreq_remove_dev(struct sys_device *sys_dev)
 {
        unsigned int cpu = sys_dev->id;
        unsigned long flags;
        struct cpufreq_policy *data;
-       struct sys_device *cpu_sys_dev;
 #ifdef CONFIG_SMP
+       struct sys_device *cpu_sys_dev;
        unsigned int j;
 #endif
 
@@ -698,58 +1005,62 @@ static int cpufreq_remove_dev (struct sys_device * sys_dev)
        dprintk("unregistering CPU %u\n", cpu);
 
        spin_lock_irqsave(&cpufreq_driver_lock, flags);
-       data = cpufreq_cpu_data[cpu];
+       data = per_cpu(cpufreq_cpu_data, cpu);
 
        if (!data) {
                spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
                cpufreq_debug_enable_ratelimit();
+               unlock_policy_rwsem_write(cpu);
                return -EINVAL;
        }
-       cpufreq_cpu_data[cpu] = NULL;
+       per_cpu(cpufreq_cpu_data, cpu) = NULL;
 
 
 #ifdef CONFIG_SMP
        /* if this isn't the CPU which is the parent of the kobj, we
-        * only need to unlink, put and exit 
+        * only need to unlink, put and exit
         */
        if (unlikely(cpu != data->cpu)) {
                dprintk("removing link\n");
+               cpumask_clear_cpu(cpu, data->cpus);
                spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
                sysfs_remove_link(&sys_dev->kobj, "cpufreq");
                cpufreq_cpu_put(data);
                cpufreq_debug_enable_ratelimit();
+               unlock_policy_rwsem_write(cpu);
                return 0;
        }
 #endif
 
+#ifdef CONFIG_SMP
 
-       if (!kobject_get(&data->kobj)) {
-               spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
-               cpufreq_debug_enable_ratelimit();
-               return -EFAULT;
-       }
+#ifdef CONFIG_HOTPLUG_CPU
+       per_cpu(cpufreq_cpu_governor, cpu) = data->governor;
+#endif
 
-#ifdef CONFIG_SMP
        /* if we have other CPUs still registered, we need to unlink them,
         * or else wait_for_completion below will lock up. Clean the
-        * cpufreq_cpu_data[] while holding the lock, and remove the sysfs
-        * links afterwards.
+        * per_cpu(cpufreq_cpu_data) while holding the lock, and remove
+        * the sysfs links afterwards.
         */
-       if (unlikely(cpus_weight(data->cpus) > 1)) {
-               for_each_cpu_mask(j, data->cpus) {
+       if (unlikely(cpumask_weight(data->cpus) > 1)) {
+               for_each_cpu(j, data->cpus) {
                        if (j == cpu)
                                continue;
-                       cpufreq_cpu_data[j] = NULL;
+                       per_cpu(cpufreq_cpu_data, j) = NULL;
                }
        }
 
        spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
 
-       if (unlikely(cpus_weight(data->cpus) > 1)) {
-               for_each_cpu_mask(j, data->cpus) {
+       if (unlikely(cpumask_weight(data->cpus) > 1)) {
+               for_each_cpu(j, data->cpus) {
                        if (j == cpu)
                                continue;
                        dprintk("removing link for cpu %u\n", j);
+#ifdef CONFIG_HOTPLUG_CPU
+                       per_cpu(cpufreq_cpu_governor, j) = data->governor;
+#endif
                        cpu_sys_dev = get_cpu_sysdev(j);
                        sysfs_remove_link(&cpu_sys_dev->kobj, "cpufreq");
                        cpufreq_cpu_put(data);
@@ -759,18 +1070,15 @@ static int cpufreq_remove_dev (struct sys_device * sys_dev)
        spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
 #endif
 
-       down(&data->lock);
+       unlock_policy_rwsem_write(cpu);
+
        if (cpufreq_driver->target)
                __cpufreq_governor(data, CPUFREQ_GOV_STOP);
-       cpufreq_driver->target = NULL;
-       up(&data->lock);
-
-       kobject_unregister(&data->kobj);
 
        kobject_put(&data->kobj);
 
        /* we need to make sure that the underlying kobj is actually
-        * not referenced anymore by anybody before we proceed with 
+        * not referenced anymore by anybody before we proceed with
         * unloading.
         */
        dprintk("waiting for dropping of refcount\n");
@@ -780,17 +1088,37 @@ static int cpufreq_remove_dev (struct sys_device * sys_dev)
        if (cpufreq_driver->exit)
                cpufreq_driver->exit(data);
 
+       free_cpumask_var(data->related_cpus);
+       free_cpumask_var(data->cpus);
        kfree(data);
+       per_cpu(cpufreq_cpu_data, cpu) = NULL;
 
        cpufreq_debug_enable_ratelimit();
-
        return 0;
 }
 
 
-static void handle_update(void *data)
+static int cpufreq_remove_dev(struct sys_device *sys_dev)
 {
-       unsigned int cpu = (unsigned int)(long)data;
+       unsigned int cpu = sys_dev->id;
+       int retval;
+
+       if (cpu_is_offline(cpu))
+               return 0;
+
+       if (unlikely(lock_policy_rwsem_write(cpu)))
+               BUG();
+
+       retval = __cpufreq_remove_dev(sys_dev);
+       return retval;
+}
+
+
+static void handle_update(struct work_struct *work)
+{
+       struct cpufreq_policy *policy =
+               container_of(work, struct cpufreq_policy, update);
+       unsigned int cpu = policy->cpu;
        dprintk("handle_update for cpu %u called\n", cpu);
        cpufreq_update_policy(cpu);
 }
@@ -801,14 +1129,15 @@ static void handle_update(void *data)
  *     @old_freq: CPU frequency the kernel thinks the CPU runs at
  *     @new_freq: CPU frequency the CPU actually runs at
  *
- *     We adjust to current frequency first, and need to clean up later. So either call
- *     to cpufreq_update_policy() or schedule handle_update()).
+ *     We adjust to current frequency first, and need to clean up later.
+ *     So either call cpufreq_update_policy() or schedule handle_update().
  */
-static void cpufreq_out_of_sync(unsigned int cpu, unsigned int old_freq, unsigned int new_freq)
+static void cpufreq_out_of_sync(unsigned int cpu, unsigned int old_freq,
+                               unsigned int new_freq)
 {
        struct cpufreq_freqs freqs;
 
-       dprintk(KERN_WARNING "Warning: CPU frequency out of sync: cpufreq and timing "
+       dprintk("Warning: CPU frequency out of sync: cpufreq and timing "
               "core thinks of %u, is %u kHz.\n", old_freq, new_freq);
 
        freqs.cpu = cpu;
@@ -819,42 +1148,76 @@ static void cpufreq_out_of_sync(unsigned int cpu, unsigned int old_freq, unsigne
 }
 
 
-/** 
- * cpufreq_get - get the current CPU frequency (in kHz)
+/**
+ * cpufreq_quick_get - get the CPU frequency (in kHz) from policy->cur
  * @cpu: CPU number
  *
- * Get the CPU current (static) CPU frequency
+ * This is the last known freq, without actually getting it from the driver.
+ * The return value is the same as what scaling_cur_freq shows in sysfs.
  */
-unsigned int cpufreq_get(unsigned int cpu)
+unsigned int cpufreq_quick_get(unsigned int cpu)
 {
        struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
-       unsigned int ret = 0;
+       unsigned int ret_freq = 0;
 
-       if (!policy)
-               return 0;
+       if (policy) {
+               ret_freq = policy->cur;
+               cpufreq_cpu_put(policy);
+       }
 
-       if (!cpufreq_driver->get)
-               goto out;
+       return ret_freq;
+}
+EXPORT_SYMBOL(cpufreq_quick_get);
 
-       down(&policy->lock);
 
-       ret = cpufreq_driver->get(cpu);
+static unsigned int __cpufreq_get(unsigned int cpu)
+{
+       struct cpufreq_policy *policy = per_cpu(cpufreq_cpu_data, cpu);
+       unsigned int ret_freq = 0;
 
-       if (ret && policy->cur && !(cpufreq_driver->flags & CPUFREQ_CONST_LOOPS)) 
-       {
-               /* verify no discrepancy between actual and saved value exists */
-               if (unlikely(ret != policy->cur)) {
-                       cpufreq_out_of_sync(cpu, policy->cur, ret);
+       if (!cpufreq_driver->get)
+               return ret_freq;
+
+       ret_freq = cpufreq_driver->get(cpu);
+
+       if (ret_freq && policy->cur &&
+               !(cpufreq_driver->flags & CPUFREQ_CONST_LOOPS)) {
+               /* verify no discrepancy between actual and
+                                       saved value exists */
+               if (unlikely(ret_freq != policy->cur)) {
+                       cpufreq_out_of_sync(cpu, policy->cur, ret_freq);
                        schedule_work(&policy->update);
                }
        }
 
-       up(&policy->lock);
+       return ret_freq;
+}
 
- out:
-       cpufreq_cpu_put(policy);
+/**
+ * cpufreq_get - get the current CPU frequency (in kHz)
+ * @cpu: CPU number
+ *
+ * Get the current (static) frequency of the CPU
+ */
+unsigned int cpufreq_get(unsigned int cpu)
+{
+       unsigned int ret_freq = 0;
+       struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
+
+       if (!policy)
+               goto out;
+
+       if (unlikely(lock_policy_rwsem_read(cpu)))
+               goto out_policy;
+
+       ret_freq = __cpufreq_get(cpu);
+
+       unlock_policy_rwsem_read(cpu);
 
-       return (ret);
+out_policy:
+       cpufreq_cpu_put(policy);
+out:
+       return ret_freq;
 }
 EXPORT_SYMBOL(cpufreq_get);
 
@@ -863,14 +1226,14 @@ EXPORT_SYMBOL(cpufreq_get);
  *     cpufreq_suspend - let the low level driver prepare for suspend
  */
 
-static int cpufreq_suspend(struct sys_device * sysdev, pm_message_t pmsg)
+static int cpufreq_suspend(struct sys_device *sysdev, pm_message_t pmsg)
 {
        int cpu = sysdev->id;
-       unsigned int ret = 0;
+       int ret = 0;
        unsigned int cur_freq = 0;
        struct cpufreq_policy *cpu_policy;
 
-       dprintk("resuming cpu %u\n", cpu);
+       dprintk("suspending cpu %u\n", cpu);
 
        if (!cpu_online(cpu))
                return 0;
@@ -885,22 +1248,18 @@ static int cpufreq_suspend(struct sys_device * sysdev, pm_message_t pmsg)
                return -EINVAL;
 
        /* only handle each CPU group once */
-       if (unlikely(cpu_policy->cpu != cpu)) {
-               cpufreq_cpu_put(cpu_policy);
-               return 0;
-       }
+       if (unlikely(cpu_policy->cpu != cpu))
+               goto out;
 
        if (cpufreq_driver->suspend) {
                ret = cpufreq_driver->suspend(cpu_policy, pmsg);
                if (ret) {
                        printk(KERN_ERR "cpufreq: suspend failed in ->suspend "
                                        "step on CPU %u\n", cpu_policy->cpu);
-                       cpufreq_cpu_put(cpu_policy);
-                       return ret;
+                       goto out;
                }
        }
 
-
        if (cpufreq_driver->flags & CPUFREQ_CONST_LOOPS)
                goto out;
 
@@ -917,7 +1276,7 @@ static int cpufreq_suspend(struct sys_device * sysdev, pm_message_t pmsg)
                struct cpufreq_freqs freqs;
 
                if (!(cpufreq_driver->flags & CPUFREQ_PM_NO_WARN))
-                       dprintk(KERN_DEBUG "Warning: CPU frequency is %u, "
+                       dprintk("Warning: CPU frequency is %u, "
                               "cpufreq assumed %u kHz.\n",
                               cur_freq, cpu_policy->cur);
 
@@ -925,16 +1284,16 @@ static int cpufreq_suspend(struct sys_device * sysdev, pm_message_t pmsg)
                freqs.old = cpu_policy->cur;
                freqs.new = cur_freq;
 
-               notifier_call_chain(&cpufreq_transition_notifier_list,
+               srcu_notifier_call_chain(&cpufreq_transition_notifier_list,
                                    CPUFREQ_SUSPENDCHANGE, &freqs);
                adjust_jiffies(CPUFREQ_SUSPENDCHANGE, &freqs);
 
                cpu_policy->cur = cur_freq;
        }
 
- out:
+out:
        cpufreq_cpu_put(cpu_policy);
-       return 0;
+       return ret;
 }
 
 /**
@@ -945,10 +1304,10 @@ static int cpufreq_suspend(struct sys_device * sysdev, pm_message_t pmsg)
  *     3.) schedule call cpufreq_update_policy() ASAP as interrupts are
  *         restored.
  */
-static int cpufreq_resume(struct sys_device * sysdev)
+static int cpufreq_resume(struct sys_device *sysdev)
 {
        int cpu = sysdev->id;
-       unsigned int ret = 0;
+       int ret = 0;
        struct cpufreq_policy *cpu_policy;
 
        dprintk("resuming cpu %u\n", cpu);
@@ -966,18 +1325,15 @@ static int cpufreq_resume(struct sys_device * sysdev)
                return -EINVAL;
 
        /* only handle each CPU group once */
-       if (unlikely(cpu_policy->cpu != cpu)) {
-               cpufreq_cpu_put(cpu_policy);
-               return 0;
-       }
+       if (unlikely(cpu_policy->cpu != cpu))
+               goto fail;
 
        if (cpufreq_driver->resume) {
                ret = cpufreq_driver->resume(cpu_policy);
                if (ret) {
                        printk(KERN_ERR "cpufreq: resume failed in ->resume "
                                        "step on CPU %u\n", cpu_policy->cpu);
-                       cpufreq_cpu_put(cpu_policy);
-                       return ret;
+                       goto fail;
                }
        }
 
@@ -998,7 +1354,7 @@ static int cpufreq_resume(struct sys_device * sysdev)
                        struct cpufreq_freqs freqs;
 
                        if (!(cpufreq_driver->flags & CPUFREQ_PM_NO_WARN))
-                               dprintk(KERN_WARNING "Warning: CPU frequency"
+                               dprintk("Warning: CPU frequency "
                                       "is %u, cpufreq assumed %u kHz.\n",
                                       cur_freq, cpu_policy->cur);
 
@@ -1006,7 +1362,8 @@ static int cpufreq_resume(struct sys_device * sysdev)
                        freqs.old = cpu_policy->cur;
                        freqs.new = cur_freq;
 
-                       notifier_call_chain(&cpufreq_transition_notifier_list,
+                       srcu_notifier_call_chain(
+                                       &cpufreq_transition_notifier_list,
                                        CPUFREQ_RESUMECHANGE, &freqs);
                        adjust_jiffies(CPUFREQ_RESUMECHANGE, &freqs);
 
@@ -1016,6 +1373,7 @@ static int cpufreq_resume(struct sys_device * sysdev)
 
 out:
        schedule_work(&cpu_policy->update);
+fail:
        cpufreq_cpu_put(cpu_policy);
        return ret;
 }
@@ -1037,30 +1395,32 @@ static struct sysdev_driver cpufreq_sysdev_driver = {
  *     @nb: notifier function to register
  *      @list: CPUFREQ_TRANSITION_NOTIFIER or CPUFREQ_POLICY_NOTIFIER
  *
- *     Add a driver to one of two lists: either a list of drivers that 
+ *     Add a driver to one of two lists: either a list of drivers that
  *      are notified about clock rate changes (once before and once after
  *      the transition), or a list of drivers that are notified about
  *      changes in cpufreq policy.
  *
  *     This function may sleep, and has the same return conditions as
- *     notifier_chain_register.
+ *     blocking_notifier_chain_register.
  */
 int cpufreq_register_notifier(struct notifier_block *nb, unsigned int list)
 {
        int ret;
 
-       down_write(&cpufreq_notifier_rwsem);
+       WARN_ON(!init_cpufreq_transition_notifier_list_called);
+
        switch (list) {
        case CPUFREQ_TRANSITION_NOTIFIER:
-               ret = notifier_chain_register(&cpufreq_transition_notifier_list, nb);
+               ret = srcu_notifier_chain_register(
+                               &cpufreq_transition_notifier_list, nb);
                break;
        case CPUFREQ_POLICY_NOTIFIER:
-               ret = notifier_chain_register(&cpufreq_policy_notifier_list, nb);
+               ret = blocking_notifier_chain_register(
+                               &cpufreq_policy_notifier_list, nb);
                break;
        default:
                ret = -EINVAL;
        }
-       up_write(&cpufreq_notifier_rwsem);
 
        return ret;
 }
@@ -1075,24 +1435,24 @@ EXPORT_SYMBOL(cpufreq_register_notifier);
  *     Remove a driver from the CPU frequency notifier list.
  *
  *     This function may sleep, and has the same return conditions as
- *     notifier_chain_unregister.
+ *     blocking_notifier_chain_unregister.
  */
 int cpufreq_unregister_notifier(struct notifier_block *nb, unsigned int list)
 {
        int ret;
 
-       down_write(&cpufreq_notifier_rwsem);
        switch (list) {
        case CPUFREQ_TRANSITION_NOTIFIER:
-               ret = notifier_chain_unregister(&cpufreq_transition_notifier_list, nb);
+               ret = srcu_notifier_chain_unregister(
+                               &cpufreq_transition_notifier_list, nb);
                break;
        case CPUFREQ_POLICY_NOTIFIER:
-               ret = notifier_chain_unregister(&cpufreq_policy_notifier_list, nb);
+               ret = blocking_notifier_chain_unregister(
+                               &cpufreq_policy_notifier_list, nb);
                break;
        default:
                ret = -EINVAL;
        }
-       up_write(&cpufreq_notifier_rwsem);
 
        return ret;
 }
@@ -1109,99 +1469,125 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy,
                            unsigned int relation)
 {
        int retval = -EINVAL;
-       lock_cpu_hotplug();
+
        dprintk("target for CPU %u: %u kHz, relation %u\n", policy->cpu,
                target_freq, relation);
        if (cpu_online(policy->cpu) && cpufreq_driver->target)
                retval = cpufreq_driver->target(policy, target_freq, relation);
-       unlock_cpu_hotplug();
+
        return retval;
 }
 EXPORT_SYMBOL_GPL(__cpufreq_driver_target);
 
-
 int cpufreq_driver_target(struct cpufreq_policy *policy,
                          unsigned int target_freq,
                          unsigned int relation)
 {
-       int ret;
+       int ret = -EINVAL;
 
        policy = cpufreq_cpu_get(policy->cpu);
        if (!policy)
-               return -EINVAL;
+               goto no_policy;
 
-       down(&policy->lock);
+       if (unlikely(lock_policy_rwsem_write(policy->cpu)))
+               goto fail;
 
        ret = __cpufreq_driver_target(policy, target_freq, relation);
 
-       up(&policy->lock);
+       unlock_policy_rwsem_write(policy->cpu);
 
+fail:
        cpufreq_cpu_put(policy);
-
+no_policy:
        return ret;
 }
 EXPORT_SYMBOL_GPL(cpufreq_driver_target);
 
-
-static int __cpufreq_governor(struct cpufreq_policy *policy, unsigned int event)
+int __cpufreq_driver_getavg(struct cpufreq_policy *policy, unsigned int cpu)
 {
-       int ret;
+       int ret = 0;
 
-       if (!try_module_get(policy->governor->owner))
+       policy = cpufreq_cpu_get(policy->cpu);
+       if (!policy)
                return -EINVAL;
 
-       dprintk("__cpufreq_governor for CPU %u, event %u\n", policy->cpu, event);
-       ret = policy->governor->governor(policy, event);
-
-       /* we keep one module reference alive for each CPU governed by this CPU */
-       if ((event != CPUFREQ_GOV_START) || ret)
-               module_put(policy->governor->owner);
-       if ((event == CPUFREQ_GOV_STOP) && !ret)
-               module_put(policy->governor->owner);
+       if (cpu_online(cpu) && cpufreq_driver->getavg)
+               ret = cpufreq_driver->getavg(policy, cpu);
 
+       cpufreq_cpu_put(policy);
        return ret;
 }
+EXPORT_SYMBOL_GPL(__cpufreq_driver_getavg);
 
+/*
+ * when "event" is CPUFREQ_GOV_LIMITS
+ */
 
-int cpufreq_governor(unsigned int cpu, unsigned int event)
+static int __cpufreq_governor(struct cpufreq_policy *policy,
+                                       unsigned int event)
 {
-       int ret = 0;
-       struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
+       int ret;
 
-       if (!policy)
+       /* A fallback governor only needs to be defined when the default
+          governor is known to have latency restrictions, e.g. conservative
+          or ondemand. Kconfig already ensures that this is the case.
+       */
+#ifdef CONFIG_CPU_FREQ_GOV_PERFORMANCE
+       struct cpufreq_governor *gov = &cpufreq_gov_performance;
+#else
+       struct cpufreq_governor *gov = NULL;
+#endif
+
+       if (policy->governor->max_transition_latency &&
+           policy->cpuinfo.transition_latency >
+           policy->governor->max_transition_latency) {
+               if (!gov)
+                       return -EINVAL;
+               else {
+                       printk(KERN_WARNING "%s governor failed, too long"
+                              " transition latency of HW, fallback"
+                              " to %s governor\n",
+                              policy->governor->name,
+                              gov->name);
+                       policy->governor = gov;
+               }
+       }
+
+       if (!try_module_get(policy->governor->owner))
                return -EINVAL;
 
-       down(&policy->lock);
-       ret = __cpufreq_governor(policy, event);
-       up(&policy->lock);
+       dprintk("__cpufreq_governor for CPU %u, event %u\n",
+                                               policy->cpu, event);
+       ret = policy->governor->governor(policy, event);
 
-       cpufreq_cpu_put(policy);
+       /* we keep one module reference alive for
+                       each CPU governed by this CPU */
+       if ((event != CPUFREQ_GOV_START) || ret)
+               module_put(policy->governor->owner);
+       if ((event == CPUFREQ_GOV_STOP) && !ret)
+               module_put(policy->governor->owner);
 
        return ret;
 }
-EXPORT_SYMBOL_GPL(cpufreq_governor);
 
 
 int cpufreq_register_governor(struct cpufreq_governor *governor)
 {
-       struct cpufreq_governor *t;
+       int err;
 
        if (!governor)
                return -EINVAL;
 
-       down(&cpufreq_governor_sem);
-       
-       list_for_each_entry(t, &cpufreq_governor_list, governor_list) {
-               if (!strnicmp(governor->name,t->name,CPUFREQ_NAME_LEN)) {
-                       up(&cpufreq_governor_sem);
-                       return -EBUSY;
-               }
-       }
-       list_add(&governor->governor_list, &cpufreq_governor_list);
+       mutex_lock(&cpufreq_governor_mutex);
 
-       up(&cpufreq_governor_sem);
+       err = -EBUSY;
+       if (__find_governor(governor->name) == NULL) {
+               err = 0;
+               list_add(&governor->governor_list, &cpufreq_governor_list);
+       }
 
-       return 0;
+       mutex_unlock(&cpufreq_governor_mutex);
+       return err;
 }
 EXPORT_SYMBOL_GPL(cpufreq_register_governor);
 
@@ -1211,9 +1597,9 @@ void cpufreq_unregister_governor(struct cpufreq_governor *governor)
        if (!governor)
                return;
 
-       down(&cpufreq_governor_sem);
+       mutex_lock(&cpufreq_governor_mutex);
        list_del(&governor->governor_list);
-       up(&cpufreq_governor_sem);
+       mutex_unlock(&cpufreq_governor_mutex);
        return;
 }
 EXPORT_SYMBOL_GPL(cpufreq_unregister_governor);
@@ -1226,7 +1612,8 @@ EXPORT_SYMBOL_GPL(cpufreq_unregister_governor);
 
 /**
  * cpufreq_get_policy - get the current cpufreq_policy
- * @policy: struct cpufreq_policy into which the current cpufreq_policy is written
+ * @policy: struct cpufreq_policy into which the current cpufreq_policy
+ *     is written
  *
  * Reads the current cpufreq policy.
  */
@@ -1240,18 +1627,20 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu)
        if (!cpu_policy)
                return -EINVAL;
 
-       down(&cpu_policy->lock);
        memcpy(policy, cpu_policy, sizeof(struct cpufreq_policy));
-       up(&cpu_policy->lock);
 
        cpufreq_cpu_put(cpu_policy);
-
        return 0;
 }
 EXPORT_SYMBOL(cpufreq_get_policy);
 
 
-static int __cpufreq_set_policy(struct cpufreq_policy *data, struct cpufreq_policy *policy)
+/*
+ * data   : current policy.
+ * policy : policy to be set.
+ */
+static int __cpufreq_set_policy(struct cpufreq_policy *data,
+                               struct cpufreq_policy *policy)
 {
        int ret = 0;
 
@@ -1259,43 +1648,42 @@ static int __cpufreq_set_policy(struct cpufreq_policy *data, struct cpufreq_poli
        dprintk("setting new policy for CPU %u: %u - %u kHz\n", policy->cpu,
                policy->min, policy->max);
 
-       memcpy(&policy->cpuinfo, 
-              &data->cpuinfo, 
-              sizeof(struct cpufreq_cpuinfo));
+       memcpy(&policy->cpuinfo, &data->cpuinfo,
+                               sizeof(struct cpufreq_cpuinfo));
+
+       if (policy->min > data->max || policy->max < data->min) {
+               ret = -EINVAL;
+               goto error_out;
+       }
 
        /* verify the cpu speed can be set within this limit */
        ret = cpufreq_driver->verify(policy);
        if (ret)
                goto error_out;
 
-       down_read(&cpufreq_notifier_rwsem);
-
        /* adjust if necessary - all reasons */
-       notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_ADJUST,
-                           policy);
+       blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
+                       CPUFREQ_ADJUST, policy);
 
        /* adjust if necessary - hardware incompatibility*/
-       notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_INCOMPATIBLE,
-                           policy);
+       blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
+                       CPUFREQ_INCOMPATIBLE, policy);
 
        /* verify the cpu speed can be set within this limit,
           which might be different to the first one */
        ret = cpufreq_driver->verify(policy);
-       if (ret) {
-               up_read(&cpufreq_notifier_rwsem);
+       if (ret)
                goto error_out;
-       }
 
        /* notification of the new policy */
-       notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_NOTIFY,
-                           policy);
-
-       up_read(&cpufreq_notifier_rwsem);
+       blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
+                       CPUFREQ_NOTIFY, policy);
 
-       data->min    = policy->min;
-       data->max    = policy->max;
+       data->min = policy->min;
+       data->max = policy->max;
 
-       dprintk("new min and max freqs are %u - %u kHz\n", data->min, data->max);
+       dprintk("new min and max freqs are %u - %u kHz\n",
+                                       data->min, data->max);
 
        if (cpufreq_driver->setpolicy) {
                data->policy = policy->policy;
@@ -1316,10 +1704,12 @@ static int __cpufreq_set_policy(struct cpufreq_policy *data, struct cpufreq_poli
                        data->governor = policy->governor;
                        if (__cpufreq_governor(data, CPUFREQ_GOV_START)) {
                                /* new governor failed, so re-start old one */
-                               dprintk("starting governor %s failed\n", data->governor->name);
+                               dprintk("starting governor %s failed\n",
+                                                       data->governor->name);
                                if (old_gov) {
                                        data->governor = old_gov;
-                                       __cpufreq_governor(data, CPUFREQ_GOV_START);
+                                       __cpufreq_governor(data,
+                                                          CPUFREQ_GOV_START);
                                }
                                ret = -EINVAL;
                                goto error_out;
@@ -1330,47 +1720,12 @@ static int __cpufreq_set_policy(struct cpufreq_policy *data, struct cpufreq_poli
                __cpufreq_governor(data, CPUFREQ_GOV_LIMITS);
        }
 
- error_out:
+error_out:
        cpufreq_debug_enable_ratelimit();
        return ret;
 }
 
 /**
- *     cpufreq_set_policy - set a new CPUFreq policy
- *     @policy: policy to be set.
- *
- *     Sets a new CPU frequency and voltage scaling policy.
- */
-int cpufreq_set_policy(struct cpufreq_policy *policy)
-{
-       int ret = 0;
-       struct cpufreq_policy *data;
-
-       if (!policy)
-               return -EINVAL;
-
-       data = cpufreq_cpu_get(policy->cpu);
-       if (!data)
-               return -EINVAL;
-
-       /* lock this CPU */
-       down(&data->lock);
-
-       ret = __cpufreq_set_policy(data, policy);
-       data->user_policy.min = data->min;
-       data->user_policy.max = data->max;
-       data->user_policy.policy = data->policy;
-       data->user_policy.governor = data->governor;
-
-       up(&data->lock);
-       cpufreq_cpu_put(data);
-
-       return ret;
-}
-EXPORT_SYMBOL(cpufreq_set_policy);
-
-
-/**
  *     cpufreq_update_policy - re-evaluate an existing cpufreq policy
  *     @cpu: CPU which shall be re-evaluated
  *
@@ -1381,31 +1736,83 @@ int cpufreq_update_policy(unsigned int cpu)
 {
        struct cpufreq_policy *data = cpufreq_cpu_get(cpu);
        struct cpufreq_policy policy;
-       int ret = 0;
+       int ret;
 
-       if (!data)
-               return -ENODEV;
+       if (!data) {
+               ret = -ENODEV;
+               goto no_policy;
+       }
 
-       down(&data->lock);
+       if (unlikely(lock_policy_rwsem_write(cpu))) {
+               ret = -EINVAL;
+               goto fail;
+       }
 
        dprintk("updating policy for CPU %u\n", cpu);
-       memcpy(&policy, 
-              data,
-              sizeof(struct cpufreq_policy));
+       memcpy(&policy, data, sizeof(struct cpufreq_policy));
        policy.min = data->user_policy.min;
        policy.max = data->user_policy.max;
        policy.policy = data->user_policy.policy;
        policy.governor = data->user_policy.governor;
 
+       /* BIOS might change freq behind our back
+         -> ask driver for current freq and notify governors about a change */
+       if (cpufreq_driver->get) {
+               policy.cur = cpufreq_driver->get(cpu);
+               if (!data->cur) {
+                       dprintk("Driver did not initialize current freq");
+                       data->cur = policy.cur;
+               } else {
+                       if (data->cur != policy.cur)
+                               cpufreq_out_of_sync(cpu, data->cur,
+                                                               policy.cur);
+               }
+       }
+
        ret = __cpufreq_set_policy(data, &policy);
 
-       up(&data->lock);
+       unlock_policy_rwsem_write(cpu);
 
+fail:
        cpufreq_cpu_put(data);
+no_policy:
        return ret;
 }
 EXPORT_SYMBOL(cpufreq_update_policy);
 
+/* CPU hotplug notifier: adds or removes the cpufreq device of the CPU in @hcpu. */
+static int __cpuinit cpufreq_cpu_callback(struct notifier_block *nfb,
+                                       unsigned long action, void *hcpu)
+{
+       unsigned int cpu = (unsigned long)hcpu;
+       struct sys_device *sys_dev = get_cpu_sysdev(cpu);
+
+       /* no sysdev for this CPU: nothing to add or remove */
+       if (!sys_dev)
+               return NOTIFY_OK;
+       switch (action) {
+       case CPU_ONLINE:
+       case CPU_ONLINE_FROZEN:
+       case CPU_DOWN_FAILED:
+       case CPU_DOWN_FAILED_FROZEN:
+               /* both "online" and "down failed" (re-)add the device */
+               cpufreq_add_dev(sys_dev);
+               break;
+       case CPU_DOWN_PREPARE:
+       case CPU_DOWN_PREPARE_FROZEN:
+               /* write-locked here, never unlocked here (callee's job? confirm) */
+               if (unlikely(lock_policy_rwsem_write(cpu)))
+                       BUG();
+               __cpufreq_remove_dev(sys_dev);
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+/* Notifier block whose handler is cpufreq_cpu_callback(). */
+static struct notifier_block __refdata cpufreq_cpu_notifier = {
+       .notifier_call = cpufreq_cpu_callback,
+};
 
 /*********************************************************************
  *               REGISTER / UNREGISTER CPUFREQ DRIVER                *
@@ -1416,9 +1823,9 @@ EXPORT_SYMBOL(cpufreq_update_policy);
  * @driver_data: A struct cpufreq_driver containing the values#
  * submitted by the CPU Frequency driver.
  *
- *   Registers a CPU Frequency driver to this core code. This code 
+ *   Registers a CPU Frequency driver to this core code. This code
  * returns zero on success, -EBUSY when another driver got here first
- * (and isn't unregistered in the meantime). 
+ * (and isn't unregistered in the meantime).
  *
  */
 int cpufreq_register_driver(struct cpufreq_driver *driver_data)
@@ -1443,21 +1850,26 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
        cpufreq_driver = driver_data;
        spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
 
-       ret = sysdev_driver_register(&cpu_sysdev_class,&cpufreq_sysdev_driver);
+       ret = sysdev_driver_register(&cpu_sysdev_class,
+                                       &cpufreq_sysdev_driver);
 
        if ((!ret) && !(cpufreq_driver->flags & CPUFREQ_STICKY)) {
                int i;
                ret = -ENODEV;
 
                /* check for at least one working CPU */
-               for (i=0; i<NR_CPUS; i++)
-                       if (cpufreq_cpu_data[i])
+               for (i = 0; i < nr_cpu_ids; i++)
+                       if (cpu_possible(i) && per_cpu(cpufreq_cpu_data, i)) {
                                ret = 0;
+                               break;
+                       }
 
                /* if all ->init() calls failed, unregister */
                if (ret) {
-                       dprintk("no CPU initialized for driver %s\n", driver_data->name);
-                       sysdev_driver_unregister(&cpu_sysdev_class, &cpufreq_sysdev_driver);
+                       dprintk("no CPU initialized for driver %s\n",
+                                                       driver_data->name);
+                       sysdev_driver_unregister(&cpu_sysdev_class,
+                                               &cpufreq_sysdev_driver);
 
                        spin_lock_irqsave(&cpufreq_driver_lock, flags);
                        cpufreq_driver = NULL;
@@ -1466,11 +1878,12 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
        }
 
        if (!ret) {
+               register_hotcpu_notifier(&cpufreq_cpu_notifier);
                dprintk("driver %s up and running\n", driver_data->name);
                cpufreq_debug_enable_ratelimit();
        }
 
-       return (ret);
+       return ret;
 }
 EXPORT_SYMBOL_GPL(cpufreq_register_driver);
 
@@ -1478,7 +1891,7 @@ EXPORT_SYMBOL_GPL(cpufreq_register_driver);
 /**
  * cpufreq_unregister_driver - unregister the current CPUFreq driver
  *
- *    Unregister the current CPUFreq driver. Only call this if you have 
+ *    Unregister the current CPUFreq driver. Only call this if you have
  * the right to do so, i.e. if you have succeeded in initialising before!
  * Returns zero if successful, and -EINVAL if the cpufreq_driver is
  * currently not initialised.
@@ -1497,6 +1910,7 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver)
        dprintk("unregistering driver %s\n", driver->name);
 
        sysdev_driver_unregister(&cpu_sysdev_class, &cpufreq_sysdev_driver);
+       unregister_hotcpu_notifier(&cpufreq_cpu_notifier);
 
        spin_lock_irqsave(&cpufreq_driver_lock, flags);
        cpufreq_driver = NULL;
@@ -1505,3 +1919,16 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver)
        return 0;
 }
 EXPORT_SYMBOL_GPL(cpufreq_unregister_driver);
+
+/* Set every possible CPU's policy_cpu to -1 and initialise its policy rwsem. */
+static int __init cpufreq_core_init(void)
+{
+       int i;
+
+       for_each_possible_cpu(i) {
+               init_rwsem(&per_cpu(cpu_policy_rwsem, i));
+               per_cpu(policy_cpu, i) = -1;
+       }
+       return 0;
+}
+core_initcall(cpufreq_core_init);