sysctl: add proc_do_large_bitmap
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index b1c2da8..1f5dde6 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -21,7 +21,6 @@
  *
  * TODO WishList:
  *   o Allow clocksource drivers to be unregistered
- *   o get rid of clocksource_jiffies extern
  */
 
 #include <linux/clocksource.h>
 #include <linux/module.h>
 #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
 #include <linux/tick.h>
+#include <linux/kthread.h>
 
-/* XXX - Would like a better way for initializing curr_clocksource */
-extern struct clocksource clocksource_jiffies;
+void timecounter_init(struct timecounter *tc,
+                     const struct cyclecounter *cc,
+                     u64 start_tstamp)
+{
+       tc->cc = cc;
+       tc->cycle_last = cc->read(cc);
+       tc->nsec = start_tstamp;
+}
+EXPORT_SYMBOL_GPL(timecounter_init);
+
+/**
+ * timecounter_read_delta - get nanoseconds since last call of this function
+ * @tc:         Pointer to time counter
+ *
+ * When the underlying cycle counter runs over, this will be handled
+ * correctly as long as it does not run over more than once between
+ * calls.
+ *
+ * The first call to this function for a new time counter initializes
+ * the time tracking and returns an undefined result.
+ */
+static u64 timecounter_read_delta(struct timecounter *tc)
+{
+       cycle_t cycle_now, cycle_delta;
+       u64 ns_offset;
+
+       /* read cycle counter: */
+       cycle_now = tc->cc->read(tc->cc);
+
+       /* calculate the delta since the last timecounter_read_delta(): */
+       cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask;
+
+       /* convert to nanoseconds: */
+       ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta);
+
+       /* update time stamp of timecounter_read_delta() call: */
+       tc->cycle_last = cycle_now;
+
+       return ns_offset;
+}
+
+u64 timecounter_read(struct timecounter *tc)
+{
+       u64 nsec;
+
+       /* increment time by nanoseconds since last call */
+       nsec = timecounter_read_delta(tc);
+       nsec += tc->nsec;
+       tc->nsec = nsec;
+
+       return nsec;
+}
+EXPORT_SYMBOL_GPL(timecounter_read);
+
+u64 timecounter_cyc2time(struct timecounter *tc,
+                        cycle_t cycle_tstamp)
+{
+       u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
+       u64 nsec;
+
+       /*
+        * Instead of always treating cycle_tstamp as more recent
+        * than tc->cycle_last, detect when it is too far in the
+        * future and treat it as an old time stamp instead.
+        */
+       if (cycle_delta > tc->cc->mask / 2) {
+               cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask;
+               nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta);
+       } else {
+               nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec;
+       }
+
+       return nsec;
+}
+EXPORT_SYMBOL_GPL(timecounter_cyc2time);
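
The three exported timecounter helpers above give drivers a self-contained way to turn a free-running hardware counter into monotonic nanoseconds (e.g. for hardware packet timestamping). A minimal usage sketch, assuming a hypothetical driver; my_cc, my_tc, my_hw_read(), my_regs and counter_khz are illustrative names, not part of this patch:

        static cycle_t my_hw_read(const struct cyclecounter *cc)
        {
                return (cycle_t)readl(my_regs + 0x10);  /* hypothetical counter register */
        }

        static struct cyclecounter my_cc = {
                .read  = my_hw_read,
                .mask  = CLOCKSOURCE_MASK(32),
                .shift = 22,                            /* mult filled in at probe time */
        };
        static struct timecounter my_tc;

        /* probe: scale counter ticks to nanoseconds, start the count at 0 ns */
        my_cc.mult = clocksource_khz2mult(counter_khz, my_cc.shift);
        timecounter_init(&my_tc, &my_cc, 0);

        /* later: monotonic nanoseconds since timecounter_init() */
        now_ns = timecounter_read(&my_tc);

        /* convert a raw counter value captured by hardware */
        stamp_ns = timecounter_cyc2time(&my_tc, raw_hw_stamp);

Per the kerneldoc above, timecounter_read() has to run at least once per counter wrap for the overflow handling to stay correct.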
+
+/**
+ * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
+ * @mult:      pointer to mult variable
+ * @shift:     pointer to shift variable
+ * @from:      frequency to convert from
+ * @to:                frequency to convert to
+ * @minsec:    guaranteed runtime conversion range in seconds
+ *
+ * The function evaluates the shift/mult pair for the scaled math
+ * operations of clocksources and clockevents.
+ *
+ * @to and @from are frequency values in HZ. For clock sources @to is
+ * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
+ * events @to is the counter frequency and @from is NSEC_PER_SEC.
+ *
+ * The @minsec conversion range argument controls the time frame in
+ * seconds which must be covered by the runtime conversion with the
+ * calculated mult and shift factors. This guarantees that no 64bit
+ * overflow happens when the input value of the conversion is
+ * multiplied with the calculated mult factor. Larger ranges may
+ * reduce the conversion accuracy by choosing smaller mult and shift
+ * factors.
+ */
+void
+clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
+{
+       u64 tmp;
+       u32 sft, sftacc = 32;
+
+       /*
+        * Calculate the shift factor which limits the conversion
+        * range:
+        */
+       tmp = ((u64)minsec * from) >> 32;
+       while (tmp) {
+               tmp >>= 1;
+               sftacc--;
+       }
+
+       /*
+        * Find the conversion shift/mult pair which has the best
+        * accuracy and fits the minsec conversion range:
+        */
+       for (sft = 32; sft > 0; sft--) {
+               tmp = (u64) to << sft;
+               do_div(tmp, from);
+               if ((tmp >> sftacc) == 0)
+                       break;
+       }
+       *mult = tmp;
+       *shift = sft;
+}
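
A worked example of the factors this produces (illustrative numbers, not taken from the patch): converting a 10 MHz counter to nanoseconds means from = 10000000 and to = NSEC_PER_SEC; with minsec = 600 the first loop lowers sftacc to 31, and the search stops at shift = 24, mult = 1677721600 (exactly 100 << 24). The runtime conversion is then

        ns = ((u64)cycles * mult) >> shift;     /* == cycles * 100 here */

and cycles * mult stays below 2^64 for at least 600 seconds worth of counter cycles, as the kerneldoc above guarantees.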
 
 /*[Clocksource internal variables]---------
  * curr_clocksource:
- *     currently selected clocksource. Initialized to clocksource_jiffies.
- * next_clocksource:
- *     pending next selected clocksource.
+ *     currently selected clocksource.
  * clocksource_list:
  *     linked list with the registered clocksources
- * clocksource_lock:
- *     protects manipulations to curr_clocksource and next_clocksource
- *     and the clocksource_list
+ * clocksource_mutex:
+ *     protects manipulations to curr_clocksource and the clocksource_list
  * override_name:
  *     Name of the user-specified clocksource.
  */
-static struct clocksource *curr_clocksource = &clocksource_jiffies;
-static struct clocksource *next_clocksource;
-static struct clocksource *clocksource_override;
+static struct clocksource *curr_clocksource;
 static LIST_HEAD(clocksource_list);
-static DEFINE_SPINLOCK(clocksource_lock);
+static DEFINE_MUTEX(clocksource_mutex);
 static char override_name[32];
 static int finished_booting;
 
-/* clocksource_done_booting - Called near the end of core bootup
- *
- * Hack to avoid lots of clocksource churn at boot time.
- * We use fs_initcall because we want this to start before
- * device_initcall but after subsys_initcall.
- */
-static int __init clocksource_done_booting(void)
-{
-       finished_booting = 1;
-       return 0;
-}
-fs_initcall(clocksource_done_booting);
-
 #ifdef CONFIG_CLOCKSOURCE_WATCHDOG
+static void clocksource_watchdog_work(struct work_struct *work);
+
 static LIST_HEAD(watchdog_list);
 static struct clocksource *watchdog;
 static struct timer_list watchdog_timer;
+static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
 static DEFINE_SPINLOCK(watchdog_lock);
 static cycle_t watchdog_last;
-static unsigned long watchdog_resumed;
+static int watchdog_running;
+
+static int clocksource_watchdog_kthread(void *data);
+static void __clocksource_change_rating(struct clocksource *cs, int rating);
 
 /*
  * Interval: 0.5sec Threshold: 0.0625s
@@ -82,134 +196,273 @@ static unsigned long watchdog_resumed;
 #define WATCHDOG_INTERVAL (HZ >> 1)
 #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
 
-static void clocksource_ratewd(struct clocksource *cs, int64_t delta)
+static void clocksource_watchdog_work(struct work_struct *work)
 {
-       if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD)
-               return;
+       /*
+        * If kthread_run fails, the next watchdog scan over the
+        * watchdog_list will find the unstable clock again.
+        */
+       kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
+}
 
+static void __clocksource_unstable(struct clocksource *cs)
+{
+       cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
+       cs->flags |= CLOCK_SOURCE_UNSTABLE;
+       if (finished_booting)
+               schedule_work(&watchdog_work);
+}
+
+static void clocksource_unstable(struct clocksource *cs, int64_t delta)
+{
        printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
               cs->name, delta);
-       cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
-       clocksource_change_rating(cs, 0);
-       list_del(&cs->wd_list);
+       __clocksource_unstable(cs);
+}
+
+/**
+ * clocksource_mark_unstable - mark clocksource unstable via watchdog
+ * @cs:                clocksource to be marked unstable
+ *
+ * This function is called instead of clocksource_change_rating from
+ * cpu hotplug code to avoid a deadlock between the clocksource mutex
+ * and the cpu hotplug mutex. It defers the update of the clocksource
+ * to the watchdog thread.
+ */
+void clocksource_mark_unstable(struct clocksource *cs)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&watchdog_lock, flags);
+       if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) {
+               if (list_empty(&cs->wd_list))
+                       list_add(&cs->wd_list, &watchdog_list);
+               __clocksource_unstable(cs);
+       }
+       spin_unlock_irqrestore(&watchdog_lock, flags);
 }
 
 static void clocksource_watchdog(unsigned long data)
 {
-       struct clocksource *cs, *tmp;
+       struct clocksource *cs;
        cycle_t csnow, wdnow;
        int64_t wd_nsec, cs_nsec;
-       int resumed;
+       int next_cpu;
 
        spin_lock(&watchdog_lock);
+       if (!watchdog_running)
+               goto out;
 
-       resumed = test_and_clear_bit(0, &watchdog_resumed);
-
-       wdnow = watchdog->read();
-       wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
+       wdnow = watchdog->read(watchdog);
+       wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask,
+                                    watchdog->mult, watchdog->shift);
        watchdog_last = wdnow;
 
-       list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
-               csnow = cs->read();
+       list_for_each_entry(cs, &watchdog_list, wd_list) {
 
-               if (unlikely(resumed)) {
-                       cs->wd_last = csnow;
+               /* Clocksource already marked unstable? */
+               if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
+                       if (finished_booting)
+                               schedule_work(&watchdog_work);
                        continue;
                }
 
-               /* Initialized ? */
+               csnow = cs->read(cs);
+
+               /* Clocksource initialized ? */
                if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
-                       if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
-                           (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
-                               cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
-                               /*
-                                * We just marked the clocksource as
-                                * highres-capable, notify the rest of the
-                                * system as well so that we transition
-                                * into high-res mode:
-                                */
-                               tick_clock_notify();
-                       }
                        cs->flags |= CLOCK_SOURCE_WATCHDOG;
                        cs->wd_last = csnow;
-               } else {
-                       cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask);
-                       cs->wd_last = csnow;
-                       /* Check the delta. Might remove from the list ! */
-                       clocksource_ratewd(cs, cs_nsec - wd_nsec);
+                       continue;
                }
-       }
 
-       if (!list_empty(&watchdog_list)) {
-               /*
-                * Cycle through CPUs to check if the CPUs stay
-                * synchronized to each other.
-                */
-               int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map);
+               /* Check the deviation from the watchdog clocksource. */
+               cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) &
+                                            cs->mask, cs->mult, cs->shift);
+               cs->wd_last = csnow;
+               if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
+                       clocksource_unstable(cs, cs_nsec - wd_nsec);
+                       continue;
+               }
 
-               if (next_cpu >= NR_CPUS)
-                       next_cpu = first_cpu(cpu_online_map);
-               watchdog_timer.expires += WATCHDOG_INTERVAL;
-               add_timer_on(&watchdog_timer, next_cpu);
+               if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
+                   (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
+                   (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
+                       cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+                       /*
+                        * We just marked the clocksource as highres-capable,
+                        * notify the rest of the system as well so that we
+                        * transition into high-res mode:
+                        */
+                       tick_clock_notify();
+               }
        }
+
+       /*
+        * Cycle through CPUs to check if the CPUs stay synchronized
+        * to each other.
+        */
+       next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
+       if (next_cpu >= nr_cpu_ids)
+               next_cpu = cpumask_first(cpu_online_mask);
+       watchdog_timer.expires += WATCHDOG_INTERVAL;
+       add_timer_on(&watchdog_timer, next_cpu);
+out:
        spin_unlock(&watchdog_lock);
 }
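
For reference, the clocksource_cyc2ns() helper used above replaces the old clocksource-pointer based cyc2ns(); it is essentially the plain scaled multiplication from include/linux/clocksource.h:

        static inline s64 clocksource_cyc2ns(cycle_t cycles, u32 mult, u32 shift)
        {
                return ((u64) cycles * mult) >> shift;
        }

With WATCHDOG_THRESHOLD = NSEC_PER_SEC >> 4 = 62500000 ns, a watched clocksource is declared unstable once it drifts more than 62.5 ms from the watchdog within one 0.5 s interval.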
+
+static inline void clocksource_start_watchdog(void)
+{
+       if (watchdog_running || !watchdog || list_empty(&watchdog_list))
+               return;
+       init_timer(&watchdog_timer);
+       watchdog_timer.function = clocksource_watchdog;
+       watchdog_last = watchdog->read(watchdog);
+       watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
+       add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
+       watchdog_running = 1;
+}
+
+static inline void clocksource_stop_watchdog(void)
+{
+       if (!watchdog_running || (watchdog && !list_empty(&watchdog_list)))
+               return;
+       del_timer(&watchdog_timer);
+       watchdog_running = 0;
+}
+
+static inline void clocksource_reset_watchdog(void)
+{
+       struct clocksource *cs;
+
+       list_for_each_entry(cs, &watchdog_list, wd_list)
+               cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
+}
+
 static void clocksource_resume_watchdog(void)
 {
-       set_bit(0, &watchdog_resumed);
+       unsigned long flags;
+
+       /*
+        * We use trylock here to avoid a potential deadlock when
+        * kgdb calls this code after the kernel has been stopped with
+        * watchdog_lock held. When watchdog_lock is held we just
+        * return and accept that the watchdog might trigger and mark
+        * the monitored clock source (usually TSC) unstable.
+        *
+        * This does not affect the other caller, clocksource_resume(),
+        * because at this point the kernel is UP, interrupts are
+        * disabled and nothing can hold watchdog_lock.
+        */
+       if (!spin_trylock_irqsave(&watchdog_lock, flags))
+               return;
+       clocksource_reset_watchdog();
+       spin_unlock_irqrestore(&watchdog_lock, flags);
 }
 
-static void clocksource_check_watchdog(struct clocksource *cs)
+static void clocksource_enqueue_watchdog(struct clocksource *cs)
 {
-       struct clocksource *cse;
        unsigned long flags;
 
        spin_lock_irqsave(&watchdog_lock, flags);
        if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
-               int started = !list_empty(&watchdog_list);
-
+               /* cs is a clocksource to be watched. */
                list_add(&cs->wd_list, &watchdog_list);
-               if (!started && watchdog) {
-                       watchdog_last = watchdog->read();
-                       watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
-                       add_timer_on(&watchdog_timer,
-                                    first_cpu(cpu_online_map));
-               }
+               cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
        } else {
+               /* cs is a watchdog. */
                if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
                        cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
-
+               /* Pick the best watchdog. */
                if (!watchdog || cs->rating > watchdog->rating) {
-                       if (watchdog)
-                               del_timer(&watchdog_timer);
                        watchdog = cs;
-                       init_timer(&watchdog_timer);
-                       watchdog_timer.function = clocksource_watchdog;
-
                        /* Reset watchdog cycles */
-                       list_for_each_entry(cse, &watchdog_list, wd_list)
-                               cse->flags &= ~CLOCK_SOURCE_WATCHDOG;
-                       /* Start if list is not empty */
-                       if (!list_empty(&watchdog_list)) {
-                               watchdog_last = watchdog->read();
-                               watchdog_timer.expires =
-                                       jiffies + WATCHDOG_INTERVAL;
-                               add_timer_on(&watchdog_timer,
-                                            first_cpu(cpu_online_map));
-                       }
+                       clocksource_reset_watchdog();
                }
        }
+       /* Check if the watchdog timer needs to be started. */
+       clocksource_start_watchdog();
+       spin_unlock_irqrestore(&watchdog_lock, flags);
+}
+
+static void clocksource_dequeue_watchdog(struct clocksource *cs)
+{
+       struct clocksource *tmp;
+       unsigned long flags;
+
+       spin_lock_irqsave(&watchdog_lock, flags);
+       if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
+               /* cs is a watched clocksource. */
+               list_del_init(&cs->wd_list);
+       } else if (cs == watchdog) {
+               /* Reset watchdog cycles */
+               clocksource_reset_watchdog();
+               /* Current watchdog is removed. Find an alternative. */
+               watchdog = NULL;
+               list_for_each_entry(tmp, &clocksource_list, list) {
+                       if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY)
+                               continue;
+                       if (!watchdog || tmp->rating > watchdog->rating)
+                               watchdog = tmp;
+               }
+       }
+       cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
+       /* Check if the watchdog timer needs to be stopped. */
+       clocksource_stop_watchdog();
+       spin_unlock_irqrestore(&watchdog_lock, flags);
+}
+
+static int clocksource_watchdog_kthread(void *data)
+{
+       struct clocksource *cs, *tmp;
+       unsigned long flags;
+       LIST_HEAD(unstable);
+
+       mutex_lock(&clocksource_mutex);
+       spin_lock_irqsave(&watchdog_lock, flags);
+       list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list)
+               if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
+                       list_del_init(&cs->wd_list);
+                       list_add(&cs->wd_list, &unstable);
+               }
+       /* Check if the watchdog timer needs to be stopped. */
+       clocksource_stop_watchdog();
        spin_unlock_irqrestore(&watchdog_lock, flags);
+
+       /* Needs to be done outside of watchdog lock */
+       list_for_each_entry_safe(cs, tmp, &unstable, wd_list) {
+               list_del_init(&cs->wd_list);
+               __clocksource_change_rating(cs, 0);
+       }
+       mutex_unlock(&clocksource_mutex);
+       return 0;
 }
-#else
-static void clocksource_check_watchdog(struct clocksource *cs)
+
+#else /* CONFIG_CLOCKSOURCE_WATCHDOG */
+
+static void clocksource_enqueue_watchdog(struct clocksource *cs)
 {
        if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
                cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
 }
 
+static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
 static inline void clocksource_resume_watchdog(void) { }
-#endif
+static inline int clocksource_watchdog_kthread(void *data) { return 0; }
+
+#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
+
+/**
+ * clocksource_suspend - suspend the clocksource(s)
+ */
+void clocksource_suspend(void)
+{
+       struct clocksource *cs;
+
+       list_for_each_entry_reverse(cs, &clocksource_list, list)
+               if (cs->suspend)
+                       cs->suspend(cs);
+}
 
 /**
  * clocksource_resume - resume the clocksource(s)
@@ -217,26 +470,20 @@ static inline void clocksource_resume_watchdog(void) { }
 void clocksource_resume(void)
 {
        struct clocksource *cs;
-       unsigned long flags;
 
-       spin_lock_irqsave(&clocksource_lock, flags);
-
-       list_for_each_entry(cs, &clocksource_list, list) {
+       list_for_each_entry(cs, &clocksource_list, list)
                if (cs->resume)
-                       cs->resume();
-       }
+                       cs->resume(cs);
 
        clocksource_resume_watchdog();
-
-       spin_unlock_irqrestore(&clocksource_lock, flags);
 }
 
 /**
  * clocksource_touch_watchdog - Update watchdog
  *
  * Update the watchdog after exception contexts such as kgdb so as not
- * to incorrectly trip the watchdog.
- *
+ * to incorrectly trip the watchdog. This might fail when the kernel
+ * was stopped in code which holds watchdog_lock.
  */
 void clocksource_touch_watchdog(void)
 {
@@ -244,75 +491,139 @@ void clocksource_touch_watchdog(void)
 }
 
 /**
- * clocksource_get_next - Returns the selected clocksource
+ * clocksource_max_deferment - Returns max time the clocksource can be deferred
+ * @cs:         Pointer to clocksource
  *
  */
-struct clocksource *clocksource_get_next(void)
+static u64 clocksource_max_deferment(struct clocksource *cs)
 {
-       unsigned long flags;
-
-       spin_lock_irqsave(&clocksource_lock, flags);
-       if (next_clocksource && finished_booting) {
-               curr_clocksource = next_clocksource;
-               next_clocksource = NULL;
-       }
-       spin_unlock_irqrestore(&clocksource_lock, flags);
-
-       return curr_clocksource;
+       u64 max_nsecs, max_cycles;
+
+       /*
+        * Calculate the maximum number of cycles that we can pass to the
+        * cyc2ns function without overflowing a 64-bit signed result. The
+        * maximum number of cycles is equal to LLONG_MAX/cs->mult, which
+        * is equivalent to the below.
+        * max_cycles < (2^63)/cs->mult
+        * max_cycles < 2^(log2((2^63)/cs->mult))
+        * max_cycles < 2^(log2(2^63) - log2(cs->mult))
+        * max_cycles < 2^(63 - log2(cs->mult))
+        * max_cycles < 1 << (63 - log2(cs->mult))
+        * Please note that we add 1 to the result of the log2 to account for
+        * any rounding errors, ensure the above inequality is satisfied and
+        * no overflow will occur.
+        */
+       max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1));
+
+       /*
+        * The actual maximum number of cycles we can defer the clocksource is
+        * determined by the minimum of max_cycles and cs->mask.
+        */
+       max_cycles = min_t(u64, max_cycles, (u64) cs->mask);
+       max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift);
+
+       /*
+        * To ensure that the clocksource does not wrap whilst we are idle,
+        * limit the time the clocksource can be deferred by a small
+        * safety margin (max_nsecs >> 5, i.e. about 3%). A power-of-two
+        * fraction is used because it can be computed with a shift,
+        * versus say 10% which would require a division.
+        */
+       return max_nsecs - (max_nsecs >> 5);
 }
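
Continuing the 10 MHz example from clocks_calc_mult_shift() above (illustrative numbers): with mult = 1677721600, ilog2(mult) = 30, so max_cycles = 1ULL << (63 - 31) = 2^32, which the min_t() clamps to the 32 bit counter mask. That yields max_nsecs of roughly (2^32 - 1) * 100 ns, about 429 s, and after subtracting the max_nsecs >> 5 margin (~13 s) a max_idle_ns of roughly 416 seconds.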
 
+#ifdef CONFIG_GENERIC_TIME
+
 /**
- * select_clocksource - Selects the best registered clocksource.
+ * clocksource_select - Select the best clocksource available
  *
- * Private function. Must hold clocksource_lock when called.
+ * Private function. Must hold clocksource_mutex when called.
  *
  * Select the clocksource with the best rating, or the clocksource,
  * which is selected by userspace override.
  */
-static struct clocksource *select_clocksource(void)
+static void clocksource_select(void)
 {
-       struct clocksource *next;
+       struct clocksource *best, *cs;
 
-       if (list_empty(&clocksource_list))
-               return NULL;
+       if (!finished_booting || list_empty(&clocksource_list))
+               return;
+       /* First clocksource on the list has the best rating. */
+       best = list_first_entry(&clocksource_list, struct clocksource, list);
+       /* Check for the override clocksource. */
+       list_for_each_entry(cs, &clocksource_list, list) {
+               if (strcmp(cs->name, override_name) != 0)
+                       continue;
+               /*
+                * Check to make sure we don't switch to a non-highres
+                * capable clocksource if the tick code is in oneshot
+                * mode (highres or nohz)
+                */
+               if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
+                   tick_oneshot_mode_active()) {
+                       /* Override clocksource cannot be used. */
+                       printk(KERN_WARNING "Override clocksource %s is not "
+                              "HRT compatible. Cannot switch while in "
+                              "HRT/NOHZ mode\n", cs->name);
+                       override_name[0] = 0;
+               } else
+                       /* Override clocksource can be used. */
+                       best = cs;
+               break;
+       }
+       if (curr_clocksource != best) {
+               printk(KERN_INFO "Switching to clocksource %s\n", best->name);
+               curr_clocksource = best;
+               timekeeping_notify(curr_clocksource);
+       }
+}
 
-       if (clocksource_override)
-               next = clocksource_override;
-       else
-               next = list_entry(clocksource_list.next, struct clocksource,
-                                 list);
+#else /* CONFIG_GENERIC_TIME */
 
-       if (next == curr_clocksource)
-               return NULL;
+static inline void clocksource_select(void) { }
 
-       return next;
-}
+#endif
 
 /*
- * Enqueue the clocksource sorted by rating
+ * clocksource_done_booting - Called near the end of core bootup
+ *
+ * Hack to avoid lots of clocksource churn at boot time.
+ * We use fs_initcall because we want this to start before
+ * device_initcall but after subsys_initcall.
  */
-static int clocksource_enqueue(struct clocksource *c)
+static int __init clocksource_done_booting(void)
 {
-       struct list_head *tmp, *entry = &clocksource_list;
+       mutex_lock(&clocksource_mutex);
+       curr_clocksource = clocksource_default_clock();
+       mutex_unlock(&clocksource_mutex);
 
-       list_for_each(tmp, &clocksource_list) {
-               struct clocksource *cs;
-
-               cs = list_entry(tmp, struct clocksource, list);
-               if (cs == c)
-                       return -EBUSY;
-               /* Keep track of the place, where to insert */
-               if (cs->rating >= c->rating)
-                       entry = tmp;
-       }
-       list_add(&c->list, entry);
+       finished_booting = 1;
 
-       if (strlen(c->name) == strlen(override_name) &&
-           !strcmp(c->name, override_name))
-               clocksource_override = c;
+       /*
+        * Run the watchdog first to eliminate unstable clock sources
+        */
+       clocksource_watchdog_kthread(NULL);
 
+       mutex_lock(&clocksource_mutex);
+       clocksource_select();
+       mutex_unlock(&clocksource_mutex);
        return 0;
 }
+fs_initcall(clocksource_done_booting);
+
+/*
+ * Enqueue the clocksource sorted by rating
+ */
+static void clocksource_enqueue(struct clocksource *cs)
+{
+       struct list_head *entry = &clocksource_list;
+       struct clocksource *tmp;
+
+       list_for_each_entry(tmp, &clocksource_list, list)
+               /* Keep track of where to insert */
+               if (tmp->rating >= cs->rating)
+                       entry = &tmp->list;
+       list_add(&cs->list, entry);
+}
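
For example, with the typical x86 ratings (tsc ~300, hpet ~250, acpi_pm ~200, jiffies 1) the list ends up ordered tsc, hpet, acpi_pm, jiffies, so clocksource_select() picks tsc unless override_name names one of the lower-rated entries.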
 
 /**
  * clocksource_register - Used to install new clocksources
@@ -320,52 +631,51 @@ static int clocksource_enqueue(struct clocksource *c)
  *
  * Returns -EBUSY if registration fails, zero otherwise.
  */
-int clocksource_register(struct clocksource *c)
+int clocksource_register(struct clocksource *cs)
 {
-       unsigned long flags;
-       int ret;
-
-       spin_lock_irqsave(&clocksource_lock, flags);
-       ret = clocksource_enqueue(c);
-       if (!ret)
-               next_clocksource = select_clocksource();
-       spin_unlock_irqrestore(&clocksource_lock, flags);
-       if (!ret)
-               clocksource_check_watchdog(c);
-       return ret;
+       /* calculate max idle time permitted for this clocksource */
+       cs->max_idle_ns = clocksource_max_deferment(cs);
+
+       mutex_lock(&clocksource_mutex);
+       clocksource_enqueue(cs);
+       clocksource_select();
+       clocksource_enqueue_watchdog(cs);
+       mutex_unlock(&clocksource_mutex);
+       return 0;
 }
 EXPORT_SYMBOL(clocksource_register);
 
+static void __clocksource_change_rating(struct clocksource *cs, int rating)
+{
+       list_del(&cs->list);
+       cs->rating = rating;
+       clocksource_enqueue(cs);
+       clocksource_select();
+}
+
 /**
  * clocksource_change_rating - Change the rating of a registered clocksource
- *
  */
 void clocksource_change_rating(struct clocksource *cs, int rating)
 {
-       unsigned long flags;
-
-       spin_lock_irqsave(&clocksource_lock, flags);
-       list_del(&cs->list);
-       cs->rating = rating;
-       clocksource_enqueue(cs);
-       next_clocksource = select_clocksource();
-       spin_unlock_irqrestore(&clocksource_lock, flags);
+       mutex_lock(&clocksource_mutex);
+       __clocksource_change_rating(cs, rating);
+       mutex_unlock(&clocksource_mutex);
 }
+EXPORT_SYMBOL(clocksource_change_rating);
 
 /**
  * clocksource_unregister - remove a registered clocksource
  */
 void clocksource_unregister(struct clocksource *cs)
 {
-       unsigned long flags;
-
-       spin_lock_irqsave(&clocksource_lock, flags);
+       mutex_lock(&clocksource_mutex);
+       clocksource_dequeue_watchdog(cs);
        list_del(&cs->list);
-       if (clocksource_override == cs)
-               clocksource_override = NULL;
-       next_clocksource = select_clocksource();
-       spin_unlock_irqrestore(&clocksource_lock, flags);
+       clocksource_select();
+       mutex_unlock(&clocksource_mutex);
 }
+EXPORT_SYMBOL(clocksource_unregister);
 
 #ifdef CONFIG_SYSFS
 /**
@@ -381,9 +691,9 @@ sysfs_show_current_clocksources(struct sys_device *dev,
 {
        ssize_t count = 0;
 
-       spin_lock_irq(&clocksource_lock);
+       mutex_lock(&clocksource_mutex);
        count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
-       spin_unlock_irq(&clocksource_lock);
+       mutex_unlock(&clocksource_mutex);
 
        return count;
 }
@@ -395,15 +705,13 @@ sysfs_show_current_clocksources(struct sys_device *dev,
  * @count:     length of buffer
  *
  * Takes input from sysfs interface for manually overriding the default
- * clocksource selction.
+ * clocksource selection.
  */
 static ssize_t sysfs_override_clocksource(struct sys_device *dev,
                                          struct sysdev_attribute *attr,
                                          const char *buf, size_t count)
 {
-       struct clocksource *ovr = NULL;
        size_t ret = count;
-       int len;
 
        /* strings from sysfs write are not 0 terminated! */
        if (count >= sizeof(override_name))
@@ -413,32 +721,14 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
        if (buf[count-1] == '\n')
                count--;
 
-       spin_lock_irq(&clocksource_lock);
+       mutex_lock(&clocksource_mutex);
 
        if (count > 0)
                memcpy(override_name, buf, count);
        override_name[count] = 0;
+       clocksource_select();
 
-       len = strlen(override_name);
-       if (len) {
-               struct clocksource *cs;
-
-               ovr = clocksource_override;
-               /* try to select it: */
-               list_for_each_entry(cs, &clocksource_list, list) {
-                       if (strlen(cs->name) == len &&
-                           !strcmp(cs->name, override_name))
-                               ovr = cs;
-               }
-       }
-
-       /* Reselect, when the override name has changed */
-       if (ovr != clocksource_override) {
-               clocksource_override = ovr;
-               next_clocksource = select_clocksource();
-       }
-
-       spin_unlock_irq(&clocksource_lock);
+       mutex_unlock(&clocksource_mutex);
 
        return ret;
 }
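
With the mutex based rework, overriding from user space is now just: store the name and run clocksource_select(). For example, "echo hpet > /sys/devices/system/clocksource/clocksource0/current_clocksource" switches to hpet immediately if it is registered and usable in the current tick mode; a name that does not (yet) match any clocksource is simply remembered and takes effect when a clocksource of that name registers.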
@@ -458,13 +748,19 @@ sysfs_show_available_clocksources(struct sys_device *dev,
        struct clocksource *src;
        ssize_t count = 0;
 
-       spin_lock_irq(&clocksource_lock);
+       mutex_lock(&clocksource_mutex);
        list_for_each_entry(src, &clocksource_list, list) {
-               count += snprintf(buf + count,
+               /*
+                * Don't show non-HRES clocksources if the tick code is
+                * in oneshot mode (highres=on or nohz=on)
+                */
+               if (!tick_oneshot_mode_active() ||
+                   (src->flags & CLOCK_SOURCE_VALID_FOR_HRES))
+                       count += snprintf(buf + count,
                                  max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
                                  "%s ", src->name);
        }
-       spin_unlock_irq(&clocksource_lock);
+       mutex_unlock(&clocksource_mutex);
 
        count += snprintf(buf + count,
                          max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n");
@@ -519,11 +815,10 @@ device_initcall(init_clocksource_sysfs);
  */
 static int __init boot_override_clocksource(char* str)
 {
-       unsigned long flags;
-       spin_lock_irqsave(&clocksource_lock, flags);
+       mutex_lock(&clocksource_mutex);
        if (str)
                strlcpy(override_name, str, sizeof(override_name));
-       spin_unlock_irqrestore(&clocksource_lock, flags);
+       mutex_unlock(&clocksource_mutex);
        return 1;
 }