X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;ds=sidebyside;f=kernel%2Fsched_clock.c;h=906a0f718cb32c16797b83df68a4eb23d3e15a16;hb=b2e75eff5e859d0c294e7405958362b26a423c6e;hp=60094e257a9a1616e187b2b2449ec998b01c9b86;hpb=2d452c9b10caeec455eb5e56a0ef4ed485178213;p=safe%2Fjmp%2Flinux-2.6

diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 60094e2..906a0f7 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -3,45 +3,52 @@
  *
  *  Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  *
+ *  Updates and enhancements:
+ *    Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com>
+ *
  * Based on code by:
  *   Ingo Molnar <mingo@redhat.com>
  *   Guillaume Chazarain <guichaz@gmail.com>
  *
  * Create a semi stable clock from a mixture of other events, including:
  *  - gtod
- *  - jiffies
  *  - sched_clock()
  *  - explicit idle events
  *
  * We use gtod as base and the unstable clock deltas. The deltas are filtered,
- * making it monotonic and keeping it within an expected window.  This window
- * is set up using jiffies.
+ * making it monotonic and keeping it within an expected window.
  *
  * Furthermore, explicit sleep and wakeup hooks allow us to account for time
  * that is otherwise invisible (TSC gets stopped).
  *
  * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
- * consistent between cpus (never more than 1 jiffies difference).
+ * consistent between cpus (never more than 2 jiffies difference).
  */
-#include <linux/sched.h>
-#include <linux/percpu.h>
 #include <linux/spinlock.h>
-#include <linux/ktime.h>
+#include <linux/hardirq.h>
 #include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/ktime.h>
+#include <linux/sched.h>
 
+/*
+ * Scheduler clock - returns current time in nanosec units.
+ * This is default implementation.
+ * Architectures and sub-architectures can override this.
+ */
+unsigned long long __attribute__((weak)) sched_clock(void)
+{
+	return (unsigned long long)(jiffies - INITIAL_JIFFIES)
+					* (NSEC_PER_SEC / HZ);
+}
+EXPORT_SYMBOL_GPL(sched_clock);
+
+static __read_mostly int sched_clock_running;
 
 #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+__read_mostly int sched_clock_stable;
 
 struct sched_clock_data {
-	/*
-	 * Raw spinlock - this is a special case: this might be called
-	 * from within instrumentation code so we dont want to do any
-	 * instrumentation ourselves.
-	 */
-	raw_spinlock_t		lock;
-
-	unsigned long		prev_jiffies;
-	u64			prev_raw;
 	u64			tick_raw;
 	u64			tick_gtod;
 	u64			clock;
@@ -59,20 +66,14 @@ static inline struct sched_clock_data *cpu_sdc(int cpu)
 	return &per_cpu(sched_clock_data, cpu);
 }
 
-static __read_mostly int sched_clock_running;
-
 void sched_clock_init(void)
 {
 	u64 ktime_now = ktime_to_ns(ktime_get());
-	unsigned long now_jiffies = jiffies;
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
 		struct sched_clock_data *scd = cpu_sdc(cpu);
 
-		scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
-		scd->prev_jiffies = now_jiffies;
-		scd->prev_raw = 0;
 		scd->tick_raw = 0;
 		scd->tick_gtod = ktime_now;
 		scd->clock = ktime_now;
@@ -82,123 +83,136 @@ void sched_clock_init(void)
 }
 
 /*
+ * min, max except they take wrapping into account
+ */
+
+static inline u64 wrap_min(u64 x, u64 y)
+{
+	return (s64)(x - y) < 0 ? x : y;
+}
+
+static inline u64 wrap_max(u64 x, u64 y)
+{
+	return (s64)(x - y) > 0 ? x : y;
+}
+
+/*
  * update the percpu scd from the raw @now value
  *
  *  - filter out backward motion
- *  - use jiffies to generate a min,max window to clip the raw values
+ *  - use the GTOD tick value to create a window to filter crazy TSC values
  */
-static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
+static u64 sched_clock_local(struct sched_clock_data *scd)
 {
-	unsigned long now_jiffies = jiffies;
-	long delta_jiffies = now_jiffies - scd->prev_jiffies;
-	u64 clock = scd->clock;
-	u64 min_clock, max_clock;
-	s64 delta = now - scd->prev_raw;
+	u64 now, clock, old_clock, min_clock, max_clock;
+	s64 delta;
 
-	WARN_ON_ONCE(!irqs_disabled());
-	min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC;
+again:
+	now = sched_clock();
+	delta = now - scd->tick_raw;
+	if (unlikely(delta < 0))
+		delta = 0;
 
-	if (unlikely(delta < 0)) {
-		clock++;
-		goto out;
-	}
+	old_clock = scd->clock;
 
-	max_clock = min_clock + TICK_NSEC;
+	/*
+	 * scd->clock = clamp(scd->tick_gtod + delta,
+	 *		      max(scd->tick_gtod, scd->clock),
+	 *		      scd->tick_gtod + TICK_NSEC);
+	 */
 
-	if (unlikely(clock + delta > max_clock)) {
-		if (clock < max_clock)
-			clock = max_clock;
-		else
-			clock++;
-	} else {
-		clock += delta;
-	}
+	clock = scd->tick_gtod + delta;
+	min_clock = wrap_max(scd->tick_gtod, old_clock);
+	max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
 
- out:
-	if (unlikely(clock < min_clock))
-		clock = min_clock;
+	clock = wrap_max(clock, min_clock);
+	clock = wrap_min(clock, max_clock);
 
-	scd->prev_raw = now;
-	scd->prev_jiffies = now_jiffies;
-	scd->clock = clock;
+	if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
+		goto again;
+
+	return clock;
 }
 
-static void lock_double_clock(struct sched_clock_data *data1,
-				struct sched_clock_data *data2)
+static u64 sched_clock_remote(struct sched_clock_data *scd)
 {
-	if (data1 < data2) {
-		__raw_spin_lock(&data1->lock);
-		__raw_spin_lock(&data2->lock);
+	struct sched_clock_data *my_scd = this_scd();
+	u64 this_clock, remote_clock;
+	u64 *ptr, old_val, val;
+
+	sched_clock_local(my_scd);
+again:
+	this_clock = my_scd->clock;
+	remote_clock = scd->clock;
+
+	/*
+	 * Use the opportunity that we have both locks
+	 * taken to couple the two clocks: we take the
+	 * larger time as the latest time for both
+	 * runqueues. (this creates monotonic movement)
+	 */
+	if (likely((s64)(remote_clock - this_clock) < 0)) {
+		ptr = &scd->clock;
+		old_val = remote_clock;
+		val = this_clock;
 	} else {
-		__raw_spin_lock(&data2->lock);
-		__raw_spin_lock(&data1->lock);
+		/*
+		 * Should be rare, but possible:
+		 */
+		ptr = &my_scd->clock;
+		old_val = this_clock;
+		val = remote_clock;
 	}
+
+	if (cmpxchg64(ptr, old_val, val) != old_val)
+		goto again;
+
+	return val;
 }
 
 u64 sched_clock_cpu(int cpu)
 {
-	struct sched_clock_data *scd = cpu_sdc(cpu);
-	u64 now, clock;
-
-	if (unlikely(!sched_clock_running))
-		return 0ull;
+	struct sched_clock_data *scd;
+	u64 clock;
 
 	WARN_ON_ONCE(!irqs_disabled());
-	now = sched_clock();
-
-	if (cpu != raw_smp_processor_id()) {
-		/*
-		 * in order to update a remote cpu's clock based on our
-		 * unstable raw time rebase it against:
-		 *   tick_raw		(offset between raw counters)
-		 *   tick_gotd          (tick offset between cpus)
-		 */
-		struct sched_clock_data *my_scd = this_scd();
-
-		lock_double_clock(scd, my_scd);
-
-		now -= my_scd->tick_raw;
-		now += scd->tick_raw;
 
-		now -= my_scd->tick_gtod;
-		now += scd->tick_gtod;
+	if (sched_clock_stable)
+		return sched_clock();
 
-		__raw_spin_unlock(&my_scd->lock);
-	} else {
-		__raw_spin_lock(&scd->lock);
-	}
+	if (unlikely(!sched_clock_running))
+		return 0ull;
 
-	__update_sched_clock(scd, now);
-	clock = scd->clock;
+	scd = cpu_sdc(cpu);
 
-	__raw_spin_unlock(&scd->lock);
+	if (cpu != smp_processor_id())
+		clock = sched_clock_remote(scd);
+	else
+		clock = sched_clock_local(scd);
 
 	return clock;
 }
 
 void sched_clock_tick(void)
 {
-	struct sched_clock_data *scd = this_scd();
+	struct sched_clock_data *scd;
 	u64 now, now_gtod;
 
+	if (sched_clock_stable)
+		return;
+
 	if (unlikely(!sched_clock_running))
 		return;
 
 	WARN_ON_ONCE(!irqs_disabled());
 
-	now = sched_clock();
+	scd = this_scd();
 	now_gtod = ktime_to_ns(ktime_get());
+	now = sched_clock();
 
-	__raw_spin_lock(&scd->lock);
-	__update_sched_clock(scd, now);
-	/*
-	 * update tick_gtod after __update_sched_clock() because that will
-	 * already observe 1 new jiffy; adding a new tick_gtod to that would
-	 * increase the clock 2 jiffies.
-	 */
 	scd->tick_raw = now;
 	scd->tick_gtod = now_gtod;
-	__raw_spin_unlock(&scd->lock);
+	sched_clock_local(scd);
 }
 
 /*
@@ -215,36 +229,14 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
  */
 void sched_clock_idle_wakeup_event(u64 delta_ns)
 {
-	struct sched_clock_data *scd = this_scd();
-	u64 now = sched_clock();
-
-	/*
-	 * Override the previous timestamp and ignore all
-	 * sched_clock() deltas that occured while we idled,
-	 * and use the PM-provided delta_ns to advance the
-	 * rq clock:
-	 */
-	__raw_spin_lock(&scd->lock);
-	scd->prev_raw = now;
-	scd->clock += delta_ns;
-	__raw_spin_unlock(&scd->lock);
+	if (timekeeping_suspended)
+		return;
 
+	sched_clock_tick();
 	touch_softlockup_watchdog();
 }
 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
-#endif
-
-/*
- * Scheduler clock - returns current time in nanosec units.
- * This is default implementation.
- * Architectures and sub-architectures can override this.
- */
-unsigned long long __attribute__((weak)) sched_clock(void)
-{
-	return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
-}
-
 unsigned long long cpu_clock(int cpu)
 {
 	unsigned long long clock;
@@ -256,4 +248,28 @@ unsigned long long cpu_clock(int cpu)
 
 	return clock;
 }
+
+#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
+
+void sched_clock_init(void)
+{
+	sched_clock_running = 1;
+}
+
+u64 sched_clock_cpu(int cpu)
+{
+	if (unlikely(!sched_clock_running))
+		return 0;
+
+	return sched_clock();
+}
+
+
+unsigned long long cpu_clock(int cpu)
+{
+	return sched_clock_cpu(cpu);
+}
+
+#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
+
 EXPORT_SYMBOL_GPL(cpu_clock);