sched: Add a comment to get_cpu_idle_time_us()
[safe/jmp/linux-2.6] / kernel / time / tick-sched.c
index 611fa4c..358822e 100644 (file)
@@ -134,18 +134,13 @@ __setup("nohz=", setup_tick_nohz);
  * value. We do this unconditionally on any cpu, as we don't know whether the
  * cpu, which has the update task assigned is in a long sleep.
  */
-void tick_nohz_update_jiffies(void)
+static void tick_nohz_update_jiffies(ktime_t now)
 {
        int cpu = smp_processor_id();
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
        unsigned long flags;
-       ktime_t now;
-
-       if (!ts->tick_stopped)
-               return;
 
-       cpu_clear(cpu, nohz_cpu_mask);
-       now = ktime_get();
+       cpumask_clear_cpu(cpu, nohz_cpu_mask);
        ts->idle_waketime = now;
 
        local_irq_save(flags);
@@ -155,20 +150,17 @@ void tick_nohz_update_jiffies(void)
        touch_softlockup_watchdog();
 }
 
-static void tick_nohz_stop_idle(int cpu)
+static void tick_nohz_stop_idle(int cpu, ktime_t now)
 {
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+       ktime_t delta;
 
-       if (ts->idle_active) {
-               ktime_t now, delta;
-               now = ktime_get();
-               delta = ktime_sub(now, ts->idle_entrytime);
-               ts->idle_lastupdate = now;
-               ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
-               ts->idle_active = 0;
+       delta = ktime_sub(now, ts->idle_entrytime);
+       ts->idle_lastupdate = now;
+       ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+       ts->idle_active = 0;
 
-               sched_clock_idle_wakeup_event(0);
-       }
+       sched_clock_idle_wakeup_event(0);
 }
 
 static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
@@ -187,6 +179,20 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
        return now;
 }
 
+/**
+ * get_cpu_idle_time_us - get the total idle time of a cpu
+ * @cpu: CPU number to query
+ * @last_update_time: variable to store update time in
+ *
+ * Return the cumulative idle time (since boot) for a given
+ * CPU, in microseconds. The idle time returned includes
+ * the iowait time (unlike what "top" and co report).
+ *
+ * This time is measured via accounting rather than sampling,
+ * and is as accurate as ktime_get() is.
+ *
+ * This function returns -1 if NOHZ is not enabled.
+ */
 u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
 {
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -216,12 +222,29 @@ void tick_nohz_stop_sched_tick(int inidle)
        struct tick_sched *ts;
        ktime_t last_update, expires, now;
        struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
+       u64 time_delta;
        int cpu;
 
        local_irq_save(flags);
 
        cpu = smp_processor_id();
        ts = &per_cpu(tick_cpu_sched, cpu);
+
+       /*
+        * Call to tick_nohz_start_idle stops the last_update_time from being
+        * updated. Thus, it must not be called in the event we are called from
+        * irq_exit() with the prior state different than idle.
+        */
+       if (!inidle && !ts->inidle)
+               goto end;
+
+       /*
+        * Set ts->inidle unconditionally. Even if the system did not
+        * switch to NOHZ mode the cpu frequency governors rely on the
+        * update of the idle time accounting in tick_nohz_start_idle().
+        */
+       ts->inidle = 1;
+
        now = tick_nohz_start_idle(ts);
 
        /*
@@ -239,11 +262,6 @@ void tick_nohz_stop_sched_tick(int inidle)
        if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
                goto end;
 
-       if (!inidle && !ts->inidle)
-               goto end;
-
-       ts->inidle = 1;
-
        if (need_resched())
                goto end;
 
@@ -252,26 +270,33 @@ void tick_nohz_stop_sched_tick(int inidle)
 
                if (ratelimit < 10) {
                        printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
-                              local_softirq_pending());
+                              (unsigned int) local_softirq_pending());
                        ratelimit++;
                }
                goto end;
        }
 
+       if (nohz_ratelimit(cpu))
+               goto end;
+
        ts->idle_calls++;
        /* Read jiffies and the time when jiffies were updated last */
        do {
                seq = read_seqbegin(&xtime_lock);
                last_update = last_jiffies_update;
                last_jiffies = jiffies;
+               time_delta = timekeeping_max_deferment();
        } while (read_seqretry(&xtime_lock, seq));
 
-       /* Get the next timer wheel timer */
-       next_jiffies = get_next_timer_interrupt(last_jiffies);
-       delta_jiffies = next_jiffies - last_jiffies;
-
-       if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu))
+       if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
+           arch_needs_cpu(cpu)) {
+               next_jiffies = last_jiffies + 1;
                delta_jiffies = 1;
+       } else {
+               /* Get the next timer wheel timer */
+               next_jiffies = get_next_timer_interrupt(last_jiffies);
+               delta_jiffies = next_jiffies - last_jiffies;
+       }
        /*
         * Do not stop the tick, if we are only one off
         * or if the cpu is required for rcu
@@ -283,25 +308,54 @@ void tick_nohz_stop_sched_tick(int inidle)
        if ((long)delta_jiffies >= 1) {
 
                /*
-               * calculate the expiry time for the next timer wheel
-               * timer
-               */
-               expires = ktime_add_ns(last_update, tick_period.tv64 *
-                                  delta_jiffies);
-
-               /*
                 * If this cpu is the one which updates jiffies, then
                 * give up the assignment and let it be taken by the
                 * cpu which runs the tick timer next, which might be
                 * this cpu as well. If we don't drop this here the
                 * jiffies might be stale and do_timer() never
-                * invoked.
+                * invoked. Keep track of the fact that it was the one
+                * which had the do_timer() duty last. If this cpu is
+                * the one which had the do_timer() duty last, we
+                * limit the sleep time to the timekeeping
+                * max_deferment value which we retrieved
+                * above. Otherwise we can sleep as long as we want.
                 */
-               if (cpu == tick_do_timer_cpu)
+               if (cpu == tick_do_timer_cpu) {
                        tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+                       ts->do_timer_last = 1;
+               } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
+                       time_delta = KTIME_MAX;
+                       ts->do_timer_last = 0;
+               } else if (!ts->do_timer_last) {
+                       time_delta = KTIME_MAX;
+               }
+
+               /*
+                * calculate the expiry time for the next timer wheel
+                * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
+                * that there is no timer pending or at least extremely
+                * far into the future (12 days for HZ=1000). In this
+                * case we set the expiry to the end of time.
+                */
+               if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
+                       /*
+                        * Calculate the time delta for the next timer event.
+                        * If the time delta exceeds the maximum time delta
+                        * permitted by the current clocksource then adjust
+                        * the time delta accordingly to ensure the
+                        * clocksource does not wrap.
+                        */
+                       time_delta = min_t(u64, time_delta,
+                                          tick_period.tv64 * delta_jiffies);
+               }
+
+               if (time_delta < KTIME_MAX)
+                       expires = ktime_add_ns(last_update, time_delta);
+               else
+                       expires.tv64 = KTIME_MAX;
 
                if (delta_jiffies > 1)
-                       cpu_set(cpu, nohz_cpu_mask);
+                       cpumask_set_cpu(cpu, nohz_cpu_mask);
 
                /* Skip reprogram of event if its not changed */
                if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
@@ -319,7 +373,7 @@ void tick_nohz_stop_sched_tick(int inidle)
                                /*
                                 * sched tick not stopped!
                                 */
-                               cpu_clear(cpu, nohz_cpu_mask);
+                               cpumask_clear_cpu(cpu, nohz_cpu_mask);
                                goto out;
                        }
 
@@ -331,25 +385,22 @@ void tick_nohz_stop_sched_tick(int inidle)
 
                ts->idle_sleeps++;
 
+               /* Mark expires */
+               ts->idle_expires = expires;
+
                /*
-                * delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that
-                * there is no timer pending or at least extremly far
-                * into the future (12 days for HZ=1000). In this case
-                * we simply stop the tick timer:
+                * If the expiration time == KTIME_MAX, then
+                * in this case we simply stop the tick timer.
                 */
-               if (unlikely(delta_jiffies >= NEXT_TIMER_MAX_DELTA)) {
-                       ts->idle_expires.tv64 = KTIME_MAX;
+                if (unlikely(expires.tv64 == KTIME_MAX)) {
                        if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
                                hrtimer_cancel(&ts->sched_timer);
                        goto out;
                }
 
-               /* Mark expiries */
-               ts->idle_expires = expires;
-
                if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
                        hrtimer_start(&ts->sched_timer, expires,
-                                     HRTIMER_MODE_ABS);
+                                     HRTIMER_MODE_ABS_PINNED);
                        /* Check, if the timer was already in the past */
                        if (hrtimer_active(&ts->sched_timer))
                                goto out;
@@ -361,7 +412,7 @@ void tick_nohz_stop_sched_tick(int inidle)
                 * softirq.
                 */
                tick_do_update_jiffies64(ktime_get());
-               cpu_clear(cpu, nohz_cpu_mask);
+               cpumask_clear_cpu(cpu, nohz_cpu_mask);
        }
        raise_softirq_irqoff(TIMER_SOFTIRQ);
 out:
@@ -395,7 +446,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
 
                if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
                        hrtimer_start_expires(&ts->sched_timer,
-                                     HRTIMER_MODE_ABS);
+                                             HRTIMER_MODE_ABS_PINNED);
                        /* Check, if the timer was already in the past */
                        if (hrtimer_active(&ts->sched_timer))
                                break;
@@ -425,7 +476,11 @@ void tick_nohz_restart_sched_tick(void)
        ktime_t now;
 
        local_irq_disable();
-       tick_nohz_stop_idle(cpu);
+       if (ts->idle_active || (ts->inidle && ts->tick_stopped))
+               now = ktime_get();
+
+       if (ts->idle_active)
+               tick_nohz_stop_idle(cpu, now);
 
        if (!ts->inidle || !ts->tick_stopped) {
                ts->inidle = 0;
@@ -439,9 +494,8 @@ void tick_nohz_restart_sched_tick(void)
 
        /* Update jiffies first */
        select_nohz_load_balancer(0);
-       now = ktime_get();
        tick_do_update_jiffies64(now);
-       cpu_clear(cpu, nohz_cpu_mask);
+       cpumask_clear_cpu(cpu, nohz_cpu_mask);
 
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
        /*
@@ -573,22 +627,18 @@ static void tick_nohz_switch_to_nohz(void)
  * timer and do not touch the other magic bits which need to be done
  * when idle is left.
  */
-static void tick_nohz_kick_tick(int cpu)
+static void tick_nohz_kick_tick(int cpu, ktime_t now)
 {
 #if 0
        /* Switch back to 2.6.27 behaviour */
 
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-       ktime_t delta, now;
-
-       if (!ts->tick_stopped)
-               return;
+       ktime_t delta;
 
        /*
         * Do not touch the tick device, when the next expiry is either
         * already reached or less/equal than the tick period.
         */
-       now = ktime_get();
        delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
        if (delta.tv64 <= tick_period.tv64)
                return;
@@ -597,9 +647,26 @@ static void tick_nohz_kick_tick(int cpu)
 #endif
 }
 
+static inline void tick_check_nohz(int cpu)
+{
+       struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+       ktime_t now;
+
+       if (!ts->idle_active && !ts->tick_stopped)
+               return;
+       now = ktime_get();
+       if (ts->idle_active)
+               tick_nohz_stop_idle(cpu, now);
+       if (ts->tick_stopped) {
+               tick_nohz_update_jiffies(now);
+               tick_nohz_kick_tick(cpu, now);
+       }
+}
+
 #else
 
 static inline void tick_nohz_switch_to_nohz(void) { }
+static inline void tick_check_nohz(int cpu) { }
 
 #endif /* NO_HZ */
 
@@ -609,11 +676,7 @@ static inline void tick_nohz_switch_to_nohz(void) { }
 void tick_check_idle(int cpu)
 {
        tick_check_oneshot_broadcast(cpu);
-#ifdef CONFIG_NO_HZ
-       tick_nohz_stop_idle(cpu);
-       tick_nohz_update_jiffies();
-       tick_nohz_kick_tick(cpu);
-#endif
+       tick_check_nohz(cpu);
 }
 
 /*
@@ -698,7 +761,8 @@ void tick_setup_sched_timer(void)
 
        for (;;) {
                hrtimer_forward(&ts->sched_timer, now, tick_period);
-               hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS);
+               hrtimer_start_expires(&ts->sched_timer,
+                                     HRTIMER_MODE_ABS_PINNED);
                /* Check, if the timer was already in the past */
                if (hrtimer_active(&ts->sched_timer))
                        break;