X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=kernel%2Ftimer.c;h=1a69705c2fb95536dc67c82479fcff315c0e0e1e;hb=f0ede66fca23cfee4ee80b48298007d930f49bbe;hp=78d3fa10fcd6136f70540a82bcdbcb8c35bcde69;hpb=b0ee75561beadc4db4d9a899c8ef4a7db50aa0ab;p=safe%2Fjmp%2Flinux-2.6 diff --git a/kernel/timer.c b/kernel/timer.c index 78d3fa1..1a69705 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1,7 +1,7 @@ /* * linux/kernel/timer.c * - * Kernel internal timers, kernel timekeeping, basic process system calls + * Kernel internal timers, basic process system calls * * Copyright (C) 1991, 1992 Linus Torvalds * @@ -34,6 +34,8 @@ #include #include #include +#include +#include #include #include @@ -72,7 +74,7 @@ struct tvec_t_base_s { tvec_t tv3; tvec_t tv4; tvec_t tv5; -} ____cacheline_aligned_in_smp; +} ____cacheline_aligned; typedef struct tvec_t_base_s tvec_base_t; @@ -80,6 +82,169 @@ tvec_base_t boot_tvec_bases; EXPORT_SYMBOL(boot_tvec_bases); static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; +/* + * Note that all tvec_bases is 2 byte aligned and lower bit of + * base in timer_list is guaranteed to be zero. Use the LSB for + * the new flag to indicate whether the timer is deferrable + */ +#define TBASE_DEFERRABLE_FLAG (0x1) + +/* Functions below help us manage 'deferrable' flag */ +static inline unsigned int tbase_get_deferrable(tvec_base_t *base) +{ + return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); +} + +static inline tvec_base_t *tbase_get_base(tvec_base_t *base) +{ + return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); +} + +static inline void timer_set_deferrable(struct timer_list *timer) +{ + timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | + TBASE_DEFERRABLE_FLAG)); +} + +static inline void +timer_set_base(struct timer_list *timer, tvec_base_t *new_base) +{ + timer->base = (tvec_base_t *)((unsigned long)(new_base) | + tbase_get_deferrable(timer->base)); +} + +/** + * __round_jiffies - function to round jiffies to a full second + * @j: the time in (absolute) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * __round_jiffies() rounds an absolute time in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The exact rounding is skewed for each processor to avoid all + * processors firing at the exact same time, which could lead + * to lock contention or spurious cache line bouncing. + * + * The return value is the rounded version of the @j parameter. + */ +unsigned long __round_jiffies(unsigned long j, int cpu) +{ + int rem; + unsigned long original = j; + + /* + * We don't want all cpus firing their timers at once hitting the + * same lock or cachelines, so we skew each extra cpu with an extra + * 3 jiffies. This 3 jiffies came originally from the mm/ code which + * already did this. + * The skew is done by adding 3*cpunr, then round, then subtract this + * extra offset again. + */ + j += cpu * 3; + + rem = j % HZ; + + /* + * If the target jiffie is just after a whole second (which can happen + * due to delays of the timer irq, long irq off times etc etc) then + * we should round down to the whole second, not up. Use 1/4th second + * as cutoff for this rounding as an extreme upper bound for this. + */ + if (rem < HZ/4) /* round down */ + j = j - rem; + else /* round up */ + j = j - rem + HZ; + + /* now that we have rounded, subtract the extra skew again */ + j -= cpu * 3; + + if (j <= jiffies) /* rounding ate our timeout entirely; */ + return original; + return j; +} +EXPORT_SYMBOL_GPL(__round_jiffies); + +/** + * __round_jiffies_relative - function to round jiffies to a full second + * @j: the time in (relative) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * __round_jiffies_relative() rounds a time delta in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The exact rounding is skewed for each processor to avoid all + * processors firing at the exact same time, which could lead + * to lock contention or spurious cache line bouncing. + * + * The return value is the rounded version of the @j parameter. + */ +unsigned long __round_jiffies_relative(unsigned long j, int cpu) +{ + /* + * In theory the following code can skip a jiffy in case jiffies + * increments right between the addition and the later subtraction. + * However since the entire point of this function is to use approximate + * timeouts, it's entirely ok to not handle that. + */ + return __round_jiffies(j + jiffies, cpu) - jiffies; +} +EXPORT_SYMBOL_GPL(__round_jiffies_relative); + +/** + * round_jiffies - function to round jiffies to a full second + * @j: the time in (absolute) jiffies that should be rounded + * + * round_jiffies() rounds an absolute time in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The return value is the rounded version of the @j parameter. + */ +unsigned long round_jiffies(unsigned long j) +{ + return __round_jiffies(j, raw_smp_processor_id()); +} +EXPORT_SYMBOL_GPL(round_jiffies); + +/** + * round_jiffies_relative - function to round jiffies to a full second + * @j: the time in (relative) jiffies that should be rounded + * + * round_jiffies_relative() rounds a time delta in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The return value is the rounded version of the @j parameter. + */ +unsigned long round_jiffies_relative(unsigned long j) +{ + return __round_jiffies_relative(j, raw_smp_processor_id()); +} +EXPORT_SYMBOL_GPL(round_jiffies_relative); + + static inline void set_running_timer(tvec_base_t *base, struct timer_list *timer) { @@ -130,6 +295,18 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) list_add_tail(&timer->entry, vec); } +#ifdef CONFIG_TIMER_STATS +void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) +{ + if (timer->start_site) + return; + + timer->start_site = addr; + memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); + timer->start_pid = current->pid; +} +#endif + /** * init_timer - initialize a timer. * @timer: the timer to be initialized @@ -141,11 +318,23 @@ void fastcall init_timer(struct timer_list *timer) { timer->entry.next = NULL; timer->base = __raw_get_cpu_var(tvec_bases); +#ifdef CONFIG_TIMER_STATS + timer->start_site = NULL; + timer->start_pid = -1; + memset(timer->start_comm, 0, TASK_COMM_LEN); +#endif } EXPORT_SYMBOL(init_timer); +void fastcall init_timer_deferrable(struct timer_list *timer) +{ + init_timer(timer); + timer_set_deferrable(timer); +} +EXPORT_SYMBOL(init_timer_deferrable); + static inline void detach_timer(struct timer_list *timer, - int clear_pending) + int clear_pending) { struct list_head *entry = &timer->entry; @@ -174,10 +363,11 @@ static tvec_base_t *lock_timer_base(struct timer_list *timer, tvec_base_t *base; for (;;) { - base = timer->base; + tvec_base_t *prelock_base = timer->base; + base = tbase_get_base(prelock_base); if (likely(base != NULL)) { spin_lock_irqsave(&base->lock, *flags); - if (likely(base == timer->base)) + if (likely(prelock_base == timer->base)) return base; /* The timer has migrated to another CPU */ spin_unlock_irqrestore(&base->lock, *flags); @@ -192,6 +382,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) unsigned long flags; int ret = 0; + timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); base = lock_timer_base(timer, &flags); @@ -213,11 +404,11 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) */ if (likely(base->running_timer != timer)) { /* See the comment in lock_timer_base() */ - timer->base = NULL; + timer_set_base(timer, NULL); spin_unlock(&base->lock); base = new_base; spin_lock(&base->lock); - timer->base = base; + timer_set_base(timer, base); } } @@ -242,9 +433,10 @@ void add_timer_on(struct timer_list *timer, int cpu) tvec_base_t *base = per_cpu(tvec_bases, cpu); unsigned long flags; + timer_stats_timer_set_start_info(timer); BUG_ON(timer_pending(timer) || !timer->function); spin_lock_irqsave(&base->lock, flags); - timer->base = base; + timer_set_base(timer, base); internal_add_timer(base, timer); spin_unlock_irqrestore(&base->lock, flags); } @@ -255,7 +447,7 @@ void add_timer_on(struct timer_list *timer, int cpu) * @timer: the timer to be modified * @expires: new timeout in jiffies * - * mod_timer is a more efficient way to update the expire field of an + * mod_timer() is a more efficient way to update the expire field of an * active timer (if the timer is inactive it will be activated) * * mod_timer(timer, expires) is equivalent to: @@ -274,6 +466,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires) { BUG_ON(!timer->function); + timer_stats_timer_set_start_info(timer); /* * This is a common optimization triggered by the * networking code - if the timer is re-modified @@ -304,6 +497,7 @@ int del_timer(struct timer_list *timer) unsigned long flags; int ret = 0; + timer_stats_timer_clear_start_info(timer); if (timer_pending(timer)) { base = lock_timer_base(timer, &flags); if (timer_pending(timer)) { @@ -350,6 +544,8 @@ out: return ret; } +EXPORT_SYMBOL(try_to_del_timer_sync); + /** * del_timer_sync - deactivate a timer and wait for the handler to finish. * @timer: the timer to be deactivated @@ -358,7 +554,7 @@ out: * the timer it also makes sure the handler has finished executing on other * CPUs. * - * Synchronization rules: callers must prevent restarting of the timer, + * Synchronization rules: Callers must prevent restarting of the timer, * otherwise this function is meaningless. It must not be called from * interrupt contexts. The caller must not hold locks which would prevent * completion of the timer's handler. The timer's handler must not call @@ -393,7 +589,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index) * don't have to detach them individually. */ list_for_each_entry_safe(timer, tmp, &tv_list, entry) { - BUG_ON(timer->base != base); + BUG_ON(tbase_get_base(timer->base) != base); internal_add_timer(base, timer); } @@ -433,10 +629,12 @@ static inline void __run_timers(tvec_base_t *base) void (*fn)(unsigned long); unsigned long data; - timer = list_entry(head->next,struct timer_list,entry); + timer = list_first_entry(head, struct timer_list,entry); fn = timer->function; data = timer->data; + timer_stats_account_timer(timer); + set_running_timer(base, timer); detach_timer(timer, 1); spin_unlock_irq(&base->lock); @@ -459,506 +657,150 @@ static inline void __run_timers(tvec_base_t *base) spin_unlock_irq(&base->lock); } -#ifdef CONFIG_NO_IDLE_HZ +#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ) /* * Find out when the next timer event is due to happen. This * is used on S/390 to stop all activity when a cpus is idle. * This functions needs to be called disabled. */ -unsigned long next_timer_interrupt(void) +static unsigned long __next_timer_interrupt(tvec_base_t *base) { - tvec_base_t *base; - struct list_head *list; + unsigned long timer_jiffies = base->timer_jiffies; + unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; + int index, slot, array, found = 0; struct timer_list *nte; - unsigned long expires; - unsigned long hr_expires = MAX_JIFFY_OFFSET; - ktime_t hr_delta; tvec_t *varray[4]; - int i, j; - - hr_delta = hrtimer_get_next_event(); - if (hr_delta.tv64 != KTIME_MAX) { - struct timespec tsdelta; - tsdelta = ktime_to_timespec(hr_delta); - hr_expires = timespec_to_jiffies(&tsdelta); - if (hr_expires < 3) - return hr_expires + jiffies; - } - hr_expires += jiffies; - - base = __get_cpu_var(tvec_bases); - spin_lock(&base->lock); - expires = base->timer_jiffies + (LONG_MAX >> 1); - list = NULL; /* Look for timer events in tv1. */ - j = base->timer_jiffies & TVR_MASK; + index = slot = timer_jiffies & TVR_MASK; do { - list_for_each_entry(nte, base->tv1.vec + j, entry) { + list_for_each_entry(nte, base->tv1.vec + slot, entry) { + if (tbase_get_deferrable(nte->base)) + continue; + + found = 1; expires = nte->expires; - if (j < (base->timer_jiffies & TVR_MASK)) - list = base->tv2.vec + (INDEX(0)); - goto found; + /* Look at the cascade bucket(s)? */ + if (!index || slot < index) + goto cascade; + return expires; } - j = (j + 1) & TVR_MASK; - } while (j != (base->timer_jiffies & TVR_MASK)); + slot = (slot + 1) & TVR_MASK; + } while (slot != index); + +cascade: + /* Calculate the next cascade event */ + if (index) + timer_jiffies += TVR_SIZE - index; + timer_jiffies >>= TVR_BITS; /* Check tv2-tv5. */ varray[0] = &base->tv2; varray[1] = &base->tv3; varray[2] = &base->tv4; varray[3] = &base->tv5; - for (i = 0; i < 4; i++) { - j = INDEX(i); + + for (array = 0; array < 4; array++) { + tvec_t *varp = varray[array]; + + index = slot = timer_jiffies & TVN_MASK; do { - if (list_empty(varray[i]->vec + j)) { - j = (j + 1) & TVN_MASK; - continue; - } - list_for_each_entry(nte, varray[i]->vec + j, entry) + list_for_each_entry(nte, varp->vec + slot, entry) { + found = 1; if (time_before(nte->expires, expires)) expires = nte->expires; - if (j < (INDEX(i)) && i < 3) - list = varray[i + 1]->vec + (INDEX(i + 1)); - goto found; - } while (j != (INDEX(i))); - } -found: - if (list) { - /* - * The search wrapped. We need to look at the next list - * from next tv element that would cascade into tv element - * where we found the timer element. - */ - list_for_each_entry(nte, list, entry) { - if (time_before(nte->expires, expires)) - expires = nte->expires; - } - } - spin_unlock(&base->lock); - - /* - * It can happen that other CPUs service timer IRQs and increment - * jiffies, but we have not yet got a local timer tick to process - * the timer wheels. In that case, the expiry time can be before - * jiffies, but since the high-resolution timer here is relative to - * jiffies, the default expression when high-resolution timers are - * not active, - * - * time_before(MAX_JIFFY_OFFSET + jiffies, expires) - * - * would falsely evaluate to true. If that is the case, just - * return jiffies so that we can immediately fire the local timer - */ - if (time_before(expires, jiffies)) - return jiffies; - - if (time_before(hr_expires, expires)) - return hr_expires; - - return expires; -} -#endif - -/******************************************************************/ - -/* - * The current time - * wall_to_monotonic is what we need to add to xtime (or xtime corrected - * for sub jiffie times) to get to monotonic time. Monotonic is pegged - * at zero at system boot time, so wall_to_monotonic will be negative, - * however, we will ALWAYS keep the tv_nsec part positive so we can use - * the usual normalization. - */ -struct timespec xtime __attribute__ ((aligned (16))); -struct timespec wall_to_monotonic __attribute__ ((aligned (16))); - -EXPORT_SYMBOL(xtime); - - -/* XXX - all of this timekeeping code should be later moved to time.c */ -#include -static struct clocksource *clock; /* pointer to current clocksource */ - -#ifdef CONFIG_GENERIC_TIME -/** - * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook - * - * private function, must hold xtime_lock lock when being - * called. Returns the number of nanoseconds since the - * last call to update_wall_time() (adjusted by NTP scaling) - */ -static inline s64 __get_nsec_offset(void) -{ - cycle_t cycle_now, cycle_delta; - s64 ns_offset; - - /* read clocksource: */ - cycle_now = clocksource_read(clock); - - /* calculate the delta since the last update_wall_time: */ - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - - /* convert to nanoseconds: */ - ns_offset = cyc2ns(clock, cycle_delta); - - return ns_offset; -} - -/** - * __get_realtime_clock_ts - Returns the time of day in a timespec - * @ts: pointer to the timespec to be set - * - * Returns the time of day in a timespec. Used by - * do_gettimeofday() and get_realtime_clock_ts(). - */ -static inline void __get_realtime_clock_ts(struct timespec *ts) -{ - unsigned long seq; - s64 nsecs; - - do { - seq = read_seqbegin(&xtime_lock); - - *ts = xtime; - nsecs = __get_nsec_offset(); - - } while (read_seqretry(&xtime_lock, seq)); - - timespec_add_ns(ts, nsecs); -} - -/** - * getnstimeofday - Returns the time of day in a timespec - * @ts: pointer to the timespec to be set - * - * Returns the time of day in a timespec. - */ -void getnstimeofday(struct timespec *ts) -{ - __get_realtime_clock_ts(ts); -} - -EXPORT_SYMBOL(getnstimeofday); - -/** - * do_gettimeofday - Returns the time of day in a timeval - * @tv: pointer to the timeval to be set - * - * NOTE: Users should be converted to using get_realtime_clock_ts() - */ -void do_gettimeofday(struct timeval *tv) -{ - struct timespec now; - - __get_realtime_clock_ts(&now); - tv->tv_sec = now.tv_sec; - tv->tv_usec = now.tv_nsec/1000; -} - -EXPORT_SYMBOL(do_gettimeofday); -/** - * do_settimeofday - Sets the time of day - * @tv: pointer to the timespec variable containing the new time - * - * Sets the time of day to the new time and update NTP and notify hrtimers - */ -int do_settimeofday(struct timespec *tv) -{ - unsigned long flags; - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irqsave(&xtime_lock, flags); - - nsec -= __get_nsec_offset(); - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - clock->error = 0; - ntp_clear(); - - write_sequnlock_irqrestore(&xtime_lock, flags); - - /* signal hrtimers about time change */ - clock_was_set(); - - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); + } + /* + * Do we still search for the first timer or are + * we looking up the cascade buckets ? + */ + if (found) { + /* Look at the cascade bucket(s)? */ + if (!index || slot < index) + break; + return expires; + } + slot = (slot + 1) & TVN_MASK; + } while (slot != index); -/** - * change_clocksource - Swaps clocksources if a new one is available - * - * Accumulates current time interval and initializes new clocksource - */ -static int change_clocksource(void) -{ - struct clocksource *new; - cycle_t now; - u64 nsec; - new = clocksource_get_next(); - if (clock != new) { - now = clocksource_read(new); - nsec = __get_nsec_offset(); - timespec_add_ns(&xtime, nsec); - - clock = new; - clock->cycle_last = now; - printk(KERN_INFO "Time: %s clocksource has been installed.\n", - clock->name); - return 1; - } else if (clock->update_callback) { - return clock->update_callback(); + if (index) + timer_jiffies += TVN_SIZE - index; + timer_jiffies >>= TVN_BITS; } - return 0; -} -#else -#define change_clocksource() (0) -#endif - -/** - * timeofday_is_continuous - check to see if timekeeping is free running - */ -int timekeeping_is_continuous(void) -{ - unsigned long seq; - int ret; - - do { - seq = read_seqbegin(&xtime_lock); - - ret = clock->is_continuous; - - } while (read_seqretry(&xtime_lock, seq)); - - return ret; + return expires; } /* - * timekeeping_init - Initializes the clocksource and common timekeeping values - */ -void __init timekeeping_init(void) -{ - unsigned long flags; - - write_seqlock_irqsave(&xtime_lock, flags); - - ntp_clear(); - - clock = clocksource_get_next(); - clocksource_calculate_interval(clock, tick_nsec); - clock->cycle_last = clocksource_read(clock); - - write_sequnlock_irqrestore(&xtime_lock, flags); -} - - -static int timekeeping_suspended; -/** - * timekeeping_resume - Resumes the generic timekeeping subsystem. - * @dev: unused - * - * This is for the generic clocksource timekeeping. - * xtime/wall_to_monotonic/jiffies/wall_jiffies/etc are - * still managed by arch specific suspend/resume code. + * Check, if the next hrtimer event is before the next timer wheel + * event: */ -static int timekeeping_resume(struct sys_device *dev) -{ - unsigned long flags; - - write_seqlock_irqsave(&xtime_lock, flags); - /* restart the last cycle value */ - clock->cycle_last = clocksource_read(clock); - clock->error = 0; - timekeeping_suspended = 0; - write_sequnlock_irqrestore(&xtime_lock, flags); - return 0; -} - -static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) +static unsigned long cmp_next_hrtimer_event(unsigned long now, + unsigned long expires) { - unsigned long flags; - - write_seqlock_irqsave(&xtime_lock, flags); - timekeeping_suspended = 1; - write_sequnlock_irqrestore(&xtime_lock, flags); - return 0; -} - -/* sysfs resume/suspend bits for timekeeping */ -static struct sysdev_class timekeeping_sysclass = { - .resume = timekeeping_resume, - .suspend = timekeeping_suspend, - set_kset_name("timekeeping"), -}; + ktime_t hr_delta = hrtimer_get_next_event(); + struct timespec tsdelta; + unsigned long delta; -static struct sys_device device_timer = { - .id = 0, - .cls = &timekeeping_sysclass, -}; - -static int __init timekeeping_init_device(void) -{ - int error = sysdev_class_register(&timekeeping_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; -} + if (hr_delta.tv64 == KTIME_MAX) + return expires; -device_initcall(timekeeping_init_device); + /* + * Expired timer available, let it expire in the next tick + */ + if (hr_delta.tv64 <= 0) + return now + 1; -/* - * If the error is already larger, we look ahead even further - * to compensate for late or lost adjustments. - */ -static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset) -{ - s64 tick_error, i; - u32 look_ahead, adj; - s32 error2, mult; + tsdelta = ktime_to_timespec(hr_delta); + delta = timespec_to_jiffies(&tsdelta); /* - * Use the current error value to determine how much to look ahead. - * The larger the error the slower we adjust for it to avoid problems - * with losing too many ticks, otherwise we would overadjust and - * produce an even larger error. The smaller the adjustment the - * faster we try to adjust for it, as lost ticks can do less harm - * here. This is tuned so that an error of about 1 msec is adusted - * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). + * Limit the delta to the max value, which is checked in + * tick_nohz_stop_sched_tick(): */ - error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ); - error2 = abs(error2); - for (look_ahead = 0; error2 > 0; look_ahead++) - error2 >>= 2; + if (delta > NEXT_TIMER_MAX_DELTA) + delta = NEXT_TIMER_MAX_DELTA; /* - * Now calculate the error in (1 << look_ahead) ticks, but first - * remove the single look ahead already included in the error. + * Take rounding errors in to account and make sure, that it + * expires in the next tick. Otherwise we go into an endless + * ping pong due to tick_nohz_stop_sched_tick() retriggering + * the timer softirq */ - tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); - tick_error -= clock->xtime_interval >> 1; - error = ((error - tick_error) >> look_ahead) + tick_error; - - /* Finally calculate the adjustment shift value. */ - i = *interval; - mult = 1; - if (error < 0) { - error = -error; - *interval = -*interval; - *offset = -*offset; - mult = -1; - } - for (adj = 0; error > i; adj++) - error >>= 1; - - *interval <<= adj; - *offset <<= adj; - return mult << adj; -} - -/* - * Adjust the multiplier to reduce the error value, - * this is optimized for the most common adjustments of -1,0,1, - * for other values we can do a bit more work. - */ -static void clocksource_adjust(struct clocksource *clock, s64 offset) -{ - s64 error, interval = clock->cycle_interval; - int adj; - - error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); - if (error > interval) { - error >>= 2; - if (likely(error <= interval)) - adj = 1; - else - adj = clocksource_bigadjust(error, &interval, &offset); - } else if (error < -interval) { - error >>= 2; - if (likely(error >= -interval)) { - adj = -1; - interval = -interval; - offset = -offset; - } else - adj = clocksource_bigadjust(error, &interval, &offset); - } else - return; - - clock->mult += adj; - clock->xtime_interval += interval; - clock->xtime_nsec -= offset; - clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift); + if (delta < 1) + delta = 1; + now += delta; + if (time_before(now, expires)) + return now; + return expires; } /** - * update_wall_time - Uses the current clocksource to increment the wall time - * - * Called from the timer interrupt, must hold a write on xtime_lock. + * next_timer_interrupt - return the jiffy of the next pending timer + * @now: current time (in jiffies) */ -static void update_wall_time(void) +unsigned long get_next_timer_interrupt(unsigned long now) { - cycle_t offset; - - /* Make sure we're fully resumed: */ - if (unlikely(timekeeping_suspended)) - return; - -#ifdef CONFIG_GENERIC_TIME - offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; -#else - offset = clock->cycle_interval; -#endif - clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; - - /* normally this loop will run just once, however in the - * case of lost or late ticks, it will accumulate correctly. - */ - while (offset >= clock->cycle_interval) { - /* accumulate one interval */ - clock->xtime_nsec += clock->xtime_interval; - clock->cycle_last += clock->cycle_interval; - offset -= clock->cycle_interval; - - if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { - clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; - xtime.tv_sec++; - second_overflow(); - } - - /* interpolator bits */ - time_interpolator_update(clock->xtime_interval - >> clock->shift); - /* increment the NTP state machine */ - update_ntp_one_tick(); + tvec_base_t *base = __get_cpu_var(tvec_bases); + unsigned long expires; - /* accumulate error between NTP and clock interval */ - clock->error += current_tick_length(); - clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); - } + spin_lock(&base->lock); + expires = __next_timer_interrupt(base); + spin_unlock(&base->lock); - /* correct the clock when NTP error is too big */ - clocksource_adjust(clock, offset); + if (time_before_eq(expires, now)) + return now; - /* store full nanoseconds into xtime */ - xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; - clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; + return cmp_next_hrtimer_event(now, expires); +} - /* check to see if there is a new clocksource to use */ - if (change_clocksource()) { - clock->error = 0; - clock->xtime_nsec = 0; - clocksource_calculate_interval(clock, tick_nsec); - } +#ifdef CONFIG_NO_IDLE_HZ +unsigned long next_timer_interrupt(void) +{ + return get_next_timer_interrupt(jiffies); } +#endif + +#endif /* * Called from the timer interrupt handler to charge one tick to the current @@ -1010,27 +852,18 @@ static inline void calc_load(unsigned long ticks) unsigned long active_tasks; /* fixed-point */ static int count = LOAD_FREQ; - active_tasks = count_active_tasks(); - for (count -= ticks; count < 0; count += LOAD_FREQ) { - CALC_LOAD(avenrun[0], EXP_1, active_tasks); - CALC_LOAD(avenrun[1], EXP_5, active_tasks); - CALC_LOAD(avenrun[2], EXP_15, active_tasks); + count -= ticks; + if (unlikely(count < 0)) { + active_tasks = count_active_tasks(); + do { + CALC_LOAD(avenrun[0], EXP_1, active_tasks); + CALC_LOAD(avenrun[1], EXP_5, active_tasks); + CALC_LOAD(avenrun[2], EXP_15, active_tasks); + count += LOAD_FREQ; + } while (count < 0); } } -/* jiffies at the most recent update of wall time */ -unsigned long wall_jiffies = INITIAL_JIFFIES; - -/* - * This read-write spinlock protects us from races in SMP while - * playing with xtime and avenrun. - */ -#ifndef ARCH_HAVE_XTIME_LOCK -__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); - -EXPORT_SYMBOL(xtime_lock); -#endif - /* * This function runs timers and the timer-tq in bottom half context. */ @@ -1038,7 +871,8 @@ static void run_timer_softirq(struct softirq_action *h) { tvec_base_t *base = __get_cpu_var(tvec_bases); - hrtimer_run_queues(); + hrtimer_run_queues(); + if (time_after_eq(jiffies, base->timer_jiffies)) __run_timers(base); } @@ -1058,7 +892,6 @@ void run_local_timers(void) */ static inline void update_times(unsigned long ticks) { - wall_jiffies += ticks; update_wall_time(); calc_load(ticks); } @@ -1208,11 +1041,10 @@ fastcall signed long __sched schedule_timeout(signed long timeout) * should never happens anyway). You just have the printk() * that will tell you if something is gone wrong and where. */ - if (timeout < 0) - { + if (timeout < 0) { printk(KERN_ERR "schedule_timeout: wrong timeout " - "value %lx from %p\n", timeout, - __builtin_return_address(0)); + "value %lx\n", timeout); + dump_stack(); current->state = TASK_RUNNING; goto out; } @@ -1257,17 +1089,16 @@ asmlinkage long sys_gettid(void) } /** - * sys_sysinfo - fill in sysinfo struct + * do_sysinfo - fill in sysinfo struct * @info: pointer to buffer to fill */ -asmlinkage long sys_sysinfo(struct sysinfo __user *info) +int do_sysinfo(struct sysinfo *info) { - struct sysinfo val; unsigned long mem_total, sav_total; unsigned int mem_unit, bitcount; unsigned long seq; - memset((char *)&val, 0, sizeof(struct sysinfo)); + memset(info, 0, sizeof(struct sysinfo)); do { struct timespec tp; @@ -1287,17 +1118,17 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info) tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; tp.tv_sec++; } - val.uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); + info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); - val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); - val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); - val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); + info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); + info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); + info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); - val.procs = nr_threads; + info->procs = nr_threads; } while (read_seqretry(&xtime_lock, seq)); - si_meminfo(&val); - si_swapinfo(&val); + si_meminfo(info); + si_swapinfo(info); /* * If the sum of all the available memory (i.e. ram + swap) @@ -1308,11 +1139,11 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info) * -Erik Andersen */ - mem_total = val.totalram + val.totalswap; - if (mem_total < val.totalram || mem_total < val.totalswap) + mem_total = info->totalram + info->totalswap; + if (mem_total < info->totalram || mem_total < info->totalswap) goto out; bitcount = 0; - mem_unit = val.mem_unit; + mem_unit = info->mem_unit; while (mem_unit > 1) { bitcount++; mem_unit >>= 1; @@ -1324,22 +1155,31 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info) /* * If mem_total did not overflow, multiply all memory values by - * val.mem_unit and set it to 1. This leaves things compatible + * info->mem_unit and set it to 1. This leaves things compatible * with 2.2.x, and also retains compatibility with earlier 2.4.x * kernels... */ - val.mem_unit = 1; - val.totalram <<= bitcount; - val.freeram <<= bitcount; - val.sharedram <<= bitcount; - val.bufferram <<= bitcount; - val.totalswap <<= bitcount; - val.freeswap <<= bitcount; - val.totalhigh <<= bitcount; - val.freehigh <<= bitcount; + info->mem_unit = 1; + info->totalram <<= bitcount; + info->freeram <<= bitcount; + info->sharedram <<= bitcount; + info->bufferram <<= bitcount; + info->totalswap <<= bitcount; + info->freeswap <<= bitcount; + info->totalhigh <<= bitcount; + info->freehigh <<= bitcount; + +out: + return 0; +} + +asmlinkage long sys_sysinfo(struct sysinfo __user *info) +{ + struct sysinfo val; + + do_sysinfo(&val); - out: if (copy_to_user(info, &val, sizeof(struct sysinfo))) return -EFAULT; @@ -1370,6 +1210,13 @@ static int __devinit init_timers_cpu(int cpu) cpu_to_node(cpu)); if (!base) return -ENOMEM; + + /* Make sure that tvec_base is 2 byte aligned */ + if (tbase_get_deferrable(base)) { + WARN_ON(1); + kfree(base); + return -ENOMEM; + } memset(base, 0, sizeof(*base)); per_cpu(tvec_bases, cpu) = base; } else { @@ -1409,9 +1256,9 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) struct timer_list *timer; while (!list_empty(head)) { - timer = list_entry(head->next, struct timer_list, entry); + timer = list_first_entry(head, struct timer_list, entry); detach_timer(timer, 0); - timer->base = new_base; + timer_set_base(timer, new_base); internal_add_timer(new_base, timer); } } @@ -1427,8 +1274,8 @@ static void __devinit migrate_timers(int cpu) new_base = get_cpu_var(tvec_bases); local_irq_disable(); - spin_lock(&new_base->lock); - spin_lock(&old_base->lock); + double_spin_lock(&new_base->lock, &old_base->lock, + smp_processor_id() < cpu); BUG_ON(old_base->running_timer); @@ -1441,8 +1288,8 @@ static void __devinit migrate_timers(int cpu) migrate_timer_list(new_base, old_base->tv5.vec + i); } - spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); + double_spin_unlock(&new_base->lock, &old_base->lock, + smp_processor_id() < cpu); local_irq_enable(); put_cpu_var(tvec_bases); } @@ -1454,11 +1301,13 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self, long cpu = (long)hcpu; switch(action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: if (init_timers_cpu(cpu) < 0) return NOTIFY_BAD; break; #ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: + case CPU_DEAD_FROZEN: migrate_timers(cpu); break; #endif @@ -1478,6 +1327,8 @@ void __init init_timers(void) int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); + init_timer_stats(); + BUG_ON(err == NOTIFY_BAD); register_cpu_notifier(&timers_nb); open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); @@ -1489,7 +1340,7 @@ struct time_interpolator *time_interpolator __read_mostly; static struct time_interpolator *time_interpolator_list __read_mostly; static DEFINE_SPINLOCK(time_interpolator_lock); -static inline u64 time_interpolator_get_cycles(unsigned int src) +static inline cycles_t time_interpolator_get_cycles(unsigned int src) { unsigned long (*x)(void); @@ -1515,8 +1366,8 @@ static inline u64 time_interpolator_get_counter(int writelock) if (time_interpolator->jitter) { - u64 lcycle; - u64 now; + cycles_t lcycle; + cycles_t now; do { lcycle = time_interpolator->last_cycle;