diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 9f264c2..f1a38a6 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -32,7 +32,6 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#include <linux/config.h>
 #include
 #include
 #include
@@ -50,6 +49,9 @@
 #include
 #include
 #include
+#include
+#include
+#include
 #include
 #include
@@ -61,30 +63,74 @@
 #include
 #include
 #include
-#ifdef CONFIG_PPC64
-#include
+#include
+#include
 #include
-#endif
 #ifdef CONFIG_PPC_ISERIES
-#include <asm/iSeries/ItLpQueue.h>
-#include <asm/iSeries/HvCallXm.h>
+#include <asm/iseries/it_lp_queue.h>
+#include <asm/iseries/hv_call_xm.h>
 #endif
 
-u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
+/* powerpc clocksource/clockevent code */
+
+#include <linux/clockchips.h>
+#include <linux/clocksource.h>
+
+static cycle_t rtc_read(void);
+static struct clocksource clocksource_rtc = {
+	.name = "rtc",
+	.rating = 400,
+	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
+	.mask = CLOCKSOURCE_MASK(64),
+	.shift = 22,
+	.mult = 0,	/* To be filled in */
+	.read = rtc_read,
+};
+
+static cycle_t timebase_read(void);
+static struct clocksource clocksource_timebase = {
+	.name = "timebase",
+	.rating = 400,
+	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
+	.mask = CLOCKSOURCE_MASK(64),
+	.shift = 22,
+	.mult = 0,	/* To be filled in */
+	.read = timebase_read,
+};
 
-EXPORT_SYMBOL(jiffies_64);
+#define DECREMENTER_MAX 0x7fffffff
+
+static int decrementer_set_next_event(unsigned long evt,
+				      struct clock_event_device *dev);
+static void decrementer_set_mode(enum clock_event_mode mode,
+				 struct clock_event_device *dev);
+
+static struct clock_event_device decrementer_clockevent = {
+	.name = "decrementer",
+	.rating = 200,
+	.shift = 16,
+	.mult = 0,	/* To be filled in */
+	.irq = 0,
+	.set_next_event = decrementer_set_next_event,
+	.set_mode = decrementer_set_mode,
+	.features = CLOCK_EVT_FEAT_ONESHOT,
+};
+
+struct decrementer_clock {
+	struct clock_event_device event;
+	u64 next_tb;
+};
+
+static DEFINE_PER_CPU(struct decrementer_clock, decrementers);
 
-/* keep track of when we need to update the rtc */
-time_t last_rtc_update;
-extern int piranha_simulator;
 #ifdef CONFIG_PPC_ISERIES
-unsigned long iSeries_recal_titan = 0;
-unsigned long iSeries_recal_tb = 0;
-static unsigned long first_settimeofday = 1;
-#endif
+static unsigned long __initdata iSeries_recal_titan;
+static signed long __initdata iSeries_recal_tb;
 
-/* The decrementer counts down by 128 every 128ns on a 601. */
-#define DECREMENTER_COUNT_601	(1000000000 / HZ)
+/* Forward declaration is only needed for iSeries compiles */
+void __init clocksource_init(void);
+#endif
 
 #define XSEC_PER_SEC (1024*1024)
 
@@ -99,149 +145,276 @@ unsigned long tb_ticks_per_jiffy;
 unsigned long tb_ticks_per_usec = 100; /* sane default */
 EXPORT_SYMBOL(tb_ticks_per_usec);
 unsigned long tb_ticks_per_sec;
+EXPORT_SYMBOL(tb_ticks_per_sec);	/* for cputime_t conversions */
 u64 tb_to_xs;
 unsigned tb_to_us;
-unsigned long processor_freq;
+
+#define TICKLEN_SCALE	NTP_SCALE_SHIFT
+u64 last_tick_len;	/* units are ns / 2^TICKLEN_SCALE */
+u64 ticklen_to_xs;	/* 0.64 fraction */
+
+/* If last_tick_len corresponds to about 1/HZ seconds, then
+   last_tick_len << TICKLEN_SHIFT will be about 2^63.
*/ +#define TICKLEN_SHIFT (63 - 30 - TICKLEN_SCALE + SHIFT_HZ) + DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL_GPL(rtc_lock); -u64 tb_to_ns_scale; -unsigned tb_to_ns_shift; +static u64 tb_to_ns_scale __read_mostly; +static unsigned tb_to_ns_shift __read_mostly; +static unsigned long boot_tb __read_mostly; struct gettimeofday_struct do_gtod; -extern unsigned long wall_jiffies; - extern struct timezone sys_tz; static long timezone_offset; -void ppc_adjtimex(void); - -static unsigned adjusting_time = 0; - unsigned long ppc_proc_freq; +EXPORT_SYMBOL(ppc_proc_freq); unsigned long ppc_tb_freq; -#ifdef CONFIG_PPC32 /* XXX for now */ -#define boot_cpuid 0 -#endif - -u64 tb_last_jiffy __cacheline_aligned_in_smp; -unsigned long tb_last_stamp; +static u64 tb_last_jiffy __cacheline_aligned_in_smp; +static DEFINE_PER_CPU(u64, last_jiffy); +#ifdef CONFIG_VIRT_CPU_ACCOUNTING /* - * Note that on ppc32 this only stores the bottom 32 bits of - * the timebase value, but that's enough to tell when a jiffy - * has passed. + * Factors for converting from cputime_t (timebase ticks) to + * jiffies, milliseconds, seconds, and clock_t (1/USER_HZ seconds). + * These are all stored as 0.64 fixed-point binary fractions. */ -DEFINE_PER_CPU(unsigned long, last_jiffy); - -static __inline__ void timer_check_rtc(void) -{ - /* - * update the rtc when needed, this should be performed on the - * right fraction of a second. Half or full second ? - * Full second works on mk48t59 clocks, others need testing. - * Note that this update is basically only used through - * the adjtimex system calls. Setting the HW clock in - * any other way is a /dev/rtc and userland business. - * This is still wrong by -0.5/+1.5 jiffies because of the - * timer interrupt resolution and possible delay, but here we - * hit a quantization limit which can only be solved by higher - * resolution timers and decoupling time management from timer - * interrupts. This is also wrong on the clocks - * which require being written at the half second boundary. - * We should have an rtc call that only sets the minutes and - * seconds like on Intel to avoid problems with non UTC clocks. - */ - if (ppc_md.set_rtc_time && ntp_synced() && - xtime.tv_sec - last_rtc_update >= 659 && - abs((xtime.tv_nsec/1000) - (1000000-1000000/HZ)) < 500000/HZ && - jiffies - wall_jiffies == 1) { - struct rtc_time tm; - to_tm(xtime.tv_sec + 1 + timezone_offset, &tm); - tm.tm_year -= 1900; - tm.tm_mon -= 1; - if (ppc_md.set_rtc_time(&tm) == 0) - last_rtc_update = xtime.tv_sec + 1; - else - /* Try again one minute later */ - last_rtc_update += 60; - } +u64 __cputime_jiffies_factor; +EXPORT_SYMBOL(__cputime_jiffies_factor); +u64 __cputime_msec_factor; +EXPORT_SYMBOL(__cputime_msec_factor); +u64 __cputime_sec_factor; +EXPORT_SYMBOL(__cputime_sec_factor); +u64 __cputime_clockt_factor; +EXPORT_SYMBOL(__cputime_clockt_factor); +DEFINE_PER_CPU(unsigned long, cputime_last_delta); +DEFINE_PER_CPU(unsigned long, cputime_scaled_last_delta); + +static void calc_cputime_factors(void) +{ + struct div_result res; + + div128_by_32(HZ, 0, tb_ticks_per_sec, &res); + __cputime_jiffies_factor = res.result_low; + div128_by_32(1000, 0, tb_ticks_per_sec, &res); + __cputime_msec_factor = res.result_low; + div128_by_32(1, 0, tb_ticks_per_sec, &res); + __cputime_sec_factor = res.result_low; + div128_by_32(USER_HZ, 0, tb_ticks_per_sec, &res); + __cputime_clockt_factor = res.result_low; } /* - * This version of gettimeofday has microsecond resolution. + * Read the PURR on systems that have it, otherwise the timebase. 
*/ -static inline void __do_gettimeofday(struct timeval *tv, u64 tb_val) +static u64 read_purr(void) { - unsigned long sec, usec; - u64 tb_ticks, xsec; - struct gettimeofday_vars *temp_varp; - u64 temp_tb_to_xs, temp_stamp_xsec; + if (cpu_has_feature(CPU_FTR_PURR)) + return mfspr(SPRN_PURR); + return mftb(); +} +/* + * Read the SPURR on systems that have it, otherwise the purr + */ +static u64 read_spurr(u64 purr) +{ /* - * These calculations are faster (gets rid of divides) - * if done in units of 1/2^20 rather than microseconds. - * The conversion to microseconds at the end is done - * without a divide (and in fact, without a multiply) + * cpus without PURR won't have a SPURR + * We already know the former when we use this, so tell gcc */ - temp_varp = do_gtod.varp; - tb_ticks = tb_val - temp_varp->tb_orig_stamp; - temp_tb_to_xs = temp_varp->tb_to_xs; - temp_stamp_xsec = temp_varp->stamp_xsec; - xsec = temp_stamp_xsec + mulhdu(tb_ticks, temp_tb_to_xs); - sec = xsec / XSEC_PER_SEC; - usec = (unsigned long)xsec & (XSEC_PER_SEC - 1); - usec = SCALE_XSEC(usec, 1000000); + if (cpu_has_feature(CPU_FTR_PURR) && cpu_has_feature(CPU_FTR_SPURR)) + return mfspr(SPRN_SPURR); + return purr; +} + +/* + * Account time for a transition between system, hard irq + * or soft irq state. + */ +void account_system_vtime(struct task_struct *tsk) +{ + u64 now, nowscaled, delta, deltascaled, sys_time; + unsigned long flags; - tv->tv_sec = sec; - tv->tv_usec = usec; + local_irq_save(flags); + now = read_purr(); + nowscaled = read_spurr(now); + delta = now - get_paca()->startpurr; + deltascaled = nowscaled - get_paca()->startspurr; + get_paca()->startpurr = now; + get_paca()->startspurr = nowscaled; + if (!in_interrupt()) { + /* deltascaled includes both user and system time. + * Hence scale it based on the purr ratio to estimate + * the system time */ + sys_time = get_paca()->system_time; + if (get_paca()->user_time) + deltascaled = deltascaled * sys_time / + (sys_time + get_paca()->user_time); + delta += sys_time; + get_paca()->system_time = 0; + } + account_system_time(tsk, 0, delta); + account_system_time_scaled(tsk, deltascaled); + per_cpu(cputime_last_delta, smp_processor_id()) = delta; + per_cpu(cputime_scaled_last_delta, smp_processor_id()) = deltascaled; + local_irq_restore(flags); } -void do_gettimeofday(struct timeval *tv) +/* + * Transfer the user and system times accumulated in the paca + * by the exception entry and exit code to the generic process + * user and system time records. + * Must be called with interrupts disabled. + */ +void account_process_tick(struct task_struct *tsk, int user_tick) { - if (__USE_RTC()) { - /* do this the old way */ - unsigned long flags, seq; - unsigned int sec, nsec, usec, lost; + cputime_t utime, utimescaled; - do { - seq = read_seqbegin_irqsave(&xtime_lock, flags); - sec = xtime.tv_sec; - nsec = xtime.tv_nsec + tb_ticks_since(tb_last_stamp); - lost = jiffies - wall_jiffies; - } while (read_seqretry_irqrestore(&xtime_lock, seq, flags)); - usec = nsec / 1000 + lost * (1000000 / HZ); - while (usec >= 1000000) { - usec -= 1000000; - ++sec; - } - tv->tv_sec = sec; - tv->tv_usec = usec; + utime = get_paca()->user_time; + get_paca()->user_time = 0; + account_user_time(tsk, utime); + + utimescaled = cputime_to_scaled(utime); + account_user_time_scaled(tsk, utimescaled); +} + +/* + * Stuff for accounting stolen time. 
+ */ +struct cpu_purr_data { + int initialized; /* thread is running */ + u64 tb; /* last TB value read */ + u64 purr; /* last PURR value read */ + u64 spurr; /* last SPURR value read */ +}; + +/* + * Each entry in the cpu_purr_data array is manipulated only by its + * "owner" cpu -- usually in the timer interrupt but also occasionally + * in process context for cpu online. As long as cpus do not touch + * each others' cpu_purr_data, disabling local interrupts is + * sufficient to serialize accesses. + */ +static DEFINE_PER_CPU(struct cpu_purr_data, cpu_purr_data); + +static void snapshot_tb_and_purr(void *data) +{ + unsigned long flags; + struct cpu_purr_data *p = &__get_cpu_var(cpu_purr_data); + + local_irq_save(flags); + p->tb = get_tb_or_rtc(); + p->purr = mfspr(SPRN_PURR); + wmb(); + p->initialized = 1; + local_irq_restore(flags); +} + +/* + * Called during boot when all cpus have come up. + */ +void snapshot_timebases(void) +{ + if (!cpu_has_feature(CPU_FTR_PURR)) return; - } - __do_gettimeofday(tv, get_tb()); + on_each_cpu(snapshot_tb_and_purr, NULL, 1); +} + +/* + * Must be called with interrupts disabled. + */ +void calculate_steal_time(void) +{ + u64 tb, purr; + s64 stolen; + struct cpu_purr_data *pme; + + pme = &__get_cpu_var(cpu_purr_data); + if (!pme->initialized) + return; /* !CPU_FTR_PURR or early in early boot */ + tb = mftb(); + purr = mfspr(SPRN_PURR); + stolen = (tb - pme->tb) - (purr - pme->purr); + if (stolen > 0) + account_steal_time(current, stolen); + pme->tb = tb; + pme->purr = purr; } -EXPORT_SYMBOL(do_gettimeofday); +#ifdef CONFIG_PPC_SPLPAR +/* + * Must be called before the cpu is added to the online map when + * a cpu is being brought up at runtime. + */ +static void snapshot_purr(void) +{ + struct cpu_purr_data *pme; + unsigned long flags; -/* Synchronize xtime with do_gettimeofday */ + if (!cpu_has_feature(CPU_FTR_PURR)) + return; + local_irq_save(flags); + pme = &__get_cpu_var(cpu_purr_data); + pme->tb = mftb(); + pme->purr = mfspr(SPRN_PURR); + pme->initialized = 1; + local_irq_restore(flags); +} -static inline void timer_sync_xtime(unsigned long cur_tb) +#endif /* CONFIG_PPC_SPLPAR */ + +#else /* ! CONFIG_VIRT_CPU_ACCOUNTING */ +#define calc_cputime_factors() +#define calculate_steal_time() do { } while (0) +#endif + +#if !(defined(CONFIG_VIRT_CPU_ACCOUNTING) && defined(CONFIG_PPC_SPLPAR)) +#define snapshot_purr() do { } while (0) +#endif + +/* + * Called when a cpu comes up after the system has finished booting, + * i.e. as a result of a hotplug cpu action. + */ +void snapshot_timebase(void) { -#ifdef CONFIG_PPC64 - /* why do we do this? 
*/ - struct timeval my_tv; + __get_cpu_var(last_jiffy) = get_tb_or_rtc(); + snapshot_purr(); +} - __do_gettimeofday(&my_tv, cur_tb); +void __delay(unsigned long loops) +{ + unsigned long start; + int diff; - if (xtime.tv_sec <= my_tv.tv_sec) { - xtime.tv_sec = my_tv.tv_sec; - xtime.tv_nsec = my_tv.tv_usec * 1000; + if (__USE_RTC()) { + start = get_rtcl(); + do { + /* the RTCL register wraps at 1000000000 */ + diff = get_rtcl() - start; + if (diff < 0) + diff += 1000000000; + } while (diff < loops); + } else { + start = get_tbl(); + while (get_tbl() - start < loops) + HMT_low(); + HMT_medium(); } -#endif } +EXPORT_SYMBOL(__delay); + +void udelay(unsigned long usecs) +{ + __delay(tb_ticks_per_usec * usecs); +} +EXPORT_SYMBOL(udelay); + /* * There are two copies of tb_to_xs and stamp_xsec so that no @@ -267,7 +440,6 @@ static inline void update_gtod(u64 new_tb_stamp, u64 new_stamp_xsec, do_gtod.varp = temp_varp; do_gtod.var_idx = temp_idx; -#ifdef CONFIG_PPC64 /* * tb_update_count is used to allow the userspace gettimeofday code * to assure itself that it sees a consistent view of the tb_to_xs and @@ -276,40 +448,16 @@ static inline void update_gtod(u64 new_tb_stamp, u64 new_stamp_xsec, * the two values of tb_update_count match and are even then the * tb_to_xs and stamp_xsec values are consistent. If not, then it * loops back and reads them again until this criteria is met. + * We expect the caller to have done the first increment of + * vdso_data->tb_update_count already. */ - ++(systemcfg->tb_update_count); + vdso_data->tb_orig_stamp = new_tb_stamp; + vdso_data->stamp_xsec = new_stamp_xsec; + vdso_data->tb_to_xs = new_tb_to_xs; + vdso_data->wtom_clock_sec = wall_to_monotonic.tv_sec; + vdso_data->wtom_clock_nsec = wall_to_monotonic.tv_nsec; smp_wmb(); - systemcfg->tb_orig_stamp = new_tb_stamp; - systemcfg->stamp_xsec = new_stamp_xsec; - systemcfg->tb_to_xs = new_tb_to_xs; - smp_wmb(); - ++(systemcfg->tb_update_count); -#endif -} - -/* - * When the timebase - tb_orig_stamp gets too big, we do a manipulation - * between tb_orig_stamp and stamp_xsec. The goal here is to keep the - * difference tb - tb_orig_stamp small enough to always fit inside a - * 32 bits number. This is a requirement of our fast 32 bits userland - * implementation in the vdso. If we "miss" a call to this function - * (interrupt latency, CPU locked in a spinlock, ...) and we end up - * with a too big difference, then the vdso will fallback to calling - * the syscall - */ -static __inline__ void timer_recalc_offset(u64 cur_tb) -{ - unsigned long offset; - u64 new_stamp_xsec; - - if (__USE_RTC()) - return; - offset = cur_tb - do_gtod.varp->tb_orig_stamp; - if ((offset & 0x80000000u) == 0) - return; - new_stamp_xsec = do_gtod.varp->stamp_xsec - + mulhdu(offset, do_gtod.varp->tb_to_xs); - update_gtod(cur_tb, new_stamp_xsec, do_gtod.varp->tb_to_xs); + ++(vdso_data->tb_update_count); } #ifdef CONFIG_SMP @@ -333,10 +481,15 @@ EXPORT_SYMBOL(profile_pc); * returned by the service processor for the timebase frequency. 
 */
-static void iSeries_tb_recal(void)
+static int __init iSeries_tb_recal(void)
 {
 	struct div_result divres;
 	unsigned long titan, tb;
+
+	/* Make sure we only run on iSeries */
+	if (!firmware_has_feature(FW_FEATURE_ISERIES))
+		return -ENODEV;
+
 	tb = get_tb();
 	titan = HvCallXm_loadTod();
 	if ( iSeries_recal_titan ) {
@@ -359,12 +512,13 @@ static void iSeries_tb_recal(void)
 			new_tb_ticks_per_jiffy, sign, tick_diff );
 		tb_ticks_per_jiffy = new_tb_ticks_per_jiffy;
 		tb_ticks_per_sec = new_tb_ticks_per_sec;
+		calc_cputime_factors();
 		div128_by_32( XSEC_PER_SEC, 0, tb_ticks_per_sec, &divres );
 		do_gtod.tb_ticks_per_sec = tb_ticks_per_sec;
 		tb_to_xs = divres.result_low;
 		do_gtod.varp->tb_to_xs = tb_to_xs;
-		systemcfg->tb_ticks_per_sec = tb_ticks_per_sec;
-		systemcfg->tb_to_xs = tb_to_xs;
+		vdso_data->tb_ticks_per_sec = tb_ticks_per_sec;
+		vdso_data->tb_to_xs = tb_to_xs;
 	}
 	else {
 		printk( "Titan recalibrate: FAILED (difference > 4 percent)\n"
@@ -376,8 +530,20 @@ static void iSeries_tb_recal(void)
 	}
 	iSeries_recal_titan = titan;
 	iSeries_recal_tb = tb;
+
+	/* Called here as now we know accurate values for the timebase */
+	clocksource_init();
+	return 0;
 }
-#endif
+late_initcall(iSeries_tb_recal);
+
+/* Called from platform early init */
+void __init iSeries_time_init_early(void)
+{
+	iSeries_recal_tb = get_tb();
+	iSeries_recal_titan = HvCallXm_loadTod();
+}
+#endif /* CONFIG_PPC_ISERIES */
 
 /*
  * For iSeries shared processors, we have to let the hypervisor
@@ -395,66 +561,44 @@ static void iSeries_tb_recal(void)
  */
 void timer_interrupt(struct pt_regs * regs)
 {
-	int next_dec;
-	int cpu = smp_processor_id();
-	unsigned long ticks;
+	struct pt_regs *old_regs;
+	struct decrementer_clock *decrementer = &__get_cpu_var(decrementers);
+	struct clock_event_device *evt = &decrementer->event;
+	u64 now;
+
+	/* Ensure a positive value is written to the decrementer, or else
+	 * some CPUs will continue to take decrementer exceptions */
+	set_dec(DECREMENTER_MAX);
 
 #ifdef CONFIG_PPC32
 	if (atomic_read(&ppc_n_lost_interrupts) != 0)
 		do_IRQ(regs);
 #endif
 
+	now = get_tb_or_rtc();
+	if (now < decrementer->next_tb) {
+		/* not time for this event yet */
+		now = decrementer->next_tb - now;
+		if (now <= DECREMENTER_MAX)
+			set_dec((int)now);
+		return;
+	}
+	old_regs = set_irq_regs(regs);
 	irq_enter();
 
-	profile_tick(CPU_PROFILING, regs);
+	calculate_steal_time();
 
 #ifdef CONFIG_PPC_ISERIES
-	get_paca()->lppaca.int_dword.fields.decr_int = 0;
+	if (firmware_has_feature(FW_FEATURE_ISERIES))
+		get_lppaca()->int_dword.fields.decr_int = 0;
 #endif
 
-	while ((ticks = tb_ticks_since(per_cpu(last_jiffy, cpu)))
-	       >= tb_ticks_per_jiffy) {
-		/* Update last_jiffy */
-		per_cpu(last_jiffy, cpu) += tb_ticks_per_jiffy;
-		/* Handle RTCL overflow on 601 */
-		if (__USE_RTC() && per_cpu(last_jiffy, cpu) >= 1000000000)
-			per_cpu(last_jiffy, cpu) -= 1000000000;
-
-		/*
-		 * We cannot disable the decrementer, so in the period
-		 * between this cpu's being marked offline in cpu_online_map
-		 * and calling stop-self, it is taking timer interrupts.
-		 * Avoid calling into the scheduler rebalancing code if this
-		 * is the case.
-		 */
-		if (!cpu_is_offline(cpu))
-			update_process_times(user_mode(regs));
-
-		/*
-		 * No need to check whether cpu is offline here; boot_cpuid
-		 * should have been fixed up by now.
- */ - if (cpu != boot_cpuid) - continue; - - write_seqlock(&xtime_lock); - tb_last_jiffy += tb_ticks_per_jiffy; - tb_last_stamp = per_cpu(last_jiffy, cpu); - timer_recalc_offset(tb_last_jiffy); - do_timer(regs); - timer_sync_xtime(tb_last_jiffy); - timer_check_rtc(); - write_sequnlock(&xtime_lock); - if (adjusting_time && (time_adjust == 0)) - ppc_adjtimex(); - } - - next_dec = tb_ticks_per_jiffy - ticks; - set_dec(next_dec); + if (evt->event_handler) + evt->event_handler(evt); #ifdef CONFIG_PPC_ISERIES - if (hvlpevent_is_pending()) - process_hvlpevents(regs); + if (firmware_has_feature(FW_FEATURE_ISERIES) && hvlpevent_is_pending()) + process_hvlpevents(); #endif #ifdef CONFIG_PPC64 @@ -466,34 +610,77 @@ void timer_interrupt(struct pt_regs * regs) #endif irq_exit(); + set_irq_regs(old_regs); } void wakeup_decrementer(void) { - int i; + unsigned long ticks; - set_dec(tb_ticks_per_jiffy); /* - * We don't expect this to be called on a machine with a 601, - * so using get_tbl is fine. + * The timebase gets saved on sleep and restored on wakeup, + * so all we need to do is to reset the decrementer. */ - tb_last_stamp = tb_last_jiffy = get_tb(); - for_each_cpu(i) - per_cpu(last_jiffy, i) = tb_last_stamp; + ticks = tb_ticks_since(__get_cpu_var(last_jiffy)); + if (ticks < tb_ticks_per_jiffy) + ticks = tb_ticks_per_jiffy - ticks; + else + ticks = 1; + set_dec(ticks); +} + +#ifdef CONFIG_SUSPEND +void generic_suspend_disable_irqs(void) +{ + preempt_disable(); + + /* Disable the decrementer, so that it doesn't interfere + * with suspending. + */ + + set_dec(0x7fffffff); + local_irq_disable(); + set_dec(0x7fffffff); +} + +void generic_suspend_enable_irqs(void) +{ + wakeup_decrementer(); + + local_irq_enable(); + preempt_enable(); } +/* Overrides the weak version in kernel/power/main.c */ +void arch_suspend_disable_irqs(void) +{ + if (ppc_md.suspend_disable_irqs) + ppc_md.suspend_disable_irqs(); + generic_suspend_disable_irqs(); +} + +/* Overrides the weak version in kernel/power/main.c */ +void arch_suspend_enable_irqs(void) +{ + generic_suspend_enable_irqs(); + if (ppc_md.suspend_enable_irqs) + ppc_md.suspend_enable_irqs(); +} +#endif + #ifdef CONFIG_SMP void __init smp_space_timers(unsigned int max_cpus) { int i; - unsigned long offset = tb_ticks_per_jiffy / max_cpus; - unsigned long previous_tb = per_cpu(last_jiffy, boot_cpuid); + u64 previous_tb = per_cpu(last_jiffy, boot_cpuid); - for_each_cpu(i) { - if (i != boot_cpuid) { - previous_tb += offset; - per_cpu(last_jiffy, i) = previous_tb; - } + /* make sure tb > per_cpu(last_jiffy, cpu) for all cpus always */ + previous_tb -= tb_ticks_per_jiffy; + + for_each_possible_cpu(i) { + if (i == boot_cpuid) + continue; + per_cpu(last_jiffy, i) = previous_tb; } } #endif @@ -509,108 +696,52 @@ unsigned long long sched_clock(void) { if (__USE_RTC()) return get_rtc(); - return mulhdu(get_tb(), tb_to_ns_scale) << tb_to_ns_shift; + return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift; } -int do_settimeofday(struct timespec *tv) +static int __init get_freq(char *name, int cells, unsigned long *val) { - time_t wtm_sec, new_sec = tv->tv_sec; - long wtm_nsec, new_nsec = tv->tv_nsec; - unsigned long flags; - long int tb_delta; - u64 new_xsec; + struct device_node *cpu; + const unsigned int *fp; + int found = 0; - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; + /* The cpu node should have timebase and clock frequency properties */ + cpu = of_find_node_by_type(NULL, "cpu"); - write_seqlock_irqsave(&xtime_lock, flags); + if (cpu) { + 
fp = of_get_property(cpu, name, NULL); + if (fp) { + found = 1; + *val = of_read_ulong(fp, cells); + } - /* - * Updating the RTC is not the job of this code. If the time is - * stepped under NTP, the RTC will be updated after STA_UNSYNC - * is cleared. Tools like clock/hwclock either copy the RTC - * to the system time, in which case there is no point in writing - * to the RTC again, or write to the RTC but then they don't call - * settimeofday to perform this operation. - */ -#ifdef CONFIG_PPC_ISERIES - if (first_settimeofday) { - iSeries_tb_recal(); - first_settimeofday = 0; + of_node_put(cpu); } -#endif - tb_delta = tb_ticks_since(tb_last_stamp); - tb_delta += (jiffies - wall_jiffies) * tb_ticks_per_jiffy; - - new_nsec -= 1000 * mulhwu(tb_to_us, tb_delta); - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - new_sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - new_nsec); - - set_normalized_timespec(&xtime, new_sec, new_nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - /* In case of a large backwards jump in time with NTP, we want the - * clock to be updated as soon as the PLL is again in lock. - */ - last_rtc_update = new_sec - 658; - - ntp_clear(); - - new_xsec = (u64)new_nsec * XSEC_PER_SEC; - do_div(new_xsec, NSEC_PER_SEC); - new_xsec += (u64)new_sec * XSEC_PER_SEC; - update_gtod(tb_last_jiffy, new_xsec, do_gtod.varp->tb_to_xs); - -#ifdef CONFIG_PPC64 - systemcfg->tz_minuteswest = sys_tz.tz_minuteswest; - systemcfg->tz_dsttime = sys_tz.tz_dsttime; -#endif - - write_sequnlock_irqrestore(&xtime_lock, flags); - clock_was_set(); - return 0; + return found; } -EXPORT_SYMBOL(do_settimeofday); - void __init generic_calibrate_decr(void) { - struct device_node *cpu; - unsigned int *fp; - int node_found; + ppc_tb_freq = DEFAULT_TB_FREQ; /* hardcoded default */ - /* - * The cpu node should have a timebase-frequency property - * to tell us the rate at which the decrementer counts. 
-	 */
-	cpu = of_find_node_by_type(NULL, "cpu");
+	if (!get_freq("ibm,extended-timebase-frequency", 2, &ppc_tb_freq) &&
+	    !get_freq("timebase-frequency", 1, &ppc_tb_freq)) {
 
-	ppc_tb_freq = DEFAULT_TB_FREQ;		/* hardcoded default */
-	node_found = 0;
-	if (cpu != 0) {
-		fp = (unsigned int *)get_property(cpu, "timebase-frequency",
-						  NULL);
-		if (fp != 0) {
-			node_found = 1;
-			ppc_tb_freq = *fp;
-		}
-	}
-	if (!node_found)
 		printk(KERN_ERR "WARNING: Estimating decrementer frequency "
 				"(not found)\n");
+	}
 
-	ppc_proc_freq = DEFAULT_PROC_FREQ;
-	node_found = 0;
-	if (cpu != 0) {
-		fp = (unsigned int *)get_property(cpu, "clock-frequency",
-						  NULL);
-		if (fp != 0) {
-			node_found = 1;
-			ppc_proc_freq = *fp;
-		}
+	ppc_proc_freq = DEFAULT_PROC_FREQ;	/* hardcoded default */
+
+	if (!get_freq("ibm,extended-clock-frequency", 2, &ppc_proc_freq) &&
+	    !get_freq("clock-frequency", 1, &ppc_proc_freq)) {
+
+		printk(KERN_ERR "WARNING: Estimating processor frequency "
+				"(not found)\n");
 	}
-#ifdef CONFIG_BOOKE
+
+#if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
 	/* Set the time base to zero */
 	mtspr(SPRN_TBWL, 0);
 	mtspr(SPRN_TBWU, 0);
@@ -621,19 +752,37 @@ void __init generic_calibrate_decr(void)
 	/* Enable decrementer interrupt */
 	mtspr(SPRN_TCR, TCR_DIE);
 #endif
-	if (!node_found)
-		printk(KERN_ERR "WARNING: Estimating processor frequency "
-				"(not found)\n");
+}
+
+int update_persistent_clock(struct timespec now)
+{
+	struct rtc_time tm;
+
+	if (!ppc_md.set_rtc_time)
+		return 0;
+
+	to_tm(now.tv_sec + 1 + timezone_offset, &tm);
+	tm.tm_year -= 1900;
+	tm.tm_mon -= 1;
 
-	of_node_put(cpu);
+	return ppc_md.set_rtc_time(&tm);
 }
 
-unsigned long get_boot_time(void)
+unsigned long read_persistent_clock(void)
 {
 	struct rtc_time tm;
+	static int first = 1;
+
+	/* XXX this is a little fragile but will work okay in the short term */
+	if (first) {
+		first = 0;
+		if (ppc_md.time_init)
+			timezone_offset = ppc_md.time_init();
 
-	if (ppc_md.get_boot_time)
-		return ppc_md.get_boot_time();
+		/* get_boot_time() isn't guaranteed to be safe to call late */
+		if (ppc_md.get_boot_time)
+			return ppc_md.get_boot_time() - timezone_offset;
+	}
 	if (!ppc_md.get_rtc_time)
 		return 0;
 	ppc_md.get_rtc_time(&tm);
@@ -641,43 +790,178 @@ unsigned long get_boot_time(void)
 			 tm.tm_hour, tm.tm_min, tm.tm_sec);
 }
 
+/* clocksource code */
+static cycle_t rtc_read(void)
+{
+	return (cycle_t)get_rtc();
+}
+
+static cycle_t timebase_read(void)
+{
+	return (cycle_t)get_tb();
+}
+
+void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
+{
+	u64 t2x, stamp_xsec;
+
+	if (clock != &clocksource_timebase)
+		return;
+
+	/* Make userspace gettimeofday spin until we're done. */
+	++vdso_data->tb_update_count;
+	smp_mb();
+
+	/* XXX this assumes clock->shift == 22 */
+	/* 4611686018 ~= 2^(20+64-22) / 1e9 */
+	t2x = (u64) clock->mult * 4611686018ULL;
+	stamp_xsec = (u64) xtime.tv_nsec * XSEC_PER_SEC;
+	do_div(stamp_xsec, 1000000000);
+	stamp_xsec += (u64) xtime.tv_sec * XSEC_PER_SEC;
+	update_gtod(clock->cycle_last, stamp_xsec, t2x);
+}
+
+void update_vsyscall_tz(void)
+{
+	/* Make userspace gettimeofday spin until we're done. */
+	++vdso_data->tb_update_count;
+	smp_mb();
+	vdso_data->tz_minuteswest = sys_tz.tz_minuteswest;
+	vdso_data->tz_dsttime = sys_tz.tz_dsttime;
+	smp_mb();
+	++vdso_data->tb_update_count;
+}
+
+void __init clocksource_init(void)
+{
+	struct clocksource *clock;
+
+	if (__USE_RTC())
+		clock = &clocksource_rtc;
+	else
+		clock = &clocksource_timebase;
+
+	clock->mult = clocksource_hz2mult(tb_ticks_per_sec, clock->shift);
+
+	if (clocksource_register(clock)) {
+		printk(KERN_ERR "clocksource: %s is already registered\n",
+		       clock->name);
+		return;
+	}
+
+	printk(KERN_INFO "clocksource: %s mult[%x] shift[%d] registered\n",
+	       clock->name, clock->mult, clock->shift);
+}
+
+static int decrementer_set_next_event(unsigned long evt,
+				      struct clock_event_device *dev)
+{
+	__get_cpu_var(decrementers).next_tb = get_tb_or_rtc() + evt;
+	set_dec(evt);
+	return 0;
+}
+
+static void decrementer_set_mode(enum clock_event_mode mode,
+				 struct clock_event_device *dev)
+{
+	if (mode != CLOCK_EVT_MODE_ONESHOT)
+		decrementer_set_next_event(DECREMENTER_MAX, dev);
+}
+
+static void register_decrementer_clockevent(int cpu)
+{
+	struct clock_event_device *dec = &per_cpu(decrementers, cpu).event;
+
+	*dec = decrementer_clockevent;
+	dec->cpumask = cpumask_of_cpu(cpu);
+
+	printk(KERN_DEBUG "clockevent: %s mult[%lx] shift[%d] cpu[%d]\n",
+	       dec->name, dec->mult, dec->shift, cpu);
+
+	clockevents_register_device(dec);
+}
+
+static void __init init_decrementer_clockevent(void)
+{
+	int cpu = smp_processor_id();
+
+	decrementer_clockevent.mult = div_sc(ppc_tb_freq, NSEC_PER_SEC,
+					     decrementer_clockevent.shift);
+	decrementer_clockevent.max_delta_ns =
+		clockevent_delta2ns(DECREMENTER_MAX, &decrementer_clockevent);
+	decrementer_clockevent.min_delta_ns =
+		clockevent_delta2ns(2, &decrementer_clockevent);
+
+	register_decrementer_clockevent(cpu);
+}
+
+void secondary_cpu_time_init(void)
+{
+	/* FIXME: Should make unrelated change to move snapshot_timebase
+	 * call here! */
+	register_decrementer_clockevent(smp_processor_id());
+}
+
 /* This function is only called on the boot processor */
 void __init time_init(void)
 {
 	unsigned long flags;
-	unsigned long tm = 0;
 	struct div_result res;
-	u64 scale;
+	u64 scale, x;
 	unsigned shift;
 
-	if (ppc_md.time_init != NULL)
-		timezone_offset = ppc_md.time_init();
-
 	if (__USE_RTC()) {
 		/* 601 processor: dec counts down by 128 every 128ns */
 		ppc_tb_freq = 1000000000;
-		tb_last_stamp = get_rtcl();
-		tb_last_jiffy = tb_last_stamp;
+		tb_last_jiffy = get_rtcl();
 	} else {
 		/* Normal PowerPC with timebase register */
 		ppc_md.calibrate_decr();
-		printk(KERN_INFO "time_init: decrementer frequency = %lu.%.6lu MHz\n",
+		printk(KERN_DEBUG "time_init: decrementer frequency = %lu.%.6lu MHz\n",
 		       ppc_tb_freq / 1000000, ppc_tb_freq % 1000000);
-		printk(KERN_INFO "time_init: processor frequency   = %lu.%.6lu MHz\n",
+		printk(KERN_DEBUG "time_init: processor frequency   = %lu.%.6lu MHz\n",
 		       ppc_proc_freq / 1000000, ppc_proc_freq % 1000000);
-		tb_last_stamp = tb_last_jiffy = get_tb();
+		tb_last_jiffy = get_tb();
 	}
 
 	tb_ticks_per_jiffy = ppc_tb_freq / HZ;
-	tb_ticks_per_sec = tb_ticks_per_jiffy * HZ;
+	tb_ticks_per_sec = ppc_tb_freq;
 	tb_ticks_per_usec = ppc_tb_freq / 1000000;
 	tb_to_us = mulhwu_scale_factor(ppc_tb_freq, 1000000);
-	div128_by_32(1024*1024, 0, tb_ticks_per_sec, &res);
-	tb_to_xs = res.result_low;
+	calc_cputime_factors();
 
-#ifdef CONFIG_PPC64
-	get_paca()->default_decr = tb_ticks_per_jiffy;
-#endif
+	/*
+	 * Calculate the length of each tick in ns.  It will not be
+	 * exactly 1e9/HZ unless ppc_tb_freq is divisible by HZ.
+	 * We compute 1e9 * tb_ticks_per_jiffy / ppc_tb_freq,
+	 * rounded up.
+	 */
+	x = (u64) NSEC_PER_SEC * tb_ticks_per_jiffy + ppc_tb_freq - 1;
+	do_div(x, ppc_tb_freq);
+	tick_nsec = x;
+	last_tick_len = x << TICKLEN_SCALE;
+
+	/*
+	 * Compute ticklen_to_xs, which is a factor which gets multiplied
+	 * by (last_tick_len << TICKLEN_SHIFT) to get a tb_to_xs value.
+	 * It is computed as:
+	 *   ticklen_to_xs = 2^N / (tb_ticks_per_jiffy * 1e9)
+	 * where N = 64 + 20 - TICKLEN_SCALE - TICKLEN_SHIFT
+	 * which turns out to be N = 51 - SHIFT_HZ.
+	 * This gives the result as a 0.64 fixed-point fraction.
+	 * That value is reduced by an offset amounting to 1 xsec per
+	 * 2^31 timebase ticks to avoid problems with time going backwards
+	 * by 1 xsec when we do timer_recalc_offset due to losing the
+	 * fractional xsec.  That offset is equal to ppc_tb_freq/2^51
+	 * since there are 2^20 xsec in a second.
+	 */
+	div128_by_32((1ULL << 51) - ppc_tb_freq, 0,
+		     tb_ticks_per_jiffy << SHIFT_HZ, &res);
+	div128_by_32(res.result_high, res.result_low, NSEC_PER_SEC, &res);
+	ticklen_to_xs = res.result_low;
+
+	/* Compute tb_to_xs from tick_nsec */
+	tb_to_xs = mulhdu(last_tick_len << TICKLEN_SHIFT, ticklen_to_xs);
 
 	/*
 	 * Compute scale factor for sched_clock.
@@ -697,167 +981,39 @@ void __init time_init(void)
 	}
 	tb_to_ns_scale = scale;
 	tb_to_ns_shift = shift;
-
-#ifdef CONFIG_PPC_ISERIES
-	if (!piranha_simulator)
-#endif
-		tm = get_boot_time();
+	/* Save the current timebase to pretty up CONFIG_PRINTK_TIME */
+	boot_tb = get_tb_or_rtc();
 
 	write_seqlock_irqsave(&xtime_lock, flags);
-	xtime.tv_sec = tm;
-	xtime.tv_nsec = 0;
+
+	/* If platform provided a timezone (pmac), we correct the time */
+	if (timezone_offset) {
+		sys_tz.tz_minuteswest = -timezone_offset / 60;
+		sys_tz.tz_dsttime = 0;
+	}
+
 	do_gtod.varp = &do_gtod.vars[0];
 	do_gtod.var_idx = 0;
 	do_gtod.varp->tb_orig_stamp = tb_last_jiffy;
-	__get_cpu_var(last_jiffy) = tb_last_stamp;
+	__get_cpu_var(last_jiffy) = tb_last_jiffy;
 	do_gtod.varp->stamp_xsec = (u64) xtime.tv_sec * XSEC_PER_SEC;
 	do_gtod.tb_ticks_per_sec = tb_ticks_per_sec;
 	do_gtod.varp->tb_to_xs = tb_to_xs;
 	do_gtod.tb_to_us = tb_to_us;
-#ifdef CONFIG_PPC64
-	systemcfg->tb_orig_stamp = tb_last_jiffy;
-	systemcfg->tb_update_count = 0;
-	systemcfg->tb_ticks_per_sec = tb_ticks_per_sec;
-	systemcfg->stamp_xsec = xtime.tv_sec * XSEC_PER_SEC;
-	systemcfg->tb_to_xs = tb_to_xs;
-#endif
-	time_freq = 0;
+	vdso_data->tb_orig_stamp = tb_last_jiffy;
+	vdso_data->tb_update_count = 0;
+	vdso_data->tb_ticks_per_sec = tb_ticks_per_sec;
+	vdso_data->stamp_xsec = (u64) xtime.tv_sec * XSEC_PER_SEC;
+	vdso_data->tb_to_xs = tb_to_xs;
 
-	/* If platform provided a timezone (pmac), we correct the time */
-	if (timezone_offset) {
-		sys_tz.tz_minuteswest = -timezone_offset / 60;
-		sys_tz.tz_dsttime = 0;
-		xtime.tv_sec -= timezone_offset;
-	}
-
-	last_rtc_update = xtime.tv_sec;
-	set_normalized_timespec(&wall_to_monotonic,
-				-xtime.tv_sec, -xtime.tv_nsec);
 	write_sequnlock_irqrestore(&xtime_lock, flags);
 
-	/* Not exact, but the timer interrupt takes care of this */
-	set_dec(tb_ticks_per_jiffy);
-}
-
-/*
- * After adjtimex is called, adjust the conversion of tb ticks
- * to microseconds to keep do_gettimeofday synchronized
- * with ntpd.
- *
- * Use the time_adjust, time_freq and time_offset computed by adjtimex to
- * adjust the frequency.
- */ - -/* #define DEBUG_PPC_ADJTIMEX 1 */ - -void ppc_adjtimex(void) -{ -#ifdef CONFIG_PPC64 - unsigned long den, new_tb_ticks_per_sec, tb_ticks, old_xsec, - new_tb_to_xs, new_xsec, new_stamp_xsec; - unsigned long tb_ticks_per_sec_delta; - long delta_freq, ltemp; - struct div_result divres; - unsigned long flags; - long singleshot_ppm = 0; - - /* - * Compute parts per million frequency adjustment to - * accomplish the time adjustment implied by time_offset to be - * applied over the elapsed time indicated by time_constant. - * Use SHIFT_USEC to get it into the same units as - * time_freq. - */ - if ( time_offset < 0 ) { - ltemp = -time_offset; - ltemp <<= SHIFT_USEC - SHIFT_UPDATE; - ltemp >>= SHIFT_KG + time_constant; - ltemp = -ltemp; - } else { - ltemp = time_offset; - ltemp <<= SHIFT_USEC - SHIFT_UPDATE; - ltemp >>= SHIFT_KG + time_constant; - } - - /* If there is a single shot time adjustment in progress */ - if ( time_adjust ) { -#ifdef DEBUG_PPC_ADJTIMEX - printk("ppc_adjtimex: "); - if ( adjusting_time == 0 ) - printk("starting "); - printk("single shot time_adjust = %ld\n", time_adjust); -#endif - - adjusting_time = 1; - - /* - * Compute parts per million frequency adjustment - * to match time_adjust - */ - singleshot_ppm = tickadj * HZ; - /* - * The adjustment should be tickadj*HZ to match the code in - * linux/kernel/timer.c, but experiments show that this is too - * large. 3/4 of tickadj*HZ seems about right - */ - singleshot_ppm -= singleshot_ppm / 4; - /* Use SHIFT_USEC to get it into the same units as time_freq */ - singleshot_ppm <<= SHIFT_USEC; - if ( time_adjust < 0 ) - singleshot_ppm = -singleshot_ppm; - } - else { -#ifdef DEBUG_PPC_ADJTIMEX - if ( adjusting_time ) - printk("ppc_adjtimex: ending single shot time_adjust\n"); -#endif - adjusting_time = 0; - } - - /* Add up all of the frequency adjustments */ - delta_freq = time_freq + ltemp + singleshot_ppm; - - /* - * Compute a new value for tb_ticks_per_sec based on - * the frequency adjustment - */ - den = 1000000 * (1 << (SHIFT_USEC - 8)); - if ( delta_freq < 0 ) { - tb_ticks_per_sec_delta = ( tb_ticks_per_sec * ( (-delta_freq) >> (SHIFT_USEC - 8))) / den; - new_tb_ticks_per_sec = tb_ticks_per_sec + tb_ticks_per_sec_delta; - } - else { - tb_ticks_per_sec_delta = ( tb_ticks_per_sec * ( delta_freq >> (SHIFT_USEC - 8))) / den; - new_tb_ticks_per_sec = tb_ticks_per_sec - tb_ticks_per_sec_delta; - } - -#ifdef DEBUG_PPC_ADJTIMEX - printk("ppc_adjtimex: ltemp = %ld, time_freq = %ld, singleshot_ppm = %ld\n", ltemp, time_freq, singleshot_ppm); - printk("ppc_adjtimex: tb_ticks_per_sec - base = %ld new = %ld\n", tb_ticks_per_sec, new_tb_ticks_per_sec); -#endif - - /* - * Compute a new value of tb_to_xs (used to convert tb to - * microseconds) and a new value of stamp_xsec which is the - * time (in 1/2^20 second units) corresponding to - * tb_orig_stamp. This new value of stamp_xsec compensates - * for the change in frequency (implied by the new tb_to_xs) - * which guarantees that the current time remains the same. 
- */ - write_seqlock_irqsave( &xtime_lock, flags ); - tb_ticks = get_tb() - do_gtod.varp->tb_orig_stamp; - div128_by_32(1024*1024, 0, new_tb_ticks_per_sec, &divres); - new_tb_to_xs = divres.result_low; - new_xsec = mulhdu(tb_ticks, new_tb_to_xs); - - old_xsec = mulhdu(tb_ticks, do_gtod.varp->tb_to_xs); - new_stamp_xsec = do_gtod.varp->stamp_xsec + old_xsec - new_xsec; - - update_gtod(do_gtod.varp->tb_orig_stamp, new_stamp_xsec, new_tb_to_xs); + /* Register the clocksource, if we're not running on iSeries */ + if (!firmware_has_feature(FW_FEATURE_ISERIES)) + clocksource_init(); - write_sequnlock_irqrestore( &xtime_lock, flags ); -#endif /* CONFIG_PPC64 */ + init_decrementer_clockevent(); }