Merge branch 'tmpreg' into devel
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 5cf3940..6c6093d 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -52,6 +52,9 @@
 #include <linux/jiffies.h>
 #include <linux/posix-timers.h>
 #include <linux/irq.h>
+#include <linux/delay.h>
+#include <linux/perf_event.h>
+#include <asm/trace.h>
 
 #include <asm/io.h>
 #include <asm/processor.h>
@@ -66,6 +69,7 @@
 #include <asm/smp.h>
 #include <asm/vdso_datapage.h>
 #include <asm/firmware.h>
+#include <asm/cputime.h>
 #ifdef CONFIG_PPC_ISERIES
 #include <asm/iseries/it_lp_queue.h>
 #include <asm/iseries/hv_call_xm.h>
@@ -76,7 +80,7 @@
 #include <linux/clockchips.h>
 #include <linux/clocksource.h>
 
-static cycle_t rtc_read(void);
+static cycle_t rtc_read(struct clocksource *);
 static struct clocksource clocksource_rtc = {
        .name         = "rtc",
        .rating       = 400,
@@ -87,7 +91,7 @@ static struct clocksource clocksource_rtc = {
        .read         = rtc_read,
 };
 
-static cycle_t timebase_read(void);
+static cycle_t timebase_read(struct clocksource *);
 static struct clocksource clocksource_timebase = {
        .name         = "timebase",
        .rating       = 400,
@@ -108,7 +112,7 @@ static void decrementer_set_mode(enum clock_event_mode mode,
 static struct clock_event_device decrementer_clockevent = {
        .name           = "decrementer",
        .rating         = 200,
-       .shift          = 16,
+       .shift          = 0,    /* To be filled in */
        .mult           = 0,    /* To be filled in */
        .irq            = 0,
        .set_next_event = decrementer_set_next_event,
@@ -128,7 +132,7 @@ static unsigned long __initdata iSeries_recal_titan;
 static signed long __initdata iSeries_recal_tb;
 
 /* Forward declaration is only needed for iSeries compiles */
-void __init clocksource_init(void);
+static void __init clocksource_init(void);
 #endif
 
 #define XSEC_PER_SEC (1024*1024)
@@ -148,9 +152,9 @@ EXPORT_SYMBOL(tb_ticks_per_sec);    /* for cputime_t conversions */
 u64 tb_to_xs;
 unsigned tb_to_us;
 
-#define TICKLEN_SCALE  TICK_LENGTH_SHIFT
-u64 last_tick_len;     /* units are ns / 2^TICKLEN_SCALE */
-u64 ticklen_to_xs;     /* 0.64 fraction */
+#define TICKLEN_SCALE  NTP_SCALE_SHIFT
+static u64 last_tick_len;      /* units are ns / 2^TICKLEN_SCALE */
+static u64 ticklen_to_xs;      /* 0.64 fraction */
 
 /* If last_tick_len corresponds to about 1/HZ seconds, then
    last_tick_len << TICKLEN_SHIFT will be about 2^63. */
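
A worked instance of that bound, assuming TICKLEN_SCALE = NTP_SCALE_SHIFT = 32
and HZ = 1024 (so SHIFT_HZ = 10): one tick is about 10^9/HZ ≈ 2^30 / 2^10 =
2^20 ns, so last_tick_len ≈ 2^20 * 2^32 = 2^52 in these units, and a shift of
63 - 30 - TICKLEN_SCALE + SHIFT_HZ = 11 brings it to roughly 2^63, which is
how the file-local TICKLEN_SHIFT is derived.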
@@ -163,8 +167,6 @@ static u64 tb_to_ns_scale __read_mostly;
 static unsigned tb_to_ns_shift __read_mostly;
 static unsigned long boot_tb __read_mostly;
 
-struct gettimeofday_struct do_gtod;
-
 extern struct timezone sys_tz;
 static long timezone_offset;
 
@@ -189,6 +191,10 @@ u64 __cputime_sec_factor;
 EXPORT_SYMBOL(__cputime_sec_factor);
 u64 __cputime_clockt_factor;
 EXPORT_SYMBOL(__cputime_clockt_factor);
+DEFINE_PER_CPU(unsigned long, cputime_last_delta);
+DEFINE_PER_CPU(unsigned long, cputime_scaled_last_delta);
+
+cputime_t cputime_one_jiffy;
 
 static void calc_cputime_factors(void)
 {
@@ -219,7 +225,11 @@ static u64 read_purr(void)
  */
 static u64 read_spurr(u64 purr)
 {
-       if (cpu_has_feature(CPU_FTR_SPURR))
+       /*
+        * CPUs without PURR won't have a SPURR.  We have already
+        * checked for PURR whenever this is used, so tell gcc.
+        */
+       if (cpu_has_feature(CPU_FTR_PURR) && cpu_has_feature(CPU_FTR_SPURR))
                return mfspr(SPRN_SPURR);
        return purr;
 }
@@ -230,32 +240,36 @@ static u64 read_spurr(u64 purr)
  */
 void account_system_vtime(struct task_struct *tsk)
 {
-       u64 now, nowscaled, delta, deltascaled;
+       u64 now, nowscaled, delta, deltascaled, sys_time;
        unsigned long flags;
 
        local_irq_save(flags);
        now = read_purr();
-       delta = now - get_paca()->startpurr;
-       get_paca()->startpurr = now;
        nowscaled = read_spurr(now);
+       delta = now - get_paca()->startpurr;
        deltascaled = nowscaled - get_paca()->startspurr;
+       get_paca()->startpurr = now;
        get_paca()->startspurr = nowscaled;
        if (!in_interrupt()) {
                /* deltascaled includes both user and system time.
                 * Hence scale it based on the purr ratio to estimate
                 * the system time */
+               sys_time = get_paca()->system_time;
                if (get_paca()->user_time)
-                       deltascaled = deltascaled * get_paca()->system_time /
-                            (get_paca()->system_time + get_paca()->user_time);
-               delta += get_paca()->system_time;
+                       deltascaled = deltascaled * sys_time /
+                            (sys_time + get_paca()->user_time);
+               delta += sys_time;
                get_paca()->system_time = 0;
        }
-       account_system_time(tsk, 0, delta);
-       get_paca()->purrdelta = delta;
-       account_system_time_scaled(tsk, deltascaled);
-       get_paca()->spurrdelta = deltascaled;
+       if (in_irq() || idle_task(smp_processor_id()) != tsk)
+               account_system_time(tsk, 0, delta, deltascaled);
+       else
+               account_idle_time(delta);
+       per_cpu(cputime_last_delta, smp_processor_id()) = delta;
+       per_cpu(cputime_scaled_last_delta, smp_processor_id()) = deltascaled;
        local_irq_restore(flags);
 }
+EXPORT_SYMBOL_GPL(account_system_vtime);
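
A worked example of the purr-ratio estimate above, with made-up numbers:
suppose the paca has accumulated system_time = 400 and user_time = 600 when
account_system_vtime() runs, and the SPURR advanced by deltascaled = 800
(the core ran throttled). The code then charges 800 * 400 / (400 + 600) = 320
scaled ticks as system time; the per-CPU deltas saved at the end let user
time be scaled by a similar ratio later (see the cputime_to_scaled() sketch
after account_process_tick below).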
 
 /*
  * Transfer the user and system times accumulated in the paca
@@ -269,13 +283,8 @@ void account_process_tick(struct task_struct *tsk, int user_tick)
 
        utime = get_paca()->user_time;
        get_paca()->user_time = 0;
-       account_user_time(tsk, utime);
-
-       /* Estimate the scaled utime by scaling the real utime based
-        * on the last spurr to purr ratio */
-       utimescaled = utime * get_paca()->spurrdelta / get_paca()->purrdelta;
-       get_paca()->spurrdelta = get_paca()->purrdelta = 0;
-       account_user_time_scaled(tsk, utimescaled);
+       utimescaled = cputime_to_scaled(utime);
+       account_user_time(tsk, utime, utimescaled);
 }
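
cputime_to_scaled() itself lives in asm/cputime.h rather than in this file.
A minimal sketch of how it can use the per-CPU deltas recorded by
account_system_vtime() above (illustrative, although the names match the
DEFINE_PER_CPU declarations added by this patch):

static inline unsigned long cputime_to_scaled(const cputime_t ct)
{
	/* Scale by the last observed SPURR/PURR ratio, if we have one */
	if (cpu_has_feature(CPU_FTR_SPURR) &&
	    __get_cpu_var(cputime_last_delta))
		return ct * __get_cpu_var(cputime_scaled_last_delta) /
			__get_cpu_var(cputime_last_delta);
	return ct;
}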
 
 /*
@@ -317,7 +326,7 @@ void snapshot_timebases(void)
 {
        if (!cpu_has_feature(CPU_FTR_PURR))
                return;
-       on_each_cpu(snapshot_tb_and_purr, NULL, 0, 1);
+       on_each_cpu(snapshot_tb_and_purr, NULL, 1);
 }
 
 /*
@@ -329,16 +338,18 @@ void calculate_steal_time(void)
        s64 stolen;
        struct cpu_purr_data *pme;
 
-       if (!cpu_has_feature(CPU_FTR_PURR))
-               return;
        pme = &__get_cpu_var(cpu_purr_data);
        if (!pme->initialized)
-               return;         /* this can happen in early boot */
+               return;         /* !CPU_FTR_PURR or early in boot */
        tb = mftb();
        purr = mfspr(SPRN_PURR);
        stolen = (tb - pme->tb) - (purr - pme->purr);
-       if (stolen > 0)
-               account_steal_time(current, stolen);
+       if (stolen > 0) {
+               if (idle_task(smp_processor_id()) != current)
+                       account_steal_time(stolen);
+               else
+                       account_idle_time(stolen);
+       }
        pme->tb = tb;
        pme->purr = purr;
 }
@@ -412,31 +423,9 @@ void udelay(unsigned long usecs)
 }
 EXPORT_SYMBOL(udelay);
 
-
-/*
- * There are two copies of tb_to_xs and stamp_xsec so that no
- * lock is needed to access and use these values in
- * do_gettimeofday.  We alternate the copies and as long as a
- * reasonable time elapses between changes, there will never
- * be inconsistent values.  ntpd has a minimum of one minute
- * between updates.
- */
 static inline void update_gtod(u64 new_tb_stamp, u64 new_stamp_xsec,
                               u64 new_tb_to_xs)
 {
-       unsigned temp_idx;
-       struct gettimeofday_vars *temp_varp;
-
-       temp_idx = (do_gtod.var_idx == 0);
-       temp_varp = &do_gtod.vars[temp_idx];
-
-       temp_varp->tb_to_xs = new_tb_to_xs;
-       temp_varp->tb_orig_stamp = new_tb_stamp;
-       temp_varp->stamp_xsec = new_stamp_xsec;
-       smp_mb();
-       do_gtod.varp = temp_varp;
-       do_gtod.var_idx = temp_idx;
-
        /*
         * tb_update_count is used to allow the userspace gettimeofday code
         * to assure itself that it sees a consistent view of the tb_to_xs and
@@ -453,6 +442,7 @@ static inline void update_gtod(u64 new_tb_stamp, u64 new_stamp_xsec,
        vdso_data->tb_to_xs = new_tb_to_xs;
        vdso_data->wtom_clock_sec = wall_to_monotonic.tv_sec;
        vdso_data->wtom_clock_nsec = wall_to_monotonic.tv_nsec;
+       vdso_data->stamp_xtime = xtime;
        smp_wmb();
        ++(vdso_data->tb_update_count);
 }
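
The tb_update_count scheme is a sequence counter: the updater bumps the count
before rewriting the vdso fields (that first increment sits above this hunk)
and again after the smp_wmb() here, so a reader retries on an odd or changed
count. A minimal C sketch of the reader side under that assumption (the real
consumer is the gettimeofday code in the vdso):

	u32 seq;
	u64 tb_to_xs, stamp_xsec;

	do {
		seq = vdso_data->tb_update_count;
		smp_rmb();		/* read the count before the data */
		tb_to_xs   = vdso_data->tb_to_xs;
		stamp_xsec = vdso_data->stamp_xsec;
		smp_rmb();		/* read the data before rechecking the count */
	} while ((seq & 1) || seq != vdso_data->tb_update_count);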
@@ -493,7 +483,8 @@ static int __init iSeries_tb_recal(void)
                unsigned long tb_ticks = tb - iSeries_recal_tb;
                unsigned long titan_usec = (titan - iSeries_recal_titan) >> 12;
                unsigned long new_tb_ticks_per_sec   = (tb_ticks * USEC_PER_SEC)/titan_usec;
-               unsigned long new_tb_ticks_per_jiffy = (new_tb_ticks_per_sec+(HZ/2))/HZ;
+               unsigned long new_tb_ticks_per_jiffy =
+                       DIV_ROUND_CLOSEST(new_tb_ticks_per_sec, HZ);
                long tick_diff = new_tb_ticks_per_jiffy - tb_ticks_per_jiffy;
                char sign = '+';
                /* make sure tb_ticks_per_sec and tb_ticks_per_jiffy are consistent */
@@ -511,11 +502,10 @@ static int __init iSeries_tb_recal(void)
                                tb_ticks_per_sec   = new_tb_ticks_per_sec;
                                calc_cputime_factors();
                                div128_by_32( XSEC_PER_SEC, 0, tb_ticks_per_sec, &divres );
-                               do_gtod.tb_ticks_per_sec = tb_ticks_per_sec;
                                tb_to_xs = divres.result_low;
-                               do_gtod.varp->tb_to_xs = tb_to_xs;
                                vdso_data->tb_ticks_per_sec = tb_ticks_per_sec;
                                vdso_data->tb_to_xs = tb_to_xs;
+                               setup_cputime_one_jiffy();
                        }
                        else {
                                printk( "Titan recalibrate: FAILED (difference > 4 percent)\n"
@@ -542,6 +532,26 @@ void __init iSeries_time_init_early(void)
 }
 #endif /* CONFIG_PPC_ISERIES */
 
+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_PPC32)
+DEFINE_PER_CPU(u8, perf_event_pending);
+
+void set_perf_event_pending(void)
+{
+       get_cpu_var(perf_event_pending) = 1;
+       set_dec(1);
+       put_cpu_var(perf_event_pending);
+}
+
+#define test_perf_event_pending()      __get_cpu_var(perf_event_pending)
+#define clear_perf_event_pending()     __get_cpu_var(perf_event_pending) = 0
+
+#else  /* CONFIG_PERF_EVENTS && CONFIG_PPC32 */
+
+#define test_perf_event_pending()      0
+#define clear_perf_event_pending()
+
+#endif /* CONFIG_PERF_EVENTS && CONFIG_PPC32 */
+
 /*
  * For iSeries shared processors, we have to let the hypervisor
  * set the hardware decrementer.  We set a virtual decrementer
@@ -563,11 +573,17 @@ void timer_interrupt(struct pt_regs * regs)
        struct clock_event_device *evt = &decrementer->event;
        u64 now;
 
+       trace_timer_interrupt_entry(regs);
+
        /* Ensure a positive value is written to the decrementer, or else
         * some CPUs will continue to take decrementer exceptions */
        set_dec(DECREMENTER_MAX);
 
 #ifdef CONFIG_PPC32
+       if (test_perf_event_pending()) {
+               clear_perf_event_pending();
+               perf_event_do_pending();
+       }
        if (atomic_read(&ppc_n_lost_interrupts) != 0)
                do_IRQ(regs);
 #endif
@@ -578,6 +594,7 @@ void timer_interrupt(struct pt_regs * regs)
                now = decrementer->next_tb - now;
                if (now <= DECREMENTER_MAX)
                        set_dec((int)now);
+               trace_timer_interrupt_exit(regs);
                return;
        }
        old_regs = set_irq_regs(regs);
@@ -608,6 +625,8 @@ void timer_interrupt(struct pt_regs * regs)
 
        irq_exit();
        set_irq_regs(old_regs);
+
+       trace_timer_interrupt_exit(regs);
 }
 
 void wakeup_decrementer(void)
@@ -626,6 +645,45 @@ void wakeup_decrementer(void)
        set_dec(ticks);
 }
 
+#ifdef CONFIG_SUSPEND
+void generic_suspend_disable_irqs(void)
+{
+       preempt_disable();
+
+       /* Disable the decrementer, so that it doesn't interfere
+        * with suspending.
+        */
+
+       set_dec(0x7fffffff);
+       local_irq_disable();
+       set_dec(0x7fffffff);
+}
+
+void generic_suspend_enable_irqs(void)
+{
+       wakeup_decrementer();
+
+       local_irq_enable();
+       preempt_enable();
+}
+
+/* Overrides the weak version in kernel/power/main.c */
+void arch_suspend_disable_irqs(void)
+{
+       if (ppc_md.suspend_disable_irqs)
+               ppc_md.suspend_disable_irqs();
+       generic_suspend_disable_irqs();
+}
+
+/* Overrides the weak version in kernel/power/main.c */
+void arch_suspend_enable_irqs(void)
+{
+       generic_suspend_enable_irqs();
+       if (ppc_md.suspend_enable_irqs)
+               ppc_md.suspend_enable_irqs();
+}
+#endif
+
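
The overrides above take effect because the generic versions are defined
__weak, so the linker prefers these strong arch definitions. The defaults in
kernel/power/main.c are essentially the following (reproduced here for
context, not verbatim):

void __weak arch_suspend_disable_irqs(void)
{
	local_irq_disable();
}

void __weak arch_suspend_enable_irqs(void)
{
	local_irq_enable();
}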
 #ifdef CONFIG_SMP
 void __init smp_space_timers(unsigned int max_cpus)
 {
@@ -679,6 +737,18 @@ static int __init get_freq(char *name, int cells, unsigned long *val)
        return found;
 }
 
+/* should become __cpuinit when secondary_cpu_time_init also is */
+void start_cpu_decrementer(void)
+{
+#if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
+       /* Clear any pending timer interrupts */
+       mtspr(SPRN_TSR, TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS);
+
+       /* Enable decrementer interrupt */
+       mtspr(SPRN_TCR, TCR_DIE);
+#endif /* defined(CONFIG_BOOKE) || defined(CONFIG_40x) */
+}
+
 void __init generic_calibrate_decr(void)
 {
        ppc_tb_freq = DEFAULT_TB_FREQ;          /* hardcoded default */
@@ -698,18 +768,6 @@ void __init generic_calibrate_decr(void)
                printk(KERN_ERR "WARNING: Estimating processor frequency "
                                "(not found)\n");
        }
-
-#if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
-       /* Set the time base to zero */
-       mtspr(SPRN_TBWL, 0);
-       mtspr(SPRN_TBWU, 0);
-
-       /* Clear any pending timer interrupts */
-       mtspr(SPRN_TSR, TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS);
-
-       /* Enable decrementer interrupt */
-       mtspr(SPRN_TCR, TCR_DIE);
-#endif
 }
 
 int update_persistent_clock(struct timespec now)
@@ -726,11 +784,12 @@ int update_persistent_clock(struct timespec now)
        return ppc_md.set_rtc_time(&tm);
 }
 
-unsigned long read_persistent_clock(void)
+static void __read_persistent_clock(struct timespec *ts)
 {
        struct rtc_time tm;
        static int first = 1;
 
+       ts->tv_nsec = 0;
        /* XXX this is a little fragile but will work okay in the short term */
        if (first) {
                first = 0;
@@ -738,28 +797,46 @@ unsigned long read_persistent_clock(void)
                        timezone_offset = ppc_md.time_init();
 
                /* get_boot_time() isn't guaranteed to be safe to call late */
-               if (ppc_md.get_boot_time)
-                       return ppc_md.get_boot_time() -timezone_offset;
+               if (ppc_md.get_boot_time) {
+                       ts->tv_sec = ppc_md.get_boot_time() - timezone_offset;
+                       return;
+               }
+       }
+       if (!ppc_md.get_rtc_time) {
+               ts->tv_sec = 0;
+               return;
        }
-       if (!ppc_md.get_rtc_time)
-               return 0;
        ppc_md.get_rtc_time(&tm);
-       return mktime(tm.tm_year+1900, tm.tm_mon+1, tm.tm_mday,
-                     tm.tm_hour, tm.tm_min, tm.tm_sec);
+
+       ts->tv_sec = mktime(tm.tm_year+1900, tm.tm_mon+1, tm.tm_mday,
+                           tm.tm_hour, tm.tm_min, tm.tm_sec);
+}
+
+void read_persistent_clock(struct timespec *ts)
+{
+       __read_persistent_clock(ts);
+
+       /* Sanitize it in case the real time clock is set before the epoch */
+       if (ts->tv_sec < 0) {
+               ts->tv_sec = 0;
+               ts->tv_nsec = 0;
+       }
+
 }
 
 /* clocksource code */
-static cycle_t rtc_read(void)
+static cycle_t rtc_read(struct clocksource *cs)
 {
        return (cycle_t)get_rtc();
 }
 
-static cycle_t timebase_read(void)
+static cycle_t timebase_read(struct clocksource *cs)
 {
        return (cycle_t)get_tb();
 }
 
-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
+void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
+                    u32 mult)
 {
        u64 t2x, stamp_xsec;
 
@@ -772,7 +849,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
 
        /* XXX this assumes clock->shift == 22 */
        /* 4611686018 ~= 2^(20+64-22) / 1e9 */
-       t2x = (u64) clock->mult * 4611686018ULL;
+       t2x = (u64) mult * 4611686018ULL;
        stamp_xsec = (u64) xtime.tv_nsec * XSEC_PER_SEC;
        do_div(stamp_xsec, 1000000000);
        stamp_xsec += (u64) xtime.tv_sec * XSEC_PER_SEC;
@@ -790,7 +867,7 @@ void update_vsyscall_tz(void)
        ++vdso_data->tb_update_count;
 }
 
-void __init clocksource_init(void)
+static void __init clocksource_init(void)
 {
        struct clocksource *clock;
 
@@ -826,14 +903,39 @@ static void decrementer_set_mode(enum clock_event_mode mode,
                decrementer_set_next_event(DECREMENTER_MAX, dev);
 }
 
+static inline uint64_t div_sc64(unsigned long ticks, unsigned long nsec,
+                               int shift)
+{
+       uint64_t tmp = ((uint64_t)ticks) << shift;
+
+       do_div(tmp, nsec);
+       return tmp;
+}
+
+static void __init setup_clockevent_multiplier(unsigned long hz)
+{
+       u64 mult, shift = 32;
+
+       while (1) {
+               mult = div_sc64(hz, NSEC_PER_SEC, shift);
+               if (mult && (mult >> 32UL) == 0UL)
+                       break;
+
+               shift--;
+       }
+
+       decrementer_clockevent.shift = shift;
+       decrementer_clockevent.mult = mult;
+}
+
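
setup_clockevent_multiplier() picks the largest shift whose mult still fits
in 32 bits, keeping as much precision as possible without overflowing the u32
mult field. With a hypothetical 512 MHz timebase, mult = 512000000 * 2^32 /
10^9 ≈ 2199023256 at shift = 32, which already fits, so the loop stops
immediately. A hypothetical 5 GHz input would overflow for shifts 32 down to
30 and settle at shift = 29 with mult ≈ 2684354560.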
 static void register_decrementer_clockevent(int cpu)
 {
        struct clock_event_device *dec = &per_cpu(decrementers, cpu).event;
 
        *dec = decrementer_clockevent;
-       dec->cpumask = cpumask_of_cpu(cpu);
+       dec->cpumask = cpumask_of(cpu);
 
-       printk(KERN_DEBUG "clockevent: %s mult[%lx] shift[%d] cpu[%d]\n",
+       printk(KERN_DEBUG "clockevent: %s mult[%x] shift[%d] cpu[%d]\n",
               dec->name, dec->mult, dec->shift, cpu);
 
        clockevents_register_device(dec);
@@ -843,8 +945,7 @@ static void __init init_decrementer_clockevent(void)
 {
        int cpu = smp_processor_id();
 
-       decrementer_clockevent.mult = div_sc(ppc_tb_freq, NSEC_PER_SEC,
-                                            decrementer_clockevent.shift);
+       setup_clockevent_multiplier(ppc_tb_freq);
        decrementer_clockevent.max_delta_ns =
                clockevent_delta2ns(DECREMENTER_MAX, &decrementer_clockevent);
        decrementer_clockevent.min_delta_ns =
@@ -855,6 +956,11 @@ static void __init init_decrementer_clockevent(void)
 
 void secondary_cpu_time_init(void)
 {
+       /* Start the decrementer on CPUs that have manual control
+        * such as BookE
+        */
+       start_cpu_decrementer();
+
        /* FIXME: Should make unrelated change to move snapshot_timebase
         * call here! */
        register_decrementer_clockevent(smp_processor_id());
@@ -887,6 +993,7 @@ void __init time_init(void)
        tb_ticks_per_usec = ppc_tb_freq / 1000000;
        tb_to_us = mulhwu_scale_factor(ppc_tb_freq, 1000000);
        calc_cputime_factors();
+       setup_cputime_one_jiffy();
 
        /*
         * Calculate the length of each tick in ns.  It will not be
@@ -950,25 +1057,19 @@ void __init time_init(void)
                sys_tz.tz_dsttime = 0;
         }
 
-       do_gtod.varp = &do_gtod.vars[0];
-       do_gtod.var_idx = 0;
-       do_gtod.varp->tb_orig_stamp = tb_last_jiffy;
-       __get_cpu_var(last_jiffy) = tb_last_jiffy;
-       do_gtod.varp->stamp_xsec = (u64) xtime.tv_sec * XSEC_PER_SEC;
-       do_gtod.tb_ticks_per_sec = tb_ticks_per_sec;
-       do_gtod.varp->tb_to_xs = tb_to_xs;
-       do_gtod.tb_to_us = tb_to_us;
-
        vdso_data->tb_orig_stamp = tb_last_jiffy;
        vdso_data->tb_update_count = 0;
        vdso_data->tb_ticks_per_sec = tb_ticks_per_sec;
        vdso_data->stamp_xsec = (u64) xtime.tv_sec * XSEC_PER_SEC;
        vdso_data->tb_to_xs = tb_to_xs;
 
-       time_freq = 0;
-
        write_sequnlock_irqrestore(&xtime_lock, flags);
 
+       /* Start the decrementer on CPUs that have manual control
+        * such as BookE
+        */
+       start_cpu_decrementer();
+
        /* Register the clocksource, if we're not running on iSeries */
        if (!firmware_has_feature(FW_FEATURE_ISERIES))
                clocksource_init();
@@ -1121,3 +1222,28 @@ void div128_by_32(u64 dividend_high, u64 dividend_low,
        dr->result_low  = ((u64)y << 32) + z;
 
 }
+
+/* We don't need to calibrate delay; we use the CPU timebase for that */
+void calibrate_delay(void)
+{
+       /* Some generic code (such as spinlock debug) uses loops_per_jiffy
+        * as the number of __delay(1) calls in a jiffy, so make it so
+        */
+       loops_per_jiffy = tb_ticks_per_jiffy;
+}
+
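
This works because powerpc's __delay() spins on the timebase, so __delay(1)
waits one timebase tick. With a hypothetical 512 MHz timebase and HZ = 250,
loops_per_jiffy becomes 512000000 / 250 = 2048000.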
+static int __init rtc_init(void)
+{
+       struct platform_device *pdev;
+
+       if (!ppc_md.get_rtc_time)
+               return -ENODEV;
+
+       pdev = platform_device_register_simple("rtc-generic", -1, NULL, 0);
+       if (IS_ERR(pdev))
+               return PTR_ERR(pdev);
+
+       return 0;
+}
+
+module_init(rtc_init);