Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
author     Rusty Russell <rusty@rustcorp.com.au>
           Mon, 29 Dec 2008 21:32:35 +0000 (08:02 +1030)
committer  Rusty Russell <rusty@rustcorp.com.au>
           Mon, 29 Dec 2008 21:32:35 +0000 (08:02 +1030)
24 files changed:
arch/m32r/Kconfig
arch/powerpc/kernel/smp.c
arch/powerpc/kernel/time.c
arch/powerpc/platforms/pseries/xics.c
arch/powerpc/sysdev/mpic.c
arch/s390/Kconfig
arch/s390/kernel/smp.c
arch/s390/kernel/time.c
arch/s390/kernel/topology.c
arch/x86/include/asm/pci.h
arch/x86/kernel/apic.c
arch/x86/kernel/cpu/intel_cacheinfo.c
arch/x86/kernel/hpet.c
arch/x86/kernel/io_apic.c
arch/x86/kernel/irq_64.c
arch/x86/kernel/smpboot.c
arch/x86/lguest/boot.c
include/linux/smp.h
init/Kconfig
kernel/profile.c
kernel/sched.c
kernel/sched_stats.h
kernel/trace/trace.c
lib/Kconfig

diff --combined arch/m32r/Kconfig
@@@ -10,7 -10,6 +10,7 @@@ config M32
        default y
        select HAVE_IDE
        select HAVE_OPROFILE
 +      select INIT_ALL_POSSIBLE
  
  config SBUS
        bool
@@@ -274,7 -273,7 +274,7 @@@ config GENERIC_CALIBRATE_DELA
        bool
        default y
  
- config SCHED_NO_NO_OMIT_FRAME_POINTER
+ config SCHED_OMIT_FRAME_POINTER
          bool
          default y
  
diff --combined arch/powerpc/kernel/smp.c
  #define DBG(fmt...)
  #endif
  
- int smp_hw_index[NR_CPUS];
  struct thread_info *secondary_ti;
  
  DEFINE_PER_CPU(cpumask_t, cpu_sibling_map) = CPU_MASK_NONE;
  DEFINE_PER_CPU(cpumask_t, cpu_core_map) = CPU_MASK_NONE;
  
 -EXPORT_SYMBOL(cpu_online_map);
 -EXPORT_SYMBOL(cpu_possible_map);
  EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
  EXPORT_PER_CPU_SYMBOL(cpu_core_map);
  
@@@ -119,6 -122,65 +118,65 @@@ void smp_message_recv(int msg
        }
  }
  
+ static irqreturn_t call_function_action(int irq, void *data)
+ {
+       generic_smp_call_function_interrupt();
+       return IRQ_HANDLED;
+ }
+ static irqreturn_t reschedule_action(int irq, void *data)
+ {
+       /* we just need the return path side effect of checking need_resched */
+       return IRQ_HANDLED;
+ }
+ static irqreturn_t call_function_single_action(int irq, void *data)
+ {
+       generic_smp_call_function_single_interrupt();
+       return IRQ_HANDLED;
+ }
+ static irqreturn_t debug_ipi_action(int irq, void *data)
+ {
+       smp_message_recv(PPC_MSG_DEBUGGER_BREAK);
+       return IRQ_HANDLED;
+ }
+ static irq_handler_t smp_ipi_action[] = {
+       [PPC_MSG_CALL_FUNCTION] =  call_function_action,
+       [PPC_MSG_RESCHEDULE] = reschedule_action,
+       [PPC_MSG_CALL_FUNC_SINGLE] = call_function_single_action,
+       [PPC_MSG_DEBUGGER_BREAK] = debug_ipi_action,
+ };
+ const char *smp_ipi_name[] = {
+       [PPC_MSG_CALL_FUNCTION] =  "ipi call function",
+       [PPC_MSG_RESCHEDULE] = "ipi reschedule",
+       [PPC_MSG_CALL_FUNC_SINGLE] = "ipi call function single",
+       [PPC_MSG_DEBUGGER_BREAK] = "ipi debugger",
+ };
+ /* optional function to request ipi, for controllers with >= 4 ipis */
+ int smp_request_message_ipi(int virq, int msg)
+ {
+       int err;
+       if (msg < 0 || msg > PPC_MSG_DEBUGGER_BREAK) {
+               return -EINVAL;
+       }
+ #if !defined(CONFIG_DEBUGGER) && !defined(CONFIG_KEXEC)
+       if (msg == PPC_MSG_DEBUGGER_BREAK) {
+               return 1;
+       }
+ #endif
+       err = request_irq(virq, smp_ipi_action[msg], IRQF_DISABLED|IRQF_PERCPU,
+                         smp_ipi_name[msg], 0);
+       WARN(err < 0, "unable to request_irq %d for %s (rc %d)\n",
+               virq, smp_ipi_name[msg], err);
+       return err;
+ }
  void smp_send_reschedule(int cpu)
  {
        if (likely(smp_ops))
@@@ -404,8 -466,7 +462,7 @@@ out
  static struct device_node *cpu_to_l2cache(int cpu)
  {
        struct device_node *np;
-       const phandle *php;
-       phandle ph;
+       struct device_node *cache;
  
        if (!cpu_present(cpu))
                return NULL;
        if (np == NULL)
                return NULL;
  
-       php = of_get_property(np, "l2-cache", NULL);
-       if (php == NULL)
-               return NULL;
-       ph = *php;
+       cache = of_find_next_cache_node(np);
        of_node_put(np);
  
-       return of_find_node_by_phandle(ph);
+       return cache;
  }
  
  /* Activate a secondary processor. */
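
Note on the new helper: smp_request_message_ipi() above is, per its comment, only meant for controllers that expose at least four hardware IPIs (the mpic conversion further down uses it). A controller with a single IPI can keep demultiplexing the PPC_MSG_* messages itself and feed each one to smp_message_recv(); a hypothetical sketch of that case, where my_read_and_ack_ipi_bits() is a made-up stand-in for whatever the controller provides:

static irqreturn_t single_ipi_demux_action(int irq, void *data)
{
	/* my_read_and_ack_ipi_bits() is a made-up stand-in that fetches and
	 * clears the controller's pending IPI message bits. */
	unsigned long bits = my_read_and_ack_ipi_bits();
	int msg;

	for (msg = 0; msg <= PPC_MSG_DEBUGGER_BREAK; msg++)
		if (bits & (1UL << msg))
			smp_message_recv(msg);
	return IRQ_HANDLED;
}
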
diff --combined arch/powerpc/kernel/time.c
@@@ -164,8 -164,6 +164,6 @@@ static u64 tb_to_ns_scale __read_mostly
  static unsigned tb_to_ns_shift __read_mostly;
  static unsigned long boot_tb __read_mostly;
  
- static struct gettimeofday_struct do_gtod;
  extern struct timezone sys_tz;
  static long timezone_offset;
  
@@@ -415,31 -413,9 +413,9 @@@ void udelay(unsigned long usecs
  }
  EXPORT_SYMBOL(udelay);
  
  static inline void update_gtod(u64 new_tb_stamp, u64 new_stamp_xsec,
                               u64 new_tb_to_xs)
  {
-       unsigned temp_idx;
-       struct gettimeofday_vars *temp_varp;
-       temp_idx = (do_gtod.var_idx == 0);
-       temp_varp = &do_gtod.vars[temp_idx];
-       temp_varp->tb_to_xs = new_tb_to_xs;
-       temp_varp->tb_orig_stamp = new_tb_stamp;
-       temp_varp->stamp_xsec = new_stamp_xsec;
-       smp_mb();
-       do_gtod.varp = temp_varp;
-       do_gtod.var_idx = temp_idx;
        /*
         * tb_update_count is used to allow the userspace gettimeofday code
         * to assure itself that it sees a consistent view of the tb_to_xs and
        vdso_data->tb_to_xs = new_tb_to_xs;
        vdso_data->wtom_clock_sec = wall_to_monotonic.tv_sec;
        vdso_data->wtom_clock_nsec = wall_to_monotonic.tv_nsec;
+       vdso_data->stamp_xtime = xtime;
        smp_wmb();
        ++(vdso_data->tb_update_count);
  }
@@@ -514,9 -491,7 +491,7 @@@ static int __init iSeries_tb_recal(void
                                tb_ticks_per_sec   = new_tb_ticks_per_sec;
                                calc_cputime_factors();
                                div128_by_32( XSEC_PER_SEC, 0, tb_ticks_per_sec, &divres );
-                               do_gtod.tb_ticks_per_sec = tb_ticks_per_sec;
                                tb_to_xs = divres.result_low;
-                               do_gtod.varp->tb_to_xs = tb_to_xs;
                                vdso_data->tb_ticks_per_sec = tb_ticks_per_sec;
                                vdso_data->tb_to_xs = tb_to_xs;
                        }
@@@ -869,7 -844,7 +844,7 @@@ static void register_decrementer_clocke
        struct clock_event_device *dec = &per_cpu(decrementers, cpu).event;
  
        *dec = decrementer_clockevent;
 -      dec->cpumask = cpumask_of_cpu(cpu);
 +      dec->cpumask = cpumask_of(cpu);
  
        printk(KERN_DEBUG "clockevent: %s mult[%lx] shift[%d] cpu[%d]\n",
               dec->name, dec->mult, dec->shift, cpu);
@@@ -988,15 -963,6 +963,6 @@@ void __init time_init(void
                sys_tz.tz_dsttime = 0;
          }
  
-       do_gtod.varp = &do_gtod.vars[0];
-       do_gtod.var_idx = 0;
-       do_gtod.varp->tb_orig_stamp = tb_last_jiffy;
-       __get_cpu_var(last_jiffy) = tb_last_jiffy;
-       do_gtod.varp->stamp_xsec = (u64) xtime.tv_sec * XSEC_PER_SEC;
-       do_gtod.tb_ticks_per_sec = tb_ticks_per_sec;
-       do_gtod.varp->tb_to_xs = tb_to_xs;
-       do_gtod.tb_to_us = tb_to_us;
        vdso_data->tb_orig_stamp = tb_last_jiffy;
        vdso_data->tb_update_count = 0;
        vdso_data->tb_ticks_per_sec = tb_ticks_per_sec;
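
The tb_update_count comment above describes a seqcount-style handshake: the writer bumps the counter, publishes the new vdso_data fields, then bumps it again (the s390 update_vsyscall() later in this merge shows both increments explicitly), so the userspace gettimeofday code retries whenever the counter is odd or changes across its reads. A minimal user-space sketch of that reader side, assuming an illustrative mirror struct rather than the real vDSO layout:

#include <stdatomic.h>
#include <stdint.h>

/* Illustrative stand-in for the few vdso_data fields used here. */
struct vdso_snapshot {
	_Atomic uint32_t tb_update_count;   /* odd while an update is in flight */
	uint64_t stamp_sec;
	uint64_t stamp_nsec;
};

/* Retry until the counter is even and unchanged around the data reads. */
static void read_stamp(const struct vdso_snapshot *v,
		       uint64_t *sec, uint64_t *nsec)
{
	uint32_t seq;

	do {
		seq = atomic_load_explicit(&v->tb_update_count,
					   memory_order_acquire);
		*sec = v->stamp_sec;
		*nsec = v->stamp_nsec;
		atomic_thread_fence(memory_order_acquire);
	} while ((seq & 1) ||
		 seq != atomic_load_explicit(&v->tb_update_count,
					     memory_order_relaxed));
}
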
diff --combined arch/powerpc/platforms/pseries/xics.c
@@@ -332,7 -332,7 +332,7 @@@ static void xics_eoi_lpar(unsigned int 
        lpar_xirr_info_set((0xff << 24) | irq);
  }
  
 -static void xics_set_affinity(unsigned int virq, cpumask_t cpumask)
 +static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
  {
        unsigned int irq;
        int status;
@@@ -579,7 -579,7 +579,7 @@@ static void xics_update_irq_servers(voi
        int i, j;
        struct device_node *np;
        u32 ilen;
-       const u32 *ireg, *isize;
+       const u32 *ireg;
        u32 hcpuid;
  
        /* Find the server numbers for the boot cpu. */
                }
        }
  
-       /* get the bit size of server numbers */
-       isize = of_get_property(np, "ibm,interrupt-server#-size", NULL);
-       if (isize)
-               interrupt_server_size = *isize;
        of_node_put(np);
  }
  
@@@ -682,6 -677,7 +677,7 @@@ void __init xics_init_IRQ(void
        struct device_node *np;
        u32 indx = 0;
        int found = 0;
+       const u32 *isize;
  
        ppc64_boot_msg(0x20, "XICS Init");
  
        if (found == 0)
                return;
  
+       /* get the bit size of server numbers */
+       found = 0;
+       for_each_compatible_node(np, NULL, "ibm,ppc-xics") {
+               isize = of_get_property(np, "ibm,interrupt-server#-size", NULL);
+               if (!isize)
+                       continue;
+               if (!found) {
+                       interrupt_server_size = *isize;
+                       found = 1;
+               } else if (*isize != interrupt_server_size) {
+                       printk(KERN_WARNING "XICS: "
+                              "mismatched ibm,interrupt-server#-size\n");
+                       interrupt_server_size = max(*isize,
+                                                   interrupt_server_size);
+               }
+       }
        xics_update_irq_servers();
        xics_init_host();
  
@@@ -728,9 -744,18 +744,18 @@@ static void xics_set_cpu_priority(unsig
  /* Have the calling processor join or leave the specified global queue */
  static void xics_set_cpu_giq(unsigned int gserver, unsigned int join)
  {
-       int status = rtas_set_indicator_fast(GLOBAL_INTERRUPT_QUEUE,
-               (1UL << interrupt_server_size) - 1 - gserver, join);
-       WARN_ON(status < 0);
+       int index;
+       int status;
+       if (!rtas_indicator_present(GLOBAL_INTERRUPT_QUEUE, NULL))
+               return;
+       index = (1UL << interrupt_server_size) - 1 - gserver;
+       status = rtas_set_indicator_fast(GLOBAL_INTERRUPT_QUEUE, index, join);
+       WARN(status < 0, "set-indicator(%d, %d, %u) returned %d\n",
+            GLOBAL_INTERRUPT_QUEUE, index, join, status);
  }
  
  void xics_setup_cpu(void)
@@@ -845,7 -870,7 +870,7 @@@ void xics_migrate_irqs_away(void
  
                /* Reset affinity to all cpus */
                irq_desc[virq].affinity = CPU_MASK_ALL;
 -              desc->chip->set_affinity(virq, CPU_MASK_ALL);
 +              desc->chip->set_affinity(virq, cpu_all_mask);
  unlock:
                spin_unlock_irqrestore(&desc->lock, flags);
        }
diff --combined arch/powerpc/sysdev/mpic.c
@@@ -661,17 -661,6 +661,6 @@@ static inline void mpic_eoi(struct mpi
        (void)mpic_cpu_read(MPIC_INFO(CPU_WHOAMI));
  }
  
- #ifdef CONFIG_SMP
- static irqreturn_t mpic_ipi_action(int irq, void *data)
- {
-       long ipi = (long)data;
-       smp_message_recv(ipi);
-       return IRQ_HANDLED;
- }
- #endif /* CONFIG_SMP */
  /*
   * Linux descriptor level callbacks
   */
@@@ -817,7 -806,7 +806,7 @@@ static void mpic_end_ipi(unsigned int i
  
  #endif /* CONFIG_SMP */
  
 -void mpic_set_affinity(unsigned int irq, cpumask_t cpumask)
 +void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
  {
        struct mpic *mpic = mpic_from_irq(irq);
        unsigned int src = mpic_irq_to_hw(irq);
        } else {
                cpumask_t tmp;
  
 -              cpus_and(tmp, cpumask, cpu_online_map);
 +              cpumask_and(&tmp, cpumask, cpu_online_mask);
  
                mpic_irq_write(src, MPIC_INFO(IRQ_DESTINATION),
                               mpic_physmask(cpus_addr(tmp)[0]));
@@@ -1548,13 -1537,7 +1537,7 @@@ unsigned int mpic_get_mcirq(void
  void mpic_request_ipis(void)
  {
        struct mpic *mpic = mpic_primary;
-       long i, err;
-       static char *ipi_names[] = {
-               "IPI0 (call function)",
-               "IPI1 (reschedule)",
-               "IPI2 (call function single)",
-               "IPI3 (debugger break)",
-       };
+       int i;
        BUG_ON(mpic == NULL);
  
        printk(KERN_INFO "mpic: requesting IPIs ... \n");
                unsigned int vipi = irq_create_mapping(mpic->irqhost,
                                                       mpic->ipi_vecs[0] + i);
                if (vipi == NO_IRQ) {
-                       printk(KERN_ERR "Failed to map IPI %ld\n", i);
-                       break;
-               }
-               err = request_irq(vipi, mpic_ipi_action,
-                                 IRQF_DISABLED|IRQF_PERCPU,
-                                 ipi_names[i], (void *)i);
-               if (err) {
-                       printk(KERN_ERR "Request of irq %d for IPI %ld failed\n",
-                              vipi, i);
-                       break;
+                       printk(KERN_ERR "Failed to map %s\n", smp_ipi_name[i]);
+                       continue;
                }
+               smp_request_message_ipi(vipi, i);
        }
  }
  
diff --combined arch/s390/Kconfig
@@@ -43,6 -43,9 +43,9 @@@ config GENERIC_HWEIGH
  config GENERIC_TIME
        def_bool y
  
+ config GENERIC_TIME_VSYSCALL
+       def_bool y
  config GENERIC_CLOCKEVENTS
        def_bool y
  
@@@ -66,16 -69,20 +69,21 @@@ config PGST
        bool
        default y if KVM
  
+ config VIRT_CPU_ACCOUNTING
+       def_bool y
  mainmenu "Linux Kernel Configuration"
  
  config S390
        def_bool y
+       select USE_GENERIC_SMP_HELPERS if SMP
+       select HAVE_FUNCTION_TRACER
        select HAVE_OPROFILE
        select HAVE_KPROBES
        select HAVE_KRETPROBES
        select HAVE_KVM if 64BIT
        select HAVE_ARCH_TRACEHOOK
 +      select INIT_ALL_POSSIBLE
  
  source "init/Kconfig"
  
@@@ -226,6 -233,14 +234,14 @@@ config MARCH_Z9_10
          Class (z9 BC). The kernel will be slightly faster but will not
          work on older machines such as the z990, z890, z900, and z800.
  
+ config MARCH_Z10
+       bool "IBM System z10"
+       help
+         Select this to enable optimizations for IBM System z10. The
+         kernel will be slightly faster but will not work on older
+         machines such as the z990, z890, z900, z800, z9-109, z9-ec
+         and z9-bc.
  endchoice
  
  config PACK_STACK
@@@ -344,16 -359,6 +360,6 @@@ config QDI
  
          If unsure, say Y.
  
- config QDIO_DEBUG
-       bool "Extended debugging information"
-       depends on QDIO
-       help
-         Say Y here to get extended debugging output in
-           /sys/kernel/debug/s390dbf/qdio...
-         Warning: this option reduces the performance of the QDIO module.
-         If unsure, say N.
  config CHSC_SCH
        tristate "Support for CHSC subchannels"
        help
@@@ -467,22 -472,9 +473,9 @@@ config PAGE_STATE
          hypervisor. The ESSA instruction is used to do the states
          changes between a page that has content and the unused state.
  
- config VIRT_TIMER
-       bool "Virtual CPU timer support"
-       help
-         This provides a kernel interface for virtual CPU timers.
-         Default is disabled.
- config VIRT_CPU_ACCOUNTING
-       bool "Base user process accounting on virtual cpu timer"
-       depends on VIRT_TIMER
-       help
-         Select this option to use CPU timer deltas to do user
-         process accounting.
  config APPLDATA_BASE
        bool "Linux - VM Monitor Stream, base infrastructure"
-       depends on PROC_FS && VIRT_TIMER=y
+       depends on PROC_FS
        help
          This provides a kernel interface for creating and updating z/VM APPLDATA
          monitor records. The monitor records are updated at certain time
diff --combined arch/s390/kernel/smp.c
@@@ -20,6 -20,9 +20,9 @@@
   * cpu_number_map in other architectures.
   */
  
+ #define KMSG_COMPONENT "cpu"
+ #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
  #include <linux/module.h>
  #include <linux/init.h>
  #include <linux/mm.h>
  struct _lowcore *lowcore_ptr[NR_CPUS];
  EXPORT_SYMBOL(lowcore_ptr);
  
 -cpumask_t cpu_online_map = CPU_MASK_NONE;
 -EXPORT_SYMBOL(cpu_online_map);
 -
 -cpumask_t cpu_possible_map = CPU_MASK_ALL;
 -EXPORT_SYMBOL(cpu_possible_map);
 -
  static struct task_struct *current_set[NR_CPUS];
  
  static u8 smp_cpu_type;
@@@ -71,159 -80,6 +74,6 @@@ static DEFINE_PER_CPU(struct cpu, cpu_d
  
  static void smp_ext_bitcall(int, ec_bit_sig);
  
- /*
-  * Structure and data for __smp_call_function_map(). This is designed to
-  * minimise static memory requirements. It also looks cleaner.
-  */
- static DEFINE_SPINLOCK(call_lock);
- struct call_data_struct {
-       void (*func) (void *info);
-       void *info;
-       cpumask_t started;
-       cpumask_t finished;
-       int wait;
- };
- static struct call_data_struct *call_data;
- /*
-  * 'Call function' interrupt callback
-  */
- static void do_call_function(void)
- {
-       void (*func) (void *info) = call_data->func;
-       void *info = call_data->info;
-       int wait = call_data->wait;
-       cpu_set(smp_processor_id(), call_data->started);
-       (*func)(info);
-       if (wait)
-               cpu_set(smp_processor_id(), call_data->finished);;
- }
- static void __smp_call_function_map(void (*func) (void *info), void *info,
-                                   int wait, cpumask_t map)
- {
-       struct call_data_struct data;
-       int cpu, local = 0;
-       /*
-        * Can deadlock when interrupts are disabled or if in wrong context.
-        */
-       WARN_ON(irqs_disabled() || in_irq());
-       /*
-        * Check for local function call. We have to have the same call order
-        * as in on_each_cpu() because of machine_restart_smp().
-        */
-       if (cpu_isset(smp_processor_id(), map)) {
-               local = 1;
-               cpu_clear(smp_processor_id(), map);
-       }
-       cpus_and(map, map, cpu_online_map);
-       if (cpus_empty(map))
-               goto out;
-       data.func = func;
-       data.info = info;
-       data.started = CPU_MASK_NONE;
-       data.wait = wait;
-       if (wait)
-               data.finished = CPU_MASK_NONE;
-       call_data = &data;
-       for_each_cpu_mask(cpu, map)
-               smp_ext_bitcall(cpu, ec_call_function);
-       /* Wait for response */
-       while (!cpus_equal(map, data.started))
-               cpu_relax();
-       if (wait)
-               while (!cpus_equal(map, data.finished))
-                       cpu_relax();
- out:
-       if (local) {
-               local_irq_disable();
-               func(info);
-               local_irq_enable();
-       }
- }
- /*
-  * smp_call_function:
-  * @func: the function to run; this must be fast and non-blocking
-  * @info: an arbitrary pointer to pass to the function
-  * @wait: if true, wait (atomically) until function has completed on other CPUs
-  *
-  * Run a function on all other CPUs.
-  *
-  * You must not call this function with disabled interrupts, from a
-  * hardware interrupt handler or from a bottom half.
-  */
- int smp_call_function(void (*func) (void *info), void *info, int wait)
- {
-       cpumask_t map;
-       spin_lock(&call_lock);
-       map = cpu_online_map;
-       cpu_clear(smp_processor_id(), map);
-       __smp_call_function_map(func, info, wait, map);
-       spin_unlock(&call_lock);
-       return 0;
- }
- EXPORT_SYMBOL(smp_call_function);
- /*
-  * smp_call_function_single:
-  * @cpu: the CPU where func should run
-  * @func: the function to run; this must be fast and non-blocking
-  * @info: an arbitrary pointer to pass to the function
-  * @wait: if true, wait (atomically) until function has completed on other CPUs
-  *
-  * Run a function on one processor.
-  *
-  * You must not call this function with disabled interrupts, from a
-  * hardware interrupt handler or from a bottom half.
-  */
- int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
-                            int wait)
- {
-       spin_lock(&call_lock);
-       __smp_call_function_map(func, info, wait, cpumask_of_cpu(cpu));
-       spin_unlock(&call_lock);
-       return 0;
- }
- EXPORT_SYMBOL(smp_call_function_single);
- /**
-  * smp_call_function_mask(): Run a function on a set of other CPUs.
-  * @mask: The set of cpus to run on.  Must not include the current cpu.
-  * @func: The function to run. This must be fast and non-blocking.
-  * @info: An arbitrary pointer to pass to the function.
-  * @wait: If true, wait (atomically) until function has completed on other CPUs.
-  *
-  * Returns 0 on success, else a negative status code.
-  *
-  * If @wait is true, then returns once @func has returned; otherwise
-  * it returns just before the target cpu calls @func.
-  *
-  * You must not call this function with disabled interrupts or from a
-  * hardware interrupt handler or from a bottom half handler.
-  */
- int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
-                          int wait)
- {
-       spin_lock(&call_lock);
-       cpu_clear(smp_processor_id(), mask);
-       __smp_call_function_map(func, info, wait, mask);
-       spin_unlock(&call_lock);
-       return 0;
- }
- EXPORT_SYMBOL(smp_call_function_mask);
  void smp_send_stop(void)
  {
        int cpu, rc;
@@@ -265,7 -121,10 +115,10 @@@ static void do_ext_call_interrupt(__u1
        bits = xchg(&S390_lowcore.ext_call_fast, 0);
  
        if (test_bit(ec_call_function, &bits))
-               do_call_function();
+               generic_smp_call_function_interrupt();
+       if (test_bit(ec_call_function_single, &bits))
+               generic_smp_call_function_single_interrupt();
  }
  
  /*
@@@ -282,6 -141,19 +135,19 @@@ static void smp_ext_bitcall(int cpu, ec
                udelay(10);
  }
  
+ void arch_send_call_function_ipi(cpumask_t mask)
+ {
+       int cpu;
+       for_each_cpu_mask(cpu, mask)
+               smp_ext_bitcall(cpu, ec_call_function);
+ }
+ void arch_send_call_function_single_ipi(int cpu)
+ {
+       smp_ext_bitcall(cpu, ec_call_function_single);
+ }
  #ifndef CONFIG_64BIT
  /*
   * this function sends a 'purge tlb' signal to another CPU.
@@@ -382,8 -254,8 +248,8 @@@ static void __init smp_get_save_area(un
        if (ipl_info.type != IPL_TYPE_FCP_DUMP)
                return;
        if (cpu >= NR_CPUS) {
-               printk(KERN_WARNING "Registers for cpu %i not saved since dump "
-                      "kernel was compiled with NR_CPUS=%i\n", cpu, NR_CPUS);
+               pr_warning("CPU %i exceeds the maximum %i and is excluded from "
+                          "the dump\n", cpu, NR_CPUS - 1);
                return;
        }
        zfcpdump_save_areas[cpu] = kmalloc(sizeof(union save_area), GFP_KERNEL);
@@@ -556,7 -428,7 +422,7 @@@ static void __init smp_detect_cpus(void
        }
  out:
        kfree(info);
-       printk(KERN_INFO "CPUs: %d configured, %d standby\n", c_cpus, s_cpus);
+       pr_info("%d configured CPUs, %d standby CPUs\n", c_cpus, s_cpus);
        get_online_cpus();
        __smp_rescan_cpus();
        put_online_cpus();
@@@ -572,19 -444,17 +438,17 @@@ int __cpuinit start_secondary(void *cpu
        preempt_disable();
        /* Enable TOD clock interrupts on the secondary cpu. */
        init_cpu_timer();
- #ifdef CONFIG_VIRT_TIMER
        /* Enable cpu timer interrupts on the secondary cpu. */
        init_cpu_vtimer();
- #endif
        /* Enable pfault pseudo page faults on this cpu. */
        pfault_init();
  
        /* call cpu notifiers */
        notify_cpu_starting(smp_processor_id());
        /* Mark this cpu as online */
-       spin_lock(&call_lock);
+       ipi_call_lock();
        cpu_set(smp_processor_id(), cpu_online_map);
-       spin_unlock(&call_lock);
+       ipi_call_unlock();
        /* Switch on interrupts */
        local_irq_enable();
        /* Print info about this processor */
@@@ -633,18 -503,15 +497,15 @@@ static int __cpuinit smp_alloc_lowcore(
  
                save_area = get_zeroed_page(GFP_KERNEL);
                if (!save_area)
-                       goto out_save_area;
+                       goto out;
                lowcore->extended_save_area_addr = (u32) save_area;
        }
  #endif
        lowcore_ptr[cpu] = lowcore;
        return 0;
  
- #ifndef CONFIG_64BIT
- out_save_area:
-       free_page(panic_stack);
- #endif
  out:
+       free_page(panic_stack);
        free_pages(async_stack, ASYNC_ORDER);
        free_pages((unsigned long) lowcore, lc_order);
        return -ENOMEM;
@@@ -684,12 -551,8 +545,8 @@@ int __cpuinit __cpu_up(unsigned int cpu
  
        ccode = signal_processor_p((__u32)(unsigned long)(lowcore_ptr[cpu]),
                                   cpu, sigp_set_prefix);
-       if (ccode) {
-               printk("sigp_set_prefix failed for cpu %d "
-                      "with condition code %d\n",
-                      (int) cpu, (int) ccode);
+       if (ccode)
                return -EIO;
-       }
  
        idle = current_set[cpu];
        cpu_lowcore = lowcore_ptr[cpu];
@@@ -772,7 -635,7 +629,7 @@@ void __cpu_die(unsigned int cpu
        while (!smp_cpu_not_running(cpu))
                cpu_relax();
        smp_free_lowcore(cpu);
-       printk(KERN_INFO "Processor %d spun down\n", cpu);
+       pr_info("Processor %d stopped\n", cpu);
  }
  
  void cpu_die(void)
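
With the open-coded s390 smp_call_function*() variants deleted above, those entry points now come from the generic helpers in kernel/smp.c (enabled by the new "select USE_GENERIC_SMP_HELPERS if SMP"); the architecture only supplies arch_send_call_function_ipi()/arch_send_call_function_single_ipi() and forwards the IPIs to generic_smp_call_function_interrupt(), exactly as the hunk shows. A brief caller-side sketch of the resulting API, with an illustrative callback and CPU number:

#include <linux/smp.h>

static void poke_remote(void *info)
{
	/* Runs on the target CPU in interrupt context; keep it short. */
}

static void example_cross_call(void)
{
	/* Run on one specific CPU and wait until it has completed. */
	smp_call_function_single(1, poke_remote, NULL, 1);

	/* Run on all other online CPUs and wait for them. */
	smp_call_function(poke_remote, NULL, 1);
}
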
diff --combined arch/s390/kernel/time.c
@@@ -12,6 -12,9 +12,9 @@@
   *    Copyright (C) 1991, 1992, 1995  Linus Torvalds
   */
  
+ #define KMSG_COMPONENT "time"
+ #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
  #include <linux/errno.h>
  #include <linux/module.h>
  #include <linux/sched.h>
@@@ -20,6 -23,8 +23,8 @@@
  #include <linux/string.h>
  #include <linux/mm.h>
  #include <linux/interrupt.h>
+ #include <linux/cpu.h>
+ #include <linux/stop_machine.h>
  #include <linux/time.h>
  #include <linux/sysdev.h>
  #include <linux/delay.h>
@@@ -36,6 -41,7 +41,7 @@@
  #include <asm/delay.h>
  #include <asm/s390_ext.h>
  #include <asm/div64.h>
+ #include <asm/vdso.h>
  #include <asm/irq.h>
  #include <asm/irq_regs.h>
  #include <asm/timer.h>
@@@ -154,7 -160,7 +160,7 @@@ void init_cpu_timer(void
        cd->min_delta_ns        = 1;
        cd->max_delta_ns        = LONG_MAX;
        cd->rating              = 400;
 -      cd->cpumask             = cpumask_of_cpu(cpu);
 +      cd->cpumask             = cpumask_of(cpu);
        cd->set_next_event      = s390_next_event;
        cd->set_mode            = s390_set_mode;
  
@@@ -223,6 -229,36 +229,36 @@@ static struct clocksource clocksource_t
  };
  
  
+ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
+ {
+       if (clock != &clocksource_tod)
+               return;
+       /* Make userspace gettimeofday spin until we're done. */
+       ++vdso_data->tb_update_count;
+       smp_wmb();
+       vdso_data->xtime_tod_stamp = clock->cycle_last;
+       vdso_data->xtime_clock_sec = xtime.tv_sec;
+       vdso_data->xtime_clock_nsec = xtime.tv_nsec;
+       vdso_data->wtom_clock_sec = wall_to_monotonic.tv_sec;
+       vdso_data->wtom_clock_nsec = wall_to_monotonic.tv_nsec;
+       smp_wmb();
+       ++vdso_data->tb_update_count;
+ }
+ extern struct timezone sys_tz;
+ void update_vsyscall_tz(void)
+ {
+       /* Make userspace gettimeofday spin until we're done. */
+       ++vdso_data->tb_update_count;
+       smp_wmb();
+       vdso_data->tz_minuteswest = sys_tz.tz_minuteswest;
+       vdso_data->tz_dsttime = sys_tz.tz_dsttime;
+       smp_wmb();
+       ++vdso_data->tb_update_count;
+ }
  /*
   * Initialize the TOD clock and the CPU timer of
   * the boot cpu.
@@@ -253,10 -289,8 +289,8 @@@ void __init time_init(void
  
        /* Enable TOD clock interrupts on the boot cpu. */
        init_cpu_timer();
- #ifdef CONFIG_VIRT_TIMER
+       /* Enable cpu timer interrupts on the boot cpu. */
        vtime_init();
- #endif
  }
  
  /*
@@@ -288,8 -322,8 +322,8 @@@ static unsigned long long adjust_time(u
        }
        sched_clock_base_cc += delta;
        if (adjust.offset != 0) {
-               printk(KERN_NOTICE "etr: time adjusted by %li micro-seconds\n",
-                      adjust.offset);
+               pr_notice("The ETR interface has adjusted the clock "
+                         "by %li microseconds\n", adjust.offset);
                adjust.modes = ADJ_OFFSET_SINGLESHOT;
                do_adjtimex(&adjust);
        }
@@@ -360,6 -394,15 +394,15 @@@ static void enable_sync_clock(void
        atomic_set_mask(0x80000000, sw_ptr);
  }
  
+ /* Single threaded workqueue used for etr and stp sync events */
+ static struct workqueue_struct *time_sync_wq;
+ static void __init time_init_wq(void)
+ {
+       if (!time_sync_wq)
+               time_sync_wq = create_singlethread_workqueue("timesync");
+ }
  /*
   * External Time Reference (ETR) code.
   */
@@@ -425,6 -468,7 +468,7 @@@ static struct timer_list etr_timer
  
  static void etr_timeout(unsigned long dummy);
  static void etr_work_fn(struct work_struct *work);
+ static DEFINE_MUTEX(etr_work_mutex);
  static DECLARE_WORK(etr_work, etr_work_fn);
  
  /*
@@@ -440,8 -484,8 +484,8 @@@ static void etr_reset(void
                etr_tolec = get_clock();
                set_bit(CLOCK_SYNC_HAS_ETR, &clock_sync_flags);
        } else if (etr_port0_online || etr_port1_online) {
-               printk(KERN_WARNING "Running on non ETR capable "
-                      "machine, only local mode available.\n");
+               pr_warning("The real or virtual hardware system does "
+                          "not provide an ETR interface\n");
                etr_port0_online = etr_port1_online = 0;
        }
  }
@@@ -452,17 -496,18 +496,18 @@@ static int __init etr_init(void
  
        if (!test_bit(CLOCK_SYNC_HAS_ETR, &clock_sync_flags))
                return 0;
+       time_init_wq();
        /* Check if this machine has the steai instruction. */
        if (etr_steai(&aib, ETR_STEAI_STEPPING_PORT) == 0)
                etr_steai_available = 1;
        setup_timer(&etr_timer, etr_timeout, 0UL);
        if (etr_port0_online) {
                set_bit(ETR_EVENT_PORT0_CHANGE, &etr_events);
-               schedule_work(&etr_work);
+               queue_work(time_sync_wq, &etr_work);
        }
        if (etr_port1_online) {
                set_bit(ETR_EVENT_PORT1_CHANGE, &etr_events);
-               schedule_work(&etr_work);
+               queue_work(time_sync_wq, &etr_work);
        }
        return 0;
  }
@@@ -489,7 -534,7 +534,7 @@@ void etr_switch_to_local(void
        if (test_bit(CLOCK_SYNC_ETR, &clock_sync_flags))
                disable_sync_clock(NULL);
        set_bit(ETR_EVENT_SWITCH_LOCAL, &etr_events);
-       schedule_work(&etr_work);
+       queue_work(time_sync_wq, &etr_work);
  }
  
  /*
@@@ -505,7 -550,7 +550,7 @@@ void etr_sync_check(void
        if (test_bit(CLOCK_SYNC_ETR, &clock_sync_flags))
                disable_sync_clock(NULL);
        set_bit(ETR_EVENT_SYNC_CHECK, &etr_events);
-       schedule_work(&etr_work);
+       queue_work(time_sync_wq, &etr_work);
  }
  
  /*
@@@ -529,13 -574,13 +574,13 @@@ static void etr_timing_alert(struct etr
                 * Both ports are not up-to-date now.
                 */
                set_bit(ETR_EVENT_PORT_ALERT, &etr_events);
-       schedule_work(&etr_work);
+       queue_work(time_sync_wq, &etr_work);
  }
  
  static void etr_timeout(unsigned long dummy)
  {
        set_bit(ETR_EVENT_UPDATE, &etr_events);
-       schedule_work(&etr_work);
+       queue_work(time_sync_wq, &etr_work);
  }
  
  /*
@@@ -642,14 -687,16 +687,16 @@@ static int etr_aib_follows(struct etr_a
  }
  
  struct clock_sync_data {
+       atomic_t cpus;
        int in_sync;
        unsigned long long fixup_cc;
+       int etr_port;
+       struct etr_aib *etr_aib;
  };
  
- static void clock_sync_cpu_start(void *dummy)
+ static void clock_sync_cpu(struct clock_sync_data *sync)
  {
-       struct clock_sync_data *sync = dummy;
+       atomic_dec(&sync->cpus);
        enable_sync_clock();
        /*
         * This looks like a busy wait loop but it isn't. etr_sync_cpus
        fixup_clock_comparator(sync->fixup_cc);
  }
  
- static void clock_sync_cpu_end(void *dummy)
- {
- }
  /*
   * Sync the TOD clock using the port refered to by aibp. This port
   * has to be enabled and the other port has to be disabled. The
   * last eacr update has to be more than 1.6 seconds in the past.
   */
- static int etr_sync_clock(struct etr_aib *aib, int port)
+ static int etr_sync_clock(void *data)
  {
-       struct etr_aib *sync_port;
-       struct clock_sync_data etr_sync;
+       static int first;
        unsigned long long clock, old_clock, delay, delta;
-       int follows;
+       struct clock_sync_data *etr_sync;
+       struct etr_aib *sync_port, *aib;
+       int port;
        int rc;
  
-       /* Check if the current aib is adjacent to the sync port aib. */
-       sync_port = (port == 0) ? &etr_port0 : &etr_port1;
-       follows = etr_aib_follows(sync_port, aib, port);
-       memcpy(sync_port, aib, sizeof(*aib));
-       if (!follows)
-               return -EAGAIN;
+       etr_sync = data;
  
-       /*
-        * Catch all other cpus and make them wait until we have
-        * successfully synced the clock. smp_call_function will
-        * return after all other cpus are in etr_sync_cpu_start.
-        */
-       memset(&etr_sync, 0, sizeof(etr_sync));
-       preempt_disable();
-       smp_call_function(clock_sync_cpu_start, &etr_sync, 0);
-       local_irq_disable();
+       if (xchg(&first, 1) == 1) {
+               /* Slave */
+               clock_sync_cpu(etr_sync);
+               return 0;
+       }
+       /* Wait until all other cpus entered the sync function. */
+       while (atomic_read(&etr_sync->cpus) != 0)
+               cpu_relax();
+       port = etr_sync->etr_port;
+       aib = etr_sync->etr_aib;
+       sync_port = (port == 0) ? &etr_port0 : &etr_port1;
        enable_sync_clock();
  
        /* Set clock to next OTE. */
                delay = (unsigned long long)
                        (aib->edf2.etv - sync_port->edf2.etv) << 32;
                delta = adjust_time(old_clock, clock, delay);
-               etr_sync.fixup_cc = delta;
+               etr_sync->fixup_cc = delta;
                fixup_clock_comparator(delta);
                /* Verify that the clock is properly set. */
                if (!etr_aib_follows(sync_port, aib, port)) {
                        /* Didn't work. */
                        disable_sync_clock(NULL);
-                       etr_sync.in_sync = -EAGAIN;
+                       etr_sync->in_sync = -EAGAIN;
                        rc = -EAGAIN;
                } else {
-                       etr_sync.in_sync = 1;
+                       etr_sync->in_sync = 1;
                        rc = 0;
                }
        } else {
                __ctl_clear_bit(0, 29);
                __ctl_clear_bit(14, 21);
                disable_sync_clock(NULL);
-               etr_sync.in_sync = -EAGAIN;
+               etr_sync->in_sync = -EAGAIN;
                rc = -EAGAIN;
        }
-       local_irq_enable();
-       smp_call_function(clock_sync_cpu_end, NULL, 0);
-       preempt_enable();
+       xchg(&first, 0);
+       return rc;
+ }
+ static int etr_sync_clock_stop(struct etr_aib *aib, int port)
+ {
+       struct clock_sync_data etr_sync;
+       struct etr_aib *sync_port;
+       int follows;
+       int rc;
+       /* Check if the current aib is adjacent to the sync port aib. */
+       sync_port = (port == 0) ? &etr_port0 : &etr_port1;
+       follows = etr_aib_follows(sync_port, aib, port);
+       memcpy(sync_port, aib, sizeof(*aib));
+       if (!follows)
+               return -EAGAIN;
+       memset(&etr_sync, 0, sizeof(etr_sync));
+       etr_sync.etr_aib = aib;
+       etr_sync.etr_port = port;
+       get_online_cpus();
+       atomic_set(&etr_sync.cpus, num_online_cpus() - 1);
+       rc = stop_machine(etr_sync_clock, &etr_sync, &cpu_online_map);
+       put_online_cpus();
        return rc;
  }
  
@@@ -903,7 -967,7 +967,7 @@@ static void etr_update_eacr(struct etr_
  }
  
  /*
-  * ETR tasklet. In this function you'll find the main logic. In
+  * ETR work. In this function you'll find the main logic. In
   * particular this is the only function that calls etr_update_eacr(),
   * it "controls" the etr control register.
   */
@@@ -914,6 -978,9 +978,9 @@@ static void etr_work_fn(struct work_str
        struct etr_aib aib;
        int sync_port;
  
+       /* prevent multiple execution. */
+       mutex_lock(&etr_work_mutex);
        /* Create working copy of etr_eacr. */
        eacr = etr_eacr;
  
                del_timer_sync(&etr_timer);
                etr_update_eacr(eacr);
                clear_bit(CLOCK_SYNC_ETR, &clock_sync_flags);
-               return;
+               goto out_unlock;
        }
  
        /* Store aib to get the current ETR status word. */
            eacr.es || sync_port < 0) {
                etr_update_eacr(eacr);
                etr_set_tolec_timeout(now);
-               return;
+               goto out_unlock;
        }
  
        /*
        etr_update_eacr(eacr);
        set_bit(CLOCK_SYNC_ETR, &clock_sync_flags);
        if (now < etr_tolec + (1600000 << 12) ||
-           etr_sync_clock(&aib, sync_port) != 0) {
+           etr_sync_clock_stop(&aib, sync_port) != 0) {
                /* Sync failed. Try again in 1/2 second. */
                eacr.es = 0;
                etr_update_eacr(eacr);
                etr_set_sync_timeout();
        } else
                etr_set_tolec_timeout(now);
+ out_unlock:
+       mutex_unlock(&etr_work_mutex);
  }
  
  /*
@@@ -1125,13 -1194,13 +1194,13 @@@ static ssize_t etr_online_store(struct 
                        return count;   /* Nothing to do. */
                etr_port0_online = value;
                set_bit(ETR_EVENT_PORT0_CHANGE, &etr_events);
-               schedule_work(&etr_work);
+               queue_work(time_sync_wq, &etr_work);
        } else {
                if (etr_port1_online == value)
                        return count;   /* Nothing to do. */
                etr_port1_online = value;
                set_bit(ETR_EVENT_PORT1_CHANGE, &etr_events);
-               schedule_work(&etr_work);
+               queue_work(time_sync_wq, &etr_work);
        }
        return count;
  }
@@@ -1332,6 -1401,7 +1401,7 @@@ static struct stp_sstpi stp_info
  static void *stp_page;
  
  static void stp_work_fn(struct work_struct *work);
+ static DEFINE_MUTEX(stp_work_mutex);
  static DECLARE_WORK(stp_work, stp_work_fn);
  
  static int __init early_parse_stp(char *p)
@@@ -1356,7 -1426,8 +1426,8 @@@ static void __init stp_reset(void
        if (rc == 0)
                set_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags);
        else if (stp_online) {
-               printk(KERN_WARNING "Running on non STP capable machine.\n");
+               pr_warning("The real or virtual hardware system does "
+                          "not provide an STP interface\n");
                free_bootmem((unsigned long) stp_page, PAGE_SIZE);
                stp_page = NULL;
                stp_online = 0;
  
  static int __init stp_init(void)
  {
-       if (test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags) && stp_online)
-               schedule_work(&stp_work);
+       if (!test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags))
+               return 0;
+       time_init_wq();
+       if (!stp_online)
+               return 0;
+       queue_work(time_sync_wq, &stp_work);
        return 0;
  }
  
@@@ -1383,7 -1458,7 +1458,7 @@@ arch_initcall(stp_init)
  static void stp_timing_alert(struct stp_irq_parm *intparm)
  {
        if (intparm->tsc || intparm->lac || intparm->tcpc)
-               schedule_work(&stp_work);
+               queue_work(time_sync_wq, &stp_work);
  }
  
  /*
@@@ -1397,7 -1472,7 +1472,7 @@@ void stp_sync_check(void
        if (!test_bit(CLOCK_SYNC_STP, &clock_sync_flags))
                return;
        disable_sync_clock(NULL);
-       schedule_work(&stp_work);
+       queue_work(time_sync_wq, &stp_work);
  }
  
  /*
@@@ -1411,46 -1486,34 +1486,34 @@@ void stp_island_check(void
        if (!test_bit(CLOCK_SYNC_STP, &clock_sync_flags))
                return;
        disable_sync_clock(NULL);
-       schedule_work(&stp_work);
+       queue_work(time_sync_wq, &stp_work);
  }
  
- /*
-  * STP tasklet. Check for the STP state and take over the clock
-  * synchronization if the STP clock source is usable.
-  */
- static void stp_work_fn(struct work_struct *work)
+ static int stp_sync_clock(void *data)
  {
-       struct clock_sync_data stp_sync;
+       static int first;
        unsigned long long old_clock, delta;
+       struct clock_sync_data *stp_sync;
        int rc;
  
-       if (!stp_online) {
-               chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000);
-               return;
-       }
+       stp_sync = data;
  
-       rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0xb0e0);
-       if (rc)
-               return;
+       if (xchg(&first, 1) == 1) {
+               /* Slave */
+               clock_sync_cpu(stp_sync);
+               return 0;
+       }
  
-       rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi));
-       if (rc || stp_info.c == 0)
-               return;
+       /* Wait until all other cpus entered the sync function. */
+       while (atomic_read(&stp_sync->cpus) != 0)
+               cpu_relax();
  
-       /*
-        * Catch all other cpus and make them wait until we have
-        * successfully synced the clock. smp_call_function will
-        * return after all other cpus are in clock_sync_cpu_start.
-        */
-       memset(&stp_sync, 0, sizeof(stp_sync));
-       preempt_disable();
-       smp_call_function(clock_sync_cpu_start, &stp_sync, 0);
-       local_irq_disable();
        enable_sync_clock();
  
        set_bit(CLOCK_SYNC_STP, &clock_sync_flags);
        if (test_and_clear_bit(CLOCK_SYNC_ETR, &clock_sync_flags))
-               schedule_work(&etr_work);
+               queue_work(time_sync_wq, &etr_work);
  
        rc = 0;
        if (stp_info.todoff[0] || stp_info.todoff[1] ||
        }
        if (rc) {
                disable_sync_clock(NULL);
-               stp_sync.in_sync = -EAGAIN;
+               stp_sync->in_sync = -EAGAIN;
                clear_bit(CLOCK_SYNC_STP, &clock_sync_flags);
                if (etr_port0_online || etr_port1_online)
-                       schedule_work(&etr_work);
+                       queue_work(time_sync_wq, &etr_work);
        } else
-               stp_sync.in_sync = 1;
+               stp_sync->in_sync = 1;
+       xchg(&first, 0);
+       return 0;
+ }
+ /*
+  * STP work. Check for the STP state and take over the clock
+  * synchronization if the STP clock source is usable.
+  */
+ static void stp_work_fn(struct work_struct *work)
+ {
+       struct clock_sync_data stp_sync;
+       int rc;
+       /* prevent multiple execution. */
+       mutex_lock(&stp_work_mutex);
+       if (!stp_online) {
+               chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000);
+               goto out_unlock;
+       }
+       rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0xb0e0);
+       if (rc)
+               goto out_unlock;
+       rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi));
+       if (rc || stp_info.c == 0)
+               goto out_unlock;
+       memset(&stp_sync, 0, sizeof(stp_sync));
+       get_online_cpus();
+       atomic_set(&stp_sync.cpus, num_online_cpus() - 1);
+       stop_machine(stp_sync_clock, &stp_sync, &cpu_online_map);
+       put_online_cpus();
  
-       local_irq_enable();
-       smp_call_function(clock_sync_cpu_end, NULL, 0);
-       preempt_enable();
+ out_unlock:
+       mutex_unlock(&stp_work_mutex);
  }
  
  /*
@@@ -1587,7 -1683,7 +1683,7 @@@ static ssize_t stp_online_store(struct 
        if (!test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags))
                return -EOPNOTSUPP;
        stp_online = value;
-       schedule_work(&stp_work);
+       queue_work(time_sync_wq, &stp_work);
        return count;
  }
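
etr_sync_clock() and stp_sync_clock() above both run on every online CPU under stop_machine(..., &cpu_online_map): the CPU that wins xchg(&first, 1) becomes the master, the others call clock_sync_cpu(), decrement the shared counter and spin until the master publishes in_sync. A stripped-down user-space sketch of that rendezvous, with C11 atomics standing in for the kernel primitives and all names illustrative:

#include <stdatomic.h>

struct sync_data {
	atomic_int cpus;     /* CPUs that have not yet checked in      */
	atomic_int in_sync;  /* 0 = in progress, 1 = done, <0 = failed */
};

static atomic_int first; /* master election flag */

/* Called once per CPU while all CPUs are stopped (stop_machine-like). */
static int sync_clock(void *data)
{
	struct sync_data *sync = data;

	if (atomic_exchange(&first, 1) == 1) {
		/* Slave: report arrival, then wait for the master's verdict. */
		atomic_fetch_sub(&sync->cpus, 1);
		while (atomic_load(&sync->in_sync) == 0)
			;  /* cpu_relax() in the kernel version */
		return 0;
	}

	/* Master: wait until every other CPU has checked in ... */
	while (atomic_load(&sync->cpus) != 0)
		;
	/* ... do the actual clock adjustment here ... */
	atomic_store(&sync->in_sync, 1); /* release the slaves */
	atomic_store(&first, 0);         /* allow a later sync attempt */
	return 0;
}
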
  
diff --combined arch/s390/kernel/topology.c
@@@ -3,6 -3,9 +3,9 @@@
   *    Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
   */
  
+ #define KMSG_COMPONENT "cpu"
+ #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
  #include <linux/kernel.h>
  #include <linux/mm.h>
  #include <linux/init.h>
@@@ -12,6 -15,7 +15,7 @@@
  #include <linux/workqueue.h>
  #include <linux/cpu.h>
  #include <linux/smp.h>
+ #include <linux/cpuset.h>
  #include <asm/delay.h>
  #include <asm/s390_ext.h>
  #include <asm/sysinfo.h>
@@@ -57,11 -61,11 +61,11 @@@ struct core_info 
        cpumask_t mask;
  };
  
+ static int topology_enabled;
  static void topology_work_fn(struct work_struct *work);
  static struct tl_info *tl_info;
  static struct core_info core_info;
  static int machine_has_topology;
- static int machine_has_topology_irq;
  static struct timer_list topology_timer;
  static void set_topology_timer(void);
  static DECLARE_WORK(topology_work, topology_work_fn);
@@@ -77,8 -81,8 +81,8 @@@ cpumask_t cpu_coregroup_map(unsigned in
        cpumask_t mask;
  
        cpus_clear(mask);
-       if (!machine_has_topology)
-               return cpu_present_map;
+       if (!topology_enabled || !machine_has_topology)
+               return cpu_possible_map;
        spin_lock_irqsave(&topology_lock, flags);
        while (core) {
                if (cpu_isset(cpu, core->mask)) {
        return mask;
  }
  
 +const struct cpumask *cpu_coregroup_mask(unsigned int cpu)
 +{
 +      return &cpu_core_map[cpu];
 +}
 +
  static void add_cpus_to_core(struct tl_cpu *tl_cpu, struct core_info *core)
  {
        unsigned int cpu;
@@@ -173,7 -172,7 +177,7 @@@ static void topology_update_polarizatio
        int cpu;
  
        mutex_lock(&smp_cpu_state_mutex);
-       for_each_present_cpu(cpu)
+       for_each_possible_cpu(cpu)
                smp_cpu_polarization[cpu] = POLARIZATION_HRZ;
        mutex_unlock(&smp_cpu_state_mutex);
  }
@@@ -204,7 -203,7 +208,7 @@@ int topology_set_cpu_management(int fc
                rc = ptf(PTF_HORIZONTAL);
        if (rc)
                return -EBUSY;
-       for_each_present_cpu(cpu)
+       for_each_possible_cpu(cpu)
                smp_cpu_polarization[cpu] = POLARIZATION_UNKNWN;
        return rc;
  }
@@@ -213,11 -212,11 +217,11 @@@ static void update_cpu_core_map(void
  {
        int cpu;
  
-       for_each_present_cpu(cpu)
+       for_each_possible_cpu(cpu)
                cpu_core_map[cpu] = cpu_coregroup_map(cpu);
  }
  
- void arch_update_cpu_topology(void)
+ int arch_update_cpu_topology(void)
  {
        struct tl_info *info = tl_info;
        struct sys_device *sysdev;
        if (!machine_has_topology) {
                update_cpu_core_map();
                topology_update_polarization_simple();
-               return;
+               return 0;
        }
        stsi(info, 15, 1, 2);
        tl_to_cores(info);
                sysdev = get_cpu_sysdev(cpu);
                kobject_uevent(&sysdev->kobj, KOBJ_CHANGE);
        }
+       return 1;
  }
  
  static void topology_work_fn(struct work_struct *work)
  {
-       arch_reinit_sched_domains();
+       rebuild_sched_domains();
  }
  
  void topology_schedule_update(void)
@@@ -262,10 -262,14 +267,14 @@@ static void set_topology_timer(void
        add_timer(&topology_timer);
  }
  
- static void topology_interrupt(__u16 code)
+ static int __init early_parse_topology(char *p)
  {
-       schedule_work(&topology_work);
+       if (strncmp(p, "on", 2))
+               return 0;
+       topology_enabled = 1;
+       return 0;
  }
+ early_param("topology", early_parse_topology);
  
  static int __init init_topology_update(void)
  {
                goto out;
        }
        init_timer_deferrable(&topology_timer);
-       if (machine_has_topology_irq) {
-               rc = register_external_interrupt(0x2005, topology_interrupt);
-               if (rc)
-                       goto out;
-               ctl_set_bit(0, 8);
-       }
-       else
-               set_topology_timer();
+       set_topology_timer();
  out:
        update_cpu_core_map();
        return rc;
@@@ -305,9 -302,6 +307,6 @@@ void __init s390_init_cpu_topology(void
                return;
        machine_has_topology = 1;
  
-       if (facility_bits & (1ULL << 51))
-               machine_has_topology_irq = 1;
        tl_info = alloc_bootmem_pages(PAGE_SIZE);
        info = tl_info;
        stsi(info, 15, 1, 2);
        for (i = 0; i < info->mnest - 2; i++)
                nr_cores *= info->mag[NR_MAG - 3 - i];
  
-       printk(KERN_INFO "CPU topology:");
+       pr_info("The CPU configuration topology of the machine is:");
        for (i = 0; i < NR_MAG; i++)
                printk(" %d", info->mag[i]);
        printk(" / %d\n", info->mnest);
        return;
  error:
        machine_has_topology = 0;
-       machine_has_topology_irq = 0;
  }
diff --combined arch/x86/include/asm/pci.h
@@@ -19,6 -19,8 +19,8 @@@ struct pci_sysdata 
  };
  
  extern int pci_routeirq;
+ extern int noioapicquirk;
+ extern int noioapicreroute;
  
  /* scan a bus after allocating a pci_sysdata for it */
  extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
@@@ -98,9 -100,9 +100,9 @@@ static inline void early_quirks(void) 
  
  #ifdef CONFIG_NUMA
  /* Returns the node based on pci bus */
 -static inline int __pcibus_to_node(struct pci_bus *bus)
 +static inline int __pcibus_to_node(const struct pci_bus *bus)
  {
 -      struct pci_sysdata *sd = bus->sysdata;
 +      const struct pci_sysdata *sd = bus->sysdata;
  
        return sd->node;
  }
@@@ -109,12 -111,6 +111,12 @@@ static inline cpumask_t __pcibus_to_cpu
  {
        return node_to_cpumask(__pcibus_to_node(bus));
  }
 +
 +static inline const struct cpumask *
 +cpumask_of_pcibus(const struct pci_bus *bus)
 +{
 +      return cpumask_of_node(__pcibus_to_node(bus));
 +}
  #endif
  
  #endif /* _ASM_X86_PCI_H */
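
Several hunks in this merge (xics_set_affinity, mpic_set_affinity, lapic_timer_broadcast, the clockevent ->cpumask fields) switch from passing a cpumask_t by value to passing const struct cpumask *, so no NR_CPUS-sized copy lands on the stack and helpers like cpumask_of() and cpu_online_mask can hand out shared const masks. A before/after sketch of the idiom, modelled on the mpic_set_affinity change above; the function names here are made up:

#include <linux/cpumask.h>

/* Old style: the whole cpumask_t is copied into the argument. */
static void my_set_affinity_old(unsigned int irq, cpumask_t mask)
{
	cpumask_t tmp;

	cpus_and(tmp, mask, cpu_online_map);
	/* ... program the interrupt controller from tmp ... */
}

/* New style: only a pointer travels; the callee treats the mask as const. */
static void my_set_affinity_new(unsigned int irq, const struct cpumask *mask)
{
	cpumask_t tmp;

	cpumask_and(&tmp, mask, cpu_online_mask);
	/* ... program the interrupt controller from &tmp ... */
}
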
diff --combined arch/x86/kernel/apic.c
@@@ -30,6 -30,7 +30,7 @@@
  #include <linux/module.h>
  #include <linux/dmi.h>
  #include <linux/dmar.h>
+ #include <linux/ftrace.h>
  
  #include <asm/atomic.h>
  #include <asm/smp.h>
@@@ -141,7 -142,7 +142,7 @@@ static int lapic_next_event(unsigned lo
                            struct clock_event_device *evt);
  static void lapic_timer_setup(enum clock_event_mode mode,
                              struct clock_event_device *evt);
 -static void lapic_timer_broadcast(cpumask_t mask);
 +static void lapic_timer_broadcast(const struct cpumask *mask);
  static void apic_pm_activate(void);
  
  /*
@@@ -441,6 -442,7 +442,7 @@@ static void lapic_timer_setup(enum cloc
                v = apic_read(APIC_LVTT);
                v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
                apic_write(APIC_LVTT, v);
+               apic_write(APIC_TMICT, 0xffffffff);
                break;
        case CLOCK_EVT_MODE_RESUME:
                /* Nothing to do here */
  /*
   * Local APIC timer broadcast function
   */
 -static void lapic_timer_broadcast(cpumask_t mask)
 +static void lapic_timer_broadcast(const struct cpumask *mask)
  {
  #ifdef CONFIG_SMP
 -      send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
 +      send_IPI_mask(*mask, LOCAL_TIMER_VECTOR);
  #endif
  }
  
@@@ -469,7 -471,7 +471,7 @@@ static void __cpuinit setup_APIC_timer(
        struct clock_event_device *levt = &__get_cpu_var(lapic_events);
  
        memcpy(levt, &lapic_clockevent, sizeof(*levt));
 -      levt->cpumask = cpumask_of_cpu(smp_processor_id());
 +      levt->cpumask = cpumask_of(smp_processor_id());
  
        clockevents_register_device(levt);
  }
@@@ -559,13 -561,13 +561,13 @@@ static int __init calibrate_by_pmtimer(
        } else {
                res = (((u64)deltapm) *  mult) >> 22;
                do_div(res, 1000000);
-               printk(KERN_WARNING "APIC calibration not consistent "
+               pr_warning("APIC calibration not consistent "
                        "with PM Timer: %ldms instead of 100ms\n",
                        (long)res);
                /* Correct the lapic counter value */
                res = (((u64)(*delta)) * pm_100ms);
                do_div(res, deltapm);
-               printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
+               pr_info("APIC delta adjusted to PM-Timer: "
                        "%lu (%ld)\n", (unsigned long)res, *delta);
                *delta = (long)res;
        }
@@@ -645,8 -647,7 +647,7 @@@ static int __init calibrate_APIC_clock(
         */
        if (calibration_result < (1000000 / HZ)) {
                local_irq_enable();
-               printk(KERN_WARNING
-                      "APIC frequency too slow, disabling apic timer\n");
+               pr_warning("APIC frequency too slow, disabling apic timer\n");
                return -1;
        }
  
                while (lapic_cal_loops <= LAPIC_CAL_LOOPS)
                        cpu_relax();
  
                /* Stop the lapic timer */
                lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt);
  
-               local_irq_enable();
                /* Jiffies delta */
                deltaj = lapic_cal_j2 - lapic_cal_j1;
                apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj);
                local_irq_enable();
  
        if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
-               printk(KERN_WARNING
-                      "APIC timer disabled due to verification failure.\n");
+               pr_warning("APIC timer disabled due to verification failure.\n");
                        return -1;
        }
  
@@@ -714,7 -710,7 +710,7 @@@ void __init setup_boot_APIC_clock(void
         * broadcast mechanism is used. On UP systems simply ignore it.
         */
        if (disable_apic_timer) {
-               printk(KERN_INFO "Disabling APIC timer\n");
+               pr_info("Disabling APIC timer\n");
                /* No broadcast on UP ! */
                if (num_possible_cpus() > 1) {
                        lapic_clockevent.mult = 1;
        if (nmi_watchdog != NMI_IO_APIC)
                lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
        else
-               printk(KERN_WARNING "APIC timer registered as dummy,"
+               pr_warning("APIC timer registered as dummy,"
                        " due to nmi_watchdog=%d!\n", nmi_watchdog);
  
        /* Setup the lapic or request the broadcast */
@@@ -773,8 -769,7 +769,7 @@@ static void local_apic_timer_interrupt(
         * spurious.
         */
        if (!evt->event_handler) {
-               printk(KERN_WARNING
-                      "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
+               pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu);
                /* Switch it off */
                lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
                return;
        /*
         * the NMI deadlock-detector uses this.
         */
- #ifdef CONFIG_X86_64
-       add_pda(apic_timer_irqs, 1);
- #else
-       per_cpu(irq_stat, cpu).apic_timer_irqs++;
- #endif
+       inc_irq_stat(apic_timer_irqs);
  
        evt->event_handler(evt);
  }
   * [ if a single-CPU system runs an SMP kernel then we call the local
   *   interrupt as well. Thus we cannot inline the local irq ... ]
   */
- void smp_apic_timer_interrupt(struct pt_regs *regs)
+ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
  {
        struct pt_regs *old_regs = set_irq_regs(regs);
  
         * Besides, if we don't timer interrupts ignore the global
         * interrupt lock, which is the WrongThing (tm) to do.
         */
- #ifdef CONFIG_X86_64
        exit_idle();
- #endif
        irq_enter();
        local_apic_timer_interrupt();
        irq_exit();
@@@ -1093,7 -1082,7 +1082,7 @@@ static void __cpuinit lapic_setup_esr(v
        unsigned int oldvalue, value, maxlvt;
  
        if (!lapic_is_integrated()) {
-               printk(KERN_INFO "No ESR for 82489DX.\n");
+               pr_info("No ESR for 82489DX.\n");
                return;
        }
  
                 * ESR disabled - we can't do anything useful with the
                 * errors anyway - mbligh
                 */
-               printk(KERN_INFO "Leaving ESR disabled.\n");
+               pr_info("Leaving ESR disabled.\n");
                return;
        }
  
@@@ -1298,7 -1287,7 +1287,7 @@@ void check_x2apic(void
        rdmsr(MSR_IA32_APICBASE, msr, msr2);
  
        if (msr & X2APIC_ENABLE) {
-               printk("x2apic enabled by BIOS, switching to x2apic ops\n");
+               pr_info("x2apic enabled by BIOS, switching to x2apic ops\n");
                x2apic_preenabled = x2apic = 1;
                apic_ops = &x2apic_ops;
        }
@@@ -1310,7 -1299,7 +1299,7 @@@ void enable_x2apic(void
  
        rdmsr(MSR_IA32_APICBASE, msr, msr2);
        if (!(msr & X2APIC_ENABLE)) {
-               printk("Enabling x2apic\n");
+               pr_info("Enabling x2apic\n");
                wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
        }
  }
@@@ -1325,9 -1314,8 +1314,8 @@@ void __init enable_IR_x2apic(void
                return;
  
        if (!x2apic_preenabled && disable_x2apic) {
-               printk(KERN_INFO
-                      "Skipped enabling x2apic and Interrupt-remapping "
-                      "because of nox2apic\n");
+               pr_info("Skipped enabling x2apic and Interrupt-remapping "
+                       "because of nox2apic\n");
                return;
        }
  
                panic("Bios already enabled x2apic, can't enforce nox2apic");
  
        if (!x2apic_preenabled && skip_ioapic_setup) {
-               printk(KERN_INFO
-                      "Skipped enabling x2apic and Interrupt-remapping "
-                      "because of skipping io-apic setup\n");
+               pr_info("Skipped enabling x2apic and Interrupt-remapping "
+                       "because of skipping io-apic setup\n");
                return;
        }
  
        ret = dmar_table_init();
        if (ret) {
-               printk(KERN_INFO
-                      "dmar_table_init() failed with %d:\n", ret);
+               pr_info("dmar_table_init() failed with %d:\n", ret);
  
                if (x2apic_preenabled)
                        panic("x2apic enabled by bios. But IR enabling failed");
                else
-                       printk(KERN_INFO
-                              "Not enabling x2apic,Intr-remapping\n");
+                       pr_info("Not enabling x2apic,Intr-remapping\n");
                return;
        }
  
  
        ret = save_mask_IO_APIC_setup();
        if (ret) {
-               printk(KERN_INFO "Saving IO-APIC state failed: %d\n", ret);
+               pr_info("Saving IO-APIC state failed: %d\n", ret);
                goto end;
        }
  
  
        if (!ret) {
                if (!x2apic_preenabled)
-                       printk(KERN_INFO
-                              "Enabled x2apic and interrupt-remapping\n");
+                       pr_info("Enabled x2apic and interrupt-remapping\n");
                else
-                       printk(KERN_INFO
-                              "Enabled Interrupt-remapping\n");
+                       pr_info("Enabled Interrupt-remapping\n");
        } else
-               printk(KERN_ERR
-                      "Failed to enable Interrupt-remapping and x2apic\n");
+               pr_err("Failed to enable Interrupt-remapping and x2apic\n");
  #else
        if (!cpu_has_x2apic)
                return;
                panic("x2apic enabled prior OS handover,"
                      " enable CONFIG_INTR_REMAP");
  
-       printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping "
-              " and x2apic\n");
+       pr_info("Enable CONFIG_INTR_REMAP for enabling intr-remapping "
+               " and x2apic\n");
  #endif
  
        return;
  static int __init detect_init_APIC(void)
  {
        if (!cpu_has_apic) {
-               printk(KERN_INFO "No local APIC present\n");
+               pr_info("No local APIC present\n");
                return -1;
        }
  
@@@ -1469,8 -1451,8 +1451,8 @@@ static int __init detect_init_APIC(void
                 * "lapic" specified.
                 */
                if (!force_enable_local_apic) {
-                       printk(KERN_INFO "Local APIC disabled by BIOS -- "
-                              "you can enable it with \"lapic\"\n");
+                       pr_info("Local APIC disabled by BIOS -- "
+                               "you can enable it with \"lapic\"\n");
                        return -1;
                }
                /*
                 */
                rdmsr(MSR_IA32_APICBASE, l, h);
                if (!(l & MSR_IA32_APICBASE_ENABLE)) {
-                       printk(KERN_INFO
-                              "Local APIC disabled by BIOS -- reenabling.\n");
+                       pr_info("Local APIC disabled by BIOS -- reenabling.\n");
                        l &= ~MSR_IA32_APICBASE_BASE;
                        l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
                        wrmsr(MSR_IA32_APICBASE, l, h);
         */
        features = cpuid_edx(1);
        if (!(features & (1 << X86_FEATURE_APIC))) {
-               printk(KERN_WARNING "Could not enable APIC!\n");
+               pr_warning("Could not enable APIC!\n");
                return -1;
        }
        set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
        if (l & MSR_IA32_APICBASE_ENABLE)
                mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
  
-       printk(KERN_INFO "Found and enabled local APIC!\n");
+       pr_info("Found and enabled local APIC!\n");
  
        apic_pm_activate();
  
        return 0;
  
  no_apic:
-       printk(KERN_INFO "No local APIC present or hardware disabled\n");
+       pr_info("No local APIC present or hardware disabled\n");
        return -1;
  }
  #endif
@@@ -1588,12 -1569,12 +1569,12 @@@ int __init APIC_init_uniprocessor(void
  {
  #ifdef CONFIG_X86_64
        if (disable_apic) {
-               printk(KERN_INFO "Apic disabled\n");
+               pr_info("Apic disabled\n");
                return -1;
        }
        if (!cpu_has_apic) {
                disable_apic = 1;
-               printk(KERN_INFO "Apic disabled by BIOS\n");
+               pr_info("Apic disabled by BIOS\n");
                return -1;
        }
  #else
         */
        if (!cpu_has_apic &&
            APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
-               printk(KERN_ERR "BIOS bug, local APIC 0x%x not detected!...\n",
-                      boot_cpu_physical_apicid);
+               pr_err("BIOS bug, local APIC 0x%x not detected!...\n",
+                       boot_cpu_physical_apicid);
                clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
                return -1;
        }
@@@ -1682,9 -1663,7 +1663,7 @@@ void smp_spurious_interrupt(struct pt_r
  {
        u32 v;
  
- #ifdef CONFIG_X86_64
        exit_idle();
- #endif
        irq_enter();
        /*
         * Check if this really is a spurious interrupt and ACK it
        if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
                ack_APIC_irq();
  
- #ifdef CONFIG_X86_64
-       add_pda(irq_spurious_count, 1);
- #else
+       inc_irq_stat(irq_spurious_count);
        /* see sw-dev-man vol 3, chapter 7.4.13.5 */
-       printk(KERN_INFO "spurious APIC interrupt on CPU#%d, "
-              "should never happen.\n", smp_processor_id());
-       __get_cpu_var(irq_stat).irq_spurious_count++;
- #endif
+       pr_info("spurious APIC interrupt on CPU#%d, "
+               "should never happen.\n", smp_processor_id());
        irq_exit();
  }
  
@@@ -1713,9 -1689,7 +1689,7 @@@ void smp_error_interrupt(struct pt_reg
  {
        u32 v, v1;
  
- #ifdef CONFIG_X86_64
        exit_idle();
- #endif
        irq_enter();
        /* First tickle the hardware, only then report what went on. -- REW */
        v = apic_read(APIC_ESR);
        ack_APIC_irq();
        atomic_inc(&irq_err_count);
  
-       /* Here is what the APIC error bits mean:
-          0: Send CS error
-          1: Receive CS error
-          2: Send accept error
-          3: Receive accept error
-          4: Reserved
-          5: Send illegal vector
-          6: Received illegal vector
-          7: Illegal register address
-       */
-       printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
+       /*
+        * Here is what the APIC error bits mean:
+        * 0: Send CS error
+        * 1: Receive CS error
+        * 2: Send accept error
+        * 3: Receive accept error
+        * 4: Reserved
+        * 5: Send illegal vector
+        * 6: Received illegal vector
+        * 7: Illegal register address
+        */
+       pr_debug("APIC error on CPU%d: %02x(%02x)\n",
                smp_processor_id(), v , v1);
        irq_exit();
  }
@@@ -1838,15 -1813,15 +1813,15 @@@ void __cpuinit generic_processor_info(i
         * Validate version
         */
        if (version == 0x0) {
-               printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
-                               "fixing up to 0x10. (tell your hw vendor)\n",
-                               version);
+               pr_warning("BIOS bug, APIC version is 0 for CPU#%d! "
+                       "fixing up to 0x10. (tell your hw vendor)\n",
+                       version);
                version = 0x10;
        }
        apic_version[apicid] = version;
  
        if (num_processors >= NR_CPUS) {
-               printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
+               pr_warning("WARNING: NR_CPUS limit of %i reached."
                        "  Processor ignored.\n", NR_CPUS);
                return;
        }
@@@ -2209,7 -2184,7 +2184,7 @@@ static int __init apic_set_verbosity(ch
        else if (strcmp("verbose", arg) == 0)
                apic_verbosity = APIC_VERBOSE;
        else {
-               printk(KERN_WARNING "APIC Verbosity level %s not recognised"
+               pr_warning("APIC Verbosity level %s not recognised"
                        " use apic=verbose or apic=debug\n", arg);
                return -EINVAL;
        }
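 
  The apic.c hunks above mechanically replace printk(KERN_INFO/KERN_WARNING/
  KERN_ERR ...) with the pr_info()/pr_warning()/pr_err() helpers. A minimal
  sketch of what those wrappers amount to (the real macros live in
  <linux/kernel.h>; this is an approximation, not the kernel's exact text):
 
        #define pr_info(fmt, ...)       printk(KERN_INFO fmt, ##__VA_ARGS__)
        #define pr_warning(fmt, ...)    printk(KERN_WARNING fmt, ##__VA_ARGS__)
        #define pr_err(fmt, ...)        printk(KERN_ERR fmt, ##__VA_ARGS__)
 
  The log level is unchanged; only the call sites get shorter and keep their
  format strings on one line more easily.
 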
diff --combined arch/x86/kernel/cpu/intel_cacheinfo.c
@@@ -626,8 -626,8 +626,8 @@@ static ssize_t show_shared_cpu_map_func
                cpumask_t *mask = &this_leaf->shared_cpu_map;
  
                n = type?
 -                      cpulist_scnprintf(buf, len-2, *mask):
 -                      cpumask_scnprintf(buf, len-2, *mask);
 +                      cpulist_scnprintf(buf, len-2, mask) :
 +                      cpumask_scnprintf(buf, len-2, mask);
                buf[n++] = '\n';
                buf[n] = '\0';
        }
@@@ -644,20 -644,17 +644,17 @@@ static inline ssize_t show_shared_cpu_l
        return show_shared_cpu_map_func(leaf, 1, buf);
  }
  
- static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) {
-       switch(this_leaf->eax.split.type) {
-           case CACHE_TYPE_DATA:
+ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
+ {
+       switch (this_leaf->eax.split.type) {
+       case CACHE_TYPE_DATA:
                return sprintf(buf, "Data\n");
-               break;
-           case CACHE_TYPE_INST:
+       case CACHE_TYPE_INST:
                return sprintf(buf, "Instruction\n");
-               break;
-           case CACHE_TYPE_UNIFIED:
+       case CACHE_TYPE_UNIFIED:
                return sprintf(buf, "Unified\n");
-               break;
-           default:
+       default:
                return sprintf(buf, "Unknown\n");
-               break;
        }
  }
  
diff --combined arch/x86/kernel/hpet.c
@@@ -33,7 -33,9 +33,9 @@@
   * HPET address is set in acpi/boot.c, when an ACPI entry exists
   */
  unsigned long                         hpet_address;
- unsigned long                         hpet_num_timers;
+ #ifdef CONFIG_PCI_MSI
+ static unsigned long                  hpet_num_timers;
+ #endif
  static void __iomem                   *hpet_virt_address;
  
  struct hpet_dev {
@@@ -246,7 -248,7 +248,7 @@@ static void hpet_legacy_clockevent_regi
         * Start hpet with the boot cpu mask and make it
         * global after the IO_APIC has been initialized.
         */
 -      hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
 +      hpet_clockevent.cpumask = cpumask_of(smp_processor_id());
        clockevents_register_device(&hpet_clockevent);
        global_clock_event = &hpet_clockevent;
        printk(KERN_DEBUG "hpet clockevent registered\n");
@@@ -301,7 -303,7 +303,7 @@@ static void hpet_set_mode(enum clock_ev
                        struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
                        hpet_setup_msi_irq(hdev->irq);
                        disable_irq(hdev->irq);
 -                      irq_set_affinity(hdev->irq, cpumask_of_cpu(hdev->cpu));
 +                      irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
                        enable_irq(hdev->irq);
                }
                break;
@@@ -449,7 -451,7 +451,7 @@@ static int hpet_setup_irq(struct hpet_d
                return -1;
  
        disable_irq(dev->irq);
 -      irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu));
 +      irq_set_affinity(dev->irq, cpumask_of(dev->cpu));
        enable_irq(dev->irq);
  
        printk(KERN_DEBUG "hpet: %s irq %d for MSI\n",
@@@ -500,7 -502,7 +502,7 @@@ static void init_one_hpet_msi_clockeven
        /* 5 usec minimum reprogramming delta. */
        evt->min_delta_ns = 5000;
  
 -      evt->cpumask = cpumask_of_cpu(hdev->cpu);
 +      evt->cpumask = cpumask_of(hdev->cpu);
        clockevents_register_device(evt);
  }
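 
  The hpet.c hunks switch from cpumask_of_cpu(cpu), which builds an NR_CPUS-bit
  cpumask_t and returns it by value, to cpumask_of(cpu), which returns a
  const struct cpumask * to a constant single-bit mask. A hedged sketch of a
  converted call site (example_bind_clockevent is a hypothetical name; the
  helpers are assumed to match <linux/cpumask.h>):
 
        /* Point a clock event device at one CPU without copying a cpumask. */
        static void example_bind_clockevent(struct clock_event_device *evt, int cpu)
        {
                evt->cpumask = cpumask_of(cpu); /* was: cpumask_of_cpu(cpu) by value */
        }
 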
  
diff --combined arch/x86/kernel/io_apic.c
@@@ -361,8 -361,7 +361,8 @@@ static void __target_IO_APIC_irq(unsign
  
  static int assign_irq_vector(int irq, cpumask_t mask);
  
 -static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
 +static void set_ioapic_affinity_irq(unsigned int irq,
 +                                  const struct cpumask *mask)
  {
        struct irq_cfg *cfg;
        unsigned long flags;
        cpumask_t tmp;
        struct irq_desc *desc;
  
 -      cpus_and(tmp, mask, cpu_online_map);
 -      if (cpus_empty(tmp))
 +      if (!cpumask_intersects(mask, cpu_online_mask))
                return;
  
        cfg = irq_cfg(irq);
 -      if (assign_irq_vector(irq, mask))
 +      if (assign_irq_vector(irq, *mask))
                return;
  
 -      cpus_and(tmp, cfg->domain, mask);
 +      cpumask_and(&tmp, &cfg->domain, mask);
        dest = cpu_mask_to_apicid(tmp);
        /*
         * Only the high 8 bits are valid.
        desc = irq_to_desc(irq);
        spin_lock_irqsave(&ioapic_lock, flags);
        __target_IO_APIC_irq(irq, dest, cfg->vector);
 -      desc->affinity = mask;
 +      cpumask_copy(&desc->affinity, mask);
        spin_unlock_irqrestore(&ioapic_lock, flags);
  }
  #endif /* CONFIG_SMP */
@@@ -2189,7 -2189,7 +2189,7 @@@ static void ir_irq_migration(struct wor
                                continue;
                        }
  
 -                      desc->chip->set_affinity(irq, desc->pending_mask);
 +                      desc->chip->set_affinity(irq, &desc->pending_mask);
                        spin_unlock_irqrestore(&desc->lock, flags);
                }
        }
  /*
   * Migrates the IRQ destination in the process context.
   */
 -static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
 +static void set_ir_ioapic_affinity_irq(unsigned int irq,
 +                                     const struct cpumask *mask)
  {
        struct irq_desc *desc = irq_to_desc(irq);
  
        if (desc->status & IRQ_LEVEL) {
                desc->status |= IRQ_MOVE_PENDING;
 -              desc->pending_mask = mask;
 +              cpumask_copy(&desc->pending_mask, mask);
                migrate_irq_remapped_level(irq);
                return;
        }
  
 -      migrate_ioapic_irq(irq, mask);
 +      migrate_ioapic_irq(irq, *mask);
  }
  #endif
  
  asmlinkage void smp_irq_move_cleanup_interrupt(void)
  {
        unsigned vector, me;
        ack_APIC_irq();
- #ifdef CONFIG_X86_64
        exit_idle();
- #endif
        irq_enter();
  
        me = smp_processor_id();
@@@ -3028,7 -3026,7 +3027,7 @@@ static int msi_compose_msg(struct pci_d
  }
  
  #ifdef CONFIG_SMP
 -static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 +static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
  {
        struct irq_cfg *cfg;
        struct msi_msg msg;
        cpumask_t tmp;
        struct irq_desc *desc;
  
 -      cpus_and(tmp, mask, cpu_online_map);
 -      if (cpus_empty(tmp))
 +      if (!cpumask_intersects(mask, cpu_online_mask))
                return;
  
 -      if (assign_irq_vector(irq, mask))
 +      if (assign_irq_vector(irq, *mask))
                return;
  
        cfg = irq_cfg(irq);
 -      cpus_and(tmp, cfg->domain, mask);
 +      cpumask_and(&tmp, &cfg->domain, mask);
        dest = cpu_mask_to_apicid(tmp);
  
        read_msi_msg(irq, &msg);
  
        write_msi_msg(irq, &msg);
        desc = irq_to_desc(irq);
 -      desc->affinity = mask;
 +      cpumask_copy(&desc->affinity, mask);
  }
  
  #ifdef CONFIG_INTR_REMAP
   * Migrate the MSI irq to another cpumask. This migration is
   * done in the process context using interrupt-remapping hardware.
   */
 -static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 +static void ir_set_msi_irq_affinity(unsigned int irq,
 +                                  const struct cpumask *mask)
  {
        struct irq_cfg *cfg;
        unsigned int dest;
        struct irte irte;
        struct irq_desc *desc;
  
 -      cpus_and(tmp, mask, cpu_online_map);
 -      if (cpus_empty(tmp))
 +      if (!cpumask_intersects(mask, cpu_online_mask))
                return;
  
        if (get_irte(irq, &irte))
                return;
  
 -      if (assign_irq_vector(irq, mask))
 +      if (assign_irq_vector(irq, *mask))
                return;
  
        cfg = irq_cfg(irq);
 -      cpus_and(tmp, cfg->domain, mask);
 +      cpumask_and(&tmp, &cfg->domain, mask);
        dest = cpu_mask_to_apicid(tmp);
  
        irte.vector = cfg->vector;
        }
  
        desc = irq_to_desc(irq);
 -      desc->affinity = mask;
 +      cpumask_copy(&desc->affinity, mask);
  }
  #endif
  #endif /* CONFIG_SMP */
@@@ -3308,7 -3307,7 +3307,7 @@@ void arch_teardown_msi_irq(unsigned in
  
  #ifdef CONFIG_DMAR
  #ifdef CONFIG_SMP
 -static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
 +static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
  {
        struct irq_cfg *cfg;
        struct msi_msg msg;
        cpumask_t tmp;
        struct irq_desc *desc;
  
 -      cpus_and(tmp, mask, cpu_online_map);
 -      if (cpus_empty(tmp))
 +      if (!cpumask_intersects(mask, cpu_online_mask))
                return;
  
 -      if (assign_irq_vector(irq, mask))
 +      if (assign_irq_vector(irq, *mask))
                return;
  
        cfg = irq_cfg(irq);
 -      cpus_and(tmp, cfg->domain, mask);
 +      cpumask_and(&tmp, &cfg->domain, mask);
        dest = cpu_mask_to_apicid(tmp);
  
        dmar_msi_read(irq, &msg);
  
        dmar_msi_write(irq, &msg);
        desc = irq_to_desc(irq);
 -      desc->affinity = mask;
 +      cpumask_copy(&desc->affinity, mask);
  }
  #endif /* CONFIG_SMP */
  
@@@ -3368,7 -3368,7 +3367,7 @@@ int arch_setup_dmar_msi(unsigned int ir
  #ifdef CONFIG_HPET_TIMER
  
  #ifdef CONFIG_SMP
 -static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
 +static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
  {
        struct irq_cfg *cfg;
        struct irq_desc *desc;
        unsigned int dest;
        cpumask_t tmp;
  
 -      cpus_and(tmp, mask, cpu_online_map);
 -      if (cpus_empty(tmp))
 +      if (!cpumask_intersects(mask, cpu_online_mask))
                return;
  
 -      if (assign_irq_vector(irq, mask))
 +      if (assign_irq_vector(irq, *mask))
                return;
  
        cfg = irq_cfg(irq);
 -      cpus_and(tmp, cfg->domain, mask);
 +      cpumask_and(&tmp, &cfg->domain, mask);
        dest = cpu_mask_to_apicid(tmp);
  
        hpet_msi_read(irq, &msg);
  
        hpet_msi_write(irq, &msg);
        desc = irq_to_desc(irq);
 -      desc->affinity = mask;
 +      cpumask_copy(&desc->affinity, mask);
  }
  #endif /* CONFIG_SMP */
  
@@@ -3449,26 -3450,27 +3448,26 @@@ static void target_ht_irq(unsigned int 
        write_ht_irq_msg(irq, &msg);
  }
  
 -static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
 +static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
  {
        struct irq_cfg *cfg;
        unsigned int dest;
        cpumask_t tmp;
        struct irq_desc *desc;
  
 -      cpus_and(tmp, mask, cpu_online_map);
 -      if (cpus_empty(tmp))
 +      if (!cpumask_intersects(mask, cpu_online_mask))
                return;
  
 -      if (assign_irq_vector(irq, mask))
 +      if (assign_irq_vector(irq, *mask))
                return;
  
        cfg = irq_cfg(irq);
 -      cpus_and(tmp, cfg->domain, mask);
 +      cpumask_and(&tmp, &cfg->domain, mask);
        dest = cpu_mask_to_apicid(tmp);
  
        target_ht_irq(irq, dest, cfg->vector);
        desc = irq_to_desc(irq);
 -      desc->affinity = mask;
 +      cpumask_copy(&desc->affinity, mask);
  }
  #endif
  
@@@ -3791,10 -3793,10 +3790,10 @@@ void __init setup_ioapic_dest(void
  
  #ifdef CONFIG_INTR_REMAP
                        if (intr_remapping_enabled)
 -                              set_ir_ioapic_affinity_irq(irq, mask);
 +                              set_ir_ioapic_affinity_irq(irq, &mask);
                        else
  #endif
 -                              set_ioapic_affinity_irq(irq, mask);
 +                              set_ioapic_affinity_irq(irq, &mask);
                }
  
        }
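 
  Every ->set_affinity() handler in io_apic.c now takes const struct cpumask *
  instead of a cpumask_t by value, and the open-coded cpus_and()/cpus_empty()
  online test becomes cpumask_intersects(). The conversions above (IO-APIC,
  MSI, DMAR, HPET, HT) all follow the same shape; a condensed sketch with the
  hardware reprogramming elided (example_set_affinity is a hypothetical name):
 
        static void example_set_affinity(unsigned int irq, const struct cpumask *mask)
        {
                struct irq_cfg *cfg;
                struct irq_desc *desc;
                cpumask_t tmp;
 
                if (!cpumask_intersects(mask, cpu_online_mask))
                        return;                         /* nothing requested is online */
                if (assign_irq_vector(irq, *mask))
                        return;
 
                cfg = irq_cfg(irq);
                cpumask_and(&tmp, &cfg->domain, mask);
                /* ... program the destination from cpu_mask_to_apicid(tmp) ... */
 
                desc = irq_to_desc(irq);
                cpumask_copy(&desc->affinity, mask);    /* copy, not struct assignment */
        }
 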
diff --combined arch/x86/kernel/irq_64.c
  #include <linux/seq_file.h>
  #include <linux/module.h>
  #include <linux/delay.h>
+ #include <linux/ftrace.h>
  #include <asm/uaccess.h>
  #include <asm/io_apic.h>
  #include <asm/idle.h>
  #include <asm/smp.h>
  
- #ifdef CONFIG_DEBUG_STACKOVERFLOW
  /*
   * Probabilistic stack overflow check:
   *
   */
  static inline void stack_overflow_check(struct pt_regs *regs)
  {
+ #ifdef CONFIG_DEBUG_STACKOVERFLOW
        u64 curbase = (u64)task_stack_page(current);
-       static unsigned long warned = -60*HZ;
-       if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE &&
-           regs->sp <  curbase + sizeof(struct thread_info) + 128 &&
-           time_after(jiffies, warned + 60*HZ)) {
-               printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
-                      current->comm, curbase, regs->sp);
-               show_stack(NULL,NULL);
-               warned = jiffies;
-       }
- }
+       WARN_ONCE(regs->sp >= curbase &&
+                 regs->sp <= curbase + THREAD_SIZE &&
+                 regs->sp <  curbase + sizeof(struct thread_info) +
+                                       sizeof(struct pt_regs) + 128,
+                 "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
+                       current->comm, curbase, regs->sp);
  #endif
+ }
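 
  The rewritten stack_overflow_check() drops the hand-rolled once-a-minute
  jiffies throttle and relies on WARN_ONCE(), which evaluates its condition and
  prints the message plus a backtrace only the first time it triggers. A small
  hedged sketch of that behaviour (example_check is a hypothetical function):
 
        static void example_check(unsigned long sp, unsigned long stack_base)
        {
                /* Prints (with a stack trace) at most once per boot. */
                WARN_ONCE(sp < stack_base + 128,
                          "stack pointer %lx within 128 bytes of base %lx\n",
                          sp, stack_base);
        }
 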
  
  /*
   * do_IRQ handles all normal device IRQ's (the special
   * SMP cross-CPU interrupts have their own specific
   * handlers).
   */
- asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
+ asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
  {
        struct pt_regs *old_regs = set_irq_regs(regs);
        struct irq_desc *desc;
@@@ -60,9 -59,7 +59,7 @@@
        irq_enter();
        irq = __get_cpu_var(vector_irq)[vector];
  
- #ifdef CONFIG_DEBUG_STACKOVERFLOW
        stack_overflow_check(regs);
- #endif
  
        desc = irq_to_desc(irq);
        if (likely(desc))
@@@ -116,7 -113,7 +113,7 @@@ void fixup_irqs(cpumask_t map
                        desc->chip->mask(irq);
  
                if (desc->chip->set_affinity)
 -                      desc->chip->set_affinity(irq, mask);
 +                      desc->chip->set_affinity(irq, &mask);
                else if (!(warned++))
                        set_affinity = 0;
  
diff --combined arch/x86/kernel/smpboot.c
@@@ -62,6 -62,7 +62,7 @@@
  #include <asm/mtrr.h>
  #include <asm/vmi.h>
  #include <asm/genapic.h>
+ #include <asm/setup.h>
  #include <linux/mc146818rtc.h>
  
  #include <mach_apic.h>
@@@ -101,8 -102,14 +102,8 @@@ EXPORT_SYMBOL(smp_num_siblings)
  /* Last level cache ID of each logical CPU */
  DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
  
 -/* bitmap of online cpus */
 -cpumask_t cpu_online_map __read_mostly;
 -EXPORT_SYMBOL(cpu_online_map);
 -
  cpumask_t cpu_callin_map;
  cpumask_t cpu_callout_map;
 -cpumask_t cpu_possible_map;
 -EXPORT_SYMBOL(cpu_possible_map);
  
  /* representing HT siblings of each logical CPU */
  DEFINE_PER_CPU(cpumask_t, cpu_sibling_map);
@@@ -281,16 -288,14 +282,14 @@@ static int __cpuinitdata unsafe_smp
  /*
   * Activate a secondary processor.
   */
- static void __cpuinit start_secondary(void *unused)
+ notrace static void __cpuinit start_secondary(void *unused)
  {
        /*
         * Don't put *anything* before cpu_init(), SMP booting is too
         * fragile that we want to limit the things done here to the
         * most necessary things.
         */
- #ifdef CONFIG_VMI
        vmi_bringup();
- #endif
        cpu_init();
        preempt_disable();
        smp_callin();
@@@ -497,7 -502,7 +496,7 @@@ void __cpuinit set_cpu_sibling_map(int 
  }
  
  /* maps the cpu to the sched domain representing multi-core */
 -cpumask_t cpu_coregroup_map(int cpu)
 +const struct cpumask *cpu_coregroup_mask(int cpu)
  {
        struct cpuinfo_x86 *c = &cpu_data(cpu);
        /*
         * And for power savings, we return cpu_core_map
         */
        if (sched_mc_power_savings || sched_smt_power_savings)
 -              return per_cpu(cpu_core_map, cpu);
 +              return &per_cpu(cpu_core_map, cpu);
        else
 -              return c->llc_shared_map;
 +              return &c->llc_shared_map;
 +}
 +
 +cpumask_t cpu_coregroup_map(int cpu)
 +{
 +      return *cpu_coregroup_mask(cpu);
  }
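 
  cpu_coregroup_mask() is the new pointer-returning interface; the old
  cpu_coregroup_map() survives above only as a by-value wrapper so unconverted
  callers still build. The caller-side change, as performed in the
  kernel/sched.c hunks later in this diff:
 
        /*
         * Old:  *mask = cpu_coregroup_map(cpu);     - NR_CPUS-bit struct returned
         *                                             and copied by value
         * New:  *mask = *cpu_coregroup_mask(cpu);   - dereference a pointer to the
         *                                             shared per-cpu mask
         *
         * Returning a pointer avoids building a full cpumask on the stack for
         * every call.
         */
 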
  
  static void impress_friends(void)
        pr_debug("Before bogocount - setting activated=1.\n");
  }
  
- static inline void __inquire_remote_apic(int apicid)
+ void __inquire_remote_apic(int apicid)
  {
        unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
        char *names[] = { "ID", "VERSION", "SPIV" };
        }
  }
  
- #ifdef WAKE_SECONDARY_VIA_NMI
  /*
   * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
   * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
   * won't ... remember to clear down the APIC, etc later.
   */
- static int __devinit
- wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
+ int __devinit
+ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
  {
        unsigned long send_status, accept_status = 0;
        int maxlvt;
         * Give the other CPU some time to accept the IPI.
         */
        udelay(200);
-       if (APIC_INTEGRATED(apic_version[phys_apicid])) {
+       if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
                maxlvt = lapic_get_maxlvt();
                if (maxlvt > 3)                 /* Due to the Pentium erratum 3AP.  */
                        apic_write(APIC_ESR, 0);
  
        return (send_status | accept_status);
  }
- #endif        /* WAKE_SECONDARY_VIA_NMI */
  
- #ifdef WAKE_SECONDARY_VIA_INIT
- static int __devinit
- wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
+ int __devinit
+ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
  {
        unsigned long send_status, accept_status = 0;
        int maxlvt, num_starts, j;
  
        return (send_status | accept_status);
  }
- #endif        /* WAKE_SECONDARY_VIA_INIT */
  
  struct create_idle {
        struct work_struct work;
@@@ -1085,8 -1081,10 +1080,10 @@@ static int __init smp_sanity_check(unsi
  #endif
  
        if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
-               printk(KERN_WARNING "weird, boot CPU (#%d) not listed"
-                                   "by the BIOS.\n", hard_smp_processor_id());
+               printk(KERN_WARNING
+                       "weird, boot CPU (#%d) not listed by the BIOS.\n",
+                       hard_smp_processor_id());
                physid_set(hard_smp_processor_id(), phys_cpu_present_map);
        }
  
diff --combined arch/x86/lguest/boot.c
@@@ -590,7 -590,8 +590,8 @@@ static void __init lguest_init_IRQ(void
                 * a straightforward 1 to 1 mapping, so force that here. */
                __get_cpu_var(vector_irq)[vector] = i;
                if (vector != SYSCALL_VECTOR) {
-                       set_intr_gate(vector, interrupt[vector]);
+                       set_intr_gate(vector,
+                                     interrupt[vector-FIRST_EXTERNAL_VECTOR]);
                        set_irq_chip_and_handler_name(i, &lguest_irq_controller,
                                                      handle_level_irq,
                                                      "level");
@@@ -737,7 -738,7 +738,7 @@@ static void lguest_time_init(void
  
        /* We can't set cpumask in the initializer: damn C limitations!  Set it
         * here and register our timer device. */
 -      lguest_clockevent.cpumask = cpumask_of_cpu(0);
 +      lguest_clockevent.cpumask = cpumask_of(0);
        clockevents_register_device(&lguest_clockevent);
  
        /* Finally, we unblock the timer interrupt. */
diff --combined include/linux/smp.h
@@@ -21,9 -21,6 +21,9 @@@ struct call_single_data 
        u16 priv;
  };
  
 +/* total number of cpus in this system (may exceed NR_CPUS) */
 +extern unsigned int total_cpus;
 +
  #ifdef CONFIG_SMP
  
  #include <linux/preempt.h>
@@@ -149,6 -146,8 +149,8 @@@ static inline void smp_send_reschedule(
  })
  #define smp_call_function_mask(mask, func, info, wait) \
                        (up_smp_call_function(func, info))
+ #define smp_call_function_many(mask, func, info, wait) \
+                       (up_smp_call_function(func, info))
  static inline void init_call_single_data(void)
  {
  }
diff --combined init/Kconfig
@@@ -588,6 -588,13 +588,13 @@@ config KALLSYMS_AL
  
           Say N.
  
+ config KALLSYMS_STRIP_GENERATED
+       bool "Strip machine generated symbols from kallsyms"
+       depends on KALLSYMS_ALL
+       default y
+       help
+         Say N if you want kallsyms to retain even machine generated symbols.
  config KALLSYMS_EXTRA_PASS
        bool "Do an extra kallsyms pass"
        depends on KALLSYMS
@@@ -808,6 -815,7 +815,7 @@@ config TRACEPOINT
  
  config MARKERS
        bool "Activate markers"
+       depends on TRACEPOINTS
        help
          Place an empty function call at each marker site. Can be
          dynamically changed for a probe function.
@@@ -916,15 -924,6 +924,15 @@@ config KMO
  
  endif # MODULES
  
 +config INIT_ALL_POSSIBLE
 +      bool
 +      help
 +        Back when each arch used to define their own cpu_online_map and
 +        cpu_possible_map, some of them chose to initialize cpu_possible_map
 +        with all 1s, and others with all 0s.  When they were centralised,
 +        it was better to provide this option than to break all the archs
 +        and have several arch maintainers pursuing me down dark alleys.
 +
  config STOP_MACHINE
        bool
        default y
diff --combined kernel/profile.c
@@@ -442,7 -442,7 +442,7 @@@ void profile_tick(int type
  static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
                        int count, int *eof, void *data)
  {
 -      int len = cpumask_scnprintf(page, count, *(cpumask_t *)data);
 +      int len = cpumask_scnprintf(page, count, (cpumask_t *)data);
        if (count - len < 2)
                return -EINVAL;
        len += sprintf(page + len, "\n");
@@@ -456,7 -456,7 +456,7 @@@ static int prof_cpu_mask_write_proc(str
        unsigned long full_count = count, err;
        cpumask_t new_value;
  
 -      err = cpumask_parse_user(buffer, count, new_value);
 +      err = cpumask_parse_user(buffer, count, &new_value);
        if (err)
                return err;
  
@@@ -544,7 -544,7 +544,7 @@@ static const struct file_operations pro
  };
  
  #ifdef CONFIG_SMP
- static inline void profile_nop(void *unused)
+ static void profile_nop(void *unused)
  {
  }
  
diff --combined kernel/sched.c
   */
  #define RUNTIME_INF   ((u64)~0ULL)
  
+ DEFINE_TRACE(sched_wait_task);
+ DEFINE_TRACE(sched_wakeup);
+ DEFINE_TRACE(sched_wakeup_new);
+ DEFINE_TRACE(sched_switch);
+ DEFINE_TRACE(sched_migrate_task);
  #ifdef CONFIG_SMP
  /*
   * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
@@@ -261,6 -267,10 +267,10 @@@ struct task_group 
        struct cgroup_subsys_state css;
  #endif
  
+ #ifdef CONFIG_USER_SCHED
+       uid_t uid;
+ #endif
  #ifdef CONFIG_FAIR_GROUP_SCHED
        /* schedulable entities of this group on each cpu */
        struct sched_entity **se;
  
  #ifdef CONFIG_USER_SCHED
  
+ /* Helper function to pass uid information to create_sched_user() */
+ void set_tg_uid(struct user_struct *user)
+ {
+       user->tg->uid = user->uid;
+       user->tg->uid = user->uid;
+ }
  /*
   * Root task group.
   *    Every UID task group (including init_task_group aka UID-0) will
@@@ -345,7 -361,9 +361,9 @@@ static inline struct task_group *task_g
        struct task_group *tg;
  
  #ifdef CONFIG_USER_SCHED
-       tg = p->user->tg;
+       rcu_read_lock();
+       tg = __task_cred(p)->user->tg;
+       rcu_read_unlock();
  #elif defined(CONFIG_CGROUP_SCHED)
        tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
                                struct task_group, css);
@@@ -586,6 -604,8 +604,8 @@@ struct rq 
  #ifdef CONFIG_SCHEDSTATS
        /* latency stats */
        struct sched_info rq_sched_info;
+       unsigned long long rq_cpu_time;
+       /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
  
        /* sys_sched_yield() stats */
        unsigned int yld_exp_empty;
@@@ -703,45 -723,18 +723,18 @@@ static __read_mostly char *sched_feat_n
  
  #undef SCHED_FEAT
  
- static int sched_feat_open(struct inode *inode, struct file *filp)
- {
-       filp->private_data = inode->i_private;
-       return 0;
- }
- static ssize_t
- sched_feat_read(struct file *filp, char __user *ubuf,
-               size_t cnt, loff_t *ppos)
+ static int sched_feat_show(struct seq_file *m, void *v)
  {
        int i;
  
        for (i = 0; sched_feat_names[i]; i++) {
-               len += strlen(sched_feat_names[i]);
-               len += 4;
-       }
-       buf = kmalloc(len + 2, GFP_KERNEL);
-       if (!buf)
-               return -ENOMEM;
-       for (i = 0; sched_feat_names[i]; i++) {
-               if (sysctl_sched_features & (1UL << i))
-                       r += sprintf(buf + r, "%s ", sched_feat_names[i]);
-               else
-                       r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
+               if (!(sysctl_sched_features & (1UL << i)))
+                       seq_puts(m, "NO_");
+               seq_printf(m, "%s ", sched_feat_names[i]);
        }
+       seq_puts(m, "\n");
  
-       r += sprintf(buf + r, "\n");
-       WARN_ON(r >= len + 2);
-       r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-       kfree(buf);
-       return r;
+       return 0;
  }
  
  static ssize_t
@@@ -786,10 -779,17 +779,17 @@@ sched_feat_write(struct file *filp, con
        return cnt;
  }
  
+ static int sched_feat_open(struct inode *inode, struct file *filp)
+ {
+       return single_open(filp, sched_feat_show, NULL);
+ }
  static struct file_operations sched_feat_fops = {
-       .open   = sched_feat_open,
-       .read   = sched_feat_read,
-       .write  = sched_feat_write,
+       .open           = sched_feat_open,
+       .write          = sched_feat_write,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
  };
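 
  sched_feat_fops moves from a hand-rolled read handler (kmalloc a buffer,
  sprintf into it, simple_read_from_buffer) to the seq_file single_open()
  helpers added above. The same boilerplate pattern, shown generically
  (my_* names are hypothetical; needs <linux/seq_file.h> and <linux/fs.h>):
 
        static int my_show(struct seq_file *m, void *v)
        {
                seq_puts(m, "hello\n");
                return 0;
        }
 
        static int my_open(struct inode *inode, struct file *filp)
        {
                return single_open(filp, my_show, inode->i_private);
        }
 
        static const struct file_operations my_fops = {
                .open    = my_open,
                .read    = seq_read,
                .llseek  = seq_lseek,
                .release = single_release,
        };
 
  seq_file handles buffering and partial reads, which is exactly what the
  removed sched_feat_read() was reimplementing by hand.
 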
  
  static __init int sched_init_debug(void)
@@@ -1474,27 -1474,13 +1474,13 @@@ static voi
  update_group_shares_cpu(struct task_group *tg, int cpu,
                        unsigned long sd_shares, unsigned long sd_rq_weight)
  {
-       int boost = 0;
        unsigned long shares;
        unsigned long rq_weight;
  
        if (!tg->se[cpu])
                return;
  
-       rq_weight = tg->cfs_rq[cpu]->load.weight;
-       /*
-        * If there are currently no tasks on the cpu pretend there is one of
-        * average load so that when a new task gets to run here it will not
-        * get delayed by group starvation.
-        */
-       if (!rq_weight) {
-               boost = 1;
-               rq_weight = NICE_0_LOAD;
-       }
-       if (unlikely(rq_weight > sd_rq_weight))
-               rq_weight = sd_rq_weight;
+       rq_weight = tg->cfs_rq[cpu]->rq_weight;
  
        /*
         *           \Sum shares * rq_weight
         *               \Sum rq_weight
         *
         */
-       shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+       shares = (sd_shares * rq_weight) / sd_rq_weight;
        shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
  
        if (abs(shares - tg->se[cpu]->load.weight) >
                unsigned long flags;
  
                spin_lock_irqsave(&rq->lock, flags);
-               /*
-                * record the actual number of shares, not the boosted amount.
-                */
-               tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-               tg->cfs_rq[cpu]->rq_weight = rq_weight;
+               tg->cfs_rq[cpu]->shares = shares;
  
                __set_se_shares(tg->se[cpu], shares);
                spin_unlock_irqrestore(&rq->lock, flags);
   */
  static int tg_shares_up(struct task_group *tg, void *data)
  {
-       unsigned long rq_weight = 0;
+       unsigned long weight, rq_weight = 0;
        unsigned long shares = 0;
        struct sched_domain *sd = data;
        int i;
  
        for_each_cpu_mask(i, sd->span) {
-               rq_weight += tg->cfs_rq[i]->load.weight;
+               /*
+                * If there are currently no tasks on the cpu pretend there
+                * is one of average load so that when a new task gets to
+                * run here it will not get delayed by group starvation.
+                */
+               weight = tg->cfs_rq[i]->load.weight;
+               if (!weight)
+                       weight = NICE_0_LOAD;
+               tg->cfs_rq[i]->rq_weight = weight;
+               rq_weight += weight;
                shares += tg->cfs_rq[i]->shares;
        }
  
        if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
                shares = tg->shares;
  
-       if (!rq_weight)
-               rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
        for_each_cpu_mask(i, sd->span)
                update_group_shares_cpu(tg, i, shares, rq_weight);
  
@@@ -1612,6 -1601,39 +1601,39 @@@ static inline void update_shares_locked
  
  #endif
  
+ /*
+  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+  */
+ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+       __releases(this_rq->lock)
+       __acquires(busiest->lock)
+       __acquires(this_rq->lock)
+ {
+       int ret = 0;
+       if (unlikely(!irqs_disabled())) {
+               /* printk() doesn't work good under rq->lock */
+               spin_unlock(&this_rq->lock);
+               BUG_ON(1);
+       }
+       if (unlikely(!spin_trylock(&busiest->lock))) {
+               if (busiest < this_rq) {
+                       spin_unlock(&this_rq->lock);
+                       spin_lock(&busiest->lock);
+                       spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
+                       ret = 1;
+               } else
+                       spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
+       }
+       return ret;
+ }
+ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+       __releases(busiest->lock)
+ {
+       spin_unlock(&busiest->lock);
+       lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+ }
  #endif
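 
  double_lock_balance()/double_unlock_balance() are moved earlier in sched.c
  (the old copies are deleted in a later hunk). The rule they implement is the
  usual ABBA avoidance: if the opportunistic trylock fails, fall back to taking
  the lower-addressed runqueue lock first. A sketch of that ordering rule in
  isolation (example_lock_two is a hypothetical function):
 
        static void example_lock_two(spinlock_t *a, spinlock_t *b)
        {
                /* A fixed order (ascending address) prevents ABBA deadlock. */
                if (a < b) {
                        spin_lock(a);
                        spin_lock_nested(b, SINGLE_DEPTH_NESTING);
                } else {
                        spin_lock(b);
                        spin_lock_nested(a, SINGLE_DEPTH_NESTING);
                }
        }
 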
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
@@@ -1845,6 -1867,8 +1867,8 @@@ void set_task_cpu(struct task_struct *p
  
        clock_offset = old_rq->clock - new_rq->clock;
  
+       trace_sched_migrate_task(p, task_cpu(p), new_cpu);
  #ifdef CONFIG_SCHEDSTATS
        if (p->se.wait_start)
                p->se.wait_start -= clock_offset;
@@@ -2254,6 -2278,7 +2278,7 @@@ static int try_to_wake_up(struct task_s
  
        smp_wmb();
        rq = task_rq_lock(p, &flags);
+       update_rq_clock(rq);
        old_state = p->state;
        if (!(old_state & state))
                goto out;
@@@ -2311,12 -2336,11 +2336,11 @@@ out_activate
                schedstat_inc(p, se.nr_wakeups_local);
        else
                schedstat_inc(p, se.nr_wakeups_remote);
-       update_rq_clock(rq);
        activate_task(rq, p, 1);
        success = 1;
  
  out_running:
-       trace_sched_wakeup(rq, p);
+       trace_sched_wakeup(rq, p, success);
        check_preempt_curr(rq, p, sync);
  
        p->state = TASK_RUNNING;
@@@ -2449,7 -2473,7 +2473,7 @@@ void wake_up_new_task(struct task_struc
                p->sched_class->task_new(rq, p);
                inc_nr_running(rq);
        }
-       trace_sched_wakeup_new(rq, p);
+       trace_sched_wakeup_new(rq, p, 1);
        check_preempt_curr(rq, p, 0);
  #ifdef CONFIG_SMP
        if (p->sched_class->task_wake_up)
@@@ -2812,40 -2836,6 +2836,6 @@@ static void double_rq_unlock(struct rq 
  }
  
  /*
-  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
-  */
- static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
-       __releases(this_rq->lock)
-       __acquires(busiest->lock)
-       __acquires(this_rq->lock)
- {
-       int ret = 0;
-       if (unlikely(!irqs_disabled())) {
-               /* printk() doesn't work good under rq->lock */
-               spin_unlock(&this_rq->lock);
-               BUG_ON(1);
-       }
-       if (unlikely(!spin_trylock(&busiest->lock))) {
-               if (busiest < this_rq) {
-                       spin_unlock(&this_rq->lock);
-                       spin_lock(&busiest->lock);
-                       spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
-                       ret = 1;
-               } else
-                       spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
-       }
-       return ret;
- }
- static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
-       __releases(busiest->lock)
- {
-       spin_unlock(&busiest->lock);
-       lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
- }
- /*
   * If dest_cpu is allowed for this process, migrate the task to it.
   * This is accomplished by forcing the cpu_allowed mask to only
   * allow dest_cpu, which will force the cpu onto dest_cpu. Then
@@@ -2862,7 -2852,6 +2852,6 @@@ static void sched_migrate_task(struct t
            || unlikely(!cpu_active(dest_cpu)))
                goto out;
  
-       trace_sched_migrate_task(rq, p, dest_cpu);
        /* force the process onto the specified CPU */
        if (migrate_task(p, dest_cpu, &req)) {
                /* Need to wait for migration thread (might exit: take ref). */
@@@ -3707,7 -3696,7 +3696,7 @@@ out_balanced
  static void idle_balance(int this_cpu, struct rq *this_rq)
  {
        struct sched_domain *sd;
-       int pulled_task = -1;
+       int pulled_task = 0;
        unsigned long next_balance = jiffies + HZ;
        cpumask_t tmpmask;
  
@@@ -5134,6 -5123,22 +5123,22 @@@ __setscheduler(struct rq *rq, struct ta
        set_load_weight(p);
  }
  
+ /*
+  * check the target process has a UID that matches the current process's
+  */
+ static bool check_same_owner(struct task_struct *p)
+ {
+       const struct cred *cred = current_cred(), *pcred;
+       bool match;
+       rcu_read_lock();
+       pcred = __task_cred(p);
+       match = (cred->euid == pcred->euid ||
+                cred->euid == pcred->uid);
+       rcu_read_unlock();
+       return match;
+ }
  static int __sched_setscheduler(struct task_struct *p, int policy,
                                struct sched_param *param, bool user)
  {
@@@ -5193,8 -5198,7 +5198,7 @@@ recheck
                        return -EPERM;
  
                /* can't change other user's priorities */
-               if ((current->euid != p->euid) &&
-                   (current->euid != p->uid))
+               if (!check_same_owner(p))
                        return -EPERM;
        }
  
@@@ -5426,8 -5430,7 +5430,7 @@@ long sched_setaffinity(pid_t pid, cons
        read_unlock(&tasklist_lock);
  
        retval = -EPERM;
-       if ((current->euid != p->euid) && (current->euid != p->uid) &&
-                       !capable(CAP_SYS_NICE))
+       if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
                goto out_unlock;
  
        retval = security_task_setscheduler(p, 0, NULL);
@@@ -5896,6 -5899,7 +5899,7 @@@ void __cpuinit init_idle(struct task_st
         * The idle tasks have their own, simple scheduling class:
         */
        idle->sched_class = &idle_sched_class;
+       ftrace_graph_init_task(idle);
  }
  
  /*
@@@ -6126,7 -6130,6 +6130,6 @@@ static int __migrate_task_irq(struct ta
  
  /*
   * Figure out where task on dead CPU should go, use force if necessary.
-  * NOTE: interrupts should be disabled by the caller
   */
  static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
  {
@@@ -6638,35 -6641,13 +6641,13 @@@ early_initcall(migration_init)
  
  #ifdef CONFIG_SCHED_DEBUG
  
- static inline const char *sd_level_to_string(enum sched_domain_level lvl)
- {
-       switch (lvl) {
-       case SD_LV_NONE:
-                       return "NONE";
-       case SD_LV_SIBLING:
-                       return "SIBLING";
-       case SD_LV_MC:
-                       return "MC";
-       case SD_LV_CPU:
-                       return "CPU";
-       case SD_LV_NODE:
-                       return "NODE";
-       case SD_LV_ALLNODES:
-                       return "ALLNODES";
-       case SD_LV_MAX:
-                       return "MAX";
-       }
-       return "MAX";
- }
  static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                                  cpumask_t *groupmask)
  {
        struct sched_group *group = sd->groups;
        char str[256];
  
 -      cpulist_scnprintf(str, sizeof(str), sd->span);
 +      cpulist_scnprintf(str, sizeof(str), &sd->span);
        cpus_clear(*groupmask);
  
        printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
                return -1;
        }
  
-       printk(KERN_CONT "span %s level %s\n",
-               str, sd_level_to_string(sd->level));
+       printk(KERN_CONT "span %s level %s\n", str, sd->name);
  
        if (!cpu_isset(cpu, sd->span)) {
                printk(KERN_ERR "ERROR: domain->span does not contain "
  
                cpus_or(*groupmask, *groupmask, group->cpumask);
  
 -              cpulist_scnprintf(str, sizeof(str), group->cpumask);
 +              cpulist_scnprintf(str, sizeof(str), &group->cpumask);
                printk(KERN_CONT " %s", str);
  
                group = group->next;
@@@ -6816,6 -6796,8 +6796,8 @@@ sd_parent_degenerate(struct sched_domai
                                SD_BALANCE_EXEC |
                                SD_SHARE_CPUPOWER |
                                SD_SHARE_PKG_RESOURCES);
+               if (nr_node_ids == 1)
+                       pflags &= ~SD_SERIALIZE;
        }
        if (~cflags & pflags)
                return 0;
@@@ -7119,7 -7101,7 +7101,7 @@@ cpu_to_phys_group(int cpu, const cpumas
  {
        int group;
  #ifdef CONFIG_SCHED_MC
 -      *mask = cpu_coregroup_map(cpu);
 +      *mask = *cpu_coregroup_mask(cpu);
        cpus_and(*mask, *mask, *cpu_map);
        group = first_cpu(*mask);
  #elif defined(CONFIG_SCHED_SMT)
@@@ -7336,13 -7318,21 +7318,21 @@@ struct allmasks 
  };
  
  #if   NR_CPUS > 128
- #define       SCHED_CPUMASK_ALLOC             1
- #define       SCHED_CPUMASK_FREE(v)           kfree(v)
- #define       SCHED_CPUMASK_DECLARE(v)        struct allmasks *v
+ #define SCHED_CPUMASK_DECLARE(v)      struct allmasks *v
+ static inline void sched_cpumask_alloc(struct allmasks **masks)
+ {
+       *masks = kmalloc(sizeof(**masks), GFP_KERNEL);
+ }
+ static inline void sched_cpumask_free(struct allmasks *masks)
+ {
+       kfree(masks);
+ }
  #else
- #define       SCHED_CPUMASK_ALLOC             0
- #define       SCHED_CPUMASK_FREE(v)
- #define       SCHED_CPUMASK_DECLARE(v)        struct allmasks _v, *v = &_v
+ #define SCHED_CPUMASK_DECLARE(v)      struct allmasks _v, *v = &_v
+ static inline void sched_cpumask_alloc(struct allmasks **masks)
+ { }
+ static inline void sched_cpumask_free(struct allmasks *masks)
+ { }
  #endif
  
  #define       SCHED_CPUMASK_VAR(v, a)         cpumask_t *v = (cpumask_t *) \
@@@ -7418,9 -7408,8 +7408,8 @@@ static int __build_sched_domains(const 
                return -ENOMEM;
        }
  
- #if SCHED_CPUMASK_ALLOC
        /* get space for all scratch cpumask variables */
-       allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
+       sched_cpumask_alloc(&allmasks);
        if (!allmasks) {
                printk(KERN_WARNING "Cannot alloc cpumask array\n");
                kfree(rd);
  #endif
                return -ENOMEM;
        }
- #endif
        tmpmask = (cpumask_t *)allmasks;
  
  
                sd = &per_cpu(core_domains, i);
                SD_INIT(sd, MC);
                set_domain_attribute(sd, attr);
 -              sd->span = cpu_coregroup_map(i);
 +              sd->span = *cpu_coregroup_mask(i);
                cpus_and(sd->span, sd->span, *cpu_map);
                sd->parent = p;
                p->child = sd;
                SCHED_CPUMASK_VAR(this_core_map, allmasks);
                SCHED_CPUMASK_VAR(send_covered, allmasks);
  
 -              *this_core_map = cpu_coregroup_map(i);
 +              *this_core_map = *cpu_coregroup_mask(i);
                cpus_and(*this_core_map, *this_core_map, *cpu_map);
                if (i != first_cpu(*this_core_map))
                        continue;
                cpu_attach_domain(sd, rd, i);
        }
  
-       SCHED_CPUMASK_FREE((void *)allmasks);
+       sched_cpumask_free(allmasks);
        return 0;
  
  #ifdef CONFIG_NUMA
  error:
        free_sched_groups(cpu_map, tmpmask);
-       SCHED_CPUMASK_FREE((void *)allmasks);
+       sched_cpumask_free(allmasks);
        kfree(rd);
        return -ENOMEM;
  #endif
@@@ -7712,8 -7701,14 +7701,14 @@@ static struct sched_domain_attr *dattr_
   */
  static cpumask_t fallback_doms;
  
- void __attribute__((weak)) arch_update_cpu_topology(void)
+ /*
+  * arch_update_cpu_topology lets virtualized architectures update the
+  * cpu core maps. It is supposed to return 1 if the topology changed
+  * or 0 if it stayed the same.
+  */
+ int __attribute__((weak)) arch_update_cpu_topology(void)
  {
+       return 0;
  }
  
  /*
@@@ -7753,8 -7748,6 +7748,6 @@@ static void detach_destroy_domains(cons
        cpumask_t tmpmask;
        int i;
  
-       unregister_sched_domain_sysctl();
        for_each_cpu_mask_nr(i, *cpu_map)
                cpu_attach_domain(NULL, &def_root_domain, i);
        synchronize_sched();
@@@ -7807,17 -7800,21 +7800,21 @@@ void partition_sched_domains(int ndoms_
                             struct sched_domain_attr *dattr_new)
  {
        int i, j, n;
+       int new_topology;
  
        mutex_lock(&sched_domains_mutex);
  
        /* always unregister in case we don't destroy any domains */
        unregister_sched_domain_sysctl();
  
+       /* Let architecture update cpu core mappings. */
+       new_topology = arch_update_cpu_topology();
        n = doms_new ? ndoms_new : 0;
  
        /* Destroy deleted domains */
        for (i = 0; i < ndoms_cur; i++) {
-               for (j = 0; j < n; j++) {
+               for (j = 0; j < n && !new_topology; j++) {
                        if (cpus_equal(doms_cur[i], doms_new[j])
                            && dattrs_equal(dattr_cur, i, dattr_new, j))
                                goto match1;
@@@ -7832,12 -7829,12 +7829,12 @@@ match1
                ndoms_cur = 0;
                doms_new = &fallback_doms;
                cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
-               dattr_new = NULL;
+               WARN_ON_ONCE(dattr_new);
        }
  
        /* Build new domains */
        for (i = 0; i < ndoms_new; i++) {
-               for (j = 0; j < ndoms_cur; j++) {
+               for (j = 0; j < ndoms_cur && !new_topology; j++) {
                        if (cpus_equal(doms_new[i], doms_cur[j])
                            && dattrs_equal(dattr_new, i, dattr_cur, j))
                                goto match2;
@@@ -8492,7 -8489,7 +8489,7 @@@ stati
  int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
  {
        struct cfs_rq *cfs_rq;
-       struct sched_entity *se, *parent_se;
+       struct sched_entity *se;
        struct rq *rq;
        int i;
  
        for_each_possible_cpu(i) {
                rq = cpu_rq(i);
  
-               cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
-                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+               cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
+                                     GFP_KERNEL, cpu_to_node(i));
                if (!cfs_rq)
                        goto err;
  
-               se = kmalloc_node(sizeof(struct sched_entity),
-                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+               se = kzalloc_node(sizeof(struct sched_entity),
+                                 GFP_KERNEL, cpu_to_node(i));
                if (!se)
                        goto err;
  
-               parent_se = parent ? parent->se[i] : NULL;
-               init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
+               init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
        }
  
        return 1;
@@@ -8580,7 -8576,7 +8576,7 @@@ stati
  int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
  {
        struct rt_rq *rt_rq;
-       struct sched_rt_entity *rt_se, *parent_se;
+       struct sched_rt_entity *rt_se;
        struct rq *rq;
        int i;
  
        for_each_possible_cpu(i) {
                rq = cpu_rq(i);
  
-               rt_rq = kmalloc_node(sizeof(struct rt_rq),
-                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+               rt_rq = kzalloc_node(sizeof(struct rt_rq),
+                                    GFP_KERNEL, cpu_to_node(i));
                if (!rt_rq)
                        goto err;
  
-               rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
-                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+               rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
+                                    GFP_KERNEL, cpu_to_node(i));
                if (!rt_se)
                        goto err;
  
-               parent_se = parent ? parent->rt_se[i] : NULL;
-               init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
+               init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
        }
  
        return 1;
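 
  alloc_fair_sched_group() and alloc_rt_sched_group() switch from
  kmalloc_node(..., GFP_KERNEL|__GFP_ZERO, node) to kzalloc_node(..., GFP_KERNEL,
  node). The equivalence being relied on, sketched as it is commonly defined
  (an assumption, not quoted from <linux/slab.h>):
 
        /* Zeroing, node-local allocation: kzalloc_node is essentially this. */
        static inline void *example_kzalloc_node(size_t size, gfp_t flags, int node)
        {
                return kmalloc_node(size, flags | __GFP_ZERO, node);
        }
 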
@@@ -9251,11 -9246,12 +9246,12 @@@ struct cgroup_subsys cpu_cgroup_subsys 
   * (balbir@in.ibm.com).
   */
  
- /* track cpu usage of a group of tasks */
+ /* track cpu usage of a group of tasks and its child groups */
  struct cpuacct {
        struct cgroup_subsys_state css;
        /* cpuusage holds pointer to a u64-type object on every cpu */
        u64 *cpuusage;
+       struct cpuacct *parent;
  };
  
  struct cgroup_subsys cpuacct_subsys;
@@@ -9289,6 -9285,9 +9285,9 @@@ static struct cgroup_subsys_state *cpua
                return ERR_PTR(-ENOMEM);
        }
  
+       if (cgrp->parent)
+               ca->parent = cgroup_ca(cgrp->parent);
        return &ca->css;
  }
  
@@@ -9302,6 -9301,41 +9301,41 @@@ cpuacct_destroy(struct cgroup_subsys *s
        kfree(ca);
  }
  
+ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
+ {
+       u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+       u64 data;
+ #ifndef CONFIG_64BIT
+       /*
+        * Take rq->lock to make 64-bit read safe on 32-bit platforms.
+        */
+       spin_lock_irq(&cpu_rq(cpu)->lock);
+       data = *cpuusage;
+       spin_unlock_irq(&cpu_rq(cpu)->lock);
+ #else
+       data = *cpuusage;
+ #endif
+       return data;
+ }
+ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+ {
+       u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+ #ifndef CONFIG_64BIT
+       /*
+        * Take rq->lock to make 64-bit write safe on 32-bit platforms.
+        */
+       spin_lock_irq(&cpu_rq(cpu)->lock);
+       *cpuusage = val;
+       spin_unlock_irq(&cpu_rq(cpu)->lock);
+ #else
+       *cpuusage = val;
+ #endif
+ }
  /* return total cpu usage (in nanoseconds) of a group */
  static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
  {
        u64 totalcpuusage = 0;
        int i;
  
-       for_each_possible_cpu(i) {
-               u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
-               /*
-                * Take rq->lock to make 64-bit addition safe on 32-bit
-                * platforms.
-                */
-               spin_lock_irq(&cpu_rq(i)->lock);
-               totalcpuusage += *cpuusage;
-               spin_unlock_irq(&cpu_rq(i)->lock);
-       }
+       for_each_present_cpu(i)
+               totalcpuusage += cpuacct_cpuusage_read(ca, i);
  
        return totalcpuusage;
  }
@@@ -9336,23 -9361,39 +9361,39 @@@ static int cpuusage_write(struct cgrou
                goto out;
        }
  
-       for_each_possible_cpu(i) {
-               u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
+       for_each_present_cpu(i)
+               cpuacct_cpuusage_write(ca, i, 0);
  
-               spin_lock_irq(&cpu_rq(i)->lock);
-               *cpuusage = 0;
-               spin_unlock_irq(&cpu_rq(i)->lock);
-       }
  out:
        return err;
  }
  
+ static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
+                                  struct seq_file *m)
+ {
+       struct cpuacct *ca = cgroup_ca(cgroup);
+       u64 percpu;
+       int i;
+       for_each_present_cpu(i) {
+               percpu = cpuacct_cpuusage_read(ca, i);
+               seq_printf(m, "%llu ", (unsigned long long) percpu);
+       }
+       seq_printf(m, "\n");
+       return 0;
+ }
  static struct cftype files[] = {
        {
                .name = "usage",
                .read_u64 = cpuusage_read,
                .write_u64 = cpuusage_write,
        },
+       {
+               .name = "usage_percpu",
+               .read_seq_string = cpuacct_percpu_seq_read,
+       },
  };
  
  static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
  static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
  {
        struct cpuacct *ca;
+       int cpu;
  
        if (!cpuacct_subsys.active)
                return;
  
+       cpu = task_cpu(tsk);
        ca = task_ca(tsk);
-       if (ca) {
-               u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
  
+       for (; ca; ca = ca->parent) {
+               u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
                *cpuusage += cputime;
        }
  }
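
With the new parent pointer, cpuacct_charge() walks up the group hierarchy so a child's usage is also accounted to every ancestor. A standalone sketch of that walk (plain structs, no cgroup code):

#include <stdint.h>
#include <stdio.h>

/* Plain-struct sketch of the hierarchical charge: the amount is added
 * to the task's group and to every ancestor, mirroring the
 * "for (; ca; ca = ca->parent)" loop above.  Not cgroup code. */
struct group {
        const char *name;
        uint64_t usage;
        struct group *parent;
};

static void charge(struct group *g, uint64_t cputime)
{
        for (; g; g = g->parent)
                g->usage += cputime;
}

int main(void)
{
        struct group root  = { "root",  0, NULL };
        struct group child = { "child", 0, &root };

        charge(&child, 1000);   /* accounted to child and to root */
        printf("%s=%llu %s=%llu\n",
               root.name,  (unsigned long long)root.usage,
               child.name, (unsigned long long)child.usage);
        return 0;
}
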
diff --combined kernel/sched_stats.h
@@@ -31,7 -31,7 +31,7 @@@ static int show_schedstat(struct seq_fi
                    rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
                    rq->sched_switch, rq->sched_count, rq->sched_goidle,
                    rq->ttwu_count, rq->ttwu_local,
-                   rq->rq_sched_info.cpu_time,
+                   rq->rq_cpu_time,
                    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
  
                seq_printf(seq, "\n");
@@@ -42,7 -42,7 +42,7 @@@
                for_each_domain(cpu, sd) {
                        enum cpu_idle_type itype;
  
 -                      cpumask_scnprintf(mask_str, mask_len, sd->span);
 +                      cpumask_scnprintf(mask_str, mask_len, &sd->span);
                        seq_printf(seq, "domain%d %s", dcount++, mask_str);
                        for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
                                        itype++) {
@@@ -123,7 -123,7 +123,7 @@@ static inline voi
  rq_sched_info_depart(struct rq *rq, unsigned long long delta)
  {
        if (rq)
-               rq->rq_sched_info.cpu_time += delta;
+               rq->rq_cpu_time += delta;
  }
  
  static inline void
@@@ -236,7 -236,6 +236,6 @@@ static inline void sched_info_depart(st
        unsigned long long delta = task_rq(t)->clock -
                                        t->sched_info.last_arrival;
  
-       t->sched_info.cpu_time += delta;
        rq_sched_info_depart(task_rq(t), delta);
  
        if (t->state == TASK_RUNNING)
diff --combined kernel/trace/trace.c
@@@ -30,6 -30,7 +30,7 @@@
  #include <linux/gfp.h>
  #include <linux/fs.h>
  #include <linux/kprobes.h>
+ #include <linux/seq_file.h>
  #include <linux/writeback.h>
  
  #include <linux/stacktrace.h>
  unsigned long __read_mostly   tracing_max_latency = (cycle_t)ULONG_MAX;
  unsigned long __read_mostly   tracing_thresh;
  
+ /*
+  * We need to change this state when a selftest is running.
+  * A selftest will look into the ring buffer to count the
+  * entries it inserted, but concurrent insertions into the
+  * ring buffer (such as from ftrace_printk) could occur at the
+  * same time, giving false positive or negative results.
+  */
+ static bool __read_mostly tracing_selftest_running;
+ /* For tracers that don't implement custom flags */
+ static struct tracer_opt dummy_tracer_opt[] = {
+       { }
+ };
+ static struct tracer_flags dummy_tracer_flags = {
+       .val = 0,
+       .opts = dummy_tracer_opt
+ };
+ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
+ {
+       return 0;
+ }
+ /*
+  * Kill all tracing for good (never come back).
+  * It is initialized to 1 and turns to zero only when the tracer
+  * core initializes successfully; nothing else ever clears it.
+  */
+ int tracing_disabled = 1;
  static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
  
  static inline void ftrace_disable_cpu(void)
@@@ -62,7 -95,36 +95,36 @@@ static cpumask_t __read_mostly              tracing
  #define for_each_tracing_cpu(cpu)     \
        for_each_cpu_mask(cpu, tracing_buffer_mask)
  
- static int tracing_disabled = 1;
+ /*
+  * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
+  *
+  * If there is an oops (or kernel panic) and ftrace_dump_on_oops
+  * is set, then ftrace_dump is called. This will output the contents
+  * of the ftrace buffers to the console.  This is very useful for
+  * capturing traces that lead to crashes and outputting them to a
+  * serial console.
+  *
+  * It is off by default; you can enable it either by specifying
+  * "ftrace_dump_on_oops" on the kernel command line or by setting
+  * /proc/sys/kernel/ftrace_dump_on_oops to true.
+  */
+ int ftrace_dump_on_oops;
+ static int tracing_set_tracer(char *buf);
+ static int __init set_ftrace(char *str)
+ {
+       tracing_set_tracer(str);
+       return 1;
+ }
+ __setup("ftrace", set_ftrace);
+ static int __init set_ftrace_dump_on_oops(char *str)
+ {
+       ftrace_dump_on_oops = 1;
+       return 1;
+ }
+ __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
  
  long
  ns2usecs(cycle_t nsec)
@@@ -112,6 -174,19 +174,19 @@@ static DEFINE_PER_CPU(struct trace_arra
  /* tracer_enabled is used to toggle activation of a tracer */
  static int                    tracer_enabled = 1;
  
+ /**
+  * tracing_is_enabled - return tracer_enabled status
+  *
+  * This function is used by other tracers to know the status
+  * of the tracer_enabled flag.  Tracers may use this function
+  * to decide whether to enable their features when starting
+  * up. See the irqsoff tracer for an example (start_irqsoff_tracer).
+  */
+ int tracing_is_enabled(void)
+ {
+       return tracer_enabled;
+ }
  /* function tracing enabled */
  int                           ftrace_function_enabled;
  
@@@ -153,8 -228,9 +228,9 @@@ static DEFINE_MUTEX(trace_types_lock)
  /* trace_wait is a waitqueue for tasks blocked on trace_poll */
  static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
  
- /* trace_flags holds iter_ctrl options */
- unsigned long trace_flags = TRACE_ITER_PRINT_PARENT;
+ /* trace_flags holds trace_options default values */
+ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
+       TRACE_ITER_ANNOTATE;
  
  /**
   * trace_wake_up - wake up tasks waiting for trace input
@@@ -193,13 -269,6 +269,6 @@@ unsigned long nsecs_to_usecs(unsigned l
        return nsecs / 1000;
  }
  
- /*
-  * TRACE_ITER_SYM_MASK masks the options in trace_flags that
-  * control the output of kernel symbols.
-  */
- #define TRACE_ITER_SYM_MASK \
-       (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
  /* These must match the bit positions in trace_iterator_flags */
  static const char *trace_options[] = {
        "print-parent",
        "stacktrace",
        "sched-tree",
        "ftrace_printk",
+       "ftrace_preempt",
+       "branch",
+       "annotate",
+       "userstacktrace",
+       "sym-userobj",
+       "printk-msg-only",
        NULL
  };
  
@@@ -246,7 -321,7 +321,7 @@@ __update_max_tr(struct trace_array *tr
  
        memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
        data->pid = tsk->pid;
-       data->uid = tsk->uid;
+       data->uid = task_uid(tsk);
        data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
        data->policy = tsk->policy;
        data->rt_priority = tsk->rt_priority;
@@@ -359,6 -434,28 +434,28 @@@ trace_seq_putmem_hex(struct trace_seq *
        return trace_seq_putmem(s, hex, j);
  }
  
+ static int
+ trace_seq_path(struct trace_seq *s, struct path *path)
+ {
+       unsigned char *p;
+       if (s->len >= (PAGE_SIZE - 1))
+               return 0;
+       p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
+       if (!IS_ERR(p)) {
+               p = mangle_path(s->buffer + s->len, p, "\n");
+               if (p) {
+                       s->len = p - s->buffer;
+                       return 1;
+               }
+       } else {
+               s->buffer[s->len++] = '?';
+               return 1;
+       }
+       return 0;
+ }
  static void
  trace_seq_reset(struct trace_seq *s)
  {
@@@ -470,7 -567,17 +567,17 @@@ int register_tracer(struct tracer *type
                return -1;
        }
  
+       /*
+        * When this gets called we hold the BKL which means that
+        * preemption is disabled. Various trace selftests however
+        * need to disable and enable preemption for successful tests.
+        * So we drop the BKL here and grab it again after the tests.
+        */
+       unlock_kernel();
        mutex_lock(&trace_types_lock);
+       tracing_selftest_running = true;
        for (t = trace_types; t; t = t->next) {
                if (strcmp(type->name, t->name) == 0) {
                        /* already found */
                }
        }
  
+       if (!type->set_flag)
+               type->set_flag = &dummy_set_flag;
+       if (!type->flags)
+               type->flags = &dummy_tracer_flags;
+       else
+               if (!type->flags->opts)
+                       type->flags->opts = dummy_tracer_opt;
  #ifdef CONFIG_FTRACE_STARTUP_TEST
        if (type->selftest) {
                struct tracer *saved_tracer = current_trace;
                struct trace_array *tr = &global_trace;
-               int saved_ctrl = tr->ctrl;
                int i;
                /*
                 * Run a selftest on this tracer.
                 * Here we reset the trace buffer, and set the current
                 * internal tracing to verify that everything is in order.
                 * If we fail, we do not register this tracer.
                 */
-               for_each_tracing_cpu(i) {
+               for_each_tracing_cpu(i)
                        tracing_reset(tr, i);
-               }
                current_trace = type;
-               tr->ctrl = 0;
                /* the test is responsible for initializing and enabling */
                pr_info("Testing tracer %s: ", type->name);
                ret = type->selftest(type, tr);
                /* the test is responsible for resetting too */
                current_trace = saved_tracer;
-               tr->ctrl = saved_ctrl;
                if (ret) {
                        printk(KERN_CONT "FAILED!\n");
                        goto out;
                }
                /* Only reset on passing, to avoid touching corrupted buffers */
-               for_each_tracing_cpu(i) {
+               for_each_tracing_cpu(i)
                        tracing_reset(tr, i);
-               }
                printk(KERN_CONT "PASSED\n");
        }
  #endif
                max_tracer_type_len = len;
  
   out:
+       tracing_selftest_running = false;
        mutex_unlock(&trace_types_lock);
+       lock_kernel();
  
        return ret;
  }
@@@ -564,6 -679,16 +679,16 @@@ void tracing_reset(struct trace_array *
        ftrace_enable_cpu();
  }
  
+ void tracing_reset_online_cpus(struct trace_array *tr)
+ {
+       int cpu;
+       tr->time_start = ftrace_now(tr->cpu);
+       for_each_online_cpu(cpu)
+               tracing_reset(tr, cpu);
+ }
  #define SAVED_CMDLINES 128
  static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
  static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
@@@ -581,6 -706,91 +706,91 @@@ static void trace_init_cmdlines(void
        cmdline_idx = 0;
  }
  
+ static int trace_stop_count;
+ static DEFINE_SPINLOCK(tracing_start_lock);
+ /**
+  * ftrace_off_permanent - disable all ftrace code permanently
+  *
+  * This should only be called when a serious anomaly has
+  * been detected.  This will turn off function tracing,
+  * ring buffers, and other tracing utilities. It takes no
+  * locks and can be called from any context.
+  */
+ void ftrace_off_permanent(void)
+ {
+       tracing_disabled = 1;
+       ftrace_stop();
+       tracing_off_permanent();
+ }
+ /**
+  * tracing_start - quick start of the tracer
+  *
+  * If tracing is enabled but was stopped by tracing_stop,
+  * this will start the tracer back up.
+  */
+ void tracing_start(void)
+ {
+       struct ring_buffer *buffer;
+       unsigned long flags;
+       if (tracing_disabled)
+               return;
+       spin_lock_irqsave(&tracing_start_lock, flags);
+       if (--trace_stop_count)
+               goto out;
+       if (trace_stop_count < 0) {
+               /* Someone screwed up their debugging */
+               WARN_ON_ONCE(1);
+               trace_stop_count = 0;
+               goto out;
+       }
+       buffer = global_trace.buffer;
+       if (buffer)
+               ring_buffer_record_enable(buffer);
+       buffer = max_tr.buffer;
+       if (buffer)
+               ring_buffer_record_enable(buffer);
+       ftrace_start();
+  out:
+       spin_unlock_irqrestore(&tracing_start_lock, flags);
+ }
+ /**
+  * tracing_stop - quick stop of the tracer
+  *
+  * Lightweight way to stop tracing. Use in conjunction with
+  * tracing_start.
+  */
+ void tracing_stop(void)
+ {
+       struct ring_buffer *buffer;
+       unsigned long flags;
+       ftrace_stop();
+       spin_lock_irqsave(&tracing_start_lock, flags);
+       if (trace_stop_count++)
+               goto out;
+       buffer = global_trace.buffer;
+       if (buffer)
+               ring_buffer_record_disable(buffer);
+       buffer = max_tr.buffer;
+       if (buffer)
+               ring_buffer_record_disable(buffer);
+  out:
+       spin_unlock_irqrestore(&tracing_start_lock, flags);
+ }
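
tracing_start() and tracing_stop() nest via trace_stop_count: only the first stop disables the ring buffers and only the matching final start re-enables them. A toy model of that counting scheme (locking and the ring-buffer enable/disable calls omitted):

#include <stdio.h>

static int stop_count;
static int recording = 1;

static void my_stop(void)
{
        if (stop_count++ == 0)
                recording = 0;
}

static void my_start(void)
{
        if (--stop_count > 0)
                return;                 /* an outer caller still holds a stop */
        if (stop_count < 0) {
                stop_count = 0;         /* unbalanced start, like the WARN above */
                return;
        }
        recording = 1;
}

int main(void)
{
        my_stop();
        my_stop();
        my_start();
        printf("recording=%d (still stopped)\n", recording);
        my_start();
        printf("recording=%d (running again)\n", recording);
        return 0;
}
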
  void trace_stop_cmdline_recording(void);
  
  static void trace_save_cmdline(struct task_struct *tsk)
        spin_unlock(&trace_cmdline_lock);
  }
  
- static char *trace_find_cmdline(int pid)
+ char *trace_find_cmdline(int pid)
  {
        char *cmdline = "<...>";
        unsigned map;
@@@ -655,6 -865,7 +865,7 @@@ tracing_generic_entry_update(struct tra
  
        entry->preempt_count            = pc & 0xff;
        entry->pid                      = (tsk) ? tsk->pid : 0;
+       entry->tgid                     = (tsk) ? tsk->tgid : 0;
        entry->flags =
  #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
                (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@@ -691,6 -902,56 +902,56 @@@ trace_function(struct trace_array *tr, 
        ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
  }
  
+ #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ static void __trace_graph_entry(struct trace_array *tr,
+                               struct trace_array_cpu *data,
+                               struct ftrace_graph_ent *trace,
+                               unsigned long flags,
+                               int pc)
+ {
+       struct ring_buffer_event *event;
+       struct ftrace_graph_ent_entry *entry;
+       unsigned long irq_flags;
+       if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+               return;
+       event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
+                                        &irq_flags);
+       if (!event)
+               return;
+       entry   = ring_buffer_event_data(event);
+       tracing_generic_entry_update(&entry->ent, flags, pc);
+       entry->ent.type                 = TRACE_GRAPH_ENT;
+       entry->graph_ent                        = *trace;
+       ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
+ }
+ static void __trace_graph_return(struct trace_array *tr,
+                               struct trace_array_cpu *data,
+                               struct ftrace_graph_ret *trace,
+                               unsigned long flags,
+                               int pc)
+ {
+       struct ring_buffer_event *event;
+       struct ftrace_graph_ret_entry *entry;
+       unsigned long irq_flags;
+       if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+               return;
+       event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
+                                        &irq_flags);
+       if (!event)
+               return;
+       entry   = ring_buffer_event_data(event);
+       tracing_generic_entry_update(&entry->ent, flags, pc);
+       entry->ent.type                 = TRACE_GRAPH_RET;
+       entry->ret                              = *trace;
+       ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
+ }
+ #endif
  void
  ftrace(struct trace_array *tr, struct trace_array_cpu *data,
         unsigned long ip, unsigned long parent_ip, unsigned long flags,
@@@ -742,6 -1003,46 +1003,46 @@@ void __trace_stack(struct trace_array *
        ftrace_trace_stack(tr, data, flags, skip, preempt_count());
  }
  
+ static void ftrace_trace_userstack(struct trace_array *tr,
+                  struct trace_array_cpu *data,
+                  unsigned long flags, int pc)
+ {
+ #ifdef CONFIG_STACKTRACE
+       struct ring_buffer_event *event;
+       struct userstack_entry *entry;
+       struct stack_trace trace;
+       unsigned long irq_flags;
+       if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
+               return;
+       event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+                                        &irq_flags);
+       if (!event)
+               return;
+       entry   = ring_buffer_event_data(event);
+       tracing_generic_entry_update(&entry->ent, flags, pc);
+       entry->ent.type         = TRACE_USER_STACK;
+       memset(&entry->caller, 0, sizeof(entry->caller));
+       trace.nr_entries        = 0;
+       trace.max_entries       = FTRACE_STACK_ENTRIES;
+       trace.skip              = 0;
+       trace.entries           = entry->caller;
+       save_stack_trace_user(&trace);
+       ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+ #endif
+ }
+ void __trace_userstack(struct trace_array *tr,
+                  struct trace_array_cpu *data,
+                  unsigned long flags)
+ {
+       ftrace_trace_userstack(tr, data, flags, preempt_count());
+ }
  static void
  ftrace_trace_special(void *__tr, void *__data,
                     unsigned long arg1, unsigned long arg2, unsigned long arg3,
        entry->arg3                     = arg3;
        ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
        ftrace_trace_stack(tr, data, irq_flags, 4, pc);
+       ftrace_trace_userstack(tr, data, irq_flags, pc);
  
        trace_wake_up();
  }
@@@ -803,6 -1105,7 +1105,7 @@@ tracing_sched_switch_trace(struct trace
        entry->next_cpu = task_cpu(next);
        ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
        ftrace_trace_stack(tr, data, flags, 5, pc);
+       ftrace_trace_userstack(tr, data, flags, pc);
  }
  
  void
@@@ -832,6 -1135,7 +1135,7 @@@ tracing_sched_wakeup_trace(struct trace
        entry->next_cpu                 = task_cpu(wakee);
        ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
        ftrace_trace_stack(tr, data, flags, 6, pc);
+       ftrace_trace_userstack(tr, data, flags, pc);
  
        trace_wake_up();
  }
@@@ -841,26 -1145,28 +1145,28 @@@ ftrace_special(unsigned long arg1, unsi
  {
        struct trace_array *tr = &global_trace;
        struct trace_array_cpu *data;
+       unsigned long flags;
        int cpu;
        int pc;
  
-       if (tracing_disabled || !tr->ctrl)
+       if (tracing_disabled)
                return;
  
        pc = preempt_count();
-       preempt_disable_notrace();
+       local_irq_save(flags);
        cpu = raw_smp_processor_id();
        data = tr->data[cpu];
  
-       if (likely(!atomic_read(&data->disabled)))
+       if (likely(atomic_inc_return(&data->disabled) == 1))
                ftrace_trace_special(tr, data, arg1, arg2, arg3, pc);
  
-       preempt_enable_notrace();
+       atomic_dec(&data->disabled);
+       local_irq_restore(flags);
  }
  
  #ifdef CONFIG_FUNCTION_TRACER
  static void
- function_trace_call(unsigned long ip, unsigned long parent_ip)
+ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
  {
        struct trace_array *tr = &global_trace;
        struct trace_array_cpu *data;
                return;
  
        pc = preempt_count();
-       resched = need_resched();
-       preempt_disable_notrace();
+       resched = ftrace_preempt_disable();
        local_save_flags(flags);
        cpu = raw_smp_processor_id();
        data = tr->data[cpu];
                trace_function(tr, data, ip, parent_ip, flags, pc);
  
        atomic_dec(&data->disabled);
-       if (resched)
-               preempt_enable_no_resched_notrace();
-       else
-               preempt_enable_notrace();
+       ftrace_preempt_enable(resched);
  }
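
ftrace_preempt_disable()/ftrace_preempt_enable() used above are ftrace helpers that package exactly the open-coded sequence this hunk removes: remember whether need_resched() was already pending, then pick the matching notrace re-enable. A sketch of what the pair amounts to, reconstructed from the removed lines (the real definitions live in the tracing headers and may differ in detail):

static inline int ftrace_preempt_disable(void)
{
        int resched = need_resched();

        preempt_disable_notrace();
        return resched;
}

static inline void ftrace_preempt_enable(int resched)
{
        if (resched)
                preempt_enable_no_resched_notrace();
        else
                preempt_enable_notrace();
}
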
  
+ static void
+ function_trace_call(unsigned long ip, unsigned long parent_ip)
+ {
+       struct trace_array *tr = &global_trace;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       long disabled;
+       int cpu;
+       int pc;
+       if (unlikely(!ftrace_function_enabled))
+               return;
+       /*
+        * Need to use raw, since this must be called before the
+        * recursive protection is performed.
+        */
+       local_irq_save(flags);
+       cpu = raw_smp_processor_id();
+       data = tr->data[cpu];
+       disabled = atomic_inc_return(&data->disabled);
+       if (likely(disabled == 1)) {
+               pc = preempt_count();
+               trace_function(tr, data, ip, parent_ip, flags, pc);
+       }
+       atomic_dec(&data->disabled);
+       local_irq_restore(flags);
+ }
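
function_trace_call() and the graph hooks below share the same reentrancy guard: bump a per-cpu "disabled" counter and only do work when this was the first (outermost) entry on that CPU. A single-threaded userspace model of the idea (the kernel uses a per-cpu atomic_t with interrupts disabled; this shows only the control flow):

#include <stdio.h>

static int disabled;
static int events;

static void trace_event(int depth)
{
        if (++disabled == 1) {
                events++;                        /* real work, once */
                if (depth)
                        trace_event(depth - 1);  /* nested entry is ignored */
        }
        --disabled;
}

int main(void)
{
        trace_event(3);
        printf("events recorded: %d\n", events); /* prints 1 */
        return 0;
}
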
+ #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ int trace_graph_entry(struct ftrace_graph_ent *trace)
+ {
+       struct trace_array *tr = &global_trace;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       long disabled;
+       int cpu;
+       int pc;
+       if (!ftrace_trace_task(current))
+               return 0;
+       if (!ftrace_graph_addr(trace->func))
+               return 0;
+       local_irq_save(flags);
+       cpu = raw_smp_processor_id();
+       data = tr->data[cpu];
+       disabled = atomic_inc_return(&data->disabled);
+       if (likely(disabled == 1)) {
+               pc = preempt_count();
+               __trace_graph_entry(tr, data, trace, flags, pc);
+       }
+       /* Only do the atomic if it is not already set */
+       if (!test_tsk_trace_graph(current))
+               set_tsk_trace_graph(current);
+       atomic_dec(&data->disabled);
+       local_irq_restore(flags);
+       return 1;
+ }
+ void trace_graph_return(struct ftrace_graph_ret *trace)
+ {
+       struct trace_array *tr = &global_trace;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       long disabled;
+       int cpu;
+       int pc;
+       local_irq_save(flags);
+       cpu = raw_smp_processor_id();
+       data = tr->data[cpu];
+       disabled = atomic_inc_return(&data->disabled);
+       if (likely(disabled == 1)) {
+               pc = preempt_count();
+               __trace_graph_return(tr, data, trace, flags, pc);
+       }
+       if (!trace->depth)
+               clear_tsk_trace_graph(current);
+       atomic_dec(&data->disabled);
+       local_irq_restore(flags);
+ }
+ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
  static struct ftrace_ops trace_ops __read_mostly =
  {
        .func = function_trace_call,
  void tracing_start_function_trace(void)
  {
        ftrace_function_enabled = 0;
+       if (trace_flags & TRACE_ITER_PREEMPTONLY)
+               trace_ops.func = function_trace_call_preempt_only;
+       else
+               trace_ops.func = function_trace_call;
        register_ftrace_function(&trace_ops);
-       if (tracer_enabled)
-               ftrace_function_enabled = 1;
+       ftrace_function_enabled = 1;
  }
  
  void tracing_stop_function_trace(void)
  
  enum trace_file_type {
        TRACE_FILE_LAT_FMT      = 1,
+       TRACE_FILE_ANNOTATE     = 2,
  };
  
  static void trace_iterator_increment(struct trace_iterator *iter, int cpu)
@@@ -1047,10 -1443,6 +1443,6 @@@ static void *s_start(struct seq_file *m
  
        atomic_inc(&trace_record_cmdline_disabled);
  
-       /* let the tracer grab locks here if needed */
-       if (current_trace->start)
-               current_trace->start(iter);
        if (*pos != iter->pos) {
                iter->ent = NULL;
                iter->cpu = 0;
  
  static void s_stop(struct seq_file *m, void *p)
  {
-       struct trace_iterator *iter = m->private;
        atomic_dec(&trace_record_cmdline_disabled);
-       /* let the tracer release locks here if needed */
-       if (current_trace && current_trace == iter->trace && iter->trace->stop)
-               iter->trace->stop(iter);
        mutex_unlock(&trace_types_lock);
  }
  
@@@ -1143,7 -1528,7 +1528,7 @@@ seq_print_sym_offset(struct trace_seq *
  # define IP_FMT "%016lx"
  #endif
  
- static int
+ int
  seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
  {
        int ret;
        return ret;
  }
  
+ static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
+                                   unsigned long ip, unsigned long sym_flags)
+ {
+       struct file *file = NULL;
+       unsigned long vmstart = 0;
+       int ret = 1;
+       if (mm) {
+               const struct vm_area_struct *vma;
+               down_read(&mm->mmap_sem);
+               vma = find_vma(mm, ip);
+               if (vma) {
+                       file = vma->vm_file;
+                       vmstart = vma->vm_start;
+               }
+               if (file) {
+                       ret = trace_seq_path(s, &file->f_path);
+                       if (ret)
+                               ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart);
+               }
+               up_read(&mm->mmap_sem);
+       }
+       if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
+               ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
+       return ret;
+ }
+ static int
+ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
+                     unsigned long sym_flags)
+ {
+       struct mm_struct *mm = NULL;
+       int ret = 1;
+       unsigned int i;
+       if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
+               struct task_struct *task;
+               /*
+                * we do the lookup on the thread group leader,
+                * since individual threads might have already quit!
+                */
+               rcu_read_lock();
+               task = find_task_by_vpid(entry->ent.tgid);
+               if (task)
+                       mm = get_task_mm(task);
+               rcu_read_unlock();
+       }
+       for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+               unsigned long ip = entry->caller[i];
+               if (ip == ULONG_MAX || !ret)
+                       break;
+               if (i && ret)
+                       ret = trace_seq_puts(s, " <- ");
+               if (!ip) {
+                       if (ret)
+                               ret = trace_seq_puts(s, "??");
+                       continue;
+               }
+               if (!ret)
+                       break;
+               if (ret)
+                       ret = seq_print_user_ip(s, mm, ip, sym_flags);
+       }
+       if (mm)
+               mmput(mm);
+       return ret;
+ }
  static void print_lat_help_header(struct seq_file *m)
  {
        seq_puts(m, "#                  _------=> CPU#            \n");
@@@ -1301,6 -1758,13 +1758,13 @@@ lat_print_timestamp(struct trace_seq *s
  
  static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
  
+ static int task_state_char(unsigned long state)
+ {
+       int bit = state ? __ffs(state) + 1 : 0;
+       return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
+ }
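
task_state_char() maps the lowest set state bit to a character from TASK_STATE_TO_CHAR_STR, falling back to '?'. A quick standalone illustration; the state string below mirrors the 2.6.28-era "RSDTtZX" and should be treated as an assumption, since the kernel definition can change:

#include <stdio.h>
#include <strings.h>    /* ffs() */

static const char state_to_char[] = "RSDTtZX";

static int task_state_char(unsigned long state)
{
        int bit = state ? ffs((int)state) : 0;   /* ffs() == __ffs() + 1 */

        return bit < (int)sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
}

int main(void)
{
        printf("%c %c %c\n",
               task_state_char(0),   /* running -> 'R' */
               task_state_char(1),   /* TASK_INTERRUPTIBLE -> 'S' */
               task_state_char(2));  /* TASK_UNINTERRUPTIBLE -> 'D' */
        return 0;
}
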
  /*
   * The message is supposed to contain an ending newline.
   * If the printing stops prematurely, try to add a newline of our own.
@@@ -1338,6 -1802,23 +1802,23 @@@ void trace_seq_print_cont(struct trace_
                trace_seq_putc(s, '\n');
  }
  
+ static void test_cpu_buff_start(struct trace_iterator *iter)
+ {
+       struct trace_seq *s = &iter->seq;
+       if (!(trace_flags & TRACE_ITER_ANNOTATE))
+               return;
+       if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
+               return;
+       if (cpu_isset(iter->cpu, iter->started))
+               return;
+       cpu_set(iter->cpu, iter->started);
+       trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu);
+ }
  static enum print_line_t
  print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
  {
        char *comm;
        int S, T;
        int i;
-       unsigned state;
  
        if (entry->type == TRACE_CONT)
                return TRACE_TYPE_HANDLED;
  
+       test_cpu_buff_start(iter);
        next_entry = find_next_entry(iter, NULL, &next_ts);
        if (!next_entry)
                next_ts = iter->ts;
  
                trace_assign_type(field, entry);
  
-               T = field->next_state < sizeof(state_to_char) ?
-                       state_to_char[field->next_state] : 'X';
-               state = field->prev_state ?
-                       __ffs(field->prev_state) + 1 : 0;
-               S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X';
+               T = task_state_char(field->next_state);
+               S = task_state_char(field->prev_state);
                comm = trace_find_cmdline(field->next_pid);
                trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
                                 field->prev_pid,
                        trace_seq_print_cont(s, iter);
                break;
        }
+       case TRACE_BRANCH: {
+               struct trace_branch *field;
+               trace_assign_type(field, entry);
+               trace_seq_printf(s, "[%s] %s:%s:%d\n",
+                                field->correct ? "  ok  " : " MISS ",
+                                field->func,
+                                field->file,
+                                field->line);
+               break;
+       }
+       case TRACE_USER_STACK: {
+               struct userstack_entry *field;
+               trace_assign_type(field, entry);
+               seq_print_userip_objs(field, s, sym_flags);
+               trace_seq_putc(s, '\n');
+               break;
+       }
        default:
                trace_seq_printf(s, "Unknown type %d\n", entry->type);
        }
@@@ -1472,6 -1971,8 +1971,8 @@@ static enum print_line_t print_trace_fm
        if (entry->type == TRACE_CONT)
                return TRACE_TYPE_HANDLED;
  
+       test_cpu_buff_start(iter);
        comm = trace_find_cmdline(iter->ent->pid);
  
        t = ns2usecs(iter->ts);
  
                trace_assign_type(field, entry);
  
-               S = field->prev_state < sizeof(state_to_char) ?
-                       state_to_char[field->prev_state] : 'X';
-               T = field->next_state < sizeof(state_to_char) ?
-                       state_to_char[field->next_state] : 'X';
+               T = task_state_char(field->next_state);
+               S = task_state_char(field->prev_state);
                ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n",
                                       field->prev_pid,
                                       field->prev_prio,
                        trace_seq_print_cont(s, iter);
                break;
        }
+       case TRACE_GRAPH_RET: {
+               return print_graph_function(iter);
+       }
+       case TRACE_GRAPH_ENT: {
+               return print_graph_function(iter);
+       }
+       case TRACE_BRANCH: {
+               struct trace_branch *field;
+               trace_assign_type(field, entry);
+               trace_seq_printf(s, "[%s] %s:%s:%d\n",
+                                field->correct ? "  ok  " : " MISS ",
+                                field->func,
+                                field->file,
+                                field->line);
+               break;
+       }
+       case TRACE_USER_STACK: {
+               struct userstack_entry *field;
+               trace_assign_type(field, entry);
+               ret = seq_print_userip_objs(field, s, sym_flags);
+               if (!ret)
+                       return TRACE_TYPE_PARTIAL_LINE;
+               ret = trace_seq_putc(s, '\n');
+               if (!ret)
+                       return TRACE_TYPE_PARTIAL_LINE;
+               break;
+       }
        }
        return TRACE_TYPE_HANDLED;
  }
@@@ -1621,12 -2151,9 +2151,9 @@@ static enum print_line_t print_raw_fmt(
  
                trace_assign_type(field, entry);
  
-               S = field->prev_state < sizeof(state_to_char) ?
-                       state_to_char[field->prev_state] : 'X';
-               T = field->next_state < sizeof(state_to_char) ?
-                       state_to_char[field->next_state] : 'X';
-               if (entry->type == TRACE_WAKE)
-                       S = '+';
+               T = task_state_char(field->next_state);
+               S = entry->type == TRACE_WAKE ? '+' :
+                       task_state_char(field->prev_state);
                ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n",
                                       field->prev_pid,
                                       field->prev_prio,
                break;
        }
        case TRACE_SPECIAL:
+       case TRACE_USER_STACK:
        case TRACE_STACK: {
                struct special_entry *field;
  
@@@ -1712,12 -2240,9 +2240,9 @@@ static enum print_line_t print_hex_fmt(
  
                trace_assign_type(field, entry);
  
-               S = field->prev_state < sizeof(state_to_char) ?
-                       state_to_char[field->prev_state] : 'X';
-               T = field->next_state < sizeof(state_to_char) ?
-                       state_to_char[field->next_state] : 'X';
-               if (entry->type == TRACE_WAKE)
-                       S = '+';
+               T = task_state_char(field->next_state);
+               S = entry->type == TRACE_WAKE ? '+' :
+                       task_state_char(field->prev_state);
                SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
                SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
                SEQ_PUT_HEX_FIELD_RET(s, S);
                break;
        }
        case TRACE_SPECIAL:
+       case TRACE_USER_STACK:
        case TRACE_STACK: {
                struct special_entry *field;
  
        return TRACE_TYPE_HANDLED;
  }
  
+ static enum print_line_t print_printk_msg_only(struct trace_iterator *iter)
+ {
+       struct trace_seq *s = &iter->seq;
+       struct trace_entry *entry = iter->ent;
+       struct print_entry *field;
+       int ret;
+       trace_assign_type(field, entry);
+       ret = trace_seq_printf(s, field->buf);
+       if (!ret)
+               return TRACE_TYPE_PARTIAL_LINE;
+       if (entry->flags & TRACE_FLAG_CONT)
+               trace_seq_print_cont(s, iter);
+       return TRACE_TYPE_HANDLED;
+ }
  static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
  {
        struct trace_seq *s = &iter->seq;
                break;
        }
        case TRACE_SPECIAL:
+       case TRACE_USER_STACK:
        case TRACE_STACK: {
                struct special_entry *field;
  
@@@ -1823,6 -2369,11 +2369,11 @@@ static enum print_line_t print_trace_li
                        return ret;
        }
  
+       if (iter->ent->type == TRACE_PRINT &&
+                       trace_flags & TRACE_ITER_PRINTK &&
+                       trace_flags & TRACE_ITER_PRINTK_MSGONLY)
+               return print_printk_msg_only(iter);
        if (trace_flags & TRACE_ITER_BIN)
                return print_bin_fmt(iter);
  
@@@ -1847,7 -2398,9 +2398,9 @@@ static int s_show(struct seq_file *m, v
                        seq_printf(m, "# tracer: %s\n", iter->trace->name);
                        seq_puts(m, "#\n");
                }
-               if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
+               if (iter->trace && iter->trace->print_header)
+                       iter->trace->print_header(m);
+               else if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
                        /* print nothing if the buffers are empty */
                        if (trace_empty(iter))
                                return 0;
@@@ -1899,6 -2452,15 +2452,15 @@@ __tracing_open(struct inode *inode, str
        iter->trace = current_trace;
        iter->pos = -1;
  
+       /* Notify the tracer early; before we stop tracing. */
+       if (iter->trace && iter->trace->open)
+               iter->trace->open(iter);
+       /* Annotate start of buffers if we had overruns */
+       if (ring_buffer_overruns(iter->tr->buffer))
+               iter->iter_flags |= TRACE_FILE_ANNOTATE;
        for_each_tracing_cpu(cpu) {
  
                iter->buffer_iter[cpu] =
        m->private = iter;
  
        /* stop the trace while dumping */
-       if (iter->tr->ctrl) {
-               tracer_enabled = 0;
-               ftrace_function_enabled = 0;
-       }
-       if (iter->trace && iter->trace->open)
-                       iter->trace->open(iter);
+       tracing_stop();
  
        mutex_unlock(&trace_types_lock);
  
@@@ -1966,14 -2522,7 +2522,7 @@@ int tracing_release(struct inode *inode
                iter->trace->close(iter);
  
        /* reenable tracing if it was previously enabled */
-       if (iter->tr->ctrl) {
-               tracer_enabled = 1;
-               /*
-                * It is safe to enable function tracing even if it
-                * isn't used
-                */
-               ftrace_function_enabled = 1;
-       }
+       tracing_start();
        mutex_unlock(&trace_types_lock);
  
        seq_release(inode, file);
@@@ -2126,7 -2675,7 +2675,7 @@@ tracing_cpumask_read(struct file *filp
  
        mutex_lock(&tracing_cpumask_update_lock);
  
 -      len = cpumask_scnprintf(mask_str, count, tracing_cpumask);
 +      len = cpumask_scnprintf(mask_str, count, &tracing_cpumask);
        if (count - len < 2) {
                count = -EINVAL;
                goto out_err;
@@@ -2147,11 -2696,11 +2696,11 @@@ tracing_cpumask_write(struct file *filp
        int err, cpu;
  
        mutex_lock(&tracing_cpumask_update_lock);
 -      err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
 +      err = cpumask_parse_user(ubuf, count, &tracing_cpumask_new);
        if (err)
                goto err_unlock;
  
-       raw_local_irq_disable();
+       local_irq_disable();
        __raw_spin_lock(&ftrace_max_lock);
        for_each_tracing_cpu(cpu) {
                /*
                }
        }
        __raw_spin_unlock(&ftrace_max_lock);
-       raw_local_irq_enable();
+       local_irq_enable();
  
        tracing_cpumask = tracing_cpumask_new;
  
@@@ -2189,13 -2738,16 +2738,16 @@@ static struct file_operations tracing_c
  };
  
  static ssize_t
- tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
+ tracing_trace_options_read(struct file *filp, char __user *ubuf,
                       size_t cnt, loff_t *ppos)
  {
+       int i;
        char *buf;
        int r = 0;
        int len = 0;
-       int i;
+       u32 tracer_flags = current_trace->flags->val;
+       struct tracer_opt *trace_opts = current_trace->flags->opts;
  
        /* calculate max size */
        for (i = 0; trace_options[i]; i++) {
                len += 3; /* "no" and space */
        }
  
+       /*
+        * Increase the size to account for the names of options
+        * specific to the current tracer.
+        */
+       for (i = 0; trace_opts[i].name; i++) {
+               len += strlen(trace_opts[i].name);
+               len += 3; /* "no" and space */
+       }
        /* +2 for \n and \0 */
        buf = kmalloc(len + 2, GFP_KERNEL);
        if (!buf)
                        r += sprintf(buf + r, "no%s ", trace_options[i]);
        }
  
+       for (i = 0; trace_opts[i].name; i++) {
+               if (tracer_flags & trace_opts[i].bit)
+                       r += sprintf(buf + r, "%s ",
+                               trace_opts[i].name);
+               else
+                       r += sprintf(buf + r, "no%s ",
+                               trace_opts[i].name);
+       }
        r += sprintf(buf + r, "\n");
        WARN_ON(r >= len + 2);
  
        return r;
  }
  
+ /* Try to assign a tracer specific option */
+ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
+ {
+       struct tracer_flags *trace_flags = trace->flags;
+       struct tracer_opt *opts = NULL;
+       int ret = 0, i = 0;
+       int len;
+       for (i = 0; trace_flags->opts[i].name; i++) {
+               opts = &trace_flags->opts[i];
+               len = strlen(opts->name);
+               if (strncmp(cmp, opts->name, len) == 0) {
+                       ret = trace->set_flag(trace_flags->val,
+                               opts->bit, !neg);
+                       break;
+               }
+       }
+       /* Not found */
+       if (!trace_flags->opts[i].name)
+               return -EINVAL;
+       /* Refused to handle */
+       if (ret)
+               return ret;
+       if (neg)
+               trace_flags->val &= ~opts->bit;
+       else
+               trace_flags->val |= opts->bit;
+       return 0;
+ }
  static ssize_t
- tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
+ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
                        size_t cnt, loff_t *ppos)
  {
        char buf[64];
        char *cmp = buf;
        int neg = 0;
+       int ret;
        int i;
  
        if (cnt >= sizeof(buf))
                        break;
                }
        }
-       /*
-        * If no option could be set, return an error:
-        */
-       if (!trace_options[i])
-               return -EINVAL;
+       /* If no option could be set, test the specific tracer options */
+       if (!trace_options[i]) {
+               ret = set_tracer_option(current_trace, cmp, neg);
+               if (ret)
+                       return ret;
+       }
  
        filp->f_pos += cnt;
  
  
  static struct file_operations tracing_iter_fops = {
        .open           = tracing_open_generic,
-       .read           = tracing_iter_ctrl_read,
-       .write          = tracing_iter_ctrl_write,
+       .read           = tracing_trace_options_read,
+       .write          = tracing_trace_options_write,
  };
  
  static const char readme_msg[] =
        "# echo sched_switch > /debug/tracing/current_tracer\n"
        "# cat /debug/tracing/current_tracer\n"
        "sched_switch\n"
-       "# cat /debug/tracing/iter_ctrl\n"
+       "# cat /debug/tracing/trace_options\n"
        "noprint-parent nosym-offset nosym-addr noverbose\n"
-       "# echo print-parent > /debug/tracing/iter_ctrl\n"
+       "# echo print-parent > /debug/tracing/trace_options\n"
        "# echo 1 > /debug/tracing/tracing_enabled\n"
        "# cat /debug/tracing/trace > /tmp/trace.txt\n"
        "echo 0 > /debug/tracing/tracing_enabled\n"
@@@ -2311,11 -2918,10 +2918,10 @@@ static ssize_
  tracing_ctrl_read(struct file *filp, char __user *ubuf,
                  size_t cnt, loff_t *ppos)
  {
        char buf[64];
        int r;
  
-       r = sprintf(buf, "%ld\n", tr->ctrl);
+       r = sprintf(buf, "%u\n", tracer_enabled);
        return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
  }
  
@@@ -2343,16 -2949,18 +2949,18 @@@ tracing_ctrl_write(struct file *filp, c
        val = !!val;
  
        mutex_lock(&trace_types_lock);
-       if (tr->ctrl ^ val) {
-               if (val)
+       if (tracer_enabled ^ val) {
+               if (val) {
                        tracer_enabled = 1;
-               else
+                       if (current_trace->start)
+                               current_trace->start(tr);
+                       tracing_start();
+               } else {
                        tracer_enabled = 0;
-               tr->ctrl = val;
-               if (current_trace && current_trace->ctrl_update)
-                       current_trace->ctrl_update(tr);
+                       tracing_stop();
+                       if (current_trace->stop)
+                               current_trace->stop(tr);
+               }
        }
        mutex_unlock(&trace_types_lock);
  
@@@ -2378,29 -2986,11 +2986,11 @@@ tracing_set_trace_read(struct file *fil
        return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
  }
  
- static ssize_t
- tracing_set_trace_write(struct file *filp, const char __user *ubuf,
-                       size_t cnt, loff_t *ppos)
+ static int tracing_set_tracer(char *buf)
  {
        struct trace_array *tr = &global_trace;
        struct tracer *t;
-       char buf[max_tracer_type_len+1];
-       int i;
-       size_t ret;
-       ret = cnt;
-       if (cnt > max_tracer_type_len)
-               cnt = max_tracer_type_len;
-       if (copy_from_user(&buf, ubuf, cnt))
-               return -EFAULT;
-       buf[cnt] = 0;
-       /* strip ending whitespace. */
-       for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
-               buf[i] = 0;
+       int ret = 0;
  
        mutex_lock(&trace_types_lock);
        for (t = trace_types; t; t = t->next) {
        if (t == current_trace)
                goto out;
  
+       trace_branch_disable();
        if (current_trace && current_trace->reset)
                current_trace->reset(tr);
  
        current_trace = t;
-       if (t->init)
-               t->init(tr);
+       if (t->init) {
+               ret = t->init(tr);
+               if (ret)
+                       goto out;
+       }
  
+       trace_branch_enable(tr);
   out:
        mutex_unlock(&trace_types_lock);
  
-       if (ret > 0)
-               filp->f_pos += ret;
+       return ret;
+ }
+ static ssize_t
+ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
+                       size_t cnt, loff_t *ppos)
+ {
+       char buf[max_tracer_type_len+1];
+       int i;
+       size_t ret;
+       int err;
+       ret = cnt;
+       if (cnt > max_tracer_type_len)
+               cnt = max_tracer_type_len;
+       if (copy_from_user(&buf, ubuf, cnt))
+               return -EFAULT;
+       buf[cnt] = 0;
+       /* strip ending whitespace. */
+       for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
+               buf[i] = 0;
+       err = tracing_set_tracer(buf);
+       if (err)
+               return err;
+       filp->f_pos += ret;
  
        return ret;
  }
@@@ -2492,6 -3116,10 +3116,10 @@@ static int tracing_open_pipe(struct ino
                return -ENOMEM;
  
        mutex_lock(&trace_types_lock);
+       /* trace pipe does not show start of buffer */
+       cpus_setall(iter->started);
        iter->tr = &global_trace;
        iter->trace = current_trace;
        filp->private_data = iter;
@@@ -2667,7 -3295,7 +3295,7 @@@ tracing_entries_read(struct file *filp
        char buf[64];
        int r;
  
-       r = sprintf(buf, "%lu\n", tr->entries);
+       r = sprintf(buf, "%lu\n", tr->entries >> 10);
        return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
  }
  
@@@ -2678,7 -3306,6 +3306,6 @@@ tracing_entries_write(struct file *filp
        unsigned long val;
        char buf[64];
        int ret, cpu;
-       struct trace_array *tr = filp->private_data;
  
        if (cnt >= sizeof(buf))
                return -EINVAL;
  
        mutex_lock(&trace_types_lock);
  
-       if (tr->ctrl) {
-               cnt = -EBUSY;
-               pr_info("ftrace: please disable tracing"
-                       " before modifying buffer size\n");
-               goto out;
-       }
+       tracing_stop();
  
        /* disable all cpu buffers */
        for_each_tracing_cpu(cpu) {
                        atomic_inc(&max_tr.data[cpu]->disabled);
        }
  
+       /* value is in KB */
+       val <<= 10;
        if (val != global_trace.entries) {
                ret = ring_buffer_resize(global_trace.buffer, val);
                if (ret < 0) {
                        atomic_dec(&max_tr.data[cpu]->disabled);
        }
  
+       tracing_start();
        max_tr.entries = global_trace.entries;
        mutex_unlock(&trace_types_lock);
  
@@@ -2762,7 -3388,7 +3388,7 @@@ static int mark_printk(const char *fmt
        int ret;
        va_list args;
        va_start(args, fmt);
-       ret = trace_vprintk(0, fmt, args);
+       ret = trace_vprintk(0, -1, fmt, args);
        va_end(args);
        return ret;
  }
@@@ -2773,9 -3399,8 +3399,8 @@@ tracing_mark_write(struct file *filp, c
  {
        char *buf;
        char *end;
-       struct trace_array *tr = &global_trace;
  
-       if (!tr->ctrl || tracing_disabled)
+       if (tracing_disabled)
                return -EINVAL;
  
        if (cnt > TRACE_BUF_SIZE)
@@@ -2841,22 -3466,38 +3466,38 @@@ static struct file_operations tracing_m
  
  #ifdef CONFIG_DYNAMIC_FTRACE
  
+ int __weak ftrace_arch_read_dyn_info(char *buf, int size)
+ {
+       return 0;
+ }
  static ssize_t
- tracing_read_long(struct file *filp, char __user *ubuf,
+ tracing_read_dyn_info(struct file *filp, char __user *ubuf,
                  size_t cnt, loff_t *ppos)
  {
+       static char ftrace_dyn_info_buffer[1024];
+       static DEFINE_MUTEX(dyn_info_mutex);
        unsigned long *p = filp->private_data;
-       char buf[64];
+       char *buf = ftrace_dyn_info_buffer;
+       int size = ARRAY_SIZE(ftrace_dyn_info_buffer);
        int r;
  
-       r = sprintf(buf, "%ld\n", *p);
+       mutex_lock(&dyn_info_mutex);
+       r = sprintf(buf, "%ld ", *p);
  
-       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+       r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r);
+       buf[r++] = '\n';
+       r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+       mutex_unlock(&dyn_info_mutex);
+       return r;
  }
  
- static struct file_operations tracing_read_long_fops = {
+ static struct file_operations tracing_dyn_info_fops = {
        .open           = tracing_open_generic,
-       .read           = tracing_read_long,
+       .read           = tracing_read_dyn_info,
  };
  #endif
  
@@@ -2897,10 -3538,10 +3538,10 @@@ static __init int tracer_init_debugfs(v
        if (!entry)
                pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
  
-       entry = debugfs_create_file("iter_ctrl", 0644, d_tracer,
+       entry = debugfs_create_file("trace_options", 0644, d_tracer,
                                    NULL, &tracing_iter_fops);
        if (!entry)
-               pr_warning("Could not create debugfs 'iter_ctrl' entry\n");
+               pr_warning("Could not create debugfs 'trace_options' entry\n");
  
        entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
                                    NULL, &tracing_cpumask_fops);
                pr_warning("Could not create debugfs "
                           "'trace_pipe' entry\n");
  
-       entry = debugfs_create_file("trace_entries", 0644, d_tracer,
+       entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer,
                                    &global_trace, &tracing_entries_fops);
        if (!entry)
                pr_warning("Could not create debugfs "
-                          "'trace_entries' entry\n");
+                          "'buffer_size_kb' entry\n");
  
        entry = debugfs_create_file("trace_marker", 0220, d_tracer,
                                    NULL, &tracing_mark_fops);
  #ifdef CONFIG_DYNAMIC_FTRACE
        entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
                                    &ftrace_update_tot_cnt,
-                                   &tracing_read_long_fops);
+                                   &tracing_dyn_info_fops);
        if (!entry)
                pr_warning("Could not create debugfs "
                           "'dyn_ftrace_total_info' entry\n");
        return 0;
  }
  
- int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
+ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
  {
        static DEFINE_SPINLOCK(trace_buf_lock);
        static char trace_buf[TRACE_BUF_SIZE];
        struct ring_buffer_event *event;
        struct trace_array *tr = &global_trace;
        struct trace_array_cpu *data;
-       struct print_entry *entry;
-       unsigned long flags, irq_flags;
        int cpu, len = 0, size, pc;
+       struct print_entry *entry;
+       unsigned long irq_flags;
  
-       if (!tr->ctrl || tracing_disabled)
+       if (tracing_disabled || tracing_selftest_running)
                return 0;
  
        pc = preempt_count();
        if (unlikely(atomic_read(&data->disabled)))
                goto out;
  
-       spin_lock_irqsave(&trace_buf_lock, flags);
+       pause_graph_tracing();
+       spin_lock_irqsave(&trace_buf_lock, irq_flags);
        len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
  
        len = min(len, TRACE_BUF_SIZE-1);
        if (!event)
                goto out_unlock;
        entry = ring_buffer_event_data(event);
-       tracing_generic_entry_update(&entry->ent, flags, pc);
+       tracing_generic_entry_update(&entry->ent, irq_flags, pc);
        entry->ent.type                 = TRACE_PRINT;
        entry->ip                       = ip;
+       entry->depth                    = depth;
  
        memcpy(&entry->buf, trace_buf, len);
        entry->buf[len] = 0;
        ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
  
   out_unlock:
-       spin_unlock_irqrestore(&trace_buf_lock, flags);
+       spin_unlock_irqrestore(&trace_buf_lock, irq_flags);
+       unpause_graph_tracing();
   out:
        preempt_enable_notrace();
  
@@@ -3037,7 -3680,7 +3680,7 @@@ int __ftrace_printk(unsigned long ip, c
                return 0;
  
        va_start(ap, fmt);
-       ret = trace_vprintk(ip, fmt, ap);
+       ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
        va_end(ap);
        return ret;
  }
@@@ -3046,7 -3689,8 +3689,8 @@@ EXPORT_SYMBOL_GPL(__ftrace_printk)
  static int trace_panic_handler(struct notifier_block *this,
                               unsigned long event, void *unused)
  {
-       ftrace_dump();
+       if (ftrace_dump_on_oops)
+               ftrace_dump();
        return NOTIFY_OK;
  }
  
@@@ -3062,7 -3706,8 +3706,8 @@@ static int trace_die_handler(struct not
  {
        switch (val) {
        case DIE_OOPS:
-               ftrace_dump();
+               if (ftrace_dump_on_oops)
+                       ftrace_dump();
                break;
        default:
                break;
@@@ -3103,7 -3748,6 +3748,6 @@@ trace_printk_seq(struct trace_seq *s
        trace_seq_reset(s);
  }
  
  void ftrace_dump(void)
  {
        static DEFINE_SPINLOCK(ftrace_dump_lock);
                atomic_inc(&global_trace.data[cpu]->disabled);
        }
  
+       /* don't look at user memory in panic mode */
+       trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
        printk(KERN_TRACE "Dumping ftrace buffer:\n");
  
        iter.tr = &global_trace;
@@@ -3221,7 -3868,6 +3868,6 @@@ __init static int tracer_alloc_buffers(
  #endif
  
        /* All seems OK, enable tracing */
-       global_trace.ctrl = tracer_enabled;
        tracing_disabled = 0;
  
        atomic_notifier_chain_register(&panic_notifier_list,
diff --combined lib/Kconfig
@@@ -64,6 -64,8 +64,8 @@@ config CRC
  
  config LIBCRC32C
        tristate "CRC32c (Castagnoli, et al) Cyclic Redundancy-Check"
+       select CRYPTO
+       select CRYPTO_CRC32C
        help
          This option is provided for the case where no in-kernel-tree
          modules require CRC32c functions, but a module built outside the
@@@ -157,11 -159,4 +159,11 @@@ config CHECK_SIGNATUR
  config HAVE_LMB
        boolean
  
 +config CPUMASK_OFFSTACK
 +      bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
 +      help
 +        Use dynamic allocation for cpumask_var_t, instead of putting
 +        them on the stack.  This is a bit more expensive, but avoids
 +        stack overflow.
 +
  endmenu
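
For context on the CPUMASK_OFFSTACK entry above: it only changes how cpumask_var_t is backed, so callers always go through the alloc/free helpers. A hedged kernel-style sketch of that calling convention, assuming the 2.6.28-era cpumask API (alloc_cpumask_var and friends); this is not code from this merge, and the function name is example-only:

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/gfp.h>

/* With CPUMASK_OFFSTACK=n the "allocation" is a no-op and the mask
 * lives on the stack; with =y, alloc_cpumask_var() kmallocs it. */
static int count_online_cpus_example(void)
{
        cpumask_var_t mask;
        int n;

        if (!alloc_cpumask_var(&mask, GFP_KERNEL))
                return -ENOMEM;

        cpumask_copy(mask, cpu_online_mask);
        n = cpumask_weight(mask);

        free_cpumask_var(mask);
        return n;
}
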