Merge branch 'linus' into perfcounters/core-v2
author Ingo Molnar <mingo@elte.hu>
Mon, 6 Apr 2009 07:02:57 +0000 (09:02 +0200)
committer Ingo Molnar <mingo@elte.hu>
Mon, 6 Apr 2009 07:02:57 +0000 (09:02 +0200)
Merge reason: we have accumulated quite a few conflicts and need to merge upstream

Conflicts:
arch/powerpc/kernel/Makefile
arch/x86/ia32/ia32entry.S
arch/x86/include/asm/hardirq.h
arch/x86/include/asm/unistd_32.h
arch/x86/include/asm/unistd_64.h
arch/x86/kernel/cpu/common.c
arch/x86/kernel/irq.c
arch/x86/kernel/syscall_table_32.S
arch/x86/mm/iomap_32.c
include/linux/sched.h
kernel/Makefile

Signed-off-by: Ingo Molnar <mingo@elte.hu>
38 files changed:
arch/powerpc/include/asm/hw_irq.h
arch/powerpc/include/asm/systbl.h
arch/powerpc/kernel/Makefile
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kernel/entry_64.S
arch/powerpc/kernel/irq.c
arch/powerpc/platforms/Kconfig.cputype
arch/x86/Kconfig
arch/x86/ia32/ia32entry.S
arch/x86/include/asm/hardirq.h
arch/x86/include/asm/hw_irq.h
arch/x86/include/asm/thread_info.h
arch/x86/include/asm/unistd_32.h
arch/x86/include/asm/unistd_64.h
arch/x86/kernel/apic/apic.c
arch/x86/kernel/cpu/Makefile
arch/x86/kernel/cpu/amd.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/entry_64.S
arch/x86/kernel/irq.c
arch/x86/kernel/irqinit_32.c
arch/x86/kernel/irqinit_64.c
arch/x86/kernel/signal.c
arch/x86/kernel/syscall_table_32.S
arch/x86/kernel/traps.c
drivers/acpi/processor_idle.c
drivers/char/sysrq.c
fs/exec.c
include/linux/init_task.h
include/linux/kernel_stat.h
include/linux/sched.h
include/linux/syscalls.h
init/Kconfig
kernel/Makefile
kernel/exit.c
kernel/fork.c
kernel/sched.c
kernel/sys.c

@@@ -129,38 -129,7 +129,38 @@@ static inline int irqs_disabled_flags(u
   * interrupt-retrigger: should we handle this via lost interrupts and IPIs
   * or should we not care like we do now ? --BenH.
   */
- struct hw_interrupt_type;
+ struct irq_chip;
  
 +#ifdef CONFIG_PERF_COUNTERS
 +static inline unsigned long get_perf_counter_pending(void)
 +{
 +      unsigned long x;
 +
 +      asm volatile("lbz %0,%1(13)"
 +              : "=r" (x)
 +              : "i" (offsetof(struct paca_struct, perf_counter_pending)));
 +      return x;
 +}
 +
 +static inline void set_perf_counter_pending(int x)
 +{
 +      asm volatile("stb %0,%1(13)" : :
 +              "r" (x),
 +              "i" (offsetof(struct paca_struct, perf_counter_pending)));
 +}
 +
 +extern void perf_counter_do_pending(void);
 +
 +#else
 +
 +static inline unsigned long get_perf_counter_pending(void)
 +{
 +      return 0;
 +}
 +
 +static inline void set_perf_counter_pending(int x) {}
 +static inline void perf_counter_do_pending(void) {}
 +#endif /* CONFIG_PERF_COUNTERS */
 +
  #endif        /* __KERNEL__ */
  #endif        /* _ASM_POWERPC_HW_IRQ_H */
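
The two PACA accessors added above are one half of a deferred-work pattern: code running in interrupt context (e.g. on a PMU overflow) sets a per-CPU "perf counter pending" byte with a single store, and the flag is replayed the next time powerpc's soft-disabled interrupts are logically re-enabled (see the get_perf_counter_pending() check added to raw_local_irq_restore() later in this merge). Below is a minimal, generic C sketch of that pattern, assuming nothing beyond C11 atomics; the names (perf_work_pending, local_irq_restore_sketch, and so on) are illustrative only, not kernel APIs.

/*
 * Sketch only -- not kernel code: a per-CPU/per-thread pending flag,
 * set from a context that cannot take locks and drained when the
 * consumer re-enables interrupts.
 */
#include <stdatomic.h>
#include <stdio.h>

static _Atomic int perf_work_pending;   /* stands in for paca->perf_counter_pending */

static void set_perf_work_pending(void)
{
        /* e.g. called from a PMU overflow interrupt */
        atomic_store_explicit(&perf_work_pending, 1, memory_order_release);
}

static void do_pending_perf_work(void)
{
        printf("draining deferred perf-counter work\n");
}

static void local_irq_restore_sketch(int enable)
{
        if (enable &&
            atomic_exchange_explicit(&perf_work_pending, 0, memory_order_acq_rel))
                do_pending_perf_work();
        /* ... then actually re-enable interrupts ... */
}

int main(void)
{
        set_perf_work_pending();        /* pretend a counter overflowed */
        local_irq_restore_sketch(1);    /* the flag is replayed on re-enable */
        return 0;
}

The real powerpc code keeps the flag in the PACA so it can be read with a single lbz off r13 even from assembly, which is why asm-offsets.c gains a PACAPERFPEND define further down in this merge.
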
@@@ -65,7 -65,7 +65,7 @@@ SYSCALL(ni_syscall
  SYSX(sys_ni_syscall,sys_olduname, sys_olduname)
  COMPAT_SYS_SPU(umask)
  SYSCALL_SPU(chroot)
SYSCALL(ustat)
COMPAT_SYS(ustat)
  SYSCALL_SPU(dup2)
  SYSCALL_SPU(getppid)
  SYSCALL_SPU(getpgrp)
@@@ -322,4 -322,3 +322,4 @@@ SYSCALL_SPU(epoll_create1
  SYSCALL_SPU(dup3)
  SYSCALL_SPU(pipe2)
  SYSCALL(inotify_init1)
 +SYSCALL_SPU(perf_counter_open)
@@@ -18,12 -18,10 +18,10 @@@ CFLAGS_REMOVE_cputable.o = -pg -mno-sch
  CFLAGS_REMOVE_prom_init.o = -pg -mno-sched-epilog
  CFLAGS_REMOVE_btext.o = -pg -mno-sched-epilog
  CFLAGS_REMOVE_prom.o = -pg -mno-sched-epilog
- ifdef CONFIG_DYNAMIC_FTRACE
- # dynamic ftrace setup.
+ # do not trace tracer code
  CFLAGS_REMOVE_ftrace.o = -pg -mno-sched-epilog
- endif
+ # timers used by tracing
+ CFLAGS_REMOVE_time.o = -pg -mno-sched-epilog
  endif
  
  obj-y                         := cputable.o ptrace.o syscalls.o \
@@@ -61,6 -59,7 +59,7 @@@ obj-$(CONFIG_HIBERNATION)     += swsusp.o s
  obj64-$(CONFIG_HIBERNATION)   += swsusp_asm64.o
  obj-$(CONFIG_MODULES)         += module.o module_$(CONFIG_WORD_SIZE).o
  obj-$(CONFIG_44x)             += cpu_setup_44x.o
+ obj-$(CONFIG_FSL_BOOKE)               += cpu_setup_fsl_booke.o dbell.o
  
  extra-$(CONFIG_PPC_STD_MMU)   := head_32.o
  extra-$(CONFIG_PPC64)         := head_64.o
@@@ -76,7 -75,7 +75,7 @@@ obj-y                         += time.o prom.o traps.o setup
  obj-$(CONFIG_PPC32)           += entry_32.o setup_32.o
  obj-$(CONFIG_PPC64)           += dma-iommu.o iommu.o
  obj-$(CONFIG_KGDB)            += kgdb.o
- obj-$(CONFIG_PPC_MULTIPLATFORM)       += prom_init.o
+ obj-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE)  += prom_init.o
  obj-$(CONFIG_MODULES)         += ppc_ksyms.o
  obj-$(CONFIG_BOOTX_TEXT)      += btext.o
  obj-$(CONFIG_SMP)             += smp.o
@@@ -94,8 -93,7 +93,9 @@@ obj-$(CONFIG_AUDIT)           += audit.
  obj64-$(CONFIG_AUDIT)         += compat_audit.o
  
  obj-$(CONFIG_DYNAMIC_FTRACE)  += ftrace.o
+ obj-$(CONFIG_FUNCTION_GRAPH_TRACER)   += ftrace.o
 +obj-$(CONFIG_PERF_COUNTERS)   += perf_counter.o power4-pmu.o ppc970-pmu.o \
 +                                 power5-pmu.o power5+-pmu.o power6-pmu.o
  
  obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
  
@@@ -49,7 -49,7 +49,7 @@@
  #include <asm/iseries/alpaca.h>
  #endif
  #ifdef CONFIG_KVM
- #include <asm/kvm_44x.h>
+ #include <linux/kvm_host.h>
  #endif
  
  #if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
@@@ -131,7 -131,6 +131,7 @@@ int main(void
        DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr));
        DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled));
        DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled));
 +      DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_counter_pending));
        DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache));
        DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr));
        DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
  #endif /* ! CONFIG_PPC64 */
  
        /* About the CPU features table */
-       DEFINE(CPU_SPEC_ENTRY_SIZE, sizeof(struct cpu_spec));
-       DEFINE(CPU_SPEC_PVR_MASK, offsetof(struct cpu_spec, pvr_mask));
-       DEFINE(CPU_SPEC_PVR_VALUE, offsetof(struct cpu_spec, pvr_value));
        DEFINE(CPU_SPEC_FEATURES, offsetof(struct cpu_spec, cpu_features));
        DEFINE(CPU_SPEC_SETUP, offsetof(struct cpu_spec, cpu_setup));
        DEFINE(CPU_SPEC_RESTORE, offsetof(struct cpu_spec, cpu_restore));
        DEFINE(PTE_SIZE, sizeof(pte_t));
  
  #ifdef CONFIG_KVM
-       DEFINE(TLBE_BYTES, sizeof(struct kvmppc_44x_tlbe));
        DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack));
        DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
        DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
@@@ -526,15 -526,6 +526,15 @@@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE
  2:
        TRACE_AND_RESTORE_IRQ(r5);
  
 +#ifdef CONFIG_PERF_COUNTERS
 +      /* check paca->perf_counter_pending if we're enabling ints */
 +      lbz     r3,PACAPERFPEND(r13)
 +      and.    r3,r3,r5
 +      beq     27f
 +      bl      .perf_counter_do_pending
 +27:
 +#endif /* CONFIG_PERF_COUNTERS */
 +
        /* extract EE bit and use it to restore paca->hard_enabled */
        ld      r3,_MSR(r1)
        rldicl  r4,r3,49,63             /* r0 = (r3 >> 15) & 1 */
@@@ -917,6 -908,12 +917,12 @@@ _GLOBAL(ftrace_caller
  ftrace_call:
        bl      ftrace_stub
        nop
+ #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ .globl ftrace_graph_call
+ ftrace_graph_call:
+       b       ftrace_graph_stub
+ _GLOBAL(ftrace_graph_stub)
+ #endif
        ld      r0, 128(r1)
        mtlr    r0
        addi    r1, r1, 112
@@@ -940,13 -937,90 +946,90 @@@ _GLOBAL(_mcount
        ld      r5,0(r5)
        mtctr   r5
        bctrl
        nop
+ #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       b       ftrace_graph_caller
+ #endif
        ld      r0, 128(r1)
        mtlr    r0
        addi    r1, r1, 112
  _GLOBAL(ftrace_stub)
        blr
  
- #endif
- #endif
+ #endif /* CONFIG_DYNAMIC_FTRACE */
+ #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ _GLOBAL(ftrace_graph_caller)
+       /* load r4 with local address */
+       ld      r4, 128(r1)
+       subi    r4, r4, MCOUNT_INSN_SIZE
+       /* get the parent address */
+       ld      r11, 112(r1)
+       addi    r3, r11, 16
+       bl      .prepare_ftrace_return
+       nop
+       ld      r0, 128(r1)
+       mtlr    r0
+       addi    r1, r1, 112
+       blr
+ _GLOBAL(return_to_handler)
+       /* need to save return values */
+       std     r4,  -24(r1)
+       std     r3,  -16(r1)
+       std     r31, -8(r1)
+       mr      r31, r1
+       stdu    r1, -112(r1)
+       bl      .ftrace_return_to_handler
+       nop
+       /* return value has real return address */
+       mtlr    r3
+       ld      r1, 0(r1)
+       ld      r4,  -24(r1)
+       ld      r3,  -16(r1)
+       ld      r31, -8(r1)
+       /* Jump back to real return address */
+       blr
+ _GLOBAL(mod_return_to_handler)
+       /* need to save return values */
+       std     r4,  -32(r1)
+       std     r3,  -24(r1)
+       /* save TOC */
+       std     r2,  -16(r1)
+       std     r31, -8(r1)
+       mr      r31, r1
+       stdu    r1, -112(r1)
+       /*
+        * We are in a module using the module's TOC.
+        * Switch to our TOC to run inside the core kernel.
+        */
+       LOAD_REG_IMMEDIATE(r4,ftrace_return_to_handler)
+       ld      r2, 8(r4)
+       bl      .ftrace_return_to_handler
+       nop
+       /* return value has real return address */
+       mtlr    r3
+       ld      r1, 0(r1)
+       ld      r4,  -32(r1)
+       ld      r3,  -24(r1)
+       ld      r2,  -16(r1)
+       ld      r31, -8(r1)
+       /* Jump back to real return address */
+       blr
+ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+ #endif /* CONFIG_FUNCTION_TRACER */
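
The ftrace_graph_caller / return_to_handler assembly above is the powerpc wiring for the function-graph tracer's return hook: prepare_ftrace_return() saves the real parent return address and redirects the traced function's return into return_to_handler, which calls ftrace_return_to_handler() to log the exit and recover the saved address (mod_return_to_handler additionally switches to the core kernel's TOC first, since the traced code was using a module's TOC). The user-space C sketch below mimics that control flow with an explicit shadow stack and indirect calls instead of patched return addresses; every name in it is illustrative, not a kernel symbol.

#include <stdio.h>

typedef void (*retaddr_t)(void);

static retaddr_t shadow_stack[16];      /* saved real return targets */
static int shadow_top;

static void caller_resume(void)
{
        printf("resumed in the original caller\n");
}

/* Plays the role of return_to_handler + ftrace_return_to_handler(). */
static void return_trampoline(void)
{
        retaddr_t real = shadow_stack[--shadow_top];

        printf("graph trace: exit hook fired\n");
        real();                         /* hand control back to the real target */
}

/* Plays the role of prepare_ftrace_return(): divert the return path. */
static retaddr_t hook_return(retaddr_t real)
{
        printf("graph trace: entry hook, diverting return\n");
        shadow_stack[shadow_top++] = real;
        return return_trampoline;
}

static void traced_function(retaddr_t ret)
{
        ret = hook_return(ret);         /* what the entry-side hook arranges */
        printf("traced function body runs\n");
        ret();                          /* the "return" goes through the trampoline */
}

int main(void)
{
        traced_function(caller_resume);
        return 0;
}
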
@@@ -104,13 -104,6 +104,13 @@@ static inline notrace void set_soft_ena
        : : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled)));
  }
  
 +#ifdef CONFIG_PERF_COUNTERS
 +notrace void __weak perf_counter_do_pending(void)
 +{
 +      set_perf_counter_pending(0);
 +}
 +#endif
 +
  notrace void raw_local_irq_restore(unsigned long en)
  {
        /*
                        iseries_handle_interrupts();
        }
  
 +      if (get_perf_counter_pending())
 +              perf_counter_do_pending();
 +
        /*
         * if (get_paca()->hard_enabled) return;
         * But again we need to take care that gcc gets hard_enabled directly
@@@ -181,7 -171,7 +181,7 @@@ int show_interrupts(struct seq_file *p
  {
        int i = *(loff_t *)v, j;
        struct irqaction *action;
-       irq_desc_t *desc;
+       struct irq_desc *desc;
        unsigned long flags;
  
        if (i == 0) {
                seq_printf(p, "%3d: ", i);
  #ifdef CONFIG_SMP
                for_each_online_cpu(j)
-                       seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+                       seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
  #else
                seq_printf(p, "%10u ", kstat_irqs(i));
  #endif /* CONFIG_SMP */
@@@ -1048,7 -1038,7 +1048,7 @@@ arch_initcall(irq_late_init)
  static int virq_debug_show(struct seq_file *m, void *private)
  {
        unsigned long flags;
-       irq_desc_t *desc;
+       struct irq_desc *desc;
        const char *p;
        char none[] = "none";
        int i;
@@@ -1,7 -1,6 +1,7 @@@
  config PPC64
        bool "64-bit kernel"
        default n
 +      select HAVE_PERF_COUNTERS
        help
          This option selects whether a 32-bit or a 64-bit kernel
          will be built.
@@@ -58,9 -57,17 +58,17 @@@ config E20
  
  endchoice
  
+ # Until we have a choice of exclusive CPU types on 64-bit, we always
+ # use PPC_BOOK3S. On 32-bit, this is equivalent to 6xx which is
+ # "classic" MMU
+ config PPC_BOOK3S
+        def_bool y
+        depends on PPC64 || 6xx
  config POWER4_ONLY
        bool "Optimize for POWER4"
-       depends on PPC64
+       depends on PPC64 && PPC_BOOK3S
        default n
        ---help---
          Cause the compiler to optimize for POWER4/POWER5/PPC970 processors.
  
  config POWER3
        bool
-       depends on PPC64
+       depends on PPC64 && PPC_BOOK3S
        default y if !POWER4_ONLY
  
  config POWER4
-       depends on PPC64
+       depends on PPC64 && PPC_BOOK3S
        def_bool y
  
  config TUNE_CELL
        bool "Optimize for Cell Broadband Engine"
-       depends on PPC64
+       depends on PPC64 && PPC_BOOK3S
        help
          Cause the compiler to optimize for the PPE of the Cell Broadband
          Engine. This will make the code run considerably faster on Cell
@@@ -148,7 -155,7 +156,7 @@@ config PHYS_64BI
  
  config ALTIVEC
        bool "AltiVec Support"
-       depends on CLASSIC32 || POWER4
+       depends on 6xx || POWER4
        ---help---
          This option enables kernel support for the Altivec extensions to the
          PowerPC processor. The kernel currently supports saving and restoring
@@@ -211,6 -218,10 +219,10 @@@ config PPC_MMU_NOHAS
        def_bool y
        depends on !PPC_STD_MMU
  
+ config PPC_BOOK3E_MMU
+       def_bool y
+       depends on FSL_BOOKE
  config PPC_MM_SLICES
        bool
        default y if HUGETLB_PAGE || (PPC_STD_MMU_64 && PPC_64K_PAGES)
diff --combined arch/x86/Kconfig
@@@ -34,12 -34,18 +34,18 @@@ config X8
        select HAVE_FUNCTION_TRACER
        select HAVE_FUNCTION_GRAPH_TRACER
        select HAVE_FUNCTION_TRACE_MCOUNT_TEST
+       select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE
+       select HAVE_FTRACE_SYSCALLS
        select HAVE_KVM
        select HAVE_ARCH_KGDB
        select HAVE_ARCH_TRACEHOOK
        select HAVE_GENERIC_DMA_COHERENT if X86_32
        select HAVE_EFFICIENT_UNALIGNED_ACCESS
        select USER_STACKTRACE_SUPPORT
+       select HAVE_DMA_API_DEBUG
+       select HAVE_KERNEL_GZIP
+       select HAVE_KERNEL_BZIP2
+       select HAVE_KERNEL_LZMA
  
  config ARCH_DEFCONFIG
        string
@@@ -135,6 -141,9 +141,9 @@@ config ARCH_HAS_CACHE_LINE_SIZ
  config HAVE_SETUP_PER_CPU_AREA
        def_bool y
  
+ config HAVE_DYNAMIC_PER_CPU_AREA
+       def_bool y
  config HAVE_CPUMASK_OF_CPU_MAP
        def_bool X86_64_SMP
  
@@@ -158,11 -167,17 +167,17 @@@ config AUDIT_ARC
  config ARCH_SUPPORTS_OPTIMIZED_INLINING
        def_bool y
  
+ config ARCH_SUPPORTS_DEBUG_PAGEALLOC
+       def_bool y
  # Use the generic interrupt handling code in kernel/irq/:
  config GENERIC_HARDIRQS
        bool
        default y
  
+ config GENERIC_HARDIRQS_NO__DO_IRQ
+        def_bool y
  config GENERIC_IRQ_PROBE
        bool
        default y
@@@ -712,7 -727,6 +727,7 @@@ config X86_UP_IOAPI
  config X86_LOCAL_APIC
        def_bool y
        depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
 +      select HAVE_PERF_COUNTERS if (!M386 && !M486)
  
  config X86_IO_APIC
        def_bool y
@@@ -778,6 -792,11 +793,11 @@@ config X86_MCE_AM
           Additional support for AMD specific MCE features such as
           the DRAM Error Threshold.
  
+ config X86_MCE_THRESHOLD
+       depends on X86_MCE_AMD || X86_MCE_INTEL
+       bool
+       default y
  config X86_MCE_NONFATAL
        tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
        depends on X86_32 && X86_MCE
@@@ -921,6 -940,12 +941,12 @@@ config X86_CPUI
          with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
          /dev/cpu/31/cpuid.
  
+ config X86_CPU_DEBUG
+       tristate "/sys/kernel/debug/x86/cpu/* - CPU Debug support"
+       ---help---
+         If you select this option, this will provide various x86 CPUs
+         information through debugfs.
  choice
        prompt "High Memory Support"
        default HIGHMEM4G if !X86_NUMAQ
@@@ -1113,7 -1138,7 +1139,7 @@@ config NUMA_EM
  
  config NODES_SHIFT
        int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP
-       range 1 9   if X86_64
+       range 1 9
        default "9" if MAXSMP
        default "6" if X86_64
        default "4" if X86_NUMAQ
        depends on NEED_MULTIPLE_NODES
        ---help---
          Specify the maximum number of NUMA Nodes available on the target
-         system.  Increases memory reserved to accomodate various tables.
+         system.  Increases memory reserved to accommodate various tables.
  
- config HAVE_ARCH_BOOTMEM_NODE
+ config HAVE_ARCH_BOOTMEM
        def_bool y
        depends on X86_32 && NUMA
  
@@@ -1299,7 -1324,7 +1325,7 @@@ config MTRR_SANITIZE
          add writeback entries.
  
          Can be disabled with disable_mtrr_cleanup on the kernel command line.
-         The largest mtrr entry size for a continous block can be set with
+         The largest mtrr entry size for a continuous block can be set with
          mtrr_chunk_size.
  
          If unsure, say Y.
@@@ -1421,7 -1446,7 +1447,7 @@@ config CRASH_DUM
  config KEXEC_JUMP
        bool "kexec jump (EXPERIMENTAL)"
        depends on EXPERIMENTAL
-       depends on KEXEC && HIBERNATION && X86_32
+       depends on KEXEC && HIBERNATION
        ---help---
          Jump between original kernel and kexeced kernel and invoke
          code in physical address mode via KEXEC
@@@ -1814,8 -1839,8 +1840,8 @@@ config PCI_MMCONFI
  
  config DMAR
        bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
-       depends on X86_64 && PCI_MSI && ACPI && EXPERIMENTAL
-       ---help---
+       depends on PCI_MSI && ACPI && EXPERIMENTAL
+       help
          DMA remapping (DMAR) devices support enables independent address
          translations for Direct Memory Access (DMA) from devices.
          These DMA remapping devices are reported via ACPI tables
@@@ -557,7 -557,7 +557,7 @@@ ia32_sys_call_table
        .quad sys32_olduname
        .quad sys_umask         /* 60 */
        .quad sys_chroot
-       .quad sys32_ustat
+       .quad compat_sys_ustat
        .quad sys_dup2
        .quad sys_getppid
        .quad sys_getpgrp               /* 65 */
        .quad compat_sys_signalfd4
        .quad sys_eventfd2
        .quad sys_epoll_create1
 -      .quad sys_dup3                  /* 330 */
 +      .quad sys_dup3                          /* 330 */
        .quad sys_pipe2
        .quad sys_inotify_init1
+       .quad compat_sys_preadv
+       .quad compat_sys_pwritev
 +      .quad sys_perf_counter_open
  ia32_syscall_end:
@@@ -12,7 -12,7 +12,8 @@@ typedef struct 
        unsigned int apic_timer_irqs;   /* arch dependent */
        unsigned int irq_spurious_count;
  #endif
+       unsigned int generic_irqs;      /* arch dependent */
 +      unsigned int apic_perf_irqs;
  #ifdef CONFIG_SMP
        unsigned int irq_resched_count;
        unsigned int irq_call_count;
  
  /* Interrupt handlers registered during init_IRQ */
  extern void apic_timer_interrupt(void);
+ extern void generic_interrupt(void);
  extern void error_interrupt(void);
 +extern void perf_counter_interrupt(void);
 +
  extern void spurious_interrupt(void);
  extern void thermal_interrupt(void);
  extern void reschedule_interrupt(void);
@@@ -83,7 -83,6 +83,7 @@@ struct thread_info 
  #define TIF_SYSCALL_AUDIT     7       /* syscall auditing active */
  #define TIF_SECCOMP           8       /* secure computing */
  #define TIF_MCE_NOTIFY                10      /* notify userspace of an MCE */
 +#define TIF_PERF_COUNTERS     11      /* notify perf counter work */
  #define TIF_NOTSC             16      /* TSC is not accessible in userland */
  #define TIF_IA32              17      /* 32bit process */
  #define TIF_FORK              18      /* ret_from_fork */
@@@ -95,6 -94,7 +95,7 @@@
  #define TIF_FORCED_TF         24      /* true if TF in eflags artificially */
  #define TIF_DEBUGCTLMSR               25      /* uses thread_struct.debugctlmsr */
  #define TIF_DS_AREA_MSR               26      /* uses thread_struct.ds_area_msr */
+ #define TIF_SYSCALL_FTRACE    27      /* for ftrace syscall instrumentation */
  
  #define _TIF_SYSCALL_TRACE    (1 << TIF_SYSCALL_TRACE)
  #define _TIF_NOTIFY_RESUME    (1 << TIF_NOTIFY_RESUME)
  #define _TIF_SYSCALL_AUDIT    (1 << TIF_SYSCALL_AUDIT)
  #define _TIF_SECCOMP          (1 << TIF_SECCOMP)
  #define _TIF_MCE_NOTIFY               (1 << TIF_MCE_NOTIFY)
 +#define _TIF_PERF_COUNTERS    (1 << TIF_PERF_COUNTERS)
  #define _TIF_NOTSC            (1 << TIF_NOTSC)
  #define _TIF_IA32             (1 << TIF_IA32)
  #define _TIF_FORK             (1 << TIF_FORK)
  #define _TIF_FORCED_TF                (1 << TIF_FORCED_TF)
  #define _TIF_DEBUGCTLMSR      (1 << TIF_DEBUGCTLMSR)
  #define _TIF_DS_AREA_MSR      (1 << TIF_DS_AREA_MSR)
+ #define _TIF_SYSCALL_FTRACE   (1 << TIF_SYSCALL_FTRACE)
  
  /* work to do in syscall_trace_enter() */
  #define _TIF_WORK_SYSCALL_ENTRY       \
-       (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | \
+       (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_FTRACE |  \
         _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP)
  
  /* work to do in syscall_trace_leave() */
  #define _TIF_WORK_SYSCALL_EXIT        \
-       (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP)
+       (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP |    \
+        _TIF_SYSCALL_FTRACE)
  
  /* work to do on interrupt/exception return */
  #define _TIF_WORK_MASK                                                        \
           _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU))
  
  /* work to do on any return to user space */
- #define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP)
+ #define _TIF_ALLWORK_MASK ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_FTRACE)
  
  /* Only used for 64 bit */
  #define _TIF_DO_NOTIFY_MASK                                           \
 -      (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
 +      (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERF_COUNTERS|_TIF_NOTIFY_RESUME)
  
  /* flags to check in __switch_to() */
  #define _TIF_WORK_CTXSW                                                       \
  #define __NR_dup3             330
  #define __NR_pipe2            331
  #define __NR_inotify_init1    332
+ #define __NR_preadv           333
+ #define __NR_pwritev          334
 +#define __NR_perf_counter_open        333
  
  #ifdef __KERNEL__
  
@@@ -653,8 -653,11 +653,12 @@@ __SYSCALL(__NR_dup3, sys_dup3
  __SYSCALL(__NR_pipe2, sys_pipe2)
  #define __NR_inotify_init1                    294
  __SYSCALL(__NR_inotify_init1, sys_inotify_init1)
 -
+ #define __NR_preadv                           295
+ __SYSCALL(__NR_preadv, sys_preadv)
+ #define __NR_pwritev                          296
+ __SYSCALL(__NR_pwritev, sys_pwritev)
 +#define __NR_perf_counter_open                295
 +__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
  
  #ifndef __NO_STUBS
  #define __ARCH_WANT_OLD_READDIR
@@@ -34,7 -34,6 +34,7 @@@
  #include <linux/smp.h>
  #include <linux/mm.h>
  
 +#include <asm/perf_counter.h>
  #include <asm/pgalloc.h>
  #include <asm/atomic.h>
  #include <asm/mpspec.h>
@@@ -47,6 -46,7 +47,7 @@@
  #include <asm/idle.h>
  #include <asm/mtrr.h>
  #include <asm/smp.h>
+ #include <asm/mce.h>
  
  unsigned int num_processors;
  
@@@ -755,8 -755,6 +756,8 @@@ static void local_apic_timer_interrupt(
        inc_irq_stat(apic_timer_irqs);
  
        evt->event_handler(evt);
 +
 +      perf_counter_unthrottle();
  }
  
  /*
@@@ -811,7 -809,7 +812,7 @@@ void clear_local_APIC(void
        u32 v;
  
        /* APIC hasn't been mapped yet */
-       if (!apic_phys)
+       if (!x2apic && !apic_phys)
                return;
  
        maxlvt = lapic_get_maxlvt();
                apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
        }
  #endif
+ #ifdef CONFIG_X86_MCE_INTEL
+       if (maxlvt >= 6) {
+               v = apic_read(APIC_LVTCMCI);
+               if (!(v & APIC_LVT_MASKED))
+                       apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED);
+       }
+ #endif
        /*
         * Clean APIC state for other OSs:
         */
@@@ -1121,7 -1127,6 +1130,7 @@@ void __cpuinit setup_local_APIC(void
                apic_write(APIC_ESR, 0);
        }
  #endif
 +      perf_counters_lapic_init(0);
  
        preempt_disable();
  
        apic_write(APIC_LVT1, value);
  
        preempt_enable();
+ #ifdef CONFIG_X86_MCE_INTEL
+       /* Recheck CMCI information after local APIC is up on CPU #0 */
+       if (smp_processor_id() == 0)
+               cmci_recheck();
+ #endif
  }
  
  void __cpuinit end_local_APIC_setup(void)
@@@ -1323,15 -1334,16 +1338,16 @@@ void __init enable_IR_x2apic(void
                return;
        }
  
-       local_irq_save(flags);
-       mask_8259A();
-       ret = save_mask_IO_APIC_setup();
+       ret = save_IO_APIC_setup();
        if (ret) {
                pr_info("Saving IO-APIC state failed: %d\n", ret);
                goto end;
        }
  
+       local_irq_save(flags);
+       mask_IO_APIC_setup();
+       mask_8259A();
        ret = enable_intr_remapping(1);
  
        if (ret && x2apic_preenabled) {
@@@ -1356,10 -1368,10 +1372,10 @@@ end_restore
        else
                reinit_intr_remapped_IO_APIC(x2apic_preenabled);
  
        unmask_8259A();
        local_irq_restore(flags);
  
+ end:
        if (!ret) {
                if (!x2apic_preenabled)
                        pr_info("Enabled x2apic and interrupt-remapping\n");
@@@ -1512,12 -1524,10 +1528,10 @@@ void __init early_init_lapic_mapping(vo
   */
  void __init init_apic_mappings(void)
  {
- #ifdef CONFIG_X86_X2APIC
        if (x2apic) {
                boot_cpu_physical_apicid = read_apic_id();
                return;
        }
- #endif
  
        /*
         * If no local APIC can be found then set up a fake all
@@@ -1961,12 -1971,9 +1975,9 @@@ static int lapic_resume(struct sys_devi
  
        local_irq_save(flags);
  
- #ifdef CONFIG_X86_X2APIC
        if (x2apic)
                enable_x2apic();
-       else
- #endif
-       {
+       else {
                /*
                 * Make sure the APICBASE points to the right address
                 *
@@@ -1,5 -1,5 +1,5 @@@
  #
 -# Makefile for x86-compatible CPU details and quirks
 +# Makefile for x86-compatible CPU details, features and quirks
  #
  
  # Don't trace early stages of a secondary CPU boot
@@@ -14,21 -14,20 +14,22 @@@ obj-y                      += vmware.o hypervisor.
  obj-$(CONFIG_X86_32)  += bugs.o cmpxchg.o
  obj-$(CONFIG_X86_64)  += bugs_64.o
  
+ obj-$(CONFIG_X86_CPU_DEBUG)           += cpu_debug.o
  obj-$(CONFIG_CPU_SUP_INTEL)           += intel.o
  obj-$(CONFIG_CPU_SUP_AMD)             += amd.o
  obj-$(CONFIG_CPU_SUP_CYRIX_32)                += cyrix.o
- obj-$(CONFIG_CPU_SUP_CENTAUR_32)      += centaur.o
- obj-$(CONFIG_CPU_SUP_CENTAUR_64)      += centaur_64.o
+ obj-$(CONFIG_CPU_SUP_CENTAUR)         += centaur.o
  obj-$(CONFIG_CPU_SUP_TRANSMETA_32)    += transmeta.o
  obj-$(CONFIG_CPU_SUP_UMC_32)          += umc.o
  
 -obj-$(CONFIG_X86_MCE) += mcheck/
 -obj-$(CONFIG_MTRR)    += mtrr/
 -obj-$(CONFIG_CPU_FREQ)        += cpufreq/
 +obj-$(CONFIG_PERF_COUNTERS)           += perf_counter.o
  
 -obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
 +obj-$(CONFIG_X86_MCE)                 += mcheck/
 +obj-$(CONFIG_MTRR)                    += mtrr/
 +obj-$(CONFIG_CPU_FREQ)                        += cpufreq/
 +
 +obj-$(CONFIG_X86_LOCAL_APIC)          += perfctr-watchdog.o
  
  quiet_cmd_mkcapflags = MKCAP   $@
        cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
@@@ -5,6 -5,7 +5,7 @@@
  #include <asm/io.h>
  #include <asm/processor.h>
  #include <asm/apic.h>
+ #include <asm/cpu.h>
  
  #ifdef CONFIG_X86_64
  # include <asm/numa_64.h>
@@@ -141,6 -142,55 +142,55 @@@ static void __cpuinit init_amd_k6(struc
        }
  }
  
+ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
+ {
+ #ifdef CONFIG_SMP
+       /* calling is from identify_secondary_cpu() ? */
+       if (c->cpu_index == boot_cpu_id)
+               return;
+       /*
+        * Certain Athlons might work (for various values of 'work') in SMP
+        * but they are not certified as MP capable.
+        */
+       /* Athlon 660/661 is valid. */
+       if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
+           (c->x86_mask == 1)))
+               goto valid_k7;
+       /* Duron 670 is valid */
+       if ((c->x86_model == 7) && (c->x86_mask == 0))
+               goto valid_k7;
+       /*
+        * Athlon 662, Duron 671, and Athlon >model 7 have capability
+        * bit. It's worth noting that the A5 stepping (662) of some
+        * Athlon XP's have the MP bit set.
+        * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
+        * more.
+        */
+       if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
+           ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
+            (c->x86_model > 7))
+               if (cpu_has_mp)
+                       goto valid_k7;
+       /* If we get here, not a certified SMP capable AMD system. */
+       /*
+        * Don't taint if we are running SMP kernel on a single non-MP
+        * approved Athlon
+        */
+       WARN_ONCE(1, "WARNING: This combination of AMD"
+               " processors is not suitable for SMP.\n");
+       if (!test_taint(TAINT_UNSAFE_SMP))
+               add_taint(TAINT_UNSAFE_SMP);
+ valid_k7:
+       ;
+ #endif
+ }
  static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
  {
        u32 l, h;
        }
  
        set_cpu_cap(c, X86_FEATURE_K7);
+       amd_k7_smp_check(c);
  }
  #endif
  
@@@ -368,10 -420,6 +420,10 @@@ static void __cpuinit init_amd(struct c
        if (c->x86 >= 6)
                set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
  
 +      /* Enable Performance counter for K7 and later */
 +      if (c->x86 > 6 && c->x86 <= 0x11)
 +              set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
 +
        if (!c->x86_model_id[0]) {
                switch (c->x86) {
                case 0xf:
@@@ -454,7 -502,7 +506,7 @@@ static unsigned int __cpuinit amd_size_
  }
  #endif
  
- static struct cpu_dev amd_cpu_dev __cpuinitdata = {
+ static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
        .c_vendor       = "AMD",
        .c_ident        = { "AuthenticAMD" },
  #ifdef CONFIG_X86_32
@@@ -1,53 -1,50 +1,51 @@@
- #include <linux/init.h>
- #include <linux/kernel.h>
- #include <linux/sched.h>
- #include <linux/string.h>
  #include <linux/bootmem.h>
+ #include <linux/linkage.h>
  #include <linux/bitops.h>
+ #include <linux/kernel.h>
  #include <linux/module.h>
- #include <linux/kgdb.h>
- #include <linux/topology.h>
+ #include <linux/percpu.h>
+ #include <linux/string.h>
  #include <linux/delay.h>
+ #include <linux/sched.h>
+ #include <linux/init.h>
+ #include <linux/kgdb.h>
  #include <linux/smp.h>
- #include <linux/percpu.h>
- #include <asm/i387.h>
- #include <asm/msr.h>
- #include <asm/io.h>
- #include <asm/linkage.h>
+ #include <linux/io.h>
+ #include <asm/stackprotector.h>
++#include <asm/perf_counter.h>
  #include <asm/mmu_context.h>
+ #include <asm/hypervisor.h>
+ #include <asm/processor.h>
+ #include <asm/sections.h>
+ #include <asm/topology.h>
+ #include <asm/cpumask.h>
+ #include <asm/pgtable.h>
+ #include <asm/atomic.h>
+ #include <asm/proto.h>
+ #include <asm/setup.h>
+ #include <asm/apic.h>
+ #include <asm/desc.h>
+ #include <asm/i387.h>
  #include <asm/mtrr.h>
+ #include <asm/numa.h>
+ #include <asm/asm.h>
+ #include <asm/cpu.h>
  #include <asm/mce.h>
- #include <asm/perf_counter.h>
+ #include <asm/msr.h>
  #include <asm/pat.h>
- #include <asm/asm.h>
- #include <asm/numa.h>
  #include <asm/smp.h>
- #include <asm/cpu.h>
- #include <asm/cpumask.h>
- #include <asm/apic.h>
  
  #ifdef CONFIG_X86_LOCAL_APIC
  #include <asm/uv/uv.h>
  #endif
  
- #include <asm/pgtable.h>
- #include <asm/processor.h>
- #include <asm/desc.h>
- #include <asm/atomic.h>
- #include <asm/proto.h>
- #include <asm/sections.h>
- #include <asm/setup.h>
- #include <asm/hypervisor.h>
- #include <asm/stackprotector.h>
  #include "cpu.h"
  
- #ifdef CONFIG_X86_64
  /* all of these masks are initialized in setup_cpu_local_masks() */
- cpumask_var_t cpu_callin_mask;
- cpumask_var_t cpu_callout_mask;
  cpumask_var_t cpu_initialized_mask;
+ cpumask_var_t cpu_callout_mask;
+ cpumask_var_t cpu_callin_mask;
  
  /* representing cpus for which sibling maps can be computed */
  cpumask_var_t cpu_sibling_setup_mask;
@@@ -61,17 -58,7 +59,7 @@@ void __init setup_cpu_local_masks(void
        alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
  }
  
- #else /* CONFIG_X86_32 */
- cpumask_t cpu_callin_map;
- cpumask_t cpu_callout_map;
- cpumask_t cpu_initialized;
- cpumask_t cpu_sibling_setup_map;
- #endif /* CONFIG_X86_32 */
- static struct cpu_dev *this_cpu __cpuinitdata;
+ static const struct cpu_dev *this_cpu __cpuinitdata;
  
  DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
  #ifdef CONFIG_X86_64
         * IRET will check the segment types  kkeil 2000/10/28
         * Also sysret mandates a special GDT layout
         *
-        * The TLS descriptors are currently at a different place compared to i386.
+        * TLS descriptors are currently at a different place compared to i386.
         * Hopefully nobody expects them at a fixed place (Wine?)
         */
-       [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
-       [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
-       [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
-       [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
-       [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
-       [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
+       [GDT_ENTRY_KERNEL32_CS]         = { { { 0x0000ffff, 0x00cf9b00 } } },
+       [GDT_ENTRY_KERNEL_CS]           = { { { 0x0000ffff, 0x00af9b00 } } },
+       [GDT_ENTRY_KERNEL_DS]           = { { { 0x0000ffff, 0x00cf9300 } } },
+       [GDT_ENTRY_DEFAULT_USER32_CS]   = { { { 0x0000ffff, 0x00cffb00 } } },
+       [GDT_ENTRY_DEFAULT_USER_DS]     = { { { 0x0000ffff, 0x00cff300 } } },
+       [GDT_ENTRY_DEFAULT_USER_CS]     = { { { 0x0000ffff, 0x00affb00 } } },
  #else
-       [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
-       [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
-       [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
-       [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
+       [GDT_ENTRY_KERNEL_CS]           = { { { 0x0000ffff, 0x00cf9a00 } } },
+       [GDT_ENTRY_KERNEL_DS]           = { { { 0x0000ffff, 0x00cf9200 } } },
+       [GDT_ENTRY_DEFAULT_USER_CS]     = { { { 0x0000ffff, 0x00cffa00 } } },
+       [GDT_ENTRY_DEFAULT_USER_DS]     = { { { 0x0000ffff, 0x00cff200 } } },
        /*
         * Segments used for calling PnP BIOS have byte granularity.
         * They code segments and data segments have fixed 64k limits,
         * the transfer segment sizes are set at run time.
         */
        /* 32-bit code */
-       [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
+       [GDT_ENTRY_PNPBIOS_CS32]        = { { { 0x0000ffff, 0x00409a00 } } },
        /* 16-bit code */
-       [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
+       [GDT_ENTRY_PNPBIOS_CS16]        = { { { 0x0000ffff, 0x00009a00 } } },
        /* 16-bit data */
-       [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
+       [GDT_ENTRY_PNPBIOS_DS]          = { { { 0x0000ffff, 0x00009200 } } },
        /* 16-bit data */
-       [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
+       [GDT_ENTRY_PNPBIOS_TS1]         = { { { 0x00000000, 0x00009200 } } },
        /* 16-bit data */
-       [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
+       [GDT_ENTRY_PNPBIOS_TS2]         = { { { 0x00000000, 0x00009200 } } },
        /*
         * The APM segments have byte granularity and their bases
         * are set at run time.  All have 64k limits.
         */
        /* 32-bit code */
-       [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
+       [GDT_ENTRY_APMBIOS_BASE]        = { { { 0x0000ffff, 0x00409a00 } } },
        /* 16-bit code */
-       [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
+       [GDT_ENTRY_APMBIOS_BASE+1]      = { { { 0x0000ffff, 0x00009a00 } } },
        /* data */
-       [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
+       [GDT_ENTRY_APMBIOS_BASE+2]      = { { { 0x0000ffff, 0x00409200 } } },
  
-       [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
-       [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
+       [GDT_ENTRY_ESPFIX_SS]           = { { { 0x00000000, 0x00c09200 } } },
+       [GDT_ENTRY_PERCPU]              = { { { 0x0000ffff, 0x00cf9200 } } },
        GDT_STACK_CANARY_INIT
  #endif
  } };
@@@ -165,16 -152,17 +153,17 @@@ static inline int flag_is_changeable_p(
         * the CPUID. Add "volatile" to not allow gcc to
         * optimize the subsequent calls to this function.
         */
-       asm volatile ("pushfl\n\t"
-                     "pushfl\n\t"
-                     "popl %0\n\t"
-                     "movl %0,%1\n\t"
-                     "xorl %2,%0\n\t"
-                     "pushl %0\n\t"
-                     "popfl\n\t"
-                     "pushfl\n\t"
-                     "popl %0\n\t"
-                     "popfl\n\t"
+       asm volatile ("pushfl           \n\t"
+                     "pushfl           \n\t"
+                     "popl %0          \n\t"
+                     "movl %0, %1      \n\t"
+                     "xorl %2, %0      \n\t"
+                     "pushl %0         \n\t"
+                     "popfl            \n\t"
+                     "pushfl           \n\t"
+                     "popl %0          \n\t"
+                     "popfl            \n\t"
                      : "=&r" (f1), "=&r" (f2)
                      : "ir" (flag));
  
@@@ -189,18 -177,22 +178,22 @@@ static int __cpuinit have_cpuid_p(void
  
  static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
  {
-       if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
-               /* Disable processor serial number */
-               unsigned long lo, hi;
-               rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
-               lo |= 0x200000;
-               wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
-               printk(KERN_NOTICE "CPU serial number disabled.\n");
-               clear_cpu_cap(c, X86_FEATURE_PN);
-               /* Disabling the serial number may affect the cpuid level */
-               c->cpuid_level = cpuid_eax(0);
-       }
+       unsigned long lo, hi;
+       if (!cpu_has(c, X86_FEATURE_PN) || !disable_x86_serial_nr)
+               return;
+       /* Disable processor serial number: */
+       rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+       lo |= 0x200000;
+       wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+       printk(KERN_NOTICE "CPU serial number disabled.\n");
+       clear_cpu_cap(c, X86_FEATURE_PN);
+       /* Disabling the serial number may affect the cpuid level */
+       c->cpuid_level = cpuid_eax(0);
  }
  
  static int __init x86_serial_nr_setup(char *s)
@@@ -233,6 -225,7 +226,7 @@@ struct cpuid_dependent_feature 
        u32 feature;
        u32 level;
  };
  static const struct cpuid_dependent_feature __cpuinitconst
  cpuid_dependent_features[] = {
        { X86_FEATURE_MWAIT,            0x00000005 },
  static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
  {
        const struct cpuid_dependent_feature *df;
        for (df = cpuid_dependent_features; df->feature; df++) {
+               if (!cpu_has(c, df->feature))
+                       continue;
                /*
                 * Note: cpuid_level is set to -1 if unavailable, but
                 * extended_extended_level is set to 0 if unavailable
                 * when signed; hence the weird messing around with
                 * signs here...
                 */
-               if (cpu_has(c, df->feature) &&
-                   ((s32)df->level < 0 ?
+               if (!((s32)df->level < 0 ?
                     (u32)df->level > (u32)c->extended_cpuid_level :
-                    (s32)df->level > (s32)c->cpuid_level)) {
-                       clear_cpu_cap(c, df->feature);
-                       if (warn)
-                               printk(KERN_WARNING
-                                      "CPU: CPU feature %s disabled "
-                                      "due to lack of CPUID level 0x%x\n",
-                                      x86_cap_flags[df->feature],
-                                      df->level);
-               }
+                    (s32)df->level > (s32)c->cpuid_level))
+                       continue;
+               clear_cpu_cap(c, df->feature);
+               if (!warn)
+                       continue;
+               printk(KERN_WARNING
+                      "CPU: CPU feature %s disabled, no CPUID level 0x%x\n",
+                               x86_cap_flags[df->feature], df->level);
        }
  }
  
  /*
   * Naming convention should be: <Name> [(<Codename>)]
   * This table only is used unless init_<vendor>() below doesn't set it;
-  * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
-  *
+  * in particular, if CPUID levels 0x80000002..4 are supported, this
+  * isn't used
   */
  
  /* Look up CPU names by table lookup. */
- static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
+ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
  {
-       struct cpu_model_info *info;
+       const struct cpu_model_info *info;
  
        if (c->x86_model >= 16)
                return NULL;    /* Range check */
@@@ -308,8 -305,10 +306,10 @@@ void load_percpu_segment(int cpu
        load_stack_canary_segment();
  }
  
- /* Current gdt points %fs at the "master" per-cpu area: after this,
-  * it's on the real one. */
+ /*
+  * Current gdt points %fs at the "master" per-cpu area: after this,
+  * it's on the real one.
+  */
  void switch_to_new_gdt(int cpu)
  {
        struct desc_ptr gdt_descr;
        load_percpu_segment(cpu);
  }
  
- static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
+ static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {};
  
  static void __cpuinit default_init(struct cpuinfo_x86 *c)
  {
  #endif
  }
  
- static struct cpu_dev __cpuinitdata default_cpu = {
+ static const struct cpu_dev __cpuinitconst default_cpu = {
        .c_init = default_init,
        .c_vendor = "Unknown",
        .c_x86_vendor = X86_VENDOR_UNKNOWN,
@@@ -355,22 -354,24 +355,24 @@@ static void __cpuinit get_model_name(st
        if (c->extended_cpuid_level < 0x80000004)
                return;
  
-       v = (unsigned int *) c->x86_model_id;
+       v = (unsigned int *)c->x86_model_id;
        cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
        cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
        cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
        c->x86_model_id[48] = 0;
  
-       /* Intel chips right-justify this string for some dumb reason;
-          undo that brain damage */
+       /*
+        * Intel chips right-justify this string for some dumb reason;
+        * undo that brain damage:
+        */
        p = q = &c->x86_model_id[0];
        while (*p == ' ')
-            p++;
+               p++;
        if (p != q) {
-            while (*p)
-                 *q++ = *p++;
-            while (q <= &c->x86_model_id[48])
-                 *q++ = '\0';  /* Zero-pad the rest */
+               while (*p)
+                       *q++ = *p++;
+               while (q <= &c->x86_model_id[48])
+                       *q++ = '\0';    /* Zero-pad the rest */
        }
  }
  
@@@ -439,27 -440,30 +441,30 @@@ void __cpuinit detect_ht(struct cpuinfo
  
        if (smp_num_siblings == 1) {
                printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
-       } else if (smp_num_siblings > 1) {
+               goto out;
+       }
  
-               if (smp_num_siblings > nr_cpu_ids) {
-                       printk(KERN_WARNING "CPU: Unsupported number of siblings %d",
-                                       smp_num_siblings);
-                       smp_num_siblings = 1;
-                       return;
-               }
+       if (smp_num_siblings <= 1)
+               goto out;
  
-               index_msb = get_count_order(smp_num_siblings);
-               c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
+       if (smp_num_siblings > nr_cpu_ids) {
+               pr_warning("CPU: Unsupported number of siblings %d",
+                          smp_num_siblings);
+               smp_num_siblings = 1;
+               return;
+       }
  
-               smp_num_siblings = smp_num_siblings / c->x86_max_cores;
+       index_msb = get_count_order(smp_num_siblings);
+       c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
  
-               index_msb = get_count_order(smp_num_siblings);
+       smp_num_siblings = smp_num_siblings / c->x86_max_cores;
  
-               core_bits = get_count_order(c->x86_max_cores);
+       index_msb = get_count_order(smp_num_siblings);
  
-               c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
-                                              ((1 << core_bits) - 1);
-       }
+       core_bits = get_count_order(c->x86_max_cores);
+       c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
+                                      ((1 << core_bits) - 1);
  
  out:
        if ((c->x86_max_cores * smp_num_siblings) > 1) {
  static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
  {
        char *v = c->x86_vendor_id;
-       int i;
        static int printed;
+       int i;
  
        for (i = 0; i < X86_VENDOR_NUM; i++) {
                if (!cpu_devs[i])
                if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
                    (cpu_devs[i]->c_ident[1] &&
                     !strcmp(v, cpu_devs[i]->c_ident[1]))) {
                        this_cpu = cpu_devs[i];
                        c->x86_vendor = this_cpu->c_x86_vendor;
                        return;
  
        if (!printed) {
                printed++;
-               printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v);
+               printk(KERN_ERR
+                   "CPU: vendor_id '%s' unknown, using generic init.\n", v);
                printk(KERN_ERR "CPU: Your system may be unstable.\n");
        }
  
@@@ -512,14 -519,17 +520,17 @@@ void __cpuinit cpu_detect(struct cpuinf
        /* Intel-defined flags: level 0x00000001 */
        if (c->cpuid_level >= 0x00000001) {
                u32 junk, tfms, cap0, misc;
                cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
                c->x86 = (tfms >> 8) & 0xf;
                c->x86_model = (tfms >> 4) & 0xf;
                c->x86_mask = tfms & 0xf;
                if (c->x86 == 0xf)
                        c->x86 += (tfms >> 20) & 0xff;
                if (c->x86 >= 0x6)
                        c->x86_model += ((tfms >> 16) & 0xf) << 4;
                if (cap0 & (1<<19)) {
                        c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
                        c->x86_cache_alignment = c->x86_clflush_size;
@@@ -535,6 -545,7 +546,7 @@@ static void __cpuinit get_cpu_cap(struc
        /* Intel-defined flags: level 0x00000001 */
        if (c->cpuid_level >= 0x00000001) {
                u32 capability, excap;
                cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
                c->x86_capability[0] = capability;
                c->x86_capability[4] = excap;
        /* AMD-defined flags: level 0x80000001 */
        xlvl = cpuid_eax(0x80000000);
        c->extended_cpuid_level = xlvl;
        if ((xlvl & 0xffff0000) == 0x80000000) {
                if (xlvl >= 0x80000001) {
                        c->x86_capability[1] = cpuid_edx(0x80000001);
                }
        }
  
- #ifdef CONFIG_X86_64
        if (c->extended_cpuid_level >= 0x80000008) {
                u32 eax = cpuid_eax(0x80000008);
  
                c->x86_virt_bits = (eax >> 8) & 0xff;
                c->x86_phys_bits = eax & 0xff;
        }
+ #ifdef CONFIG_X86_32
+       else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
+               c->x86_phys_bits = 36;
  #endif
  
        if (c->extended_cpuid_level >= 0x80000007)
@@@ -603,8 -617,12 +618,12 @@@ static void __init early_identify_cpu(s
  {
  #ifdef CONFIG_X86_64
        c->x86_clflush_size = 64;
+       c->x86_phys_bits = 36;
+       c->x86_virt_bits = 48;
  #else
        c->x86_clflush_size = 32;
+       c->x86_phys_bits = 32;
+       c->x86_virt_bits = 32;
  #endif
        c->x86_cache_alignment = c->x86_clflush_size;
  
  
  void __init early_cpu_init(void)
  {
-       struct cpu_dev **cdev;
+       const struct cpu_dev *const *cdev;
        int count = 0;
  
-       printk("KERNEL supported cpus:\n");
+       printk(KERN_INFO "KERNEL supported cpus:\n");
        for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
-               struct cpu_dev *cpudev = *cdev;
+               const struct cpu_dev *cpudev = *cdev;
                unsigned int j;
  
                if (count >= X86_VENDOR_NUM)
                for (j = 0; j < 2; j++) {
                        if (!cpudev->c_ident[j])
                                continue;
-                       printk("  %s %s\n", cpudev->c_vendor,
+                       printk(KERN_INFO "  %s %s\n", cpudev->c_vendor,
                                cpudev->c_ident[j]);
                }
        }
@@@ -727,9 -745,13 +746,13 @@@ static void __cpuinit identify_cpu(stru
        c->x86_coreid_bits = 0;
  #ifdef CONFIG_X86_64
        c->x86_clflush_size = 64;
+       c->x86_phys_bits = 36;
+       c->x86_virt_bits = 48;
  #else
        c->cpuid_level = -1;    /* CPUID not detected */
        c->x86_clflush_size = 32;
+       c->x86_phys_bits = 32;
+       c->x86_virt_bits = 32;
  #endif
        c->x86_cache_alignment = c->x86_clflush_size;
        memset(&c->x86_capability, 0, sizeof c->x86_capability);
        squash_the_stupid_serial_number(c);
  
        /*
-        * The vendor-specific functions might have changed features.  Now
-        * we do "generic changes."
+        * The vendor-specific functions might have changed features.
+        * Now we do "generic changes."
         */
  
        /* Filter out anything that depends on CPUID levels we don't have */
  
        /* If the model name is still unset, do table lookup. */
        if (!c->x86_model_id[0]) {
-               char *p;
+               const char *p;
                p = table_lookup_model(c);
                if (p)
                        strcpy(c->x86_model_id, p);
@@@ -825,13 -847,13 +848,14 @@@ static void vgetcpu_set_mode(void
  void __init identify_boot_cpu(void)
  {
        identify_cpu(&boot_cpu_data);
+       init_c1e_mask();
  #ifdef CONFIG_X86_32
        sysenter_setup();
        enable_sep_cpu();
  #else
        vgetcpu_set_mode();
  #endif
 +      init_hw_perf_counters();
  }
  
  void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
  }
  
  struct msr_range {
-       unsigned min;
-       unsigned max;
+       unsigned        min;
+       unsigned        max;
  };
  
- static struct msr_range msr_range_array[] __cpuinitdata = {
+ static const struct msr_range msr_range_array[] __cpuinitconst = {
        { 0x00000000, 0x00000418},
        { 0xc0000000, 0xc000040b},
        { 0xc0010000, 0xc0010142},
  
  static void __cpuinit print_cpu_msr(void)
  {
+       unsigned index_min, index_max;
        unsigned index;
        u64 val;
        int i;
-       unsigned index_min, index_max;
  
        for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
                index_min = msr_range_array[i].min;
                index_max = msr_range_array[i].max;
                for (index = index_min; index < index_max; index++) {
                        if (rdmsrl_amd_safe(index, &val))
                                continue;
  }
  
  static int show_msr __cpuinitdata;
  static __init int setup_show_msr(char *arg)
  {
        int num;
@@@ -896,12 -920,14 +922,14 @@@ __setup("noclflush", setup_noclflush)
  
  void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
  {
-       char *vendor = NULL;
+       const char *vendor = NULL;
  
-       if (c->x86_vendor < X86_VENDOR_NUM)
+       if (c->x86_vendor < X86_VENDOR_NUM) {
                vendor = this_cpu->c_vendor;
-       else if (c->cpuid_level >= 0)
-               vendor = c->x86_vendor_id;
+       } else {
+               if (c->cpuid_level >= 0)
+                       vendor = c->x86_vendor_id;
+       }
  
        if (vendor && !strstr(c->x86_model_id, vendor))
                printk(KERN_CONT "%s ", vendor);
  static __init int setup_disablecpuid(char *arg)
  {
        int bit;
        if (get_option(&arg, &bit) && bit < NCAPINTS*32)
                setup_clear_cpu_cap(bit);
        else
                return 0;
        return 1;
  }
  __setup("clearcpuid=", setup_disablecpuid);
@@@ -941,6 -969,7 +971,7 @@@ struct desc_ptr idt_descr = { 256 * 16 
  
  DEFINE_PER_CPU_FIRST(union irq_stack_union,
                     irq_stack_union) __aligned(PAGE_SIZE);
  DEFINE_PER_CPU(char *, irq_stack_ptr) =
        init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
  
@@@ -950,12 -979,21 +981,21 @@@ EXPORT_PER_CPU_SYMBOL(kernel_stack)
  
  DEFINE_PER_CPU(unsigned int, irq_count) = -1;
  
+ /*
+  * Special IST stacks which the CPU switches to when it calls
+  * an IST-marked descriptor entry. Up to 7 stacks (hardware
+  * limit), all of them are 4K, except the debug stack which
+  * is 8K.
+  */
+ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
+         [0 ... N_EXCEPTION_STACKS - 1]        = EXCEPTION_STKSZ,
+         [DEBUG_STACK - 1]                     = DEBUG_STKSZ
+ };
  static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
        [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
        __aligned(PAGE_SIZE);
  
- extern asmlinkage void ignore_sysret(void);
  /* May not be marked __init: used by software suspend */
  void syscall_init(void)
  {
@@@ -985,7 -1023,7 +1025,7 @@@ unsigned long kernel_eflags
   */
  DEFINE_PER_CPU(struct orig_ist, orig_ist);
  
- #else /* x86_64 */
+ #else /* CONFIG_X86_64 */
  
  #ifdef CONFIG_CC_STACKPROTECTOR
  DEFINE_PER_CPU(unsigned long, stack_canary);
@@@ -997,9 -1035,26 +1037,26 @@@ struct pt_regs * __cpuinit idle_regs(st
        memset(regs, 0, sizeof(struct pt_regs));
        regs->fs = __KERNEL_PERCPU;
        regs->gs = __KERNEL_STACK_CANARY;
        return regs;
  }
- #endif        /* x86_64 */
+ #endif        /* CONFIG_X86_64 */
+ /*
+  * Clear all 6 debug registers:
+  */
+ static void clear_all_debug_regs(void)
+ {
+       int i;
+       for (i = 0; i < 8; i++) {
+               /* Ignore db4, db5 */
+               if ((i == 4) || (i == 5))
+                       continue;
+               set_debugreg(0, i);
+       }
+ }
  
  /*
   * cpu_init() initializes state that is per-CPU. Some data is already
   * A lot of state is already set up in PDA init for 64 bit
   */
  #ifdef CONFIG_X86_64
  void __cpuinit cpu_init(void)
  {
-       int cpu = stack_smp_processor_id();
-       struct tss_struct *t = &per_cpu(init_tss, cpu);
-       struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
-       unsigned long v;
+       struct orig_ist *orig_ist;
        struct task_struct *me;
+       struct tss_struct *t;
+       unsigned long v;
+       int cpu;
        int i;
  
+       cpu = stack_smp_processor_id();
+       t = &per_cpu(init_tss, cpu);
+       orig_ist = &per_cpu(orig_ist, cpu);
  #ifdef CONFIG_NUMA
        if (cpu != 0 && percpu_read(node_number) == 0 &&
            cpu_to_node(cpu) != NUMA_NO_NODE)
         * set up and load the per-CPU TSS
         */
        if (!orig_ist->ist[0]) {
-               static const unsigned int sizes[N_EXCEPTION_STACKS] = {
-                 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
-                 [DEBUG_STACK - 1] = DEBUG_STKSZ
-               };
                char *estacks = per_cpu(exception_stacks, cpu);
                for (v = 0; v < N_EXCEPTION_STACKS; v++) {
-                       estacks += sizes[v];
+                       estacks += exception_stack_sizes[v];
                        orig_ist->ist[v] = t->x86_tss.ist[v] =
                                        (unsigned long)estacks;
                }
        }
  
        t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
        /*
         * <= is required because the CPU will access up to
         * 8 bits beyond the end of the IO permission bitmap.
  
        atomic_inc(&init_mm.mm_count);
        me->active_mm = &init_mm;
-       if (me->mm)
-               BUG();
+       BUG_ON(me->mm);
        enter_lazy_tlb(&init_mm, me);
  
        load_sp0(t, &current->thread);
                arch_kgdb_ops.correct_hw_break();
        else
  #endif
-       {
-               /*
-                * Clear all 6 debug registers:
-                */
-               set_debugreg(0UL, 0);
-               set_debugreg(0UL, 1);
-               set_debugreg(0UL, 2);
-               set_debugreg(0UL, 3);
-               set_debugreg(0UL, 6);
-               set_debugreg(0UL, 7);
-       }
+               clear_all_debug_regs();
  
        fpu_init();
  
@@@ -1131,7 -1178,8 +1180,8 @@@ void __cpuinit cpu_init(void
  
        if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
                printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
-               for (;;) local_irq_enable();
+               for (;;)
+                       local_irq_enable();
        }
  
        printk(KERN_INFO "Initializing CPU#%d\n", cpu);
         */
        atomic_inc(&init_mm.mm_count);
        curr->active_mm = &init_mm;
-       if (curr->mm)
-               BUG();
+       BUG_ON(curr->mm);
        enter_lazy_tlb(&init_mm, curr);
  
        load_sp0(t, thread);
        __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
  #endif
  
-       /* Clear all 6 debug registers: */
-       set_debugreg(0, 0);
-       set_debugreg(0, 1);
-       set_debugreg(0, 2);
-       set_debugreg(0, 3);
-       set_debugreg(0, 6);
-       set_debugreg(0, 7);
+       clear_all_debug_regs();
  
        /*
         * Force FPU initialization:
  
        xsave_init();
  }
  #endif
@@@ -368,6 -368,7 +368,7 @@@ ENTRY(save_rest
  END(save_rest)
  
  /* save complete stack frame */
+       .pushsection .kprobes.text, "ax"
  ENTRY(save_paranoid)
        XCPT_FRAME 1 RDI+8
        cld
  1:    ret
        CFI_ENDPROC
  END(save_paranoid)
+       .popsection
  
  /*
   * A newly forked process directly context switches into this address.
@@@ -416,7 -418,6 +418,6 @@@ ENTRY(ret_from_fork
  
        GET_THREAD_INFO(%rcx)
  
-       CFI_REMEMBER_STATE
        RESTORE_REST
  
        testl $3, CS-ARGOFFSET(%rsp)            # from kernel_thread?
        RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
        jmp ret_from_sys_call                   # go to the SYSRET fastpath
  
-       CFI_RESTORE_STATE
        CFI_ENDPROC
  END(ret_from_fork)
  
@@@ -984,6 -984,8 +984,8 @@@ apicinterrupt UV_BAU_MESSAGE 
  #endif
  apicinterrupt LOCAL_TIMER_VECTOR \
        apic_timer_interrupt smp_apic_timer_interrupt
+ apicinterrupt GENERIC_INTERRUPT_VECTOR \
+       generic_interrupt smp_generic_interrupt
  
  #ifdef CONFIG_SMP
  apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
@@@ -1023,11 -1025,6 +1025,11 @@@ apicinterrupt ERROR_APIC_VECTOR 
  apicinterrupt SPURIOUS_APIC_VECTOR \
        spurious_interrupt smp_spurious_interrupt
  
 +#ifdef CONFIG_PERF_COUNTERS
 +apicinterrupt LOCAL_PERF_VECTOR \
 +      perf_counter_interrupt smp_perf_counter_interrupt
 +#endif
 +
  /*
   * Exception entry points.
   */
diff --combined arch/x86/kernel/irq.c
@@@ -15,6 -15,9 +15,9 @@@
  
  atomic_t irq_err_count;
  
+ /* Function pointer for generic interrupt vector handling */
+ void (*generic_interrupt_extension)(void) = NULL;
  /*
   * 'what should we do if we get a hw irq event on an illegal vector'.
   * each architecture has to answer this themselves.
@@@ -42,59 -45,60 +45,64 @@@ void ack_bad_irq(unsigned int irq
  /*
   * /proc/interrupts printing:
   */
- static int show_other_interrupts(struct seq_file *p)
+ static int show_other_interrupts(struct seq_file *p, int prec)
  {
        int j;
  
-       seq_printf(p, "NMI: ");
+       seq_printf(p, "%*s: ", prec, "NMI");
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
        seq_printf(p, "  Non-maskable interrupts\n");
  #ifdef CONFIG_X86_LOCAL_APIC
-       seq_printf(p, "LOC: ");
+       seq_printf(p, "%*s: ", prec, "LOC");
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
        seq_printf(p, "  Local timer interrupts\n");
+       seq_printf(p, "%*s: ", prec, "SPU");
+       for_each_online_cpu(j)
+               seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
+       seq_printf(p, "  Spurious interrupts\n");
 +      seq_printf(p, "%*s: ", prec, "CNT");
 +      for_each_online_cpu(j)
 +              seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
 +      seq_printf(p, "  Performance counter interrupts\n");
  #endif
+       if (generic_interrupt_extension) {
+               seq_printf(p, "%*s: ", prec, "PLT");
+               for_each_online_cpu(j)
+                       seq_printf(p, "%10u ", irq_stats(j)->generic_irqs);
+               seq_printf(p, "  Platform interrupts\n");
+       }
  #ifdef CONFIG_SMP
-       seq_printf(p, "RES: ");
+       seq_printf(p, "%*s: ", prec, "RES");
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
        seq_printf(p, "  Rescheduling interrupts\n");
-       seq_printf(p, "CAL: ");
+       seq_printf(p, "%*s: ", prec, "CAL");
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
        seq_printf(p, "  Function call interrupts\n");
-       seq_printf(p, "TLB: ");
+       seq_printf(p, "%*s: ", prec, "TLB");
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
        seq_printf(p, "  TLB shootdowns\n");
  #endif
  #ifdef CONFIG_X86_MCE
-       seq_printf(p, "TRM: ");
+       seq_printf(p, "%*s: ", prec, "TRM");
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
        seq_printf(p, "  Thermal event interrupts\n");
  # ifdef CONFIG_X86_64
-       seq_printf(p, "THR: ");
+       seq_printf(p, "%*s: ", prec, "THR");
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
        seq_printf(p, "  Threshold APIC interrupts\n");
  # endif
  #endif
- #ifdef CONFIG_X86_LOCAL_APIC
-       seq_printf(p, "SPU: ");
-       for_each_online_cpu(j)
-               seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
-       seq_printf(p, "  Spurious interrupts\n");
- #endif
-       seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
+       seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
  #if defined(CONFIG_X86_IO_APIC)
-       seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
+       seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
  #endif
        return 0;
  }
  int show_interrupts(struct seq_file *p, void *v)
  {
        unsigned long flags, any_count = 0;
-       int i = *(loff_t *) v, j;
+       int i = *(loff_t *) v, j, prec;
        struct irqaction *action;
        struct irq_desc *desc;
  
        if (i > nr_irqs)
                return 0;
  
+       for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
+               j *= 10;
        if (i == nr_irqs)
-               return show_other_interrupts(p);
+               return show_other_interrupts(p, prec);
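
The small loop above sizes the label column for /proc/interrupts: prec starts at 3 and gains one digit while nr_irqs still has more decimal digits, capped at 10. A stand-alone sketch of the same calculation (the function name and test values here are invented):

    #include <stdio.h>

    /* at least 3 wide, otherwise the decimal width of nr_irqs, capped at 10 */
    static int irq_label_width(unsigned int nr_irqs)
    {
            int prec;
            unsigned int j;

            for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
                    j *= 10;
            return prec;
    }

    int main(void)
    {
            printf("%d %d %d\n", irq_label_width(16),      /* 3 */
                                 irq_label_width(1000),    /* 4 */
                                 irq_label_width(65536));  /* 5 */
            return 0;
    }

The same prec is passed on to show_other_interrupts() so the NMI/LOC/RES rows line up with the numeric IRQ rows.
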
  
        /* print header */
        if (i == 0) {
-               seq_printf(p, "           ");
+               seq_printf(p, "%*s", prec + 8, "");
                for_each_online_cpu(j)
                        seq_printf(p, "CPU%-8d", j);
                seq_putc(p, '\n');
                return 0;
  
        spin_lock_irqsave(&desc->lock, flags);
- #ifndef CONFIG_SMP
-       any_count = kstat_irqs(i);
- #else
        for_each_online_cpu(j)
                any_count |= kstat_irqs_cpu(i, j);
- #endif
        action = desc->action;
        if (!action && !any_count)
                goto out;
  
-       seq_printf(p, "%3d: ", i);
- #ifndef CONFIG_SMP
-       seq_printf(p, "%10u ", kstat_irqs(i));
- #else
+       seq_printf(p, "%*d: ", prec, i);
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
- #endif
        seq_printf(p, " %8s", desc->chip->name);
        seq_printf(p, "-%-8s", desc->name);
  
@@@ -166,8 -165,10 +169,11 @@@ u64 arch_irq_stat_cpu(unsigned int cpu
  
  #ifdef CONFIG_X86_LOCAL_APIC
        sum += irq_stats(cpu)->apic_timer_irqs;
+       sum += irq_stats(cpu)->irq_spurious_count;
 +      sum += irq_stats(cpu)->apic_perf_irqs;
  #endif
+       if (generic_interrupt_extension)
+               sum += irq_stats(cpu)->generic_irqs;
  #ifdef CONFIG_SMP
        sum += irq_stats(cpu)->irq_resched_count;
        sum += irq_stats(cpu)->irq_call_count;
        sum += irq_stats(cpu)->irq_threshold_count;
  #endif
  #endif
- #ifdef CONFIG_X86_LOCAL_APIC
-       sum += irq_stats(cpu)->irq_spurious_count;
- #endif
        return sum;
  }
  
@@@ -231,4 -229,27 +234,27 @@@ unsigned int __irq_entry do_IRQ(struct 
        return 1;
  }
  
+ /*
+  * Handler for GENERIC_INTERRUPT_VECTOR.
+  */
+ void smp_generic_interrupt(struct pt_regs *regs)
+ {
+       struct pt_regs *old_regs = set_irq_regs(regs);
+       ack_APIC_irq();
+       exit_idle();
+       irq_enter();
+       inc_irq_stat(generic_irqs);
+       if (generic_interrupt_extension)
+               generic_interrupt_extension();
+       irq_exit();
+       set_irq_regs(old_regs);
+ }
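
generic_interrupt_extension is an optional-hook pattern: one function pointer that platform code may install, checked for NULL both here and in the /proc/interrupts accounting above. Stripped of the APIC acknowledgement and irq_enter()/irq_exit() bookkeeping, the pattern reduces to the sketch below (the registrant is hypothetical):

    #include <stdio.h>

    /* optional hook, NULL until some platform driver registers one */
    static void (*generic_interrupt_extension)(void);

    static void my_platform_handler(void)          /* hypothetical registrant */
    {
            puts("platform interrupt handled");
    }

    static void generic_interrupt_skeleton(void)
    {
            /* ack_APIC_irq(), irq_enter(), statistics elided */
            if (generic_interrupt_extension)
                    generic_interrupt_extension();
            /* irq_exit() elided */
    }

    int main(void)
    {
            generic_interrupt_skeleton();           /* hook unset: nothing happens */
            generic_interrupt_extension = my_platform_handler;
            generic_interrupt_skeleton();           /* hook set: handler runs */
            return 0;
    }
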
  EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
@@@ -50,7 -50,6 +50,6 @@@ static irqreturn_t math_error_irq(int c
   */
  static struct irqaction fpu_irq = {
        .handler = math_error_irq,
-       .mask = CPU_MASK_NONE,
        .name = "fpu",
  };
  
@@@ -83,7 -82,6 +82,6 @@@ void __init init_ISA_irqs(void
   */
  static struct irqaction irq2 = {
        .handler = no_action,
-       .mask = CPU_MASK_NONE,
        .name = "cascade",
  };
  
@@@ -120,8 -118,28 +118,8 @@@ int vector_used_by_percpu_irq(unsigned 
        return 0;
  }
  
 -/* Overridden in paravirt.c */
 -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
 -
 -void __init native_init_IRQ(void)
 +static void __init smp_intr_init(void)
  {
 -      int i;
 -
 -      /* Execute any quirks before the call gates are initialised: */
 -      x86_quirk_pre_intr_init();
 -
 -      /*
 -       * Cover the whole vector space, no vector can escape
 -       * us. (some of these will be overridden and become
 -       * 'special' SMP interrupts)
 -       */
 -      for (i =  FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
 -              /* SYSCALL_VECTOR was reserved in trap_init. */
 -              if (i != SYSCALL_VECTOR)
 -                      set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
 -      }
 -
 -
  #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
        /*
         * The reschedule interrupt is a CPU-to-CPU reschedule-helper
        set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
        set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
  #endif
 +}
 +
 +static void __init apic_intr_init(void)
 +{
 +      smp_intr_init();
  
  #ifdef CONFIG_X86_LOCAL_APIC
        /* self generated IPI for local APIC timer */
        alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
  
+       /* generic IPI for platform specific use */
+       alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
        /* IPI vectors for APIC spurious and error interrupts */
        alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
        alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
 -#endif
 +# ifdef CONFIG_PERF_COUNTERS
 +      alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
 +# endif
  
 -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
 +# ifdef CONFIG_X86_MCE_P4THERMAL
        /* thermal monitor LVT interrupt */
        alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
 +# endif
  #endif
 +}
 +
 +/* Overridden in paravirt.c */
 +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
 +
 +void __init native_init_IRQ(void)
 +{
 +      int i;
 +
 +      /* Execute any quirks before the call gates are initialised: */
 +      x86_quirk_pre_intr_init();
 +
 +      apic_intr_init();
 +
 +      /*
 +       * Cover the whole vector space, no vector can escape
 +       * us. (some of these will be overridden and become
 +       * 'special' SMP interrupts)
 +       */
 +      for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
 +              int vector = FIRST_EXTERNAL_VECTOR + i;
 +              /* SYSCALL_VECTOR was reserved in trap_init. */
 +              if (!test_bit(vector, used_vectors))
 +                      set_intr_gate(vector, interrupt[i]);
 +      }
  
        if (!acpi_ioapic)
                setup_irq(2, &irq2);
@@@ -45,7 -45,6 +45,6 @@@
  
  static struct irqaction irq2 = {
        .handler = no_action,
-       .mask = CPU_MASK_NONE,
        .name = "cascade",
  };
  DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
@@@ -147,14 -146,12 +146,17 @@@ static void __init apic_intr_init(void
        /* self generated IPI for local APIC timer */
        alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
  
+       /* generic IPI for platform specific use */
+       alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
        /* IPI vectors for APIC spurious and error interrupts */
        alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
        alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
 +
 +      /* Performance monitoring interrupt: */
 +#ifdef CONFIG_PERF_COUNTERS
 +      alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
 +#endif
  }
  
  void __init native_init_IRQ(void)
        int i;
  
        init_ISA_irqs();
 +
 +      apic_intr_init();
 +
        /*
         * Cover the whole vector space, no vector can escape
         * us. (some of these will be overridden and become
         */
        for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
                int vector = FIRST_EXTERNAL_VECTOR + i;
 -              if (vector != IA32_SYSCALL_VECTOR)
 +              if (!test_bit(vector, used_vectors))
                        set_intr_gate(vector, interrupt[i]);
        }
  
 -      apic_intr_init();
 -
        if (!acpi_ioapic)
                setup_irq(2, &irq2);
  }
diff --combined arch/x86/kernel/signal.c
@@@ -6,7 -6,7 +6,7 @@@
   *  2000-06-20  Pentium III FXSR, SSE support by Gareth Hughes
   *  2000-2002   x86-64 support by Andi Kleen
   */
 -
 +#include <linux/perf_counter.h>
  #include <linux/sched.h>
  #include <linux/mm.h>
  #include <linux/smp.h>
@@@ -187,6 -187,77 +187,77 @@@ setup_sigcontext(struct sigcontext __us
  /*
   * Set up a signal frame.
   */
+ /*
+  * Determine which stack to use.
+  */
+ static unsigned long align_sigframe(unsigned long sp)
+ {
+ #ifdef CONFIG_X86_32
+       /*
+        * Align the stack pointer according to the i386 ABI,
+        * i.e. so that on function entry ((sp + 4) & 15) == 0.
+        */
+       sp = ((sp + 4) & -16ul) - 4;
+ #else /* !CONFIG_X86_32 */
+       sp = round_down(sp, 16) - 8;
+ #endif
+       return sp;
+ }
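
align_sigframe() encodes two ABI rules: on i386 the frame must land so that ((sp + 4) & 15) == 0 at handler entry, and on x86-64 it is 16-byte aligned minus 8, mimicking the slot a CALL would have pushed. A quick self-contained check of both formulas follows; the helper names and the test range are invented for this sketch, and a 64-bit build is assumed.

    #include <assert.h>

    static unsigned long align_sigframe_i386(unsigned long sp)
    {
            return ((sp + 4) & ~15ul) - 4;      /* makes (sp + 4) 16-byte aligned */
    }

    static unsigned long align_sigframe_x86_64(unsigned long sp)
    {
            return (sp & ~15ul) - 8;            /* round_down(sp, 16) - 8 */
    }

    int main(void)
    {
            unsigned long sp;

            for (sp = 0x7fff0000ul; sp < 0x7fff0040ul; sp++) {
                    assert((align_sigframe_i386(sp) + 4) % 16 == 0);
                    assert(align_sigframe_x86_64(sp) % 16 == 8);
            }
            return 0;
    }
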
+ static inline void __user *
+ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
+            void __user **fpstate)
+ {
+       /* Default to using normal stack */
+       unsigned long sp = regs->sp;
+       int onsigstack = on_sig_stack(sp);
+ #ifdef CONFIG_X86_64
+       /* redzone */
+       sp -= 128;
+ #endif /* CONFIG_X86_64 */
+       if (!onsigstack) {
+               /* This is the X/Open sanctioned signal stack switching.  */
+               if (ka->sa.sa_flags & SA_ONSTACK) {
+                       if (current->sas_ss_size)
+                               sp = current->sas_ss_sp + current->sas_ss_size;
+               } else {
+ #ifdef CONFIG_X86_32
+                       /* This is the legacy signal stack switching. */
+                       if ((regs->ss & 0xffff) != __USER_DS &&
+                               !(ka->sa.sa_flags & SA_RESTORER) &&
+                                       ka->sa.sa_restorer)
+                               sp = (unsigned long) ka->sa.sa_restorer;
+ #endif /* CONFIG_X86_32 */
+               }
+       }
+       if (used_math()) {
+               sp -= sig_xstate_size;
+ #ifdef CONFIG_X86_64
+               sp = round_down(sp, 64);
+ #endif /* CONFIG_X86_64 */
+               *fpstate = (void __user *)sp;
+       }
+       sp = align_sigframe(sp - frame_size);
+       /*
+        * If we are on the alternate signal stack and would overflow it, don't.
+        * Return an always-bogus address instead so we will die with SIGSEGV.
+        */
+       if (onsigstack && !likely(on_sig_stack(sp)))
+               return (void __user *)-1L;
+       /* save i387 state */
+       if (used_math() && save_i387_xstate(*fpstate) < 0)
+               return (void __user *)-1L;
+       return (void __user *)sp;
+ }
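
On the 64-bit path the reservations above happen in a fixed order: skip the 128-byte red zone, carve out the xsave area rounded down to a 64-byte boundary, then place the rt_sigframe via align_sigframe(). A rough walk-through with made-up sizes (sig_xstate_size and the frame size vary by CPU features and kernel config; a 64-bit build is assumed):

    #include <stdio.h>

    static unsigned long round_down_ul(unsigned long x, unsigned long align)
    {
            return x & ~(align - 1);
    }

    int main(void)
    {
            unsigned long sp          = 0x7ffffffde123ul;  /* hypothetical user %rsp */
            unsigned long xstate_size = 832;               /* hypothetical sig_xstate_size */
            unsigned long frame_size  = 440;               /* hypothetical sizeof(struct rt_sigframe) */
            unsigned long fpstate;

            sp -= 128;                                     /* x86-64 red zone */
            sp  = round_down_ul(sp - xstate_size, 64);     /* FPU/xsave save area */
            fpstate = sp;
            sp  = round_down_ul(sp - frame_size, 16) - 8;  /* align_sigframe() */

            printf("fpstate at %#lx, rt_sigframe at %#lx\n", fpstate, sp);
            return 0;
    }
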
  #ifdef CONFIG_X86_32
  static const struct {
        u16 poplmovl;
@@@ -210,54 -281,6 +281,6 @@@ static const struct 
        0
  };
  
- /*
-  * Determine which stack to use..
-  */
- static inline void __user *
- get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
-            void **fpstate)
- {
-       unsigned long sp;
-       /* Default to using normal stack */
-       sp = regs->sp;
-       /*
-        * If we are on the alternate signal stack and would overflow it, don't.
-        * Return an always-bogus address instead so we will die with SIGSEGV.
-        */
-       if (on_sig_stack(sp) && !likely(on_sig_stack(sp - frame_size)))
-               return (void __user *) -1L;
-       /* This is the X/Open sanctioned signal stack switching.  */
-       if (ka->sa.sa_flags & SA_ONSTACK) {
-               if (sas_ss_flags(sp) == 0)
-                       sp = current->sas_ss_sp + current->sas_ss_size;
-       } else {
-               /* This is the legacy signal stack switching. */
-               if ((regs->ss & 0xffff) != __USER_DS &&
-                       !(ka->sa.sa_flags & SA_RESTORER) &&
-                               ka->sa.sa_restorer)
-                       sp = (unsigned long) ka->sa.sa_restorer;
-       }
-       if (used_math()) {
-               sp = sp - sig_xstate_size;
-               *fpstate = (struct _fpstate *) sp;
-               if (save_i387_xstate(*fpstate) < 0)
-                       return (void __user *)-1L;
-       }
-       sp -= frame_size;
-       /*
-        * Align the stack pointer according to the i386 ABI,
-        * i.e. so that on function entry ((sp + 4) & 15) == 0.
-        */
-       sp = ((sp + 4) & -16ul) - 4;
-       return (void __user *) sp;
- }
  static int
  __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
              struct pt_regs *regs)
@@@ -388,24 -411,6 +411,6 @@@ static int __setup_rt_frame(int sig, st
        return 0;
  }
  #else /* !CONFIG_X86_32 */
- /*
-  * Determine which stack to use..
-  */
- static void __user *
- get_stack(struct k_sigaction *ka, unsigned long sp, unsigned long size)
- {
-       /* Default to using normal stack - redzone*/
-       sp -= 128;
-       /* This is the X/Open sanctioned signal stack switching.  */
-       if (ka->sa.sa_flags & SA_ONSTACK) {
-               if (sas_ss_flags(sp) == 0)
-                       sp = current->sas_ss_sp + current->sas_ss_size;
-       }
-       return (void __user *)round_down(sp - size, 64);
- }
  static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
                            sigset_t *set, struct pt_regs *regs)
  {
        int err = 0;
        struct task_struct *me = current;
  
-       if (used_math()) {
-               fp = get_stack(ka, regs->sp, sig_xstate_size);
-               frame = (void __user *)round_down(
-                       (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
-               if (save_i387_xstate(fp) < 0)
-                       return -EFAULT;
-       } else
-               frame = get_stack(ka, regs->sp, sizeof(struct rt_sigframe)) - 8;
+       frame = get_sigframe(ka, regs, sizeof(struct rt_sigframe), &fp);
  
        if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
                return -EFAULT;
@@@ -875,11 -872,6 +872,11 @@@ do_notify_resume(struct pt_regs *regs, 
                tracehook_notify_resume(regs);
        }
  
 +      if (thread_info_flags & _TIF_PERF_COUNTERS) {
 +              clear_thread_flag(TIF_PERF_COUNTERS);
 +              perf_counter_notify(regs);
 +      }
 +
  #ifdef CONFIG_X86_32
        clear_thread_flag(TIF_IRET);
  #endif /* CONFIG_X86_32 */
@@@ -332,4 -332,5 +332,6 @@@ ENTRY(sys_call_table
        .long sys_dup3                  /* 330 */
        .long sys_pipe2
        .long sys_inotify_init1
 +      .long sys_perf_counter_open
+       .long sys_preadv
+       .long sys_pwritev
diff --combined arch/x86/kernel/traps.c
@@@ -118,47 -118,6 +118,6 @@@ die_if_kernel(const char *str, struct p
        if (!user_mode_vm(regs))
                die(str, regs, err);
  }
- /*
-  * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
-  * invalid offset set (the LAZY one) and the faulting thread has
-  * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS,
-  * we set the offset field correctly and return 1.
-  */
- static int lazy_iobitmap_copy(void)
- {
-       struct thread_struct *thread;
-       struct tss_struct *tss;
-       int cpu;
-       cpu = get_cpu();
-       tss = &per_cpu(init_tss, cpu);
-       thread = &current->thread;
-       if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
-           thread->io_bitmap_ptr) {
-               memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
-                      thread->io_bitmap_max);
-               /*
-                * If the previously set map was extending to higher ports
-                * than the current one, pad extra space with 0xff (no access).
-                */
-               if (thread->io_bitmap_max < tss->io_bitmap_max) {
-                       memset((char *) tss->io_bitmap +
-                               thread->io_bitmap_max, 0xff,
-                               tss->io_bitmap_max - thread->io_bitmap_max);
-               }
-               tss->io_bitmap_max = thread->io_bitmap_max;
-               tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
-               tss->io_bitmap_owner = thread;
-               put_cpu();
-               return 1;
-       }
-       put_cpu();
-       return 0;
- }
  #endif
  
  static void __kprobes
@@@ -309,11 -268,6 +268,6 @@@ do_general_protection(struct pt_regs *r
        conditional_sti(regs);
  
  #ifdef CONFIG_X86_32
-       if (lazy_iobitmap_copy()) {
-               /* restart the faulting instruction */
-               return;
-       }
        if (regs->flags & X86_VM_MASK)
                goto gp_in_vm86;
  #endif
@@@ -991,13 -945,8 +945,13 @@@ void __init trap_init(void
  #endif
        set_intr_gate(19, &simd_coprocessor_error);
  
 +      /* Reserve all the builtin and the syscall vector: */
 +      for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
 +              set_bit(i, used_vectors);
 +
  #ifdef CONFIG_IA32_EMULATION
        set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
 +      set_bit(IA32_SYSCALL_VECTOR, used_vectors);
  #endif
  
  #ifdef CONFIG_X86_32
        }
  
        set_system_trap_gate(SYSCALL_VECTOR, &system_call);
 -#endif
 -
 -      /* Reserve all the builtin and the syscall vector: */
 -      for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
 -              set_bit(i, used_vectors);
 -
 -#ifdef CONFIG_X86_64
 -      set_bit(IA32_SYSCALL_VECTOR, used_vectors);
 -#else
        set_bit(SYSCALL_VECTOR, used_vectors);
  #endif
 +
        /*
         * Should be a barrier for any external CPU state:
         */
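
trap_init() now marks every CPU-internal vector plus the syscall vector in used_vectors, and native_init_IRQ() (see the irqinit hunks earlier) installs interrupt gates only for bits that are still clear. A toy version of that reserve-then-fill protocol, with a plain array standing in for the kernel bitmap helpers; the vector constants match the usual x86 values but are hard-coded here only for the example:

    #include <stdio.h>

    #define FIRST_EXTERNAL_VECTOR  32
    #define SYSCALL_VECTOR         0x80
    #define NR_VECTORS             256
    #define BITS_PER_LONG          (8 * (int)sizeof(unsigned long))

    static unsigned long used_vectors[NR_VECTORS / (8 * sizeof(unsigned long))];

    static void set_bit_ul(int nr)
    {
            used_vectors[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
    }

    static int test_bit_ul(int nr)
    {
            return (used_vectors[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG)) & 1;
    }

    int main(void)
    {
            int i, gated = 0;

            /* trap_init(): reserve the builtin vectors and the syscall vector */
            for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
                    set_bit_ul(i);
            set_bit_ul(SYSCALL_VECTOR);

            /* native_init_IRQ(): gate every vector that is still free */
            for (i = 0; i < NR_VECTORS - FIRST_EXTERNAL_VECTOR; i++) {
                    int vector = FIRST_EXTERNAL_VECTOR + i;

                    if (!test_bit_ul(vector))
                            gated++;        /* stands in for set_intr_gate() */
            }
            printf("%d external vectors gated\n", gated);   /* 223 */
            return 0;
    }
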
@@@ -64,7 -64,6 +64,6 @@@
  #define _COMPONENT              ACPI_PROCESSOR_COMPONENT
  ACPI_MODULE_NAME("processor_idle");
  #define ACPI_PROCESSOR_FILE_POWER     "power"
- #define US_TO_PM_TIMER_TICKS(t)               ((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
  #define PM_TIMER_TICK_NS              (1000000000ULL/PM_TIMER_FREQUENCY)
  #define C2_OVERHEAD                   1       /* 1us */
  #define C3_OVERHEAD                   1       /* 1us */
@@@ -78,6 -77,10 +77,10 @@@ module_param(nocst, uint, 0000)
  static unsigned int latency_factor __read_mostly = 2;
  module_param(latency_factor, uint, 0644);
  
+ static s64 us_to_pm_timer_ticks(s64 t)
+ {
+       return div64_u64(t * PM_TIMER_FREQUENCY, 1000000);
+ }
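
us_to_pm_timer_ticks() converts the ktime-measured idle time back into PM-timer ticks for the callers below that still account in ticks (cx->time and the sched_clock idle events). Assuming the usual ACPI PM timer rate of 3579545 Hz, 1000 us works out to 3579 ticks; a tiny stand-alone version of the same arithmetic:

    #include <stdio.h>
    #include <inttypes.h>

    #define PM_TIMER_FREQUENCY 3579545ULL   /* ACPI PM timer rate in Hz (assumed here) */

    static uint64_t us_to_pm_timer_ticks(uint64_t us)
    {
            return us * PM_TIMER_FREQUENCY / 1000000ULL;
    }

    int main(void)
    {
            printf("%" PRIu64 "\n", us_to_pm_timer_ticks(1000));    /* prints 3579 */
            return 0;
    }
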
  /*
   * IBM ThinkPad R40e crashes mysteriously when going into C2 or C3.
   * For now disable this. Probably a bug somewhere else.
@@@ -101,57 -104,6 +104,6 @@@ static int set_max_cstate(const struct 
  /* Actually this shouldn't be __cpuinitdata, would be better to fix the
     callers to only run once -AK */
  static struct dmi_system_id __cpuinitdata processor_power_dmi_table[] = {
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET70WW")}, (void *)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET60WW")}, (void *)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET43WW") }, (void*)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET45WW") }, (void*)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET47WW") }, (void*)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET50WW") }, (void*)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET52WW") }, (void*)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET55WW") }, (void*)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET56WW") }, (void*)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET59WW") }, (void*)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET60WW") }, (void*)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET61WW") }, (void*)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET62WW") }, (void*)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET64WW") }, (void*)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET65WW") }, (void*)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET68WW") }, (void*)1},
-       { set_max_cstate, "Medion 41700", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
-         DMI_MATCH(DMI_BIOS_VERSION,"R01-A1J")}, (void *)1},
        { set_max_cstate, "Clevo 5600D", {
          DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
          DMI_MATCH(DMI_BIOS_VERSION,"SHE845M0.86C.0013.D.0302131307")},
        {},
  };
  
- static inline u32 ticks_elapsed(u32 t1, u32 t2)
- {
-       if (t2 >= t1)
-               return (t2 - t1);
-       else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER))
-               return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
-       else
-               return ((0xFFFFFFFF - t1) + t2);
- }
- static inline u32 ticks_elapsed_in_us(u32 t1, u32 t2)
- {
-       if (t2 >= t1)
-               return PM_TIMER_TICKS_TO_US(t2 - t1);
-       else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER))
-               return PM_TIMER_TICKS_TO_US(((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
-       else
-               return PM_TIMER_TICKS_TO_US((0xFFFFFFFF - t1) + t2);
- }
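
The two helpers deleted here existed only to undo PM-timer wraparound by hand: the counter is 24 or 32 bits wide, so an end reading below the start reading means it wrapped once. A stand-alone copy of that arithmetic (a plain flag stands in for the ACPI_FADT_32BIT_TIMER check) shows what the ktime-based measurement below no longer has to care about:

    #include <assert.h>
    #include <stdint.h>

    static uint32_t ticks_elapsed(uint32_t t1, uint32_t t2, int timer_is_32bit)
    {
            if (t2 >= t1)
                    return t2 - t1;
            if (!timer_is_32bit)
                    return ((0x00FFFFFF - t1) + t2) & 0x00FFFFFF;   /* 24-bit wrap */
            return (0xFFFFFFFF - t1) + t2;                          /* 32-bit wrap */
    }

    int main(void)
    {
            assert(ticks_elapsed(100, 4100, 1) == 4000);            /* no wrap */
            assert(ticks_elapsed(0x00FFFFF0, 0x10, 0) == 0x1f);     /* wrapped 24-bit counter */
            assert(ticks_elapsed(0xFFFFFFF0, 0x10, 1) == 0x1f);     /* wrapped 32-bit counter */
            return 0;
    }
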
  
  /*
   * Callers should disable interrupts before the call and enable
@@@ -630,7 -563,7 +563,7 @@@ static void acpi_processor_power_verify
         * In either case, the proper way to
         * handle BM_RLD is to set it and leave it set.
         */
-       acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1);
+       acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, 1);
  
        return;
  }
@@@ -800,9 -733,9 +733,9 @@@ static int acpi_idle_bm_check(void
  {
        u32 bm_status = 0;
  
-       acpi_get_register_unlocked(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status);
+       acpi_read_bit_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status);
        if (bm_status)
-               acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
+               acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
        /*
         * PIIX4 Erratum #18: Note that BM_STS doesn't always reflect
         * the true state of bus mastering activity; forcing us to
   */
  static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
  {
 +      u64 perf_flags;
 +
        /* Don't trace irqs off for idle */
        stop_critical_timings();
 +      perf_flags = hw_perf_save_disable();
        if (cx->entry_method == ACPI_CSTATE_FFH) {
                /* Call into architectural FFH based C-state */
                acpi_processor_ffh_cstate_enter(cx);
                   gets asserted in time to freeze execution properly. */
                unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
        }
 +      hw_perf_restore(perf_flags);
        start_critical_timings();
  }
  
  static int acpi_idle_enter_c1(struct cpuidle_device *dev,
                              struct cpuidle_state *state)
  {
-       u32 t1, t2;
+       ktime_t  kt1, kt2;
+       s64 idle_time;
        struct acpi_processor *pr;
        struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
  
                return 0;
        }
  
-       t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+       kt1 = ktime_get_real();
        acpi_idle_do_entry(cx);
-       t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+       kt2 = ktime_get_real();
+       idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
  
        local_irq_enable();
        cx->usage++;
  
-       return ticks_elapsed_in_us(t1, t2);
+       return idle_time;
  }
  
  /**
@@@ -895,8 -826,9 +830,9 @@@ static int acpi_idle_enter_simple(struc
  {
        struct acpi_processor *pr;
        struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
-       u32 t1, t2;
-       int sleep_ticks = 0;
+       ktime_t  kt1, kt2;
+       s64 idle_time;
+       s64 sleep_ticks = 0;
  
        pr = __get_cpu_var(processors);
  
        if (cx->type == ACPI_STATE_C3)
                ACPI_FLUSH_CPU_CACHE();
  
-       t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+       kt1 = ktime_get_real();
        /* Tell the scheduler that we are going deep-idle: */
        sched_clock_idle_sleep_event();
        acpi_idle_do_entry(cx);
-       t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+       kt2 = ktime_get_real();
+       idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
  
  #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
        /* TSC could halt in idle, so notify users */
        if (tsc_halts_in_c(cx->type))
                mark_tsc_unstable("TSC halts in idle");
  #endif
-       sleep_ticks = ticks_elapsed(t1, t2);
+       sleep_ticks = us_to_pm_timer_ticks(idle_time);
  
        /* Tell the scheduler how much we idled: */
        sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
  
        acpi_state_timer_broadcast(pr, cx, 0);
        cx->time += sleep_ticks;
-       return ticks_elapsed_in_us(t1, t2);
+       return idle_time;
  }
  
  static int c3_cpu_count;
@@@ -970,8 -903,10 +907,10 @@@ static int acpi_idle_enter_bm(struct cp
  {
        struct acpi_processor *pr;
        struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
-       u32 t1, t2;
-       int sleep_ticks = 0;
+       ktime_t  kt1, kt2;
+       s64 idle_time;
+       s64 sleep_ticks = 0;
  
        pr = __get_cpu_var(processors);
  
                c3_cpu_count++;
                /* Disable bus master arbitration when all CPUs are in C3 */
                if (c3_cpu_count == num_online_cpus())
-                       acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1);
+                       acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 1);
                spin_unlock(&c3_lock);
        } else if (!pr->flags.bm_check) {
                ACPI_FLUSH_CPU_CACHE();
        }
  
-       t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+       kt1 = ktime_get_real();
        acpi_idle_do_entry(cx);
-       t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+       kt2 = ktime_get_real();
+       idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
  
        /* Re-enable bus master arbitration */
        if (pr->flags.bm_check && pr->flags.bm_control) {
                spin_lock(&c3_lock);
-               acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
+               acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 0);
                c3_cpu_count--;
                spin_unlock(&c3_lock);
        }
        if (tsc_halts_in_c(ACPI_STATE_C3))
                mark_tsc_unstable("TSC halts in idle");
  #endif
-       sleep_ticks = ticks_elapsed(t1, t2);
+       sleep_ticks = us_to_pm_timer_ticks(idle_time);
        /* Tell the scheduler how much we idled: */
        sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
  
  
        acpi_state_timer_broadcast(pr, cx, 0);
        cx->time += sleep_ticks;
-       return ticks_elapsed_in_us(t1, t2);
+       return idle_time;
  }
  
  struct cpuidle_driver acpi_idle_driver = {
diff --combined drivers/char/sysrq.c
@@@ -25,7 -25,6 +25,7 @@@
  #include <linux/kbd_kern.h>
  #include <linux/proc_fs.h>
  #include <linux/quotaops.h>
 +#include <linux/perf_counter.h>
  #include <linux/kernel.h>
  #include <linux/module.h>
  #include <linux/suspend.h>
@@@ -36,7 -35,7 +36,7 @@@
  #include <linux/vt_kern.h>
  #include <linux/workqueue.h>
  #include <linux/kexec.h>
- #include <linux/irq.h>
+ #include <linux/interrupt.h>
  #include <linux/hrtimer.h>
  #include <linux/oom.h>
  
@@@ -245,7 -244,6 +245,7 @@@ static void sysrq_handle_showregs(int k
        struct pt_regs *regs = get_irq_regs();
        if (regs)
                show_regs(regs);
 +      perf_counter_print_debug();
  }
  static struct sysrq_key_op sysrq_showregs_op = {
        .handler        = sysrq_handle_showregs,
@@@ -285,7 -283,7 +285,7 @@@ static void sysrq_ftrace_dump(int key, 
  }
  static struct sysrq_key_op sysrq_ftrace_dump_op = {
        .handler        = sysrq_ftrace_dump,
-       .help_msg       = "dumpZ-ftrace-buffer",
+       .help_msg       = "dump-ftrace-buffer(Z)",
        .action_msg     = "Dump ftrace buffer",
        .enable_mask    = SYSRQ_ENABLE_DUMP,
  };
@@@ -348,6 -346,19 +348,19 @@@ static struct sysrq_key_op sysrq_moom_o
        .enable_mask    = SYSRQ_ENABLE_SIGNAL,
  };
  
+ #ifdef CONFIG_BLOCK
+ static void sysrq_handle_thaw(int key, struct tty_struct *tty)
+ {
+       emergency_thaw_all();
+ }
+ static struct sysrq_key_op sysrq_thaw_op = {
+       .handler        = sysrq_handle_thaw,
+       .help_msg       = "thaw-filesystems(J)",
+       .action_msg     = "Emergency Thaw of all frozen filesystems",
+       .enable_mask    = SYSRQ_ENABLE_SIGNAL,
+ };
+ #endif
  static void sysrq_handle_kill(int key, struct tty_struct *tty)
  {
        send_sig_all(SIGKILL);
@@@ -398,9 -409,13 +411,13 @@@ static struct sysrq_key_op *sysrq_key_t
        &sysrq_moom_op,                 /* f */
        /* g: May be registered by ppc for kgdb */
        NULL,                           /* g */
-       NULL,                           /* h */
+       NULL,                           /* h - reserved for help */
        &sysrq_kill_op,                 /* i */
+ #ifdef CONFIG_BLOCK
+       &sysrq_thaw_op,                 /* j */
+ #else
        NULL,                           /* j */
+ #endif
        &sysrq_SAK_op,                  /* k */
  #ifdef CONFIG_SMP
        &sysrq_showallcpus_op,          /* l */
diff --combined fs/exec.c
+++ b/fs/exec.c
@@@ -33,7 -33,6 +33,7 @@@
  #include <linux/string.h>
  #include <linux/init.h>
  #include <linux/pagemap.h>
 +#include <linux/perf_counter.h>
  #include <linux/highmem.h>
  #include <linux/spinlock.h>
  #include <linux/key.h>
@@@ -46,6 -45,7 +46,7 @@@
  #include <linux/proc_fs.h>
  #include <linux/mount.h>
  #include <linux/security.h>
+ #include <linux/ima.h>
  #include <linux/syscalls.h>
  #include <linux/tsacct_kern.h>
  #include <linux/cn_proc.h>
@@@ -53,6 -53,7 +54,7 @@@
  #include <linux/tracehook.h>
  #include <linux/kmod.h>
  #include <linux/fsnotify.h>
+ #include <linux/fs_struct.h>
  
  #include <asm/uaccess.h>
  #include <asm/mmu_context.h>
@@@ -128,6 -129,9 +130,9 @@@ SYSCALL_DEFINE1(uselib, const char __us
                                 MAY_READ | MAY_EXEC | MAY_OPEN);
        if (error)
                goto exit;
+       error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN);
+       if (error)
+               goto exit;
  
        file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE);
        error = PTR_ERR(file);
@@@ -675,6 -679,9 +680,9 @@@ struct file *open_exec(const char *name
        err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN);
        if (err)
                goto out_path_put;
+       err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN);
+       if (err)
+               goto out_path_put;
  
        file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE);
        if (IS_ERR(file))
@@@ -1011,13 -1018,6 +1019,13 @@@ int flush_old_exec(struct linux_binprm 
  
        current->personality &= ~bprm->per_clear;
  
 +      /*
 +       * Flush performance counters when crossing a
 +       * security domain:
 +       */
 +      if (!get_dumpable(current->mm))
 +              perf_counter_exit_task(current);
 +
        /* An exec changes our domain. We are no longer part of the thread
           group */
  
@@@ -1057,32 -1057,35 +1065,35 @@@ EXPORT_SYMBOL(install_exec_creds)
   * - the caller must hold current->cred_exec_mutex to protect against
   *   PTRACE_ATTACH
   */
- void check_unsafe_exec(struct linux_binprm *bprm, struct files_struct *files)
+ int check_unsafe_exec(struct linux_binprm *bprm)
  {
        struct task_struct *p = current, *t;
        unsigned long flags;
-       unsigned n_fs, n_files, n_sighand;
+       unsigned n_fs;
+       int res = 0;
  
        bprm->unsafe = tracehook_unsafe_exec(p);
  
        n_fs = 1;
-       n_files = 1;
-       n_sighand = 1;
+       write_lock(&p->fs->lock);
        lock_task_sighand(p, &flags);
        for (t = next_thread(p); t != p; t = next_thread(t)) {
                if (t->fs == p->fs)
                        n_fs++;
-               if (t->files == files)
-                       n_files++;
-               n_sighand++;
        }
  
-       if (atomic_read(&p->fs->count) > n_fs ||
-           atomic_read(&p->files->count) > n_files ||
-           atomic_read(&p->sighand->count) > n_sighand)
+       if (p->fs->users > n_fs) {
                bprm->unsafe |= LSM_UNSAFE_SHARE;
+       } else {
+               if (p->fs->in_exec)
+                       res = -EAGAIN;
+               p->fs->in_exec = 1;
+       }
  
        unlock_task_sighand(p, &flags);
+       write_unlock(&p->fs->lock);
+       return res;
  }
  
  /* 
@@@ -1192,6 -1195,9 +1203,9 @@@ int search_binary_handler(struct linux_
        retval = security_bprm_check(bprm);
        if (retval)
                return retval;
+       retval = ima_bprm_check(bprm);
+       if (retval)
+               return retval;
  
        /* kernel module loader fixup */
        /* so we don't try to load run modprobe in kernel space. */
@@@ -1292,17 -1298,21 +1306,21 @@@ int do_execve(char * filename
        retval = mutex_lock_interruptible(&current->cred_exec_mutex);
        if (retval < 0)
                goto out_free;
+       current->in_execve = 1;
  
        retval = -ENOMEM;
        bprm->cred = prepare_exec_creds();
        if (!bprm->cred)
                goto out_unlock;
-       check_unsafe_exec(bprm, displaced);
+       retval = check_unsafe_exec(bprm);
+       if (retval)
+               goto out_unlock;
  
        file = open_exec(filename);
        retval = PTR_ERR(file);
        if (IS_ERR(file))
-               goto out_unlock;
+               goto out_unmark;
  
        sched_exec();
  
                goto out;
  
        /* execve succeeded */
+       write_lock(&current->fs->lock);
+       current->fs->in_exec = 0;
+       write_unlock(&current->fs->lock);
+       current->in_execve = 0;
        mutex_unlock(&current->cred_exec_mutex);
        acct_update_integrals(current);
        free_bprm(bprm);
@@@ -1362,7 -1376,13 +1384,13 @@@ out_file
                fput(bprm->file);
        }
  
+ out_unmark:
+       write_lock(&current->fs->lock);
+       current->fs->in_exec = 0;
+       write_unlock(&current->fs->lock);
  out_unlock:
+       current->in_execve = 0;
        mutex_unlock(&current->cred_exec_mutex);
  
  out_free:
@@@ -120,16 -120,6 +120,16 @@@ extern struct group_info init_groups
  
  extern struct cred init_cred;
  
 +#ifdef CONFIG_PERF_COUNTERS
 +# define INIT_PERF_COUNTERS(tsk)                                      \
 +      .perf_counter_ctx.counter_list =                                \
 +              LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list),      \
 +      .perf_counter_ctx.lock =                                        \
 +              __SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock),
 +#else
 +# define INIT_PERF_COUNTERS(tsk)
 +#endif
 +
  /*
   *  INIT_TASK is used to set up the first task table, touch at
   * your own risk!. Base=0, limit=0x1fffff (=2MB)
                .nr_cpus_allowed = NR_CPUS,                             \
        },                                                              \
        .tasks          = LIST_HEAD_INIT(tsk.tasks),                    \
+       .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \
        .ptraced        = LIST_HEAD_INIT(tsk.ptraced),                  \
        .ptrace_entry   = LIST_HEAD_INIT(tsk.ptrace_entry),             \
        .real_parent    = &tsk,                                         \
        INIT_IDS                                                        \
        INIT_TRACE_IRQFLAGS                                             \
        INIT_LOCKDEP                                                    \
 +      INIT_PERF_COUNTERS(tsk)                                         \
  }
  
  
@@@ -28,7 -28,7 +28,7 @@@ struct cpu_usage_stat 
  
  struct kernel_stat {
        struct cpu_usage_stat   cpustat;
- #ifndef CONFIG_SPARSE_IRQ
+ #ifndef CONFIG_GENERIC_HARDIRQS
         unsigned int irqs[NR_IRQS];
  #endif
  };
@@@ -41,7 -41,7 +41,7 @@@ DECLARE_PER_CPU(struct kernel_stat, kst
  
  extern unsigned long long nr_context_switches(void);
  
- #ifndef CONFIG_SPARSE_IRQ
+ #ifndef CONFIG_GENERIC_HARDIRQS
  #define kstat_irqs_this_cpu(irq) \
        (kstat_this_cpu.irqs[irq])
  
@@@ -52,16 -52,19 +52,19 @@@ static inline void kstat_incr_irqs_this
  {
        kstat_this_cpu.irqs[irq]++;
  }
- #endif
  
- #ifndef CONFIG_SPARSE_IRQ
  static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
  {
         return kstat_cpu(cpu).irqs[irq];
  }
  #else
+ #include <linux/irq.h>
  extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu);
+ #define kstat_irqs_this_cpu(DESC) \
+       ((DESC)->kstat_irqs[smp_processor_id()])
+ #define kstat_incr_irqs_this_cpu(irqno, DESC) \
+       ((DESC)->kstat_irqs[smp_processor_id()]++)
  #endif
  
  /*
@@@ -78,15 -81,7 +81,15 @@@ static inline unsigned int kstat_irqs(u
        return sum;
  }
  
 +
 +/*
 + * Lock/unlock the current runqueue - to extract task statistics:
 + */
 +extern void curr_rq_lock_irq_save(unsigned long *flags);
 +extern void curr_rq_unlock_irq_restore(unsigned long *flags);
 +extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update);
  extern unsigned long long task_delta_exec(struct task_struct *);
 +
  extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
  extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);
  extern void account_steal_time(cputime_t);
diff --combined include/linux/sched.h
@@@ -68,10 -68,9 +68,10 @@@ struct sched_param 
  #include <linux/smp.h>
  #include <linux/sem.h>
  #include <linux/signal.h>
- #include <linux/fs_struct.h>
+ #include <linux/path.h>
  #include <linux/compiler.h>
  #include <linux/completion.h>
 +#include <linux/perf_counter.h>
  #include <linux/pid.h>
  #include <linux/percpu.h>
  #include <linux/topology.h>
@@@ -98,6 -97,7 +98,7 @@@ struct futex_pi_state
  struct robust_list_head;
  struct bio;
  struct bts_tracer;
+ struct fs_struct;
  
  /*
   * List of flags we want to share for kernel threads,
@@@ -137,9 -137,9 +138,11 @@@ extern unsigned long nr_running(void)
  extern unsigned long nr_uninterruptible(void);
  extern unsigned long nr_active(void);
  extern unsigned long nr_iowait(void);
 +extern u64 cpu_nr_switches(int cpu);
 +extern u64 cpu_nr_migrations(int cpu);
  
+ extern unsigned long get_parent_ip(unsigned long addr);
  struct seq_file;
  struct cfs_rq;
  struct task_group;
@@@ -334,7 -334,9 +337,9 @@@ extern signed long schedule_timeout(sig
  extern signed long schedule_timeout_interruptible(signed long timeout);
  extern signed long schedule_timeout_killable(signed long timeout);
  extern signed long schedule_timeout_uninterruptible(signed long timeout);
+ asmlinkage void __schedule(void);
  asmlinkage void schedule(void);
+ extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner);
  
  struct nsproxy;
  struct user_namespace;
@@@ -392,8 -394,15 +397,15 @@@ extern void arch_unmap_area_topdown(str
                (mm)->hiwater_vm = (mm)->total_vm;      \
  } while (0)
  
- #define get_mm_hiwater_rss(mm)        max((mm)->hiwater_rss, get_mm_rss(mm))
- #define get_mm_hiwater_vm(mm) max((mm)->hiwater_vm, (mm)->total_vm)
+ static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
+ {
+       return max(mm->hiwater_rss, get_mm_rss(mm));
+ }
+ static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm)
+ {
+       return max(mm->hiwater_vm, mm->total_vm);
+ }
  
  extern void set_dumpable(struct mm_struct *mm, int value);
  extern int get_dumpable(struct mm_struct *mm);
@@@ -541,25 -550,8 +553,8 @@@ struct signal_struct 
  
        struct list_head cpu_timers[3];
  
-       /* job control IDs */
-       /*
-        * pgrp and session fields are deprecated.
-        * use the task_session_Xnr and task_pgrp_Xnr routines below
-        */
-       union {
-               pid_t pgrp __deprecated;
-               pid_t __pgrp;
-       };
        struct pid *tty_old_pgrp;
  
-       union {
-               pid_t session __deprecated;
-               pid_t __session;
-       };
        /* boolean value for session group leader */
        int leader;
  
@@@ -1001,6 -993,7 +996,7 @@@ struct sched_class 
                              struct rq *busiest, struct sched_domain *sd,
                              enum cpu_idle_type idle);
        void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
+       int (*needs_post_schedule) (struct rq *this_rq);
        void (*post_schedule) (struct rq *this_rq);
        void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
  
@@@ -1055,8 -1048,10 +1051,11 @@@ struct sched_entity 
        u64                     last_wakeup;
        u64                     avg_overlap;
  
 -      u64                     nr_migrations;
 +      u64                     nr_migrations;
 +
+       u64                     start_runtime;
+       u64                     avg_wakeup;
  #ifdef CONFIG_SCHEDSTATS
        u64                     wait_start;
        u64                     wait_max;
@@@ -1168,6 -1163,7 +1167,7 @@@ struct task_struct 
  #endif
  
        struct list_head tasks;
+       struct plist_node pushable_tasks;
  
        struct mm_struct *mm, *active_mm;
  
        /* ??? */
        unsigned int personality;
        unsigned did_exec:1;
+       unsigned in_execve:1;   /* Tell the LSMs that the process is doing an
+                                * execve */
        pid_t pid;
        pid_t tgid;
  
        int lockdep_depth;
        unsigned int lockdep_recursion;
        struct held_lock held_locks[MAX_LOCK_DEPTH];
+       gfp_t lockdep_reclaim_gfp;
  #endif
  
  /* journalling filesystem info */
        struct list_head pi_state_list;
        struct futex_pi_state *pi_state_cache;
  #endif
 +      struct perf_counter_context perf_counter_ctx;
  #ifdef CONFIG_NUMA
        struct mempolicy *mempolicy;
        short il_next;
        int curr_ret_stack;
        /* Stack of return addresses for return function tracing */
        struct ftrace_ret_stack *ret_stack;
+       /* time stamp for last schedule */
+       unsigned long long ftrace_timestamp;
        /*
         * Number of functions that haven't been traced
         * because of depth overrun.
  #endif
  };
  
+ /* Future-safe accessor for struct task_struct's cpus_allowed. */
+ #define tsk_cpumask(tsk) (&(tsk)->cpus_allowed)
  /*
   * Priority of a process goes from 0..MAX_PRIO-1, valid RT
   * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
@@@ -1454,16 -1457,6 +1462,6 @@@ static inline int rt_task(struct task_s
        return rt_prio(p->prio);
  }
  
- static inline void set_task_session(struct task_struct *tsk, pid_t session)
- {
-       tsk->signal->__session = session;
- }
- static inline void set_task_pgrp(struct task_struct *tsk, pid_t pgrp)
- {
-       tsk->signal->__pgrp = pgrp;
- }
  static inline struct pid *task_pid(struct task_struct *task)
  {
        return task->pids[PIDTYPE_PID].pid;
@@@ -1474,6 -1467,11 +1472,11 @@@ static inline struct pid *task_tgid(str
        return task->group_leader->pids[PIDTYPE_PID].pid;
  }
  
+ /*
+  * Without tasklist or rcu lock it is not safe to dereference
+  * the result of task_pgrp/task_session even if task == current,
+  * we can race with another thread doing sys_setsid/sys_setpgid.
+  */
  static inline struct pid *task_pgrp(struct task_struct *task)
  {
        return task->group_leader->pids[PIDTYPE_PGID].pid;
@@@ -1499,17 -1497,23 +1502,23 @@@ struct pid_namespace
   *
   * see also pid_nr() etc in include/linux/pid.h
   */
+ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
+                       struct pid_namespace *ns);
  
  static inline pid_t task_pid_nr(struct task_struct *tsk)
  {
        return tsk->pid;
  }
  
- pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
+ static inline pid_t task_pid_nr_ns(struct task_struct *tsk,
+                                       struct pid_namespace *ns)
+ {
+       return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
+ }
  
  static inline pid_t task_pid_vnr(struct task_struct *tsk)
  {
-       return pid_vnr(task_pid(tsk));
+       return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
  }
  
  
@@@ -1526,31 -1530,34 +1535,34 @@@ static inline pid_t task_tgid_vnr(struc
  }
  
  
- static inline pid_t task_pgrp_nr(struct task_struct *tsk)
+ static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk,
+                                       struct pid_namespace *ns)
  {
-       return tsk->signal->__pgrp;
+       return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
  }
  
- pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
  static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
  {
-       return pid_vnr(task_pgrp(tsk));
+       return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
  }
  
  
- static inline pid_t task_session_nr(struct task_struct *tsk)
+ static inline pid_t task_session_nr_ns(struct task_struct *tsk,
+                                       struct pid_namespace *ns)
  {
-       return tsk->signal->__session;
+       return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
  }
  
- pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
  static inline pid_t task_session_vnr(struct task_struct *tsk)
  {
-       return pid_vnr(task_session(tsk));
+       return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
  }
  
+ /* obsolete, do not use */
+ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
+ {
+       return task_pgrp_nr_ns(tsk, &init_pid_ns);
+ }
  
  /**
   * pid_alive - check that a task structure is not stale
@@@ -1674,6 -1681,16 +1686,16 @@@ static inline int set_cpus_allowed(stru
        return set_cpus_allowed_ptr(p, &new_mask);
  }
  
+ /*
+  * Architectures can set this to 1 if they have specified
+  * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
+  * but then during bootup it turns out that sched_clock()
+  * is reliable after all:
+  */
+ #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+ extern int sched_clock_stable;
+ #endif
  extern unsigned long long sched_clock(void);
  
  extern void sched_clock_init(void);
@@@ -1950,7 -1967,8 +1972,8 @@@ extern void mm_release(struct task_stru
  /* Allocate a new mm structure and copy contents from tsk->mm */
  extern struct mm_struct *dup_mm(struct task_struct *tsk);
  
- extern int  copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
+ extern int copy_thread(unsigned long, unsigned long, unsigned long,
+                       struct task_struct *, struct pt_regs *);
  extern void flush_thread(void);
  extern void exit_thread(void);
  
@@@ -2035,6 -2053,11 +2058,11 @@@ static inline int thread_group_empty(st
  #define delay_group_leader(p) \
                (thread_group_leader(p) && !thread_group_empty(p))
  
+ static inline int task_detached(struct task_struct *p)
+ {
+       return p->exit_signal == -1;
+ }
  /*
   * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
   * subscriptions and synchronises with wait4().  Also used in procfs.  Also
@@@ -2357,13 -2380,6 +2385,13 @@@ static inline void inc_syscw(struct tas
  #define TASK_SIZE_OF(tsk)     TASK_SIZE
  #endif
  
 +/*
 + * Call the function if the target task is executing on a CPU right now:
 + */
 +extern void task_oncpu_function_call(struct task_struct *p,
 +                                   void (*func) (void *info), void *info);
 +
 +
  #ifdef CONFIG_MM_OWNER
  extern void mm_update_next_owner(struct mm_struct *mm);
  extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
diff --combined include/linux/syscalls.h
@@@ -55,7 -55,6 +55,7 @@@ struct compat_timeval
  struct robust_list_head;
  struct getcpu_cache;
  struct old_linux_dirent;
 +struct perf_counter_hw_event;
  
  #include <linux/types.h>
  #include <linux/aio_abi.h>
@@@ -66,6 -65,7 +66,7 @@@
  #include <asm/signal.h>
  #include <linux/quota.h>
  #include <linux/key.h>
+ #include <linux/ftrace.h>
  
  #define __SC_DECL1(t1, a1)    t1 a1
  #define __SC_DECL2(t2, a2, ...) t2 a2, __SC_DECL1(__VA_ARGS__)
  #define __SC_TEST5(t5, a5, ...)       __SC_TEST(t5); __SC_TEST4(__VA_ARGS__)
  #define __SC_TEST6(t6, a6, ...)       __SC_TEST(t6); __SC_TEST5(__VA_ARGS__)
  
+ #ifdef CONFIG_FTRACE_SYSCALLS
+ #define __SC_STR_ADECL1(t, a)         #a
+ #define __SC_STR_ADECL2(t, a, ...)    #a, __SC_STR_ADECL1(__VA_ARGS__)
+ #define __SC_STR_ADECL3(t, a, ...)    #a, __SC_STR_ADECL2(__VA_ARGS__)
+ #define __SC_STR_ADECL4(t, a, ...)    #a, __SC_STR_ADECL3(__VA_ARGS__)
+ #define __SC_STR_ADECL5(t, a, ...)    #a, __SC_STR_ADECL4(__VA_ARGS__)
+ #define __SC_STR_ADECL6(t, a, ...)    #a, __SC_STR_ADECL5(__VA_ARGS__)
+ #define __SC_STR_TDECL1(t, a)         #t
+ #define __SC_STR_TDECL2(t, a, ...)    #t, __SC_STR_TDECL1(__VA_ARGS__)
+ #define __SC_STR_TDECL3(t, a, ...)    #t, __SC_STR_TDECL2(__VA_ARGS__)
+ #define __SC_STR_TDECL4(t, a, ...)    #t, __SC_STR_TDECL3(__VA_ARGS__)
+ #define __SC_STR_TDECL5(t, a, ...)    #t, __SC_STR_TDECL4(__VA_ARGS__)
+ #define __SC_STR_TDECL6(t, a, ...)    #t, __SC_STR_TDECL5(__VA_ARGS__)
+ #define SYSCALL_METADATA(sname, nb)                           \
+       static const struct syscall_metadata __used             \
+         __attribute__((__aligned__(4)))                       \
+         __attribute__((section("__syscalls_metadata")))       \
+         __syscall_meta_##sname = {                            \
+               .name           = "sys"#sname,                  \
+               .nb_args        = nb,                           \
+               .types          = types_##sname,                \
+               .args           = args_##sname,                 \
+       }
+ #define SYSCALL_DEFINE0(sname)                                        \
+       static const struct syscall_metadata __used             \
+         __attribute__((__aligned__(4)))                       \
+         __attribute__((section("__syscalls_metadata")))       \
+         __syscall_meta_##sname = {                            \
+               .name           = "sys_"#sname,                 \
+               .nb_args        = 0,                            \
+       };                                                      \
+       asmlinkage long sys_##sname(void)
+ #else
  #define SYSCALL_DEFINE0(name)    asmlinkage long sys_##name(void)
+ #endif
  #define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
  #define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
  #define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
  #endif
  #endif
  
+ #ifdef CONFIG_FTRACE_SYSCALLS
+ #define SYSCALL_DEFINEx(x, sname, ...)                                \
+       static const char *types_##sname[] = {                  \
+               __SC_STR_TDECL##x(__VA_ARGS__)                  \
+       };                                                      \
+       static const char *args_##sname[] = {                   \
+               __SC_STR_ADECL##x(__VA_ARGS__)                  \
+       };                                                      \
+       SYSCALL_METADATA(sname, x);                             \
+       __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
+ #else
+ #define SYSCALL_DEFINEx(x, sname, ...)                                \
+       __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
+ #endif
  #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
  
  #define SYSCALL_DEFINE(name) static inline long SYSC_##name
- #define SYSCALL_DEFINEx(x, name, ...)                                 \
+ #define __SYSCALL_DEFINEx(x, name, ...)                                       \
        asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__));           \
        static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__));       \
        asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__))            \
  #else /* CONFIG_HAVE_SYSCALL_WRAPPERS */
  
  #define SYSCALL_DEFINE(name) asmlinkage long sys_##name
- #define SYSCALL_DEFINEx(x, name, ...)                                 \
+ #define __SYSCALL_DEFINEx(x, name, ...)                                       \
        asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__))
  
  #endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */
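
The __SC_STR_ADECLn/__SC_STR_TDECLn macros added above turn each syscall's parameter
types and names into string arrays that the SYSCALL_METADATA record points at. A minimal
user-space sketch of the same stringification trick follows; the macro and function names
(DEMO_SYSCALL_DEFINE2, STR_*, demo_sys_kill) are simplified stand-ins, not the kernel's:

#include <stdio.h>

/* Simplified stand-ins for the two-argument __SC_STR_ADECL/__SC_STR_TDECL pair */
#define STR_ADECL2(t2, a2, t1, a1)      #a2, #a1
#define STR_TDECL2(t2, a2, t1, a1)      #t2, #t1

#define DEMO_SYSCALL_DEFINE2(name, ...)                                   \
        static const char *types_##name[] = { STR_TDECL2(__VA_ARGS__) }; \
        static const char *args_##name[]  = { STR_ADECL2(__VA_ARGS__) }; \
        static long demo_sys_##name(void)

DEMO_SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
{
        return 0;               /* body unused; the generated arrays are the point */
}

int main(void)
{
        /* prints: sys_kill(pid_t pid, int sig) -> 0 */
        printf("sys_kill(%s %s, %s %s) -> %ld\n",
               types_kill[0], args_kill[0], types_kill[1], args_kill[1],
               demo_sys_kill());
        return 0;
}
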
@@@ -462,6 -517,10 +518,10 @@@ asmlinkage long sys_pread64(unsigned in
                            size_t count, loff_t pos);
  asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf,
                             size_t count, loff_t pos);
+ asmlinkage long sys_preadv(unsigned long fd, const struct iovec __user *vec,
+                          unsigned long vlen, unsigned long pos_l, unsigned long pos_h);
+ asmlinkage long sys_pwritev(unsigned long fd, const struct iovec __user *vec,
+                           unsigned long vlen, unsigned long pos_l, unsigned long pos_h);
  asmlinkage long sys_getcwd(char __user *buf, unsigned long size);
  asmlinkage long sys_mkdir(const char __user *pathname, int mode);
  asmlinkage long sys_chdir(const char __user *filename);
@@@ -695,8 -754,4 +755,8 @@@ asmlinkage long sys_pipe(int __user *)
  
  int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
  
 +
 +asmlinkage long sys_perf_counter_open(
 +              const struct perf_counter_hw_event __user *hw_event_uptr,
 +              pid_t pid, int cpu, int group_fd, unsigned long flags);
  #endif
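
Until a libc wrapper exists, a user-space caller of the sys_perf_counter_open() prototype
declared above would go through syscall(2). The sketch below only illustrates that calling
pattern: the __NR_perf_counter_open fallback value and the zero-filled stand-in buffer for
struct perf_counter_hw_event are placeholders, not the real ABI, and a real caller must take
the syscall number and structure layout from this branch's headers.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>

#ifndef __NR_perf_counter_open
#define __NR_perf_counter_open -1       /* placeholder; use the number from unistd.h */
#endif

int main(void)
{
        unsigned char hw_event[128];    /* stand-in for struct perf_counter_hw_event */
        long fd;

        memset(hw_event, 0, sizeof(hw_event));
        /* (hw_event, pid, cpu, group_fd, flags), matching the prototype above;
         * the argument values here are purely illustrative. */
        fd = syscall(__NR_perf_counter_open, hw_event, (pid_t)0, -1, -1, 0UL);
        if (fd < 0) {
                perror("perf_counter_open");
                return 1;
        }
        close((int)fd);
        return 0;
}
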
diff --combined init/Kconfig
@@@ -101,6 -101,66 +101,66 @@@ config LOCALVERSION_AUT
  
          which is done within the script "scripts/setlocalversion".)
  
+ config HAVE_KERNEL_GZIP
+       bool
+ config HAVE_KERNEL_BZIP2
+       bool
+ config HAVE_KERNEL_LZMA
+       bool
+ choice
+       prompt "Kernel compression mode"
+       default KERNEL_GZIP
+       depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA
+       help
+         The Linux kernel is a kind of self-extracting executable.
+         Several compression algorithms are available, which differ
+         in efficiency, compression and decompression speed.
+         Compression speed is only relevant when building a kernel.
+         Decompression speed is relevant at each boot.
+         If you have any problems with bzip2 or lzma compressed
+         kernels, mail me (Alain Knaff) <alain@knaff.lu>. (An older
+         version of this functionality (bzip2 only), for 2.4, was
+         supplied by Christian Ludwig)
+         High compression options are mostly useful for users who are
+         low on disk space (embedded systems) but for whom RAM size
+         matters less.
+         If in doubt, select 'gzip'.
+ config KERNEL_GZIP
+       bool "Gzip"
+       depends on HAVE_KERNEL_GZIP
+       help
+         The old and tried gzip compression. Its compression ratio is
+         the poorest among the 3 choices; however its speed (both
+         compression and decompression) is the fastest.
+ config KERNEL_BZIP2
+       bool "Bzip2"
+       depends on HAVE_KERNEL_BZIP2
+       help
+         Its compression ratio and speed are intermediate.
+         Decompression speed is slowest among the three.  The kernel
+         size is about 10% smaller with bzip2, in comparison to gzip.
+         Bzip2 uses a large amount of memory. For modern kernels you
+         will need at least 8MB of RAM to boot.
+ config KERNEL_LZMA
+       bool "LZMA"
+       depends on HAVE_KERNEL_LZMA
+       help
+         The most recent compression algorithm.
+         Its compression ratio is the best; decompression speed is between
+         the other two. Compression is slowest.  The kernel size is about 33%
+         smaller with LZMA in comparison to gzip.
+ endchoice
  config SWAP
        bool "Support for paging of anonymous memory (swap)"
        depends on MMU && BLOCK
@@@ -471,7 -531,7 +531,7 @@@ config CGROUP_DEVIC
  
  config CPUSETS
        bool "Cpuset support"
-       depends on SMP && CGROUPS
+       depends on CGROUPS
        help
          This option will let you create and manage CPUSETs which
          allow dynamically partitioning a system into sets of CPUs and
@@@ -505,7 -565,7 +565,7 @@@ config CGROUP_MEM_RES_CTL
        select MM_OWNER
        help
          Provides a memory resource controller that manages both anonymous
-         memory and page cache. (See Documentation/controllers/memory.txt)
+         memory and page cache. (See Documentation/cgroups/memory.txt)
  
          Note that setting this option increases fixed memory overhead
          associated with each page of memory in the system. By this,
@@@ -537,6 -597,8 +597,8 @@@ config CGROUP_MEM_RES_CTLR_SWA
          is disabled by boot option, this will be automatically disabled and
          there will be no overhead from this. Even when you set this config=y,
          if boot option "noswapaccount" is set, swap will not be accounted.
+         The memory usage of swap_cgroup is 2 bytes per entry. With a
+         4096-byte swap page size, that is 512KB per 1GB of swap.
  
  endif # CGROUPS
  
@@@ -627,7 -689,7 +689,7 @@@ config PID_N
        depends on NAMESPACES && EXPERIMENTAL
        help
          Support process id namespaces.  This allows having multiple
-         process with the same pid as long as they are in different
+         processes with the same pid as long as they are in different
          pid namespaces.  This is a building block of containers.
  
          Unless you want to work with an experimental feature
@@@ -675,6 -737,9 +737,9 @@@ config CC_OPTIMIZE_FOR_SIZ
  config SYSCTL
        bool
  
+ config ANON_INODES
+       bool
  menuconfig EMBEDDED
        bool "Configure standard kernel features (for small systems)"
        help
@@@ -780,18 -845,6 +845,6 @@@ config PCSPKR_PLATFOR
            This option allows you to disable the internal PC-Speaker
            support, saving some memory.
  
- config COMPAT_BRK
-       bool "Disable heap randomization"
-       default y
-       help
-         Randomizing heap placement makes heap exploits harder, but it
-         also breaks ancient binaries (including anything libc5 based).
-         This option changes the bootup default to heap randomization
-         disabled, and can be overriden runtime by setting
-         /proc/sys/kernel/randomize_va_space to 2.
-         On non-ancient distros (post-2000 ones) N is usually a safe choice.
  config BASE_FULL
        default y
        bool "Enable full-sized data structures for core" if EMBEDDED
@@@ -809,9 -862,6 +862,6 @@@ config FUTE
          support for "fast userspace mutexes".  The resulting kernel may not
          run glibc-based applications correctly.
  
- config ANON_INODES
-       bool
  config EPOLL
        bool "Enable eventpoll support" if EMBEDDED
        default y
@@@ -869,36 -919,6 +919,36 @@@ config AI
            by some high performance threaded applications. Disabling
            this option saves about 7k.
  
 +config HAVE_PERF_COUNTERS
 +      bool
 +
 +menu "Performance Counters"
 +
 +config PERF_COUNTERS
 +      bool "Kernel Performance Counters"
 +      depends on HAVE_PERF_COUNTERS
 +      default y
 +      select ANON_INODES
 +      help
 +        Enable kernel support for performance counter hardware.
 +
 +        Performance counters are special hardware registers available
 +        on most modern CPUs. These registers count the number of certain
 +        types of hardware events - such as instructions executed, cache
 +        misses suffered, or branches mispredicted - without slowing down
 +        the kernel or applications. These registers can also trigger
 +        interrupts when a threshold number of events has passed - and can
 +        thus be used to profile the code that runs on that CPU.
 +
 +        The Linux Performance Counter subsystem provides an abstraction of
 +        these hardware capabilities, available via a system call. It
 +        provides per-task and per-CPU counters, and event capabilities
 +        on top of those.
 +
 +        Say Y if unsure.
 +
 +endmenu
 +
  config VM_EVENT_COUNTERS
        default y
        bool "Enable VM event counters for /proc/vmstat" if EMBEDDED
@@@ -927,6 -947,18 +977,18 @@@ config SLUB_DEBU
          SLUB sysfs support. /sys/slab will not exist and there will be
          no support for cache validation etc.
  
+ config COMPAT_BRK
+       bool "Disable heap randomization"
+       default y
+       help
+         Randomizing heap placement makes heap exploits harder, but it
+         also breaks ancient binaries (including anything libc5 based).
+         This option changes the bootup default to heap randomization
+         disabled, and can be overridden at runtime by setting
+         /proc/sys/kernel/randomize_va_space to 2.
+         On non-ancient distros (post-2000 ones) N is usually a safe choice.
  choice
        prompt "Choose SLAB allocator"
        default SLUB
@@@ -975,13 -1007,25 +1037,25 @@@ config TRACEPOINT
  
  config MARKERS
        bool "Activate markers"
-       depends on TRACEPOINTS
+       select TRACEPOINTS
        help
          Place an empty function call at each marker site. Can be
          dynamically changed for a probe function.
  
  source "arch/Kconfig"
  
+ config SLOW_WORK
+       default n
+       bool "Enable slow work thread pool"
+       help
+         The slow work thread pool provides a number of dynamically allocated
+         threads that can be used by the kernel to perform operations that
+         take a relatively long time.
+         An example of this would be CacheFiles doing a path lookup followed
+         by a series of mkdirs and a create call, all of which have to touch
+         disk.
  endmenu               # General setup
  
  config HAVE_GENERIC_DMA_COHERENT
@@@ -996,7 -1040,6 +1070,6 @@@ config SLABINF
  
  config RT_MUTEXES
        boolean
-       select PLIST
  
  config BASE_SMALL
        int
@@@ -1081,7 -1124,7 +1154,7 @@@ config INIT_ALL_POSSIBL
          cpu_possible_map, some of them chose to initialize cpu_possible_map
          with all 1s, and others with all 0s.  When they were centralised,
          it was better to provide this option than to break all the archs
-         and have several arch maintainers persuing me down dark alleys.
+         and have several arch maintainers pursuing me down dark alleys.
  
  config STOP_MACHINE
        bool
diff --combined kernel/Makefile
@@@ -93,7 -93,7 +93,8 @@@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT
  obj-$(CONFIG_FUNCTION_TRACER) += trace/
  obj-$(CONFIG_TRACING) += trace/
  obj-$(CONFIG_SMP) += sched_cpupri.o
+ obj-$(CONFIG_SLOW_WORK) += slow-work.o
 +obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
  
  ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
  # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --combined kernel/exit.c
@@@ -46,6 -46,7 +46,7 @@@
  #include <linux/blkdev.h>
  #include <linux/task_io_accounting_ops.h>
  #include <linux/tracehook.h>
+ #include <linux/fs_struct.h>
  #include <linux/init_task.h>
  #include <trace/sched.h>
  
@@@ -61,11 -62,6 +62,6 @@@ DEFINE_TRACE(sched_process_wait)
  
  static void exit_mm(struct task_struct * tsk);
  
- static inline int task_detached(struct task_struct *p)
- {
-       return p->exit_signal == -1;
- }
  static void __unhash_process(struct task_struct *p)
  {
        nr_threads--;
@@@ -162,9 -158,6 +158,9 @@@ static void delayed_put_task_struct(str
  {
        struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
  
 +#ifdef CONFIG_PERF_COUNTERS
 +      WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list));
 +#endif
        trace_sched_process_free(tsk);
        put_task_struct(tsk);
  }
@@@ -365,16 -358,12 +361,12 @@@ static void reparent_to_kthreadd(void
  void __set_special_pids(struct pid *pid)
  {
        struct task_struct *curr = current->group_leader;
-       pid_t nr = pid_nr(pid);
  
-       if (task_session(curr) != pid) {
+       if (task_session(curr) != pid)
                change_pid(curr, PIDTYPE_SID, pid);
-               set_task_session(curr, nr);
-       }
-       if (task_pgrp(curr) != pid) {
+       if (task_pgrp(curr) != pid)
                change_pid(curr, PIDTYPE_PGID, pid);
-               set_task_pgrp(curr, nr);
-       }
  }
  
  static void set_special_pids(struct pid *pid)
@@@ -432,7 -421,6 +424,6 @@@ EXPORT_SYMBOL(disallow_signal)
  void daemonize(const char *name, ...)
  {
        va_list args;
-       struct fs_struct *fs;
        sigset_t blocked;
  
        va_start(args, name);
  
        /* Become as one with the init task */
  
-       exit_fs(current);       /* current->fs->count--; */
-       fs = init_task.fs;
-       current->fs = fs;
-       atomic_inc(&fs->count);
+       daemonize_fs_struct();
        exit_files(current);
        current->files = init_task.files;
        atomic_inc(&current->files->count);
@@@ -568,30 -552,6 +555,6 @@@ void exit_files(struct task_struct *tsk
        }
  }
  
- void put_fs_struct(struct fs_struct *fs)
- {
-       /* No need to hold fs->lock if we are killing it */
-       if (atomic_dec_and_test(&fs->count)) {
-               path_put(&fs->root);
-               path_put(&fs->pwd);
-               kmem_cache_free(fs_cachep, fs);
-       }
- }
- void exit_fs(struct task_struct *tsk)
- {
-       struct fs_struct * fs = tsk->fs;
-       if (fs) {
-               task_lock(tsk);
-               tsk->fs = NULL;
-               task_unlock(tsk);
-               put_fs_struct(fs);
-       }
- }
- EXPORT_SYMBOL_GPL(exit_fs);
  #ifdef CONFIG_MM_OWNER
  /*
   * Task p is exiting and it owned mm, lets find a new owner for it
@@@ -735,119 -695,6 +698,6 @@@ static void exit_mm(struct task_struct 
  }
  
  /*
-  * Return nonzero if @parent's children should reap themselves.
-  *
-  * Called with write_lock_irq(&tasklist_lock) held.
-  */
- static int ignoring_children(struct task_struct *parent)
- {
-       int ret;
-       struct sighand_struct *psig = parent->sighand;
-       unsigned long flags;
-       spin_lock_irqsave(&psig->siglock, flags);
-       ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
-              (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT));
-       spin_unlock_irqrestore(&psig->siglock, flags);
-       return ret;
- }
- /*
-  * Detach all tasks we were using ptrace on.
-  * Any that need to be release_task'd are put on the @dead list.
-  *
-  * Called with write_lock(&tasklist_lock) held.
-  */
- static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
- {
-       struct task_struct *p, *n;
-       int ign = -1;
-       list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) {
-               __ptrace_unlink(p);
-               if (p->exit_state != EXIT_ZOMBIE)
-                       continue;
-               /*
-                * If it's a zombie, our attachedness prevented normal
-                * parent notification or self-reaping.  Do notification
-                * now if it would have happened earlier.  If it should
-                * reap itself, add it to the @dead list.  We can't call
-                * release_task() here because we already hold tasklist_lock.
-                *
-                * If it's our own child, there is no notification to do.
-                * But if our normal children self-reap, then this child
-                * was prevented by ptrace and we must reap it now.
-                */
-               if (!task_detached(p) && thread_group_empty(p)) {
-                       if (!same_thread_group(p->real_parent, parent))
-                               do_notify_parent(p, p->exit_signal);
-                       else {
-                               if (ign < 0)
-                                       ign = ignoring_children(parent);
-                               if (ign)
-                                       p->exit_signal = -1;
-                       }
-               }
-               if (task_detached(p)) {
-                       /*
-                        * Mark it as in the process of being reaped.
-                        */
-                       p->exit_state = EXIT_DEAD;
-                       list_add(&p->ptrace_entry, dead);
-               }
-       }
- }
- /*
-  * Finish up exit-time ptrace cleanup.
-  *
-  * Called without locks.
-  */
- static void ptrace_exit_finish(struct task_struct *parent,
-                              struct list_head *dead)
- {
-       struct task_struct *p, *n;
-       BUG_ON(!list_empty(&parent->ptraced));
-       list_for_each_entry_safe(p, n, dead, ptrace_entry) {
-               list_del_init(&p->ptrace_entry);
-               release_task(p);
-       }
- }
- static void reparent_thread(struct task_struct *p, struct task_struct *father)
- {
-       if (p->pdeath_signal)
-               /* We already hold the tasklist_lock here.  */
-               group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
-       list_move_tail(&p->sibling, &p->real_parent->children);
-       /* If this is a threaded reparent there is no need to
-        * notify anyone anything has happened.
-        */
-       if (same_thread_group(p->real_parent, father))
-               return;
-       /* We don't want people slaying init.  */
-       if (!task_detached(p))
-               p->exit_signal = SIGCHLD;
-       /* If we'd notified the old parent about this child's death,
-        * also notify the new parent.
-        */
-       if (!ptrace_reparented(p) &&
-           p->exit_state == EXIT_ZOMBIE &&
-           !task_detached(p) && thread_group_empty(p))
-               do_notify_parent(p, p->exit_signal);
-       kill_orphaned_pgrp(p, father);
- }
- /*
   * When we die, we re-parent all our children.
   * Try to give them to another thread in our thread
   * group, and if no such member exists, give it to
@@@ -886,17 -733,51 +736,51 @@@ static struct task_struct *find_new_rea
        return pid_ns->child_reaper;
  }
  
+ /*
+  * Any that need to be release_task'd are put on the @dead list.
+  */
+ static void reparent_thread(struct task_struct *father, struct task_struct *p,
+                               struct list_head *dead)
+ {
+       if (p->pdeath_signal)
+               group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
+       list_move_tail(&p->sibling, &p->real_parent->children);
+       if (task_detached(p))
+               return;
+       /*
+        * If this is a threaded reparent there is no need to
+        * notify anyone anything has happened.
+        */
+       if (same_thread_group(p->real_parent, father))
+               return;
+       /* We don't want people slaying init.  */
+       p->exit_signal = SIGCHLD;
+       /* If it has exited notify the new parent about this child's death. */
+       if (!p->ptrace &&
+           p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
+               do_notify_parent(p, p->exit_signal);
+               if (task_detached(p)) {
+                       p->exit_state = EXIT_DEAD;
+                       list_move_tail(&p->sibling, dead);
+               }
+       }
+       kill_orphaned_pgrp(p, father);
+ }
  static void forget_original_parent(struct task_struct *father)
  {
        struct task_struct *p, *n, *reaper;
-       LIST_HEAD(ptrace_dead);
+       LIST_HEAD(dead_children);
+       exit_ptrace(father);
  
        write_lock_irq(&tasklist_lock);
        reaper = find_new_reaper(father);
-       /*
-        * First clean up ptrace if we were using it.
-        */
-       ptrace_exit(father, &ptrace_dead);
  
        list_for_each_entry_safe(p, n, &father->children, sibling) {
                p->real_parent = reaper;
                        BUG_ON(p->ptrace);
                        p->parent = p->real_parent;
                }
-               reparent_thread(p, father);
+               reparent_thread(father, p, &dead_children);
        }
        write_unlock_irq(&tasklist_lock);
        BUG_ON(!list_empty(&father->children));
  
-       ptrace_exit_finish(father, &ptrace_dead);
+       list_for_each_entry_safe(p, n, &dead_children, sibling) {
+               list_del_init(&p->sibling);
+               release_task(p);
+       }
  }
  
  /*
@@@ -1096,6 -980,10 +983,6 @@@ NORET_TYPE void do_exit(long code
        tsk->mempolicy = NULL;
  #endif
  #ifdef CONFIG_FUTEX
 -      /*
 -       * This must happen late, after the PID is not
 -       * hashed anymore:
 -       */
        if (unlikely(!list_empty(&tsk->pi_state_list)))
                exit_pi_state_list(tsk);
        if (unlikely(current->pi_state_cache))
@@@ -1362,12 -1250,6 +1249,12 @@@ static int wait_task_zombie(struct task
         */
        read_unlock(&tasklist_lock);
  
 +      /*
 +       * Flush inherited counters to the parent - before the parent
 +       * gets woken up by child-exit notifications.
 +       */
 +      perf_counter_exit_task(p);
 +
        retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
        status = (p->signal->flags & SIGNAL_GROUP_EXIT)
                ? p->signal->group_exit_code : p->exit_code;
        return retval;
  }
  
+ static int *task_stopped_code(struct task_struct *p, bool ptrace)
+ {
+       if (ptrace) {
+               if (task_is_stopped_or_traced(p))
+                       return &p->exit_code;
+       } else {
+               if (p->signal->flags & SIGNAL_STOP_STOPPED)
+                       return &p->signal->group_exit_code;
+       }
+       return NULL;
+ }
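
task_stopped_code() above lets the caller both test whether a stop code applies and, via the
returned pointer, clear it in place later (wait_task_stopped() does exactly that unless
WNOWAIT was passed). A toy user-space version of the same "return a pointer to the field or
NULL" idiom, with made-up task state rather than real signal bookkeeping:

#include <stdbool.h>
#include <stdio.h>

struct toy_task {
        bool stopped;
        bool traced;
        bool group_stopped;
        int  exit_code;
        int  group_exit_code;
};

/* Return a pointer to whichever stop code applies, or NULL if none does. */
static int *toy_stopped_code(struct toy_task *t, bool ptrace)
{
        if (ptrace) {
                if (t->stopped || t->traced)
                        return &t->exit_code;
        } else {
                if (t->group_stopped)
                        return &t->group_exit_code;
        }
        return NULL;
}

int main(void)
{
        struct toy_task t = { .traced = true, .exit_code = 19 };
        int *code = toy_stopped_code(&t, true);

        if (code) {
                printf("stop code %d\n", *code);
                *code = 0;      /* consume it, the way wait_task_stopped() clears *p_code */
        }
        return 0;
}
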
  /*
   * Handle sys_wait4 work for one task in state TASK_STOPPED.  We hold
   * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
@@@ -1432,7 -1326,7 +1331,7 @@@ static int wait_task_stopped(int ptrace
                             int options, struct siginfo __user *infop,
                             int __user *stat_addr, struct rusage __user *ru)
  {
-       int retval, exit_code, why;
+       int retval, exit_code, *p_code, why;
        uid_t uid = 0; /* unneeded, required by compiler */
        pid_t pid;
  
        exit_code = 0;
        spin_lock_irq(&p->sighand->siglock);
  
-       if (unlikely(!task_is_stopped_or_traced(p)))
-               goto unlock_sig;
-       if (!ptrace && p->signal->group_stop_count > 0)
-               /*
-                * A group stop is in progress and this is the group leader.
-                * We won't report until all threads have stopped.
-                */
+       p_code = task_stopped_code(p, ptrace);
+       if (unlikely(!p_code))
                goto unlock_sig;
  
-       exit_code = p->exit_code;
+       exit_code = *p_code;
        if (!exit_code)
                goto unlock_sig;
  
        if (!unlikely(options & WNOWAIT))
-               p->exit_code = 0;
+               *p_code = 0;
  
        /* don't need the RCU readlock here as we're holding a spinlock */
        uid = __task_cred(p)->uid;
@@@ -1613,7 -1501,7 +1506,7 @@@ static int wait_consider_task(struct ta
         */
        *notask_error = 0;
  
-       if (task_is_stopped_or_traced(p))
+       if (task_stopped_code(p, ptrace))
                return wait_task_stopped(ptrace, p, options,
                                         infop, stat_addr, ru);
  
@@@ -1817,7 -1705,7 +1710,7 @@@ SYSCALL_DEFINE4(wait4, pid_t, upid, in
                pid = find_get_pid(-upid);
        } else if (upid == 0) {
                type = PIDTYPE_PGID;
-               pid = get_pid(task_pgrp(current));
+               pid = get_task_pid(current, PIDTYPE_PGID);
        } else /* upid > 0 */ {
                type = PIDTYPE_PID;
                pid = find_get_pid(upid);
diff --combined kernel/fork.c
@@@ -60,6 -60,7 +60,7 @@@
  #include <linux/tty.h>
  #include <linux/proc_fs.h>
  #include <linux/blkdev.h>
+ #include <linux/fs_struct.h>
  #include <trace/sched.h>
  #include <linux/magic.h>
  
@@@ -284,7 -285,7 +285,7 @@@ static int dup_mmap(struct mm_struct *m
        mm->free_area_cache = oldmm->mmap_base;
        mm->cached_hole_size = ~0UL;
        mm->map_count = 0;
-       cpus_clear(mm->cpu_vm_mask);
+       cpumask_clear(mm_cpumask(mm));
        mm->mm_rb = RB_ROOT;
        rb_link = &mm->mm_rb.rb_node;
        rb_parent = NULL;
@@@ -681,38 -682,21 +682,21 @@@ fail_nomem
        return retval;
  }
  
- static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
- {
-       struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
-       /* We don't need to lock fs - think why ;-) */
-       if (fs) {
-               atomic_set(&fs->count, 1);
-               rwlock_init(&fs->lock);
-               fs->umask = old->umask;
-               read_lock(&old->lock);
-               fs->root = old->root;
-               path_get(&old->root);
-               fs->pwd = old->pwd;
-               path_get(&old->pwd);
-               read_unlock(&old->lock);
-       }
-       return fs;
- }
- struct fs_struct *copy_fs_struct(struct fs_struct *old)
- {
-       return __copy_fs_struct(old);
- }
- EXPORT_SYMBOL_GPL(copy_fs_struct);
  static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
  {
+       struct fs_struct *fs = current->fs;
        if (clone_flags & CLONE_FS) {
-               atomic_inc(&current->fs->count);
+               /* tsk->fs is already what we want */
+               write_lock(&fs->lock);
+               if (fs->in_exec) {
+                       write_unlock(&fs->lock);
+                       return -EAGAIN;
+               }
+               fs->users++;
+               write_unlock(&fs->lock);
                return 0;
        }
-       tsk->fs = __copy_fs_struct(current->fs);
+       tsk->fs = copy_fs_struct(fs);
        if (!tsk->fs)
                return -ENOMEM;
        return 0;
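
copy_fs() above replaces the old atomic refcount with a plain users counter taken under
fs->lock, and refuses to share the fs_struct while fs->in_exec is set. A user-space sketch
of that sharing rule, with a toy struct and a pthread rwlock standing in for fs->lock:

#include <pthread.h>
#include <stdio.h>
#include <errno.h>

struct fs_struct {
        pthread_rwlock_t lock;
        int users;
        int in_exec;
};

static int share_fs(struct fs_struct *fs)
{
        int ret = 0;

        pthread_rwlock_wrlock(&fs->lock);
        if (fs->in_exec)
                ret = -EAGAIN;          /* mirrors the -EAGAIN case above */
        else
                fs->users++;
        pthread_rwlock_unlock(&fs->lock);
        return ret;
}

int main(void)
{
        struct fs_struct fs = { PTHREAD_RWLOCK_INITIALIZER, 1, 0 };

        printf("share: %d, users now %d\n", share_fs(&fs), fs.users);
        fs.in_exec = 1;
        printf("share during exec: %d\n", share_fs(&fs));
        return 0;
}
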
@@@ -841,6 -825,8 +825,8 @@@ static int copy_signal(unsigned long cl
        atomic_set(&sig->live, 1);
        init_waitqueue_head(&sig->wait_chldexit);
        sig->flags = 0;
+       if (clone_flags & CLONE_NEWPID)
+               sig->flags |= SIGNAL_UNKILLABLE;
        sig->group_exit_code = 0;
        sig->group_exit_task = NULL;
        sig->group_stop_count = 0;
@@@ -989,7 -975,6 +975,7 @@@ static struct task_struct *copy_process
                goto fork_out;
  
        rt_mutex_init_task(p);
 +      perf_counter_init_task(p);
  
  #ifdef CONFIG_PROVE_LOCKING
        DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
                goto bad_fork_cleanup_mm;
        if ((retval = copy_io(clone_flags, p)))
                goto bad_fork_cleanup_namespaces;
-       retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
+       retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
        if (retval)
                goto bad_fork_cleanup_io;
  
  #endif
        clear_all_latency_tracing(p);
  
-       /* Our parent execution domain becomes current domain
-          These must match for thread signalling to apply */
-       p->parent_exec_id = p->self_exec_id;
        /* ok, now we should be set up.. */
        p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
        p->pdeath_signal = 0;
                set_task_cpu(p, smp_processor_id());
  
        /* CLONE_PARENT re-uses the old parent */
-       if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
+       if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
                p->real_parent = current->real_parent;
-       else
+               p->parent_exec_id = current->parent_exec_id;
+       } else {
                p->real_parent = current;
+               p->parent_exec_id = current->self_exec_id;
+       }
  
        spin_lock(&current->sighand->siglock);
  
                        p->signal->leader_pid = pid;
                        tty_kref_put(p->signal->tty);
                        p->signal->tty = tty_kref_get(current->signal->tty);
-                       set_task_pgrp(p, task_pgrp_nr(current));
-                       set_task_session(p, task_session_nr(current));
                        attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
                        attach_pid(p, PIDTYPE_SID, task_session(current));
                        list_add_tail_rcu(&p->tasks, &init_task.tasks);
@@@ -1490,6 -1472,7 +1473,7 @@@ void __init proc_caches_init(void
        mm_cachep = kmem_cache_create("mm_struct",
                        sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+       vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
        mmap_init();
  }
  
@@@ -1545,12 -1528,16 +1529,16 @@@ static int unshare_fs(unsigned long uns
  {
        struct fs_struct *fs = current->fs;
  
-       if ((unshare_flags & CLONE_FS) &&
-           (fs && atomic_read(&fs->count) > 1)) {
-               *new_fsp = __copy_fs_struct(current->fs);
-               if (!*new_fsp)
-                       return -ENOMEM;
-       }
+       if (!(unshare_flags & CLONE_FS) || !fs)
+               return 0;
+       /* don't need lock here; in the worst case we'll do useless copy */
+       if (fs->users == 1)
+               return 0;
+       *new_fsp = copy_fs_struct(fs);
+       if (!*new_fsp)
+               return -ENOMEM;
  
        return 0;
  }
@@@ -1666,8 -1653,13 +1654,13 @@@ SYSCALL_DEFINE1(unshare, unsigned long
  
                if (new_fs) {
                        fs = current->fs;
+                       write_lock(&fs->lock);
                        current->fs = new_fs;
-                       new_fs = fs;
+                       if (--fs->users)
+                               new_fs = NULL;
+                       else
+                               new_fs = fs;
+                       write_unlock(&fs->lock);
                }
  
                if (new_mm) {
@@@ -1706,7 -1698,7 +1699,7 @@@ bad_unshare_cleanup_sigh
  
  bad_unshare_cleanup_fs:
        if (new_fs)
-               put_fs_struct(new_fs);
+               free_fs_struct(new_fs);
  
  bad_unshare_cleanup_thread:
  bad_unshare_out:
diff --combined kernel/sched.c
@@@ -331,6 -331,13 +331,13 @@@ static DEFINE_PER_CPU(struct rt_rq, ini
   */
  static DEFINE_SPINLOCK(task_group_lock);
  
+ #ifdef CONFIG_SMP
+ static int root_task_group_empty(void)
+ {
+       return list_empty(&root_task_group.children);
+ }
+ #endif
  #ifdef CONFIG_FAIR_GROUP_SCHED
  #ifdef CONFIG_USER_SCHED
  # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
@@@ -391,6 -398,13 +398,13 @@@ static inline void set_task_rq(struct t
  
  #else
  
+ #ifdef CONFIG_SMP
+ static int root_task_group_empty(void)
+ {
+       return 1;
+ }
+ #endif
  static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
  static inline struct task_group *task_group(struct task_struct *p)
  {
@@@ -467,11 -481,17 +481,17 @@@ struct rt_rq 
        struct rt_prio_array active;
        unsigned long rt_nr_running;
  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-       int highest_prio; /* highest queued rt task prio */
+       struct {
+               int curr; /* highest queued rt task prio */
+ #ifdef CONFIG_SMP
+               int next; /* next highest */
+ #endif
+       } highest_prio;
  #endif
  #ifdef CONFIG_SMP
        unsigned long rt_nr_migratory;
        int overloaded;
+       struct plist_head pushable_tasks;
  #endif
        int rt_throttled;
        u64 rt_time;
@@@ -549,7 -569,6 +569,6 @@@ struct rq 
        unsigned long nr_running;
        #define CPU_LOAD_IDX_MAX 5
        unsigned long cpu_load[CPU_LOAD_IDX_MAX];
-       unsigned char idle_at_tick;
  #ifdef CONFIG_NO_HZ
        unsigned long last_tick_seen;
        unsigned char in_nohz_recently;
        struct load_weight load;
        unsigned long nr_load_updates;
        u64 nr_switches;
 +      u64 nr_migrations_in;
  
        struct cfs_rq cfs;
        struct rt_rq rt;
        struct root_domain *rd;
        struct sched_domain *sd;
  
+       unsigned char idle_at_tick;
        /* For active balancing */
        int active_balance;
        int push_cpu;
        /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
  
        /* sys_sched_yield() stats */
-       unsigned int yld_exp_empty;
-       unsigned int yld_act_empty;
-       unsigned int yld_both_empty;
        unsigned int yld_count;
  
        /* schedule() stats */
@@@ -669,7 -685,7 +686,7 @@@ static inline int cpu_of(struct rq *rq
  #define task_rq(p)            cpu_rq(task_cpu(p))
  #define cpu_curr(cpu)         (cpu_rq(cpu)->curr)
  
 -static inline void update_rq_clock(struct rq *rq)
 +inline void update_rq_clock(struct rq *rq)
  {
        rq->clock = sched_clock_cpu(cpu_of(rq));
  }
@@@ -980,26 -996,6 +997,26 @@@ static struct rq *task_rq_lock(struct t
        }
  }
  
 +void curr_rq_lock_irq_save(unsigned long *flags)
 +      __acquires(rq->lock)
 +{
 +      struct rq *rq;
 +
 +      local_irq_save(*flags);
 +      rq = cpu_rq(smp_processor_id());
 +      spin_lock(&rq->lock);
 +}
 +
 +void curr_rq_unlock_irq_restore(unsigned long *flags)
 +      __releases(rq->lock)
 +{
 +      struct rq *rq;
 +
 +      rq = cpu_rq(smp_processor_id());
 +      spin_unlock(&rq->lock);
 +      local_irq_restore(*flags);
 +}
 +
  void task_rq_unlock_wait(struct task_struct *p)
  {
        struct rq *rq = task_rq(p);
@@@ -1114,7 -1110,7 +1131,7 @@@ static void hrtick_start(struct rq *rq
        if (rq == this_rq()) {
                hrtimer_restart(timer);
        } else if (!rq->hrtick_csd_pending) {
-               __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
+               __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
                rq->hrtick_csd_pending = 1;
        }
  }
@@@ -1204,10 -1200,10 +1221,10 @@@ static void resched_task(struct task_st
  
        assert_spin_locked(&task_rq(p)->lock);
  
-       if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
+       if (test_tsk_need_resched(p))
                return;
  
-       set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+       set_tsk_need_resched(p);
  
        cpu = task_cpu(p);
        if (cpu == smp_processor_id())
@@@ -1263,7 -1259,7 +1280,7 @@@ void wake_up_idle_cpu(int cpu
         * lockless. The worst case is that the other CPU runs the
         * idle task through an additional NOOP schedule()
         */
-       set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
+       set_tsk_need_resched(rq->idle);
  
        /* NEED_RESCHED must be visible before we test polling */
        smp_mb();
@@@ -1631,21 -1627,42 +1648,42 @@@ static inline void update_shares_locked
  
  #endif
  
+ #ifdef CONFIG_PREEMPT
  /*
-  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+  * fair double_lock_balance: Safely acquires both rq->locks in a fair
+  * way at the expense of forcing extra atomic operations in all
+  * invocations.  This assures that the double_lock is acquired using the
+  * same underlying policy as the spinlock_t on this architecture, which
+  * reduces latency compared to the unfair variant below.  However, it
+  * also adds more overhead and therefore may reduce throughput.
   */
- static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+       __releases(this_rq->lock)
+       __acquires(busiest->lock)
+       __acquires(this_rq->lock)
+ {
+       spin_unlock(&this_rq->lock);
+       double_rq_lock(this_rq, busiest);
+       return 1;
+ }
+ #else
+ /*
+  * Unfair double_lock_balance: Optimizes throughput at the expense of
+  * latency by eliminating extra atomic operations when the locks are
+  * already in proper order on entry.  This favors lower cpu-ids and will
+  * grant the double lock to lower cpus over higher ids under contention,
+  * regardless of entry order into the function.
+  */
+ static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
        __releases(this_rq->lock)
        __acquires(busiest->lock)
        __acquires(this_rq->lock)
  {
        int ret = 0;
  
-       if (unlikely(!irqs_disabled())) {
-               /* printk() doesn't work good under rq->lock */
-               spin_unlock(&this_rq->lock);
-               BUG_ON(1);
-       }
        if (unlikely(!spin_trylock(&busiest->lock))) {
                if (busiest < this_rq) {
                        spin_unlock(&this_rq->lock);
        return ret;
  }
  
+ #endif /* CONFIG_PREEMPT */
+ /*
+  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+  */
+ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ {
+       if (unlikely(!irqs_disabled())) {
+               /* printk() doesn't work well under rq->lock */
+               spin_unlock(&this_rq->lock);
+               BUG_ON(1);
+       }
+       return _double_lock_balance(this_rq, busiest);
+ }
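
The unfair _double_lock_balance() above avoids an ABBA deadlock by falling back to a fixed
lock order (the runqueue with the lower address first) whenever the trylock fails, and
returns 1 so the caller knows this_rq->lock was dropped and the runqueue state must be
revalidated. A user-space sketch of that ordering rule, with pthread mutexes standing in
for rq->lock:

#include <pthread.h>
#include <stdio.h>

struct toy_rq { pthread_mutex_t lock; int nr_running; };

/* Caller already holds this_rq->lock and wants busiest->lock as well. */
static int toy_double_lock(struct toy_rq *this_rq, struct toy_rq *busiest)
{
        int ret = 0;

        if (pthread_mutex_trylock(&busiest->lock) != 0) {
                if (busiest < this_rq) {
                        /* wrong order: drop ours, then take both lowest-address first */
                        pthread_mutex_unlock(&this_rq->lock);
                        pthread_mutex_lock(&busiest->lock);
                        pthread_mutex_lock(&this_rq->lock);
                        ret = 1;        /* this_rq was dropped; caller must recheck */
                } else {
                        pthread_mutex_lock(&busiest->lock);
                }
        }
        return ret;
}

int main(void)
{
        struct toy_rq a = { PTHREAD_MUTEX_INITIALIZER, 0 };
        struct toy_rq b = { PTHREAD_MUTEX_INITIALIZER, 0 };

        pthread_mutex_lock(&a.lock);
        /* uncontended here, so the trylock succeeds and nothing is dropped */
        printf("this_rq dropped and reacquired: %d\n", toy_double_lock(&a, &b));
        pthread_mutex_unlock(&b.lock);
        pthread_mutex_unlock(&a.lock);
        return 0;
}
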
  static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
        __releases(busiest->lock)
  {
@@@ -1726,6 -1759,9 +1780,9 @@@ static void update_avg(u64 *avg, u64 sa
  
  static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
  {
+       if (wakeup)
+               p->se.start_runtime = p->se.sum_exec_runtime;
        sched_info_queued(p);
        p->sched_class->enqueue_task(rq, p, wakeup);
        p->se.on_rq = 1;
  
  static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
  {
-       if (sleep && p->se.last_wakeup) {
-               update_avg(&p->se.avg_overlap,
-                          p->se.sum_exec_runtime - p->se.last_wakeup);
-               p->se.last_wakeup = 0;
+       if (sleep) {
+               if (p->se.last_wakeup) {
+                       update_avg(&p->se.avg_overlap,
+                               p->se.sum_exec_runtime - p->se.last_wakeup);
+                       p->se.last_wakeup = 0;
+               } else {
+                       update_avg(&p->se.avg_wakeup,
+                               sysctl_sched_wakeup_granularity);
+               }
        }
  
        sched_info_dequeued(p);
@@@ -1906,15 -1947,12 +1968,15 @@@ void set_task_cpu(struct task_struct *p
                p->se.sleep_start -= clock_offset;
        if (p->se.block_start)
                p->se.block_start -= clock_offset;
 +#endif
        if (old_cpu != new_cpu) {
 -              schedstat_inc(p, se.nr_migrations);
 +              p->se.nr_migrations++;
 +              new_rq->nr_migrations_in++;
 +#ifdef CONFIG_SCHEDSTATS
                if (task_hot(p, old_rq->clock, NULL))
                        schedstat_inc(p, se.nr_forced2_migrations);
 -      }
  #endif
 +      }
        p->se.vruntime -= old_cfsrq->min_vruntime -
                                         new_cfsrq->min_vruntime;
  
@@@ -2041,7 -2079,7 +2103,7 @@@ unsigned long wait_task_inactive(struc
                 * it must be off the runqueue _entirely_, and not
                 * preempted!
                 *
-                * So if it wa still runnable (but just not actively
+                * So if it was still runnable (but just not actively
                 * running right now), it's preempted, and we should
                 * yield - it could be a while.
                 */
@@@ -2266,27 -2304,6 +2328,27 @@@ static int sched_balance_self(int cpu, 
  
  #endif /* CONFIG_SMP */
  
 +/**
 + * task_oncpu_function_call - call a function on the cpu on which a task runs
 + * @p:                the task to evaluate
 + * @func:     the function to be called
 + * @info:     the function call argument
 + *
 + * Calls the function @func when the task is currently running. This might
 + * be on the current CPU, in which case the function is called directly.
 + */
 +void task_oncpu_function_call(struct task_struct *p,
 +                            void (*func) (void *info), void *info)
 +{
 +      int cpu;
 +
 +      preempt_disable();
 +      cpu = task_cpu(p);
 +      if (task_curr(p))
 +              smp_call_function_single(cpu, func, info, 1);
 +      preempt_enable();
 +}
 +
  /***
   * try_to_wake_up - wake up a thread
   * @p: the to-be-woken-up thread
@@@ -2312,7 -2329,7 +2374,7 @@@ static int try_to_wake_up(struct task_s
                sync = 0;
  
  #ifdef CONFIG_SMP
-       if (sched_feat(LB_WAKEUP_UPDATE)) {
+       if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
                struct sched_domain *sd;
  
                this_cpu = raw_smp_processor_id();
@@@ -2390,6 -2407,22 +2452,22 @@@ out_activate
        activate_task(rq, p, 1);
        success = 1;
  
+       /*
+        * Only attribute actual wakeups done by this task.
+        */
+       if (!in_interrupt()) {
+               struct sched_entity *se = &current->se;
+               u64 sample = se->sum_exec_runtime;
+               if (se->last_wakeup)
+                       sample -= se->last_wakeup;
+               else
+                       sample -= se->start_runtime;
+               update_avg(&se->avg_wakeup, sample);
+               se->last_wakeup = se->sum_exec_runtime;
+       }
  out_running:
        trace_sched_wakeup(rq, p, success);
        check_preempt_curr(rq, p, sync);
                p->sched_class->task_wake_up(rq, p);
  #endif
  out:
-       current->se.last_wakeup = current->se.sum_exec_runtime;
        task_rq_unlock(rq, &flags);
  
        return success;
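
The wakeup path above samples how much runtime has passed since current's previous wakeup
(falling back to start_runtime for the first one) and feeds that into update_avg(). The
small sketch below replays that accounting; it assumes update_avg() keeps a simple
1/8-weight running average, since its body is not part of this hunk:

#include <stdint.h>
#include <stdio.h>

static void toy_update_avg(uint64_t *avg, uint64_t sample)
{
        int64_t diff = (int64_t)(sample - *avg);

        *avg += diff / 8;               /* new = old + (sample - old) / 8 */
}

int main(void)
{
        uint64_t avg_wakeup = 0;
        uint64_t sum_exec_runtime = 0, last_wakeup = 0;
        uint64_t runs[] = { 1000, 1200, 800, 5000 };    /* ns run between wakeups */

        for (unsigned i = 0; i < sizeof(runs) / sizeof(runs[0]); i++) {
                sum_exec_runtime += runs[i];
                /* sample = runtime accumulated since the previous wakeup we issued */
                toy_update_avg(&avg_wakeup, sum_exec_runtime - last_wakeup);
                last_wakeup = sum_exec_runtime;
                printf("avg_wakeup = %llu\n", (unsigned long long)avg_wakeup);
        }
        return 0;
}
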
@@@ -2429,9 -2460,10 +2505,11 @@@ static void __sched_fork(struct task_st
        p->se.exec_start                = 0;
        p->se.sum_exec_runtime          = 0;
        p->se.prev_sum_exec_runtime     = 0;
 +      p->se.nr_migrations             = 0;
        p->se.last_wakeup               = 0;
        p->se.avg_overlap               = 0;
+       p->se.start_runtime             = 0;
+       p->se.avg_wakeup                = sysctl_sched_wakeup_granularity;
  
  #ifdef CONFIG_SCHEDSTATS
        p->se.wait_start                = 0;
@@@ -2494,6 -2526,8 +2572,8 @@@ void sched_fork(struct task_struct *p, 
        /* Want to start with kernel preemption disabled. */
        task_thread_info(p)->preempt_count = 1;
  #endif
+       plist_node_init(&p->pushable_tasks, MAX_PRIO);
        put_cpu();
  }
  
@@@ -2537,7 -2571,7 +2617,7 @@@ void wake_up_new_task(struct task_struc
  #ifdef CONFIG_PREEMPT_NOTIFIERS
  
  /**
-  * preempt_notifier_register - tell me when current is being being preempted & rescheduled
+  * preempt_notifier_register - tell me when current is being preempted & rescheduled
   * @notifier: notifier struct to register
   */
  void preempt_notifier_register(struct preempt_notifier *notifier)
@@@ -2634,6 -2668,12 +2714,12 @@@ static void finish_task_switch(struct r
  {
        struct mm_struct *mm = rq->prev_mm;
        long prev_state;
+ #ifdef CONFIG_SMP
+       int post_schedule = 0;
+       if (current->sched_class->needs_post_schedule)
+               post_schedule = current->sched_class->needs_post_schedule(rq);
+ #endif
  
        rq->prev_mm = NULL;
  
         */
        prev_state = prev->state;
        finish_arch_switch(prev);
 +      perf_counter_task_sched_in(current, cpu_of(rq));
        finish_lock_switch(rq, prev);
  #ifdef CONFIG_SMP
-       if (current->sched_class->post_schedule)
+       if (post_schedule)
                current->sched_class->post_schedule(rq);
  #endif
  
@@@ -2813,21 -2852,6 +2899,21 @@@ unsigned long nr_active(void
  }
  
  /*
 + * Externally visible per-cpu scheduler statistics:
 + * cpu_nr_switches(cpu) - number of context switches on that cpu
 + * cpu_nr_migrations(cpu) - number of migrations into that cpu
 + */
 +u64 cpu_nr_switches(int cpu)
 +{
 +      return cpu_rq(cpu)->nr_switches;
 +}
 +
 +u64 cpu_nr_migrations(int cpu)
 +{
 +      return cpu_rq(cpu)->nr_migrations_in;
 +}
 +
 +/*
   * Update rq->cpu_load[] statistics. This function is usually called every
   * scheduler tick (TICK_NSEC).
   */
@@@ -2975,6 -2999,7 +3061,7 @@@ int can_migrate_task(struct task_struc
                     struct sched_domain *sd, enum cpu_idle_type idle,
                     int *all_pinned)
  {
+       int tsk_cache_hot = 0;
        /*
         * We do not migrate tasks that are:
         * 1) running (obviously), or
         * 2) too many balance attempts have failed.
         */
  
-       if (!task_hot(p, rq->clock, sd) ||
-                       sd->nr_balance_failed > sd->cache_nice_tries) {
+       tsk_cache_hot = task_hot(p, rq->clock, sd);
+       if (!tsk_cache_hot ||
+               sd->nr_balance_failed > sd->cache_nice_tries) {
  #ifdef CONFIG_SCHEDSTATS
-               if (task_hot(p, rq->clock, sd)) {
+               if (tsk_cache_hot) {
                        schedstat_inc(sd, lb_hot_gained[idle]);
                        schedstat_inc(p, se.nr_forced_migrations);
                }
                return 1;
        }
  
-       if (task_hot(p, rq->clock, sd)) {
+       if (tsk_cache_hot) {
                schedstat_inc(p, se.nr_failed_migrations_hot);
                return 0;
        }
@@@ -3049,6 -3075,16 +3137,16 @@@ next
        pulled++;
        rem_load_move -= p->se.load.weight;
  
+ #ifdef CONFIG_PREEMPT
+       /*
+        * NEWIDLE balancing is a source of latency, so preemptible kernels
+        * will stop after the first task is pulled to minimize the critical
+        * section.
+        */
+       if (idle == CPU_NEWLY_IDLE)
+               goto out;
+ #endif
        /*
         * We only want to steal up to the prescribed amount of weighted load.
         */
@@@ -3095,9 -3131,15 +3193,15 @@@ static int move_tasks(struct rq *this_r
                                sd, idle, all_pinned, &this_best_prio);
                class = class->next;
  
+ #ifdef CONFIG_PREEMPT
+               /*
+                * NEWIDLE balancing is a source of latency, so preemptible
+                * kernels will stop after the first task is pulled to minimize
+                * the critical section.
+                */
                if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
                        break;
+ #endif
        } while (class && max_load_move > total_load_moved);
  
        return total_load_moved > 0;
@@@ -3147,246 -3189,480 +3251,480 @@@ static int move_one_task(struct rq *thi
  
        return 0;
  }
+ /********** Helpers for find_busiest_group ************************/
  /*
-  * find_busiest_group finds and returns the busiest CPU group within the
-  * domain. It calculates and returns the amount of weighted load which
-  * should be moved to restore balance via the imbalance parameter.
+  * sd_lb_stats - Structure to store the statistics of a sched_domain
+  *            during load balancing.
   */
- static struct sched_group *
- find_busiest_group(struct sched_domain *sd, int this_cpu,
-                  unsigned long *imbalance, enum cpu_idle_type idle,
-                  int *sd_idle, const struct cpumask *cpus, int *balance)
- {
-       struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
-       unsigned long max_load, avg_load, total_load, this_load, total_pwr;
-       unsigned long max_pull;
-       unsigned long busiest_load_per_task, busiest_nr_running;
-       unsigned long this_load_per_task, this_nr_running;
-       int load_idx, group_imb = 0;
+ struct sd_lb_stats {
+       struct sched_group *busiest; /* Busiest group in this sd */
+       struct sched_group *this;  /* Local group in this sd */
+       unsigned long total_load;  /* Total load of all groups in sd */
+       unsigned long total_pwr;   /*   Total power of all groups in sd */
+       unsigned long avg_load;    /* Average load across all groups in sd */
+       /** Statistics of this group */
+       unsigned long this_load;
+       unsigned long this_load_per_task;
+       unsigned long this_nr_running;
+       /* Statistics of the busiest group */
+       unsigned long max_load;
+       unsigned long busiest_load_per_task;
+       unsigned long busiest_nr_running;
+       int group_imb; /* Is there imbalance in this sd */
  #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-       int power_savings_balance = 1;
-       unsigned long leader_nr_running = 0, min_load_per_task = 0;
-       unsigned long min_nr_running = ULONG_MAX;
-       struct sched_group *group_min = NULL, *group_leader = NULL;
+       int power_savings_balance; /* Is powersave balance needed for this sd */
+       struct sched_group *group_min; /* Least loaded group in sd */
+       struct sched_group *group_leader; /* Group which relieves group_min */
+       unsigned long min_load_per_task; /* load_per_task in group_min */
+       unsigned long leader_nr_running; /* Nr running of group_leader */
+       unsigned long min_nr_running; /* Nr running of group_min */
  #endif
+ };
  
-       max_load = this_load = total_load = total_pwr = 0;
-       busiest_load_per_task = busiest_nr_running = 0;
-       this_load_per_task = this_nr_running = 0;
+ /*
+  * sg_lb_stats - stats of a sched_group required for load_balancing
+  */
+ struct sg_lb_stats {
+       unsigned long avg_load; /* Avg load across the CPUs of the group */
+       unsigned long group_load; /* Total load over the CPUs of the group */
+       unsigned long sum_nr_running; /* Nr tasks running in the group */
+       unsigned long sum_weighted_load; /* Weighted load of group's tasks */
+       unsigned long group_capacity;
+       int group_imb; /* Is there an imbalance in the group ? */
+ };
+ /**
+  * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
+  * @group: The group whose first cpu is to be returned.
+  */
+ static inline unsigned int group_first_cpu(struct sched_group *group)
+ {
+       return cpumask_first(sched_group_cpus(group));
+ }
+ /**
+  * get_sd_load_idx - Obtain the load index for a given sched domain.
+  * @sd: The sched_domain whose load_idx is to be obtained.
+  * @idle: The idle status of the CPU for which the sd load_idx is obtained.
+  */
+ static inline int get_sd_load_idx(struct sched_domain *sd,
+                                       enum cpu_idle_type idle)
+ {
+       int load_idx;
  
-       if (idle == CPU_NOT_IDLE)
+       switch (idle) {
+       case CPU_NOT_IDLE:
                load_idx = sd->busy_idx;
-       else if (idle == CPU_NEWLY_IDLE)
+               break;
+       case CPU_NEWLY_IDLE:
                load_idx = sd->newidle_idx;
-       else
+               break;
+       default:
                load_idx = sd->idle_idx;
+               break;
+       }
  
-       do {
-               unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
-               int local_group;
-               int i;
-               int __group_imb = 0;
-               unsigned int balance_cpu = -1, first_idle_cpu = 0;
-               unsigned long sum_nr_running, sum_weighted_load;
-               unsigned long sum_avg_load_per_task;
-               unsigned long avg_load_per_task;
+       return load_idx;
+ }
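
The rewrite that begins here splits find_busiest_group() into small helpers that first fill
sd_lb_stats/sg_lb_stats and only then decide where to pull load from. A toy user-space
illustration of that shape follows; the group layout and load numbers are made up, and the
per-CPU power scaling done by the real code is deliberately omitted:

#include <stdio.h>

struct sg_stats { unsigned long group_load; unsigned long avg_load; int nr_cpus; };

int main(void)
{
        unsigned long cpu_load[2][4] = { { 10, 40, 0, 0 }, { 90, 75, 80, 0 } };
        int cpus_per_group[2] = { 2, 3 };
        struct sg_stats sgs[2];
        unsigned long max_load = 0;
        int busiest = -1;

        /* pass 1: gather per-group statistics into the stats structs */
        for (int g = 0; g < 2; g++) {
                sgs[g].group_load = 0;
                sgs[g].nr_cpus = cpus_per_group[g];
                for (int c = 0; c < sgs[g].nr_cpus; c++)
                        sgs[g].group_load += cpu_load[g][c];
                sgs[g].avg_load = sgs[g].group_load / sgs[g].nr_cpus;

                /* pass 2 (folded in here): remember the busiest group so far */
                if (sgs[g].avg_load > max_load) {
                        max_load = sgs[g].avg_load;
                        busiest = g;
                }
        }
        printf("busiest group: %d (avg load %lu)\n", busiest, max_load);
        return 0;
}
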
  
-               local_group = cpumask_test_cpu(this_cpu,
-                                              sched_group_cpus(group));
  
-               if (local_group)
-                       balance_cpu = cpumask_first(sched_group_cpus(group));
+ #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+ /**
+  * init_sd_power_savings_stats - Initialize power savings statistics for
+  * the given sched_domain, during load balancing.
+  *
+  * @sd: Sched domain whose power-savings statistics are to be initialized.
+  * @sds: Variable containing the statistics for sd.
+  * @idle: Idle status of the CPU at which we're performing load-balancing.
+  */
+ static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+       struct sd_lb_stats *sds, enum cpu_idle_type idle)
+ {
+       /*
+        * Busy processors will not participate in power savings
+        * balance.
+        */
+       if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+               sds->power_savings_balance = 0;
+       else {
+               sds->power_savings_balance = 1;
+               sds->min_nr_running = ULONG_MAX;
+               sds->leader_nr_running = 0;
+       }
+ }
  
-               /* Tally up the load of all CPUs in the group */
-               sum_weighted_load = sum_nr_running = avg_load = 0;
-               sum_avg_load_per_task = avg_load_per_task = 0;
+ /**
+  * update_sd_power_savings_stats - Update the power saving stats for a
+  * sched_domain while performing load balancing.
+  *
+  * @group: sched_group belonging to the sched_domain under consideration.
+  * @sds: Variable containing the statistics of the sched_domain
+  * @local_group: Does group contain the CPU for which we're performing
+  *            load balancing?
+  * @sgs: Variable containing the statistics of the group.
+  */
+ static inline void update_sd_power_savings_stats(struct sched_group *group,
+       struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+ {
  
-               max_cpu_load = 0;
-               min_cpu_load = ~0UL;
+       if (!sds->power_savings_balance)
+               return;
  
-               for_each_cpu_and(i, sched_group_cpus(group), cpus) {
-                       struct rq *rq = cpu_rq(i);
+       /*
+        * If the local group is idle or completely loaded
+        * no need to do power savings balance at this domain
+        */
+       if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
+                               !sds->this_nr_running))
+               sds->power_savings_balance = 0;
  
-                       if (*sd_idle && rq->nr_running)
-                               *sd_idle = 0;
+       /*
+        * If a group is already running at full capacity or idle,
+        * don't include that group in power savings calculations
+        */
+       if (!sds->power_savings_balance ||
+               sgs->sum_nr_running >= sgs->group_capacity ||
+               !sgs->sum_nr_running)
+               return;
  
-                       /* Bias balancing toward cpus of our domain */
-                       if (local_group) {
-                               if (idle_cpu(i) && !first_idle_cpu) {
-                                       first_idle_cpu = 1;
-                                       balance_cpu = i;
-                               }
+       /*
+        * Calculate the group which has the least non-idle load.
+        * This is the group from which we need to pick up the load
+        * for saving power.
+        */
+       if ((sgs->sum_nr_running < sds->min_nr_running) ||
+           (sgs->sum_nr_running == sds->min_nr_running &&
+            group_first_cpu(group) > group_first_cpu(sds->group_min))) {
+               sds->group_min = group;
+               sds->min_nr_running = sgs->sum_nr_running;
+               sds->min_load_per_task = sgs->sum_weighted_load /
+                                               sgs->sum_nr_running;
+       }
  
-                               load = target_load(i, load_idx);
-                       } else {
-                               load = source_load(i, load_idx);
-                               if (load > max_cpu_load)
-                                       max_cpu_load = load;
-                               if (min_cpu_load > load)
-                                       min_cpu_load = load;
-                       }
+       /*
+        * Calculate the group which is nearly at its capacity
+        * but still has room to pick up some load from another
+        * group and thereby save more power.
+        */
+       if (sgs->sum_nr_running > sgs->group_capacity - 1)
+               return;
  
-                       avg_load += load;
-                       sum_nr_running += rq->nr_running;
-                       sum_weighted_load += weighted_cpuload(i);
+       if (sgs->sum_nr_running > sds->leader_nr_running ||
+           (sgs->sum_nr_running == sds->leader_nr_running &&
+            group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
+               sds->group_leader = group;
+               sds->leader_nr_running = sgs->sum_nr_running;
+       }
+ }
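/*
 * Illustrative example (not taken from the patch): with three 4-CPU groups
 * running {5, 1, 2} tasks, the fully loaded group is skipped, the group
 * running 1 task ends up as group_min (least non-idle load) and the group
 * running 2 tasks as group_leader (closest to, yet still below, capacity).
 * If this_cpu sits in the leader group, check_power_save_busiest_group()
 * below can then report group_min as "busiest" so its lone task gets pulled
 * over and group_min's CPUs may go fully idle.
 */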
  
-                       sum_avg_load_per_task += cpu_avg_load_per_task(i);
-               }
+ /**
+  * check_power_save_busiest_group - see if there is potential for some power-savings balance
+  * @sds: Variable containing the statistics of the sched_domain
+  *    under consideration.
+  * @this_cpu: Cpu at which we're currently performing load-balancing.
+  * @imbalance: Variable to store the imbalance.
+  *
+  * Description:
+  * Check if we have potential to perform some power-savings balance.
+  * If yes, set the busiest group to be the least loaded group in the
+  * sched_domain, so that its CPUs can be put to idle.
+  *
+  * Returns 1 if there is potential to perform power-savings balance.
+  * Else returns 0.
+  */
+ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+                                       int this_cpu, unsigned long *imbalance)
+ {
+       if (!sds->power_savings_balance)
+               return 0;
  
-               /*
-                * First idle cpu or the first cpu(busiest) in this sched group
-                * is eligible for doing load balancing at this and above
-                * domains. In the newly idle case, we will allow all the cpu's
-                * to do the newly idle load balance.
-                */
-               if (idle != CPU_NEWLY_IDLE && local_group &&
-                   balance_cpu != this_cpu && balance) {
-                       *balance = 0;
-                       goto ret;
-               }
+       if (sds->this != sds->group_leader ||
+                       sds->group_leader == sds->group_min)
+               return 0;
  
-               total_load += avg_load;
-               total_pwr += group->__cpu_power;
+       *imbalance = sds->min_load_per_task;
+       sds->busiest = sds->group_min;
  
-               /* Adjust by relative CPU power of the group */
-               avg_load = sg_div_cpu_power(group,
-                               avg_load * SCHED_LOAD_SCALE);
+       if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
+               cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
+                       group_first_cpu(sds->group_leader);
+       }
  
+       return 1;
  
-               /*
-                * Consider the group unbalanced when the imbalance is larger
-                * than the average weight of two tasks.
-                *
-                * APZ: with cgroup the avg task weight can vary wildly and
-                *      might not be a suitable number - should we keep a
-                *      normalized nr_running number somewhere that negates
-                *      the hierarchy?
-                */
-               avg_load_per_task = sg_div_cpu_power(group,
-                               sum_avg_load_per_task * SCHED_LOAD_SCALE);
+ }
+ #else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+ static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+       struct sd_lb_stats *sds, enum cpu_idle_type idle)
+ {
+       return;
+ }
+ static inline void update_sd_power_savings_stats(struct sched_group *group,
+       struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+ {
+       return;
+ }
+ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+                                       int this_cpu, unsigned long *imbalance)
+ {
+       return 0;
+ }
+ #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+ /**
+  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
+  * @group: sched_group whose statistics are to be updated.
+  * @this_cpu: Cpu for which load balance is currently performed.
+  * @idle: Idle status of this_cpu
+  * @load_idx: Load index of sched_domain of this_cpu for load calc.
+  * @sd_idle: Idle status of the sched_domain containing group.
+  * @local_group: Does group contain this_cpu.
+  * @cpus: Set of cpus considered for load balancing.
+  * @balance: Should we balance.
+  * @sgs: variable to hold the statistics for this group.
+  */
+ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
+                       enum cpu_idle_type idle, int load_idx, int *sd_idle,
+                       int local_group, const struct cpumask *cpus,
+                       int *balance, struct sg_lb_stats *sgs)
+ {
+       unsigned long load, max_cpu_load, min_cpu_load;
+       int i;
+       unsigned int balance_cpu = -1, first_idle_cpu = 0;
+       unsigned long sum_avg_load_per_task;
+       unsigned long avg_load_per_task;
+       if (local_group)
+               balance_cpu = group_first_cpu(group);
  
-               if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
-                       __group_imb = 1;
+       /* Tally up the load of all CPUs in the group */
+       sum_avg_load_per_task = avg_load_per_task = 0;
+       max_cpu_load = 0;
+       min_cpu_load = ~0UL;
  
-               group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
+       for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+               struct rq *rq = cpu_rq(i);
  
+               if (*sd_idle && rq->nr_running)
+                       *sd_idle = 0;
+               /* Bias balancing toward cpus of our domain */
                if (local_group) {
-                       this_load = avg_load;
-                       this = group;
-                       this_nr_running = sum_nr_running;
-                       this_load_per_task = sum_weighted_load;
-               } else if (avg_load > max_load &&
-                          (sum_nr_running > group_capacity || __group_imb)) {
-                       max_load = avg_load;
-                       busiest = group;
-                       busiest_nr_running = sum_nr_running;
-                       busiest_load_per_task = sum_weighted_load;
-                       group_imb = __group_imb;
+                       if (idle_cpu(i) && !first_idle_cpu) {
+                               first_idle_cpu = 1;
+                               balance_cpu = i;
+                       }
+                       load = target_load(i, load_idx);
+               } else {
+                       load = source_load(i, load_idx);
+                       if (load > max_cpu_load)
+                               max_cpu_load = load;
+                       if (min_cpu_load > load)
+                               min_cpu_load = load;
                }
  
- #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-               /*
-                * Busy processors will not participate in power savings
-                * balance.
-                */
-               if (idle == CPU_NOT_IDLE ||
-                               !(sd->flags & SD_POWERSAVINGS_BALANCE))
-                       goto group_next;
+               sgs->group_load += load;
+               sgs->sum_nr_running += rq->nr_running;
+               sgs->sum_weighted_load += weighted_cpuload(i);
  
-               /*
-                * If the local group is idle or completely loaded
-                * no need to do power savings balance at this domain
-                */
-               if (local_group && (this_nr_running >= group_capacity ||
-                                   !this_nr_running))
-                       power_savings_balance = 0;
+               sum_avg_load_per_task += cpu_avg_load_per_task(i);
+       }
  
-               /*
-                * If a group is already running at full capacity or idle,
-                * don't include that group in power savings calculations
-                */
-               if (!power_savings_balance || sum_nr_running >= group_capacity
-                   || !sum_nr_running)
-                       goto group_next;
+       /*
+        * Only the first idle cpu, or the first cpu (the busiest one) in
+        * this sched group, is eligible for doing load balancing at this
+        * and above domains. In the newly idle case, we allow all the
+        * cpus to do the newly idle load balance.
+        */
+       if (idle != CPU_NEWLY_IDLE && local_group &&
+           balance_cpu != this_cpu && balance) {
+               *balance = 0;
+               return;
+       }
  
-               /*
-                * Calculate the group which has the least non-idle load.
-                * This is the group from where we need to pick up the load
-                * for saving power
-                */
-               if ((sum_nr_running < min_nr_running) ||
-                   (sum_nr_running == min_nr_running &&
-                    cpumask_first(sched_group_cpus(group)) >
-                    cpumask_first(sched_group_cpus(group_min)))) {
-                       group_min = group;
-                       min_nr_running = sum_nr_running;
-                       min_load_per_task = sum_weighted_load /
-                                               sum_nr_running;
-               }
+       /* Adjust by relative CPU power of the group */
+       sgs->avg_load = sg_div_cpu_power(group,
+                       sgs->group_load * SCHED_LOAD_SCALE);
  
-               /*
-                * Calculate the group which is almost near its
-                * capacity but still has some space to pick up some load
-                * from other group and save more power
-                */
-               if (sum_nr_running <= group_capacity - 1) {
-                       if (sum_nr_running > leader_nr_running ||
-                           (sum_nr_running == leader_nr_running &&
-                            cpumask_first(sched_group_cpus(group)) <
-                            cpumask_first(sched_group_cpus(group_leader)))) {
-                               group_leader = group;
-                               leader_nr_running = sum_nr_running;
-                       }
+       /*
+        * Consider the group unbalanced when the imbalance is larger
+        * than the average weight of two tasks.
+        *
+        * APZ: with cgroup the avg task weight can vary wildly and
+        *      might not be a suitable number - should we keep a
+        *      normalized nr_running number somewhere that negates
+        *      the hierarchy?
+        */
+       avg_load_per_task = sg_div_cpu_power(group,
+                       sum_avg_load_per_task * SCHED_LOAD_SCALE);
+       if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+               sgs->group_imb = 1;
+       sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
+ }
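/*
 * Companion sketch of the per-group statistics filled in above, again
 * inferred from usage; the real struct sg_lb_stats is defined earlier in
 * the patch and may differ in detail.
 */
struct sg_lb_stats_sketch {
	unsigned long avg_load;		 /* avg load across the cpus of the group */
	unsigned long group_load;	 /* total load over the cpus of the group */
	unsigned long sum_nr_running;	 /* nr of tasks running in the group */
	unsigned long sum_weighted_load; /* weighted load of the group's tasks */
	unsigned long group_capacity;	 /* nr of tasks that fit at full power */
	int group_imb;			 /* is the group itself imbalanced? */
};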
+ /**
+  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
+  * @sd: sched_domain whose statistics are to be updated.
+  * @this_cpu: Cpu for which load balance is currently performed.
+  * @idle: Idle status of this_cpu
+  * @sd_idle: Idle status of the sched_domain containing group.
+  * @cpus: Set of cpus considered for load balancing.
+  * @balance: Should we balance.
+  * @sds: variable to hold the statistics for this sched_domain.
+  */
+ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
+                       enum cpu_idle_type idle, int *sd_idle,
+                       const struct cpumask *cpus, int *balance,
+                       struct sd_lb_stats *sds)
+ {
+       struct sched_group *group = sd->groups;
+       struct sg_lb_stats sgs;
+       int load_idx;
+       init_sd_power_savings_stats(sd, sds, idle);
+       load_idx = get_sd_load_idx(sd, idle);
+       do {
+               int local_group;
+               local_group = cpumask_test_cpu(this_cpu,
+                                              sched_group_cpus(group));
+               memset(&sgs, 0, sizeof(sgs));
+               update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
+                               local_group, cpus, balance, &sgs);
+               if (local_group && balance && !(*balance))
+                       return;
+               sds->total_load += sgs.group_load;
+               sds->total_pwr += group->__cpu_power;
+               if (local_group) {
+                       sds->this_load = sgs.avg_load;
+                       sds->this = group;
+                       sds->this_nr_running = sgs.sum_nr_running;
+                       sds->this_load_per_task = sgs.sum_weighted_load;
+               } else if (sgs.avg_load > sds->max_load &&
+                          (sgs.sum_nr_running > sgs.group_capacity ||
+                               sgs.group_imb)) {
+                       sds->max_load = sgs.avg_load;
+                       sds->busiest = group;
+                       sds->busiest_nr_running = sgs.sum_nr_running;
+                       sds->busiest_load_per_task = sgs.sum_weighted_load;
+                       sds->group_imb = sgs.group_imb;
                }
- group_next:
- #endif
+               update_sd_power_savings_stats(group, sds, local_group, &sgs);
                group = group->next;
        } while (group != sd->groups);
  
-       if (!busiest || this_load >= max_load || busiest_nr_running == 0)
-               goto out_balanced;
-       avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
+ }
  
-       if (this_load >= avg_load ||
-                       100*max_load <= sd->imbalance_pct*this_load)
-               goto out_balanced;
+ /**
+  * fix_small_imbalance - Calculate the minor imbalance that exists
+  *                    amongst the groups of a sched_domain during
+  *                    load balancing.
+  * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
+  * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+  * @imbalance: Variable to store the imbalance.
+  */
+ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
+                               int this_cpu, unsigned long *imbalance)
+ {
+       unsigned long tmp, pwr_now = 0, pwr_move = 0;
+       unsigned int imbn = 2;
+       if (sds->this_nr_running) {
+               sds->this_load_per_task /= sds->this_nr_running;
+               if (sds->busiest_load_per_task >
+                               sds->this_load_per_task)
+                       imbn = 1;
+       } else
+               sds->this_load_per_task =
+                       cpu_avg_load_per_task(this_cpu);
  
-       busiest_load_per_task /= busiest_nr_running;
-       if (group_imb)
-               busiest_load_per_task = min(busiest_load_per_task, avg_load);
+       if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
+                       sds->busiest_load_per_task * imbn) {
+               *imbalance = sds->busiest_load_per_task;
+               return;
+       }
  
        /*
-        * We're trying to get all the cpus to the average_load, so we don't
-        * want to push ourselves above the average load, nor do we wish to
-        * reduce the max loaded cpu below the average load, as either of these
-        * actions would just result in more rebalancing later, and ping-pong
-        * tasks around. Thus we look for the minimum possible imbalance.
-        * Negative imbalances (*we* are more loaded than anyone else) will
-        * be counted as no imbalance for these purposes -- we can't fix that
-        * by pulling tasks to us. Be careful of negative numbers as they'll
-        * appear as very large values with unsigned longs.
+        * OK, we don't have enough imbalance to justify moving tasks;
+        * however, we may be able to increase total CPU power used by
+        * moving them.
         */
-       if (max_load <= busiest_load_per_task)
-               goto out_balanced;
  
+       pwr_now += sds->busiest->__cpu_power *
+                       min(sds->busiest_load_per_task, sds->max_load);
+       pwr_now += sds->this->__cpu_power *
+                       min(sds->this_load_per_task, sds->this_load);
+       pwr_now /= SCHED_LOAD_SCALE;
+       /* Amount of load we'd subtract */
+       tmp = sg_div_cpu_power(sds->busiest,
+                       sds->busiest_load_per_task * SCHED_LOAD_SCALE);
+       if (sds->max_load > tmp)
+               pwr_move += sds->busiest->__cpu_power *
+                       min(sds->busiest_load_per_task, sds->max_load - tmp);
+       /* Amount of load we'd add */
+       if (sds->max_load * sds->busiest->__cpu_power <
+               sds->busiest_load_per_task * SCHED_LOAD_SCALE)
+               tmp = sg_div_cpu_power(sds->this,
+                       sds->max_load * sds->busiest->__cpu_power);
+       else
+               tmp = sg_div_cpu_power(sds->this,
+                       sds->busiest_load_per_task * SCHED_LOAD_SCALE);
+       pwr_move += sds->this->__cpu_power *
+                       min(sds->this_load_per_task, sds->this_load + tmp);
+       pwr_move /= SCHED_LOAD_SCALE;
+       /* Move if we gain throughput */
+       if (pwr_move > pwr_now)
+               *imbalance = sds->busiest_load_per_task;
+ }
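/*
 * Reading aid for the block above (descriptive only, no new logic):
 * pwr_now and pwr_move estimate the useful throughput of the two groups
 * before and after moving one task of weight busiest_load_per_task:
 *
 *	sum over {busiest, this} of
 *		__cpu_power * min(load_per_task, group_load) / SCHED_LOAD_SCALE
 *
 * pwr_now uses the current loads, pwr_move the loads with one task's worth
 * shifted from the busiest group to this group (scaled by each group's cpu
 * power via sg_div_cpu_power()).  The sub-task-sized imbalance is only
 * acted upon when that shift would increase the estimate.
 */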
+ /**
+  * calculate_imbalance - Calculate the amount of imbalance present within the
+  *                     groups of a given sched_domain during load balance.
+  * @sds: statistics of the sched_domain whose imbalance is to be calculated.
+  * @this_cpu: Cpu for which currently load balance is being performed.
+  * @imbalance: The variable to store the imbalance.
+  */
+ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
+               unsigned long *imbalance)
+ {
+       unsigned long max_pull;
        /*
         * In the presence of smp nice balancing, certain scenarios can have
         * max load less than avg load(as we skip the groups at or below
         * its cpu_power, while calculating max_load..)
         */
-       if (max_load < avg_load) {
+       if (sds->max_load < sds->avg_load) {
                *imbalance = 0;
-               goto small_imbalance;
+               return fix_small_imbalance(sds, this_cpu, imbalance);
        }
  
        /* Don't want to pull so many tasks that a group would go idle */
-       max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
+       max_pull = min(sds->max_load - sds->avg_load,
+                       sds->max_load - sds->busiest_load_per_task);
  
        /* How much load to actually move to equalise the imbalance */
-       *imbalance = min(max_pull * busiest->__cpu_power,
-                               (avg_load - this_load) * this->__cpu_power)
+       *imbalance = min(max_pull * sds->busiest->__cpu_power,
+               (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
                        / SCHED_LOAD_SCALE;
  
        /*
         * a think about bumping its value to force at least one task to be
         * moved
         */
-       if (*imbalance < busiest_load_per_task) {
-               unsigned long tmp, pwr_now, pwr_move;
-               unsigned int imbn;
- small_imbalance:
-               pwr_move = pwr_now = 0;
-               imbn = 2;
-               if (this_nr_running) {
-                       this_load_per_task /= this_nr_running;
-                       if (busiest_load_per_task > this_load_per_task)
-                               imbn = 1;
-               } else
-                       this_load_per_task = cpu_avg_load_per_task(this_cpu);
+       if (*imbalance < sds->busiest_load_per_task)
+               return fix_small_imbalance(sds, this_cpu, imbalance);
  
-               if (max_load - this_load + busiest_load_per_task >=
-                                       busiest_load_per_task * imbn) {
-                       *imbalance = busiest_load_per_task;
-                       return busiest;
-               }
+ }
+ /******* find_busiest_group() helpers end here *********************/
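/*
 * How the helpers above fit together (summary of this hunk; the driver,
 * find_busiest_group(), is defined next):
 *
 *	find_busiest_group()
 *	    update_sd_lb_stats()              - walks every group in the domain
 *	        update_sg_lb_stats()            - per-group load statistics
 *	        update_sd_power_savings_stats() - tracks group_min/group_leader
 *	    calculate_imbalance()
 *	        fix_small_imbalance()           - sub-task-sized imbalances
 *	    check_power_save_busiest_group()    - power-savings fallback when
 *	                                          no ordinary imbalance exists
 */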
  
-               /*
-                * OK, we don't have enough imbalance to justify moving tasks,
-                * however we may be able to increase total CPU power used by
-                * moving them.
-                */
+ /**
+  * find_busiest_group - Returns the busiest group within the sched_domain
+  * if there is an imbalance. If there isn't an imbalance, and
+  * the user has opted for power-savings, it returns a group whose
+  * CPUs can be put to idle by rebalancing those tasks elsewhere, if
+  * such a group exists.
+  *
+  * Also calculates the amount of weighted load which should be moved
+  * to restore balance.
+  *
+  * @sd: The sched_domain whose busiest group is to be returned.
+  * @this_cpu: The cpu for which load balancing is currently being performed.
+  * @imbalance: Variable which stores amount of weighted load which should
+  *            be moved to restore balance/put a group to idle.
+  * @idle: The idle status of this_cpu.
+  * @sd_idle: The idleness of sd
+  * @cpus: The set of CPUs under consideration for load-balancing.
+  * @balance: Pointer to a variable indicating if this_cpu
+  *    is the appropriate cpu to perform load balancing at this_level.
+  *
+  * Returns:   - the busiest group if imbalance exists.
+  *            - If no imbalance and user has opted for power-savings balance,
+  *               return the least loaded group whose CPUs can be
+  *               put to idle by rebalancing its tasks onto our group.
+  */
+ static struct sched_group *
+ find_busiest_group(struct sched_domain *sd, int this_cpu,
+                  unsigned long *imbalance, enum cpu_idle_type idle,
+                  int *sd_idle, const struct cpumask *cpus, int *balance)
+ {
+       struct sd_lb_stats sds;
  
-               pwr_now += busiest->__cpu_power *
-                               min(busiest_load_per_task, max_load);
-               pwr_now += this->__cpu_power *
-                               min(this_load_per_task, this_load);
-               pwr_now /= SCHED_LOAD_SCALE;
-               /* Amount of load we'd subtract */
-               tmp = sg_div_cpu_power(busiest,
-                               busiest_load_per_task * SCHED_LOAD_SCALE);
-               if (max_load > tmp)
-                       pwr_move += busiest->__cpu_power *
-                               min(busiest_load_per_task, max_load - tmp);
-               /* Amount of load we'd add */
-               if (max_load * busiest->__cpu_power <
-                               busiest_load_per_task * SCHED_LOAD_SCALE)
-                       tmp = sg_div_cpu_power(this,
-                                       max_load * busiest->__cpu_power);
-               else
-                       tmp = sg_div_cpu_power(this,
-                               busiest_load_per_task * SCHED_LOAD_SCALE);
-               pwr_move += this->__cpu_power *
-                               min(this_load_per_task, this_load + tmp);
-               pwr_move /= SCHED_LOAD_SCALE;
+       memset(&sds, 0, sizeof(sds));
  
-               /* Move if we gain throughput */
-               if (pwr_move > pwr_now)
-                       *imbalance = busiest_load_per_task;
-       }
+       /*
+        * Compute the various statistics relevant for load balancing at
+        * this level.
+        */
+       update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
+                                       balance, &sds);
+       /* Cases where imbalance does not exist from POV of this_cpu */
+       /* 1) this_cpu is not the appropriate cpu to perform load balancing
+        *    at this level.
+        * 2) There is no busy sibling group to pull from.
+        * 3) This group is the busiest group.
+        * 4) This group is busier than the average busyness at this
+        *    sched_domain.
+        * 5) The imbalance is within the specified limit.
+        * 6) Any rebalance would lead to ping-pong
+        */
+       if (balance && !(*balance))
+               goto ret;
  
-       return busiest;
+       if (!sds.busiest || sds.busiest_nr_running == 0)
+               goto out_balanced;
  
- out_balanced:
- #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-       if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
-               goto ret;
+       if (sds.this_load >= sds.max_load)
+               goto out_balanced;
  
-       if (this == group_leader && group_leader != group_min) {
-               *imbalance = min_load_per_task;
-               if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
-                       cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
-                               cpumask_first(sched_group_cpus(group_leader));
-               }
-               return group_min;
-       }
- #endif
+       sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
+       if (sds.this_load >= sds.avg_load)
+               goto out_balanced;
+       if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+               goto out_balanced;
+       sds.busiest_load_per_task /= sds.busiest_nr_running;
+       if (sds.group_imb)
+               sds.busiest_load_per_task =
+                       min(sds.busiest_load_per_task, sds.avg_load);
+       /*
+        * We're trying to get all the cpus to the average_load, so we don't
+        * want to push ourselves above the average load, nor do we wish to
+        * reduce the max loaded cpu below the average load, as either of these
+        * actions would just result in more rebalancing later, and ping-pong
+        * tasks around. Thus we look for the minimum possible imbalance.
+        * Negative imbalances (*we* are more loaded than anyone else) will
+        * be counted as no imbalance for these purposes -- we can't fix that
+        * by pulling tasks to us. Be careful of negative numbers as they'll
+        * appear as very large values with unsigned longs.
+        */
+       if (sds.max_load <= sds.busiest_load_per_task)
+               goto out_balanced;
+       /* Looks like there is an imbalance. Compute it */
+       calculate_imbalance(&sds, this_cpu, imbalance);
+       return sds.busiest;
+ out_balanced:
+       /*
+        * There is no obvious imbalance. But check if we can do some balancing
+        * to save power.
+        */
+       if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
+               return sds.busiest;
  ret:
        *imbalance = 0;
        return NULL;
@@@ -3510,19 -3818,23 +3880,23 @@@ find_busiest_queue(struct sched_group *
   */
  #define MAX_PINNED_INTERVAL   512
  
+ /* Working cpumask for load_balance and load_balance_newidle. */
+ static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
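/*
 * Background on the per-CPU scratch mask above (context, not part of the
 * diff): with CONFIG_CPUMASK_OFFSTACK a cpumask is too large for the stack,
 * and the old code had to alloc_cpumask_var()/free_cpumask_var() on every
 * rebalance attempt and silently bail out on allocation failure.  A static
 * per-CPU mask avoids that allocation on this hot path; its storage is
 * carved out of the bootmem block in sched_init() further down.
 */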
  /*
   * Check this_cpu to ensure it is balanced within domain. Attempt to move
   * tasks if there is an imbalance.
   */
  static int load_balance(int this_cpu, struct rq *this_rq,
                        struct sched_domain *sd, enum cpu_idle_type idle,
-                       int *balance, struct cpumask *cpus)
+                       int *balance)
  {
        int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
        struct sched_group *group;
        unsigned long imbalance;
        struct rq *busiest;
        unsigned long flags;
+       struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
  
        cpumask_setall(cpus);
  
@@@ -3677,8 -3989,7 +4051,7 @@@ out
   * this_rq is locked.
   */
  static int
- load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
-                       struct cpumask *cpus)
+ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
  {
        struct sched_group *group;
        struct rq *busiest = NULL;
        int ld_moved = 0;
        int sd_idle = 0;
        int all_pinned = 0;
+       struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
  
        cpumask_setall(cpus);
  
@@@ -3826,10 -4138,6 +4200,6 @@@ static void idle_balance(int this_cpu, 
        struct sched_domain *sd;
        int pulled_task = 0;
        unsigned long next_balance = jiffies + HZ;
-       cpumask_var_t tmpmask;
-       if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
-               return;
  
        for_each_domain(this_cpu, sd) {
                unsigned long interval;
                if (sd->flags & SD_BALANCE_NEWIDLE)
                        /* If we've pulled tasks over stop searching: */
                        pulled_task = load_balance_newidle(this_cpu, this_rq,
-                                                          sd, tmpmask);
+                                                          sd);
  
                interval = msecs_to_jiffies(sd->balance_interval);
                if (time_after(next_balance, sd->last_balance + interval))
                 */
                this_rq->next_balance = next_balance;
        }
-       free_cpumask_var(tmpmask);
  }
  
  /*
@@@ -4005,11 -4312,6 +4374,6 @@@ static void rebalance_domains(int cpu, 
        unsigned long next_balance = jiffies + 60*HZ;
        int update_next_balance = 0;
        int need_serialize;
-       cpumask_var_t tmp;
-       /* Fails alloc?  Rebalancing probably not a priority right now. */
-       if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
-               return;
  
        for_each_domain(cpu, sd) {
                if (!(sd->flags & SD_LOAD_BALANCE))
                }
  
                if (time_after_eq(jiffies, sd->last_balance + interval)) {
-                       if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
+                       if (load_balance(cpu, rq, sd, idle, &balance)) {
                                /*
                                 * We've pulled tasks over so either we're no
                                 * longer idle, or one of our SMT siblings is
@@@ -4068,8 -4370,6 +4432,6 @@@ out
         */
        if (likely(update_next_balance))
                rq->next_balance = next_balance;
-       free_cpumask_var(tmp);
  }
  
  /*
@@@ -4119,6 -4419,11 +4481,11 @@@ static void run_rebalance_domains(struc
  #endif
  }
  
+ static inline int on_null_domain(int cpu)
+ {
+       return !rcu_dereference(cpu_rq(cpu)->sd);
+ }
  /*
   * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
   *
@@@ -4176,7 -4481,9 +4543,9 @@@ static inline void trigger_load_balance
            cpumask_test_cpu(cpu, nohz.cpu_mask))
                return;
  #endif
-       if (time_after_eq(jiffies, rq->next_balance))
+       /* Don't need to rebalance while attached to NULL domain */
+       if (time_after_eq(jiffies, rq->next_balance) &&
+           likely(!on_null_domain(cpu)))
                raise_softirq(SCHED_SOFTIRQ);
  }
  
@@@ -4199,29 -4506,6 +4568,29 @@@ EXPORT_PER_CPU_SYMBOL(kstat)
   * Return any ns on the sched_clock that have not yet been banked in
   * @p in case that task is currently running.
   */
 +unsigned long long __task_delta_exec(struct task_struct *p, int update)
 +{
 +      s64 delta_exec;
 +      struct rq *rq;
 +
 +      rq = task_rq(p);
 +      WARN_ON_ONCE(!runqueue_is_locked());
 +      WARN_ON_ONCE(!task_current(rq, p));
 +
 +      if (update)
 +              update_rq_clock(rq);
 +
 +      delta_exec = rq->clock - p->se.exec_start;
 +
 +      WARN_ON_ONCE(delta_exec < 0);
 +
 +      return delta_exec;
 +}
 +
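/*
 * Minimal usage sketch (an assumption about the intended perf counter
 * caller, not code from this patch): the caller must satisfy the
 * WARN_ON_ONCE() conditions above, i.e. hold the runqueue lock while @p is
 * the task currently running on that runqueue.
 */
static inline u64 task_clock_sketch(struct task_struct *p)
{
	/* time already banked plus whatever the running task has accrued */
	return p->se.sum_exec_runtime + __task_delta_exec(p, 1);
}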
 +/*
 + * Return any ns on the sched_clock that have not yet been banked in
 + * @p in case that task is currently running.
 + */
  unsigned long long task_delta_exec(struct task_struct *p)
  {
        unsigned long flags;
@@@ -4481,7 -4765,6 +4850,7 @@@ void scheduler_tick(void
        update_rq_clock(rq);
        update_cpu_load(rq);
        curr->sched_class->task_tick(rq, curr, 0);
 +      perf_counter_task_tick(curr, cpu);
        spin_unlock(&rq->lock);
  
  #ifdef CONFIG_SMP
  #endif
  }
  
- #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
-                               defined(CONFIG_PREEMPT_TRACER))
- static inline unsigned long get_parent_ip(unsigned long addr)
+ unsigned long get_parent_ip(unsigned long addr)
  {
        if (in_lock_functions(addr)) {
                addr = CALLER_ADDR2;
        return addr;
  }
  
+ #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+                               defined(CONFIG_PREEMPT_TRACER))
  void __kprobes add_preempt_count(int val)
  {
  #ifdef CONFIG_DEBUG_PREEMPT
@@@ -4594,11 -4877,33 +4963,33 @@@ static inline void schedule_debug(struc
  #endif
  }
  
+ static void put_prev_task(struct rq *rq, struct task_struct *prev)
+ {
+       if (prev->state == TASK_RUNNING) {
+               u64 runtime = prev->se.sum_exec_runtime;
+               runtime -= prev->se.prev_sum_exec_runtime;
+               runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
+               /*
+                * In order to avoid avg_overlap growing stale when we are
+                * indeed overlapping and hence not getting put to sleep, grow
+                * the avg_overlap on preemption.
+                *
+                * We use the average preemption runtime because that
+                * correlates to the amount of cache footprint a task can
+                * build up.
+                */
+               update_avg(&prev->se.avg_overlap, runtime);
+       }
+       prev->sched_class->put_prev_task(rq, prev);
+ }
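/*
 * Note on the clamp above (background; update_avg() is defined elsewhere in
 * this file and is assumed to be an exponentially decaying average, roughly
 * *avg += (sample - *avg) / 8): feeding it the preemption runtime, capped at
 * twice the migration cost, keeps avg_overlap from growing stale for tasks
 * that keep running without ever sleeping.
 */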
  /*
   * Pick up the highest-prio task:
   */
  static inline struct task_struct *
- pick_next_task(struct rq *rq, struct task_struct *prev)
+ pick_next_task(struct rq *rq)
  {
        const struct sched_class *class;
        struct task_struct *p;
  /*
   * schedule() is the main scheduler function.
   */
- asmlinkage void __sched schedule(void)
+ asmlinkage void __sched __schedule(void)
  {
        struct task_struct *prev, *next;
        unsigned long *switch_count;
        struct rq *rq;
        int cpu;
  
- need_resched:
-       preempt_disable();
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
        rcu_qsctr_inc(cpu);
@@@ -4672,12 -4975,11 +5061,12 @@@ need_resched_nonpreemptible
        if (unlikely(!rq->nr_running))
                idle_balance(cpu, rq);
  
-       prev->sched_class->put_prev_task(rq, prev);
-       next = pick_next_task(rq, prev);
+       put_prev_task(rq, prev);
+       next = pick_next_task(rq);
  
        if (likely(prev != next)) {
                sched_info_switch(prev, next);
 +              perf_counter_task_sched_out(prev, cpu);
  
                rq->nr_switches++;
                rq->curr = next;
  
        if (unlikely(reacquire_kernel_lock(current) < 0))
                goto need_resched_nonpreemptible;
+ }
  
+ asmlinkage void __sched schedule(void)
+ {
+ need_resched:
+       preempt_disable();
+       __schedule();
        preempt_enable_no_resched();
        if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
                goto need_resched;
  }
  EXPORT_SYMBOL(schedule);
  
+ #ifdef CONFIG_SMP
+ /*
+  * Look out! "owner" is an entirely speculative pointer
+  * access and not reliable.
+  */
+ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
+ {
+       unsigned int cpu;
+       struct rq *rq;
+       if (!sched_feat(OWNER_SPIN))
+               return 0;
+ #ifdef CONFIG_DEBUG_PAGEALLOC
+       /*
+        * Need to access the cpu field knowing that
+        * DEBUG_PAGEALLOC could have unmapped it if
+        * the mutex owner just released it and exited.
+        */
+       if (probe_kernel_address(&owner->cpu, cpu))
+               goto out;
+ #else
+       cpu = owner->cpu;
+ #endif
+       /*
+        * Even if the access succeeded (likely case),
+        * the cpu field may no longer be valid.
+        */
+       if (cpu >= nr_cpumask_bits)
+               goto out;
+       /*
+        * We need to validate that we can do a
+        * get_cpu() and that we have the percpu area.
+        */
+       if (!cpu_online(cpu))
+               goto out;
+       rq = cpu_rq(cpu);
+       for (;;) {
+               /*
+                * Owner changed, break to re-assess state.
+                */
+               if (lock->owner != owner)
+                       break;
+               /*
+                * Is that owner really running on that cpu?
+                */
+               if (task_thread_info(rq->curr) != owner || need_resched())
+                       return 0;
+               cpu_relax();
+       }
+ out:
+       return 1;
+ }
+ #endif
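/*
 * Sketch of the intended caller (illustrative only; the real adaptive spin
 * loop lives in kernel/mutex.c and handles additional cases):
 */
static inline int mutex_spin_sketch(struct mutex *lock)
{
	for (;;) {
		struct thread_info *owner = ACCESS_ONCE(lock->owner);

		/* Stop spinning once the owner is no longer running. */
		if (owner && !mutex_spin_on_owner(lock, owner))
			return 0;

		/* Try to grab the lock; count == 1 means it is unlocked. */
		if (atomic_cmpxchg(&lock->count, 1, 0) == 1)
			return 1;

		if (need_resched())
			return 0;

		cpu_relax();
	}
}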
  #ifdef CONFIG_PREEMPT
  /*
   * this is the entry point to schedule() from in-kernel preemption
@@@ -4729,7 -5098,7 +5185,7 @@@ asmlinkage void __sched preempt_schedul
                 * between schedule and now.
                 */
                barrier();
-       } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
+       } while (need_resched());
  }
  EXPORT_SYMBOL(preempt_schedule);
  
@@@ -4758,7 -5127,7 +5214,7 @@@ asmlinkage void __sched preempt_schedul
                 * between schedule and now.
                 */
                barrier();
-       } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
+       } while (need_resched());
  }
  
  #endif /* CONFIG_PREEMPT */
@@@ -4819,11 -5188,17 +5275,17 @@@ void __wake_up_locked(wait_queue_head_
        __wake_up_common(q, mode, 1, 0, NULL);
  }
  
+ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+ {
+       __wake_up_common(q, mode, 1, 0, key);
+ }
  /**
-  * __wake_up_sync - wake up threads blocked on a waitqueue.
+  * __wake_up_sync_key - wake up threads blocked on a waitqueue.
   * @q: the waitqueue
   * @mode: which threads
   * @nr_exclusive: how many wake-one or wake-many threads to wake up
+  * @key: opaque value to be passed to wakeup targets
   *
   * The sync wakeup differs in that the waker knows that it will schedule
   * away soon, so while the target thread will be woken up, it will not
   *
   * On UP it can prevent extra preemption.
   */
- void
- __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
+                       int nr_exclusive, void *key)
  {
        unsigned long flags;
        int sync = 1;
                sync = 0;
  
        spin_lock_irqsave(&q->lock, flags);
-       __wake_up_common(q, mode, nr_exclusive, sync, NULL);
+       __wake_up_common(q, mode, nr_exclusive, sync, key);
        spin_unlock_irqrestore(&q->lock, flags);
  }
+ EXPORT_SYMBOL_GPL(__wake_up_sync_key);
+ /*
+  * __wake_up_sync - see __wake_up_sync_key()
+  */
+ void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+ {
+       __wake_up_sync_key(q, mode, nr_exclusive, NULL);
+ }
  EXPORT_SYMBOL_GPL(__wake_up_sync);    /* For internal use only */
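/*
 * Usage note (an assumption based on the keyed-wakeup work this interface
 * supports): callers such as epoll pass an event mask through @key, e.g.
 * via a wrapper along the lines of
 *
 *	wake_up_interruptible_sync_poll(&wq, POLLIN);
 *
 * which ends up as __wake_up_sync_key(&wq, TASK_INTERRUPTIBLE, 1,
 * (void *)POLLIN), letting waiters filter wakeups on the events they
 * actually care about.
 */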
  
  /**
@@@ -5232,7 -5616,7 +5703,7 @@@ SYSCALL_DEFINE1(nice, int, increment
        if (increment > 40)
                increment = 40;
  
-       nice = PRIO_TO_NICE(current->static_prio) + increment;
+       nice = TASK_NICE(current) + increment;
        if (nice < -20)
                nice = -20;
        if (nice > 19)
@@@ -6505,7 -6889,7 +6976,7 @@@ static void migrate_dead_tasks(unsigne
                if (!rq->nr_running)
                        break;
                update_rq_clock(rq);
-               next = pick_next_task(rq, rq->curr);
+               next = pick_next_task(rq);
                if (!next)
                        break;
                next->sched_class->put_prev_task(rq, next);
@@@ -7336,7 -7720,7 +7807,7 @@@ cpu_to_core_group(int cpu, const struc
  {
        int group;
  
-       cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+       cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
        group = cpumask_first(mask);
        if (sg)
                *sg = &per_cpu(sched_group_core, group).sg;
@@@ -7365,7 -7749,7 +7836,7 @@@ cpu_to_phys_group(int cpu, const struc
        cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
        group = cpumask_first(mask);
  #elif defined(CONFIG_SCHED_SMT)
-       cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+       cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
        group = cpumask_first(mask);
  #else
        group = cpu;
@@@ -7708,7 -8092,7 +8179,7 @@@ static int __build_sched_domains(const 
                SD_INIT(sd, SIBLING);
                set_domain_attribute(sd, attr);
                cpumask_and(sched_domain_span(sd),
-                           &per_cpu(cpu_sibling_map, i), cpu_map);
+                           topology_thread_cpumask(i), cpu_map);
                sd->parent = p;
                p->child = sd;
                cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
        /* Set up CPU (sibling) groups */
        for_each_cpu(i, cpu_map) {
                cpumask_and(this_sibling_map,
-                           &per_cpu(cpu_sibling_map, i), cpu_map);
+                           topology_thread_cpumask(i), cpu_map);
                if (i != cpumask_first(this_sibling_map))
                        continue;
  
@@@ -8300,11 -8684,15 +8771,15 @@@ static void init_rt_rq(struct rt_rq *rt
        __set_bit(MAX_RT_PRIO, array->bitmap);
  
  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-       rt_rq->highest_prio = MAX_RT_PRIO;
+       rt_rq->highest_prio.curr = MAX_RT_PRIO;
+ #ifdef CONFIG_SMP
+       rt_rq->highest_prio.next = MAX_RT_PRIO;
+ #endif
  #endif
  #ifdef CONFIG_SMP
        rt_rq->rt_nr_migratory = 0;
        rt_rq->overloaded = 0;
+       plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
  #endif
  
        rt_rq->rt_time = 0;
@@@ -8391,6 -8779,9 +8866,9 @@@ void __init sched_init(void
  #ifdef CONFIG_USER_SCHED
        alloc_size *= 2;
  #endif
+ #ifdef CONFIG_CPUMASK_OFFSTACK
+       alloc_size += num_possible_cpus() * cpumask_size();
+ #endif
        /*
         * As sched_init() is called before page_alloc is set up,
         * we use alloc_bootmem().
                ptr += nr_cpu_ids * sizeof(void **);
  #endif /* CONFIG_USER_SCHED */
  #endif /* CONFIG_RT_GROUP_SCHED */
+ #ifdef CONFIG_CPUMASK_OFFSTACK
+               for_each_possible_cpu(i) {
+                       per_cpu(load_balance_tmpmask, i) = (void *)ptr;
+                       ptr += cpumask_size();
+               }
+ #endif /* CONFIG_CPUMASK_OFFSTACK */
        }
  
  #ifdef CONFIG_SMP
@@@ -9572,7 -9969,7 +10056,7 @@@ cpuacct_destroy(struct cgroup_subsys *s
  
  static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
  {
-       u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+       u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
        u64 data;
  
  #ifndef CONFIG_64BIT
  
  static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
  {
-       u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+       u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
  
  #ifndef CONFIG_64BIT
        /*
@@@ -9680,14 -10077,14 +10164,14 @@@ static void cpuacct_charge(struct task_
        struct cpuacct *ca;
        int cpu;
  
-       if (!cpuacct_subsys.active)
+       if (unlikely(!cpuacct_subsys.active))
                return;
  
        cpu = task_cpu(tsk);
        ca = task_ca(tsk);
  
        for (; ca; ca = ca->parent) {
-               u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+               u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
                *cpuusage += cputime;
        }
  }
diff --combined kernel/sys.c
@@@ -14,7 -14,6 +14,7 @@@
  #include <linux/prctl.h>
  #include <linux/highuid.h>
  #include <linux/fs.h>
 +#include <linux/perf_counter.h>
  #include <linux/resource.h>
  #include <linux/kernel.h>
  #include <linux/kexec.h>
@@@ -35,6 -34,7 +35,7 @@@
  #include <linux/seccomp.h>
  #include <linux/cpu.h>
  #include <linux/ptrace.h>
+ #include <linux/fs_struct.h>
  
  #include <linux/compat.h>
  #include <linux/syscalls.h>
@@@ -1014,10 -1014,8 +1015,8 @@@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pi
        if (err)
                goto out;
  
-       if (task_pgrp(p) != pgrp) {
+       if (task_pgrp(p) != pgrp)
                change_pid(p, PIDTYPE_PGID, pgrp);
-               set_task_pgrp(p, pid_nr(pgrp));
-       }
  
        err = 0;
  out:
@@@ -1801,12 -1799,6 +1800,12 @@@ SYSCALL_DEFINE5(prctl, int, option, uns
                case PR_SET_TSC:
                        error = SET_TSC_CTL(arg2);
                        break;
 +              case PR_TASK_PERF_COUNTERS_DISABLE:
 +                      error = perf_counter_task_disable();
 +                      break;
 +              case PR_TASK_PERF_COUNTERS_ENABLE:
 +                      error = perf_counter_task_enable();
 +                      break;
                case PR_GET_TIMERSLACK:
                        error = current->timer_slack_ns;
                        break;