Merge branch 'linus' into perfcounters/core-v2
author Ingo Molnar <mingo@elte.hu>
Mon, 6 Apr 2009 07:02:57 +0000 (09:02 +0200)
committer Ingo Molnar <mingo@elte.hu>
Mon, 6 Apr 2009 07:02:57 +0000 (09:02 +0200)
Merge reason: we have accumulated quite a few conflicts and need to merge upstream

Conflicts:
arch/powerpc/kernel/Makefile
arch/x86/ia32/ia32entry.S
arch/x86/include/asm/hardirq.h
arch/x86/include/asm/unistd_32.h
arch/x86/include/asm/unistd_64.h
arch/x86/kernel/cpu/common.c
arch/x86/kernel/irq.c
arch/x86/kernel/syscall_table_32.S
arch/x86/mm/iomap_32.c
include/linux/sched.h
kernel/Makefile

Signed-off-by: Ingo Molnar <mingo@elte.hu>
38 files changed:
arch/powerpc/include/asm/hw_irq.h
arch/powerpc/include/asm/systbl.h
arch/powerpc/kernel/Makefile
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kernel/entry_64.S
arch/powerpc/kernel/irq.c
arch/powerpc/platforms/Kconfig.cputype
arch/x86/Kconfig
arch/x86/ia32/ia32entry.S
arch/x86/include/asm/hardirq.h
arch/x86/include/asm/hw_irq.h
arch/x86/include/asm/thread_info.h
arch/x86/include/asm/unistd_32.h
arch/x86/include/asm/unistd_64.h
arch/x86/kernel/apic/apic.c
arch/x86/kernel/cpu/Makefile
arch/x86/kernel/cpu/amd.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/entry_64.S
arch/x86/kernel/irq.c
arch/x86/kernel/irqinit_32.c
arch/x86/kernel/irqinit_64.c
arch/x86/kernel/signal.c
arch/x86/kernel/syscall_table_32.S
arch/x86/kernel/traps.c
drivers/acpi/processor_idle.c
drivers/char/sysrq.c
fs/exec.c
include/linux/init_task.h
include/linux/kernel_stat.h
include/linux/sched.h
include/linux/syscalls.h
init/Kconfig
kernel/Makefile
kernel/exit.c
kernel/fork.c
kernel/sched.c
kernel/sys.c

@@@ -129,38 -129,7 +129,38 @@@ static inline int irqs_disabled_flags(u
   * interrupt-retrigger: should we handle this via lost interrupts and IPIs
   * or should we not care like we do now ? --BenH.
   */
- struct hw_interrupt_type;
+ struct irq_chip;
  
 +#ifdef CONFIG_PERF_COUNTERS
 +static inline unsigned long get_perf_counter_pending(void)
 +{
 +      unsigned long x;
 +
 +      asm volatile("lbz %0,%1(13)"
 +              : "=r" (x)
 +              : "i" (offsetof(struct paca_struct, perf_counter_pending)));
 +      return x;
 +}
 +
 +static inline void set_perf_counter_pending(int x)
 +{
 +      asm volatile("stb %0,%1(13)" : :
 +              "r" (x),
 +              "i" (offsetof(struct paca_struct, perf_counter_pending)));
 +}
 +
 +extern void perf_counter_do_pending(void);
 +
 +#else
 +
 +static inline unsigned long get_perf_counter_pending(void)
 +{
 +      return 0;
 +}
 +
 +static inline void set_perf_counter_pending(int x) {}
 +static inline void perf_counter_do_pending(void) {}
 +#endif /* CONFIG_PERF_COUNTERS */
 +
  #endif        /* __KERNEL__ */
  #endif        /* _ASM_POWERPC_HW_IRQ_H */
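
The two PACA accessors added above are one half of a deferred-work pattern: code running in interrupt context (e.g. on a PMU overflow) sets a per-CPU "perf counter pending" byte with a single store, and the flag is replayed the next time powerpc's soft-disabled interrupts are logically re-enabled (see the get_perf_counter_pending() check added to raw_local_irq_restore() later in this merge). Below is a minimal, generic C sketch of that pattern, assuming nothing beyond C11 atomics; the names (perf_work_pending, local_irq_restore_sketch, and so on) are illustrative only, not kernel APIs.

/*
 * Sketch only -- not kernel code: a per-CPU/per-thread pending flag,
 * set from a context that cannot take locks and drained when the
 * consumer re-enables interrupts.
 */
#include <stdatomic.h>
#include <stdio.h>

static _Atomic int perf_work_pending;   /* stands in for paca->perf_counter_pending */

static void set_perf_work_pending(void)
{
        /* e.g. called from a PMU overflow interrupt */
        atomic_store_explicit(&perf_work_pending, 1, memory_order_release);
}

static void do_pending_perf_work(void)
{
        printf("draining deferred perf-counter work\n");
}

static void local_irq_restore_sketch(int enable)
{
        if (enable &&
            atomic_exchange_explicit(&perf_work_pending, 0, memory_order_acq_rel))
                do_pending_perf_work();
        /* ... then actually re-enable interrupts ... */
}

int main(void)
{
        set_perf_work_pending();        /* pretend a counter overflowed */
        local_irq_restore_sketch(1);    /* the flag is replayed on re-enable */
        return 0;
}

The real powerpc code keeps the flag in the PACA so it can be read with a single lbz off r13 even from assembly, which is why asm-offsets.c gains a PACAPERFPEND define further down in this merge.
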
@@@ -65,7 -65,7 +65,7 @@@ SYSCALL(ni_syscall
  SYSX(sys_ni_syscall,sys_olduname, sys_olduname)
  COMPAT_SYS_SPU(umask)
  SYSCALL_SPU(chroot)
SYSCALL(ustat)
COMPAT_SYS(ustat)
  SYSCALL_SPU(dup2)
  SYSCALL_SPU(getppid)
  SYSCALL_SPU(getpgrp)
@@@ -322,4 -322,3 +322,4 @@@ SYSCALL_SPU(epoll_create1
  SYSCALL_SPU(dup3)
  SYSCALL_SPU(pipe2)
  SYSCALL(inotify_init1)
 +SYSCALL_SPU(perf_counter_open)
@@@ -18,12 -18,10 +18,10 @@@ CFLAGS_REMOVE_cputable.o = -pg -mno-sch
  CFLAGS_REMOVE_prom_init.o = -pg -mno-sched-epilog
  CFLAGS_REMOVE_btext.o = -pg -mno-sched-epilog
  CFLAGS_REMOVE_prom.o = -pg -mno-sched-epilog
- ifdef CONFIG_DYNAMIC_FTRACE
- # dynamic ftrace setup.
+ # do not trace tracer code
  CFLAGS_REMOVE_ftrace.o = -pg -mno-sched-epilog
- endif
+ # timers used by tracing
+ CFLAGS_REMOVE_time.o = -pg -mno-sched-epilog
  endif
  
  obj-y                         := cputable.o ptrace.o syscalls.o \
@@@ -61,6 -59,7 +59,7 @@@ obj-$(CONFIG_HIBERNATION)     += swsusp.o s
  obj64-$(CONFIG_HIBERNATION)   += swsusp_asm64.o
  obj-$(CONFIG_MODULES)         += module.o module_$(CONFIG_WORD_SIZE).o
  obj-$(CONFIG_44x)             += cpu_setup_44x.o
+ obj-$(CONFIG_FSL_BOOKE)               += cpu_setup_fsl_booke.o dbell.o
  
  extra-$(CONFIG_PPC_STD_MMU)   := head_32.o
  extra-$(CONFIG_PPC64)         := head_64.o
@@@ -76,7 -75,7 +75,7 @@@ obj-y                         += time.o prom.o traps.o setup
  obj-$(CONFIG_PPC32)           += entry_32.o setup_32.o
  obj-$(CONFIG_PPC64)           += dma-iommu.o iommu.o
  obj-$(CONFIG_KGDB)            += kgdb.o
- obj-$(CONFIG_PPC_MULTIPLATFORM)       += prom_init.o
+ obj-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE)  += prom_init.o
  obj-$(CONFIG_MODULES)         += ppc_ksyms.o
  obj-$(CONFIG_BOOTX_TEXT)      += btext.o
  obj-$(CONFIG_SMP)             += smp.o
@@@ -94,8 -93,7 +93,9 @@@ obj-$(CONFIG_AUDIT)           += audit.
  obj64-$(CONFIG_AUDIT)         += compat_audit.o
  
  obj-$(CONFIG_DYNAMIC_FTRACE)  += ftrace.o
+ obj-$(CONFIG_FUNCTION_GRAPH_TRACER)   += ftrace.o
 +obj-$(CONFIG_PERF_COUNTERS)   += perf_counter.o power4-pmu.o ppc970-pmu.o \
 +                                 power5-pmu.o power5+-pmu.o power6-pmu.o
  
  obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
  
@@@ -49,7 -49,7 +49,7 @@@
  #include <asm/iseries/alpaca.h>
  #endif
  #ifdef CONFIG_KVM
- #include <asm/kvm_44x.h>
+ #include <linux/kvm_host.h>
  #endif
  
  #if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
@@@ -131,7 -131,6 +131,7 @@@ int main(void
        DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr));
        DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled));
        DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled));
 +      DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_counter_pending));
        DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache));
        DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr));
        DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
  #endif /* ! CONFIG_PPC64 */
  
        /* About the CPU features table */
-       DEFINE(CPU_SPEC_ENTRY_SIZE, sizeof(struct cpu_spec));
-       DEFINE(CPU_SPEC_PVR_MASK, offsetof(struct cpu_spec, pvr_mask));
-       DEFINE(CPU_SPEC_PVR_VALUE, offsetof(struct cpu_spec, pvr_value));
        DEFINE(CPU_SPEC_FEATURES, offsetof(struct cpu_spec, cpu_features));
        DEFINE(CPU_SPEC_SETUP, offsetof(struct cpu_spec, cpu_setup));
        DEFINE(CPU_SPEC_RESTORE, offsetof(struct cpu_spec, cpu_restore));
        DEFINE(PTE_SIZE, sizeof(pte_t));
  
  #ifdef CONFIG_KVM
-       DEFINE(TLBE_BYTES, sizeof(struct kvmppc_44x_tlbe));
        DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack));
        DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
        DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
@@@ -526,15 -526,6 +526,15 @@@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE
  2:
        TRACE_AND_RESTORE_IRQ(r5);
  
 +#ifdef CONFIG_PERF_COUNTERS
 +      /* check paca->perf_counter_pending if we're enabling ints */
 +      lbz     r3,PACAPERFPEND(r13)
 +      and.    r3,r3,r5
 +      beq     27f
 +      bl      .perf_counter_do_pending
 +27:
 +#endif /* CONFIG_PERF_COUNTERS */
 +
        /* extract EE bit and use it to restore paca->hard_enabled */
        ld      r3,_MSR(r1)
        rldicl  r4,r3,49,63             /* r0 = (r3 >> 15) & 1 */
@@@ -917,6 -908,12 +917,12 @@@ _GLOBAL(ftrace_caller
  ftrace_call:
        bl      ftrace_stub
        nop
+ #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ .globl ftrace_graph_call
+ ftrace_graph_call:
+       b       ftrace_graph_stub
+ _GLOBAL(ftrace_graph_stub)
+ #endif
        ld      r0, 128(r1)
        mtlr    r0
        addi    r1, r1, 112
@@@ -940,13 -937,90 +946,90 @@@ _GLOBAL(_mcount
        ld      r5,0(r5)
        mtctr   r5
        bctrl
        nop
+ #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       b       ftrace_graph_caller
+ #endif
        ld      r0, 128(r1)
        mtlr    r0
        addi    r1, r1, 112
  _GLOBAL(ftrace_stub)
        blr
  
- #endif
- #endif
+ #endif /* CONFIG_DYNAMIC_FTRACE */
+ #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ _GLOBAL(ftrace_graph_caller)
+       /* load r4 with local address */
+       ld      r4, 128(r1)
+       subi    r4, r4, MCOUNT_INSN_SIZE
+       /* get the parent address */
+       ld      r11, 112(r1)
+       addi    r3, r11, 16
+       bl      .prepare_ftrace_return
+       nop
+       ld      r0, 128(r1)
+       mtlr    r0
+       addi    r1, r1, 112
+       blr
+ _GLOBAL(return_to_handler)
+       /* need to save return values */
+       std     r4,  -24(r1)
+       std     r3,  -16(r1)
+       std     r31, -8(r1)
+       mr      r31, r1
+       stdu    r1, -112(r1)
+       bl      .ftrace_return_to_handler
+       nop
+       /* return value has real return address */
+       mtlr    r3
+       ld      r1, 0(r1)
+       ld      r4,  -24(r1)
+       ld      r3,  -16(r1)
+       ld      r31, -8(r1)
+       /* Jump back to real return address */
+       blr
+ _GLOBAL(mod_return_to_handler)
+       /* need to save return values */
+       std     r4,  -32(r1)
+       std     r3,  -24(r1)
+       /* save TOC */
+       std     r2,  -16(r1)
+       std     r31, -8(r1)
+       mr      r31, r1
+       stdu    r1, -112(r1)
+       /*
+        * We are in a module using the module's TOC.
+        * Switch to our TOC to run inside the core kernel.
+        */
+       LOAD_REG_IMMEDIATE(r4,ftrace_return_to_handler)
+       ld      r2, 8(r4)
+       bl      .ftrace_return_to_handler
+       nop
+       /* return value has real return address */
+       mtlr    r3
+       ld      r1, 0(r1)
+       ld      r4,  -32(r1)
+       ld      r3,  -24(r1)
+       ld      r2,  -16(r1)
+       ld      r31, -8(r1)
+       /* Jump back to real return address */
+       blr
+ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+ #endif /* CONFIG_FUNCTION_TRACER */
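
The ftrace_graph_caller / return_to_handler assembly above is the powerpc wiring for the function-graph tracer's return hook: prepare_ftrace_return() saves the real parent return address and redirects the traced function's return into return_to_handler, which calls ftrace_return_to_handler() to log the exit and recover the saved address (mod_return_to_handler additionally switches to the core kernel's TOC first, since the traced code was using a module's TOC). The user-space C sketch below mimics that control flow with an explicit shadow stack and indirect calls instead of patched return addresses; every name in it is illustrative, not a kernel symbol.

#include <stdio.h>

typedef void (*retaddr_t)(void);

static retaddr_t shadow_stack[16];      /* saved real return targets */
static int shadow_top;

static void caller_resume(void)
{
        printf("resumed in the original caller\n");
}

/* Plays the role of return_to_handler + ftrace_return_to_handler(). */
static void return_trampoline(void)
{
        retaddr_t real = shadow_stack[--shadow_top];

        printf("graph trace: exit hook fired\n");
        real();                         /* hand control back to the real target */
}

/* Plays the role of prepare_ftrace_return(): divert the return path. */
static retaddr_t hook_return(retaddr_t real)
{
        printf("graph trace: entry hook, diverting return\n");
        shadow_stack[shadow_top++] = real;
        return return_trampoline;
}

static void traced_function(retaddr_t ret)
{
        ret = hook_return(ret);         /* what the entry-side hook arranges */
        printf("traced function body runs\n");
        ret();                          /* the "return" goes through the trampoline */
}

int main(void)
{
        traced_function(caller_resume);
        return 0;
}
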
@@@ -104,13 -104,6 +104,13 @@@ static inline notrace void set_soft_ena
        : : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled)));
  }
  
 +#ifdef CONFIG_PERF_COUNTERS
 +notrace void __weak perf_counter_do_pending(void)
 +{
 +      set_perf_counter_pending(0);
 +}
 +#endif
 +
  notrace void raw_local_irq_restore(unsigned long en)
  {
        /*
                        iseries_handle_interrupts();
        }
  
 +      if (get_perf_counter_pending())
 +              perf_counter_do_pending();
 +
        /*
         * if (get_paca()->hard_enabled) return;
         * But again we need to take care that gcc gets hard_enabled directly
@@@ -181,7 -171,7 +181,7 @@@ int show_interrupts(struct seq_file *p
  {
        int i = *(loff_t *)v, j;
        struct irqaction *action;
-       irq_desc_t *desc;
+       struct irq_desc *desc;
        unsigned long flags;
  
        if (i == 0) {
                seq_printf(p, "%3d: ", i);
  #ifdef CONFIG_SMP
                for_each_online_cpu(j)
-                       seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+                       seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
  #else
                seq_printf(p, "%10u ", kstat_irqs(i));
  #endif /* CONFIG_SMP */
@@@ -1048,7 -1038,7 +1048,7 @@@ arch_initcall(irq_late_init)
  static int virq_debug_show(struct seq_file *m, void *private)
  {
        unsigned long flags;
-       irq_desc_t *desc;
+       struct irq_desc *desc;
        const char *p;
        char none[] = "none";
        int i;
@@@ -1,7 -1,6 +1,7 @@@
  config PPC64
        bool "64-bit kernel"
        default n
 +      select HAVE_PERF_COUNTERS
        help
          This option selects whether a 32-bit or a 64-bit kernel
          will be built.
@@@ -58,9 -57,17 +58,17 @@@ config E20
  
  endchoice
  
+ # Until we have a choice of exclusive CPU types on 64-bit, we always
+ # use PPC_BOOK3S. On 32-bit, this is equivalent to 6xx which is
+ # "classic" MMU
+ config PPC_BOOK3S
+        def_bool y
+        depends on PPC64 || 6xx
  config POWER4_ONLY
        bool "Optimize for POWER4"
-       depends on PPC64
+       depends on PPC64 && PPC_BOOK3S
        default n
        ---help---
          Cause the compiler to optimize for POWER4/POWER5/PPC970 processors.
  
  config POWER3
        bool
-       depends on PPC64
+       depends on PPC64 && PPC_BOOK3S
        default y if !POWER4_ONLY
  
  config POWER4
-       depends on PPC64
+       depends on PPC64 && PPC_BOOK3S
        def_bool y
  
  config TUNE_CELL
        bool "Optimize for Cell Broadband Engine"
-       depends on PPC64
+       depends on PPC64 && PPC_BOOK3S
        help
          Cause the compiler to optimize for the PPE of the Cell Broadband
          Engine. This will make the code run considerably faster on Cell
@@@ -148,7 -155,7 +156,7 @@@ config PHYS_64BI
  
  config ALTIVEC
        bool "AltiVec Support"
-       depends on CLASSIC32 || POWER4
+       depends on 6xx || POWER4
        ---help---
          This option enables kernel support for the Altivec extensions to the
          PowerPC processor. The kernel currently supports saving and restoring
@@@ -211,6 -218,10 +219,10 @@@ config PPC_MMU_NOHAS
        def_bool y
        depends on !PPC_STD_MMU
  
+ config PPC_BOOK3E_MMU
+       def_bool y
+       depends on FSL_BOOKE
  config PPC_MM_SLICES
        bool
        default y if HUGETLB_PAGE || (PPC_STD_MMU_64 && PPC_64K_PAGES)
diff --combined arch/x86/Kconfig
@@@ -34,12 -34,18 +34,18 @@@ config X8
        select HAVE_FUNCTION_TRACER
        select HAVE_FUNCTION_GRAPH_TRACER
        select HAVE_FUNCTION_TRACE_MCOUNT_TEST
+       select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE
+       select HAVE_FTRACE_SYSCALLS
        select HAVE_KVM
        select HAVE_ARCH_KGDB
        select HAVE_ARCH_TRACEHOOK
        select HAVE_GENERIC_DMA_COHERENT if X86_32
        select HAVE_EFFICIENT_UNALIGNED_ACCESS
        select USER_STACKTRACE_SUPPORT
+       select HAVE_DMA_API_DEBUG
+       select HAVE_KERNEL_GZIP
+       select HAVE_KERNEL_BZIP2
+       select HAVE_KERNEL_LZMA
  
  config ARCH_DEFCONFIG
        string
@@@ -135,6 -141,9 +141,9 @@@ config ARCH_HAS_CACHE_LINE_SIZ
  config HAVE_SETUP_PER_CPU_AREA
        def_bool y
  
+ config HAVE_DYNAMIC_PER_CPU_AREA
+       def_bool y
  config HAVE_CPUMASK_OF_CPU_MAP
        def_bool X86_64_SMP
  
@@@ -158,11 -167,17 +167,17 @@@ config AUDIT_ARC
  config ARCH_SUPPORTS_OPTIMIZED_INLINING
        def_bool y
  
+ config ARCH_SUPPORTS_DEBUG_PAGEALLOC
+       def_bool y
  # Use the generic interrupt handling code in kernel/irq/:
  config GENERIC_HARDIRQS
        bool
        default y
  
+ config GENERIC_HARDIRQS_NO__DO_IRQ
+        def_bool y
  config GENERIC_IRQ_PROBE
        bool
        default y
@@@ -712,7 -727,6 +727,7 @@@ config X86_UP_IOAPI
  config X86_LOCAL_APIC
        def_bool y
        depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
 +      select HAVE_PERF_COUNTERS if (!M386 && !M486)
  
  config X86_IO_APIC
        def_bool y
@@@ -778,6 -792,11 +793,11 @@@ config X86_MCE_AM
           Additional support for AMD specific MCE features such as
           the DRAM Error Threshold.
  
+ config X86_MCE_THRESHOLD
+       depends on X86_MCE_AMD || X86_MCE_INTEL
+       bool
+       default y
  config X86_MCE_NONFATAL
        tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
        depends on X86_32 && X86_MCE
@@@ -921,6 -940,12 +941,12 @@@ config X86_CPUI
          with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
          /dev/cpu/31/cpuid.
  
+ config X86_CPU_DEBUG
+       tristate "/sys/kernel/debug/x86/cpu/* - CPU Debug support"
+       ---help---
+         If you select this option, this will provide various x86 CPUs
+         information through debugfs.
  choice
        prompt "High Memory Support"
        default HIGHMEM4G if !X86_NUMAQ
@@@ -1113,7 -1138,7 +1139,7 @@@ config NUMA_EM
  
  config NODES_SHIFT
        int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP
-       range 1 9   if X86_64
+       range 1 9
        default "9" if MAXSMP
        default "6" if X86_64
        default "4" if X86_NUMAQ
        depends on NEED_MULTIPLE_NODES
        ---help---
          Specify the maximum number of NUMA Nodes available on the target
-         system.  Increases memory reserved to accomodate various tables.
+         system.  Increases memory reserved to accommodate various tables.
  
- config HAVE_ARCH_BOOTMEM_NODE
+ config HAVE_ARCH_BOOTMEM
        def_bool y
        depends on X86_32 && NUMA
  
@@@ -1299,7 -1324,7 +1325,7 @@@ config MTRR_SANITIZE
          add writeback entries.
  
          Can be disabled with disable_mtrr_cleanup on the kernel command line.
-         The largest mtrr entry size for a continous block can be set with
+         The largest mtrr entry size for a continuous block can be set with
          mtrr_chunk_size.
  
          If unsure, say Y.
@@@ -1421,7 -1446,7 +1447,7 @@@ config CRASH_DUM
  config KEXEC_JUMP
        bool "kexec jump (EXPERIMENTAL)"
        depends on EXPERIMENTAL
-       depends on KEXEC && HIBERNATION && X86_32
+       depends on KEXEC && HIBERNATION
        ---help---
          Jump between original kernel and kexeced kernel and invoke
          code in physical address mode via KEXEC
@@@ -1814,8 -1839,8 +1840,8 @@@ config PCI_MMCONFI
  
  config DMAR
        bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
-       depends on X86_64 && PCI_MSI && ACPI && EXPERIMENTAL
-       ---help---
+       depends on PCI_MSI && ACPI && EXPERIMENTAL
+       help
          DMA remapping (DMAR) devices support enables independent address
          translations for Direct Memory Access (DMA) from devices.
          These DMA remapping devices are reported via ACPI tables
@@@ -557,7 -557,7 +557,7 @@@ ia32_sys_call_table
        .quad sys32_olduname
        .quad sys_umask         /* 60 */
        .quad sys_chroot
-       .quad sys32_ustat
+       .quad compat_sys_ustat
        .quad sys_dup2
        .quad sys_getppid
        .quad sys_getpgrp               /* 65 */
        .quad compat_sys_signalfd4
        .quad sys_eventfd2
        .quad sys_epoll_create1
 -      .quad sys_dup3                  /* 330 */
 +      .quad sys_dup3                          /* 330 */
        .quad sys_pipe2
        .quad sys_inotify_init1
+       .quad compat_sys_preadv
+       .quad compat_sys_pwritev
 +      .quad sys_perf_counter_open
  ia32_syscall_end:
@@@ -12,7 -12,7 +12,8 @@@ typedef struct 
        unsigned int apic_timer_irqs;   /* arch dependent */
        unsigned int irq_spurious_count;
  #endif
+       unsigned int generic_irqs;      /* arch dependent */
 +      unsigned int apic_perf_irqs;
  #ifdef CONFIG_SMP
        unsigned int irq_resched_count;
        unsigned int irq_call_count;
  
  /* Interrupt handlers registered during init_IRQ */
  extern void apic_timer_interrupt(void);
+ extern void generic_interrupt(void);
  extern void error_interrupt(void);
 +extern void perf_counter_interrupt(void);
 +
  extern void spurious_interrupt(void);
  extern void thermal_interrupt(void);
  extern void reschedule_interrupt(void);
@@@ -83,7 -83,6 +83,7 @@@ struct thread_info 
  #define TIF_SYSCALL_AUDIT     7       /* syscall auditing active */
  #define TIF_SECCOMP           8       /* secure computing */
  #define TIF_MCE_NOTIFY                10      /* notify userspace of an MCE */
 +#define TIF_PERF_COUNTERS     11      /* notify perf counter work */
  #define TIF_NOTSC             16      /* TSC is not accessible in userland */
  #define TIF_IA32              17      /* 32bit process */
  #define TIF_FORK              18      /* ret_from_fork */
@@@ -95,6 -94,7 +95,7 @@@
  #define TIF_FORCED_TF         24      /* true if TF in eflags artificially */
  #define TIF_DEBUGCTLMSR               25      /* uses thread_struct.debugctlmsr */
  #define TIF_DS_AREA_MSR               26      /* uses thread_struct.ds_area_msr */
+ #define TIF_SYSCALL_FTRACE    27      /* for ftrace syscall instrumentation */
  
  #define _TIF_SYSCALL_TRACE    (1 << TIF_SYSCALL_TRACE)
  #define _TIF_NOTIFY_RESUME    (1 << TIF_NOTIFY_RESUME)
  #define _TIF_SYSCALL_AUDIT    (1 << TIF_SYSCALL_AUDIT)
  #define _TIF_SECCOMP          (1 << TIF_SECCOMP)
  #define _TIF_MCE_NOTIFY               (1 << TIF_MCE_NOTIFY)
 +#define _TIF_PERF_COUNTERS    (1 << TIF_PERF_COUNTERS)
  #define _TIF_NOTSC            (1 << TIF_NOTSC)
  #define _TIF_IA32             (1 << TIF_IA32)
  #define _TIF_FORK             (1 << TIF_FORK)
  #define _TIF_FORCED_TF                (1 << TIF_FORCED_TF)
  #define _TIF_DEBUGCTLMSR      (1 << TIF_DEBUGCTLMSR)
  #define _TIF_DS_AREA_MSR      (1 << TIF_DS_AREA_MSR)
+ #define _TIF_SYSCALL_FTRACE   (1 << TIF_SYSCALL_FTRACE)
  
  /* work to do in syscall_trace_enter() */
  #define _TIF_WORK_SYSCALL_ENTRY       \
-       (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | \
+       (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_FTRACE |  \
         _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP)
  
  /* work to do in syscall_trace_leave() */
  #define _TIF_WORK_SYSCALL_EXIT        \
-       (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP)
+       (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP |    \
+        _TIF_SYSCALL_FTRACE)
  
  /* work to do on interrupt/exception return */
  #define _TIF_WORK_MASK                                                        \
           _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU))
  
  /* work to do on any return to user space */
- #define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP)
+ #define _TIF_ALLWORK_MASK ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_FTRACE)
  
  /* Only used for 64 bit */
  #define _TIF_DO_NOTIFY_MASK                                           \
 -      (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
 +      (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERF_COUNTERS|_TIF_NOTIFY_RESUME)
  
  /* flags to check in __switch_to() */
  #define _TIF_WORK_CTXSW                                                       \
  #define __NR_dup3             330
  #define __NR_pipe2            331
  #define __NR_inotify_init1    332
+ #define __NR_preadv           333
+ #define __NR_pwritev          334
 +#define __NR_perf_counter_open        333
  
  #ifdef __KERNEL__
  
@@@ -653,8 -653,11 +653,12 @@@ __SYSCALL(__NR_dup3, sys_dup3
  __SYSCALL(__NR_pipe2, sys_pipe2)
  #define __NR_inotify_init1                    294
  __SYSCALL(__NR_inotify_init1, sys_inotify_init1)
 -
+ #define __NR_preadv                           295
+ __SYSCALL(__NR_preadv, sys_preadv)
+ #define __NR_pwritev                          296
+ __SYSCALL(__NR_pwritev, sys_pwritev)
 +#define __NR_perf_counter_open                295
 +__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
  
  #ifndef __NO_STUBS
  #define __ARCH_WANT_OLD_READDIR
@@@ -34,7 -34,6 +34,7 @@@
  #include <linux/smp.h>
  #include <linux/mm.h>
  
 +#include <asm/perf_counter.h>
  #include <asm/pgalloc.h>
  #include <asm/atomic.h>
  #include <asm/mpspec.h>
@@@ -47,6 -46,7 +47,7 @@@
  #include <asm/idle.h>
  #include <asm/mtrr.h>
  #include <asm/smp.h>
+ #include <asm/mce.h>
  
  unsigned int num_processors;
  
@@@ -755,8 -755,6 +756,8 @@@ static void local_apic_timer_interrupt(
        inc_irq_stat(apic_timer_irqs);
  
        evt->event_handler(evt);
 +
 +      perf_counter_unthrottle();
  }
  
  /*
@@@ -811,7 -809,7 +812,7 @@@ void clear_local_APIC(void
        u32 v;
  
        /* APIC hasn't been mapped yet */
-       if (!apic_phys)
+       if (!x2apic && !apic_phys)
                return;
  
        maxlvt = lapic_get_maxlvt();
                apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
        }
  #endif
+ #ifdef CONFIG_X86_MCE_INTEL
+       if (maxlvt >= 6) {
+               v = apic_read(APIC_LVTCMCI);
+               if (!(v & APIC_LVT_MASKED))
+                       apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED);
+       }
+ #endif
        /*
         * Clean APIC state for other OSs:
         */
@@@ -1121,7 -1127,6 +1130,7 @@@ void __cpuinit setup_local_APIC(void
                apic_write(APIC_ESR, 0);
        }
  #endif
 +      perf_counters_lapic_init(0);
  
        preempt_disable();
  
        apic_write(APIC_LVT1, value);
  
        preempt_enable();
+ #ifdef CONFIG_X86_MCE_INTEL
+       /* Recheck CMCI information after local APIC is up on CPU #0 */
+       if (smp_processor_id() == 0)
+               cmci_recheck();
+ #endif
  }
  
  void __cpuinit end_local_APIC_setup(void)
@@@ -1323,15 -1334,16 +1338,16 @@@ void __init enable_IR_x2apic(void
                return;
        }
  
-       local_irq_save(flags);
-       mask_8259A();
-       ret = save_mask_IO_APIC_setup();
+       ret = save_IO_APIC_setup();
        if (ret) {
                pr_info("Saving IO-APIC state failed: %d\n", ret);
                goto end;
        }
  
+       local_irq_save(flags);
+       mask_IO_APIC_setup();
+       mask_8259A();
        ret = enable_intr_remapping(1);
  
        if (ret && x2apic_preenabled) {
@@@ -1356,10 -1368,10 +1372,10 @@@ end_restore
        else
                reinit_intr_remapped_IO_APIC(x2apic_preenabled);
  
        unmask_8259A();
        local_irq_restore(flags);
  
+ end:
        if (!ret) {
                if (!x2apic_preenabled)
                        pr_info("Enabled x2apic and interrupt-remapping\n");
@@@ -1512,12 -1524,10 +1528,10 @@@ void __init early_init_lapic_mapping(vo
   */
  void __init init_apic_mappings(void)
  {
- #ifdef CONFIG_X86_X2APIC
        if (x2apic) {
                boot_cpu_physical_apicid = read_apic_id();
                return;
        }
- #endif
  
        /*
         * If no local APIC can be found then set up a fake all
@@@ -1961,12 -1971,9 +1975,9 @@@ static int lapic_resume(struct sys_devi
  
        local_irq_save(flags);
  
- #ifdef CONFIG_X86_X2APIC
        if (x2apic)
                enable_x2apic();
-       else
- #endif
-       {
+       else {
                /*
                 * Make sure the APICBASE points to the right address
                 *
@@@ -1,5 -1,5 +1,5 @@@
  #
 -# Makefile for x86-compatible CPU details and quirks
 +# Makefile for x86-compatible CPU details, features and quirks
  #
  
  # Don't trace early stages of a secondary CPU boot
@@@ -14,21 -14,20 +14,22 @@@ obj-y                      += vmware.o hypervisor.
  obj-$(CONFIG_X86_32)  += bugs.o cmpxchg.o
  obj-$(CONFIG_X86_64)  += bugs_64.o
  
+ obj-$(CONFIG_X86_CPU_DEBUG)           += cpu_debug.o
  obj-$(CONFIG_CPU_SUP_INTEL)           += intel.o
  obj-$(CONFIG_CPU_SUP_AMD)             += amd.o
  obj-$(CONFIG_CPU_SUP_CYRIX_32)                += cyrix.o
- obj-$(CONFIG_CPU_SUP_CENTAUR_32)      += centaur.o
- obj-$(CONFIG_CPU_SUP_CENTAUR_64)      += centaur_64.o
+ obj-$(CONFIG_CPU_SUP_CENTAUR)         += centaur.o
  obj-$(CONFIG_CPU_SUP_TRANSMETA_32)    += transmeta.o
  obj-$(CONFIG_CPU_SUP_UMC_32)          += umc.o
  
 -obj-$(CONFIG_X86_MCE) += mcheck/
 -obj-$(CONFIG_MTRR)    += mtrr/
 -obj-$(CONFIG_CPU_FREQ)        += cpufreq/
 +obj-$(CONFIG_PERF_COUNTERS)           += perf_counter.o
  
 -obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
 +obj-$(CONFIG_X86_MCE)                 += mcheck/
 +obj-$(CONFIG_MTRR)                    += mtrr/
 +obj-$(CONFIG_CPU_FREQ)                        += cpufreq/
 +
 +obj-$(CONFIG_X86_LOCAL_APIC)          += perfctr-watchdog.o
  
  quiet_cmd_mkcapflags = MKCAP   $@
        cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
@@@ -5,6 -5,7 +5,7 @@@
  #include <asm/io.h>
  #include <asm/processor.h>
  #include <asm/apic.h>
+ #include <asm/cpu.h>
  
  #ifdef CONFIG_X86_64
  # include <asm/numa_64.h>
@@@ -141,6 -142,55 +142,55 @@@ static void __cpuinit init_amd_k6(struc
        }
  }
  
+ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
+ {
+ #ifdef CONFIG_SMP
+       /* calling is from identify_secondary_cpu() ? */
+       if (c->cpu_index == boot_cpu_id)
+               return;
+       /*
+        * Certain Athlons might work (for various values of 'work') in SMP
+        * but they are not certified as MP capable.
+        */
+       /* Athlon 660/661 is valid. */
+       if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
+           (c->x86_mask == 1)))
+               goto valid_k7;
+       /* Duron 670 is valid */
+       if ((c->x86_model == 7) && (c->x86_mask == 0))
+               goto valid_k7;
+       /*
+        * Athlon 662, Duron 671, and Athlon >model 7 have capability
+        * bit. It's worth noting that the A5 stepping (662) of some
+        * Athlon XP's have the MP bit set.
+        * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
+        * more.
+        */
+       if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
+           ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
+            (c->x86_model > 7))
+               if (cpu_has_mp)
+                       goto valid_k7;
+       /* If we get here, not a certified SMP capable AMD system. */
+       /*
+        * Don't taint if we are running SMP kernel on a single non-MP
+        * approved Athlon
+        */
+       WARN_ONCE(1, "WARNING: This combination of AMD"
+               " processors is not suitable for SMP.\n");
+       if (!test_taint(TAINT_UNSAFE_SMP))
+               add_taint(TAINT_UNSAFE_SMP);
+ valid_k7:
+       ;
+ #endif
+ }
  static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
  {
        u32 l, h;
        }
  
        set_cpu_cap(c, X86_FEATURE_K7);
+       amd_k7_smp_check(c);
  }
  #endif
  
@@@ -368,10 -420,6 +420,10 @@@ static void __cpuinit init_amd(struct c
        if (c->x86 >= 6)
                set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
  
 +      /* Enable Performance counter for K7 and later */
 +      if (c->x86 > 6 && c->x86 <= 0x11)
 +              set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
 +
        if (!c->x86_model_id[0]) {
                switch (c->x86) {
                case 0xf:
@@@ -454,7 -502,7 +506,7 @@@ static unsigned int __cpuinit amd_size_
  }
  #endif
  
- static struct cpu_dev amd_cpu_dev __cpuinitdata = {
+ static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
        .c_vendor       = "AMD",
        .c_ident        = { "AuthenticAMD" },
  #ifdef CONFIG_X86_32
@@@ -1,53 -1,50 +1,51 @@@
- #include <linux/init.h>
- #include <linux/kernel.h>
- #include <linux/sched.h>
- #include <linux/string.h>
  #include <linux/bootmem.h>
+ #include <linux/linkage.h>
  #include <linux/bitops.h>
+ #include <linux/kernel.h>
  #include <linux/module.h>
- #include <linux/kgdb.h>
- #include <linux/topology.h>
+ #include <linux/percpu.h>
+ #include <linux/string.h>
  #include <linux/delay.h>
+ #include <linux/sched.h>
+ #include <linux/init.h>
+ #include <linux/kgdb.h>
  #include <linux/smp.h>
- #include <linux/percpu.h>
- #include <asm/i387.h>
- #include <asm/msr.h>
- #include <asm/io.h>
- #include <asm/linkage.h>
+ #include <linux/io.h>
+ #include <asm/stackprotector.h>
++#include <asm/perf_counter.h>
  #include <asm/mmu_context.h>
+ #include <asm/hypervisor.h>
+ #include <asm/processor.h>
+ #include <asm/sections.h>
+ #include <asm/topology.h>
+ #include <asm/cpumask.h>
+ #include <asm/pgtable.h>
+ #include <asm/atomic.h>
+ #include <asm/proto.h>
+ #include <asm/setup.h>
+ #include <asm/apic.h>
+ #include <asm/desc.h>
+ #include <asm/i387.h>
  #include <asm/mtrr.h>
+ #include <asm/numa.h>
+ #include <asm/asm.h>
+ #include <asm/cpu.h>
  #include <asm/mce.h>
- #include <asm/perf_counter.h>
+ #include <asm/msr.h>
  #include <asm/pat.h>
- #include <asm/asm.h>
- #include <asm/numa.h>
  #include <asm/smp.h>
- #include <asm/cpu.h>
- #include <asm/cpumask.h>
- #include <asm/apic.h>
  
  #ifdef CONFIG_X86_LOCAL_APIC
  #include <asm/uv/uv.h>
  #endif
  
- #include <asm/pgtable.h>
- #include <asm/processor.h>
- #include <asm/desc.h>
- #include <asm/atomic.h>
- #include <asm/proto.h>
- #include <asm/sections.h>
- #include <asm/setup.h>
- #include <asm/hypervisor.h>
- #include <asm/stackprotector.h>
  #include "cpu.h"
  
- #ifdef CONFIG_X86_64
  /* all of these masks are initialized in setup_cpu_local_masks() */
- cpumask_var_t cpu_callin_mask;
- cpumask_var_t cpu_callout_mask;
  cpumask_var_t cpu_initialized_mask;
+ cpumask_var_t cpu_callout_mask;
+ cpumask_var_t cpu_callin_mask;
  
  /* representing cpus for which sibling maps can be computed */
  cpumask_var_t cpu_sibling_setup_mask;
@@@ -61,17 -58,7 +59,7 @@@ void __init setup_cpu_local_masks(void
        alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
  }
  
- #else /* CONFIG_X86_32 */
- cpumask_t cpu_callin_map;
- cpumask_t cpu_callout_map;
- cpumask_t cpu_initialized;
- cpumask_t cpu_sibling_setup_map;
- #endif /* CONFIG_X86_32 */
- static struct cpu_dev *this_cpu __cpuinitdata;
+ static const struct cpu_dev *this_cpu __cpuinitdata;
  
  DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
  #ifdef CONFIG_X86_64
         * IRET will check the segment types  kkeil 2000/10/28
         * Also sysret mandates a special GDT layout
         *
-        * The TLS descriptors are currently at a different place compared to i386.
+        * TLS descriptors are currently at a different place compared to i386.
         * Hopefully nobody expects them at a fixed place (Wine?)
         */
-       [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
-       [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
-       [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
-       [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
-       [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
-       [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
+       [GDT_ENTRY_KERNEL32_CS]         = { { { 0x0000ffff, 0x00cf9b00 } } },
+       [GDT_ENTRY_KERNEL_CS]           = { { { 0x0000ffff, 0x00af9b00 } } },
+       [GDT_ENTRY_KERNEL_DS]           = { { { 0x0000ffff, 0x00cf9300 } } },
+       [GDT_ENTRY_DEFAULT_USER32_CS]   = { { { 0x0000ffff, 0x00cffb00 } } },
+       [GDT_ENTRY_DEFAULT_USER_DS]     = { { { 0x0000ffff, 0x00cff300 } } },
+       [GDT_ENTRY_DEFAULT_USER_CS]     = { { { 0x0000ffff, 0x00affb00 } } },
  #else
-       [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
-       [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
-       [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
-       [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
+       [GDT_ENTRY_KERNEL_CS]           = { { { 0x0000ffff, 0x00cf9a00 } } },
+       [GDT_ENTRY_KERNEL_DS]           = { { { 0x0000ffff, 0x00cf9200 } } },
+       [GDT_ENTRY_DEFAULT_USER_CS]     = { { { 0x0000ffff, 0x00cffa00 } } },
+       [GDT_ENTRY_DEFAULT_USER_DS]     = { { { 0x0000ffff, 0x00cff200 } } },
        /*
         * Segments used for calling PnP BIOS have byte granularity.
         * They code segments and data segments have fixed 64k limits,
         * the transfer segment sizes are set at run time.
         */
        /* 32-bit code */
-       [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
+       [GDT_ENTRY_PNPBIOS_CS32]        = { { { 0x0000ffff, 0x00409a00 } } },
        /* 16-bit code */
-       [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
+       [GDT_ENTRY_PNPBIOS_CS16]        = { { { 0x0000ffff, 0x00009a00 } } },
        /* 16-bit data */
-       [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
+       [GDT_ENTRY_PNPBIOS_DS]          = { { { 0x0000ffff, 0x00009200 } } },
        /* 16-bit data */
-       [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
+       [GDT_ENTRY_PNPBIOS_TS1]         = { { { 0x00000000, 0x00009200 } } },
        /* 16-bit data */
-       [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
+       [GDT_ENTRY_PNPBIOS_TS2]         = { { { 0x00000000, 0x00009200 } } },
        /*
         * The APM segments have byte granularity and their bases
         * are set at run time.  All have 64k limits.
         */
        /* 32-bit code */
-       [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
+       [GDT_ENTRY_APMBIOS_BASE]        = { { { 0x0000ffff, 0x00409a00 } } },
        /* 16-bit code */
-       [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
+       [GDT_ENTRY_APMBIOS_BASE+1]      = { { { 0x0000ffff, 0x00009a00 } } },
        /* data */
-       [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
+       [GDT_ENTRY_APMBIOS_BASE+2]      = { { { 0x0000ffff, 0x00409200 } } },
  
-       [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
-       [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
+       [GDT_ENTRY_ESPFIX_SS]           = { { { 0x00000000, 0x00c09200 } } },
+       [GDT_ENTRY_PERCPU]              = { { { 0x0000ffff, 0x00cf9200 } } },
        GDT_STACK_CANARY_INIT
  #endif
  } };
@@@ -165,16 -152,17 +153,17 @@@ static inline int flag_is_changeable_p(
         * the CPUID. Add "volatile" to not allow gcc to
         * optimize the subsequent calls to this function.
         */
-       asm volatile ("pushfl\n\t"
-                     "pushfl\n\t"
-                     "popl %0\n\t"
-                     "movl %0,%1\n\t"
-                     "xorl %2,%0\n\t"
-                     "pushl %0\n\t"
-                     "popfl\n\t"
-                     "pushfl\n\t"
-                     "popl %0\n\t"
-                     "popfl\n\t"
+       asm volatile ("pushfl           \n\t"
+                     "pushfl           \n\t"
+                     "popl %0          \n\t"
+                     "movl %0, %1      \n\t"
+                     "xorl %2, %0      \n\t"
+                     "pushl %0         \n\t"
+                     "popfl            \n\t"
+                     "pushfl           \n\t"
+                     "popl %0          \n\t"
+                     "popfl            \n\t"
                      : "=&r" (f1), "=&r" (f2)
                      : "ir" (flag));
  
@@@ -189,18 -177,22 +178,22 @@@ static int __cpuinit have_cpuid_p(void
  
  static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
  {
-       if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
-               /* Disable processor serial number */
-               unsigned long lo, hi;
-               rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
-               lo |= 0x200000;
-               wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
-               printk(KERN_NOTICE "CPU serial number disabled.\n");
-               clear_cpu_cap(c, X86_FEATURE_PN);
-               /* Disabling the serial number may affect the cpuid level */
-               c->cpuid_level = cpuid_eax(0);
-       }
+       unsigned long lo, hi;
+       if (!cpu_has(c, X86_FEATURE_PN) || !disable_x86_serial_nr)
+               return;
+       /* Disable processor serial number: */
+       rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+       lo |= 0x200000;
+       wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+       printk(KERN_NOTICE "CPU serial number disabled.\n");
+       clear_cpu_cap(c, X86_FEATURE_PN);
+       /* Disabling the serial number may affect the cpuid level */
+       c->cpuid_level = cpuid_eax(0);
  }
  
  static int __init x86_serial_nr_setup(char *s)
@@@ -233,6 -225,7 +226,7 @@@ struct cpuid_dependent_feature 
        u32 feature;
        u32 level;
  };
  static const struct cpuid_dependent_feature __cpuinitconst
  cpuid_dependent_features[] = {
        { X86_FEATURE_MWAIT,            0x00000005 },
  static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
  {
        const struct cpuid_dependent_feature *df;
        for (df = cpuid_dependent_features; df->feature; df++) {
+               if (!cpu_has(c, df->feature))
+                       continue;
                /*
                 * Note: cpuid_level is set to -1 if unavailable, but
                 * extended_extended_level is set to 0 if unavailable
                 * when signed; hence the weird messing around with
                 * signs here...
                 */
-               if (cpu_has(c, df->feature) &&
-                   ((s32)df->level < 0 ?
+               if (!((s32)df->level < 0 ?
                     (u32)df->level > (u32)c->extended_cpuid_level :
-                    (s32)df->level > (s32)c->cpuid_level)) {
-                       clear_cpu_cap(c, df->feature);
-                       if (warn)
-                               printk(KERN_WARNING
-                                      "CPU: CPU feature %s disabled "
-                                      "due to lack of CPUID level 0x%x\n",
-                                      x86_cap_flags[df->feature],
-                                      df->level);
-               }
+                    (s32)df->level > (s32)c->cpuid_level))
+                       continue;
+               clear_cpu_cap(c, df->feature);
+               if (!warn)
+                       continue;
+               printk(KERN_WARNING
+                      "CPU: CPU feature %s disabled, no CPUID level 0x%x\n",
+                               x86_cap_flags[df->feature], df->level);
        }
  }
  
  /*
   * Naming convention should be: <Name> [(<Codename>)]
   * This table only is used unless init_<vendor>() below doesn't set it;
-  * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
-  *
+  * in particular, if CPUID levels 0x80000002..4 are supported, this
+  * isn't used
   */
  
  /* Look up CPU names by table lookup. */
- static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
+ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
  {
-       struct cpu_model_info *info;
+       const struct cpu_model_info *info;
  
        if (c->x86_model >= 16)
                return NULL;    /* Range check */
@@@ -308,8 -305,10 +306,10 @@@ void load_percpu_segment(int cpu
        load_stack_canary_segment();
  }
  
- /* Current gdt points %fs at the "master" per-cpu area: after this,
-  * it's on the real one. */
+ /*
+  * Current gdt points %fs at the "master" per-cpu area: after this,
+  * it's on the real one.
+  */
  void switch_to_new_gdt(int cpu)
  {
        struct desc_ptr gdt_descr;
        load_percpu_segment(cpu);
  }
  
- static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
+ static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {};
  
  static void __cpuinit default_init(struct cpuinfo_x86 *c)
  {
  #endif
  }
  
- static struct cpu_dev __cpuinitdata default_cpu = {
+ static const struct cpu_dev __cpuinitconst default_cpu = {
        .c_init = default_init,
        .c_vendor = "Unknown",
        .c_x86_vendor = X86_VENDOR_UNKNOWN,
@@@ -355,22 -354,24 +355,24 @@@ static void __cpuinit get_model_name(st
        if (c->extended_cpuid_level < 0x80000004)
                return;
  
-       v = (unsigned int *) c->x86_model_id;
+       v = (unsigned int *)c->x86_model_id;
        cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
        cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
        cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
        c->x86_model_id[48] = 0;
  
-       /* Intel chips right-justify this string for some dumb reason;
-          undo that brain damage */
+       /*
+        * Intel chips right-justify this string for some dumb reason;
+        * undo that brain damage:
+        */
        p = q = &c->x86_model_id[0];
        while (*p == ' ')
-            p++;
+               p++;
        if (p != q) {
-            while (*p)
-                 *q++ = *p++;
-            while (q <= &c->x86_model_id[48])
-                 *q++ = '\0';  /* Zero-pad the rest */
+               while (*p)
+                       *q++ = *p++;
+               while (q <= &c->x86_model_id[48])
+                       *q++ = '\0';    /* Zero-pad the rest */
        }
  }
  
@@@ -439,27 -440,30 +441,30 @@@ void __cpuinit detect_ht(struct cpuinfo
  
        if (smp_num_siblings == 1) {
                printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
-       } else if (smp_num_siblings > 1) {
+               goto out;
+       }
  
-               if (smp_num_siblings > nr_cpu_ids) {
-                       printk(KERN_WARNING "CPU: Unsupported number of siblings %d",
-                                       smp_num_siblings);
-                       smp_num_siblings = 1;
-                       return;
-               }
+       if (smp_num_siblings <= 1)
+               goto out;
  
-               index_msb = get_count_order(smp_num_siblings);
-               c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
+       if (smp_num_siblings > nr_cpu_ids) {
+               pr_warning("CPU: Unsupported number of siblings %d",
+                          smp_num_siblings);
+               smp_num_siblings = 1;
+               return;
+       }
  
-               smp_num_siblings = smp_num_siblings / c->x86_max_cores;
+       index_msb = get_count_order(smp_num_siblings);
+       c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
  
-               index_msb = get_count_order(smp_num_siblings);
+       smp_num_siblings = smp_num_siblings / c->x86_max_cores;
  
-               core_bits = get_count_order(c->x86_max_cores);
+       index_msb = get_count_order(smp_num_siblings);
  
-               c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
-                                              ((1 << core_bits) - 1);
-       }
+       core_bits = get_count_order(c->x86_max_cores);
+       c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
+                                      ((1 << core_bits) - 1);
  
  out:
        if ((c->x86_max_cores * smp_num_siblings) > 1) {
  static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
  {
        char *v = c->x86_vendor_id;
-       int i;
        static int printed;
+       int i;
  
        for (i = 0; i < X86_VENDOR_NUM; i++) {
                if (!cpu_devs[i])
                if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
                    (cpu_devs[i]->c_ident[1] &&
                     !strcmp(v, cpu_devs[i]->c_ident[1]))) {
                        this_cpu = cpu_devs[i];
                        c->x86_vendor = this_cpu->c_x86_vendor;
                        return;
  
        if (!printed) {
                printed++;
-               printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v);
+               printk(KERN_ERR
+                   "CPU: vendor_id '%s' unknown, using generic init.\n", v);
                printk(KERN_ERR "CPU: Your system may be unstable.\n");
        }
  
@@@ -512,14 -519,17 +520,17 @@@ void __cpuinit cpu_detect(struct cpuinf
        /* Intel-defined flags: level 0x00000001 */
        if (c->cpuid_level >= 0x00000001) {
                u32 junk, tfms, cap0, misc;
                cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
                c->x86 = (tfms >> 8) & 0xf;
                c->x86_model = (tfms >> 4) & 0xf;
                c->x86_mask = tfms & 0xf;
                if (c->x86 == 0xf)
                        c->x86 += (tfms >> 20) & 0xff;
                if (c->x86 >= 0x6)
                        c->x86_model += ((tfms >> 16) & 0xf) << 4;
                if (cap0 & (1<<19)) {
                        c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
                        c->x86_cache_alignment = c->x86_clflush_size;
@@@ -535,6 -545,7 +546,7 @@@ static void __cpuinit get_cpu_cap(struc
        /* Intel-defined flags: level 0x00000001 */
        if (c->cpuid_level >= 0x00000001) {
                u32 capability, excap;
                cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
                c->x86_capability[0] = capability;
                c->x86_capability[4] = excap;
        /* AMD-defined flags: level 0x80000001 */
        xlvl = cpuid_eax(0x80000000);
        c->extended_cpuid_level = xlvl;
        if ((xlvl & 0xffff0000) == 0x80000000) {
                if (xlvl >= 0x80000001) {
                        c->x86_capability[1] = cpuid_edx(0x80000001);
                }
        }
  
- #ifdef CONFIG_X86_64
        if (c->extended_cpuid_level >= 0x80000008) {
                u32 eax = cpuid_eax(0x80000008);
  
                c->x86_virt_bits = (eax >> 8) & 0xff;
                c->x86_phys_bits = eax & 0xff;
        }
+ #ifdef CONFIG_X86_32
+       else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
+               c->x86_phys_bits = 36;
  #endif
  
        if (c->extended_cpuid_level >= 0x80000007)
@@@ -603,8 -617,12 +618,12 @@@ static void __init early_identify_cpu(s
  {
  #ifdef CONFIG_X86_64
        c->x86_clflush_size = 64;
+       c->x86_phys_bits = 36;
+       c->x86_virt_bits = 48;
  #else
        c->x86_clflush_size = 32;
+       c->x86_phys_bits = 32;
+       c->x86_virt_bits = 32;
  #endif
        c->x86_cache_alignment = c->x86_clflush_size;
  
  
  void __init early_cpu_init(void)
  {
-       struct cpu_dev **cdev;
+       const struct cpu_dev *const *cdev;
        int count = 0;
  
-       printk("KERNEL supported cpus:\n");
+       printk(KERN_INFO "KERNEL supported cpus:\n");
        for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
-               struct cpu_dev *cpudev = *cdev;
+               const struct cpu_dev *cpudev = *cdev;
                unsigned int j;
  
                if (count >= X86_VENDOR_NUM)
                for (j = 0; j < 2; j++) {
                        if (!cpudev->c_ident[j])
                                continue;
-                       printk("  %s %s\n", cpudev->c_vendor,
+                       printk(KERN_INFO "  %s %s\n", cpudev->c_vendor,
                                cpudev->c_ident[j]);
                }
        }
@@@ -727,9 -745,13 +746,13 @@@ static void __cpuinit identify_cpu(stru
        c->x86_coreid_bits = 0;
  #ifdef CONFIG_X86_64
        c->x86_clflush_size = 64;
+       c->x86_phys_bits = 36;
+       c->x86_virt_bits = 48;
  #else
        c->cpuid_level = -1;    /* CPUID not detected */
        c->x86_clflush_size = 32;
+       c->x86_phys_bits = 32;
+       c->x86_virt_bits = 32;
  #endif
        c->x86_cache_alignment = c->x86_clflush_size;
        memset(&c->x86_capability, 0, sizeof c->x86_capability);
        squash_the_stupid_serial_number(c);
  
        /*
-        * The vendor-specific functions might have changed features.  Now
-        * we do "generic changes."
+        * The vendor-specific functions might have changed features.
+        * Now we do "generic changes."
         */
  
        /* Filter out anything that depends on CPUID levels we don't have */
  
        /* If the model name is still unset, do table lookup. */
        if (!c->x86_model_id[0]) {
-               char *p;
+               const char *p;
                p = table_lookup_model(c);
                if (p)
                        strcpy(c->x86_model_id, p);
@@@ -825,13 -847,13 +848,14 @@@ static void vgetcpu_set_mode(void
  void __init identify_boot_cpu(void)
  {
        identify_cpu(&boot_cpu_data);
+       init_c1e_mask();
  #ifdef CONFIG_X86_32
        sysenter_setup();
        enable_sep_cpu();
  #else
        vgetcpu_set_mode();
  #endif
 +      init_hw_perf_counters();
  }
  
  void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
  }
  
  struct msr_range {
-       unsigned min;
-       unsigned max;
+       unsigned        min;
+       unsigned        max;
  };
  
- static struct msr_range msr_range_array[] __cpuinitdata = {
+ static const struct msr_range msr_range_array[] __cpuinitconst = {
        { 0x00000000, 0x00000418},
        { 0xc0000000, 0xc000040b},
        { 0xc0010000, 0xc0010142},
  
  static void __cpuinit print_cpu_msr(void)
  {
+       unsigned index_min, index_max;
        unsigned index;
        u64 val;
        int i;
-       unsigned index_min, index_max;
  
        for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
                index_min = msr_range_array[i].min;
                index_max = msr_range_array[i].max;
                for (index = index_min; index < index_max; index++) {
                        if (rdmsrl_amd_safe(index, &val))
                                continue;
  }
  
  static int show_msr __cpuinitdata;
  static __init int setup_show_msr(char *arg)
  {
        int num;
@@@ -896,12 -920,14 +922,14 @@@ __setup("noclflush", setup_noclflush)
  
  void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
  {
-       char *vendor = NULL;
+       const char *vendor = NULL;
  
-       if (c->x86_vendor < X86_VENDOR_NUM)
+       if (c->x86_vendor < X86_VENDOR_NUM) {
                vendor = this_cpu->c_vendor;
-       else if (c->cpuid_level >= 0)
-               vendor = c->x86_vendor_id;
+       } else {
+               if (c->cpuid_level >= 0)
+                       vendor = c->x86_vendor_id;
+       }
  
        if (vendor && !strstr(c->x86_model_id, vendor))
                printk(KERN_CONT "%s ", vendor);
  static __init int setup_disablecpuid(char *arg)
  {
        int bit;
        if (get_option(&arg, &bit) && bit < NCAPINTS*32)
                setup_clear_cpu_cap(bit);
        else
                return 0;
        return 1;
  }
  __setup("clearcpuid=", setup_disablecpuid);
@@@ -941,6 -969,7 +971,7 @@@ struct desc_ptr idt_descr = { 256 * 16 
  
  DEFINE_PER_CPU_FIRST(union irq_stack_union,
                     irq_stack_union) __aligned(PAGE_SIZE);
  DEFINE_PER_CPU(char *, irq_stack_ptr) =
        init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
  
@@@ -950,12 -979,21 +981,21 @@@ EXPORT_PER_CPU_SYMBOL(kernel_stack)
  
  DEFINE_PER_CPU(unsigned int, irq_count) = -1;
  
+ /*
+  * Special IST stacks which the CPU switches to when it calls
+  * an IST-marked descriptor entry. Up to 7 stacks (hardware
+  * limit), all of them are 4K, except the debug stack which
+  * is 8K.
+  */
+ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
+         [0 ... N_EXCEPTION_STACKS - 1]        = EXCEPTION_STKSZ,
+         [DEBUG_STACK - 1]                     = DEBUG_STKSZ
+ };
  static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
        [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
        __aligned(PAGE_SIZE);
  
- extern asmlinkage void ignore_sysret(void);
  /* May not be marked __init: used by software suspend */
  void syscall_init(void)
  {
@@@ -985,7 -1023,7 +1025,7 @@@ unsigned long kernel_eflags
   */
  DEFINE_PER_CPU(struct orig_ist, orig_ist);
  
- #else /* x86_64 */
+ #else /* CONFIG_X86_64 */
  
  #ifdef CONFIG_CC_STACKPROTECTOR
  DEFINE_PER_CPU(unsigned long, stack_canary);
@@@ -997,9 -1035,26 +1037,26 @@@ struct pt_regs * __cpuinit idle_regs(st
        memset(regs, 0, sizeof(struct pt_regs));
        regs->fs = __KERNEL_PERCPU;
        regs->gs = __KERNEL_STACK_CANARY;
        return regs;
  }
- #endif        /* x86_64 */
+ #endif        /* CONFIG_X86_64 */
+ /*
+  * Clear all 6 debug registers:
+  */
+ static void clear_all_debug_regs(void)
+ {
+       int i;
+       for (i = 0; i < 8; i++) {
+               /* Ignore db4, db5 */
+               if ((i == 4) || (i == 5))
+                       continue;
+               set_debugreg(0, i);
+       }
+ }
  
  /*
   * cpu_init() initializes state that is per-CPU. Some data is already
   * A lot of state is already set up in PDA init for 64 bit
   */
  #ifdef CONFIG_X86_64
  void __cpuinit cpu_init(void)
  {
-       int cpu = stack_smp_processor_id();
-       struct tss_struct *t = &per_cpu(init_tss, cpu);
-       struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
-       unsigned long v;
+       struct orig_ist *orig_ist;
        struct task_struct *me;
+       struct tss_struct *t;
+       unsigned long v;
+       int cpu;
        int i;
  
+       cpu = stack_smp_processor_id();
+       t = &per_cpu(init_tss, cpu);
+       orig_ist = &per_cpu(orig_ist, cpu);
  #ifdef CONFIG_NUMA
        if (cpu != 0 && percpu_read(node_number) == 0 &&
            cpu_to_node(cpu) != NUMA_NO_NODE)
         * set up and load the per-CPU TSS
         */
        if (!orig_ist->ist[0]) {
-               static const unsigned int sizes[N_EXCEPTION_STACKS] = {
-                 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
-                 [DEBUG_STACK - 1] = DEBUG_STKSZ
-               };
                char *estacks = per_cpu(exception_stacks, cpu);
                for (v = 0; v < N_EXCEPTION_STACKS; v++) {
-                       estacks += sizes[v];
+                       estacks += exception_stack_sizes[v];
                        orig_ist->ist[v] = t->x86_tss.ist[v] =
                                        (unsigned long)estacks;
                }
        }
  
        t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
        /*
         * <= is required because the CPU will access up to
         * 8 bits beyond the end of the IO permission bitmap.
  
        atomic_inc(&init_mm.mm_count);
        me->active_mm = &init_mm;
-       if (me->mm)
-               BUG();
+       BUG_ON(me->mm);
        enter_lazy_tlb(&init_mm, me);
  
        load_sp0(t, &current->thread);
                arch_kgdb_ops.correct_hw_break();
        else
  #endif
-       {
-               /*
-                * Clear all 6 debug registers:
-                */
-               set_debugreg(0UL, 0);
-               set_debugreg(0UL, 1);
-               set_debugreg(0UL, 2);
-               set_debugreg(0UL, 3);
-               set_debugreg(0UL, 6);
-               set_debugreg(0UL, 7);
-       }
+               clear_all_debug_regs();
  
        fpu_init();
  
@@@ -1131,7 -1178,8 +1180,8 @@@ void __cpuinit cpu_init(void
  
        if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
                printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
-               for (;;) local_irq_enable();
+               for (;;)
+                       local_irq_enable();
        }
  
        printk(KERN_INFO "Initializing CPU#%d\n", cpu);
         */
        atomic_inc(&init_mm.mm_count);
        curr->active_mm = &init_mm;
-       if (curr->mm)
-               BUG();
+       BUG_ON(curr->mm);
        enter_lazy_tlb(&init_mm, curr);
  
        load_sp0(t, thread);
        __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
  #endif
  
-       /* Clear all 6 debug registers: */
-       set_debugreg(0, 0);
-       set_debugreg(0, 1);
-       set_debugreg(0, 2);
-       set_debugreg(0, 3);
-       set_debugreg(0, 6);
-       set_debugreg(0, 7);
+       clear_all_debug_regs();
  
        /*
         * Force FPU initialization:
  
        xsave_init();
  }
  #endif
@@@ -368,6 -368,7 +368,7 @@@ ENTRY(save_rest
  END(save_rest)
  
  /* save complete stack frame */
+       .pushsection .kprobes.text, "ax"
  ENTRY(save_paranoid)
        XCPT_FRAME 1 RDI+8
        cld
  1:    ret
        CFI_ENDPROC
  END(save_paranoid)
+       .popsection
  
  /*
   * A newly forked process directly context switches into this address.
@@@ -416,7 -418,6 +418,6 @@@ ENTRY(ret_from_fork
  
        GET_THREAD_INFO(%rcx)
  
-       CFI_REMEMBER_STATE
        RESTORE_REST
  
        testl $3, CS-ARGOFFSET(%rsp)            # from kernel_thread?
        RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
        jmp ret_from_sys_call                   # go to the SYSRET fastpath
  
-       CFI_RESTORE_STATE
        CFI_ENDPROC
  END(ret_from_fork)
  
@@@ -984,6 -984,8 +984,8 @@@ apicinterrupt UV_BAU_MESSAGE 
  #endif
  apicinterrupt LOCAL_TIMER_VECTOR \
        apic_timer_interrupt smp_apic_timer_interrupt
+ apicinterrupt GENERIC_INTERRUPT_VECTOR \
+       generic_interrupt smp_generic_interrupt
  
  #ifdef CONFIG_SMP
  apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
@@@ -1023,11 -1025,6 +1025,11 @@@ apicinterrupt ERROR_APIC_VECTOR 
  apicinterrupt SPURIOUS_APIC_VECTOR \
        spurious_interrupt smp_spurious_interrupt
  
 +#ifdef CONFIG_PERF_COUNTERS
 +apicinterrupt LOCAL_PERF_VECTOR \
 +      perf_counter_interrupt smp_perf_counter_interrupt
 +#endif
 +
  /*
   * Exception entry points.
   */
diff --combined arch/x86/kernel/irq.c
@@@ -15,6 -15,9 +15,9 @@@
  
  atomic_t irq_err_count;
  
+ /* Function pointer for generic interrupt vector handling */
+ void (*generic_interrupt_extension)(void) = NULL;
  /*
   * 'what should we do if we get a hw irq event on an illegal vector'.
   * each architecture has to answer this themselves.
@@@ -42,59 -45,60 +45,64 @@@ void ack_bad_irq(unsigned int irq
  /*
   * /proc/interrupts printing:
   */
- static int show_other_interrupts(struct seq_file *p)
+ static int show_other_interrupts(struct seq_file *p, int prec)
  {
        int j;
  
-       seq_printf(p, "NMI: ");
+       seq_printf(p, "%*s: ", prec, "NMI");
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
        seq_printf(p, "  Non-maskable interrupts\n");
  #ifdef CONFIG_X86_LOCAL_APIC
-       seq_printf(p, "LOC: ");
+       seq_printf(p, "%*s: ", prec, "LOC");
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
        seq_printf(p, "  Local timer interrupts\n");
+       seq_printf(p, "%*s: ", prec, "SPU");
+       for_each_online_cpu(j)
+               seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
+       seq_printf(p, "  Spurious interrupts\n");
 +      seq_printf(p, "%*s: ", prec, "CNT");
 +      for_each_online_cpu(j)
 +              seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
 +      seq_printf(p, "  Performance counter interrupts\n");
  #endif
+       if (generic_interrupt_extension) {
+               seq_printf(p, "%*s: ", prec, "PLT");
+               for_each_online_cpu(j)
+                       seq_printf(p, "%10u ", irq_stats(j)->generic_irqs);
+               seq_printf(p, "  Platform interrupts\n");
+       }
  #ifdef CONFIG_SMP
-       seq_printf(p, "RES: ");
+       seq_printf(p, "%*s: ", prec, "RES");
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
        seq_printf(p, "  Rescheduling interrupts\n");
-       seq_printf(p, "CAL: ");
+       seq_printf(p, "%*s: ", prec, "CAL");
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
        seq_printf(p, "  Function call interrupts\n");
-       seq_printf(p, "TLB: ");
+       seq_printf(p, "%*s: ", prec, "TLB");
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
        seq_printf(p, "  TLB shootdowns\n");
  #endif
  #ifdef CONFIG_X86_MCE
-       seq_printf(p, "TRM: ");
+       seq_printf(p, "%*s: ", prec, "TRM");
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
        seq_printf(p, "  Thermal event interrupts\n");
  # ifdef CONFIG_X86_64
-       seq_printf(p, "THR: ");
+       seq_printf(p, "%*s: ", prec, "THR");
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
        seq_printf(p, "  Threshold APIC interrupts\n");
  # endif
  #endif
- #ifdef CONFIG_X86_LOCAL_APIC
-       seq_printf(p, "SPU: ");
-       for_each_online_cpu(j)
-               seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
-       seq_printf(p, "  Spurious interrupts\n");
- #endif
-       seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
+       seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
  #if defined(CONFIG_X86_IO_APIC)
-       seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
+       seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
  #endif
        return 0;
  }
  int show_interrupts(struct seq_file *p, void *v)
  {
        unsigned long flags, any_count = 0;
-       int i = *(loff_t *) v, j;
+       int i = *(loff_t *) v, j, prec;
        struct irqaction *action;
        struct irq_desc *desc;
  
        if (i > nr_irqs)
                return 0;
  
+       for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
+               j *= 10;
        if (i == nr_irqs)
-               return show_other_interrupts(p);
+               return show_other_interrupts(p, prec);
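
The small loop above sizes the label column for /proc/interrupts: prec starts at 3 and gains one digit while nr_irqs still has more decimal digits, capped at 10. A stand-alone sketch of the same calculation (the function name and test values here are invented):

    #include <stdio.h>

    /* at least 3 wide, otherwise the decimal width of nr_irqs, capped at 10 */
    static int irq_label_width(unsigned int nr_irqs)
    {
            int prec;
            unsigned int j;

            for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
                    j *= 10;
            return prec;
    }

    int main(void)
    {
            printf("%d %d %d\n", irq_label_width(16),      /* 3 */
                                 irq_label_width(1000),    /* 4 */
                                 irq_label_width(65536));  /* 5 */
            return 0;
    }

The same prec is passed on to show_other_interrupts() so the NMI/LOC/RES rows line up with the numeric IRQ rows.
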
  
        /* print header */
        if (i == 0) {
-               seq_printf(p, "           ");
+               seq_printf(p, "%*s", prec + 8, "");
                for_each_online_cpu(j)
                        seq_printf(p, "CPU%-8d", j);
                seq_putc(p, '\n');
                return 0;
  
        spin_lock_irqsave(&desc->lock, flags);
- #ifndef CONFIG_SMP
-       any_count = kstat_irqs(i);
- #else
        for_each_online_cpu(j)
                any_count |= kstat_irqs_cpu(i, j);
- #endif
        action = desc->action;
        if (!action && !any_count)
                goto out;
  
-       seq_printf(p, "%3d: ", i);
- #ifndef CONFIG_SMP
-       seq_printf(p, "%10u ", kstat_irqs(i));
- #else
+       seq_printf(p, "%*d: ", prec, i);
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
- #endif
        seq_printf(p, " %8s", desc->chip->name);
        seq_printf(p, "-%-8s", desc->name);
  
@@@ -166,8 -165,10 +169,11 @@@ u64 arch_irq_stat_cpu(unsigned int cpu
  
  #ifdef CONFIG_X86_LOCAL_APIC
        sum += irq_stats(cpu)->apic_timer_irqs;
+       sum += irq_stats(cpu)->irq_spurious_count;
 +      sum += irq_stats(cpu)->apic_perf_irqs;
  #endif
+       if (generic_interrupt_extension)
+               sum += irq_stats(cpu)->generic_irqs;
  #ifdef CONFIG_SMP
        sum += irq_stats(cpu)->irq_resched_count;
        sum += irq_stats(cpu)->irq_call_count;
        sum += irq_stats(cpu)->irq_threshold_count;
  #endif
  #endif
- #ifdef CONFIG_X86_LOCAL_APIC
-       sum += irq_stats(cpu)->irq_spurious_count;
- #endif
        return sum;
  }
  
@@@ -231,4 -229,27 +234,27 @@@ unsigned int __irq_entry do_IRQ(struct 
        return 1;
  }
  
+ /*
+  * Handler for GENERIC_INTERRUPT_VECTOR.
+  */
+ void smp_generic_interrupt(struct pt_regs *regs)
+ {
+       struct pt_regs *old_regs = set_irq_regs(regs);
+       ack_APIC_irq();
+       exit_idle();
+       irq_enter();
+       inc_irq_stat(generic_irqs);
+       if (generic_interrupt_extension)
+               generic_interrupt_extension();
+       irq_exit();
+       set_irq_regs(old_regs);
+ }
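
generic_interrupt_extension is an optional-hook pattern: one function pointer that platform code may install, checked for NULL both here and in the /proc/interrupts accounting above. Stripped of the APIC acknowledgement and irq_enter()/irq_exit() bookkeeping, the pattern reduces to the sketch below (the registrant is hypothetical):

    #include <stdio.h>

    /* optional hook, NULL until some platform driver registers one */
    static void (*generic_interrupt_extension)(void);

    static void my_platform_handler(void)          /* hypothetical registrant */
    {
            puts("platform interrupt handled");
    }

    static void generic_interrupt_skeleton(void)
    {
            /* ack_APIC_irq(), irq_enter(), statistics elided */
            if (generic_interrupt_extension)
                    generic_interrupt_extension();
            /* irq_exit() elided */
    }

    int main(void)
    {
            generic_interrupt_skeleton();           /* hook unset: nothing happens */
            generic_interrupt_extension = my_platform_handler;
            generic_interrupt_skeleton();           /* hook set: handler runs */
            return 0;
    }
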
  EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
@@@ -50,7 -50,6 +50,6 @@@ static irqreturn_t math_error_irq(int c
   */
  static struct irqaction fpu_irq = {
        .handler = math_error_irq,
-       .mask = CPU_MASK_NONE,
        .name = "fpu",
  };
  
@@@ -83,7 -82,6 +82,6 @@@ void __init init_ISA_irqs(void
   */
  static struct irqaction irq2 = {
        .handler = no_action,
-       .mask = CPU_MASK_NONE,
        .name = "cascade",
  };
  
@@@ -120,8 -118,28 +118,8 @@@ int vector_used_by_percpu_irq(unsigned 
        return 0;
  }
  
 -/* Overridden in paravirt.c */
 -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
 -
 -void __init native_init_IRQ(void)
 +static void __init smp_intr_init(void)
  {
 -      int i;
 -
 -      /* Execute any quirks before the call gates are initialised: */
 -      x86_quirk_pre_intr_init();
 -
 -      /*
 -       * Cover the whole vector space, no vector can escape
 -       * us. (some of these will be overridden and become
 -       * 'special' SMP interrupts)
 -       */
 -      for (i =  FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
 -              /* SYSCALL_VECTOR was reserved in trap_init. */
 -              if (i != SYSCALL_VECTOR)
 -                      set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
 -      }
 -
 -
  #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
        /*
         * The reschedule interrupt is a CPU-to-CPU reschedule-helper
        set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
        set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
  #endif
 +}
 +
 +static void __init apic_intr_init(void)
 +{
 +      smp_intr_init();
  
  #ifdef CONFIG_X86_LOCAL_APIC
        /* self generated IPI for local APIC timer */
        alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
  
+       /* generic IPI for platform specific use */
+       alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
        /* IPI vectors for APIC spurious and error interrupts */
        alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
        alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
 -#endif
 +# ifdef CONFIG_PERF_COUNTERS
 +      alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
 +# endif
  
 -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
 +# ifdef CONFIG_X86_MCE_P4THERMAL
        /* thermal monitor LVT interrupt */
        alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
 +# endif
  #endif
 +}
 +
 +/* Overridden in paravirt.c */
 +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
 +
 +void __init native_init_IRQ(void)
 +{
 +      int i;
 +
 +      /* Execute any quirks before the call gates are initialised: */
 +      x86_quirk_pre_intr_init();
 +
 +      apic_intr_init();
 +
 +      /*
 +       * Cover the whole vector space, no vector can escape
 +       * us. (some of these will be overridden and become
 +       * 'special' SMP interrupts)
 +       */
 +      for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
 +              int vector = FIRST_EXTERNAL_VECTOR + i;
 +              /* SYSCALL_VECTOR was reserved in trap_init. */
 +              if (!test_bit(vector, used_vectors))
 +                      set_intr_gate(vector, interrupt[i]);
 +      }
  
        if (!acpi_ioapic)
                setup_irq(2, &irq2);
@@@ -45,7 -45,6 +45,6 @@@
  
  static struct irqaction irq2 = {
        .handler = no_action,
-       .mask = CPU_MASK_NONE,
        .name = "cascade",
  };
  DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
@@@ -147,14 -146,12 +146,17 @@@ static void __init apic_intr_init(void
        /* self generated IPI for local APIC timer */
        alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
  
+       /* generic IPI for platform specific use */
+       alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
        /* IPI vectors for APIC spurious and error interrupts */
        alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
        alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
 +
 +      /* Performance monitoring interrupt: */
 +#ifdef CONFIG_PERF_COUNTERS
 +      alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
 +#endif
  }
  
  void __init native_init_IRQ(void)
        int i;
  
        init_ISA_irqs();
 +
 +      apic_intr_init();
 +
        /*
         * Cover the whole vector space, no vector can escape
         * us. (some of these will be overridden and become
         */
        for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
                int vector = FIRST_EXTERNAL_VECTOR + i;
 -              if (vector != IA32_SYSCALL_VECTOR)
 +              if (!test_bit(vector, used_vectors))
                        set_intr_gate(vector, interrupt[i]);
        }
  
 -      apic_intr_init();
 -
        if (!acpi_ioapic)
                setup_irq(2, &irq2);
  }
diff --combined arch/x86/kernel/signal.c
@@@ -6,7 -6,7 +6,7 @@@
   *  2000-06-20  Pentium III FXSR, SSE support by Gareth Hughes
   *  2000-2002   x86-64 support by Andi Kleen
   */
 -
 +#include <linux/perf_counter.h>
  #include <linux/sched.h>
  #include <linux/mm.h>
  #include <linux/smp.h>
@@@ -187,6 -187,77 +187,77 @@@ setup_sigcontext(struct sigcontext __us
  /*
   * Set up a signal frame.
   */
+ /*
+  * Determine which stack to use.
+  */
+ static unsigned long align_sigframe(unsigned long sp)
+ {
+ #ifdef CONFIG_X86_32
+       /*
+        * Align the stack pointer according to the i386 ABI,
+        * i.e. so that on function entry ((sp + 4) & 15) == 0.
+        */
+       sp = ((sp + 4) & -16ul) - 4;
+ #else /* !CONFIG_X86_32 */
+       sp = round_down(sp, 16) - 8;
+ #endif
+       return sp;
+ }
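
align_sigframe() encodes two ABI rules: on i386 the frame must land so that ((sp + 4) & 15) == 0 at handler entry, and on x86-64 it is 16-byte aligned minus 8, mimicking the slot a CALL would have pushed. A quick self-contained check of both formulas follows; the helper names and the test range are invented for this sketch, and a 64-bit build is assumed.

    #include <assert.h>

    static unsigned long align_sigframe_i386(unsigned long sp)
    {
            return ((sp + 4) & ~15ul) - 4;      /* makes (sp + 4) 16-byte aligned */
    }

    static unsigned long align_sigframe_x86_64(unsigned long sp)
    {
            return (sp & ~15ul) - 8;            /* round_down(sp, 16) - 8 */
    }

    int main(void)
    {
            unsigned long sp;

            for (sp = 0x7fff0000ul; sp < 0x7fff0040ul; sp++) {
                    assert((align_sigframe_i386(sp) + 4) % 16 == 0);
                    assert(align_sigframe_x86_64(sp) % 16 == 8);
            }
            return 0;
    }
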
+ static inline void __user *
+ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
+            void __user **fpstate)
+ {
+       /* Default to using normal stack */
+       unsigned long sp = regs->sp;
+       int onsigstack = on_sig_stack(sp);
+ #ifdef CONFIG_X86_64
+       /* redzone */
+       sp -= 128;
+ #endif /* CONFIG_X86_64 */
+       if (!onsigstack) {
+               /* This is the X/Open sanctioned signal stack switching.  */
+               if (ka->sa.sa_flags & SA_ONSTACK) {
+                       if (current->sas_ss_size)
+                               sp = current->sas_ss_sp + current->sas_ss_size;
+               } else {
+ #ifdef CONFIG_X86_32
+                       /* This is the legacy signal stack switching. */
+                       if ((regs->ss & 0xffff) != __USER_DS &&
+                               !(ka->sa.sa_flags & SA_RESTORER) &&
+                                       ka->sa.sa_restorer)
+                               sp = (unsigned long) ka->sa.sa_restorer;
+ #endif /* CONFIG_X86_32 */
+               }
+       }
+       if (used_math()) {
+               sp -= sig_xstate_size;
+ #ifdef CONFIG_X86_64
+               sp = round_down(sp, 64);
+ #endif /* CONFIG_X86_64 */
+               *fpstate = (void __user *)sp;
+       }
+       sp = align_sigframe(sp - frame_size);
+       /*
+        * If we are on the alternate signal stack and would overflow it, don't.
+        * Return an always-bogus address instead so we will die with SIGSEGV.
+        */
+       if (onsigstack && !likely(on_sig_stack(sp)))
+               return (void __user *)-1L;
+       /* save i387 state */
+       if (used_math() && save_i387_xstate(*fpstate) < 0)
+               return (void __user *)-1L;
+       return (void __user *)sp;
+ }
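
On the 64-bit path the reservations above happen in a fixed order: skip the 128-byte red zone, carve out the xsave area rounded down to a 64-byte boundary, then place the rt_sigframe via align_sigframe(). A rough walk-through with made-up sizes (sig_xstate_size and the frame size vary by CPU features and kernel config; a 64-bit build is assumed):

    #include <stdio.h>

    static unsigned long round_down_ul(unsigned long x, unsigned long align)
    {
            return x & ~(align - 1);
    }

    int main(void)
    {
            unsigned long sp          = 0x7ffffffde123ul;  /* hypothetical user %rsp */
            unsigned long xstate_size = 832;               /* hypothetical sig_xstate_size */
            unsigned long frame_size  = 440;               /* hypothetical sizeof(struct rt_sigframe) */
            unsigned long fpstate;

            sp -= 128;                                     /* x86-64 red zone */
            sp  = round_down_ul(sp - xstate_size, 64);     /* FPU/xsave save area */
            fpstate = sp;
            sp  = round_down_ul(sp - frame_size, 16) - 8;  /* align_sigframe() */

            printf("fpstate at %#lx, rt_sigframe at %#lx\n", fpstate, sp);
            return 0;
    }
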
  #ifdef CONFIG_X86_32
  static const struct {
        u16 poplmovl;
@@@ -210,54 -281,6 +281,6 @@@ static const struct 
        0
  };
  
- /*
-  * Determine which stack to use..
-  */
- static inline void __user *
- get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
-            void **fpstate)
- {
-       unsigned long sp;
-       /* Default to using normal stack */
-       sp = regs->sp;
-       /*
-        * If we are on the alternate signal stack and would overflow it, don't.
-        * Return an always-bogus address instead so we will die with SIGSEGV.
-        */
-       if (on_sig_stack(sp) && !likely(on_sig_stack(sp - frame_size)))
-               return (void __user *) -1L;
-       /* This is the X/Open sanctioned signal stack switching.  */
-       if (ka->sa.sa_flags & SA_ONSTACK) {
-               if (sas_ss_flags(sp) == 0)
-                       sp = current->sas_ss_sp + current->sas_ss_size;
-       } else {
-               /* This is the legacy signal stack switching. */
-               if ((regs->ss & 0xffff) != __USER_DS &&
-                       !(ka->sa.sa_flags & SA_RESTORER) &&
-                               ka->sa.sa_restorer)
-                       sp = (unsigned long) ka->sa.sa_restorer;
-       }
-       if (used_math()) {
-               sp = sp - sig_xstate_size;
-               *fpstate = (struct _fpstate *) sp;
-               if (save_i387_xstate(*fpstate) < 0)
-                       return (void __user *)-1L;
-       }
-       sp -= frame_size;
-       /*
-        * Align the stack pointer according to the i386 ABI,
-        * i.e. so that on function entry ((sp + 4) & 15) == 0.
-        */
-       sp = ((sp + 4) & -16ul) - 4;
-       return (void __user *) sp;
- }
  static int
  __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
              struct pt_regs *regs)
@@@ -388,24 -411,6 +411,6 @@@ static int __setup_rt_frame(int sig, st
        return 0;
  }
  #else /* !CONFIG_X86_32 */
- /*
-  * Determine which stack to use..
-  */
- static void __user *
- get_stack(struct k_sigaction *ka, unsigned long sp, unsigned long size)
- {
-       /* Default to using normal stack - redzone*/
-       sp -= 128;
-       /* This is the X/Open sanctioned signal stack switching.  */
-       if (ka->sa.sa_flags & SA_ONSTACK) {
-               if (sas_ss_flags(sp) == 0)
-                       sp = current->sas_ss_sp + current->sas_ss_size;
-       }
-       return (void __user *)round_down(sp - size, 64);
- }
  static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
                            sigset_t *set, struct pt_regs *regs)
  {
        int err = 0;
        struct task_struct *me = current;
  
-       if (used_math()) {
-               fp = get_stack(ka, regs->sp, sig_xstate_size);
-               frame = (void __user *)round_down(
-                       (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
-               if (save_i387_xstate(fp) < 0)
-                       return -EFAULT;
-       } else
-               frame = get_stack(ka, regs->sp, sizeof(struct rt_sigframe)) - 8;
+       frame = get_sigframe(ka, regs, sizeof(struct rt_sigframe), &fp);
  
        if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
                return -EFAULT;
@@@ -875,11 -872,6 +872,11 @@@ do_notify_resume(struct pt_regs *regs, 
                tracehook_notify_resume(regs);
        }
  
 +      if (thread_info_flags & _TIF_PERF_COUNTERS) {
 +              clear_thread_flag(TIF_PERF_COUNTERS);
 +              perf_counter_notify(regs);
 +      }
 +
  #ifdef CONFIG_X86_32
        clear_thread_flag(TIF_IRET);
  #endif /* CONFIG_X86_32 */
@@@ -332,4 -332,5 +332,6 @@@ ENTRY(sys_call_table
        .long sys_dup3                  /* 330 */
        .long sys_pipe2
        .long sys_inotify_init1
 +      .long sys_perf_counter_open
+       .long sys_preadv
+       .long sys_pwritev
diff --combined arch/x86/kernel/traps.c
@@@ -118,47 -118,6 +118,6 @@@ die_if_kernel(const char *str, struct p
        if (!user_mode_vm(regs))
                die(str, regs, err);
  }
- /*
-  * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
-  * invalid offset set (the LAZY one) and the faulting thread has
-  * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS,
-  * we set the offset field correctly and return 1.
-  */
- static int lazy_iobitmap_copy(void)
- {
-       struct thread_struct *thread;
-       struct tss_struct *tss;
-       int cpu;
-       cpu = get_cpu();
-       tss = &per_cpu(init_tss, cpu);
-       thread = &current->thread;
-       if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
-           thread->io_bitmap_ptr) {
-               memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
-                      thread->io_bitmap_max);
-               /*
-                * If the previously set map was extending to higher ports
-                * than the current one, pad extra space with 0xff (no access).
-                */
-               if (thread->io_bitmap_max < tss->io_bitmap_max) {
-                       memset((char *) tss->io_bitmap +
-                               thread->io_bitmap_max, 0xff,
-                               tss->io_bitmap_max - thread->io_bitmap_max);
-               }
-               tss->io_bitmap_max = thread->io_bitmap_max;
-               tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
-               tss->io_bitmap_owner = thread;
-               put_cpu();
-               return 1;
-       }
-       put_cpu();
-       return 0;
- }
  #endif
  
  static void __kprobes
@@@ -309,11 -268,6 +268,6 @@@ do_general_protection(struct pt_regs *r
        conditional_sti(regs);
  
  #ifdef CONFIG_X86_32
-       if (lazy_iobitmap_copy()) {
-               /* restart the faulting instruction */
-               return;
-       }
        if (regs->flags & X86_VM_MASK)
                goto gp_in_vm86;
  #endif
@@@ -991,13 -945,8 +945,13 @@@ void __init trap_init(void
  #endif
        set_intr_gate(19, &simd_coprocessor_error);
  
 +      /* Reserve all the builtin and the syscall vector: */
 +      for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
 +              set_bit(i, used_vectors);
 +
  #ifdef CONFIG_IA32_EMULATION
        set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
 +      set_bit(IA32_SYSCALL_VECTOR, used_vectors);
  #endif
  
  #ifdef CONFIG_X86_32
        }
  
        set_system_trap_gate(SYSCALL_VECTOR, &system_call);
 -#endif
 -
 -      /* Reserve all the builtin and the syscall vector: */
 -      for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
 -              set_bit(i, used_vectors);
 -
 -#ifdef CONFIG_X86_64
 -      set_bit(IA32_SYSCALL_VECTOR, used_vectors);
 -#else
        set_bit(SYSCALL_VECTOR, used_vectors);
  #endif
 +
        /*
         * Should be a barrier for any external CPU state:
         */
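
trap_init() now marks every CPU-internal vector plus the syscall vector in used_vectors, and native_init_IRQ() (see the irqinit hunks earlier) installs interrupt gates only for bits that are still clear. A toy version of that reserve-then-fill protocol, with a plain array standing in for the kernel bitmap helpers; the vector constants match the usual x86 values but are hard-coded here only for the example:

    #include <stdio.h>

    #define FIRST_EXTERNAL_VECTOR  32
    #define SYSCALL_VECTOR         0x80
    #define NR_VECTORS             256
    #define BITS_PER_LONG          (8 * (int)sizeof(unsigned long))

    static unsigned long used_vectors[NR_VECTORS / (8 * sizeof(unsigned long))];

    static void set_bit_ul(int nr)
    {
            used_vectors[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
    }

    static int test_bit_ul(int nr)
    {
            return (used_vectors[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG)) & 1;
    }

    int main(void)
    {
            int i, gated = 0;

            /* trap_init(): reserve the builtin vectors and the syscall vector */
            for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
                    set_bit_ul(i);
            set_bit_ul(SYSCALL_VECTOR);

            /* native_init_IRQ(): gate every vector that is still free */
            for (i = 0; i < NR_VECTORS - FIRST_EXTERNAL_VECTOR; i++) {
                    int vector = FIRST_EXTERNAL_VECTOR + i;

                    if (!test_bit_ul(vector))
                            gated++;        /* stands in for set_intr_gate() */
            }
            printf("%d external vectors gated\n", gated);   /* 223 */
            return 0;
    }
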
@@@ -64,7 -64,6 +64,6 @@@
  #define _COMPONENT              ACPI_PROCESSOR_COMPONENT
  ACPI_MODULE_NAME("processor_idle");
  #define ACPI_PROCESSOR_FILE_POWER     "power"
- #define US_TO_PM_TIMER_TICKS(t)               ((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
  #define PM_TIMER_TICK_NS              (1000000000ULL/PM_TIMER_FREQUENCY)
  #define C2_OVERHEAD                   1       /* 1us */
  #define C3_OVERHEAD                   1       /* 1us */
@@@ -78,6 -77,10 +77,10 @@@ module_param(nocst, uint, 0000)
  static unsigned int latency_factor __read_mostly = 2;
  module_param(latency_factor, uint, 0644);
  
+ static s64 us_to_pm_timer_ticks(s64 t)
+ {
+       return div64_u64(t * PM_TIMER_FREQUENCY, 1000000);
+ }
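
us_to_pm_timer_ticks() converts the ktime-measured idle time back into PM-timer ticks for the callers below that still account in ticks (cx->time and the sched_clock idle events). Assuming the usual ACPI PM timer rate of 3579545 Hz, 1000 us works out to 3579 ticks; a tiny stand-alone version of the same arithmetic:

    #include <stdio.h>
    #include <inttypes.h>

    #define PM_TIMER_FREQUENCY 3579545ULL   /* ACPI PM timer rate in Hz (assumed here) */

    static uint64_t us_to_pm_timer_ticks(uint64_t us)
    {
            return us * PM_TIMER_FREQUENCY / 1000000ULL;
    }

    int main(void)
    {
            printf("%" PRIu64 "\n", us_to_pm_timer_ticks(1000));    /* prints 3579 */
            return 0;
    }
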
  /*
   * IBM ThinkPad R40e crashes mysteriously when going into C2 or C3.
   * For now disable this. Probably a bug somewhere else.
@@@ -101,57 -104,6 +104,6 @@@ static int set_max_cstate(const struct 
  /* Actually this shouldn't be __cpuinitdata, would be better to fix the
     callers to only run once -AK */
  static struct dmi_system_id __cpuinitdata processor_power_dmi_table[] = {
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET70WW")}, (void *)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET60WW")}, (void *)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET43WW") }, (void*)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET45WW") }, (void*)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET47WW") }, (void*)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET50WW") }, (void*)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET52WW") }, (void*)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET55WW") }, (void*)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET56WW") }, (void*)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET59WW") }, (void*)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET60WW") }, (void*)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET61WW") }, (void*)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET62WW") }, (void*)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET64WW") }, (void*)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET65WW") }, (void*)1},
-       { set_max_cstate, "IBM ThinkPad R40e", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
-         DMI_MATCH(DMI_BIOS_VERSION,"1SET68WW") }, (void*)1},
-       { set_max_cstate, "Medion 41700", {
-         DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
-         DMI_MATCH(DMI_BIOS_VERSION,"R01-A1J")}, (void *)1},
        { set_max_cstate, "Clevo 5600D", {
          DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
          DMI_MATCH(DMI_BIOS_VERSION,"SHE845M0.86C.0013.D.0302131307")},
        {},
  };
  
- static inline u32 ticks_elapsed(u32 t1, u32 t2)
- {
-       if (t2 >= t1)
-               return (t2 - t1);
-       else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER))
-               return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
-       else
-               return ((0xFFFFFFFF - t1) + t2);
- }
- static inline u32 ticks_elapsed_in_us(u32 t1, u32 t2)
- {
-       if (t2 >= t1)
-               return PM_TIMER_TICKS_TO_US(t2 - t1);
-       else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER))
-               return PM_TIMER_TICKS_TO_US(((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
-       else
-               return PM_TIMER_TICKS_TO_US((0xFFFFFFFF - t1) + t2);
- }
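
The two helpers deleted here existed only to undo PM-timer wraparound by hand: the counter is 24 or 32 bits wide, so an end reading below the start reading means it wrapped once. A stand-alone copy of that arithmetic (a plain flag stands in for the ACPI_FADT_32BIT_TIMER check) shows what the ktime-based measurement below no longer has to care about:

    #include <assert.h>
    #include <stdint.h>

    static uint32_t ticks_elapsed(uint32_t t1, uint32_t t2, int timer_is_32bit)
    {
            if (t2 >= t1)
                    return t2 - t1;
            if (!timer_is_32bit)
                    return ((0x00FFFFFF - t1) + t2) & 0x00FFFFFF;   /* 24-bit wrap */
            return (0xFFFFFFFF - t1) + t2;                          /* 32-bit wrap */
    }

    int main(void)
    {
            assert(ticks_elapsed(100, 4100, 1) == 4000);            /* no wrap */
            assert(ticks_elapsed(0x00FFFFF0, 0x10, 0) == 0x1f);     /* wrapped 24-bit counter */
            assert(ticks_elapsed(0xFFFFFFF0, 0x10, 1) == 0x1f);     /* wrapped 32-bit counter */
            return 0;
    }
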
  
  /*
   * Callers should disable interrupts before the call and enable
@@@ -630,7 -563,7 +563,7 @@@ static void acpi_processor_power_verify
         * In either case, the proper way to
         * handle BM_RLD is to set it and leave it set.
         */
-       acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1);
+       acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, 1);
  
        return;
  }
@@@ -800,9 -733,9 +733,9 @@@ static int acpi_idle_bm_check(void
  {
        u32 bm_status = 0;
  
-       acpi_get_register_unlocked(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status);
+       acpi_read_bit_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status);
        if (bm_status)
-               acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
+               acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
        /*
         * PIIX4 Erratum #18: Note that BM_STS doesn't always reflect
         * the true state of bus mastering activity; forcing us to
   */
  static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
  {
 +      u64 perf_flags;
 +
        /* Don't trace irqs off for idle */
        stop_critical_timings();
 +      perf_flags = hw_perf_save_disable();
        if (cx->entry_method == ACPI_CSTATE_FFH) {
                /* Call into architectural FFH based C-state */
                acpi_processor_ffh_cstate_enter(cx);
                   gets asserted in time to freeze execution properly. */
                unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
        }
 +      hw_perf_restore(perf_flags);
        start_critical_timings();
  }
  
  static int acpi_idle_enter_c1(struct cpuidle_device *dev,
                              struct cpuidle_state *state)
  {
-       u32 t1, t2;
+       ktime_t  kt1, kt2;
+       s64 idle_time;
        struct acpi_processor *pr;
        struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
  
                return 0;
        }
  
-       t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+       kt1 = ktime_get_real();
        acpi_idle_do_entry(cx);
-       t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+       kt2 = ktime_get_real();
+       idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
  
        local_irq_enable();
        cx->usage++;
  
-       return ticks_elapsed_in_us(t1, t2);
+       return idle_time;
  }
  
  /**
@@@ -895,8 -826,9 +830,9 @@@ static int acpi_idle_enter_simple(struc
  {
        struct acpi_processor *pr;
        struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
-       u32 t1, t2;
-       int sleep_ticks = 0;
+       ktime_t  kt1, kt2;
+       s64 idle_time;
+       s64 sleep_ticks = 0;
  
        pr = __get_cpu_var(processors);
  
        if (cx->type == ACPI_STATE_C3)
                ACPI_FLUSH_CPU_CACHE();
  
-       t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+       kt1 = ktime_get_real();
        /* Tell the scheduler that we are going deep-idle: */
        sched_clock_idle_sleep_event();
        acpi_idle_do_entry(cx);
-       t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+       kt2 = ktime_get_real();
+       idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
  
  #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
        /* TSC could halt in idle, so notify users */
        if (tsc_halts_in_c(cx->type))
                mark_tsc_unstable("TSC halts in idle");
  #endif
-       sleep_ticks = ticks_elapsed(t1, t2);
+       sleep_ticks = us_to_pm_timer_ticks(idle_time);
  
        /* Tell the scheduler how much we idled: */
        sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
  
        acpi_state_timer_broadcast(pr, cx, 0);
        cx->time += sleep_ticks;
-       return ticks_elapsed_in_us(t1, t2);
+       return idle_time;
  }
  
  static int c3_cpu_count;
@@@ -970,8 -903,10 +907,10 @@@ static int acpi_idle_enter_bm(struct cp
  {
        struct acpi_processor *pr;
        struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
-       u32 t1, t2;
-       int sleep_ticks = 0;
+       ktime_t  kt1, kt2;
+       s64 idle_time;
+       s64 sleep_ticks = 0;
  
        pr = __get_cpu_var(processors);
  
                c3_cpu_count++;
                /* Disable bus master arbitration when all CPUs are in C3 */
                if (c3_cpu_count == num_online_cpus())
-                       acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1);
+                       acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 1);
                spin_unlock(&c3_lock);
        } else if (!pr->flags.bm_check) {
                ACPI_FLUSH_CPU_CACHE();
        }
  
-       t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+       kt1 = ktime_get_real();
        acpi_idle_do_entry(cx);
-       t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+       kt2 = ktime_get_real();
+       idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
  
        /* Re-enable bus master arbitration */
        if (pr->flags.bm_check && pr->flags.bm_control) {
                spin_lock(&c3_lock);
-               acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
+               acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 0);
                c3_cpu_count--;
                spin_unlock(&c3_lock);
        }
        if (tsc_halts_in_c(ACPI_STATE_C3))
                mark_tsc_unstable("TSC halts in idle");
  #endif
-       sleep_ticks = ticks_elapsed(t1, t2);
+       sleep_ticks = us_to_pm_timer_ticks(idle_time);
        /* Tell the scheduler how much we idled: */
        sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
  
  
        acpi_state_timer_broadcast(pr, cx, 0);
        cx->time += sleep_ticks;
-       return ticks_elapsed_in_us(t1, t2);
+       return idle_time;
  }
  
  struct cpuidle_driver acpi_idle_driver = {
diff --combined drivers/char/sysrq.c
@@@ -25,7 -25,6 +25,7 @@@
  #include <linux/kbd_kern.h>
  #include <linux/proc_fs.h>
  #include <linux/quotaops.h>
 +#include <linux/perf_counter.h>
  #include <linux/kernel.h>
  #include <linux/module.h>
  #include <linux/suspend.h>
@@@ -36,7 -35,7 +36,7 @@@
  #include <linux/vt_kern.h>
  #include <linux/workqueue.h>
  #include <linux/kexec.h>
- #include <linux/irq.h>
+ #include <linux/interrupt.h>
  #include <linux/hrtimer.h>
  #include <linux/oom.h>
  
@@@ -245,7 -244,6 +245,7 @@@ static void sysrq_handle_showregs(int k
        struct pt_regs *regs = get_irq_regs();
        if (regs)
                show_regs(regs);
 +      perf_counter_print_debug();
  }
  static struct sysrq_key_op sysrq_showregs_op = {
        .handler        = sysrq_handle_showregs,
@@@ -285,7 -283,7 +285,7 @@@ static void sysrq_ftrace_dump(int key, 
  }
  static struct sysrq_key_op sysrq_ftrace_dump_op = {
        .handler        = sysrq_ftrace_dump,
-       .help_msg       = "dumpZ-ftrace-buffer",
+       .help_msg       = "dump-ftrace-buffer(Z)",
        .action_msg     = "Dump ftrace buffer",
        .enable_mask    = SYSRQ_ENABLE_DUMP,
  };
@@@ -348,6 -346,19 +348,19 @@@ static struct sysrq_key_op sysrq_moom_o
        .enable_mask    = SYSRQ_ENABLE_SIGNAL,
  };
  
+ #ifdef CONFIG_BLOCK
+ static void sysrq_handle_thaw(int key, struct tty_struct *tty)
+ {
+       emergency_thaw_all();
+ }
+ static struct sysrq_key_op sysrq_thaw_op = {
+       .handler        = sysrq_handle_thaw,
+       .help_msg       = "thaw-filesystems(J)",
+       .action_msg     = "Emergency Thaw of all frozen filesystems",
+       .enable_mask    = SYSRQ_ENABLE_SIGNAL,
+ };
+ #endif
  static void sysrq_handle_kill(int key, struct tty_struct *tty)
  {
        send_sig_all(SIGKILL);
@@@ -398,9 -409,13 +411,13 @@@ static struct sysrq_key_op *sysrq_key_t
        &sysrq_moom_op,                 /* f */
        /* g: May be registered by ppc for kgdb */
        NULL,                           /* g */
-       NULL,                           /* h */
+       NULL,                           /* h - reserved for help */
        &sysrq_kill_op,                 /* i */
+ #ifdef CONFIG_BLOCK
+       &sysrq_thaw_op,                 /* j */
+ #else
        NULL,                           /* j */
+ #endif
        &sysrq_SAK_op,                  /* k */
  #ifdef CONFIG_SMP
        &sysrq_showallcpus_op,          /* l */
diff --combined fs/exec.c
+++ b/fs/exec.c
@@@ -33,7 -33,6 +33,7 @@@
  #include <linux/string.h>
  #include <linux/init.h>
  #include <linux/pagemap.h>
 +#include <linux/perf_counter.h>
  #include <linux/highmem.h>
  #include <linux/spinlock.h>
  #include <linux/key.h>
@@@ -46,6 -45,7 +46,7 @@@
  #include <linux/proc_fs.h>
  #include <linux/mount.h>
  #include <linux/security.h>
+ #include <linux/ima.h>
  #include <linux/syscalls.h>
  #include <linux/tsacct_kern.h>
  #include <linux/cn_proc.h>
@@@ -53,6 -53,7 +54,7 @@@
  #include <linux/tracehook.h>
  #include <linux/kmod.h>
  #include <linux/fsnotify.h>
+ #include <linux/fs_struct.h>
  
  #include <asm/uaccess.h>
  #include <asm/mmu_context.h>
@@@ -128,6 -129,9 +130,9 @@@ SYSCALL_DEFINE1(uselib, const char __us
                                 MAY_READ | MAY_EXEC | MAY_OPEN);
        if (error)
                goto exit;
+       error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN);
+       if (error)
+               goto exit;
  
        file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE);
        error = PTR_ERR(file);
@@@ -675,6 -679,9 +680,9 @@@ struct file *open_exec(const char *name
        err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN);
        if (err)
                goto out_path_put;
+       err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN);
+       if (err)
+               goto out_path_put;
  
        file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE);
        if (IS_ERR(file))
@@@ -1011,13 -1018,6 +1019,13 @@@ int flush_old_exec(struct linux_binprm 
  
        current->personality &= ~bprm->per_clear;
  
 +      /*
 +       * Flush performance counters when crossing a
 +       * security domain:
 +       */
 +      if (!get_dumpable(current->mm))
 +              perf_counter_exit_task(current);
 +
        /* An exec changes our domain. We are no longer part of the thread
           group */
  
@@@ -1057,32 -1057,35 +1065,35 @@@ EXPORT_SYMBOL(install_exec_creds)
   * - the caller must hold current->cred_exec_mutex to protect against
   *   PTRACE_ATTACH
   */
- void check_unsafe_exec(struct linux_binprm *bprm, struct files_struct *files)
+ int check_unsafe_exec(struct linux_binprm *bprm)
  {
        struct task_struct *p = current, *t;
        unsigned long flags;
-       unsigned n_fs, n_files, n_sighand;
+       unsigned n_fs;
+       int res = 0;
  
        bprm->unsafe = tracehook_unsafe_exec(p);
  
        n_fs = 1;
-       n_files = 1;
-       n_sighand = 1;
+       write_lock(&p->fs->lock);
        lock_task_sighand(p, &flags);
        for (t = next_thread(p); t != p; t = next_thread(t)) {
                if (t->fs == p->fs)
                        n_fs++;
-               if (t->files == files)
-                       n_files++;
-               n_sighand++;
        }
  
-       if (atomic_read(&p->fs->count) > n_fs ||
-           atomic_read(&p->files->count) > n_files ||
-           atomic_read(&p->sighand->count) > n_sighand)
+       if (p->fs->users > n_fs) {
                bprm->unsafe |= LSM_UNSAFE_SHARE;
+       } else {
+               if (p->fs->in_exec)
+                       res = -EAGAIN;
+               p->fs->in_exec = 1;
+       }
  
        unlock_task_sighand(p, &flags);
+       write_unlock(&p->fs->lock);
+       return res;
  }
  
  /* 
@@@ -1192,6 -1195,9 +1203,9 @@@ int search_binary_handler(struct linux_
        retval = security_bprm_check(bprm);
        if (retval)
                return retval;
+       retval = ima_bprm_check(bprm);
+       if (retval)
+               return retval;
  
        /* kernel module loader fixup */
        /* so we don't try to load run modprobe in kernel space. */
@@@ -1292,17 -1298,21 +1306,21 @@@ int do_execve(char * filename
        retval = mutex_lock_interruptible(&current->cred_exec_mutex);
        if (retval < 0)
                goto out_free;
+       current->in_execve = 1;
  
        retval = -ENOMEM;
        bprm->cred = prepare_exec_creds();
        if (!bprm->cred)
                goto out_unlock;
-       check_unsafe_exec(bprm, displaced);
+       retval = check_unsafe_exec(bprm);
+       if (retval)
+               goto out_unlock;
  
        file = open_exec(filename);
        retval = PTR_ERR(file);
        if (IS_ERR(file))
-               goto out_unlock;
+               goto out_unmark;
  
        sched_exec();
  
                goto out;
  
        /* execve succeeded */
+       write_lock(&current->fs->lock);
+       current->fs->in_exec = 0;
+       write_unlock(&current->fs->lock);
+       current->in_execve = 0;
        mutex_unlock(&current->cred_exec_mutex);
        acct_update_integrals(current);
        free_bprm(bprm);
@@@ -1362,7 -1376,13 +1384,13 @@@ out_file
                fput(bprm->file);
        }
  
+ out_unmark:
+       write_lock(&current->fs->lock);
+       current->fs->in_exec = 0;
+       write_unlock(&current->fs->lock);
  out_unlock:
+       current->in_execve = 0;
        mutex_unlock(&current->cred_exec_mutex);
  
  out_free:
@@@ -120,16 -120,6 +120,16 @@@ extern struct group_info init_groups
  
  extern struct cred init_cred;
  
 +#ifdef CONFIG_PERF_COUNTERS
 +# define INIT_PERF_COUNTERS(tsk)                                      \
 +      .perf_counter_ctx.counter_list =                                \
 +              LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list),      \
 +      .perf_counter_ctx.lock =                                        \
 +              __SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock),
 +#else
 +# define INIT_PERF_COUNTERS(tsk)
 +#endif
 +
  /*
   *  INIT_TASK is used to set up the first task table, touch at
   * your own risk!. Base=0, limit=0x1fffff (=2MB)
                .nr_cpus_allowed = NR_CPUS,                             \
        },                                                              \
        .tasks          = LIST_HEAD_INIT(tsk.tasks),                    \
+       .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \
        .ptraced        = LIST_HEAD_INIT(tsk.ptraced),                  \
        .ptrace_entry   = LIST_HEAD_INIT(tsk.ptrace_entry),             \
        .real_parent    = &tsk,                                         \
        INIT_IDS                                                        \
        INIT_TRACE_IRQFLAGS                                             \
        INIT_LOCKDEP                                                    \
 +      INIT_PERF_COUNTERS(tsk)                                         \
  }
  
  
@@@ -28,7 -28,7 +28,7 @@@ struct cpu_usage_stat 
  
  struct kernel_stat {
        struct cpu_usage_stat   cpustat;
- #ifndef CONFIG_SPARSE_IRQ
+ #ifndef CONFIG_GENERIC_HARDIRQS
         unsigned int irqs[NR_IRQS];
  #endif
  };
@@@ -41,7 -41,7 +41,7 @@@ DECLARE_PER_CPU(struct kernel_stat, kst
  
  extern unsigned long long nr_context_switches(void);
  
- #ifndef CONFIG_SPARSE_IRQ
+ #ifndef CONFIG_GENERIC_HARDIRQS
  #define kstat_irqs_this_cpu(irq) \
        (kstat_this_cpu.irqs[irq])
  
@@@ -52,16 -52,19 +52,19 @@@ static inline void kstat_incr_irqs_this
  {
        kstat_this_cpu.irqs[irq]++;
  }
- #endif
  
- #ifndef CONFIG_SPARSE_IRQ
  static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
  {
         return kstat_cpu(cpu).irqs[irq];
  }
  #else
+ #include <linux/irq.h>
  extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu);
+ #define kstat_irqs_this_cpu(DESC) \
+       ((DESC)->kstat_irqs[smp_processor_id()])
+ #define kstat_incr_irqs_this_cpu(irqno, DESC) \
+       ((DESC)->kstat_irqs[smp_processor_id()]++)
  #endif
  
  /*
@@@ -78,15 -81,7 +81,15 @@@ static inline unsigned int kstat_irqs(u
        return sum;
  }
  
 +
 +/*
 + * Lock/unlock the current runqueue - to extract task statistics:
 + */
 +extern void curr_rq_lock_irq_save(unsigned long *flags);
 +extern void curr_rq_unlock_irq_restore(unsigned long *flags);
 +extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update);
  extern unsigned long long task_delta_exec(struct task_struct *);
 +
  extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
  extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);
  extern void account_steal_time(cputime_t);
diff --combined include/linux/sched.h
@@@ -68,10 -68,9 +68,10 @@@ struct sched_param 
  #include <linux/smp.h>
  #include <linux/sem.h>
  #include <linux/signal.h>
- #include <linux/fs_struct.h>
+ #include <linux/path.h>
  #include <linux/compiler.h>
  #include <linux/completion.h>
 +#include <linux/perf_counter.h>
  #include <linux/pid.h>
  #include <linux/percpu.h>
  #include <linux/topology.h>
@@@ -98,6 -97,7 +98,7 @@@ struct futex_pi_state
  struct robust_list_head;
  struct bio;
  struct bts_tracer;
+ struct fs_struct;
  
  /*
   * List of flags we want to share for kernel threads,
@@@ -137,9 -137,9 +138,11 @@@ extern unsigned long nr_running(void)
  extern unsigned long nr_uninterruptible(void);
  extern unsigned long nr_active(void);
  extern unsigned long nr_iowait(void);
 +extern u64 cpu_nr_switches(int cpu);
 +extern u64 cpu_nr_migrations(int cpu);
  
+ extern unsigned long get_parent_ip(unsigned long addr);
  struct seq_file;
  struct cfs_rq;
  struct task_group;
@@@ -334,7 -334,9 +337,9 @@@ extern signed long schedule_timeout(sig
  extern signed long schedule_timeout_interruptible(signed long timeout);
  extern signed long schedule_timeout_killable(signed long timeout);
  extern signed long schedule_timeout_uninterruptible(signed long timeout);
+ asmlinkage void __schedule(void);
  asmlinkage void schedule(void);
+ extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner);
  
  struct nsproxy;
  struct user_namespace;
@@@ -392,8 -394,15 +397,15 @@@ extern void arch_unmap_area_topdown(str
                (mm)->hiwater_vm = (mm)->total_vm;      \
  } while (0)
  
- #define get_mm_hiwater_rss(mm)        max((mm)->hiwater_rss, get_mm_rss(mm))
- #define get_mm_hiwater_vm(mm) max((mm)->hiwater_vm, (mm)->total_vm)
+ static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
+ {
+       return max(mm->hiwater_rss, get_mm_rss(mm));
+ }
+ static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm)
+ {
+       return max(mm->hiwater_vm, mm->total_vm);
+ }
  
  extern void set_dumpable(struct mm_struct *mm, int value);
  extern int get_dumpable(struct mm_struct *mm);
@@@ -541,25 -550,8 +553,8 @@@ struct signal_struct 
  
        struct list_head cpu_timers[3];
  
-       /* job control IDs */
-       /*
-        * pgrp and session fields are deprecated.
-        * use the task_session_Xnr and task_pgrp_Xnr routines below
-        */
-       union {
-               pid_t pgrp __deprecated;
-               pid_t __pgrp;
-       };
        struct pid *tty_old_pgrp;
  
-       union {
-               pid_t session __deprecated;
-               pid_t __session;
-       };
        /* boolean value for session group leader */
        int leader;
  
@@@ -1001,6 -993,7 +996,7 @@@ struct sched_class 
                              struct rq *busiest, struct sched_domain *sd,
                              enum cpu_idle_type idle);
        void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
+       int (*needs_post_schedule) (struct rq *this_rq);
        void (*post_schedule) (struct rq *this_rq);
        void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
  
@@@ -1055,8 -1048,10 +1051,11 @@@ struct sched_entity 
        u64                     last_wakeup;
        u64                     avg_overlap;
  
 -      u64                     nr_migrations;
 +      u64                     nr_migrations;
 +
+       u64                     start_runtime;
+       u64                     avg_wakeup;
  #ifdef CONFIG_SCHEDSTATS
        u64                     wait_start;
        u64                     wait_max;
@@@ -1168,6 -1163,7 +1167,7 @@@ struct task_struct 
  #endif
  
        struct list_head tasks;
+       struct plist_node pushable_tasks;
  
        struct mm_struct *mm, *active_mm;
  
        /* ??? */
        unsigned int personality;
        unsigned did_exec:1;
+       unsigned in_execve:1;   /* Tell the LSMs that the process is doing an
+                                * execve */
        pid_t pid;
        pid_t tgid;
  
        int lockdep_depth;
        unsigned int lockdep_recursion;
        struct held_lock held_locks[MAX_LOCK_DEPTH];
+       gfp_t lockdep_reclaim_gfp;
  #endif
  
  /* journalling filesystem info */
        struct list_head pi_state_list;
        struct futex_pi_state *pi_state_cache;
  #endif
 +      struct perf_counter_context perf_counter_ctx;
  #ifdef CONFIG_NUMA
        struct mempolicy *mempolicy;
        short il_next;
        int curr_ret_stack;
        /* Stack of return addresses for return function tracing */
        struct ftrace_ret_stack *ret_stack;
+       /* time stamp for last schedule */
+       unsigned long long ftrace_timestamp;
        /*
         * Number of functions that haven't been traced
         * because of depth overrun.
  #endif
  };
  
+ /* Future-safe accessor for struct task_struct's cpus_allowed. */
+ #define tsk_cpumask(tsk) (&(tsk)->cpus_allowed)
  /*
   * Priority of a process goes from 0..MAX_PRIO-1, valid RT
   * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
@@@ -1454,16 -1457,6 +1462,6 @@@ static inline int rt_task(struct task_s
        return rt_prio(p->prio);
  }
  
- static inline void set_task_session(struct task_struct *tsk, pid_t session)
- {
-       tsk->signal->__session = session;
- }
- static inline void set_task_pgrp(struct task_struct *tsk, pid_t pgrp)
- {
-       tsk->signal->__pgrp = pgrp;
- }
  static inline struct pid *task_pid(struct task_struct *task)
  {
        return task->pids[PIDTYPE_PID].pid;
@@@ -1474,6 -1467,11 +1472,11 @@@ static inline struct pid *task_tgid(str
        return task->group_leader->pids[PIDTYPE_PID].pid;
  }
  
+ /*
+  * Without tasklist or rcu lock it is not safe to dereference
+  * the result of task_pgrp/task_session even if task == current,
+  * we can race with another thread doing sys_setsid/sys_setpgid.
+  */
  static inline struct pid *task_pgrp(struct task_struct *task)
  {
        return task->group_leader->pids[PIDTYPE_PGID].pid;
@@@ -1499,17 -1497,23 +1502,23 @@@ struct pid_namespace
   *
   * see also pid_nr() etc in include/linux/pid.h
   */
+ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
+                       struct pid_namespace *ns);
  
  static inline pid_t task_pid_nr(struct task_struct *tsk)
  {
        return tsk->pid;
  }
  
- pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
+ static inline pid_t task_pid_nr_ns(struct task_struct *tsk,
+                                       struct pid_namespace *ns)
+ {
+       return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
+ }
  
  static inline pid_t task_pid_vnr(struct task_struct *tsk)
  {
-       return pid_vnr(task_pid(tsk));
+       return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
  }
  
  
@@@ -1526,31 -1530,34 +1535,34 @@@ static inline pid_t task_tgid_vnr(struc
  }
  
  
- static inline pid_t task_pgrp_nr(struct task_struct *tsk)
+ static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk,
+                                       struct pid_namespace *ns)
  {
-       return tsk->signal->__pgrp;
+       return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
  }
  
- pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
  static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
  {
-       return pid_vnr(task_pgrp(tsk));
+       return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
  }
  
  
- static inline pid_t task_session_nr(struct task_struct *tsk)
+ static inline pid_t task_session_nr_ns(struct task_struct *tsk,
+                                       struct pid_namespace *ns)
  {
-       return tsk->signal->__session;
+       return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
  }
  
- pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
  static inline pid_t task_session_vnr(struct task_struct *tsk)
  {
-       return pid_vnr(task_session(tsk));
+       return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
  }
  
+ /* obsolete, do not use */
+ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
+ {
+       return task_pgrp_nr_ns(tsk, &init_pid_ns);
+ }
  
  /**
   * pid_alive - check that a task structure is not stale
@@@ -1674,6 -1681,16 +1686,16 @@@ static inline int set_cpus_allowed(stru
        return set_cpus_allowed_ptr(p, &new_mask);
  }
  
+ /*
+  * Architectures can set this to 1 if they have specified
+  * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
+  * but then during bootup it turns out that sched_clock()
+  * is reliable after all:
+  */
+ #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+ extern int sched_clock_stable;
+ #endif
  extern unsigned long long sched_clock(void);
  
  extern void sched_clock_init(void);
@@@ -1950,7 -1967,8 +1972,8 @@@ extern void mm_release(struct task_stru
  /* Allocate a new mm structure and copy contents from tsk->mm */
  extern struct mm_struct *dup_mm(struct task_struct *tsk);
  
- extern int  copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
+ extern int copy_thread(unsigned long, unsigned long, unsigned long,
+                       struct task_struct *, struct pt_regs *);
  extern void flush_thread(void);
  extern void exit_thread(void);
  
@@@ -2035,6 -2053,11 +2058,11 @@@ static inline int thread_group_empty(st
  #define delay_group_leader(p) \
                (thread_group_leader(p) && !thread_group_empty(p))
  
+ static inline int task_detached(struct task_struct *p)
+ {
+       return p->exit_signal == -1;
+ }
  /*
   * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
   * subscriptions and synchronises with wait4().  Also used in procfs.  Also
@@@ -2357,13 -2380,6 +2385,13 @@@ static inline void inc_syscw(struct tas
  #define TASK_SIZE_OF(tsk)     TASK_SIZE
  #endif
  
 +/*
 + * Call the function if the target task is executing on a CPU right now:
 + */
 +extern void task_oncpu_function_call(struct task_struct *p,
 +                                   void (*func) (void *info), void *info);
 +
 +
  #ifdef CONFIG_MM_OWNER
  extern void mm_update_next_owner(struct mm_struct *mm);
  extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
diff --combined include/linux/syscalls.h
@@@ -55,7 -55,6 +55,7 @@@ struct compat_timeval
  struct robust_list_head;
  struct getcpu_cache;
  struct old_linux_dirent;
 +struct perf_counter_hw_event;
  
  #include <linux/types.h>
  #include <linux/aio_abi.h>
@@@ -66,6 -65,7 +66,7 @@@
  #include <asm/signal.h>
  #include <linux/quota.h>
  #include <linux/key.h>
+ #include <linux/ftrace.h>
  
  #define __SC_DECL1(t1, a1)    t1 a1
  #define __SC_DECL2(t2, a2, ...) t2 a2, __SC_DECL1(__VA_ARGS__)
  #define __SC_TEST5(t5, a5, ...)       __SC_TEST(t5); __SC_TEST4(__VA_ARGS__)
  #define __SC_TEST6(t6, a6, ...)       __SC_TEST(t6); __SC_TEST5(__VA_ARGS__)
  
+ #ifdef CONFIG_FTRACE_SYSCALLS
+ #define __SC_STR_ADECL1(t, a)         #a
+ #define __SC_STR_ADECL2(t, a, ...)    #a, __SC_STR_ADECL1(__VA_ARGS__)
+ #define __SC_STR_ADECL3(t, a, ...)    #a, __SC_STR_ADECL2(__VA_ARGS__)
+ #define __SC_STR_ADECL4(t, a, ...)    #a, __SC_STR_ADECL3(__VA_ARGS__)
+ #define __SC_STR_ADECL5(t, a, ...)    #a, __SC_STR_ADECL4(__VA_ARGS__)
+ #define __SC_STR_ADECL6(t, a, ...)    #a, __SC_STR_ADECL5(__VA_ARGS__)
+ #define __SC_STR_TDECL1(t, a)         #t
+ #define __SC_STR_TDECL2(t, a, ...)    #t, __SC_STR_TDECL1(__VA_ARGS__)
+ #define __SC_STR_TDECL3(t, a, ...)    #t, __SC_STR_TDECL2(__VA_ARGS__)
+ #define __SC_STR_TDECL4(t, a, ...)    #t, __SC_STR_TDECL3(__VA_ARGS__)
+ #define __SC_STR_TDECL5(t, a, ...)    #t, __SC_STR_TDECL4(__VA_ARGS__)
+ #define __SC_STR_TDECL6(t, a, ...)    #t, __SC_STR_TDECL5(__VA_ARGS__)
+ #define SYSCALL_METADATA(sname, nb)                           \
+       static const struct syscall_metadata __used             \
+         __attribute__((__aligned__(4)))                       \
+         __attribute__((section("__syscalls_metadata")))       \
+         __syscall_meta_##sname = {                            \
+               .name           = "sys"#sname,                  \
+               .nb_args        = nb,                           \
+               .types          = types_##sname,                \
+               .args           = args_##sname,                 \
+       }
+ #define SYSCALL_DEFINE0(sname)                                        \
+       static const struct syscall_metadata __used             \
+         __attribute__((__aligned__(4)))                       \
+         __attribute__((section("__syscalls_metadata")))       \
+         __syscall_meta_##sname = {                            \
+               .name           = "sys_"#sname,                 \
+               .nb_args        = 0,                            \
+       };                                                      \
+       asmlinkage long sys_##sname(void)
+ #else
  #define SYSCALL_DEFINE0(name)    asmlinkage long sys_##name(void)
+ #endif
  #define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
  #define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
  #define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
  #endif
  #endif
  
+ #ifdef CONFIG_FTRACE_SYSCALLS
+ #define SYSCALL_DEFINEx(x, sname, ...)                                \
+       static const char *types_##sname[] = {                  \
+               __SC_STR_TDECL##x(__VA_ARGS__)                  \
+       };                                                      \
+       static const char *args_##sname[] = {                   \
+               __SC_STR_ADECL##x(__VA_ARGS__)                  \
+       };                                                      \
+       SYSCALL_METADATA(sname, x);                             \
+       __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
+ #else
+ #define SYSCALL_DEFINEx(x, sname, ...)                                \
+       __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
+ #endif
  #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
  
  #define SYSCALL_DEFINE(name) static inline long SYSC_##name
- #define SYSCALL_DEFINEx(x, name, ...)                                 \
+ #define __SYSCALL_DEFINEx(x, name, ...)                                       \
        asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__));           \
        static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__));       \
        asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__))            \
  #else /* CONFIG_HAVE_SYSCALL_WRAPPERS */
  
  #define SYSCALL_DEFINE(name) asmlinkage long sys_##name
- #define SYSCALL_DEFINEx(x, name, ...)                                 \
+ #define __SYSCALL_DEFINEx(x, name, ...)                                       \
        asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__))
  
  #endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */
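
The __SC_STR_ADECLn/__SC_STR_TDECLn macros added above turn each syscall's parameter
types and names into string arrays that the SYSCALL_METADATA record points at. A minimal
user-space sketch of the same stringification trick follows; the macro and function names
(DEMO_SYSCALL_DEFINE2, STR_*, demo_sys_kill) are simplified stand-ins, not the kernel's:

#include <stdio.h>

/* Simplified stand-ins for the two-argument __SC_STR_ADECL/__SC_STR_TDECL pair */
#define STR_ADECL2(t2, a2, t1, a1)      #a2, #a1
#define STR_TDECL2(t2, a2, t1, a1)      #t2, #t1

#define DEMO_SYSCALL_DEFINE2(name, ...)                                   \
        static const char *types_##name[] = { STR_TDECL2(__VA_ARGS__) }; \
        static const char *args_##name[]  = { STR_ADECL2(__VA_ARGS__) }; \
        static long demo_sys_##name(void)

DEMO_SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
{
        return 0;               /* body unused; the generated arrays are the point */
}

int main(void)
{
        /* prints: sys_kill(pid_t pid, int sig) -> 0 */
        printf("sys_kill(%s %s, %s %s) -> %ld\n",
               types_kill[0], args_kill[0], types_kill[1], args_kill[1],
               demo_sys_kill());
        return 0;
}
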
@@@ -462,6 -517,10 +518,10 @@@ asmlinkage long sys_pread64(unsigned in
                            size_t count, loff_t pos);
  asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf,
                             size_t count, loff_t pos);
+ asmlinkage long sys_preadv(unsigned long fd, const struct iovec __user *vec,
+                          unsigned long vlen, unsigned long pos_l, unsigned long pos_h);
+ asmlinkage long sys_pwritev(unsigned long fd, const struct iovec __user *vec,
+                           unsigned long vlen, unsigned long pos_l, unsigned long pos_h);
  asmlinkage long sys_getcwd(char __user *buf, unsigned long size);
  asmlinkage long sys_mkdir(const char __user *pathname, int mode);
  asmlinkage long sys_chdir(const char __user *filename);
@@@ -695,8 -754,4 +755,8 @@@ asmlinkage long sys_pipe(int __user *)
  
  int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
  
 +
 +asmlinkage long sys_perf_counter_open(
 +              const struct perf_counter_hw_event __user *hw_event_uptr,
 +              pid_t pid, int cpu, int group_fd, unsigned long flags);
  #endif
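
Until a libc wrapper exists, a user-space caller of the sys_perf_counter_open() prototype
declared above would go through syscall(2). The sketch below only illustrates that calling
pattern: the __NR_perf_counter_open fallback value and the zero-filled stand-in buffer for
struct perf_counter_hw_event are placeholders, not the real ABI, and a real caller must take
the syscall number and structure layout from this branch's headers.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>

#ifndef __NR_perf_counter_open
#define __NR_perf_counter_open -1       /* placeholder; use the number from unistd.h */
#endif

int main(void)
{
        unsigned char hw_event[128];    /* stand-in for struct perf_counter_hw_event */
        long fd;

        memset(hw_event, 0, sizeof(hw_event));
        /* (hw_event, pid, cpu, group_fd, flags), matching the prototype above;
         * the argument values here are purely illustrative. */
        fd = syscall(__NR_perf_counter_open, hw_event, (pid_t)0, -1, -1, 0UL);
        if (fd < 0) {
                perror("perf_counter_open");
                return 1;
        }
        close((int)fd);
        return 0;
}
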
diff --combined init/Kconfig
@@@ -101,6 -101,66 +101,66 @@@ config LOCALVERSION_AUT
  
          which is done within the script "scripts/setlocalversion".)
  
+ config HAVE_KERNEL_GZIP
+       bool
+ config HAVE_KERNEL_BZIP2
+       bool
+ config HAVE_KERNEL_LZMA
+       bool
+ choice
+       prompt "Kernel compression mode"
+       default KERNEL_GZIP
+       depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA
+       help
+         The Linux kernel is a kind of self-extracting executable.
+         Several compression algorithms are available, which differ
+         in efficiency, compression and decompression speed.
+         Compression speed is only relevant when building a kernel.
+         Decompression speed is relevant at each boot.
+         If you have any problems with bzip2 or lzma compressed
+         kernels, mail me (Alain Knaff) <alain@knaff.lu>. (An older
+         version of this functionality (bzip2 only), for 2.4, was
+         supplied by Christian Ludwig)
+         High compression options are mostly useful for users who are
+         low on disk space (embedded systems) but for whom RAM size
+         matters less.
+         If in doubt, select 'gzip'.
+ config KERNEL_GZIP
+       bool "Gzip"
+       depends on HAVE_KERNEL_GZIP
+       help
+         The old and tried gzip compression. Its compression ratio is
+         the poorest among the 3 choices; however its speed (both
+         compression and decompression) is the fastest.
+ config KERNEL_BZIP2
+       bool "Bzip2"
+       depends on HAVE_KERNEL_BZIP2
+       help
+         Its compression ratio and speed are intermediate.
+         Decompression speed is slowest among the three.  The kernel
+         size is about 10% smaller with bzip2, in comparison to gzip.
+         Bzip2 uses a large amount of memory. For modern kernels you
+         will need at least 8MB of RAM to boot.
+ config KERNEL_LZMA
+       bool "LZMA"
+       depends on HAVE_KERNEL_LZMA
+       help
+         The most recent compression algorithm.
+         Its compression ratio is the best; decompression speed is between
+         the other two. Compression is slowest.  The kernel size is about 33%
+         smaller with LZMA in comparison to gzip.
+ endchoice
  config SWAP
        bool "Support for paging of anonymous memory (swap)"
        depends on MMU && BLOCK
@@@ -471,7 -531,7 +531,7 @@@ config CGROUP_DEVIC
  
  config CPUSETS
        bool "Cpuset support"
-       depends on SMP && CGROUPS
+       depends on CGROUPS
        help
          This option will let you create and manage CPUSETs which
          allow dynamically partitioning a system into sets of CPUs and
@@@ -505,7 -565,7 +565,7 @@@ config CGROUP_MEM_RES_CTL
        select MM_OWNER
        help
          Provides a memory resource controller that manages both anonymous
-         memory and page cache. (See Documentation/controllers/memory.txt)
+         memory and page cache. (See Documentation/cgroups/memory.txt)
  
          Note that setting this option increases fixed memory overhead
          associated with each page of memory in the system. By this,
@@@ -537,6 -597,8 +597,8 @@@ config CGROUP_MEM_RES_CTLR_SWA
          is disabled by boot option, this will be automatically disabled and
          there will be no overhead from this. Even when you set this config=y,
          if boot option "noswapaccount" is set, swap will not be accounted.
+         The memory usage of swap_cgroup is 2 bytes per entry. With a
+         4096-byte swap page size, that is 512KB per 1GB of swap.
  
  endif # CGROUPS
  
@@@ -627,7 -689,7 +689,7 @@@ config PID_N
        depends on NAMESPACES && EXPERIMENTAL
        help
          Support process id namespaces.  This allows having multiple
-         process with the same pid as long as they are in different
+         processes with the same pid as long as they are in different
          pid namespaces.  This is a building block of containers.
  
          Unless you want to work with an experimental feature
@@@ -675,6 -737,9 +737,9 @@@ config CC_OPTIMIZE_FOR_SIZ
  config SYSCTL
        bool
  
+ config ANON_INODES
+       bool
  menuconfig EMBEDDED
        bool "Configure standard kernel features (for small systems)"
        help
@@@ -780,18 -845,6 +845,6 @@@ config PCSPKR_PLATFOR
            This option allows you to disable the internal PC-Speaker
            support, saving some memory.
  
- config COMPAT_BRK
-       bool "Disable heap randomization"
-       default y
-       help
-         Randomizing heap placement makes heap exploits harder, but it
-         also breaks ancient binaries (including anything libc5 based).
-         This option changes the bootup default to heap randomization
-         disabled, and can be overriden runtime by setting
-         /proc/sys/kernel/randomize_va_space to 2.
-         On non-ancient distros (post-2000 ones) N is usually a safe choice.
  config BASE_FULL
        default y
        bool "Enable full-sized data structures for core" if EMBEDDED
@@@ -809,9 -862,6 +862,6 @@@ config FUTE
          support for "fast userspace mutexes".  The resulting kernel may not
          run glibc-based applications correctly.
  
- config ANON_INODES
-       bool
  config EPOLL
        bool "Enable eventpoll support" if EMBEDDED
        default y
@@@ -869,36 -919,6 +919,36 @@@ config AI
            by some high performance threaded applications. Disabling
            this option saves about 7k.
  
 +config HAVE_PERF_COUNTERS
 +      bool
 +
 +menu "Performance Counters"
 +
 +config PERF_COUNTERS
 +      bool "Kernel Performance Counters"
 +      depends on HAVE_PERF_COUNTERS
 +      default y
 +      select ANON_INODES
 +      help
 +        Enable kernel support for performance counter hardware.
 +
 +        Performance counters are special hardware registers available
 +        on most modern CPUs. These registers count the number of certain
 +        types of hardware events - such as instructions executed, cache
 +        misses suffered, or branches mispredicted - without slowing down
 +        the kernel or applications. These registers can also trigger
 +        interrupts when a threshold number of events has passed - and can
 +        thus be used to profile the code that runs on that CPU.
 +
 +        The Linux Performance Counter subsystem provides an abstraction of
 +        these hardware capabilities, available via a system call. It
 +        provides per-task and per-CPU counters, and event capabilities
 +        on top of those.
 +
 +        Say Y if unsure.
 +
 +endmenu
 +
  config VM_EVENT_COUNTERS
        default y
        bool "Enable VM event counters for /proc/vmstat" if EMBEDDED
@@@ -927,6 -947,18 +977,18 @@@ config SLUB_DEBU
          SLUB sysfs support. /sys/slab will not exist and there will be
          no support for cache validation etc.
  
+ config COMPAT_BRK
+       bool "Disable heap randomization"
+       default y
+       help
+         Randomizing heap placement makes heap exploits harder, but it
+         also breaks ancient binaries (including anything libc5 based).
+         This option changes the bootup default to heap randomization
+         disabled, and can be overridden at runtime by setting
+         /proc/sys/kernel/randomize_va_space to 2.
+         On non-ancient distros (post-2000 ones) N is usually a safe choice.
  choice
        prompt "Choose SLAB allocator"
        default SLUB
@@@ -975,13 -1007,25 +1037,25 @@@ config TRACEPOINT
  
  config MARKERS
        bool "Activate markers"
-       depends on TRACEPOINTS
+       select TRACEPOINTS
        help
          Place an empty function call at each marker site. Can be
          dynamically changed for a probe function.
  
  source "arch/Kconfig"
  
+ config SLOW_WORK
+       default n
+       bool "Enable slow work thread pool"
+       help
+         The slow work thread pool provides a number of dynamically allocated
+         threads that can be used by the kernel to perform operations that
+         take a relatively long time.
+         An example of this would be CacheFiles doing a path lookup followed
+         by a series of mkdirs and a create call, all of which have to touch
+         disk.
  endmenu               # General setup
  
  config HAVE_GENERIC_DMA_COHERENT
@@@ -996,7 -1040,6 +1070,6 @@@ config SLABINF
  
  config RT_MUTEXES
        boolean
-       select PLIST
  
  config BASE_SMALL
        int
@@@ -1081,7 -1124,7 +1154,7 @@@ config INIT_ALL_POSSIBL
          cpu_possible_map, some of them chose to initialize cpu_possible_map
          with all 1s, and others with all 0s.  When they were centralised,
          it was better to provide this option than to break all the archs
-         and have several arch maintainers persuing me down dark alleys.
+         and have several arch maintainers pursuing me down dark alleys.
  
  config STOP_MACHINE
        bool
diff --combined kernel/Makefile
@@@ -93,7 -93,7 +93,8 @@@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT
  obj-$(CONFIG_FUNCTION_TRACER) += trace/
  obj-$(CONFIG_TRACING) += trace/
  obj-$(CONFIG_SMP) += sched_cpupri.o
+ obj-$(CONFIG_SLOW_WORK) += slow-work.o
 +obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
  
  ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
  # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --combined kernel/exit.c
@@@ -46,6 -46,7 +46,7 @@@
  #include <linux/blkdev.h>
  #include <linux/task_io_accounting_ops.h>
  #include <linux/tracehook.h>
+ #include <linux/fs_struct.h>
  #include <linux/init_task.h>
  #include <trace/sched.h>
  
@@@ -61,11 -62,6 +62,6 @@@ DEFINE_TRACE(sched_process_wait)
  
  static void exit_mm(struct task_struct * tsk);
  
- static inline int task_detached(struct task_struct *p)
- {
-       return p->exit_signal == -1;
- }
  static void __unhash_process(struct task_struct *p)
  {
        nr_threads--;
@@@ -162,9 -158,6 +158,9 @@@ static void delayed_put_task_struct(str
  {
        struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
  
 +#ifdef CONFIG_PERF_COUNTERS
 +      WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list));
 +#endif
        trace_sched_process_free(tsk);
        put_task_struct(tsk);
  }
@@@ -365,16 -358,12 +361,12 @@@ static void reparent_to_kthreadd(void
  void __set_special_pids(struct pid *pid)
  {
        struct task_struct *curr = current->group_leader;
-       pid_t nr = pid_nr(pid);
  
-       if (task_session(curr) != pid) {
+       if (task_session(curr) != pid)
                change_pid(curr, PIDTYPE_SID, pid);
-               set_task_session(curr, nr);
-       }
-       if (task_pgrp(curr) != pid) {
+       if (task_pgrp(curr) != pid)
                change_pid(curr, PIDTYPE_PGID, pid);
-               set_task_pgrp(curr, nr);
-       }
  }
  
  static void set_special_pids(struct pid *pid)
@@@ -432,7 -421,6 +424,6 @@@ EXPORT_SYMBOL(disallow_signal)
  void daemonize(const char *name, ...)
  {
        va_list args;
-       struct fs_struct *fs;
        sigset_t blocked;
  
        va_start(args, name);
  
        /* Become as one with the init task */
  
-       exit_fs(current);       /* current->fs->count--; */
-       fs = init_task.fs;
-       current->fs = fs;
-       atomic_inc(&fs->count);
+       daemonize_fs_struct();
        exit_files(current);
        current->files = init_task.files;
        atomic_inc(&current->files->count);
@@@ -568,30 -552,6 +555,6 @@@ void exit_files(struct task_struct *tsk
        }
  }
  
- void put_fs_struct(struct fs_struct *fs)
- {
-       /* No need to hold fs->lock if we are killing it */
-       if (atomic_dec_and_test(&fs->count)) {
-               path_put(&fs->root);
-               path_put(&fs->pwd);
-               kmem_cache_free(fs_cachep, fs);
-       }
- }
- void exit_fs(struct task_struct *tsk)
- {
-       struct fs_struct * fs = tsk->fs;
-       if (fs) {
-               task_lock(tsk);
-               tsk->fs = NULL;
-               task_unlock(tsk);
-               put_fs_struct(fs);
-       }
- }
- EXPORT_SYMBOL_GPL(exit_fs);
  #ifdef CONFIG_MM_OWNER
  /*
   * Task p is exiting and it owned mm, lets find a new owner for it
@@@ -735,119 -695,6 +698,6 @@@ static void exit_mm(struct task_struct 
  }
  
  /*
-  * Return nonzero if @parent's children should reap themselves.
-  *
-  * Called with write_lock_irq(&tasklist_lock) held.
-  */
- static int ignoring_children(struct task_struct *parent)
- {
-       int ret;
-       struct sighand_struct *psig = parent->sighand;
-       unsigned long flags;
-       spin_lock_irqsave(&psig->siglock, flags);
-       ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
-              (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT));
-       spin_unlock_irqrestore(&psig->siglock, flags);
-       return ret;
- }
- /*
-  * Detach all tasks we were using ptrace on.
-  * Any that need to be release_task'd are put on the @dead list.
-  *
-  * Called with write_lock(&tasklist_lock) held.
-  */
- static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
- {
-       struct task_struct *p, *n;
-       int ign = -1;
-       list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) {
-               __ptrace_unlink(p);
-               if (p->exit_state != EXIT_ZOMBIE)
-                       continue;
-               /*
-                * If it's a zombie, our attachedness prevented normal
-                * parent notification or self-reaping.  Do notification
-                * now if it would have happened earlier.  If it should
-                * reap itself, add it to the @dead list.  We can't call
-                * release_task() here because we already hold tasklist_lock.
-                *
-                * If it's our own child, there is no notification to do.
-                * But if our normal children self-reap, then this child
-                * was prevented by ptrace and we must reap it now.
-                */
-               if (!task_detached(p) && thread_group_empty(p)) {
-                       if (!same_thread_group(p->real_parent, parent))
-                               do_notify_parent(p, p->exit_signal);
-                       else {
-                               if (ign < 0)
-                                       ign = ignoring_children(parent);
-                               if (ign)
-                                       p->exit_signal = -1;
-                       }
-               }
-               if (task_detached(p)) {
-                       /*
-                        * Mark it as in the process of being reaped.
-                        */
-                       p->exit_state = EXIT_DEAD;
-                       list_add(&p->ptrace_entry, dead);
-               }
-       }
- }
- /*
-  * Finish up exit-time ptrace cleanup.
-  *
-  * Called without locks.
-  */
- static void ptrace_exit_finish(struct task_struct *parent,
-                              struct list_head *dead)
- {
-       struct task_struct *p, *n;
-       BUG_ON(!list_empty(&parent->ptraced));
-       list_for_each_entry_safe(p, n, dead, ptrace_entry) {
-               list_del_init(&p->ptrace_entry);
-               release_task(p);
-       }
- }
- static void reparent_thread(struct task_struct *p, struct task_struct *father)
- {
-       if (p->pdeath_signal)
-               /* We already hold the tasklist_lock here.  */
-               group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
-       list_move_tail(&p->sibling, &p->real_parent->children);
-       /* If this is a threaded reparent there is no need to
-        * notify anyone anything has happened.
-        */
-       if (same_thread_group(p->real_parent, father))
-               return;
-       /* We don't want people slaying init.  */
-       if (!task_detached(p))
-               p->exit_signal = SIGCHLD;
-       /* If we'd notified the old parent about this child's death,
-        * also notify the new parent.
-        */
-       if (!ptrace_reparented(p) &&
-           p->exit_state == EXIT_ZOMBIE &&
-           !task_detached(p) && thread_group_empty(p))
-               do_notify_parent(p, p->exit_signal);
-       kill_orphaned_pgrp(p, father);
- }
- /*
   * When we die, we re-parent all our children.
   * Try to give them to another thread in our thread
   * group, and if no such member exists, give it to
@@@ -886,17 -733,51 +736,51 @@@ static struct task_struct *find_new_rea
        return pid_ns->child_reaper;
  }
  
+ /*
+  * Any that need to be release_task'd are put on the @dead list.
+  */
+ static void reparent_thread(struct task_struct *father, struct task_struct *p,
+                               struct list_head *dead)
+ {
+       if (p->pdeath_signal)
+               group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
+       list_move_tail(&p->sibling, &p->real_parent->children);
+       if (task_detached(p))
+               return;
+       /*
+        * If this is a threaded reparent there is no need to
+        * notify anyone anything has happened.
+        */
+       if (same_thread_group(p->real_parent, father))
+               return;
+       /* We don't want people slaying init.  */
+       p->exit_signal = SIGCHLD;
+       /* If it has exited notify the new parent about this child's death. */
+       if (!p->ptrace &&
+           p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
+               do_notify_parent(p, p->exit_signal);
+               if (task_detached(p)) {
+                       p->exit_state = EXIT_DEAD;
+                       list_move_tail(&p->sibling, dead);
+               }
+       }
+       kill_orphaned_pgrp(p, father);
+ }
  static void forget_original_parent(struct task_struct *father)
  {
        struct task_struct *p, *n, *reaper;
-       LIST_HEAD(ptrace_dead);
+       LIST_HEAD(dead_children);
+       exit_ptrace(father);
  
        write_lock_irq(&tasklist_lock);
        reaper = find_new_reaper(father);
-       /*
-        * First clean up ptrace if we were using it.
-        */
-       ptrace_exit(father, &ptrace_dead);
  
        list_for_each_entry_safe(p, n, &father->children, sibling) {
                p->real_parent = reaper;
                        BUG_ON(p->ptrace);
                        p->parent = p->real_parent;
                }
-               reparent_thread(p, father);
+               reparent_thread(father, p, &dead_children);
        }
        write_unlock_irq(&tasklist_lock);
        BUG_ON(!list_empty(&father->children));
  
-       ptrace_exit_finish(father, &ptrace_dead);
+       list_for_each_entry_safe(p, n, &dead_children, sibling) {
+               list_del_init(&p->sibling);
+               release_task(p);
+       }
  }
  
  /*
@@@ -1096,6 -980,10 +983,6 @@@ NORET_TYPE void do_exit(long code
        tsk->mempolicy = NULL;
  #endif
  #ifdef CONFIG_FUTEX
 -      /*
 -       * This must happen late, after the PID is not
 -       * hashed anymore:
 -       */
        if (unlikely(!list_empty(&tsk->pi_state_list)))
                exit_pi_state_list(tsk);
        if (unlikely(current->pi_state_cache))
@@@ -1362,12 -1250,6 +1249,12 @@@ static int wait_task_zombie(struct task
         */
        read_unlock(&tasklist_lock);
  
 +      /*
 +       * Flush inherited counters to the parent - before the parent
 +       * gets woken up by child-exit notifications.
 +       */
 +      perf_counter_exit_task(p);
 +
        retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
        status = (p->signal->flags & SIGNAL_GROUP_EXIT)
                ? p->signal->group_exit_code : p->exit_code;
        return retval;
  }
  
+ static int *task_stopped_code(struct task_struct *p, bool ptrace)
+ {
+       if (ptrace) {
+               if (task_is_stopped_or_traced(p))
+                       return &p->exit_code;
+       } else {
+               if (p->signal->flags & SIGNAL_STOP_STOPPED)
+                       return &p->signal->group_exit_code;
+       }
+       return NULL;
+ }
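
task_stopped_code() above lets the caller both test whether a stop code applies and, via the
returned pointer, clear it in place later (wait_task_stopped() does exactly that unless
WNOWAIT was passed). A toy user-space version of the same "return a pointer to the field or
NULL" idiom, with made-up task state rather than real signal bookkeeping:

#include <stdbool.h>
#include <stdio.h>

struct toy_task {
        bool stopped;
        bool traced;
        bool group_stopped;
        int  exit_code;
        int  group_exit_code;
};

/* Return a pointer to whichever stop code applies, or NULL if none does. */
static int *toy_stopped_code(struct toy_task *t, bool ptrace)
{
        if (ptrace) {
                if (t->stopped || t->traced)
                        return &t->exit_code;
        } else {
                if (t->group_stopped)
                        return &t->group_exit_code;
        }
        return NULL;
}

int main(void)
{
        struct toy_task t = { .traced = true, .exit_code = 19 };
        int *code = toy_stopped_code(&t, true);

        if (code) {
                printf("stop code %d\n", *code);
                *code = 0;      /* consume it, the way wait_task_stopped() clears *p_code */
        }
        return 0;
}
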
  /*
   * Handle sys_wait4 work for one task in state TASK_STOPPED.  We hold
   * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
@@@ -1432,7 -1326,7 +1331,7 @@@ static int wait_task_stopped(int ptrace
                             int options, struct siginfo __user *infop,
                             int __user *stat_addr, struct rusage __user *ru)
  {
-       int retval, exit_code, why;
+       int retval, exit_code, *p_code, why;
        uid_t uid = 0; /* unneeded, required by compiler */
        pid_t pid;
  
        exit_code = 0;
        spin_lock_irq(&p->sighand->siglock);
  
-       if (unlikely(!task_is_stopped_or_traced(p)))
-               goto unlock_sig;
-       if (!ptrace && p->signal->group_stop_count > 0)
-               /*
-                * A group stop is in progress and this is the group leader.
-                * We won't report until all threads have stopped.
-                */
+       p_code = task_stopped_code(p, ptrace);
+       if (unlikely(!p_code))
                goto unlock_sig;
  
-       exit_code = p->exit_code;
+       exit_code = *p_code;
        if (!exit_code)
                goto unlock_sig;
  
        if (!unlikely(options & WNOWAIT))
-               p->exit_code = 0;
+               *p_code = 0;
  
        /* don't need the RCU readlock here as we're holding a spinlock */
        uid = __task_cred(p)->uid;
@@@ -1613,7 -1501,7 +1506,7 @@@ static int wait_consider_task(struct ta
         */
        *notask_error = 0;
  
-       if (task_is_stopped_or_traced(p))
+       if (task_stopped_code(p, ptrace))
                return wait_task_stopped(ptrace, p, options,
                                         infop, stat_addr, ru);
  
@@@ -1817,7 -1705,7 +1710,7 @@@ SYSCALL_DEFINE4(wait4, pid_t, upid, in
                pid = find_get_pid(-upid);
        } else if (upid == 0) {
                type = PIDTYPE_PGID;
-               pid = get_pid(task_pgrp(current));
+               pid = get_task_pid(current, PIDTYPE_PGID);
        } else /* upid > 0 */ {
                type = PIDTYPE_PID;
                pid = find_get_pid(upid);
diff --combined kernel/fork.c
@@@ -60,6 -60,7 +60,7 @@@
  #include <linux/tty.h>
  #include <linux/proc_fs.h>
  #include <linux/blkdev.h>
+ #include <linux/fs_struct.h>
  #include <trace/sched.h>
  #include <linux/magic.h>
  
@@@ -284,7 -285,7 +285,7 @@@ static int dup_mmap(struct mm_struct *m
        mm->free_area_cache = oldmm->mmap_base;
        mm->cached_hole_size = ~0UL;
        mm->map_count = 0;
-       cpus_clear(mm->cpu_vm_mask);
+       cpumask_clear(mm_cpumask(mm));
        mm->mm_rb = RB_ROOT;
        rb_link = &mm->mm_rb.rb_node;
        rb_parent = NULL;
@@@ -681,38 -682,21 +682,21 @@@ fail_nomem
        return retval;
  }
  
- static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
- {
-       struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
-       /* We don't need to lock fs - think why ;-) */
-       if (fs) {
-               atomic_set(&fs->count, 1);
-               rwlock_init(&fs->lock);
-               fs->umask = old->umask;
-               read_lock(&old->lock);
-               fs->root = old->root;
-               path_get(&old->root);
-               fs->pwd = old->pwd;
-               path_get(&old->pwd);
-               read_unlock(&old->lock);
-       }
-       return fs;
- }
- struct fs_struct *copy_fs_struct(struct fs_struct *old)
- {
-       return __copy_fs_struct(old);
- }
- EXPORT_SYMBOL_GPL(copy_fs_struct);
  static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
  {
+       struct fs_struct *fs = current->fs;
        if (clone_flags & CLONE_FS) {
-               atomic_inc(&current->fs->count);
+               /* tsk->fs is already what we want */
+               write_lock(&fs->lock);
+               if (fs->in_exec) {
+                       write_unlock(&fs->lock);
+                       return -EAGAIN;
+               }
+               fs->users++;
+               write_unlock(&fs->lock);
                return 0;
        }
-       tsk->fs = __copy_fs_struct(current->fs);
+       tsk->fs = copy_fs_struct(fs);
        if (!tsk->fs)
                return -ENOMEM;
        return 0;
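
copy_fs() above replaces the old atomic refcount with a plain users counter taken under
fs->lock, and refuses to share the fs_struct while fs->in_exec is set. A user-space sketch
of that sharing rule, with a toy struct and a pthread rwlock standing in for fs->lock:

#include <pthread.h>
#include <stdio.h>
#include <errno.h>

struct fs_struct {
        pthread_rwlock_t lock;
        int users;
        int in_exec;
};

static int share_fs(struct fs_struct *fs)
{
        int ret = 0;

        pthread_rwlock_wrlock(&fs->lock);
        if (fs->in_exec)
                ret = -EAGAIN;          /* mirrors the -EAGAIN case above */
        else
                fs->users++;
        pthread_rwlock_unlock(&fs->lock);
        return ret;
}

int main(void)
{
        struct fs_struct fs = { PTHREAD_RWLOCK_INITIALIZER, 1, 0 };

        printf("share: %d, users now %d\n", share_fs(&fs), fs.users);
        fs.in_exec = 1;
        printf("share during exec: %d\n", share_fs(&fs));
        return 0;
}
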
@@@ -841,6 -825,8 +825,8 @@@ static int copy_signal(unsigned long cl
        atomic_set(&sig->live, 1);
        init_waitqueue_head(&sig->wait_chldexit);
        sig->flags = 0;
+       if (clone_flags & CLONE_NEWPID)
+               sig->flags |= SIGNAL_UNKILLABLE;
        sig->group_exit_code = 0;
        sig->group_exit_task = NULL;
        sig->group_stop_count = 0;
@@@ -989,7 -975,6 +975,7 @@@ static struct task_struct *copy_process
                goto fork_out;
  
        rt_mutex_init_task(p);
 +      perf_counter_init_task(p);
  
  #ifdef CONFIG_PROVE_LOCKING
        DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
                goto bad_fork_cleanup_mm;
        if ((retval = copy_io(clone_flags, p)))
                goto bad_fork_cleanup_namespaces;
-       retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
+       retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
        if (retval)
                goto bad_fork_cleanup_io;
  
  #endif
        clear_all_latency_tracing(p);
  
-       /* Our parent execution domain becomes current domain
-          These must match for thread signalling to apply */
-       p->parent_exec_id = p->self_exec_id;
        /* ok, now we should be set up.. */
        p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
        p->pdeath_signal = 0;
                set_task_cpu(p, smp_processor_id());
  
        /* CLONE_PARENT re-uses the old parent */
-       if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
+       if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
                p->real_parent = current->real_parent;
-       else
+               p->parent_exec_id = current->parent_exec_id;
+       } else {
                p->real_parent = current;
+               p->parent_exec_id = current->self_exec_id;
+       }
  
        spin_lock(&current->sighand->siglock);
  
                        p->signal->leader_pid = pid;
                        tty_kref_put(p->signal->tty);
                        p->signal->tty = tty_kref_get(current->signal->tty);
-                       set_task_pgrp(p, task_pgrp_nr(current));
-                       set_task_session(p, task_session_nr(current));
                        attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
                        attach_pid(p, PIDTYPE_SID, task_session(current));
                        list_add_tail_rcu(&p->tasks, &init_task.tasks);
@@@ -1490,6 -1472,7 +1473,7 @@@ void __init proc_caches_init(void
        mm_cachep = kmem_cache_create("mm_struct",
                        sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+       vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
        mmap_init();
  }
  
@@@ -1545,12 -1528,16 +1529,16 @@@ static int unshare_fs(unsigned long uns
  {
        struct fs_struct *fs = current->fs;
  
-       if ((unshare_flags & CLONE_FS) &&
-           (fs && atomic_read(&fs->count) > 1)) {
-               *new_fsp = __copy_fs_struct(current->fs);
-               if (!*new_fsp)
-                       return -ENOMEM;
-       }
+       if (!(unshare_flags & CLONE_FS) || !fs)
+               return 0;
+       /* don't need lock here; in the worst case we'll do useless copy */
+       if (fs->users == 1)
+               return 0;
+       *new_fsp = copy_fs_struct(fs);
+       if (!*new_fsp)
+               return -ENOMEM;
  
        return 0;
  }
@@@ -1666,8 -1653,13 +1654,13 @@@ SYSCALL_DEFINE1(unshare, unsigned long
  
                if (new_fs) {
                        fs = current->fs;
+                       write_lock(&fs->lock);
                        current->fs = new_fs;
-                       new_fs = fs;
+                       if (--fs->users)
+                               new_fs = NULL;
+                       else
+                               new_fs = fs;
+                       write_unlock(&fs->lock);
                }
  
                if (new_mm) {
@@@ -1706,7 -1698,7 +1699,7 @@@ bad_unshare_cleanup_sigh
  
  bad_unshare_cleanup_fs:
        if (new_fs)
-               put_fs_struct(new_fs);
+               free_fs_struct(new_fs);
  
  bad_unshare_cleanup_thread:
  bad_unshare_out:
diff --combined kernel/sched.c
@@@ -331,6 -331,13 +331,13 @@@ static DEFINE_PER_CPU(struct rt_rq, ini
   */
  static DEFINE_SPINLOCK(task_group_lock);
  
+ #ifdef CONFIG_SMP
+ static int root_task_group_empty(void)
+ {
+       return list_empty(&root_task_group.children);
+ }
+ #endif
  #ifdef CONFIG_FAIR_GROUP_SCHED
  #ifdef CONFIG_USER_SCHED
  # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
@@@ -391,6 -398,13 +398,13 @@@ static inline void set_task_rq(struct t
  
  #else
  
+ #ifdef CONFIG_SMP
+ static int root_task_group_empty(void)
+ {
+       return 1;
+ }
+ #endif
  static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
  static inline struct task_group *task_group(struct task_struct *p)
  {
@@@ -467,11 -481,17 +481,17 @@@ struct rt_rq 
        struct rt_prio_array active;
        unsigned long rt_nr_running;
  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-       int highest_prio; /* highest queued rt task prio */
+       struct {
+               int curr; /* highest queued rt task prio */
+ #ifdef CONFIG_SMP
+               int next; /* next highest */
+ #endif
+       } highest_prio;
  #endif
  #ifdef CONFIG_SMP
        unsigned long rt_nr_migratory;
        int overloaded;
+       struct plist_head pushable_tasks;
  #endif
        int rt_throttled;
        u64 rt_time;
@@@ -549,7 -569,6 +569,6 @@@ struct rq 
        unsigned long nr_running;
        #define CPU_LOAD_IDX_MAX 5
        unsigned long cpu_load[CPU_LOAD_IDX_MAX];
-       unsigned char idle_at_tick;
  #ifdef CONFIG_NO_HZ
        unsigned long last_tick_seen;
        unsigned char in_nohz_recently;
        struct load_weight load;
        unsigned long nr_load_updates;
        u64 nr_switches;
 +      u64 nr_migrations_in;
  
        struct cfs_rq cfs;
        struct rt_rq rt;
        struct root_domain *rd;
        struct sched_domain *sd;
  
+       unsigned char idle_at_tick;
        /* For active balancing */
        int active_balance;
        int push_cpu;
        /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
  
        /* sys_sched_yield() stats */
-       unsigned int yld_exp_empty;
-       unsigned int yld_act_empty;
-       unsigned int yld_both_empty;
        unsigned int yld_count;
  
        /* schedule() stats */
@@@ -669,7 -685,7 +686,7 @@@ static inline int cpu_of(struct rq *rq
  #define task_rq(p)            cpu_rq(task_cpu(p))
  #define cpu_curr(cpu)         (cpu_rq(cpu)->curr)
  
 -static inline void update_rq_clock(struct rq *rq)
 +inline void update_rq_clock(struct rq *rq)
  {
        rq->clock = sched_clock_cpu(cpu_of(rq));
  }
@@@ -980,26 -996,6 +997,26 @@@ static struct rq *task_rq_lock(struct t
        }
  }
  
 +void curr_rq_lock_irq_save(unsigned long *flags)
 +      __acquires(rq->lock)
 +{
 +      struct rq *rq;
 +
 +      local_irq_save(*flags);
 +      rq = cpu_rq(smp_processor_id());
 +      spin_lock(&rq->lock);
 +}
 +
 +void curr_rq_unlock_irq_restore(unsigned long *flags)
 +      __releases(rq->lock)
 +{
 +      struct rq *rq;
 +
 +      rq = cpu_rq(smp_processor_id());
 +      spin_unlock(&rq->lock);
 +      local_irq_restore(*flags);
 +}
 +
  void task_rq_unlock_wait(struct task_struct *p)
  {
        struct rq *rq = task_rq(p);
@@@ -1114,7 -1110,7 +1131,7 @@@ static void hrtick_start(struct rq *rq
        if (rq == this_rq()) {
                hrtimer_restart(timer);
        } else if (!rq->hrtick_csd_pending) {
-               __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
+               __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
                rq->hrtick_csd_pending = 1;
        }
  }
@@@ -1204,10 -1200,10 +1221,10 @@@ static void resched_task(struct task_st
  
        assert_spin_locked(&task_rq(p)->lock);
  
-       if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
+       if (test_tsk_need_resched(p))
                return;
  
-       set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+       set_tsk_need_resched(p);
  
        cpu = task_cpu(p);
        if (cpu == smp_processor_id())
@@@ -1263,7 -1259,7 +1280,7 @@@ void wake_up_idle_cpu(int cpu
         * lockless. The worst case is that the other CPU runs the
         * idle task through an additional NOOP schedule()
         */
-       set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
+       set_tsk_need_resched(rq->idle);
  
        /* NEED_RESCHED must be visible before we test polling */
        smp_mb();
@@@ -1631,21 -1627,42 +1648,42 @@@ static inline void update_shares_locked
  
  #endif
  
+ #ifdef CONFIG_PREEMPT
  /*
-  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+  * fair double_lock_balance: Safely acquires both rq->locks in a fair
+  * way at the expense of forcing extra atomic operations in all
+  * invocations.  This assures that the double_lock is acquired using the
+  * same underlying policy as the spinlock_t on this architecture, which
+  * reduces latency compared to the unfair variant below.  However, it
+  * also adds more overhead and therefore may reduce throughput.
   */
- static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+       __releases(this_rq->lock)
+       __acquires(busiest->lock)
+       __acquires(this_rq->lock)
+ {
+       spin_unlock(&this_rq->lock);
+       double_rq_lock(this_rq, busiest);
+       return 1;
+ }
+ #else
+ /*
+  * Unfair double_lock_balance: Optimizes throughput at the expense of
+  * latency by eliminating extra atomic operations when the locks are
+  * already in proper order on entry.  This favors lower cpu-ids and will
+  * grant the double lock to lower cpus over higher ids under contention,
+  * regardless of entry order into the function.
+  */
+ static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
        __releases(this_rq->lock)
        __acquires(busiest->lock)
        __acquires(this_rq->lock)
  {
        int ret = 0;
  
-       if (unlikely(!irqs_disabled())) {
-               /* printk() doesn't work good under rq->lock */
-               spin_unlock(&this_rq->lock);
-               BUG_ON(1);
-       }
        if (unlikely(!spin_trylock(&busiest->lock))) {
                if (busiest < this_rq) {
                        spin_unlock(&this_rq->lock);
        return ret;
  }
  
+ #endif /* CONFIG_PREEMPT */
+ /*
+  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+  */
+ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ {
+       if (unlikely(!irqs_disabled())) {
+               /* printk() doesn't work well under rq->lock */
+               spin_unlock(&this_rq->lock);
+               BUG_ON(1);
+       }
+       return _double_lock_balance(this_rq, busiest);
+ }
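
The unfair _double_lock_balance() above avoids an ABBA deadlock by falling back to a fixed
lock order (the runqueue with the lower address first) whenever the trylock fails, and
returns 1 so the caller knows this_rq->lock was dropped and the runqueue state must be
revalidated. A user-space sketch of that ordering rule, with pthread mutexes standing in
for rq->lock:

#include <pthread.h>
#include <stdio.h>

struct toy_rq { pthread_mutex_t lock; int nr_running; };

/* Caller already holds this_rq->lock and wants busiest->lock as well. */
static int toy_double_lock(struct toy_rq *this_rq, struct toy_rq *busiest)
{
        int ret = 0;

        if (pthread_mutex_trylock(&busiest->lock) != 0) {
                if (busiest < this_rq) {
                        /* wrong order: drop ours, then take both lowest-address first */
                        pthread_mutex_unlock(&this_rq->lock);
                        pthread_mutex_lock(&busiest->lock);
                        pthread_mutex_lock(&this_rq->lock);
                        ret = 1;        /* this_rq was dropped; caller must recheck */
                } else {
                        pthread_mutex_lock(&busiest->lock);
                }
        }
        return ret;
}

int main(void)
{
        struct toy_rq a = { PTHREAD_MUTEX_INITIALIZER, 0 };
        struct toy_rq b = { PTHREAD_MUTEX_INITIALIZER, 0 };

        pthread_mutex_lock(&a.lock);
        /* uncontended here, so the trylock succeeds and nothing is dropped */
        printf("this_rq dropped and reacquired: %d\n", toy_double_lock(&a, &b));
        pthread_mutex_unlock(&b.lock);
        pthread_mutex_unlock(&a.lock);
        return 0;
}
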
  static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
        __releases(busiest->lock)
  {
@@@ -1726,6 -1759,9 +1780,9 @@@ static void update_avg(u64 *avg, u64 sa
  
  static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
  {
+       if (wakeup)
+               p->se.start_runtime = p->se.sum_exec_runtime;
        sched_info_queued(p);
        p->sched_class->enqueue_task(rq, p, wakeup);
        p->se.on_rq = 1;
  
  static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
  {
-       if (sleep && p->se.last_wakeup) {
-               update_avg(&p->se.avg_overlap,
-                          p->se.sum_exec_runtime - p->se.last_wakeup);
-               p->se.last_wakeup = 0;
+       if (sleep) {
+               if (p->se.last_wakeup) {
+                       update_avg(&p->se.avg_overlap,
+                               p->se.sum_exec_runtime - p->se.last_wakeup);
+                       p->se.last_wakeup = 0;
+               } else {
+                       update_avg(&p->se.avg_wakeup,
+                               sysctl_sched_wakeup_granularity);
+               }
        }
  
        sched_info_dequeued(p);
@@@ -1906,15 -1947,12 +1968,15 @@@ void set_task_cpu(struct task_struct *p
                p->se.sleep_start -= clock_offset;
        if (p->se.block_start)
                p->se.block_start -= clock_offset;
 +#endif
        if (old_cpu != new_cpu) {
 -              schedstat_inc(p, se.nr_migrations);
 +              p->se.nr_migrations++;
 +              new_rq->nr_migrations_in++;
 +#ifdef CONFIG_SCHEDSTATS
                if (task_hot(p, old_rq->clock, NULL))
                        schedstat_inc(p, se.nr_forced2_migrations);
 -      }
  #endif
 +      }
        p->se.vruntime -= old_cfsrq->min_vruntime -
                                         new_cfsrq->min_vruntime;
  
@@@ -2041,7 -2079,7 +2103,7 @@@ unsigned long wait_task_inactive(struc
                 * it must be off the runqueue _entirely_, and not
                 * preempted!
                 *
-                * So if it wa still runnable (but just not actively
+                * So if it was still runnable (but just not actively
                 * running right now), it's preempted, and we should
                 * yield - it could be a while.
                 */
@@@ -2266,27 -2304,6 +2328,27 @@@ static int sched_balance_self(int cpu, 
  
  #endif /* CONFIG_SMP */
  
 +/**
 + * task_oncpu_function_call - call a function on the cpu on which a task runs
 + * @p:                the task to evaluate
 + * @func:     the function to be called
 + * @info:     the function call argument
 + *
 + * Calls the function @func when the task is currently running. This might
 + * be on the current CPU, in which case the function is called directly.
 + */
 +void task_oncpu_function_call(struct task_struct *p,
 +                            void (*func) (void *info), void *info)
 +{
 +      int cpu;
 +
 +      preempt_disable();
 +      cpu = task_cpu(p);
 +      if (task_curr(p))
 +              smp_call_function_single(cpu, func, info, 1);
 +      preempt_enable();
 +}
 +
  /***
   * try_to_wake_up - wake up a thread
   * @p: the to-be-woken-up thread
@@@ -2312,7 -2329,7 +2374,7 @@@ static int try_to_wake_up(struct task_s
                sync = 0;
  
  #ifdef CONFIG_SMP
-       if (sched_feat(LB_WAKEUP_UPDATE)) {
+       if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
                struct sched_domain *sd;
  
                this_cpu = raw_smp_processor_id();
@@@ -2390,6 -2407,22 +2452,22 @@@ out_activate
        activate_task(rq, p, 1);
        success = 1;
  
+       /*
+        * Only attribute actual wakeups done by this task.
+        */
+       if (!in_interrupt()) {
+               struct sched_entity *se = &current->se;
+               u64 sample = se->sum_exec_runtime;
+               if (se->last_wakeup)
+                       sample -= se->last_wakeup;
+               else
+                       sample -= se->start_runtime;
+               update_avg(&se->avg_wakeup, sample);
+               se->last_wakeup = se->sum_exec_runtime;
+       }
  out_running:
        trace_sched_wakeup(rq, p, success);
        check_preempt_curr(rq, p, sync);
                p->sched_class->task_wake_up(rq, p);
  #endif
  out:
-       current->se.last_wakeup = current->se.sum_exec_runtime;
        task_rq_unlock(rq, &flags);
  
        return success;
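
The wakeup path above samples how much runtime has passed since current's previous wakeup
(falling back to start_runtime for the first one) and feeds that into update_avg(). The
small sketch below replays that accounting; it assumes update_avg() keeps a simple
1/8-weight running average, since its body is not part of this hunk:

#include <stdint.h>
#include <stdio.h>

static void toy_update_avg(uint64_t *avg, uint64_t sample)
{
        int64_t diff = (int64_t)(sample - *avg);

        *avg += diff / 8;               /* new = old + (sample - old) / 8 */
}

int main(void)
{
        uint64_t avg_wakeup = 0;
        uint64_t sum_exec_runtime = 0, last_wakeup = 0;
        uint64_t runs[] = { 1000, 1200, 800, 5000 };    /* ns run between wakeups */

        for (unsigned i = 0; i < sizeof(runs) / sizeof(runs[0]); i++) {
                sum_exec_runtime += runs[i];
                /* sample = runtime accumulated since the previous wakeup we issued */
                toy_update_avg(&avg_wakeup, sum_exec_runtime - last_wakeup);
                last_wakeup = sum_exec_runtime;
                printf("avg_wakeup = %llu\n", (unsigned long long)avg_wakeup);
        }
        return 0;
}
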
@@@ -2429,9 -2460,10 +2505,11 @@@ static void __sched_fork(struct task_st
        p->se.exec_start                = 0;
        p->se.sum_exec_runtime          = 0;
        p->se.prev_sum_exec_runtime     = 0;
 +      p->se.nr_migrations             = 0;
        p->se.last_wakeup               = 0;
        p->se.avg_overlap               = 0;
+       p->se.start_runtime             = 0;
+       p->se.avg_wakeup                = sysctl_sched_wakeup_granularity;
  
  #ifdef CONFIG_SCHEDSTATS
        p->se.wait_start                = 0;
@@@ -2494,6 -2526,8 +2572,8 @@@ void sched_fork(struct task_struct *p, 
        /* Want to start with kernel preemption disabled. */
        task_thread_info(p)->preempt_count = 1;
  #endif
+       plist_node_init(&p->pushable_tasks, MAX_PRIO);
        put_cpu();
  }
  
@@@ -2537,7 -2571,7 +2617,7 @@@ void wake_up_new_task(struct task_struc
  #ifdef CONFIG_PREEMPT_NOTIFIERS
  
  /**
-  * preempt_notifier_register - tell me when current is being being preempted & rescheduled
+  * preempt_notifier_register - tell me when current is being preempted & rescheduled
   * @notifier: notifier struct to register
   */
  void preempt_notifier_register(struct preempt_notifier *notifier)
@@@ -2634,6 -2668,12 +2714,12 @@@ static void finish_task_switch(struct r
  {
        struct mm_struct *mm = rq->prev_mm;
        long prev_state;
+ #ifdef CONFIG_SMP
+       int post_schedule = 0;
+       if (current->sched_class->needs_post_schedule)
+               post_schedule = current->sched_class->needs_post_schedule(rq);
+ #endif
  
        rq->prev_mm = NULL;
  
         */
        prev_state = prev->state;
        finish_arch_switch(prev);
 +      perf_counter_task_sched_in(current, cpu_of(rq));
        finish_lock_switch(rq, prev);
  #ifdef CONFIG_SMP
-       if (current->sched_class->post_schedule)
+       if (post_schedule)
                current->sched_class->post_schedule(rq);
  #endif
  
@@@ -2813,21 -2852,6 +2899,21 @@@ unsigned long nr_active(void
  }
  
  /*
 + * Externally visible per-cpu scheduler statistics:
 + * cpu_nr_switches(cpu) - number of context switches on that cpu
 + * cpu_nr_migrations(cpu) - number of migrations into that cpu
 + */
 +u64 cpu_nr_switches(int cpu)
 +{
 +      return cpu_rq(cpu)->nr_switches;
 +}
 +
 +u64 cpu_nr_migrations(int cpu)
 +{
 +      return cpu_rq(cpu)->nr_migrations_in;
 +}
 +
 +/*
   * Update rq->cpu_load[] statistics. This function is usually called every
   * scheduler tick (TICK_NSEC).
   */
@@@ -2975,6 -2999,7 +3061,7 @@@ int can_migrate_task(struct task_struc
                     struct sched_domain *sd, enum cpu_idle_type idle,
                     int *all_pinned)
  {
+       int tsk_cache_hot = 0;
        /*
         * We do not migrate tasks that are:
         * 1) running (obviously), or
         * 2) too many balance attempts have failed.
         */
  
-       if (!task_hot(p, rq->clock, sd) ||
-                       sd->nr_balance_failed > sd->cache_nice_tries) {
+       tsk_cache_hot = task_hot(p, rq->clock, sd);
+       if (!tsk_cache_hot ||
+               sd->nr_balance_failed > sd->cache_nice_tries) {
  #ifdef CONFIG_SCHEDSTATS
-               if (task_hot(p, rq->clock, sd)) {
+               if (tsk_cache_hot) {
                        schedstat_inc(sd, lb_hot_gained[idle]);
                        schedstat_inc(p, se.nr_forced_migrations);
                }
                return 1;
        }
  
-       if (task_hot(p, rq->clock, sd)) {
+       if (tsk_cache_hot) {
                schedstat_inc(p, se.nr_failed_migrations_hot);
                return 0;
        }
@@@ -3049,6 -3075,16 +3137,16 @@@ next
        pulled++;
        rem_load_move -= p->se.load.weight;
  
+ #ifdef CONFIG_PREEMPT
+       /*
+        * NEWIDLE balancing is a source of latency, so preemptible kernels
+        * will stop after the first task is pulled to minimize the critical
+        * section.
+        */
+       if (idle == CPU_NEWLY_IDLE)
+               goto out;
+ #endif
        /*
         * We only want to steal up to the prescribed amount of weighted load.
         */
@@@ -3095,9 -3131,15 +3193,15 @@@ static int move_tasks(struct rq *this_r
                                sd, idle, all_pinned, &this_best_prio);
                class = class->next;
  
+ #ifdef CONFIG_PREEMPT
+               /*
+                * NEWIDLE balancing is a source of latency, so preemptible
+                * kernels will stop after the first task is pulled to minimize
+                * the critical section.
+                */
                if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
                        break;
+ #endif
        } while (class && max_load_move > total_load_moved);
  
        return total_load_moved > 0;
@@@ -3147,246 -3189,480 +3251,480 @@@ static int move_one_task(struct rq *thi
  
        return 0;
  }
+ /********** Helpers for find_busiest_group ************************/
  /*
-  * find_busiest_group finds and returns the busiest CPU group within the
-  * domain. It calculates and returns the amount of weighted load which
-  * should be moved to restore balance via the imbalance parameter.
+  * sd_lb_stats - Structure to store the statistics of a sched_domain
+  *            during load balancing.
   */
- static struct sched_group *
- find_busiest_group(struct sched_domain *sd, int this_cpu,
-                  unsigned long *imbalance, enum cpu_idle_type idle,
-                  int *sd_idle, const struct cpumask *cpus, int *balance)
- {
-       struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
-       unsigned long max_load, avg_load, total_load, this_load, total_pwr;
-       unsigned long max_pull;
-       unsigned long busiest_load_per_task, busiest_nr_running;
-       unsigned long this_load_per_task, this_nr_running;
-       int load_idx, group_imb = 0;
+ struct sd_lb_stats {
+       struct sched_group *busiest; /* Busiest group in this sd */
+       struct sched_group *this;  /* Local group in this sd */
+       unsigned long total_load;  /* Total load of all groups in sd */
+       unsigned long total_pwr;   /*   Total power of all groups in sd */
+       unsigned long avg_load;    /* Average load across all groups in sd */
+       /** Statistics of this group */
+       unsigned long this_load;
+       unsigned long this_load_per_task;
+       unsigned long this_nr_running;
+       /* Statistics of the busiest group */
+       unsigned long max_load;
+       unsigned long busiest_load_per_task;
+       unsigned long busiest_nr_running;
+       int group_imb; /* Is there imbalance in this sd */
  #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-       int power_savings_balance = 1;
-       unsigned long leader_nr_running = 0, min_load_per_task = 0;
-       unsigned long min_nr_running = ULONG_MAX;
-       struct sched_group *group_min = NULL, *group_leader = NULL;
+       int power_savings_balance; /* Is powersave balance needed for this sd */
+       struct sched_group *group_min; /* Least loaded group in sd */
+       struct sched_group *group_leader; /* Group which relieves group_min */
+       unsigned long min_load_per_task; /* load_per_task in group_min */
+       unsigned long leader_nr_running; /* Nr running of group_leader */
+       unsigned long min_nr_running; /* Nr running of group_min */
  #endif
+ };
  
-       max_load = this_load = total_load = total_pwr = 0;
-       busiest_load_per_task = busiest_nr_running = 0;
-       this_load_per_task = this_nr_running = 0;
+ /*
+  * sg_lb_stats - stats of a sched_group required for load_balancing
+  */
+ struct sg_lb_stats {
+       unsigned long avg_load; /* Avg load across the CPUs of the group */
+       unsigned long group_load; /* Total load over the CPUs of the group */
+       unsigned long sum_nr_running; /* Nr tasks running in the group */
+       unsigned long sum_weighted_load; /* Weighted load of group's tasks */
+       unsigned long group_capacity;
+       int group_imb; /* Is there an imbalance in the group ? */
+ };
+ /**
+  * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
+  * @group: The group whose first cpu is to be returned.
+  */
+ static inline unsigned int group_first_cpu(struct sched_group *group)
+ {
+       return cpumask_first(sched_group_cpus(group));
+ }
+ /**
+  * get_sd_load_idx - Obtain the load index for a given sched domain.
+  * @sd: The sched_domain whose load_idx is to be obtained.
+  * @idle: The idle status of the CPU for which the sd load_idx is obtained.
+  */
+ static inline int get_sd_load_idx(struct sched_domain *sd,
+                                       enum cpu_idle_type idle)
+ {
+       int load_idx;
  
-       if (idle == CPU_NOT_IDLE)
+       switch (idle) {
+       case CPU_NOT_IDLE:
                load_idx = sd->busy_idx;
-       else if (idle == CPU_NEWLY_IDLE)
+               break;
+       case CPU_NEWLY_IDLE:
                load_idx = sd->newidle_idx;
-       else
+               break;
+       default:
                load_idx = sd->idle_idx;
+               break;
+       }
  
-       do {
-               unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
-               int local_group;
-               int i;
-               int __group_imb = 0;
-               unsigned int balance_cpu = -1, first_idle_cpu = 0;
-               unsigned long sum_nr_running, sum_weighted_load;
-               unsigned long sum_avg_load_per_task;
-               unsigned long avg_load_per_task;
+       return load_idx;
+ }
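
The rewrite that begins here splits find_busiest_group() into small helpers that first fill
sd_lb_stats/sg_lb_stats and only then decide where to pull load from. A toy user-space
illustration of that shape follows; the group layout and load numbers are made up, and the
per-CPU power scaling done by the real code is deliberately omitted:

#include <stdio.h>

struct sg_stats { unsigned long group_load; unsigned long avg_load; int nr_cpus; };

int main(void)
{
        unsigned long cpu_load[2][4] = { { 10, 40, 0, 0 }, { 90, 75, 80, 0 } };
        int cpus_per_group[2] = { 2, 3 };
        struct sg_stats sgs[2];
        unsigned long max_load = 0;
        int busiest = -1;

        /* pass 1: gather per-group statistics into the stats structs */
        for (int g = 0; g < 2; g++) {
                sgs[g].group_load = 0;
                sgs[g].nr_cpus = cpus_per_group[g];
                for (int c = 0; c < sgs[g].nr_cpus; c++)
                        sgs[g].group_load += cpu_load[g][c];
                sgs[g].avg_load = sgs[g].group_load / sgs[g].nr_cpus;

                /* pass 2 (folded in here): remember the busiest group so far */
                if (sgs[g].avg_load > max_load) {
                        max_load = sgs[g].avg_load;
                        busiest = g;
                }
        }
        printf("busiest group: %d (avg load %lu)\n", busiest, max_load);
        return 0;
}
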
  
-               local_group = cpumask_test_cpu(this_cpu,
-                                              sched_group_cpus(group));
  
-               if (local_group)
-                       balance_cpu = cpumask_first(sched_group_cpus(group));
+ #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+ /**
+  * init_sd_power_savings_stats - Initialize power savings statistics for
+  * the given sched_domain, during load balancing.
+  *
+  * @sd: Sched domain whose power-savings statistics are to be initialized.
+  * @sds: Variable containing the statistics for sd.
+  * @idle: Idle status of the CPU at which we're performing load-balancing.
+  */
+ static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+       struct sd_lb_stats *sds, enum cpu_idle_type idle)
+ {
+       /*
+        * Busy processors will not participate in power savings
+        * balance.
+        */
+       if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+               sds->power_savings_balance = 0;
+       else {
+               sds->power_savings_balance = 1;
+               sds->min_nr_running = ULONG_MAX;
+               sds->leader_nr_running = 0;
+       }
+ }
  
-               /* Tally up the load of all CPUs in the group */
-               sum_weighted_load = sum_nr_running = avg_load = 0;
-               sum_avg_load_per_task = avg_load_per_task = 0;
+ /**
+  * update_sd_power_savings_stats - Update the power saving stats for a
+  * sched_domain while performing load balancing.
+  *
+  * @group: sched_group belonging to the sched_domain under consideration.
+  * @sds: Variable containing the statistics of the sched_domain
+  * @local_group: Does group contain the CPU for which we're performing
+  *            load balancing?
+  * @sgs: Variable containing the statistics of the group.
+  */
+ static inline void update_sd_power_savings_stats(struct sched_group *group,
+       struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+ {
  
-               max_cpu_load = 0;
-               min_cpu_load = ~0UL;
+       if (!sds->power_savings_balance)
+               return;
  
-               for_each_cpu_and(i, sched_group_cpus(group), cpus) {
-                       struct rq *rq = cpu_rq(i);
+       /*
+        * If the local group is idle or completely loaded
+        * no need to do power savings balance at this domain
+        */
+       if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
+                               !sds->this_nr_running))
+               sds->power_savings_balance = 0;
  
-                       if (*sd_idle && rq->nr_running)
-                               *sd_idle = 0;
+       /*
+        * If a group is already running at full capacity or idle,
+        * don't include that group in power savings calculations
+        */
+       if (!sds->power_savings_balance ||
+               sgs->sum_nr_running >= sgs->group_capacity ||
+               !sgs->sum_nr_running)
+               return;
  
-                       /* Bias balancing toward cpus of our domain */
-                       if (local_group) {
-                               if (idle_cpu(i) && !first_idle_cpu) {
-                                       first_idle_cpu = 1;
-                                       balance_cpu = i;
-                               }
+       /*
+        * Calculate the group which has the least non-idle load.
+        * This is the group from which we need to pick up the load
+        * for saving power.
+        */
+       if ((sgs->sum_nr_running < sds->min_nr_running) ||
+           (sgs->sum_nr_running == sds->min_nr_running &&
+            group_first_cpu(group) > group_first_cpu(sds->group_min))) {
+               sds->group_min = group;
+               sds->min_nr_running = sgs->sum_nr_running;
+               sds->min_load_per_task = sgs->sum_weighted_load /
+                                               sgs->sum_nr_running;
+       }
  
-                               load = target_load(i, load_idx);
-                       } else {
-                               load = source_load(i, load_idx);
-                               if (load > max_cpu_load)
-                                       max_cpu_load = load;
-                               if (min_cpu_load > load)
-                                       min_cpu_load = load;
-                       }
+       /*
+        * Calculate the group which is nearly at its capacity
+        * but still has room to pick up some load from another
+        * group and thereby save more power.
+        */
+       if (sgs->sum_nr_running > sgs->group_capacity - 1)
+               return;
  
-                       avg_load += load;
-                       sum_nr_running += rq->nr_running;
-                       sum_weighted_load += weighted_cpuload(i);
+       if (sgs->sum_nr_running > sds->leader_nr_running ||
+           (sgs->sum_nr_running == sds->leader_nr_running &&
+            group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
+               sds->group_leader = group;
+               sds->leader_nr_running = sgs->sum_nr_running;
+       }
+ }
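/*
 * Illustrative example (not taken from the patch): with three 4-CPU groups
 * running {5, 1, 2} tasks, the fully loaded group is skipped, the group
 * running 1 task ends up as group_min (least non-idle load) and the group
 * running 2 tasks as group_leader (closest to, yet still below, capacity).
 * If this_cpu sits in the leader group, check_power_save_busiest_group()
 * below can then report group_min as "busiest" so its lone task gets pulled
 * over and group_min's CPUs may go fully idle.
 */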
  
-                       sum_avg_load_per_task += cpu_avg_load_per_task(i);
-               }
+ /**
+  * check_power_save_busiest_group - see if there is potential for some power-savings balance
+  * @sds: Variable containing the statistics of the sched_domain
+  *    under consideration.
+  * @this_cpu: Cpu at which we're currently performing load-balancing.
+  * @imbalance: Variable to store the imbalance.
+  *
+  * Description:
+  * Check if we have potential to perform some power-savings balance.
+  * If yes, set the busiest group to be the least loaded group in the
+  * sched_domain, so that its CPUs can be put to idle.
+  *
+  * Returns 1 if there is potential to perform power-savings balance.
+  * Else returns 0.
+  */
+ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+                                       int this_cpu, unsigned long *imbalance)
+ {
+       if (!sds->power_savings_balance)
+               return 0;
  
-               /*
-                * First idle cpu or the first cpu(busiest) in this sched group
-                * is eligible for doing load balancing at this and above
-                * domains. In the newly idle case, we will allow all the cpu's
-                * to do the newly idle load balance.
-                */
-               if (idle != CPU_NEWLY_IDLE && local_group &&
-                   balance_cpu != this_cpu && balance) {
-                       *balance = 0;
-                       goto ret;
-               }
+       if (sds->this != sds->group_leader ||
+                       sds->group_leader == sds->group_min)
+               return 0;
  
-               total_load += avg_load;
-               total_pwr += group->__cpu_power;
+       *imbalance = sds->min_load_per_task;
+       sds->busiest = sds->group_min;
  
-               /* Adjust by relative CPU power of the group */
-               avg_load = sg_div_cpu_power(group,
-                               avg_load * SCHED_LOAD_SCALE);
+       if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
+               cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
+                       group_first_cpu(sds->group_leader);
+       }
  
+       return 1;
  
-               /*
-                * Consider the group unbalanced when the imbalance is larger
-                * than the average weight of two tasks.
-                *
-                * APZ: with cgroup the avg task weight can vary wildly and
-                *      might not be a suitable number - should we keep a
-                *      normalized nr_running number somewhere that negates
-                *      the hierarchy?
-                */
-               avg_load_per_task = sg_div_cpu_power(group,
-                               sum_avg_load_per_task * SCHED_LOAD_SCALE);
+ }
+ #else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+ static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+       struct sd_lb_stats *sds, enum cpu_idle_type idle)
+ {
+       return;
+ }
+ static inline void update_sd_power_savings_stats(struct sched_group *group,
+       struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+ {
+       return;
+ }
+ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+                                       int this_cpu, unsigned long *imbalance)
+ {
+       return 0;
+ }
+ #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+ /**
+  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
+  * @group: sched_group whose statistics are to be updated.
+  * @this_cpu: Cpu for which load balance is currently performed.
+  * @idle: Idle status of this_cpu
+  * @load_idx: Load index of sched_domain of this_cpu for load calc.
+  * @sd_idle: Idle status of the sched_domain containing group.
+  * @local_group: Does group contain this_cpu.
+  * @cpus: Set of cpus considered for load balancing.
+  * @balance: Should we balance.
+  * @sgs: variable to hold the statistics for this group.
+  */
+ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
+                       enum cpu_idle_type idle, int load_idx, int *sd_idle,
+                       int local_group, const struct cpumask *cpus,
+                       int *balance, struct sg_lb_stats *sgs)
+ {
+       unsigned long load, max_cpu_load, min_cpu_load;
+       int i;
+       unsigned int balance_cpu = -1, first_idle_cpu = 0;
+       unsigned long sum_avg_load_per_task;
+       unsigned long avg_load_per_task;
+       if (local_group)
+               balance_cpu = group_first_cpu(group);
  
-               if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
-                       __group_imb = 1;
+       /* Tally up the load of all CPUs in the group */
+       sum_avg_load_per_task = avg_load_per_task = 0;
+       max_cpu_load = 0;
+       min_cpu_load = ~0UL;
  
-               group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
+       for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+               struct rq *rq = cpu_rq(i);
  
+               if (*sd_idle && rq->nr_running)
+                       *sd_idle = 0;
+               /* Bias balancing toward cpus of our domain */
                if (local_group) {
-                       this_load = avg_load;
-                       this = group;
-                       this_nr_running = sum_nr_running;
-                       this_load_per_task = sum_weighted_load;
-               } else if (avg_load > max_load &&
-                          (sum_nr_running > group_capacity || __group_imb)) {
-                       max_load = avg_load;
-                       busiest = group;
-                       busiest_nr_running = sum_nr_running;
-                       busiest_load_per_task = sum_weighted_load;
-                       group_imb = __group_imb;
+                       if (idle_cpu(i) && !first_idle_cpu) {
+                               first_idle_cpu = 1;
+                               balance_cpu = i;
+                       }
+                       load = target_load(i, load_idx);
+               } else {
+                       load = source_load(i, load_idx);
+                       if (load > max_cpu_load)
+                               max_cpu_load = load;
+                       if (min_cpu_load > load)
+                               min_cpu_load = load;
                }
  
- #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-               /*
-                * Busy processors will not participate in power savings
-                * balance.
-                */
-               if (idle == CPU_NOT_IDLE ||
-                               !(sd->flags & SD_POWERSAVINGS_BALANCE))
-                       goto group_next;
+               sgs->group_load += load;
+               sgs->sum_nr_running += rq->nr_running;
+               sgs->sum_weighted_load += weighted_cpuload(i);
  
-               /*
-                * If the local group is idle or completely loaded
-                * no need to do power savings balance at this domain
-                */
-               if (local_group && (this_nr_running >= group_capacity ||
-                                   !this_nr_running))
-                       power_savings_balance = 0;
+               sum_avg_load_per_task += cpu_avg_load_per_task(i);
+       }
  
-               /*
-                * If a group is already running at full capacity or idle,
-                * don't include that group in power savings calculations
-                */
-               if (!power_savings_balance || sum_nr_running >= group_capacity
-                   || !sum_nr_running)
-                       goto group_next;
+       /*
+        * Only the first idle cpu, or the first cpu (the busiest one) in
+        * this sched group, is eligible for doing load balancing at this
+        * and above domains. In the newly idle case, we allow all the
+        * cpus to do the newly idle load balance.
+        */
+       if (idle != CPU_NEWLY_IDLE && local_group &&
+           balance_cpu != this_cpu && balance) {
+               *balance = 0;
+               return;
+       }
  
-               /*
-                * Calculate the group which has the least non-idle load.
-                * This is the group from where we need to pick up the load
-                * for saving power
-                */
-               if ((sum_nr_running < min_nr_running) ||
-                   (sum_nr_running == min_nr_running &&
-                    cpumask_first(sched_group_cpus(group)) >
-                    cpumask_first(sched_group_cpus(group_min)))) {
-                       group_min = group;
-                       min_nr_running = sum_nr_running;
-                       min_load_per_task = sum_weighted_load /
-                                               sum_nr_running;
-               }
+       /* Adjust by relative CPU power of the group */
+       sgs->avg_load = sg_div_cpu_power(group,
+                       sgs->group_load * SCHED_LOAD_SCALE);
  
-               /*
-                * Calculate the group which is almost near its
-                * capacity but still has some space to pick up some load
-                * from other group and save more power
-                */
-               if (sum_nr_running <= group_capacity - 1) {
-                       if (sum_nr_running > leader_nr_running ||
-                           (sum_nr_running == leader_nr_running &&
-                            cpumask_first(sched_group_cpus(group)) <
-                            cpumask_first(sched_group_cpus(group_leader)))) {
-                               group_leader = group;
-                               leader_nr_running = sum_nr_running;
-                       }
+       /*
+        * Consider the group unbalanced when the imbalance is larger
+        * than the average weight of two tasks.
+        *
+        * APZ: with cgroup the avg task weight can vary wildly and
+        *      might not be a suitable number - should we keep a
+        *      normalized nr_running number somewhere that negates
+        *      the hierarchy?
+        */
+       avg_load_per_task = sg_div_cpu_power(group,
+                       sum_avg_load_per_task * SCHED_LOAD_SCALE);
+       if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+               sgs->group_imb = 1;
+       sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
+ }
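/*
 * Companion sketch of the per-group statistics filled in above, again
 * inferred from usage; the real struct sg_lb_stats is defined earlier in
 * the patch and may differ in detail.
 */
struct sg_lb_stats_sketch {
	unsigned long avg_load;		 /* avg load across the cpus of the group */
	unsigned long group_load;	 /* total load over the cpus of the group */
	unsigned long sum_nr_running;	 /* nr of tasks running in the group */
	unsigned long sum_weighted_load; /* weighted load of the group's tasks */
	unsigned long group_capacity;	 /* nr of tasks that fit at full power */
	int group_imb;			 /* is the group itself imbalanced? */
};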
+ /**
+  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
+  * @sd: sched_domain whose statistics are to be updated.
+  * @this_cpu: Cpu for which load balance is currently performed.
+  * @idle: Idle status of this_cpu
+  * @sd_idle: Idle status of the sched_domain containing group.
+  * @cpus: Set of cpus considered for load balancing.
+  * @balance: Should we balance.
+  * @sds: variable to hold the statistics for this sched_domain.
+  */
+ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
+                       enum cpu_idle_type idle, int *sd_idle,
+                       const struct cpumask *cpus, int *balance,
+                       struct sd_lb_stats *sds)
+ {
+       struct sched_group *group = sd->groups;
+       struct sg_lb_stats sgs;
+       int load_idx;
+       init_sd_power_savings_stats(sd, sds, idle);
+       load_idx = get_sd_load_idx(sd, idle);
+       do {
+               int local_group;
+               local_group = cpumask_test_cpu(this_cpu,
+                                              sched_group_cpus(group));
+               memset(&sgs, 0, sizeof(sgs));
+               update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
+                               local_group, cpus, balance, &sgs);
+               if (local_group && balance && !(*balance))
+                       return;
+               sds->total_load += sgs.group_load;
+               sds->total_pwr += group->__cpu_power;
+               if (local_group) {
+                       sds->this_load = sgs.avg_load;
+                       sds->this = group;
+                       sds->this_nr_running = sgs.sum_nr_running;
+                       sds->this_load_per_task = sgs.sum_weighted_load;
+               } else if (sgs.avg_load > sds->max_load &&
+                          (sgs.sum_nr_running > sgs.group_capacity ||
+                               sgs.group_imb)) {
+                       sds->max_load = sgs.avg_load;
+                       sds->busiest = group;
+                       sds->busiest_nr_running = sgs.sum_nr_running;
+                       sds->busiest_load_per_task = sgs.sum_weighted_load;
+                       sds->group_imb = sgs.group_imb;
                }
- group_next:
- #endif
+               update_sd_power_savings_stats(group, sds, local_group, &sgs);
                group = group->next;
        } while (group != sd->groups);
  
-       if (!busiest || this_load >= max_load || busiest_nr_running == 0)
-               goto out_balanced;
-       avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
+ }
  
-       if (this_load >= avg_load ||
-                       100*max_load <= sd->imbalance_pct*this_load)
-               goto out_balanced;
+ /**
+  * fix_small_imbalance - Calculate the minor imbalance that exists
+  *                    amongst the groups of a sched_domain during
+  *                    load balancing.
+  * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
+  * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+  * @imbalance: Variable to store the imbalance.
+  */
+ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
+                               int this_cpu, unsigned long *imbalance)
+ {
+       unsigned long tmp, pwr_now = 0, pwr_move = 0;
+       unsigned int imbn = 2;
+       if (sds->this_nr_running) {
+               sds->this_load_per_task /= sds->this_nr_running;
+               if (sds->busiest_load_per_task >
+                               sds->this_load_per_task)
+                       imbn = 1;
+       } else
+               sds->this_load_per_task =
+                       cpu_avg_load_per_task(this_cpu);
  
-       busiest_load_per_task /= busiest_nr_running;
-       if (group_imb)
-               busiest_load_per_task = min(busiest_load_per_task, avg_load);
+       if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
+                       sds->busiest_load_per_task * imbn) {
+               *imbalance = sds->busiest_load_per_task;
+               return;
+       }
  
        /*
-        * We're trying to get all the cpus to the average_load, so we don't
-        * want to push ourselves above the average load, nor do we wish to
-        * reduce the max loaded cpu below the average load, as either of these
-        * actions would just result in more rebalancing later, and ping-pong
-        * tasks around. Thus we look for the minimum possible imbalance.
-        * Negative imbalances (*we* are more loaded than anyone else) will
-        * be counted as no imbalance for these purposes -- we can't fix that
-        * by pulling tasks to us. Be careful of negative numbers as they'll
-        * appear as very large values with unsigned longs.
+        * OK, we don't have enough imbalance to justify moving tasks;
+        * however, we may be able to increase total CPU power used by
+        * moving them.
         */
-       if (max_load <= busiest_load_per_task)
-               goto out_balanced;
  
+       pwr_now += sds->busiest->__cpu_power *
+                       min(sds->busiest_load_per_task, sds->max_load);
+       pwr_now += sds->this->__cpu_power *
+                       min(sds->this_load_per_task, sds->this_load);
+       pwr_now /= SCHED_LOAD_SCALE;
+       /* Amount of load we'd subtract */
+       tmp = sg_div_cpu_power(sds->busiest,
+                       sds->busiest_load_per_task * SCHED_LOAD_SCALE);
+       if (sds->max_load > tmp)
+               pwr_move += sds->busiest->__cpu_power *
+                       min(sds->busiest_load_per_task, sds->max_load - tmp);
+       /* Amount of load we'd add */
+       if (sds->max_load * sds->busiest->__cpu_power <
+               sds->busiest_load_per_task * SCHED_LOAD_SCALE)
+               tmp = sg_div_cpu_power(sds->this,
+                       sds->max_load * sds->busiest->__cpu_power);
+       else
+               tmp = sg_div_cpu_power(sds->this,
+                       sds->busiest_load_per_task * SCHED_LOAD_SCALE);
+       pwr_move += sds->this->__cpu_power *
+                       min(sds->this_load_per_task, sds->this_load + tmp);
+       pwr_move /= SCHED_LOAD_SCALE;
+       /* Move if we gain throughput */
+       if (pwr_move > pwr_now)
+               *imbalance = sds->busiest_load_per_task;
+ }
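/*
 * Reading aid for the block above (descriptive only, no new logic):
 * pwr_now and pwr_move estimate the useful throughput of the two groups
 * before and after moving one task of weight busiest_load_per_task:
 *
 *	sum over {busiest, this} of
 *		__cpu_power * min(load_per_task, group_load) / SCHED_LOAD_SCALE
 *
 * pwr_now uses the current loads, pwr_move the loads with one task's worth
 * shifted from the busiest group to this group (scaled by each group's cpu
 * power via sg_div_cpu_power()).  The sub-task-sized imbalance is only
 * acted upon when that shift would increase the estimate.
 */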
+ /**
+  * calculate_imbalance - Calculate the amount of imbalance present within the
+  *                     groups of a given sched_domain during load balance.
+  * @sds: statistics of the sched_domain whose imbalance is to be calculated.
+  * @this_cpu: Cpu for which currently load balance is being performed.
+  * @imbalance: The variable to store the imbalance.
+  */
+ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
+               unsigned long *imbalance)
+ {
+       unsigned long max_pull;
        /*
         * In the presence of smp nice balancing, certain scenarios can have
         * max load less than avg load(as we skip the groups at or below
         * its cpu_power, while calculating max_load..)
         */
-       if (max_load < avg_load) {
+       if (sds->max_load < sds->avg_load) {
                *imbalance = 0;
-               goto small_imbalance;
+               return fix_small_imbalance(sds, this_cpu, imbalance);
        }
  
        /* Don't want to pull so many tasks that a group would go idle */
-       max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
+       max_pull = min(sds->max_load - sds->avg_load,
+                       sds->max_load - sds->busiest_load_per_task);
  
        /* How much load to actually move to equalise the imbalance */
-       *imbalance = min(max_pull * busiest->__cpu_power,
-                               (avg_load - this_load) * this->__cpu_power)
+       *imbalance = min(max_pull * sds->busiest->__cpu_power,
+               (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
                        / SCHED_LOAD_SCALE;
  
        /*
         * a think about bumping its value to force at least one task to be
         * moved
         */
-       if (*imbalance < busiest_load_per_task) {
-               unsigned long tmp, pwr_now, pwr_move;
-               unsigned int imbn;
- small_imbalance:
-               pwr_move = pwr_now = 0;
-               imbn = 2;
-               if (this_nr_running) {
-                       this_load_per_task /= this_nr_running;
-                       if (busiest_load_per_task > this_load_per_task)
-                               imbn = 1;
-               } else
-                       this_load_per_task = cpu_avg_load_per_task(this_cpu);
+       if (*imbalance < sds->busiest_load_per_task)
+               return fix_small_imbalance(sds, this_cpu, imbalance);
  
-               if (max_load - this_load + busiest_load_per_task >=
-                                       busiest_load_per_task * imbn) {
-                       *imbalance = busiest_load_per_task;
-                       return busiest;
-               }
+ }
+ /******* find_busiest_group() helpers end here *********************/
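/*
 * How the helpers above fit together (summary of this hunk; the driver,
 * find_busiest_group(), is defined next):
 *
 *	find_busiest_group()
 *	    update_sd_lb_stats()              - walks every group in the domain
 *	        update_sg_lb_stats()            - per-group load statistics
 *	        update_sd_power_savings_stats() - tracks group_min/group_leader
 *	    calculate_imbalance()
 *	        fix_small_imbalance()           - sub-task-sized imbalances
 *	    check_power_save_busiest_group()    - power-savings fallback when
 *	                                          no ordinary imbalance exists
 */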
  
-               /*
-                * OK, we don't have enough imbalance to justify moving tasks,
-                * however we may be able to increase total CPU power used by
-                * moving them.
-                */
+ /**
+  * find_busiest_group - Returns the busiest group within the sched_domain
+  * if there is an imbalance. If there isn't an imbalance, and
+  * the user has opted for power-savings, it returns a group whose
+  * CPUs can be put to idle by rebalancing those tasks elsewhere, if
+  * such a group exists.
+  *
+  * Also calculates the amount of weighted load which should be moved
+  * to restore balance.
+  *
+  * @sd: The sched_domain whose busiest group is to be returned.
+  * @this_cpu: The cpu for which load balancing is currently being performed.
+  * @imbalance: Variable which stores amount of weighted load which should
+  *            be moved to restore balance/put a group to idle.
+  * @idle: The idle status of this_cpu.
+  * @sd_idle: The idleness of sd
+  * @cpus: The set of CPUs under consideration for load-balancing.
+  * @balance: Pointer to a variable indicating if this_cpu
+  *    is the appropriate cpu to perform load balancing at this_level.
+  *
+  * Returns:   - the busiest group if imbalance exists.
+  *            - If no imbalance and user has opted for power-savings balance,
+  *               return the least loaded group whose CPUs can be
+  *               put to idle by rebalancing its tasks onto our group.
+  */
+ static struct sched_group *
+ find_busiest_group(struct sched_domain *sd, int this_cpu,
+                  unsigned long *imbalance, enum cpu_idle_type idle,
+                  int *sd_idle, const struct cpumask *cpus, int *balance)
+ {
+       struct sd_lb_stats sds;
  
-               pwr_now += busiest->__cpu_power *
-                               min(busiest_load_per_task, max_load);
-               pwr_now += this->__cpu_power *
-                               min(this_load_per_task, this_load);
-               pwr_now /= SCHED_LOAD_SCALE;
-               /* Amount of load we'd subtract */
-               tmp = sg_div_cpu_power(busiest,
-                               busiest_load_per_task * SCHED_LOAD_SCALE);
-               if (max_load > tmp)
-                       pwr_move += busiest->__cpu_power *
-                               min(busiest_load_per_task, max_load - tmp);
-               /* Amount of load we'd add */
-               if (max_load * busiest->__cpu_power <
-                               busiest_load_per_task * SCHED_LOAD_SCALE)
-                       tmp = sg_div_cpu_power(this,
-                                       max_load * busiest->__cpu_power);
-               else
-                       tmp = sg_div_cpu_power(this,
-                               busiest_load_per_task * SCHED_LOAD_SCALE);
-               pwr_move += this->__cpu_power *
-                               min(this_load_per_task, this_load + tmp);
-               pwr_move /= SCHED_LOAD_SCALE;
+       memset(&sds, 0, sizeof(sds));
  
-               /* Move if we gain throughput */
-               if (pwr_move > pwr_now)
-                       *imbalance = busiest_load_per_task;
-       }
+       /*
+        * Compute the various statistics relevant for load balancing at
+        * this level.
+        */
+       update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
+                                       balance, &sds);
+       /* Cases where imbalance does not exist from POV of this_cpu */
+       /* 1) this_cpu is not the appropriate cpu to perform load balancing
+        *    at this level.
+        * 2) There is no busy sibling group to pull from.
+        * 3) This group is the busiest group.
+        * 4) This group is busier than the average busyness at this
+        *    sched_domain.
+        * 5) The imbalance is within the specified limit.
+        * 6) Any rebalance would lead to ping-pong
+        */
+       if (balance && !(*balance))
+               goto ret;
  
-       return busiest;
+       if (!sds.busiest || sds.busiest_nr_running == 0)
+               goto out_balanced;
  
- out_balanced:
- #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-       if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
-               goto ret;
+       if (sds.this_load >= sds.max_load)
+               goto out_balanced;
  
-       if (this == group_leader && group_leader != group_min) {
-               *imbalance = min_load_per_task;
-               if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
-                       cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
-                               cpumask_first(sched_group_cpus(group_leader));
-               }
-               return group_min;
-       }
- #endif
+       sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
+       if (sds.this_load >= sds.avg_load)
+               goto out_balanced;
+       if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+               goto out_balanced;
+       sds.busiest_load_per_task /= sds.busiest_nr_running;
+       if (sds.group_imb)
+               sds.busiest_load_per_task =
+                       min(sds.busiest_load_per_task, sds.avg_load);
+       /*
+        * We're trying to get all the cpus to the average_load, so we don't
+        * want to push ourselves above the average load, nor do we wish to
+        * reduce the max loaded cpu below the average load, as either of these
+        * actions would just result in more rebalancing later, and ping-pong
+        * tasks around. Thus we look for the minimum possible imbalance.
+        * Negative imbalances (*we* are more loaded than anyone else) will
+        * be counted as no imbalance for these purposes -- we can't fix that
+        * by pulling tasks to us. Be careful of negative numbers as they'll
+        * appear as very large values with unsigned longs.
+        */
+       if (sds.max_load <= sds.busiest_load_per_task)
+               goto out_balanced;
+       /* Looks like there is an imbalance. Compute it */
+       calculate_imbalance(&sds, this_cpu, imbalance);
+       return sds.busiest;
+ out_balanced:
+       /*
+        * There is no obvious imbalance. But check if we can do some balancing
+        * to save power.
+        */
+       if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
+               return sds.busiest;
  ret:
        *imbalance = 0;
        return NULL;
@@@ -3510,19 -3818,23 +3880,23 @@@ find_busiest_queue(struct sched_group *
   */
  #define MAX_PINNED_INTERVAL   512
  
+ /* Working cpumask for load_balance and load_balance_newidle. */
+ static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
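/*
 * Background on the per-CPU scratch mask above (context, not part of the
 * diff): with CONFIG_CPUMASK_OFFSTACK a cpumask is too large for the stack,
 * and the old code had to alloc_cpumask_var()/free_cpumask_var() on every
 * rebalance attempt and silently bail out on allocation failure.  A static
 * per-CPU mask avoids that allocation on this hot path; its storage is
 * carved out of the bootmem block in sched_init() further down.
 */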
  /*
   * Check this_cpu to ensure it is balanced within domain. Attempt to move
   * tasks if there is an imbalance.
   */
  static int load_balance(int this_cpu, struct rq *this_rq,
                        struct sched_domain *sd, enum cpu_idle_type idle,
-                       int *balance, struct cpumask *cpus)
+                       int *balance)
  {
        int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
        struct sched_group *group;
        unsigned long imbalance;
        struct rq *busiest;
        unsigned long flags;
+       struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
  
        cpumask_setall(cpus);
  
@@@ -3677,8 -3989,7 +4051,7 @@@ out
   * this_rq is locked.
   */
  static int
- load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
-                       struct cpumask *cpus)
+ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
  {
        struct sched_group *group;
        struct rq *busiest = NULL;
        int ld_moved = 0;
        int sd_idle = 0;
        int all_pinned = 0;
+       struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
  
        cpumask_setall(cpus);
  
@@@ -3826,10 -4138,6 +4200,6 @@@ static void idle_balance(int this_cpu, 
        struct sched_domain *sd;
        int pulled_task = 0;
        unsigned long next_balance = jiffies + HZ;
-       cpumask_var_t tmpmask;
-       if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
-               return;
  
        for_each_domain(this_cpu, sd) {
                unsigned long interval;
                if (sd->flags & SD_BALANCE_NEWIDLE)
                        /* If we've pulled tasks over stop searching: */
                        pulled_task = load_balance_newidle(this_cpu, this_rq,
-                                                          sd, tmpmask);
+                                                          sd);
  
                interval = msecs_to_jiffies(sd->balance_interval);
                if (time_after(next_balance, sd->last_balance + interval))
                 */
                this_rq->next_balance = next_balance;
        }
-       free_cpumask_var(tmpmask);
  }
  
  /*
@@@ -4005,11 -4312,6 +4374,6 @@@ static void rebalance_domains(int cpu, 
        unsigned long next_balance = jiffies + 60*HZ;
        int update_next_balance = 0;
        int need_serialize;
-       cpumask_var_t tmp;
-       /* Fails alloc?  Rebalancing probably not a priority right now. */
-       if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
-               return;
  
        for_each_domain(cpu, sd) {
                if (!(sd->flags & SD_LOAD_BALANCE))
                }
  
                if (time_after_eq(jiffies, sd->last_balance + interval)) {
-                       if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
+                       if (load_balance(cpu, rq, sd, idle, &balance)) {
                                /*
                                 * We've pulled tasks over so either we're no
                                 * longer idle, or one of our SMT siblings is
@@@ -4068,8 -4370,6 +4432,6 @@@ out
         */
        if (likely(update_next_balance))
                rq->next_balance = next_balance;
-       free_cpumask_var(tmp);
  }
  
  /*
@@@ -4119,6 -4419,11 +4481,11 @@@ static void run_rebalance_domains(struc
  #endif
  }
  
+ static inline int on_null_domain(int cpu)
+ {
+       return !rcu_dereference(cpu_rq(cpu)->sd);
+ }
  /*
   * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
   *
@@@ -4176,7 -4481,9 +4543,9 @@@ static inline void trigger_load_balance
            cpumask_test_cpu(cpu, nohz.cpu_mask))
                return;
  #endif
-       if (time_after_eq(jiffies, rq->next_balance))
+       /* Don't need to rebalance while attached to NULL domain */
+       if (time_after_eq(jiffies, rq->next_balance) &&
+           likely(!on_null_domain(cpu)))
                raise_softirq(SCHED_SOFTIRQ);
  }
  
@@@ -4199,29 -4506,6 +4568,29 @@@ EXPORT_PER_CPU_SYMBOL(kstat)
   * Return any ns on the sched_clock that have not yet been banked in
   * @p in case that task is currently running.
   */
 +unsigned long long __task_delta_exec(struct task_struct *p, int update)
 +{
 +      s64 delta_exec;
 +      struct rq *rq;
 +
 +      rq = task_rq(p);
 +      WARN_ON_ONCE(!runqueue_is_locked());
 +      WARN_ON_ONCE(!task_current(rq, p));
 +
 +      if (update)
 +              update_rq_clock(rq);
 +
 +      delta_exec = rq->clock - p->se.exec_start;
 +
 +      WARN_ON_ONCE(delta_exec < 0);
 +
 +      return delta_exec;
 +}
 +
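/*
 * Minimal usage sketch (an assumption about the intended perf counter
 * caller, not code from this patch): the caller must satisfy the
 * WARN_ON_ONCE() conditions above, i.e. hold the runqueue lock while @p is
 * the task currently running on that runqueue.
 */
static inline u64 task_clock_sketch(struct task_struct *p)
{
	/* time already banked plus whatever the running task has accrued */
	return p->se.sum_exec_runtime + __task_delta_exec(p, 1);
}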
 +/*
 + * Return any ns on the sched_clock that have not yet been banked in
 + * @p in case that task is currently running.
 + */
  unsigned long long task_delta_exec(struct task_struct *p)
  {
        unsigned long flags;
@@@ -4481,7 -4765,6 +4850,7 @@@ void scheduler_tick(void
        update_rq_clock(rq);
        update_cpu_load(rq);
        curr->sched_class->task_tick(rq, curr, 0);
 +      perf_counter_task_tick(curr, cpu);
        spin_unlock(&rq->lock);
  
  #ifdef CONFIG_SMP
  #endif
  }
  
- #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
-                               defined(CONFIG_PREEMPT_TRACER))
- static inline unsigned long get_parent_ip(unsigned long addr)
+ unsigned long get_parent_ip(unsigned long addr)
  {
        if (in_lock_functions(addr)) {
                addr = CALLER_ADDR2;
        return addr;
  }
  
+ #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+                               defined(CONFIG_PREEMPT_TRACER))
  void __kprobes add_preempt_count(int val)
  {
  #ifdef CONFIG_DEBUG_PREEMPT
@@@ -4594,11 -4877,33 +4963,33 @@@ static inline void schedule_debug(struc
  #endif
  }
  
+ static void put_prev_task(struct rq *rq, struct task_struct *prev)
+ {
+       if (prev->state == TASK_RUNNING) {
+               u64 runtime = prev->se.sum_exec_runtime;
+               runtime -= prev->se.prev_sum_exec_runtime;
+               runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
+               /*
+                * In order to avoid avg_overlap growing stale when we are
+                * indeed overlapping and hence not getting put to sleep, grow
+                * the avg_overlap on preemption.
+                *
+                * We use the average preemption runtime because that
+                * correlates to the amount of cache footprint a task can
+                * build up.
+                */
+               update_avg(&prev->se.avg_overlap, runtime);
+       }
+       prev->sched_class->put_prev_task(rq, prev);
+ }
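/*
 * Note on the clamp above (background; update_avg() is defined elsewhere in
 * this file and is assumed to be an exponentially decaying average, roughly
 * *avg += (sample - *avg) / 8): feeding it the preemption runtime, capped at
 * twice the migration cost, keeps avg_overlap from growing stale for tasks
 * that keep running without ever sleeping.
 */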
  /*
   * Pick up the highest-prio task:
   */
  static inline struct task_struct *
- pick_next_task(struct rq *rq, struct task_struct *prev)
+ pick_next_task(struct rq *rq)
  {
        const struct sched_class *class;
        struct task_struct *p;
  /*
   * schedule() is the main scheduler function.
   */
- asmlinkage void __sched schedule(void)
+ asmlinkage void __sched __schedule(void)
  {
        struct task_struct *prev, *next;
        unsigned long *switch_count;
        struct rq *rq;
        int cpu;
  
- need_resched:
-       preempt_disable();
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
        rcu_qsctr_inc(cpu);
@@@ -4672,12 -4975,11 +5061,12 @@@ need_resched_nonpreemptible
        if (unlikely(!rq->nr_running))
                idle_balance(cpu, rq);
  
-       prev->sched_class->put_prev_task(rq, prev);
-       next = pick_next_task(rq, prev);
+       put_prev_task(rq, prev);
+       next = pick_next_task(rq);
  
        if (likely(prev != next)) {
                sched_info_switch(prev, next);
 +              perf_counter_task_sched_out(prev, cpu);
  
                rq->nr_switches++;
                rq->curr = next;
  
        if (unlikely(reacquire_kernel_lock(current) < 0))
                goto need_resched_nonpreemptible;
+ }
  
+ asmlinkage void __sched schedule(void)
+ {
+ need_resched:
+       preempt_disable();
+       __schedule();
        preempt_enable_no_resched();
        if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
                goto need_resched;
  }
  EXPORT_SYMBOL(schedule);
  
+ #ifdef CONFIG_SMP
+ /*
+  * Look out! "owner" is an entirely speculative pointer
+  * access and not reliable.
+  */
+ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
+ {
+       unsigned int cpu;
+       struct rq *rq;
+       if (!sched_feat(OWNER_SPIN))
+               return 0;
+ #ifdef CONFIG_DEBUG_PAGEALLOC
+       /*
+        * Need to access the cpu field knowing that
+        * DEBUG_PAGEALLOC could have unmapped it if
+        * the mutex owner just released it and exited.
+        */
+       if (probe_kernel_address(&owner->cpu, cpu))
+               goto out;
+ #else
+       cpu = owner->cpu;
+ #endif
+       /*
+        * Even if the access succeeded (likely case),
+        * the cpu field may no longer be valid.
+        */
+       if (cpu >= nr_cpumask_bits)
+               goto out;
+       /*
+        * We need to validate that we can do a
+        * get_cpu() and that we have the percpu area.
+        */
+       if (!cpu_online(cpu))
+               goto out;
+       rq = cpu_rq(cpu);
+       for (;;) {
+               /*
+                * Owner changed, break to re-assess state.
+                */
+               if (lock->owner != owner)
+                       break;
+               /*
+                * Is that owner really running on that cpu?
+                */
+               if (task_thread_info(rq->curr) != owner || need_resched())
+                       return 0;
+               cpu_relax();
+       }
+ out:
+       return 1;
+ }
+ #endif
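/*
 * Sketch of the intended caller (illustrative only; the real adaptive spin
 * loop lives in kernel/mutex.c and handles additional cases):
 */
static inline int mutex_spin_sketch(struct mutex *lock)
{
	for (;;) {
		struct thread_info *owner = ACCESS_ONCE(lock->owner);

		/* Stop spinning once the owner is no longer running. */
		if (owner && !mutex_spin_on_owner(lock, owner))
			return 0;

		/* Try to grab the lock; count == 1 means it is unlocked. */
		if (atomic_cmpxchg(&lock->count, 1, 0) == 1)
			return 1;

		if (need_resched())
			return 0;

		cpu_relax();
	}
}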
  #ifdef CONFIG_PREEMPT
  /*
   * this is the entry point to schedule() from in-kernel preemption
@@@ -4729,7 -5098,7 +5185,7 @@@ asmlinkage void __sched preempt_schedul
                 * between schedule and now.
                 */
                barrier();
-       } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
+       } while (need_resched());
  }
  EXPORT_SYMBOL(preempt_schedule);
  
@@@ -4758,7 -5127,7 +5214,7 @@@ asmlinkage void __sched preempt_schedul
                 * between schedule and now.
                 */
                barrier();
-       } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
+       } while (need_resched());
  }
  
  #endif /* CONFIG_PREEMPT */
@@@ -4819,11 -5188,17 +5275,17 @@@ void __wake_up_locked(wait_queue_head_
        __wake_up_common(q, mode, 1, 0, NULL);
  }
  
+ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+ {
+       __wake_up_common(q, mode, 1, 0, key);
+ }
  /**
-  * __wake_up_sync - wake up threads blocked on a waitqueue.
+  * __wake_up_sync_key - wake up threads blocked on a waitqueue.
   * @q: the waitqueue
   * @mode: which threads
   * @nr_exclusive: how many wake-one or wake-many threads to wake up
+  * @key: opaque value to be passed to wakeup targets
   *
   * The sync wakeup differs in that the waker knows that it will schedule
   * away soon, so while the target thread will be woken up, it will not
   *
   * On UP it can prevent extra preemption.
   */
- void
- __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
+                       int nr_exclusive, void *key)
  {
        unsigned long flags;
        int sync = 1;
                sync = 0;
  
        spin_lock_irqsave(&q->lock, flags);
-       __wake_up_common(q, mode, nr_exclusive, sync, NULL);
+       __wake_up_common(q, mode, nr_exclusive, sync, key);
        spin_unlock_irqrestore(&q->lock, flags);
  }
+ EXPORT_SYMBOL_GPL(__wake_up_sync_key);
+ /*
+  * __wake_up_sync - see __wake_up_sync_key()
+  */
+ void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+ {
+       __wake_up_sync_key(q, mode, nr_exclusive, NULL);
+ }
  EXPORT_SYMBOL_GPL(__wake_up_sync);    /* For internal use only */
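/*
 * Usage note (an assumption based on the keyed-wakeup work this interface
 * supports): callers such as epoll pass an event mask through @key, e.g.
 * via a wrapper along the lines of
 *
 *	wake_up_interruptible_sync_poll(&wq, POLLIN);
 *
 * which ends up as __wake_up_sync_key(&wq, TASK_INTERRUPTIBLE, 1,
 * (void *)POLLIN), letting waiters filter wakeups on the events they
 * actually care about.
 */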
  
  /**
@@@ -5232,7 -5616,7 +5703,7 @@@ SYSCALL_DEFINE1(nice, int, increment
        if (increment > 40)
                increment = 40;
  
-       nice = PRIO_TO_NICE(current->static_prio) + increment;
+       nice = TASK_NICE(current) + increment;
        if (nice < -20)
                nice = -20;
        if (nice > 19)
@@@ -6505,7 -6889,7 +6976,7 @@@ static void migrate_dead_tasks(unsigne
                if (!rq->nr_running)
                        break;
                update_rq_clock(rq);
-               next = pick_next_task(rq, rq->curr);
+               next = pick_next_task(rq);
                if (!next)
                        break;
                next->sched_class->put_prev_task(rq, next);
@@@ -7336,7 -7720,7 +7807,7 @@@ cpu_to_core_group(int cpu, const struc
  {
        int group;
  
-       cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+       cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
        group = cpumask_first(mask);
        if (sg)
                *sg = &per_cpu(sched_group_core, group).sg;
@@@ -7365,7 -7749,7 +7836,7 @@@ cpu_to_phys_group(int cpu, const struc
        cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
        group = cpumask_first(mask);
  #elif defined(CONFIG_SCHED_SMT)
-       cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+       cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
        group = cpumask_first(mask);
  #else
        group = cpu;
@@@ -7708,7 -8092,7 +8179,7 @@@ static int __build_sched_domains(const 
                SD_INIT(sd, SIBLING);
                set_domain_attribute(sd, attr);
                cpumask_and(sched_domain_span(sd),
-                           &per_cpu(cpu_sibling_map, i), cpu_map);
+                           topology_thread_cpumask(i), cpu_map);
                sd->parent = p;
                p->child = sd;
                cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
        /* Set up CPU (sibling) groups */
        for_each_cpu(i, cpu_map) {
                cpumask_and(this_sibling_map,
-                           &per_cpu(cpu_sibling_map, i), cpu_map);
+                           topology_thread_cpumask(i), cpu_map);
                if (i != cpumask_first(this_sibling_map))
                        continue;
  
@@@ -8300,11 -8684,15 +8771,15 @@@ static void init_rt_rq(struct rt_rq *rt
        __set_bit(MAX_RT_PRIO, array->bitmap);
  
  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-       rt_rq->highest_prio = MAX_RT_PRIO;
+       rt_rq->highest_prio.curr = MAX_RT_PRIO;
+ #ifdef CONFIG_SMP
+       rt_rq->highest_prio.next = MAX_RT_PRIO;
+ #endif
  #endif
  #ifdef CONFIG_SMP
        rt_rq->rt_nr_migratory = 0;
        rt_rq->overloaded = 0;
+       plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
  #endif
  
        rt_rq->rt_time = 0;
@@@ -8391,6 -8779,9 +8866,9 @@@ void __init sched_init(void
  #ifdef CONFIG_USER_SCHED
        alloc_size *= 2;
  #endif
+ #ifdef CONFIG_CPUMASK_OFFSTACK
+       alloc_size += num_possible_cpus() * cpumask_size();
+ #endif
        /*
         * As sched_init() is called before page_alloc is set up,
         * we use alloc_bootmem().
                ptr += nr_cpu_ids * sizeof(void **);
  #endif /* CONFIG_USER_SCHED */
  #endif /* CONFIG_RT_GROUP_SCHED */
+ #ifdef CONFIG_CPUMASK_OFFSTACK
+               for_each_possible_cpu(i) {
+                       per_cpu(load_balance_tmpmask, i) = (void *)ptr;
+                       ptr += cpumask_size();
+               }
+ #endif /* CONFIG_CPUMASK_OFFSTACK */
        }
  
  #ifdef CONFIG_SMP
@@@ -9572,7 -9969,7 +10056,7 @@@ cpuacct_destroy(struct cgroup_subsys *s
  
  static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
  {
-       u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+       u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
        u64 data;
  
  #ifndef CONFIG_64BIT
  
  static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
  {
-       u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+       u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
  
  #ifndef CONFIG_64BIT
        /*
@@@ -9680,14 -10077,14 +10164,14 @@@ static void cpuacct_charge(struct task_
        struct cpuacct *ca;
        int cpu;
  
-       if (!cpuacct_subsys.active)
+       if (unlikely(!cpuacct_subsys.active))
                return;
  
        cpu = task_cpu(tsk);
        ca = task_ca(tsk);
  
        for (; ca; ca = ca->parent) {
-               u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+               u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
                *cpuusage += cputime;
        }
  }
diff --combined kernel/sys.c
@@@ -14,7 -14,6 +14,7 @@@
  #include <linux/prctl.h>
  #include <linux/highuid.h>
  #include <linux/fs.h>
 +#include <linux/perf_counter.h>
  #include <linux/resource.h>
  #include <linux/kernel.h>
  #include <linux/kexec.h>
@@@ -35,6 -34,7 +35,7 @@@
  #include <linux/seccomp.h>
  #include <linux/cpu.h>
  #include <linux/ptrace.h>
+ #include <linux/fs_struct.h>
  
  #include <linux/compat.h>
  #include <linux/syscalls.h>
@@@ -1014,10 -1014,8 +1015,8 @@@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pi
        if (err)
                goto out;
  
-       if (task_pgrp(p) != pgrp) {
+       if (task_pgrp(p) != pgrp)
                change_pid(p, PIDTYPE_PGID, pgrp);
-               set_task_pgrp(p, pid_nr(pgrp));
-       }
  
        err = 0;
  out:
@@@ -1801,12 -1799,6 +1800,12 @@@ SYSCALL_DEFINE5(prctl, int, option, uns
                case PR_SET_TSC:
                        error = SET_TSC_CTL(arg2);
                        break;
 +              case PR_TASK_PERF_COUNTERS_DISABLE:
 +                      error = perf_counter_task_disable();
 +                      break;
 +              case PR_TASK_PERF_COUNTERS_ENABLE:
 +                      error = perf_counter_task_enable();
 +                      break;
                case PR_GET_TIMERSLACK:
                        error = current->timer_slack_ns;
                        break;