Merge branch 'tracing-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...

author Linus Torvalds <torvalds@linux-foundation.org>

Sun, 28 Dec 2008 20:21:10 +0000 (12:21 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Sun, 28 Dec 2008 20:21:10 +0000 (12:21 -0800)
author Linus Torvalds <torvalds@linux-foundation.org>
Sun, 28 Dec 2008 20:21:10 +0000 (12:21 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sun, 28 Dec 2008 20:21:10 +0000 (12:21 -0800)
diff --combined Documentation/kernel-parameters.txt

index 2c95cae,f44552e..68e7694
--- 1/Documentation/kernel-parameters.txt
--- 2/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@@ -89,6 -89,7 +89,7 @@@ parameter is applicable
         SPARC   Sparc architecture is enabled.
         SWSUSP  Software suspend (hibernation) is enabled.
         SUSPEND System suspend states are enabled.
+       FTRACE  Function tracing enabled.
         TS      Appropriate touchscreen support is enabled.
         USB     USB support is enabled.
         USBHID  USB Human Interface Device support is enabled.
@@@ -753,6 -754,14 +754,14 @@@ and is between 256 and 4096 characters
                         parameter will force ia64_sal_cache_flush to call
                         ia64_pal_cache_flush instead of SAL_CACHE_FLUSH.
   
+       ftrace=[tracer]
+                       [ftrace] will set and start the specified tracer
+                       as early as possible in order to facilitate early
+                       boot debugging.
+ 
+       ftrace_dump_on_oops
+                       [ftrace] will dump the trace buffers on oops.
+ 
         gamecon.map[2|3]=
                         [HW,JOY] Multisystem joystick and NES/SNES/PSX pad
                         support via parallel port (up to 5 devices per port)
@@@ -1396,20 -1405,7 +1405,20 @@@
                         when a NMI is triggered.
                         Format: [state][,regs][,debounce][,die]
   
- -      nmi_watchdog=   [KNL,BUGS=X86-32] Debugging features for SMP kernels
+ +      nmi_watchdog=   [KNL,BUGS=X86-32,X86-64] Debugging features for SMP kernels
+ +                      Format: [panic,][num]
+ +                      Valid num: 0,1,2
+ +                      0 - turn nmi_watchdog off
+ +                      1 - use the IO-APIC timer for the NMI watchdog
+ +                      2 - use the local APIC for the NMI watchdog using
+ +                      a performance counter. Note: This will use one performance
+ +                      counter and the local APIC's performance vector.
+ +                      When panic is specified panic when an NMI watchdog timeout occurs.
+ +                      This is useful when you use a panic=... timeout and need the box
+ +                      quickly up again.
+ +                      Instead of 1 and 2 it is possible to use the following
+ +                      symbolic names: lapic and ioapic
+ +                      Example: nmi_watchdog=2 or nmi_watchdog=panic,lapic
   
         no387           [BUGS=X86-32] Tells the kernel to use the 387 maths
                         emulation library even if a 387 maths coprocessor
@@@ -1465,10 -1461,6 +1474,10 @@@
                         instruction doesn't work correctly and not to
                         use it.
   
+ +      no_file_caps    Tells the kernel not to honor file capabilities.  The
+ +                      only way then for a file to be executed with privilege
+ +                      is to be setuid root or executed by root.
+ +
         nohalt          [IA-64] Tells the kernel not to use the power saving
                         function PAL_HALT_LIGHT when idle. This increases
                         power-consumption. On the positive side, it reduces
@@@ -1646,17 -1638,6 +1655,17 @@@
                 nomsi           [MSI] If the PCI_MSI kernel config parameter is
                                 enabled, this kernel boot option can be used to
                                 disable the use of MSI interrupts system-wide.
+ +              noioapicquirk   [APIC] Disable all boot interrupt quirks.
+ +                              Safety option to keep boot IRQs enabled. This
+ +                              should never be necessary.
+ +              ioapicreroute   [APIC] Enable rerouting of boot IRQs to the
+ +                              primary IO-APIC for bridges that cannot disable
+ +                              boot IRQs. This fixes a source of spurious IRQs
+ +                              when the system masks IRQs.
+ +              noioapicreroute [APIC] Disable workaround that uses the
+ +                              boot IRQ equivalent of an IRQ that connects to
+ +                              a chipset where boot IRQs cannot be disabled.
+ +                              The opposite of ioapicreroute.
                 biosirq         [X86-32] Use PCI BIOS calls to get the interrupt
                                 routing table. These calls are known to be buggy
                                 on several machines and they hang the machine
@@@ -2196,6 -2177,9 +2205,9 @@@
         st=             [HW,SCSI] SCSI tape parameters (buffers, etc.)
                         See Documentation/scsi/st.txt.
   
+       stacktrace      [FTRACE]
+                       Enable the stack tracer on boot up.
+ 
         sti=            [PARISC,HW]
                         Format: <num>
                         Set the STI (builtin display/keyboard on the HP-PARISC
@@@ -2286,13 -2270,6 +2298,13 @@@
                         Format:
                         <io>,<irq>,<dma>,<dma2>,<sb_io>,<sb_irq>,<sb_dma>,<mpu_io>,<mpu_irq>
   
+ +      tsc=            Disable clocksource-must-verify flag for TSC.
+ +                      Format: <string>
+ +                      [x86] reliable: mark tsc clocksource as reliable, this
+ +                      disables clocksource verification at runtime.
+ +                      Used to enable high-resolution timer mode on older
+ +                      hardware, and in virtualized environments.
+ +
         turbografx.map[2|3]=    [HW,JOY]
                         TurboGraFX parallel port interface
                         Format:
diff --combined arch/x86/Kconfig

index a2ae4c0,45c86fb..7b17f9d
--- 1/arch/x86/Kconfig
--- 2/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@@ -19,8 -19,6 +19,8 @@@ config X86_6
   config X86
         def_bool y
         select HAVE_AOUT if X86_32
+ +      select HAVE_READQ
+ +      select HAVE_WRITEQ
         select HAVE_UNSTABLE_SCHED_CLOCK
         select HAVE_IDE
         select HAVE_OPROFILE
@@@ -31,11 -29,14 +31,14 @@@
         select HAVE_FTRACE_MCOUNT_RECORD
         select HAVE_DYNAMIC_FTRACE
         select HAVE_FUNCTION_TRACER
+       select HAVE_FUNCTION_GRAPH_TRACER
+       select HAVE_FUNCTION_TRACE_MCOUNT_TEST
         select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
         select HAVE_ARCH_KGDB if !X86_VOYAGER
         select HAVE_ARCH_TRACEHOOK
         select HAVE_GENERIC_DMA_COHERENT if X86_32
         select HAVE_EFFICIENT_UNALIGNED_ACCESS
+       select USER_STACKTRACE_SUPPORT
   
   config ARCH_DEFCONFIG
         string
@@@ -89,10 -90,6 +92,10 @@@ config GENERIC_IOMA
   config GENERIC_BUG
         def_bool y
         depends on BUG
+ +      select GENERIC_BUG_RELATIVE_POINTERS if X86_64
+ +
+ +config GENERIC_BUG_RELATIVE_POINTERS
+ +      bool
   
   config GENERIC_HWEIGHT
         def_bool y
@@@ -248,13 -245,21 +251,13 @@@ config X86_FIND_SMP_CONFI
         def_bool y
         depends on X86_MPPARSE || X86_VOYAGER
   
- -if ACPI
   config X86_MPPARSE
- -      def_bool y
- -      bool "Enable MPS table"
+ +      bool "Enable MPS table" if ACPI
+ +      default y
         depends on X86_LOCAL_APIC
         help
           For old smp systems that do not have proper acpi support. Newer systems
           (esp with 64bit cpus) with acpi support, MADT and DSDT will override it
- -endif
- -
- -if !ACPI
- -config X86_MPPARSE
- -      def_bool y
- -      depends on X86_LOCAL_APIC
- -endif
   
   choice
         prompt "Subarchitecture Type"
@@@ -463,6 -468,10 +466,6 @@@ config X86_CYCLONE_TIME
         def_bool y
         depends on X86_GENERICARCH
   
- -config ES7000_CLUSTERED_APIC
- -      def_bool y
- -      depends on SMP && X86_ES7000 && MPENTIUMIII
- -
   source "arch/x86/Kconfig.cpu"
   
   config HPET_TIMER
@@@ -563,7 -572,7 +566,7 @@@ config AMD_IOMM
   
   # need this always selected by IOMMU for the VIA workaround
   config SWIOTLB
- -      bool
+ +      def_bool y if X86_64
         help
           Support for software bounce buffers used on x86-64 systems
           which don't have a hardware IOMMU (e.g. the current generation
@@@ -654,30 -663,6 +657,30 @@@ config X86_VISWS_API
         def_bool y
         depends on X86_32 && X86_VISWS
   
+ +config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
+ +      bool "Reroute for broken boot IRQs"
+ +      default n
+ +      depends on X86_IO_APIC
+ +      help
+ +        This option enables a workaround that fixes a source of
+ +        spurious interrupts. This is recommended when threaded
+ +        interrupt handling is used on systems where the generation of
+ +        superfluous "boot interrupts" cannot be disabled.
+ +
+ +        Some chipsets generate a legacy INTx "boot IRQ" when the IRQ
+ +        entry in the chipset's IO-APIC is masked (as, e.g. the RT
+ +        kernel does during interrupt handling). On chipsets where this
+ +        boot IRQ generation cannot be disabled, this workaround keeps
+ +        the original IRQ line masked so that only the equivalent "boot
+ +        IRQ" is delivered to the CPUs. The workaround also tells the
+ +        kernel to set up the IRQ handler on the boot IRQ line. In this
+ +        way only one interrupt is delivered to the kernel. Otherwise
+ +        the spurious second interrupt may cause the kernel to bring
+ +        down (vital) interrupt lines.
+ +
+ +        Only affects "broken" chipsets. Interrupt sharing may be
+ +        increased on these systems.
+ +
   config X86_MCE
         bool "Machine Check Exception"
         depends on !X86_VOYAGER
@@@ -974,37 -959,24 +977,37 @@@ config X86_PA
   config ARCH_PHYS_ADDR_T_64BIT
          def_bool X86_64 || X86_PAE
   
+ +config DIRECT_GBPAGES
+ +      bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
+ +      default y
+ +      depends on X86_64
+ +      help
+ +        Allow the kernel linear mapping to use 1GB pages on CPUs that
+ +        support it. This can improve the kernel's performance a tiny bit by
+ +        reducing TLB pressure. If in doubt, say "Y".
+ +
   # Common NUMA Features
   config NUMA
- -      bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)"
+ +      bool "Numa Memory Allocation and Scheduler Support"
         depends on SMP
         depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL)
         default n if X86_PC
         default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP)
         help
           Enable NUMA (Non Uniform Memory Access) support.
+ +
           The kernel will try to allocate memory used by a CPU on the
           local memory controller of the CPU and add some more
           NUMA awareness to the kernel.
   
- -        For 32-bit this is currently highly experimental and should be only
- -        used for kernel development. It might also cause boot failures.
- -        For 64-bit this is recommended on all multiprocessor Opteron systems.
- -        If the system is EM64T, you should say N unless your system is
- -        EM64T NUMA.
+ +        For 64-bit this is recommended if the system is Intel Core i7
+ +        (or later), AMD Opteron, or EM64T NUMA.
+ +
+ +        For 32-bit this is only needed on (rare) 32-bit-only platforms
+ +        that support NUMA topologies, such as NUMAQ / Summit, or if you
+ +        boot a 32-bit kernel on a 64-bit NUMA platform.
+ +
+ +        Otherwise, you should say N.
   
   comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
         depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI)
@@@ -1524,10 -1496,6 +1527,10 @@@ config ARCH_ENABLE_MEMORY_HOTPLU
         def_bool y
         depends on X86_64 || (X86_32 && HIGHMEM)
   
+ +config ARCH_ENABLE_MEMORY_HOTREMOVE
+ +      def_bool y
+ +      depends on MEMORY_HOTPLUG
+ +
   config HAVE_ARCH_EARLY_PFN_TO_NID
         def_bool X86_64
         depends on NUMA
@@@ -1667,6 -1635,13 +1670,6 @@@ config APM_ALLOW_INT
           many of the newer IBM Thinkpads.  If you experience hangs when you
           suspend, try setting this to Y.  Otherwise, say N.
   
- -config APM_REAL_MODE_POWER_OFF
- -      bool "Use real mode APM BIOS call to power off"
- -      help
- -        Use real mode APM BIOS calls to switch off the computer. This is
- -        a work-around for a number of buggy BIOSes. Switch this option on if
- -        your computer crashes instead of powering off properly.
- -
   endif # APM
   
   source "arch/x86/kernel/cpu/cpufreq/Kconfig"
diff --combined arch/x86/Kconfig.debug

index 4ee7686,fa013f5..10d6cc3
--- 1/arch/x86/Kconfig.debug
--- 2/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@@ -114,6 -114,18 +114,6 @@@ config DEBUG_RODAT
           data. This is recommended so that we can catch kernel bugs sooner.
           If in doubt, say "Y".
   
- -config DIRECT_GBPAGES
- -      bool "Enable gbpages-mapped kernel pagetables"
- -      depends on DEBUG_KERNEL && EXPERIMENTAL && X86_64
- -      help
- -        Enable gigabyte pages support (if the CPU supports it). This can
- -        improve the kernel's performance a tiny bit by reducing TLB
- -        pressure.
- -
- -        This is experimental code.
- -
- -        If in doubt, say "N".
- -
   config DEBUG_RODATA_TEST
         bool "Testcase for the DEBUG_RODATA feature"
         depends on DEBUG_RODATA
@@@ -174,14 -186,10 +174,10 @@@ config IOMMU_LEA
           Add a simple leak tracer to the IOMMU code. This is useful when you
           are debugging a buggy device driver that leaks IOMMU mappings.
   
- config MMIOTRACE_HOOKS
-       bool
- 
   config MMIOTRACE
         bool "Memory mapped IO tracing"
         depends on DEBUG_KERNEL && PCI
         select TRACING
-       select MMIOTRACE_HOOKS
         help
           Mmiotrace traces Memory Mapped I/O access and is meant for
           debugging and reverse engineering. It is called from the ioremap
@@@ -295,10 -303,10 +291,10 @@@ config OPTIMIZE_INLININ
           developers have marked 'inline'. Doing so takes away freedom from gcc to
           do what it thinks is best, which is desirable for the gcc 3.x series of
           compilers. The gcc 4.x series have a rewritten inlining algorithm and
- -        disabling this option will generate a smaller kernel there. Hopefully
- -        this algorithm is so good that allowing gcc4 to make the decision can
- -        become the default in the future, until then this option is there to
- -        test gcc for this.
+ +        enabling this option will generate a smaller kernel there. Hopefully
+ +        this algorithm is so good that allowing gcc 4.x and above to make the
+ +        decision will become the default in the future. Until then this option
+ +        is there to test gcc for this.
   
           If unsure, say N.
   
diff --combined arch/x86/include/asm/msr.h

index 4640ddd,b8a1799..638bf62
--- 1/arch/x86/include/asm/msr.h
--- 2/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@@ -22,10 -22,10 +22,10 @@@ static inline unsigned long long native
   }
   
   /*
- - * i386 calling convention returns 64-bit value in edx:eax, while
- - * x86_64 returns at rax. Also, the "A" constraint does not really
- - * mean rdx:rax in x86_64, so we need specialized behaviour for each
- - * architecture
+ + * both i386 and x86_64 return a 64-bit value in edx:eax, but gcc's "A"
+ + * constraint has different meanings. For i386, "A" means exactly
+ + * edx:eax, while for x86_64 it doesn't mean rdx:rax or edx:eax. Instead,
+ + * it means rax *or* rdx.
    */
   #ifdef CONFIG_X86_64
   #define DECLARE_ARGS(val, low, high)  unsigned low, high
@@@ -85,7 -85,8 +85,8 @@@ static inline void native_write_msr(uns
         asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory");
   }
   
- static inline int native_write_msr_safe(unsigned int msr,
+ /* Can be uninlined because referenced by paravirt */
+ notrace static inline int native_write_msr_safe(unsigned int msr,
                                         unsigned low, unsigned high)
   {
         int err;
@@@ -181,10 -182,10 +182,10 @@@ static inline int rdmsrl_amd_safe(unsig
   }
   
   #define rdtscl(low)                                           \
- -      ((low) = (u32)native_read_tsc())
+ +      ((low) = (u32)__native_read_tsc())
   
   #define rdtscll(val)                                          \
- -      ((val) = native_read_tsc())
+ +      ((val) = __native_read_tsc())
   
   #define rdpmc(counter, low, high)                     \
   do {                                                  \
diff --combined arch/x86/include/asm/processor.h

index a570eaf,aa5914f..091cd88
--- 1/arch/x86/include/asm/processor.h
--- 2/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@@ -110,7 -110,6 +110,7 @@@ struct cpuinfo_x86 
         /* Index into per_cpu list: */
         u16                     cpu_index;
   #endif
+ +      unsigned int            x86_hyper_vendor;
   } __attribute__((__aligned__(SMP_CACHE_BYTES)));
   
   #define X86_VENDOR_INTEL      0
@@@ -124,9 -123,6 +124,9 @@@
   
   #define X86_VENDOR_UNKNOWN    0xff
   
+ +#define X86_HYPER_VENDOR_NONE  0
+ +#define X86_HYPER_VENDOR_VMWARE 1
+ +
   /*
    * capabilities of CPUs
    */
@@@ -756,6 -752,19 +756,19 @@@ extern void switch_to_new_gdt(void)
   extern void cpu_init(void);
   extern void init_gdt(int cpu);
   
+ static inline unsigned long get_debugctlmsr(void)
+ {
+     unsigned long debugctlmsr = 0;
+ 
+ #ifndef CONFIG_X86_DEBUGCTLMSR
+       if (boot_cpu_data.x86 < 6)
+               return 0;
+ #endif
+       rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
+ 
+     return debugctlmsr;
+ }
+ 
   static inline void update_debugctlmsr(unsigned long debugctlmsr)
   {
   #ifndef CONFIG_X86_DEBUGCTLMSR
diff --combined arch/x86/include/asm/thread_info.h

index 8dbc573,bf8113d..9878964
--- 1/arch/x86/include/asm/thread_info.h
--- 2/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@@ -20,11 -20,13 +20,13 @@@
   struct task_struct;
   struct exec_domain;
   #include <asm/processor.h>
+ #include <asm/ftrace.h>
+ #include <asm/atomic.h>
   
   struct thread_info {
         struct task_struct      *task;          /* main task structure */
         struct exec_domain      *exec_domain;   /* execution domain */
- -      unsigned long           flags;          /* low level flags */
+ +      __u32                   flags;          /* low level flags */
         __u32                   status;         /* thread synchronous flags */
         __u32                   cpu;            /* current CPU */
         int                     preempt_count;  /* 0 => preemptable,
@@@ -91,7 -93,6 +93,6 @@@
   #define TIF_FORCED_TF         24      /* true if TF in eflags artificially */
   #define TIF_DEBUGCTLMSR               25      /* uses thread_struct.debugctlmsr */
   #define TIF_DS_AREA_MSR               26      /* uses thread_struct.ds_area_msr */
- #define TIF_BTS_TRACE_TS      27      /* record scheduling event timestamps */
   
   #define _TIF_SYSCALL_TRACE    (1 << TIF_SYSCALL_TRACE)
   #define _TIF_NOTIFY_RESUME    (1 << TIF_NOTIFY_RESUME)
@@@ -113,7 -114,6 +114,6 @@@
   #define _TIF_FORCED_TF                (1 << TIF_FORCED_TF)
   #define _TIF_DEBUGCTLMSR      (1 << TIF_DEBUGCTLMSR)
   #define _TIF_DS_AREA_MSR      (1 << TIF_DS_AREA_MSR)
- #define _TIF_BTS_TRACE_TS     (1 << TIF_BTS_TRACE_TS)
   
   /* work to do in syscall_trace_enter() */
   #define _TIF_WORK_SYSCALL_ENTRY       \
@@@ -139,8 -139,7 +139,7 @@@
   
   /* flags to check in __switch_to() */
   #define _TIF_WORK_CTXSW                                                       \
-       (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \
-                                                               _TIF_NOTSC)
+       (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
   
   #define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
   #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
diff --combined arch/x86/kernel/Makefile

index 1f208aa,1cad931..88dd768
--- 1/arch/x86/kernel/Makefile
--- 2/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@@ -12,7 -12,6 +12,7 @@@ CFLAGS_REMOVE_tsc.o = -p
   CFLAGS_REMOVE_rtc.o = -pg
   CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
   CFLAGS_REMOVE_ftrace.o = -pg
+ +CFLAGS_REMOVE_early_printk.o = -pg
   endif
   
   #
@@@ -24,7 -23,7 +24,7 @@@ CFLAGS_vsyscall_64.o  := $(PROFILING) -g
   CFLAGS_hpet.o         := $(nostackp)
   CFLAGS_tsc.o          := $(nostackp)
   
- -obj-y                 := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
+ +obj-y                 := process_$(BITS).o signal.o entry_$(BITS).o
   obj-y                 += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
   obj-y                 += time_$(BITS).o ioport.o ldt.o dumpstack.o
   obj-y                 += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
@@@ -66,6 -65,7 +66,7 @@@ obj-$(CONFIG_X86_LOCAL_APIC)  += apic.o 
   obj-$(CONFIG_X86_IO_APIC)     += io_apic.o
   obj-$(CONFIG_X86_REBOOTFIXUPS)        += reboot_fixups_32.o
   obj-$(CONFIG_DYNAMIC_FTRACE)  += ftrace.o
+ obj-$(CONFIG_FUNCTION_GRAPH_TRACER)   += ftrace.o
   obj-$(CONFIG_KEXEC)           += machine_kexec_$(BITS).o
   obj-$(CONFIG_KEXEC)           += relocate_kernel_$(BITS).o crash.o
   obj-$(CONFIG_CRASH_DUMP)      += crash_dump_$(BITS).o
@@@ -106,8 -106,6 +107,8 @@@ microcode-$(CONFIG_MICROCODE_INTEL)        += 
   microcode-$(CONFIG_MICROCODE_AMD)     += microcode_amd.o
   obj-$(CONFIG_MICROCODE)                       += microcode.o
   
+ +obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
+ +
   ###
   # 64 bit specific files
   ifeq ($(CONFIG_X86_64),y)
diff --combined arch/x86/kernel/apic.c

index 7397911,b946ac1..b5229af
--- 1/arch/x86/kernel/apic.c
--- 2/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@@ -30,6 -30,7 +30,7 @@@
   #include <linux/module.h>
   #include <linux/dmi.h>
   #include <linux/dmar.h>
+ #include <linux/ftrace.h>
   
   #include <asm/atomic.h>
   #include <asm/smp.h>
@@@ -441,7 -442,6 +442,7 @@@ static void lapic_timer_setup(enum cloc
                 v = apic_read(APIC_LVTT);
                 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
                 apic_write(APIC_LVTT, v);
+ +              apic_write(APIC_TMICT, 0xffffffff);
                 break;
         case CLOCK_EVT_MODE_RESUME:
                 /* Nothing to do here */
@@@ -560,13 -560,13 +561,13 @@@ static int __init calibrate_by_pmtimer(
         } else {
                 res = (((u64)deltapm) *  mult) >> 22;
                 do_div(res, 1000000);
- -              printk(KERN_WARNING "APIC calibration not consistent "
+ +              pr_warning("APIC calibration not consistent "
                         "with PM Timer: %ldms instead of 100ms\n",
                         (long)res);
                 /* Correct the lapic counter value */
                 res = (((u64)(*delta)) * pm_100ms);
                 do_div(res, deltapm);
- -              printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
+ +              pr_info("APIC delta adjusted to PM-Timer: "
                         "%lu (%ld)\n", (unsigned long)res, *delta);
                 *delta = (long)res;
         }
@@@ -646,7 -646,8 +647,7 @@@ static int __init calibrate_APIC_clock(
          */
         if (calibration_result < (1000000 / HZ)) {
                 local_irq_enable();
- -              printk(KERN_WARNING
- -                     "APIC frequency too slow, disabling apic timer\n");
+ +              pr_warning("APIC frequency too slow, disabling apic timer\n");
                 return -1;
         }
   
@@@ -672,9 -673,13 +673,9 @@@
                 while (lapic_cal_loops <= LAPIC_CAL_LOOPS)
                         cpu_relax();
   
                 /* Stop the lapic timer */
                 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt);
   
- -              local_irq_enable();
- -
                 /* Jiffies delta */
                 deltaj = lapic_cal_j2 - lapic_cal_j1;
                 apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj);
@@@ -688,7 -693,8 +689,7 @@@
                 local_irq_enable();
   
         if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
- -              printk(KERN_WARNING
- -                     "APIC timer disabled due to verification failure.\n");
+ +              pr_warning("APIC timer disabled due to verification failure.\n");
                         return -1;
         }
   
@@@ -709,7 -715,7 +710,7 @@@ void __init setup_boot_APIC_clock(void
          * broadcast mechanism is used. On UP systems simply ignore it.
          */
         if (disable_apic_timer) {
- -              printk(KERN_INFO "Disabling APIC timer\n");
+ +              pr_info("Disabling APIC timer\n");
                 /* No broadcast on UP ! */
                 if (num_possible_cpus() > 1) {
                         lapic_clockevent.mult = 1;
@@@ -736,7 -742,7 +737,7 @@@
         if (nmi_watchdog != NMI_IO_APIC)
                 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
         else
- -              printk(KERN_WARNING "APIC timer registered as dummy,"
+ +              pr_warning("APIC timer registered as dummy,"
                         " due to nmi_watchdog=%d!\n", nmi_watchdog);
   
         /* Setup the lapic or request the broadcast */
@@@ -768,7 -774,8 +769,7 @@@ static void local_apic_timer_interrupt(
          * spurious.
          */
         if (!evt->event_handler) {
- -              printk(KERN_WARNING
- -                     "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
+ +              pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu);
                 /* Switch it off */
                 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
                 return;
@@@ -777,7 -784,11 +778,7 @@@
         /*
          * the NMI deadlock-detector uses this.
          */
- -#ifdef CONFIG_X86_64
- -      add_pda(apic_timer_irqs, 1);
- -#else
- -      per_cpu(irq_stat, cpu).apic_timer_irqs++;
- -#endif
+ +      inc_irq_stat(apic_timer_irqs);
   
         evt->event_handler(evt);
   }
@@@ -790,7 -801,7 +791,7 @@@
    * [ if a single-CPU system runs an SMP kernel then we call the local
    *   interrupt as well. Thus we cannot inline the local irq ... ]
    */
- void smp_apic_timer_interrupt(struct pt_regs *regs)
+ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
   {
         struct pt_regs *old_regs = set_irq_regs(regs);
   
@@@ -804,7 -815,9 +805,7 @@@
          * Besides, if we don't timer interrupts ignore the global
          * interrupt lock, which is the WrongThing (tm) to do.
          */
- -#ifdef CONFIG_X86_64
         exit_idle();
- -#endif
         irq_enter();
         local_apic_timer_interrupt();
         irq_exit();
@@@ -1081,7 -1094,7 +1082,7 @@@ static void __cpuinit lapic_setup_esr(v
         unsigned int oldvalue, value, maxlvt;
   
         if (!lapic_is_integrated()) {
- -              printk(KERN_INFO "No ESR for 82489DX.\n");
+ +              pr_info("No ESR for 82489DX.\n");
                 return;
         }
   
@@@ -1092,7 -1105,7 +1093,7 @@@
                  * ESR disabled - we can't do anything useful with the
                  * errors anyway - mbligh
                  */
- -              printk(KERN_INFO "Leaving ESR disabled.\n");
+ +              pr_info("Leaving ESR disabled.\n");
                 return;
         }
   
@@@ -1286,7 -1299,7 +1287,7 @@@ void check_x2apic(void
         rdmsr(MSR_IA32_APICBASE, msr, msr2);
   
         if (msr & X2APIC_ENABLE) {
- -              printk("x2apic enabled by BIOS, switching to x2apic ops\n");
+ +              pr_info("x2apic enabled by BIOS, switching to x2apic ops\n");
                 x2apic_preenabled = x2apic = 1;
                 apic_ops = &x2apic_ops;
         }
@@@ -1298,7 -1311,7 +1299,7 @@@ void enable_x2apic(void
   
         rdmsr(MSR_IA32_APICBASE, msr, msr2);
         if (!(msr & X2APIC_ENABLE)) {
- -              printk("Enabling x2apic\n");
+ +              pr_info("Enabling x2apic\n");
                 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
         }
   }
@@@ -1313,8 -1326,9 +1314,8 @@@ void __init enable_IR_x2apic(void
                 return;
   
         if (!x2apic_preenabled && disable_x2apic) {
- -              printk(KERN_INFO
- -                     "Skipped enabling x2apic and Interrupt-remapping "
- -                     "because of nox2apic\n");
+ +              pr_info("Skipped enabling x2apic and Interrupt-remapping "
+ +                      "because of nox2apic\n");
                 return;
         }
   
@@@ -1322,19 -1336,22 +1323,19 @@@
                 panic("Bios already enabled x2apic, can't enforce nox2apic");
   
         if (!x2apic_preenabled && skip_ioapic_setup) {
- -              printk(KERN_INFO
- -                     "Skipped enabling x2apic and Interrupt-remapping "
- -                     "because of skipping io-apic setup\n");
+ +              pr_info("Skipped enabling x2apic and Interrupt-remapping "
+ +                      "because of skipping io-apic setup\n");
                 return;
         }
   
         ret = dmar_table_init();
         if (ret) {
- -              printk(KERN_INFO
- -                     "dmar_table_init() failed with %d:\n", ret);
+ +              pr_info("dmar_table_init() failed with %d:\n", ret);
   
                 if (x2apic_preenabled)
                         panic("x2apic enabled by bios. But IR enabling failed");
                 else
- -                      printk(KERN_INFO
- -                             "Not enabling x2apic,Intr-remapping\n");
+ +                      pr_info("Not enabling x2apic,Intr-remapping\n");
                 return;
         }
   
@@@ -1343,7 -1360,7 +1344,7 @@@
   
         ret = save_mask_IO_APIC_setup();
         if (ret) {
- -              printk(KERN_INFO "Saving IO-APIC state failed: %d\n", ret);
+ +              pr_info("Saving IO-APIC state failed: %d\n", ret);
                 goto end;
         }
   
@@@ -1378,11 -1395,14 +1379,11 @@@ end
   
         if (!ret) {
                 if (!x2apic_preenabled)
- -                      printk(KERN_INFO
- -                             "Enabled x2apic and interrupt-remapping\n");
+ +                      pr_info("Enabled x2apic and interrupt-remapping\n");
                 else
- -                      printk(KERN_INFO
- -                             "Enabled Interrupt-remapping\n");
+ +                      pr_info("Enabled Interrupt-remapping\n");
         } else
- -              printk(KERN_ERR
- -                     "Failed to enable Interrupt-remapping and x2apic\n");
+ +              pr_err("Failed to enable Interrupt-remapping and x2apic\n");
   #else
         if (!cpu_has_x2apic)
                 return;
@@@ -1391,8 -1411,8 +1392,8 @@@
                 panic("x2apic enabled prior OS handover,"
                       " enable CONFIG_INTR_REMAP");
   
- -      printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping "
- -             " and x2apic\n");
+ +      pr_info("Enable CONFIG_INTR_REMAP for enabling intr-remapping "
+ +              " and x2apic\n");
   #endif
   
         return;
@@@ -1409,7 -1429,7 +1410,7 @@@
   static int __init detect_init_APIC(void)
   {
         if (!cpu_has_apic) {
- -              printk(KERN_INFO "No local APIC present\n");
+ +              pr_info("No local APIC present\n");
                 return -1;
         }
   
@@@ -1450,8 -1470,8 +1451,8 @@@ static int __init detect_init_APIC(void
                  * "lapic" specified.
                  */
                 if (!force_enable_local_apic) {
- -                      printk(KERN_INFO "Local APIC disabled by BIOS -- "
- -                             "you can enable it with \"lapic\"\n");
+ +                      pr_info("Local APIC disabled by BIOS -- "
+ +                              "you can enable it with \"lapic\"\n");
                         return -1;
                 }
                 /*
@@@ -1461,7 -1481,8 +1462,7 @@@
                  */
                 rdmsr(MSR_IA32_APICBASE, l, h);
                 if (!(l & MSR_IA32_APICBASE_ENABLE)) {
- -                      printk(KERN_INFO
- -                             "Local APIC disabled by BIOS -- reenabling.\n");
+ +                      pr_info("Local APIC disabled by BIOS -- reenabling.\n");
                         l &= ~MSR_IA32_APICBASE_BASE;
                         l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
                         wrmsr(MSR_IA32_APICBASE, l, h);
@@@ -1474,7 -1495,7 +1475,7 @@@
          */
         features = cpuid_edx(1);
         if (!(features & (1 << X86_FEATURE_APIC))) {
- -              printk(KERN_WARNING "Could not enable APIC!\n");
+ +              pr_warning("Could not enable APIC!\n");
                 return -1;
         }
         set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
@@@ -1485,14 -1506,14 +1486,14 @@@
         if (l & MSR_IA32_APICBASE_ENABLE)
                 mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
   
- -      printk(KERN_INFO "Found and enabled local APIC!\n");
+ +      pr_info("Found and enabled local APIC!\n");
   
         apic_pm_activate();
   
         return 0;
   
   no_apic:
- -      printk(KERN_INFO "No local APIC present or hardware disabled\n");
+ +      pr_info("No local APIC present or hardware disabled\n");
         return -1;
   }
   #endif
@@@ -1568,12 -1589,12 +1569,12 @@@ int __init APIC_init_uniprocessor(void
   {
   #ifdef CONFIG_X86_64
         if (disable_apic) {
- -              printk(KERN_INFO "Apic disabled\n");
+ +              pr_info("Apic disabled\n");
                 return -1;
         }
         if (!cpu_has_apic) {
                 disable_apic = 1;
- -              printk(KERN_INFO "Apic disabled by BIOS\n");
+ +              pr_info("Apic disabled by BIOS\n");
                 return -1;
         }
   #else
@@@ -1585,8 -1606,8 +1586,8 @@@
          */
         if (!cpu_has_apic &&
             APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
- -              printk(KERN_ERR "BIOS bug, local APIC 0x%x not detected!...\n",
- -                     boot_cpu_physical_apicid);
+ +              pr_err("BIOS bug, local APIC 0x%x not detected!...\n",
+ +                      boot_cpu_physical_apicid);
                 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
                 return -1;
         }
@@@ -1662,7 -1683,9 +1663,7 @@@ void smp_spurious_interrupt(struct pt_r
   {
         u32 v;
   
- -#ifdef CONFIG_X86_64
         exit_idle();
- -#endif
         irq_enter();
         /*
          * Check if this really is a spurious interrupt and ACK it
@@@ -1673,11 -1696,14 +1674,11 @@@
         if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
                 ack_APIC_irq();
   
- -#ifdef CONFIG_X86_64
- -      add_pda(irq_spurious_count, 1);
- -#else
+ +      inc_irq_stat(irq_spurious_count);
+ +
         /* see sw-dev-man vol 3, chapter 7.4.13.5 */
- -      printk(KERN_INFO "spurious APIC interrupt on CPU#%d, "
- -             "should never happen.\n", smp_processor_id());
- -      __get_cpu_var(irq_stat).irq_spurious_count++;
- -#endif
+ +      pr_info("spurious APIC interrupt on CPU#%d, "
+ +              "should never happen.\n", smp_processor_id());
         irq_exit();
   }
   
@@@ -1688,7 -1714,9 +1689,7 @@@ void smp_error_interrupt(struct pt_reg
   {
         u32 v, v1;
   
- -#ifdef CONFIG_X86_64
         exit_idle();
- -#endif
         irq_enter();
         /* First tickle the hardware, only then report what went on. -- REW */
         v = apic_read(APIC_ESR);
@@@ -1697,18 -1725,17 +1698,18 @@@
         ack_APIC_irq();
         atomic_inc(&irq_err_count);
   
- -      /* Here is what the APIC error bits mean:
- -         0: Send CS error
- -         1: Receive CS error
- -         2: Send accept error
- -         3: Receive accept error
- -         4: Reserved
- -         5: Send illegal vector
- -         6: Received illegal vector
- -         7: Illegal register address
- -      */
- -      printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
+ +      /*
+ +       * Here is what the APIC error bits mean:
+ +       * 0: Send CS error
+ +       * 1: Receive CS error
+ +       * 2: Send accept error
+ +       * 3: Receive accept error
+ +       * 4: Reserved
+ +       * 5: Send illegal vector
+ +       * 6: Received illegal vector
+ +       * 7: Illegal register address
+ +       */
+ +      pr_debug("APIC error on CPU%d: %02x(%02x)\n",
                 smp_processor_id(), v , v1);
         irq_exit();
   }
@@@ -1812,15 -1839,15 +1813,15 @@@ void __cpuinit generic_processor_info(i
          * Validate version
          */
         if (version == 0x0) {
- -              printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
- -                              "fixing up to 0x10. (tell your hw vendor)\n",
- -                              version);
+ +              pr_warning("BIOS bug, APIC version is 0 for CPU#%d! "
+ +                      "fixing up to 0x10. (tell your hw vendor)\n",
+ +                      version);
                 version = 0x10;
         }
         apic_version[apicid] = version;
   
         if (num_processors >= NR_CPUS) {
- -              printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
+ +              pr_warning("WARNING: NR_CPUS limit of %i reached."
                         "  Processor ignored.\n", NR_CPUS);
                 return;
         }
@@@ -2183,7 -2210,7 +2184,7 @@@ static int __init apic_set_verbosity(ch
         else if (strcmp("verbose", arg) == 0)
                 apic_verbosity = APIC_VERBOSE;
         else {
- -              printk(KERN_WARNING "APIC Verbosity level %s not recognised"
+ +              pr_warning("APIC Verbosity level %s not recognised"
                         " use apic=verbose or apic=debug\n", arg);
                 return -EINVAL;
         }
diff --combined arch/x86/kernel/cpu/Makefile

index a5c04e8,4ae495a..82db7f4
--- 1/arch/x86/kernel/cpu/Makefile
--- 2/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@@ -2,9 -2,13 +2,14 @@@
   # Makefile for x86-compatible CPU details and quirks
   #
   
+ # Don't trace early stages of a secondary CPU boot
+ ifdef CONFIG_FUNCTION_TRACER
+ CFLAGS_REMOVE_common.o = -pg
+ endif
+ 
   obj-y                 := intel_cacheinfo.o addon_cpuid_features.o
   obj-y                 += proc.o capflags.o powerflags.o common.o
+ +obj-y                 += vmware.o hypervisor.o
   
   obj-$(CONFIG_X86_32)  += bugs.o cmpxchg.o
   obj-$(CONFIG_X86_64)  += bugs_64.o
diff --combined arch/x86/kernel/entry_32.S

index fe70141,43ceb3f..d6f0490
--- 1/arch/x86/kernel/entry_32.S
--- 2/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@@ -619,37 -619,28 +619,37 @@@ END(syscall_badsys
   27:;
   
   /*
- - * Build the entry stubs and pointer table with
- - * some assembler magic.
+ + * Build the entry stubs and pointer table with some assembler magic.
+ + * We pack 7 stubs into a single 32-byte chunk, which will fit in a
+ + * single cache line on all modern x86 implementations.
    */
- -.section .rodata,"a"
+ +.section .init.rodata,"a"
   ENTRY(interrupt)
   .text
- -
+ +      .p2align 5
+ +      .p2align CONFIG_X86_L1_CACHE_SHIFT
   ENTRY(irq_entries_start)
         RING0_INT_FRAME
- -vector=0
- -.rept NR_VECTORS
- -      ALIGN
- - .if vector
+ +vector=FIRST_EXTERNAL_VECTOR
+ +.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
+ +      .balign 32
+ +  .rept       7
+ +    .if vector < NR_VECTORS
+ +      .if vector <> FIRST_EXTERNAL_VECTOR
         CFI_ADJUST_CFA_OFFSET -4
- - .endif
- -1:    pushl $~(vector)
+ +      .endif
+ +1:    pushl $(~vector+0x80)   /* Note: always in signed byte range */
         CFI_ADJUST_CFA_OFFSET 4
- -      jmp common_interrupt
- - .previous
+ +      .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
+ +      jmp 2f
+ +      .endif
+ +      .previous
         .long 1b
- - .text
+ +      .text
   vector=vector+1
+ +    .endif
+ +  .endr
+ +2:    jmp common_interrupt
   .endr
   END(irq_entries_start)
   
@@@ -661,9 -652,8 +661,9 @@@ END(interrupt
    * the CPU automatically disables interrupts when executing an IRQ vector,
    * so IRQ-flags tracing has to follow that:
    */
- -      ALIGN
+ +      .p2align CONFIG_X86_L1_CACHE_SHIFT
   common_interrupt:
+ +      addl $-0x80,(%esp)      /* Adjust vector into the [-256,-1] range */
         SAVE_ALL
         TRACE_IRQS_OFF
         movl %esp,%eax
@@@ -688,6 -678,65 +688,6 @@@ ENDPROC(name
   /* The include is where all of the SMP etc. interrupts come from */
   #include "entry_arch.h"
   
- -KPROBE_ENTRY(page_fault)
- -      RING0_EC_FRAME
- -      pushl $do_page_fault
- -      CFI_ADJUST_CFA_OFFSET 4
- -      ALIGN
- -error_code:
- -      /* the function address is in %fs's slot on the stack */
- -      pushl %es
- -      CFI_ADJUST_CFA_OFFSET 4
- -      /*CFI_REL_OFFSET es, 0*/
- -      pushl %ds
- -      CFI_ADJUST_CFA_OFFSET 4
- -      /*CFI_REL_OFFSET ds, 0*/
- -      pushl %eax
- -      CFI_ADJUST_CFA_OFFSET 4
- -      CFI_REL_OFFSET eax, 0
- -      pushl %ebp
- -      CFI_ADJUST_CFA_OFFSET 4
- -      CFI_REL_OFFSET ebp, 0
- -      pushl %edi
- -      CFI_ADJUST_CFA_OFFSET 4
- -      CFI_REL_OFFSET edi, 0
- -      pushl %esi
- -      CFI_ADJUST_CFA_OFFSET 4
- -      CFI_REL_OFFSET esi, 0
- -      pushl %edx
- -      CFI_ADJUST_CFA_OFFSET 4
- -      CFI_REL_OFFSET edx, 0
- -      pushl %ecx
- -      CFI_ADJUST_CFA_OFFSET 4
- -      CFI_REL_OFFSET ecx, 0
- -      pushl %ebx
- -      CFI_ADJUST_CFA_OFFSET 4
- -      CFI_REL_OFFSET ebx, 0
- -      cld
- -      pushl %fs
- -      CFI_ADJUST_CFA_OFFSET 4
- -      /*CFI_REL_OFFSET fs, 0*/
- -      movl $(__KERNEL_PERCPU), %ecx
- -      movl %ecx, %fs
- -      UNWIND_ESPFIX_STACK
- -      popl %ecx
- -      CFI_ADJUST_CFA_OFFSET -4
- -      /*CFI_REGISTER es, ecx*/
- -      movl PT_FS(%esp), %edi          # get the function address
- -      movl PT_ORIG_EAX(%esp), %edx    # get the error code
- -      movl $-1, PT_ORIG_EAX(%esp)     # no syscall to restart
- -      mov  %ecx, PT_FS(%esp)
- -      /*CFI_REL_OFFSET fs, ES*/
- -      movl $(__USER_DS), %ecx
- -      movl %ecx, %ds
- -      movl %ecx, %es
- -      TRACE_IRQS_OFF
- -      movl %esp,%eax                  # pt_regs pointer
- -      call *%edi
- -      jmp ret_from_exception
- -      CFI_ENDPROC
- -KPROBE_END(page_fault)
- -
   ENTRY(coprocessor_error)
         RING0_INT_FRAME
         pushl $0
@@@ -718,6 -767,140 +718,6 @@@ ENTRY(device_not_available
         CFI_ENDPROC
   END(device_not_available)
   
- -/*
- - * Debug traps and NMI can happen at the one SYSENTER instruction
- - * that sets up the real kernel stack. Check here, since we can't
- - * allow the wrong stack to be used.
- - *
- - * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
- - * already pushed 3 words if it hits on the sysenter instruction:
- - * eflags, cs and eip.
- - *
- - * We just load the right stack, and push the three (known) values
- - * by hand onto the new stack - while updating the return eip past
- - * the instruction that would have done it for sysenter.
- - */
- -#define FIX_STACK(offset, ok, label)          \
- -      cmpw $__KERNEL_CS,4(%esp);              \
- -      jne ok;                                 \
- -label:                                                \
- -      movl TSS_sysenter_sp0+offset(%esp),%esp;        \
- -      CFI_DEF_CFA esp, 0;                     \
- -      CFI_UNDEFINED eip;                      \
- -      pushfl;                                 \
- -      CFI_ADJUST_CFA_OFFSET 4;                \
- -      pushl $__KERNEL_CS;                     \
- -      CFI_ADJUST_CFA_OFFSET 4;                \
- -      pushl $sysenter_past_esp;               \
- -      CFI_ADJUST_CFA_OFFSET 4;                \
- -      CFI_REL_OFFSET eip, 0
- -
- -KPROBE_ENTRY(debug)
- -      RING0_INT_FRAME
- -      cmpl $ia32_sysenter_target,(%esp)
- -      jne debug_stack_correct
- -      FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
- -debug_stack_correct:
- -      pushl $-1                       # mark this as an int
- -      CFI_ADJUST_CFA_OFFSET 4
- -      SAVE_ALL
- -      TRACE_IRQS_OFF
- -      xorl %edx,%edx                  # error code 0
- -      movl %esp,%eax                  # pt_regs pointer
- -      call do_debug
- -      jmp ret_from_exception
- -      CFI_ENDPROC
- -KPROBE_END(debug)
- -
- -/*
- - * NMI is doubly nasty. It can happen _while_ we're handling
- - * a debug fault, and the debug fault hasn't yet been able to
- - * clear up the stack. So we first check whether we got  an
- - * NMI on the sysenter entry path, but after that we need to
- - * check whether we got an NMI on the debug path where the debug
- - * fault happened on the sysenter path.
- - */
- -KPROBE_ENTRY(nmi)
- -      RING0_INT_FRAME
- -      pushl %eax
- -      CFI_ADJUST_CFA_OFFSET 4
- -      movl %ss, %eax
- -      cmpw $__ESPFIX_SS, %ax
- -      popl %eax
- -      CFI_ADJUST_CFA_OFFSET -4
- -      je nmi_espfix_stack
- -      cmpl $ia32_sysenter_target,(%esp)
- -      je nmi_stack_fixup
- -      pushl %eax
- -      CFI_ADJUST_CFA_OFFSET 4
- -      movl %esp,%eax
- -      /* Do not access memory above the end of our stack page,
- -       * it might not exist.
- -       */
- -      andl $(THREAD_SIZE-1),%eax
- -      cmpl $(THREAD_SIZE-20),%eax
- -      popl %eax
- -      CFI_ADJUST_CFA_OFFSET -4
- -      jae nmi_stack_correct
- -      cmpl $ia32_sysenter_target,12(%esp)
- -      je nmi_debug_stack_check
- -nmi_stack_correct:
- -      /* We have a RING0_INT_FRAME here */
- -      pushl %eax
- -      CFI_ADJUST_CFA_OFFSET 4
- -      SAVE_ALL
- -      TRACE_IRQS_OFF
- -      xorl %edx,%edx          # zero error code
- -      movl %esp,%eax          # pt_regs pointer
- -      call do_nmi
- -      jmp restore_nocheck_notrace
- -      CFI_ENDPROC
- -
- -nmi_stack_fixup:
- -      RING0_INT_FRAME
- -      FIX_STACK(12,nmi_stack_correct, 1)
- -      jmp nmi_stack_correct
- -
- -nmi_debug_stack_check:
- -      /* We have a RING0_INT_FRAME here */
- -      cmpw $__KERNEL_CS,16(%esp)
- -      jne nmi_stack_correct
- -      cmpl $debug,(%esp)
- -      jb nmi_stack_correct
- -      cmpl $debug_esp_fix_insn,(%esp)
- -      ja nmi_stack_correct
- -      FIX_STACK(24,nmi_stack_correct, 1)
- -      jmp nmi_stack_correct
- -
- -nmi_espfix_stack:
- -      /* We have a RING0_INT_FRAME here.
- -       *
- -       * create the pointer to lss back
- -       */
- -      pushl %ss
- -      CFI_ADJUST_CFA_OFFSET 4
- -      pushl %esp
- -      CFI_ADJUST_CFA_OFFSET 4
- -      addw $4, (%esp)
- -      /* copy the iret frame of 12 bytes */
- -      .rept 3
- -      pushl 16(%esp)
- -      CFI_ADJUST_CFA_OFFSET 4
- -      .endr
- -      pushl %eax
- -      CFI_ADJUST_CFA_OFFSET 4
- -      SAVE_ALL
- -      TRACE_IRQS_OFF
- -      FIXUP_ESPFIX_STACK              # %eax == %esp
- -      xorl %edx,%edx                  # zero error code
- -      call do_nmi
- -      RESTORE_REGS
- -      lss 12+4(%esp), %esp            # back to espfix stack
- -      CFI_ADJUST_CFA_OFFSET -24
- -      jmp irq_return
- -      CFI_ENDPROC
- -KPROBE_END(nmi)
- -
   #ifdef CONFIG_PARAVIRT
   ENTRY(native_iret)
         iret
@@@ -733,6 -916,19 +733,6 @@@ ENTRY(native_irq_enable_sysexit
   END(native_irq_enable_sysexit)
   #endif
   
- -KPROBE_ENTRY(int3)
- -      RING0_INT_FRAME
- -      pushl $-1                       # mark this as an int
- -      CFI_ADJUST_CFA_OFFSET 4
- -      SAVE_ALL
- -      TRACE_IRQS_OFF
- -      xorl %edx,%edx          # zero error code
- -      movl %esp,%eax          # pt_regs pointer
- -      call do_int3
- -      jmp ret_from_exception
- -      CFI_ENDPROC
- -KPROBE_END(int3)
- -
   ENTRY(overflow)
         RING0_INT_FRAME
         pushl $0
@@@ -797,6 -993,14 +797,6 @@@ ENTRY(stack_segment
         CFI_ENDPROC
   END(stack_segment)
   
- -KPROBE_ENTRY(general_protection)
- -      RING0_EC_FRAME
- -      pushl $do_general_protection
- -      CFI_ADJUST_CFA_OFFSET 4
- -      jmp error_code
- -      CFI_ENDPROC
- -KPROBE_END(general_protection)
- -
   ENTRY(alignment_check)
         RING0_EC_FRAME
         pushl $do_alignment_check
@@@ -847,7 -1051,6 +847,7 @@@ ENTRY(kernel_thread_helper
         push %eax
         CFI_ADJUST_CFA_OFFSET 4
         call do_exit
+ +      ud2                     # padding for call trace
         CFI_ENDPROC
   ENDPROC(kernel_thread_helper)
   
@@@ -954,6 -1157,9 +954,9 @@@ ENTRY(mcount
   END(mcount)
   
   ENTRY(ftrace_caller)
+       cmpl $0, function_trace_stop
+       jne  ftrace_stub
+ 
         pushl %eax
         pushl %ecx
         pushl %edx
@@@ -968,6 -1174,11 +971,11 @@@ ftrace_call
         popl %edx
         popl %ecx
         popl %eax
+ #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ .globl ftrace_graph_call
+ ftrace_graph_call:
+       jmp ftrace_stub
+ #endif
   
   .globl ftrace_stub
   ftrace_stub:
@@@ -977,8 -1188,18 +985,18 @@@ END(ftrace_caller
   #else /* ! CONFIG_DYNAMIC_FTRACE */
   
   ENTRY(mcount)
+       cmpl $0, function_trace_stop
+       jne  ftrace_stub
+ 
         cmpl $ftrace_stub, ftrace_trace_function
         jnz trace
+ #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       cmpl $ftrace_stub, ftrace_graph_return
+       jnz ftrace_graph_caller
+ 
+       cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
+       jnz ftrace_graph_caller
+ #endif
   .globl ftrace_stub
   ftrace_stub:
         ret
@@@ -997,237 -1218,44 +1015,268 @@@ trace
         popl %edx
         popl %ecx
         popl %eax
- 
         jmp ftrace_stub
   END(mcount)
   #endif /* CONFIG_DYNAMIC_FTRACE */
   #endif /* CONFIG_FUNCTION_TRACER */
   
+ #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ ENTRY(ftrace_graph_caller)
+       cmpl $0, function_trace_stop
+       jne ftrace_stub
+ 
+       pushl %eax
+       pushl %ecx
+       pushl %edx
+       movl 0xc(%esp), %edx
+       lea 0x4(%ebp), %eax
+       subl $MCOUNT_INSN_SIZE, %edx
+       call prepare_ftrace_return
+       popl %edx
+       popl %ecx
+       popl %eax
+       ret
+ END(ftrace_graph_caller)
+ 
+ .globl return_to_handler
+ return_to_handler:
+       pushl $0
+       pushl %eax
+       pushl %ecx
+       pushl %edx
+       call ftrace_return_to_handler
+       movl %eax, 0xc(%esp)
+       popl %edx
+       popl %ecx
+       popl %eax
+       ret
+ #endif
+ 
   .section .rodata,"a"
   #include "syscall_table_32.S"
   
   syscall_table_size=(.-sys_call_table)
+ +
+ +/*
+ + * Some functions should be protected against kprobes
+ + */
+ +      .pushsection .kprobes.text, "ax"
+ +
+ +ENTRY(page_fault)
+ +      RING0_EC_FRAME
+ +      pushl $do_page_fault
+ +      CFI_ADJUST_CFA_OFFSET 4
+ +      ALIGN
+ +error_code:
+ +      /* the function address is in %fs's slot on the stack */
+ +      pushl %es
+ +      CFI_ADJUST_CFA_OFFSET 4
+ +      /*CFI_REL_OFFSET es, 0*/
+ +      pushl %ds
+ +      CFI_ADJUST_CFA_OFFSET 4
+ +      /*CFI_REL_OFFSET ds, 0*/
+ +      pushl %eax
+ +      CFI_ADJUST_CFA_OFFSET 4
+ +      CFI_REL_OFFSET eax, 0
+ +      pushl %ebp
+ +      CFI_ADJUST_CFA_OFFSET 4
+ +      CFI_REL_OFFSET ebp, 0
+ +      pushl %edi
+ +      CFI_ADJUST_CFA_OFFSET 4
+ +      CFI_REL_OFFSET edi, 0
+ +      pushl %esi
+ +      CFI_ADJUST_CFA_OFFSET 4
+ +      CFI_REL_OFFSET esi, 0
+ +      pushl %edx
+ +      CFI_ADJUST_CFA_OFFSET 4
+ +      CFI_REL_OFFSET edx, 0
+ +      pushl %ecx
+ +      CFI_ADJUST_CFA_OFFSET 4
+ +      CFI_REL_OFFSET ecx, 0
+ +      pushl %ebx
+ +      CFI_ADJUST_CFA_OFFSET 4
+ +      CFI_REL_OFFSET ebx, 0
+ +      cld
+ +      pushl %fs
+ +      CFI_ADJUST_CFA_OFFSET 4
+ +      /*CFI_REL_OFFSET fs, 0*/
+ +      movl $(__KERNEL_PERCPU), %ecx
+ +      movl %ecx, %fs
+ +      UNWIND_ESPFIX_STACK
+ +      popl %ecx
+ +      CFI_ADJUST_CFA_OFFSET -4
+ +      /*CFI_REGISTER es, ecx*/
+ +      movl PT_FS(%esp), %edi          # get the function address
+ +      movl PT_ORIG_EAX(%esp), %edx    # get the error code
+ +      movl $-1, PT_ORIG_EAX(%esp)     # no syscall to restart
+ +      mov  %ecx, PT_FS(%esp)
+ +      /*CFI_REL_OFFSET fs, ES*/
+ +      movl $(__USER_DS), %ecx
+ +      movl %ecx, %ds
+ +      movl %ecx, %es
+ +      TRACE_IRQS_OFF
+ +      movl %esp,%eax                  # pt_regs pointer
+ +      call *%edi
+ +      jmp ret_from_exception
+ +      CFI_ENDPROC
+ +END(page_fault)
+ +
+ +/*
+ + * Debug traps and NMI can happen at the one SYSENTER instruction
+ + * that sets up the real kernel stack. Check here, since we can't
+ + * allow the wrong stack to be used.
+ + *
+ + * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
+ + * already pushed 3 words if it hits on the sysenter instruction:
+ + * eflags, cs and eip.
+ + *
+ + * We just load the right stack, and push the three (known) values
+ + * by hand onto the new stack - while updating the return eip past
+ + * the instruction that would have done it for sysenter.
+ + */
+ +#define FIX_STACK(offset, ok, label)          \
+ +      cmpw $__KERNEL_CS,4(%esp);              \
+ +      jne ok;                                 \
+ +label:                                                \
+ +      movl TSS_sysenter_sp0+offset(%esp),%esp;        \
+ +      CFI_DEF_CFA esp, 0;                     \
+ +      CFI_UNDEFINED eip;                      \
+ +      pushfl;                                 \
+ +      CFI_ADJUST_CFA_OFFSET 4;                \
+ +      pushl $__KERNEL_CS;                     \
+ +      CFI_ADJUST_CFA_OFFSET 4;                \
+ +      pushl $sysenter_past_esp;               \
+ +      CFI_ADJUST_CFA_OFFSET 4;                \
+ +      CFI_REL_OFFSET eip, 0
+ +
+ +ENTRY(debug)
+ +      RING0_INT_FRAME
+ +      cmpl $ia32_sysenter_target,(%esp)
+ +      jne debug_stack_correct
+ +      FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
+ +debug_stack_correct:
+ +      pushl $-1                       # mark this as an int
+ +      CFI_ADJUST_CFA_OFFSET 4
+ +      SAVE_ALL
+ +      TRACE_IRQS_OFF
+ +      xorl %edx,%edx                  # error code 0
+ +      movl %esp,%eax                  # pt_regs pointer
+ +      call do_debug
+ +      jmp ret_from_exception
+ +      CFI_ENDPROC
+ +END(debug)
+ +
+ +/*
+ + * NMI is doubly nasty. It can happen _while_ we're handling
+ + * a debug fault, and the debug fault hasn't yet been able to
+ + * clear up the stack. So we first check whether we got  an
+ + * NMI on the sysenter entry path, but after that we need to
+ + * check whether we got an NMI on the debug path where the debug
+ + * fault happened on the sysenter path.
+ + */
+ +ENTRY(nmi)
+ +      RING0_INT_FRAME
+ +      pushl %eax
+ +      CFI_ADJUST_CFA_OFFSET 4
+ +      movl %ss, %eax
+ +      cmpw $__ESPFIX_SS, %ax
+ +      popl %eax
+ +      CFI_ADJUST_CFA_OFFSET -4
+ +      je nmi_espfix_stack
+ +      cmpl $ia32_sysenter_target,(%esp)
+ +      je nmi_stack_fixup
+ +      pushl %eax
+ +      CFI_ADJUST_CFA_OFFSET 4
+ +      movl %esp,%eax
+ +      /* Do not access memory above the end of our stack page,
+ +       * it might not exist.
+ +       */
+ +      andl $(THREAD_SIZE-1),%eax
+ +      cmpl $(THREAD_SIZE-20),%eax
+ +      popl %eax
+ +      CFI_ADJUST_CFA_OFFSET -4
+ +      jae nmi_stack_correct
+ +      cmpl $ia32_sysenter_target,12(%esp)
+ +      je nmi_debug_stack_check
+ +nmi_stack_correct:
+ +      /* We have a RING0_INT_FRAME here */
+ +      pushl %eax
+ +      CFI_ADJUST_CFA_OFFSET 4
+ +      SAVE_ALL
+ +      TRACE_IRQS_OFF
+ +      xorl %edx,%edx          # zero error code
+ +      movl %esp,%eax          # pt_regs pointer
+ +      call do_nmi
+ +      jmp restore_nocheck_notrace
+ +      CFI_ENDPROC
+ +
+ +nmi_stack_fixup:
+ +      RING0_INT_FRAME
+ +      FIX_STACK(12,nmi_stack_correct, 1)
+ +      jmp nmi_stack_correct
+ +
+ +nmi_debug_stack_check:
+ +      /* We have a RING0_INT_FRAME here */
+ +      cmpw $__KERNEL_CS,16(%esp)
+ +      jne nmi_stack_correct
+ +      cmpl $debug,(%esp)
+ +      jb nmi_stack_correct
+ +      cmpl $debug_esp_fix_insn,(%esp)
+ +      ja nmi_stack_correct
+ +      FIX_STACK(24,nmi_stack_correct, 1)
+ +      jmp nmi_stack_correct
+ +
+ +nmi_espfix_stack:
+ +      /* We have a RING0_INT_FRAME here.
+ +       *
+ +       * create the pointer to lss back
+ +       */
+ +      pushl %ss
+ +      CFI_ADJUST_CFA_OFFSET 4
+ +      pushl %esp
+ +      CFI_ADJUST_CFA_OFFSET 4
+ +      addw $4, (%esp)
+ +      /* copy the iret frame of 12 bytes */
+ +      .rept 3
+ +      pushl 16(%esp)
+ +      CFI_ADJUST_CFA_OFFSET 4
+ +      .endr
+ +      pushl %eax
+ +      CFI_ADJUST_CFA_OFFSET 4
+ +      SAVE_ALL
+ +      TRACE_IRQS_OFF
+ +      FIXUP_ESPFIX_STACK              # %eax == %esp
+ +      xorl %edx,%edx                  # zero error code
+ +      call do_nmi
+ +      RESTORE_REGS
+ +      lss 12+4(%esp), %esp            # back to espfix stack
+ +      CFI_ADJUST_CFA_OFFSET -24
+ +      jmp irq_return
+ +      CFI_ENDPROC
+ +END(nmi)
+ +
+ +ENTRY(int3)
+ +      RING0_INT_FRAME
+ +      pushl $-1                       # mark this as an int
+ +      CFI_ADJUST_CFA_OFFSET 4
+ +      SAVE_ALL
+ +      TRACE_IRQS_OFF
+ +      xorl %edx,%edx          # zero error code
+ +      movl %esp,%eax          # pt_regs pointer
+ +      call do_int3
+ +      jmp ret_from_exception
+ +      CFI_ENDPROC
+ +END(int3)
+ +
+ +ENTRY(general_protection)
+ +      RING0_EC_FRAME
+ +      pushl $do_general_protection
+ +      CFI_ADJUST_CFA_OFFSET 4
+ +      jmp error_code
+ +      CFI_ENDPROC
+ +END(general_protection)
+ +
+ +/*
+ + * End of kprobes section
+ + */
+ +      .popsection
diff --combined arch/x86/kernel/entry_64.S

index 3194636,303dd84..e28c7a9
--- 1/arch/x86/kernel/entry_64.S
--- 2/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@@ -11,15 -11,15 +11,15 @@@
    *
    * NOTE: This code handles signal-recognition, which happens every time
    * after an interrupt and after each system call.
- - * 
- - * Normal syscalls and interrupts don't save a full stack frame, this is 
+ + *
+ + * Normal syscalls and interrupts don't save a full stack frame, this is
    * only done for syscall tracing, signals or fork/exec et al.
- - * 
- - * A note on terminology:      
- - * - top of stack: Architecture defined interrupt frame from SS to RIP 
- - * at the top of the kernel process stack.    
+ + *
+ + * A note on terminology:
+ + * - top of stack: Architecture defined interrupt frame from SS to RIP
+ + * at the top of the kernel process stack.
    * - partial stack frame: partially saved registers up to R11.
- - * - full stack frame: Like partial stack frame, but all register saved. 
+ + * - full stack frame: Like partial stack frame, but all registers saved.
    *
    * Some macro usage:
    * - CFI macros are used to generate dwarf2 unwind information for better
@@@ -60,6 -60,7 +60,6 @@@
   #define __AUDIT_ARCH_LE          0x40000000
   
         .code64
- -
   #ifdef CONFIG_FUNCTION_TRACER
   #ifdef CONFIG_DYNAMIC_FTRACE
   ENTRY(mcount)
@@@ -67,16 -68,10 +67,10 @@@
   END(mcount)
   
   ENTRY(ftrace_caller)
+       cmpl $0, function_trace_stop
+       jne  ftrace_stub
   
-       /* taken from glibc */
-       subq $0x38, %rsp
-       movq %rax, (%rsp)
-       movq %rcx, 8(%rsp)
-       movq %rdx, 16(%rsp)
-       movq %rsi, 24(%rsp)
-       movq %rdi, 32(%rsp)
-       movq %r8, 40(%rsp)
-       movq %r9, 48(%rsp)
+       MCOUNT_SAVE_FRAME
   
         movq 0x38(%rsp), %rdi
         movq 8(%rbp), %rsi
@@@ -86,14 -81,13 +80,13 @@@
   ftrace_call:
         call ftrace_stub
   
-       movq 48(%rsp), %r9
-       movq 40(%rsp), %r8
-       movq 32(%rsp), %rdi
-       movq 24(%rsp), %rsi
-       movq 16(%rsp), %rdx
-       movq 8(%rsp), %rcx
-       movq (%rsp), %rax
-       addq $0x38, %rsp
+       MCOUNT_RESTORE_FRAME
+ 
+ #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ .globl ftrace_graph_call
+ ftrace_graph_call:
+       jmp ftrace_stub
+ #endif
   
   .globl ftrace_stub
   ftrace_stub:
@@@ -102,15 -96,63 +95,63 @@@ END(ftrace_caller
   
   #else /* ! CONFIG_DYNAMIC_FTRACE */
   ENTRY(mcount)
+       cmpl $0, function_trace_stop
+       jne  ftrace_stub
+ 
         cmpq $ftrace_stub, ftrace_trace_function
         jnz trace
+ 
+ #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       cmpq $ftrace_stub, ftrace_graph_return
+       jnz ftrace_graph_caller
+ 
+       cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
+       jnz ftrace_graph_caller
+ #endif
+ 
   .globl ftrace_stub
   ftrace_stub:
         retq
   
   trace:
-       /* taken from glibc */
-       subq $0x38, %rsp
+       MCOUNT_SAVE_FRAME
+ 
+       movq 0x38(%rsp), %rdi
+       movq 8(%rbp), %rsi
+       subq $MCOUNT_INSN_SIZE, %rdi
+ 
+       call   *ftrace_trace_function
+ 
+       MCOUNT_RESTORE_FRAME
+ 
+       jmp ftrace_stub
+ END(mcount)
+ #endif /* CONFIG_DYNAMIC_FTRACE */
+ #endif /* CONFIG_FUNCTION_TRACER */
+ 
+ #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ ENTRY(ftrace_graph_caller)
+       cmpl $0, function_trace_stop
+       jne ftrace_stub
+ 
+       MCOUNT_SAVE_FRAME
+ 
+       leaq 8(%rbp), %rdi
+       movq 0x38(%rsp), %rsi
+       subq $MCOUNT_INSN_SIZE, %rsi
+ 
+       call    prepare_ftrace_return
+ 
+       MCOUNT_RESTORE_FRAME
+ 
+       retq
+ END(ftrace_graph_caller)
+ 
+ 
+ .globl return_to_handler
+ return_to_handler:
+       subq  $80, %rsp
+ 
         movq %rax, (%rsp)
         movq %rcx, 8(%rsp)
         movq %rdx, 16(%rsp)
@@@ -118,13 -160,14 +159,14 @@@
         movq %rdi, 32(%rsp)
         movq %r8, 40(%rsp)
         movq %r9, 48(%rsp)
+       movq %r10, 56(%rsp)
+       movq %r11, 64(%rsp)
   
-       movq 0x38(%rsp), %rdi
-       movq 8(%rbp), %rsi
-       subq $MCOUNT_INSN_SIZE, %rdi
- 
-       call   *ftrace_trace_function
+       call ftrace_return_to_handler
   
+       movq %rax, 72(%rsp)
+       movq 64(%rsp), %r11
+       movq 56(%rsp), %r10
         movq 48(%rsp), %r9
         movq 40(%rsp), %r8
         movq 32(%rsp), %rdi
@@@ -132,16 -175,14 +174,14 @@@
         movq 16(%rsp), %rdx
         movq 8(%rsp), %rcx
         movq (%rsp), %rax
-       addq $0x38, %rsp
+       addq $72, %rsp
+       retq
+ #endif
   
   
   #ifndef CONFIG_PREEMPT
   #define retint_kernel retint_restore_args
- -#endif        
+ +#endif
   
   #ifdef CONFIG_PARAVIRT
   ENTRY(native_usergs_sysret64)
@@@ -160,29 -201,29 +200,29 @@@
   .endm
   
   /*
- - * C code is not supposed to know about undefined top of stack. Every time 
- - * a C function with an pt_regs argument is called from the SYSCALL based 
+ + * C code is not supposed to know about undefined top of stack. Every time
+ + * a C function with a pt_regs argument is called from the SYSCALL based
    * fast path FIXUP_TOP_OF_STACK is needed.
    * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
    * manipulation.
- - */           
- -              
- -      /* %rsp:at FRAMEEND */ 
- -      .macro FIXUP_TOP_OF_STACK tmp
- -      movq    %gs:pda_oldrsp,\tmp
- -      movq    \tmp,RSP(%rsp)
- -      movq    $__USER_DS,SS(%rsp)
- -      movq    $__USER_CS,CS(%rsp)
- -      movq    $-1,RCX(%rsp)
- -      movq    R11(%rsp),\tmp  /* get eflags */
- -      movq    \tmp,EFLAGS(%rsp)
+ + */
+ +
+ +      /* %rsp:at FRAMEEND */
+ +      .macro FIXUP_TOP_OF_STACK tmp offset=0
+ +      movq %gs:pda_oldrsp,\tmp
+ +      movq \tmp,RSP+\offset(%rsp)
+ +      movq $__USER_DS,SS+\offset(%rsp)
+ +      movq $__USER_CS,CS+\offset(%rsp)
+ +      movq $-1,RCX+\offset(%rsp)
+ +      movq R11+\offset(%rsp),\tmp  /* get eflags */
+ +      movq \tmp,EFLAGS+\offset(%rsp)
         .endm
   
- -      .macro RESTORE_TOP_OF_STACK tmp,offset=0
- -      movq   RSP-\offset(%rsp),\tmp
- -      movq   \tmp,%gs:pda_oldrsp
- -      movq   EFLAGS-\offset(%rsp),\tmp
- -      movq   \tmp,R11-\offset(%rsp)
+ +      .macro RESTORE_TOP_OF_STACK tmp offset=0
+ +      movq RSP+\offset(%rsp),\tmp
+ +      movq \tmp,%gs:pda_oldrsp
+ +      movq EFLAGS+\offset(%rsp),\tmp
+ +      movq \tmp,R11+\offset(%rsp)
         .endm
   
         .macro FAKE_STACK_FRAME child_rip
@@@ -194,7 -235,7 +234,7 @@@
         pushq %rax /* rsp */
         CFI_ADJUST_CFA_OFFSET   8
         CFI_REL_OFFSET  rsp,0
- -      pushq $(1<<9) /* eflags - interrupts on */
+ +      pushq $X86_EFLAGS_IF /* eflags - interrupts on */
         CFI_ADJUST_CFA_OFFSET   8
         /*CFI_REL_OFFSET        rflags,0*/
         pushq $__KERNEL_CS /* cs */
@@@ -212,184 -253,62 +252,184 @@@
         CFI_ADJUST_CFA_OFFSET   -(6*8)
         .endm
   
- -      .macro  CFI_DEFAULT_STACK start=1
+ +/*
+ + * minimal frame state: sets up only the CFA, records no register locations
+ + */
+ +      .macro EMPTY_FRAME start=1 offset=0
         .if \start
- -      CFI_STARTPROC   simple
+ +      CFI_STARTPROC simple
         CFI_SIGNAL_FRAME
- -      CFI_DEF_CFA     rsp,SS+8
+ +      CFI_DEF_CFA rsp,8+\offset
         .else
- -      CFI_DEF_CFA_OFFSET SS+8
+ +      CFI_DEF_CFA_OFFSET 8+\offset
         .endif
- -      CFI_REL_OFFSET  r15,R15
- -      CFI_REL_OFFSET  r14,R14
- -      CFI_REL_OFFSET  r13,R13
- -      CFI_REL_OFFSET  r12,R12
- -      CFI_REL_OFFSET  rbp,RBP
- -      CFI_REL_OFFSET  rbx,RBX
- -      CFI_REL_OFFSET  r11,R11
- -      CFI_REL_OFFSET  r10,R10
- -      CFI_REL_OFFSET  r9,R9
- -      CFI_REL_OFFSET  r8,R8
- -      CFI_REL_OFFSET  rax,RAX
- -      CFI_REL_OFFSET  rcx,RCX
- -      CFI_REL_OFFSET  rdx,RDX
- -      CFI_REL_OFFSET  rsi,RSI
- -      CFI_REL_OFFSET  rdi,RDI
- -      CFI_REL_OFFSET  rip,RIP
- -      /*CFI_REL_OFFSET        cs,CS*/
- -      /*CFI_REL_OFFSET        rflags,EFLAGS*/
- -      CFI_REL_OFFSET  rsp,RSP
- -      /*CFI_REL_OFFSET        ss,SS*/
         .endm
+ +
+ +/*
+ + * initial frame state for interrupts (and exceptions without error code)
+ + */
+ +      .macro INTR_FRAME start=1 offset=0
+ +      EMPTY_FRAME \start, SS+8+\offset-RIP
+ +      /*CFI_REL_OFFSET ss, SS+\offset-RIP*/
+ +      CFI_REL_OFFSET rsp, RSP+\offset-RIP
+ +      /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/
+ +      /*CFI_REL_OFFSET cs, CS+\offset-RIP*/
+ +      CFI_REL_OFFSET rip, RIP+\offset-RIP
+ +      .endm
+ +
+ +/*
+ + * initial frame state for exceptions with error code (and interrupts
+ + * with vector already pushed)
+ + */
+ +      .macro XCPT_FRAME start=1 offset=0
+ +      INTR_FRAME \start, RIP+\offset-ORIG_RAX
+ +      /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/
+ +      .endm
+ +
+ +/*
+ + * frame that enables calling into C.
+ + */
+ +      .macro PARTIAL_FRAME start=1 offset=0
+ +      XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
+ +      CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
+ +      CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
+ +      CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
+ +      CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
+ +      CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
+ +      CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
+ +      CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
+ +      CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
+ +      CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
+ +      .endm
+ +
+ +/*
+ + * frame that enables passing a complete pt_regs to a C function.
+ + */
+ +      .macro DEFAULT_FRAME start=1 offset=0
+ +      PARTIAL_FRAME \start, R11+\offset-R15
+ +      CFI_REL_OFFSET rbx, RBX+\offset
+ +      CFI_REL_OFFSET rbp, RBP+\offset
+ +      CFI_REL_OFFSET r12, R12+\offset
+ +      CFI_REL_OFFSET r13, R13+\offset
+ +      CFI_REL_OFFSET r14, R14+\offset
+ +      CFI_REL_OFFSET r15, R15+\offset
+ +      .endm
+ +
+ +/* save partial stack frame */
+ +ENTRY(save_args)
+ +      XCPT_FRAME
+ +      cld
+ +      movq_cfi rdi, RDI+16-ARGOFFSET
+ +      movq_cfi rsi, RSI+16-ARGOFFSET
+ +      movq_cfi rdx, RDX+16-ARGOFFSET
+ +      movq_cfi rcx, RCX+16-ARGOFFSET
+ +      movq_cfi rax, RAX+16-ARGOFFSET
+ +      movq_cfi  r8,  R8+16-ARGOFFSET
+ +      movq_cfi  r9,  R9+16-ARGOFFSET
+ +      movq_cfi r10, R10+16-ARGOFFSET
+ +      movq_cfi r11, R11+16-ARGOFFSET
+ +
+ +      leaq -ARGOFFSET+16(%rsp),%rdi   /* arg1 for handler */
+ +      movq_cfi rbp, 8         /* push %rbp */
+ +      leaq 8(%rsp), %rbp              /* mov %rsp, %rbp */
+ +      testl $3, CS(%rdi)
+ +      je 1f
+ +      SWAPGS
+ +      /*
+ +       * irqcount is used to check if a CPU is already on an interrupt stack
+ +       * or not. While this is essentially redundant with preempt_count it is
+ +       * a little cheaper to use a separate counter in the PDA (short of
+ +       * moving irq_enter into assembly, which would be too much work)
+ +       */
+ +1:    incl %gs:pda_irqcount
+ +      jne 2f
+ +      popq_cfi %rax                   /* move return address... */
+ +      mov %gs:pda_irqstackptr,%rsp
+ +      EMPTY_FRAME 0
+ +      pushq_cfi %rax                  /* ... to the new stack */
+ +      /*
+ +       * We entered an interrupt context - irqs are off:
+ +       */
+ +2:    TRACE_IRQS_OFF
+ +      ret
+ +      CFI_ENDPROC
+ +END(save_args)
+ +
+ +ENTRY(save_rest)
+ +      PARTIAL_FRAME 1 REST_SKIP+8
+ +      movq 5*8+16(%rsp), %r11 /* save return address */
+ +      movq_cfi rbx, RBX+16
+ +      movq_cfi rbp, RBP+16
+ +      movq_cfi r12, R12+16
+ +      movq_cfi r13, R13+16
+ +      movq_cfi r14, R14+16
+ +      movq_cfi r15, R15+16
+ +      movq %r11, 8(%rsp)      /* return address */
+ +      FIXUP_TOP_OF_STACK %r11, 16
+ +      ret
+ +      CFI_ENDPROC
+ +END(save_rest)
+ +
+ +/* save complete stack frame */
+ +ENTRY(save_paranoid)
+ +      XCPT_FRAME 1 RDI+8
+ +      cld
+ +      movq_cfi rdi, RDI+8
+ +      movq_cfi rsi, RSI+8
+ +      movq_cfi rdx, RDX+8
+ +      movq_cfi rcx, RCX+8
+ +      movq_cfi rax, RAX+8
+ +      movq_cfi r8, R8+8
+ +      movq_cfi r9, R9+8
+ +      movq_cfi r10, R10+8
+ +      movq_cfi r11, R11+8
+ +      movq_cfi rbx, RBX+8
+ +      movq_cfi rbp, RBP+8
+ +      movq_cfi r12, R12+8
+ +      movq_cfi r13, R13+8
+ +      movq_cfi r14, R14+8
+ +      movq_cfi r15, R15+8
+ +      movl $1,%ebx
+ +      movl $MSR_GS_BASE,%ecx
+ +      rdmsr
+ +      testl %edx,%edx
+ +      js 1f   /* negative -> in kernel */
+ +      SWAPGS
+ +      xorl %ebx,%ebx
+ +1:    ret
+ +      CFI_ENDPROC
+ +END(save_paranoid)
+ +
   /*
- - * A newly forked process directly context switches into this.
- - */   
- -/* rdi:       prev */ 
+ + * A newly forked process directly context switches into this address.
+ + *
+ + * rdi: prev task we switched from
+ + */
   ENTRY(ret_from_fork)
- -      CFI_DEFAULT_STACK
+ +      DEFAULT_FRAME
+ +
         push kernel_eflags(%rip)
         CFI_ADJUST_CFA_OFFSET 8
- -      popf                            # reset kernel eflags
+ +      popf                                    # reset kernel eflags
         CFI_ADJUST_CFA_OFFSET -8
- -      call schedule_tail
+ +
+ +      call schedule_tail                      # rdi: 'prev' task parameter
+ +
         GET_THREAD_INFO(%rcx)
- -      testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
- -      jnz rff_trace
- -rff_action:   
+ +
+ +      CFI_REMEMBER_STATE
         RESTORE_REST
- -      testl $3,CS-ARGOFFSET(%rsp)     # from kernel_thread?
+ +
+ +      testl $3, CS-ARGOFFSET(%rsp)            # from kernel_thread?
         je   int_ret_from_sys_call
- -      testl $_TIF_IA32,TI_flags(%rcx)
+ +
+ +      testl $_TIF_IA32, TI_flags(%rcx)        # 32-bit compat task needs IRET
         jnz  int_ret_from_sys_call
- -      RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
- -      jmp ret_from_sys_call
- -rff_trace:
- -      movq %rsp,%rdi
- -      call syscall_trace_leave
- -      GET_THREAD_INFO(%rcx)   
- -      jmp rff_action
+ +
+ +      RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
+ +      jmp ret_from_sys_call                   # go to the SYSRET fastpath
+ +
+ +      CFI_RESTORE_STATE
         CFI_ENDPROC
   END(ret_from_fork)
   
@@@ -399,20 -318,20 +439,20 @@@
    * SYSCALL does not save anything on the stack and does not change the
    * stack pointer.
    */
- -              
+ +
   /*
- - * Register setup:    
+ + * Register setup:
    * rax  system call number
    * rdi  arg0
- - * rcx  return address for syscall/sysret, C arg3 
+ + * rcx  return address for syscall/sysret, C arg3
    * rsi  arg1
- - * rdx  arg2  
+ + * rdx  arg2
    * r10  arg3  (--> moved to rcx for C)
    * r8   arg4
    * r9   arg5
    * r11  eflags for syscall/sysret, temporary for C
- - * r12-r15,rbp,rbx saved by C code, not touched.              
- - * 
+ + * r12-r15,rbp,rbx saved by C code, not touched.
+ + *
    * Interrupts are off on entry.
    * Only called from user space.
    *
@@@ -422,7 -341,7 +462,7 @@@
    * When user can change the frames always force IRET. That is because
    * it deals with uncanonical addresses better. SYSRET has trouble
    * with them due to bugs in both AMD and Intel CPUs.
- - */                                   
+ + */
   
   ENTRY(system_call)
         CFI_STARTPROC   simple
@@@ -438,7 -357,7 +478,7 @@@
          */
   ENTRY(system_call_after_swapgs)
   
- -      movq    %rsp,%gs:pda_oldrsp 
+ +      movq    %rsp,%gs:pda_oldrsp
         movq    %gs:pda_kernelstack,%rsp
         /*
          * No need to follow this irqs off/on section - it's straight
@@@ -446,7 -365,7 +486,7 @@@
          */
         ENABLE_INTERRUPTS(CLBR_NONE)
         SAVE_ARGS 8,1
- -      movq  %rax,ORIG_RAX-ARGOFFSET(%rsp) 
+ +      movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)
         movq  %rcx,RIP-ARGOFFSET(%rsp)
         CFI_REL_OFFSET rip,RIP-ARGOFFSET
         GET_THREAD_INFO(%rcx)
@@@ -460,19 -379,19 +500,19 @@@ system_call_fastpath
         movq %rax,RAX-ARGOFFSET(%rsp)
   /*
    * Syscall return path ending with SYSRET (fast path)
- - * Has incomplete stack frame and undefined top of stack. 
- - */           
+ + * Has incomplete stack frame and undefined top of stack.
+ + */
   ret_from_sys_call:
         movl $_TIF_ALLWORK_MASK,%edi
         /* edi: flagmask */
- -sysret_check:         
+ +sysret_check:
         LOCKDEP_SYS_EXIT
         GET_THREAD_INFO(%rcx)
         DISABLE_INTERRUPTS(CLBR_NONE)
         TRACE_IRQS_OFF
         movl TI_flags(%rcx),%edx
         andl %edi,%edx
- -      jnz  sysret_careful 
+ +      jnz  sysret_careful
         CFI_REMEMBER_STATE
         /*
          * sysretq will re-enable interrupts:
@@@ -487,7 -406,7 +527,7 @@@
   
         CFI_RESTORE_STATE
         /* Handle reschedules */
- -      /* edx: work, edi: workmask */  
+ +      /* edx: work, edi: workmask */
   sysret_careful:
         bt $TIF_NEED_RESCHED,%edx
         jnc sysret_signal
@@@ -500,7 -419,7 +540,7 @@@
         CFI_ADJUST_CFA_OFFSET -8
         jmp sysret_check
   
- -      /* Handle a signal */ 
+ +      /* Handle a signal */
   sysret_signal:
         TRACE_IRQS_ON
         ENABLE_INTERRUPTS(CLBR_NONE)
@@@ -509,20 -428,17 +549,20 @@@
         jc sysret_audit
   #endif
         /* edx: work flags (arg3) */
- -      leaq do_notify_resume(%rip),%rax
         leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
         xorl %esi,%esi # oldset -> arg2
- -      call ptregscall_common
+ +      SAVE_REST
+ +      FIXUP_TOP_OF_STACK %r11
+ +      call do_notify_resume
+ +      RESTORE_TOP_OF_STACK %r11
+ +      RESTORE_REST
         movl $_TIF_WORK_MASK,%edi
         /* Use IRET because user could have changed frame. This
            works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
         DISABLE_INTERRUPTS(CLBR_NONE)
         TRACE_IRQS_OFF
         jmp int_with_check
- -      
+ +
   badsys:
         movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
         jmp ret_from_sys_call
@@@ -561,7 -477,7 +601,7 @@@ sysret_audit
   #endif        /* CONFIG_AUDITSYSCALL */
   
         /* Do syscall tracing */
- -tracesys:                      
+ +tracesys:
   #ifdef CONFIG_AUDITSYSCALL
         testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
         jz auditsys
@@@ -584,8 -500,8 +624,8 @@@
         call *sys_call_table(,%rax,8)
         movq %rax,RAX-ARGOFFSET(%rsp)
         /* Use IRET because user could have changed frame */
- -              
- -/* 
+ +
+ +/*
    * Syscall return path ending with IRET.
    * Has correct top of stack, but partial stack frame.
    */
@@@ -629,18 -545,18 +669,18 @@@ int_very_careful
         TRACE_IRQS_ON
         ENABLE_INTERRUPTS(CLBR_NONE)
         SAVE_REST
- -      /* Check for syscall exit trace */      
+ +      /* Check for syscall exit trace */
         testl $_TIF_WORK_SYSCALL_EXIT,%edx
         jz int_signal
         pushq %rdi
         CFI_ADJUST_CFA_OFFSET 8
- -      leaq 8(%rsp),%rdi       # &ptregs -> arg1       
+ +      leaq 8(%rsp),%rdi       # &ptregs -> arg1
         call syscall_trace_leave
         popq %rdi
         CFI_ADJUST_CFA_OFFSET -8
         andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
         jmp int_restore_rest
- -      
+ +
   int_signal:
         testl $_TIF_DO_NOTIFY_MASK,%edx
         jz 1f
@@@ -655,24 -571,22 +695,24 @@@ int_restore_rest
         jmp int_with_check
         CFI_ENDPROC
   END(system_call)
- -              
- -/* 
+ +
+ +/*
    * Certain special system calls that need to save a complete full stack frame.
- - */                                                           
- -      
+ + */
         .macro PTREGSCALL label,func,arg
- -      .globl \label
- -\label:
- -      leaq    \func(%rip),%rax
- -      leaq    -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
- -      jmp     ptregscall_common
+ +ENTRY(\label)
+ +      PARTIAL_FRAME 1 8               /* offset 8: return address */
+ +      subq $REST_SKIP, %rsp
+ +      CFI_ADJUST_CFA_OFFSET REST_SKIP
+ +      call save_rest
+ +      DEFAULT_FRAME 0 8               /* offset 8: return address */
+ +      leaq 8(%rsp), \arg      /* pt_regs pointer */
+ +      call \func
+ +      jmp ptregscall_common
+ +      CFI_ENDPROC
   END(\label)
         .endm
   
- -      CFI_STARTPROC
- -
         PTREGSCALL stub_clone, sys_clone, %r8
         PTREGSCALL stub_fork, sys_fork, %rdi
         PTREGSCALL stub_vfork, sys_vfork, %rdi
@@@ -680,18 -594,25 +720,18 @@@
         PTREGSCALL stub_iopl, sys_iopl, %rsi
   
   ENTRY(ptregscall_common)
- -      popq %r11
- -      CFI_ADJUST_CFA_OFFSET -8
- -      CFI_REGISTER rip, r11
- -      SAVE_REST
- -      movq %r11, %r15
- -      CFI_REGISTER rip, r15
- -      FIXUP_TOP_OF_STACK %r11
- -      call *%rax
- -      RESTORE_TOP_OF_STACK %r11
- -      movq %r15, %r11
- -      CFI_REGISTER rip, r11
- -      RESTORE_REST
- -      pushq %r11
- -      CFI_ADJUST_CFA_OFFSET 8
- -      CFI_REL_OFFSET rip, 0
- -      ret
+ +      DEFAULT_FRAME 1 8       /* offset 8: return address */
+ +      RESTORE_TOP_OF_STACK %r11, 8
+ +      movq_cfi_restore R15+8, r15
+ +      movq_cfi_restore R14+8, r14
+ +      movq_cfi_restore R13+8, r13
+ +      movq_cfi_restore R12+8, r12
+ +      movq_cfi_restore RBP+8, rbp
+ +      movq_cfi_restore RBX+8, rbx
+ +      ret $REST_SKIP          /* pop extended registers */
         CFI_ENDPROC
   END(ptregscall_common)
- -      
+ +
   ENTRY(stub_execve)
         CFI_STARTPROC
         popq %r11
@@@ -707,11 -628,11 +747,11 @@@
         jmp int_ret_from_sys_call
         CFI_ENDPROC
   END(stub_execve)
- -      
+ +
   /*
    * sigreturn is special because it needs to restore all registers on return.
    * This cannot be done with SYSRET, so use the IRET return path instead.
- - */                
+ + */
   ENTRY(stub_rt_sigreturn)
         CFI_STARTPROC
         addq $8, %rsp
@@@ -727,70 -648,70 +767,70 @@@
   END(stub_rt_sigreturn)
   
   /*
- - * initial frame state for interrupts and exceptions
+ + * Build the entry stubs and pointer table with some assembler magic.
+ + * We pack 7 stubs into a single 32-byte chunk, which will fit in a
+ + * single cache line on all modern x86 implementations.
    */
- -      .macro _frame ref
- -      CFI_STARTPROC simple
- -      CFI_SIGNAL_FRAME
- -      CFI_DEF_CFA rsp,SS+8-\ref
- -      /*CFI_REL_OFFSET ss,SS-\ref*/
- -      CFI_REL_OFFSET rsp,RSP-\ref
- -      /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
- -      /*CFI_REL_OFFSET cs,CS-\ref*/
- -      CFI_REL_OFFSET rip,RIP-\ref
- -      .endm
+ +      .section .init.rodata,"a"
+ +ENTRY(interrupt)
+ +      .text
+ +      .p2align 5
+ +      .p2align CONFIG_X86_L1_CACHE_SHIFT
+ +ENTRY(irq_entries_start)
+ +      INTR_FRAME
+ +vector=FIRST_EXTERNAL_VECTOR
+ +.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
+ +      .balign 32
+ +  .rept       7
+ +    .if vector < NR_VECTORS
+ +      .if vector <> FIRST_EXTERNAL_VECTOR
+ +      CFI_ADJUST_CFA_OFFSET -8
+ +      .endif
+ +1:    pushq $(~vector+0x80)   /* Note: always in signed byte range */
+ +      CFI_ADJUST_CFA_OFFSET 8
+ +      .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
+ +      jmp 2f
+ +      .endif
+ +      .previous
+ +      .quad 1b
+ +      .text
+ +vector=vector+1
+ +    .endif
+ +  .endr
+ +2:    jmp common_interrupt
+ +.endr
+ +      CFI_ENDPROC
+ +END(irq_entries_start)
   
- -/* initial frame state for interrupts (and exceptions without error code) */
- -#define INTR_FRAME _frame RIP
- -/* initial frame state for exceptions with error code (and interrupts with
- -   vector already pushed) */
- -#define XCPT_FRAME _frame ORIG_RAX
+ +.previous
+ +END(interrupt)
+ +.previous
   
- -/* 
+ +/*
    * Interrupt entry/exit.
    *
    * Interrupt entry points save only callee clobbered registers in fast path.
- - *    
- - * Entry runs with interrupts off.    
- - */ 
+ + *
+ + * Entry runs with interrupts off.
+ + */
   
- -/* 0(%rsp): interrupt number */ 
+ +/* 0(%rsp): ~(interrupt number) */
         .macro interrupt func
- -      cld
- -      SAVE_ARGS
- -      leaq -ARGOFFSET(%rsp),%rdi      # arg1 for handler
- -      pushq %rbp
- -      /*
- -       * Save rbp twice: One is for marking the stack frame, as usual, and the
- -       * other, to fill pt_regs properly. This is because bx comes right
- -       * before the last saved register in that structure, and not bp. If the
- -       * base pointer were in the place bx is today, this would not be needed.
- -       */
- -      movq %rbp, -8(%rsp)
- -      CFI_ADJUST_CFA_OFFSET   8
- -      CFI_REL_OFFSET          rbp, 0
- -      movq %rsp,%rbp
- -      CFI_DEF_CFA_REGISTER    rbp
- -      testl $3,CS(%rdi)
- -      je 1f
- -      SWAPGS
- -      /* irqcount is used to check if a CPU is already on an interrupt
- -         stack or not. While this is essentially redundant with preempt_count
- -         it is a little cheaper to use a separate counter in the PDA
- -         (short of moving irq_enter into assembly, which would be too
- -          much work) */
- -1:    incl    %gs:pda_irqcount
- -      cmoveq %gs:pda_irqstackptr,%rsp
- -      push    %rbp                    # backlink for old unwinder
- -      /*
- -       * We entered an interrupt context - irqs are off:
- -       */
- -      TRACE_IRQS_OFF
+ +      subq $10*8, %rsp
+ +      CFI_ADJUST_CFA_OFFSET 10*8
+ +      call save_args
+ +      PARTIAL_FRAME 0
         call \func
         .endm
   
- -ENTRY(common_interrupt)
+ +      /*
+ +       * The interrupt stubs push (~vector+0x80) onto the stack and
+ +       * then jump to common_interrupt.
+ +       */
+ +      .p2align CONFIG_X86_L1_CACHE_SHIFT
+ +common_interrupt:
         XCPT_FRAME
+ +      addq $-0x80,(%rsp)              /* Adjust vector to [-256,-1] range */
         interrupt do_IRQ
         /* 0(%rsp): oldrsp-ARGOFFSET */
   ret_from_intr:
@@@ -804,12 -725,12 +844,12 @@@ exit_intr
         GET_THREAD_INFO(%rcx)
         testl $3,CS-ARGOFFSET(%rsp)
         je retint_kernel
- -      
+ +
         /* Interrupt came from user space */
         /*
          * Has a correct top of stack, but a partial stack frame
          * %rcx: thread info. Interrupts off.
- -       */             
+ +       */
   retint_with_reschedule:
         movl $_TIF_WORK_MASK,%edi
   retint_check:
@@@ -882,20 -803,20 +922,20 @@@ retint_careful
         pushq %rdi
         CFI_ADJUST_CFA_OFFSET   8
         call  schedule
- -      popq %rdi               
+ +      popq %rdi
         CFI_ADJUST_CFA_OFFSET   -8
         GET_THREAD_INFO(%rcx)
         DISABLE_INTERRUPTS(CLBR_NONE)
         TRACE_IRQS_OFF
         jmp retint_check
- -      
+ +
   retint_signal:
         testl $_TIF_DO_NOTIFY_MASK,%edx
         jz    retint_swapgs
         TRACE_IRQS_ON
         ENABLE_INTERRUPTS(CLBR_NONE)
         SAVE_REST
- -      movq $-1,ORIG_RAX(%rsp)                         
+ +      movq $-1,ORIG_RAX(%rsp)
         xorl %esi,%esi          # oldset
         movq %rsp,%rdi          # &pt_regs
         call do_notify_resume
@@@ -917,211 -838,324 +957,211 @@@ ENTRY(retint_kernel
         jnc  retint_restore_args
         call preempt_schedule_irq
         jmp exit_intr
- -#endif        
+ +#endif
   
         CFI_ENDPROC
   END(common_interrupt)
- -      
+ +
   /*
    * APIC interrupts.
- - */           
- -      .macro apicinterrupt num,func
+ + */
+ +.macro apicinterrupt num sym do_sym
+ +ENTRY(\sym)
         INTR_FRAME
         pushq $~(\num)
         CFI_ADJUST_CFA_OFFSET 8
- -      interrupt \func
+ +      interrupt \do_sym
         jmp ret_from_intr
         CFI_ENDPROC
- -      .endm
- -
- -ENTRY(thermal_interrupt)
- -      apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
- -END(thermal_interrupt)
- -
- -ENTRY(threshold_interrupt)
- -      apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
- -END(threshold_interrupt)
- -
- -#ifdef CONFIG_SMP     
- -ENTRY(reschedule_interrupt)
- -      apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
- -END(reschedule_interrupt)
- -
- -      .macro INVALIDATE_ENTRY num
- -ENTRY(invalidate_interrupt\num)
- -      apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt 
- -END(invalidate_interrupt\num)
- -      .endm
+ +END(\sym)
+ +.endm
   
- -      INVALIDATE_ENTRY 0
- -      INVALIDATE_ENTRY 1
- -      INVALIDATE_ENTRY 2
- -      INVALIDATE_ENTRY 3
- -      INVALIDATE_ENTRY 4
- -      INVALIDATE_ENTRY 5
- -      INVALIDATE_ENTRY 6
- -      INVALIDATE_ENTRY 7
- -
- -ENTRY(call_function_interrupt)
- -      apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
- -END(call_function_interrupt)
- -ENTRY(call_function_single_interrupt)
- -      apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt
- -END(call_function_single_interrupt)
- -ENTRY(irq_move_cleanup_interrupt)
- -      apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
- -END(irq_move_cleanup_interrupt)
+ +#ifdef CONFIG_SMP
+ +apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
+ +      irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
   #endif
   
- -ENTRY(apic_timer_interrupt)
- -      apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
- -END(apic_timer_interrupt)
+ +apicinterrupt UV_BAU_MESSAGE \
+ +      uv_bau_message_intr1 uv_bau_message_interrupt
+ +apicinterrupt LOCAL_TIMER_VECTOR \
+ +      apic_timer_interrupt smp_apic_timer_interrupt
+ +
+ +#ifdef CONFIG_SMP
+ +apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
+ +      invalidate_interrupt0 smp_invalidate_interrupt
+ +apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \
+ +      invalidate_interrupt1 smp_invalidate_interrupt
+ +apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \
+ +      invalidate_interrupt2 smp_invalidate_interrupt
+ +apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \
+ +      invalidate_interrupt3 smp_invalidate_interrupt
+ +apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \
+ +      invalidate_interrupt4 smp_invalidate_interrupt
+ +apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \
+ +      invalidate_interrupt5 smp_invalidate_interrupt
+ +apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \
+ +      invalidate_interrupt6 smp_invalidate_interrupt
+ +apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \
+ +      invalidate_interrupt7 smp_invalidate_interrupt
+ +#endif
   
- -ENTRY(uv_bau_message_intr1)
- -      apicinterrupt 220,uv_bau_message_interrupt
- -END(uv_bau_message_intr1)
+ +apicinterrupt THRESHOLD_APIC_VECTOR \
+ +      threshold_interrupt mce_threshold_interrupt
+ +apicinterrupt THERMAL_APIC_VECTOR \
+ +      thermal_interrupt smp_thermal_interrupt
+ +
+ +#ifdef CONFIG_SMP
+ +apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
+ +      call_function_single_interrupt smp_call_function_single_interrupt
+ +apicinterrupt CALL_FUNCTION_VECTOR \
+ +      call_function_interrupt smp_call_function_interrupt
+ +apicinterrupt RESCHEDULE_VECTOR \
+ +      reschedule_interrupt smp_reschedule_interrupt
+ +#endif
   
- -ENTRY(error_interrupt)
- -      apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
- -END(error_interrupt)
+ +apicinterrupt ERROR_APIC_VECTOR \
+ +      error_interrupt smp_error_interrupt
+ +apicinterrupt SPURIOUS_APIC_VECTOR \
+ +      spurious_interrupt smp_spurious_interrupt
   
- -ENTRY(spurious_interrupt)
- -      apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
- -END(spurious_interrupt)
- -                              
   /*
    * Exception entry points.
- - */           
- -      .macro zeroentry sym
+ + */
+ +.macro zeroentry sym do_sym
+ +ENTRY(\sym)
         INTR_FRAME
         PARAVIRT_ADJUST_EXCEPTION_FRAME
- -      pushq $0        /* push error code/oldrax */ 
- -      CFI_ADJUST_CFA_OFFSET 8
- -      pushq %rax      /* push real oldrax to the rdi slot */ 
- -      CFI_ADJUST_CFA_OFFSET 8
- -      CFI_REL_OFFSET rax,0
- -      leaq  \sym(%rip),%rax
- -      jmp error_entry
+ +      pushq_cfi $-1           /* ORIG_RAX: no syscall to restart */
+ +      subq $15*8,%rsp
+ +      CFI_ADJUST_CFA_OFFSET 15*8
+ +      call error_entry
+ +      DEFAULT_FRAME 0
+ +      movq %rsp,%rdi          /* pt_regs pointer */
+ +      xorl %esi,%esi          /* no error code */
+ +      call \do_sym
+ +      jmp error_exit          /* %ebx: no swapgs flag */
         CFI_ENDPROC
- -      .endm   
+ +END(\sym)
+ +.endm
   
- -      .macro errorentry sym
- -      XCPT_FRAME
+ +.macro paranoidzeroentry sym do_sym
+ +ENTRY(\sym)
+ +      INTR_FRAME
         PARAVIRT_ADJUST_EXCEPTION_FRAME
- -      pushq %rax
+ +      pushq $-1               /* ORIG_RAX: no syscall to restart */
         CFI_ADJUST_CFA_OFFSET 8
- -      CFI_REL_OFFSET rax,0
- -      leaq  \sym(%rip),%rax
- -      jmp error_entry
+ +      subq $15*8, %rsp
+ +      call save_paranoid
+ +      TRACE_IRQS_OFF
+ +      movq %rsp,%rdi          /* pt_regs pointer */
+ +      xorl %esi,%esi          /* no error code */
+ +      call \do_sym
+ +      jmp paranoid_exit       /* %ebx: no swapgs flag */
         CFI_ENDPROC
- -      .endm
+ +END(\sym)
+ +.endm
   
- -      /* error code is on the stack already */
- -      /* handle NMI like exceptions that can happen everywhere */
- -      .macro paranoidentry sym, ist=0, irqtrace=1
- -      SAVE_ALL
- -      cld
- -      movl $1,%ebx
- -      movl  $MSR_GS_BASE,%ecx
- -      rdmsr
- -      testl %edx,%edx
- -      js    1f
- -      SWAPGS
- -      xorl  %ebx,%ebx
- -1:
- -      .if \ist
- -      movq    %gs:pda_data_offset, %rbp
- -      .endif
- -      .if \irqtrace
- -      TRACE_IRQS_OFF
- -      .endif
- -      movq %rsp,%rdi
- -      movq ORIG_RAX(%rsp),%rsi
- -      movq $-1,ORIG_RAX(%rsp)
- -      .if \ist
- -      subq    $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
- -      .endif
- -      call \sym
- -      .if \ist
- -      addq    $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
- -      .endif
- -      DISABLE_INTERRUPTS(CLBR_NONE)
- -      .if \irqtrace
+ +.macro paranoidzeroentry_ist sym do_sym ist
+ +ENTRY(\sym)
+ +      INTR_FRAME
+ +      PARAVIRT_ADJUST_EXCEPTION_FRAME
+ +      pushq $-1               /* ORIG_RAX: no syscall to restart */
+ +      CFI_ADJUST_CFA_OFFSET 8
+ +      subq $15*8, %rsp
+ +      call save_paranoid
         TRACE_IRQS_OFF
- -      .endif
- -      .endm
+ +      movq %rsp,%rdi          /* pt_regs pointer */
+ +      xorl %esi,%esi          /* no error code */
+ +      movq %gs:pda_data_offset, %rbp
+ +      subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
+ +      call \do_sym
+ +      addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
+ +      jmp paranoid_exit       /* %ebx: no swapgs flag */
+ +      CFI_ENDPROC
+ +END(\sym)
+ +.endm
   
- -      /*
- -       * "Paranoid" exit path from exception stack.
- -       * Paranoid because this is used by NMIs and cannot take
- -       * any kernel state for granted.
- -       * We don't do kernel preemption checks here, because only
- -       * NMI should be common and it does not enable IRQs and
- -       * cannot get reschedule ticks.
- -       *
- -       * "trace" is 0 for the NMI handler only, because irq-tracing
- -       * is fundamentally NMI-unsafe. (we cannot change the soft and
- -       * hard flags at once, atomically)
- -       */
- -      .macro paranoidexit trace=1
- -      /* ebx: no swapgs flag */
- -paranoid_exit\trace:
- -      testl %ebx,%ebx                         /* swapgs needed? */
- -      jnz paranoid_restore\trace
- -      testl $3,CS(%rsp)
- -      jnz   paranoid_userspace\trace
- -paranoid_swapgs\trace:
- -      .if \trace
- -      TRACE_IRQS_IRETQ 0
- -      .endif
- -      SWAPGS_UNSAFE_STACK
- -paranoid_restore\trace:
- -      RESTORE_ALL 8
- -      jmp irq_return
- -paranoid_userspace\trace:
- -      GET_THREAD_INFO(%rcx)
- -      movl TI_flags(%rcx),%ebx
- -      andl $_TIF_WORK_MASK,%ebx
- -      jz paranoid_swapgs\trace
- -      movq %rsp,%rdi                  /* &pt_regs */
- -      call sync_regs
- -      movq %rax,%rsp                  /* switch stack for scheduling */
- -      testl $_TIF_NEED_RESCHED,%ebx
- -      jnz paranoid_schedule\trace
- -      movl %ebx,%edx                  /* arg3: thread flags */
- -      .if \trace
- -      TRACE_IRQS_ON
- -      .endif
- -      ENABLE_INTERRUPTS(CLBR_NONE)
- -      xorl %esi,%esi                  /* arg2: oldset */
- -      movq %rsp,%rdi                  /* arg1: &pt_regs */
- -      call do_notify_resume
- -      DISABLE_INTERRUPTS(CLBR_NONE)
- -      .if \trace
- -      TRACE_IRQS_OFF
- -      .endif
- -      jmp paranoid_userspace\trace
- -paranoid_schedule\trace:
- -      .if \trace
- -      TRACE_IRQS_ON
- -      .endif
- -      ENABLE_INTERRUPTS(CLBR_ANY)
- -      call schedule
- -      DISABLE_INTERRUPTS(CLBR_ANY)
- -      .if \trace
- -      TRACE_IRQS_OFF
- -      .endif
- -      jmp paranoid_userspace\trace
+ +.macro errorentry sym do_sym
+ +ENTRY(\sym)
+ +      XCPT_FRAME
+ +      PARAVIRT_ADJUST_EXCEPTION_FRAME
+ +      subq $15*8,%rsp
+ +      CFI_ADJUST_CFA_OFFSET 15*8
+ +      call error_entry
+ +      DEFAULT_FRAME 0
+ +      movq %rsp,%rdi                  /* pt_regs pointer */
+ +      movq ORIG_RAX(%rsp),%rsi        /* get error code */
+ +      movq $-1,ORIG_RAX(%rsp)         /* no syscall to restart */
+ +      call \do_sym
+ +      jmp error_exit                  /* %ebx: no swapgs flag */
         CFI_ENDPROC
- -      .endm
+ +END(\sym)
+ +.endm
   
- -/*
- - * Exception entry point. This expects an error code/orig_rax on the stack
- - * and the exception handler in %rax. 
- - */                                           
- -KPROBE_ENTRY(error_entry)
- -      _frame RDI
- -      CFI_REL_OFFSET rax,0
- -      /* rdi slot contains rax, oldrax contains error code */
- -      cld     
- -      subq  $14*8,%rsp
- -      CFI_ADJUST_CFA_OFFSET   (14*8)
- -      movq %rsi,13*8(%rsp)
- -      CFI_REL_OFFSET  rsi,RSI
- -      movq 14*8(%rsp),%rsi    /* load rax from rdi slot */
- -      CFI_REGISTER    rax,rsi
- -      movq %rdx,12*8(%rsp)
- -      CFI_REL_OFFSET  rdx,RDX
- -      movq %rcx,11*8(%rsp)
- -      CFI_REL_OFFSET  rcx,RCX
- -      movq %rsi,10*8(%rsp)    /* store rax */ 
- -      CFI_REL_OFFSET  rax,RAX
- -      movq %r8, 9*8(%rsp)
- -      CFI_REL_OFFSET  r8,R8
- -      movq %r9, 8*8(%rsp)
- -      CFI_REL_OFFSET  r9,R9
- -      movq %r10,7*8(%rsp)
- -      CFI_REL_OFFSET  r10,R10
- -      movq %r11,6*8(%rsp)
- -      CFI_REL_OFFSET  r11,R11
- -      movq %rbx,5*8(%rsp) 
- -      CFI_REL_OFFSET  rbx,RBX
- -      movq %rbp,4*8(%rsp) 
- -      CFI_REL_OFFSET  rbp,RBP
- -      movq %r12,3*8(%rsp) 
- -      CFI_REL_OFFSET  r12,R12
- -      movq %r13,2*8(%rsp) 
- -      CFI_REL_OFFSET  r13,R13
- -      movq %r14,1*8(%rsp) 
- -      CFI_REL_OFFSET  r14,R14
- -      movq %r15,(%rsp) 
- -      CFI_REL_OFFSET  r15,R15
- -      xorl %ebx,%ebx  
- -      testl $3,CS(%rsp)
- -      je  error_kernelspace
- -error_swapgs: 
- -      SWAPGS
- -error_sti:
- -      TRACE_IRQS_OFF
- -      movq %rdi,RDI(%rsp)     
- -      CFI_REL_OFFSET  rdi,RDI
- -      movq %rsp,%rdi
- -      movq ORIG_RAX(%rsp),%rsi        /* get error code */ 
- -      movq $-1,ORIG_RAX(%rsp)
- -      call *%rax
- -      /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
- -error_exit:
- -      movl %ebx,%eax
- -      RESTORE_REST
- -      DISABLE_INTERRUPTS(CLBR_NONE)
+ +      /* error code is on the stack already */
+ +.macro paranoiderrorentry sym do_sym
+ +ENTRY(\sym)
+ +      XCPT_FRAME
+ +      PARAVIRT_ADJUST_EXCEPTION_FRAME
+ +      subq $15*8,%rsp
+ +      CFI_ADJUST_CFA_OFFSET 15*8
+ +      call save_paranoid
+ +      DEFAULT_FRAME 0
         TRACE_IRQS_OFF
- -      GET_THREAD_INFO(%rcx)   
- -      testl %eax,%eax
- -      jne  retint_kernel
- -      LOCKDEP_SYS_EXIT_IRQ
- -      movl  TI_flags(%rcx),%edx
- -      movl  $_TIF_WORK_MASK,%edi
- -      andl  %edi,%edx
- -      jnz  retint_careful
- -      jmp retint_swapgs
+ +      movq %rsp,%rdi                  /* pt_regs pointer */
+ +      movq ORIG_RAX(%rsp),%rsi        /* get error code */
+ +      movq $-1,ORIG_RAX(%rsp)         /* no syscall to restart */
+ +      call \do_sym
+ +      jmp paranoid_exit               /* %ebx: no swapgs flag */
         CFI_ENDPROC
+ +END(\sym)
+ +.endm
   
- -error_kernelspace:
- -      incl %ebx
- -       /* There are two places in the kernel that can potentially fault with
- -          usergs. Handle them here. The exception handlers after
- -         iret run with kernel gs again, so don't set the user space flag.
- -         B stepping K8s sometimes report an truncated RIP for IRET 
- -         exceptions returning to compat mode. Check for these here too. */
- -      leaq irq_return(%rip),%rcx
- -      cmpq %rcx,RIP(%rsp)
- -      je   error_swapgs
- -      movl %ecx,%ecx  /* zero extend */
- -      cmpq %rcx,RIP(%rsp)
- -      je   error_swapgs
- -      cmpq $gs_change,RIP(%rsp)
- -        je   error_swapgs
- -      jmp  error_sti
- -KPROBE_END(error_entry)
- -      
- -       /* Reload gs selector with exception handling */
- -       /* edi:  new selector */ 
+ +zeroentry divide_error do_divide_error
+ +zeroentry overflow do_overflow
+ +zeroentry bounds do_bounds
+ +zeroentry invalid_op do_invalid_op
+ +zeroentry device_not_available do_device_not_available
+ +paranoiderrorentry double_fault do_double_fault
+ +zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
+ +errorentry invalid_TSS do_invalid_TSS
+ +errorentry segment_not_present do_segment_not_present
+ +zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
+ +zeroentry coprocessor_error do_coprocessor_error
+ +errorentry alignment_check do_alignment_check
+ +zeroentry simd_coprocessor_error do_simd_coprocessor_error
+ +
+ +      /* Reload gs selector with exception handling */
+ +      /* edi:  new selector */
   ENTRY(native_load_gs_index)
         CFI_STARTPROC
         pushf
         CFI_ADJUST_CFA_OFFSET 8
         DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI))
- -        SWAPGS
- -gs_change:     
- -        movl %edi,%gs   
+ +      SWAPGS
+ +gs_change:
+ +      movl %edi,%gs
   2:    mfence          /* workaround */
         SWAPGS
- -        popf
+ +      popf
         CFI_ADJUST_CFA_OFFSET -8
- -        ret
+ +      ret
         CFI_ENDPROC
- -ENDPROC(native_load_gs_index)
- -       
- -        .section __ex_table,"a"
- -        .align 8
- -        .quad gs_change,bad_gs
- -        .previous
- -        .section .fixup,"ax"
+ +END(native_load_gs_index)
+ +
+ +      .section __ex_table,"a"
+ +      .align 8
+ +      .quad gs_change,bad_gs
+ +      .previous
+ +      .section .fixup,"ax"
         /* running with kernelgs */
- -bad_gs: 
+ +bad_gs:
         SWAPGS                  /* switch back to user gs */
         xorl %eax,%eax
- -        movl %eax,%gs
- -        jmp  2b
- -        .previous       
- -      
+ +      movl %eax,%gs
+ +      jmp  2b
+ +      .previous
+ +
   /*
    * Create a kernel thread.
    *
@@@ -1144,7 -1178,7 +1184,7 @@@ ENTRY(kernel_thread
   
         xorl %r8d,%r8d
         xorl %r9d,%r9d
- -      
+ +
         # clone now
         call do_fork
         movq %rax,RAX(%rsp)
@@@ -1155,15 -1189,15 +1195,15 @@@
          * so internally to the x86_64 port you can rely on kernel_thread()
          * not to reschedule the child before returning, this avoids the need
          * of hacks for example to fork off the per-CPU idle tasks.
- -         * [Hopefully no generic code relies on the reschedule -AK]   
+ +       * [Hopefully no generic code relies on the reschedule -AK]
          */
         RESTORE_ALL
         UNFAKE_STACK_FRAME
         ret
         CFI_ENDPROC
- -ENDPROC(kernel_thread)
- -      
- -child_rip:
+ +END(kernel_thread)
+ +
+ +ENTRY(child_rip)
         pushq $0                # fake return address
         CFI_STARTPROC
         /*
@@@ -1176,9 -1210,8 +1216,9 @@@
         # exit
         mov %eax, %edi
         call do_exit
+ +      ud2                     # padding for call trace
         CFI_ENDPROC
- -ENDPROC(child_rip)
+ +END(child_rip)
   
   /*
    * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
@@@ -1198,10 -1231,10 +1238,10 @@@
   ENTRY(kernel_execve)
         CFI_STARTPROC
         FAKE_STACK_FRAME $0
- -      SAVE_ALL        
+ +      SAVE_ALL
         movq %rsp,%rcx
         call sys_execve
- -      movq %rax, RAX(%rsp)    
+ +      movq %rax, RAX(%rsp)
         RESTORE_REST
         testq %rax,%rax
         je int_ret_from_sys_call
@@@ -1209,7 -1242,129 +1249,7 @@@
         UNFAKE_STACK_FRAME
         ret
         CFI_ENDPROC
- -ENDPROC(kernel_execve)
- -
- -KPROBE_ENTRY(page_fault)
- -      errorentry do_page_fault
- -KPROBE_END(page_fault)
- -
- -ENTRY(coprocessor_error)
- -      zeroentry do_coprocessor_error
- -END(coprocessor_error)
- -
- -ENTRY(simd_coprocessor_error)
- -      zeroentry do_simd_coprocessor_error     
- -END(simd_coprocessor_error)
- -
- -ENTRY(device_not_available)
- -      zeroentry do_device_not_available
- -END(device_not_available)
- -
- -      /* runs on exception stack */
- -KPROBE_ENTRY(debug)
- -      INTR_FRAME
- -      PARAVIRT_ADJUST_EXCEPTION_FRAME
- -      pushq $0
- -      CFI_ADJUST_CFA_OFFSET 8         
- -      paranoidentry do_debug, DEBUG_STACK
- -      paranoidexit
- -KPROBE_END(debug)
- -
- -      /* runs on exception stack */   
- -KPROBE_ENTRY(nmi)
- -      INTR_FRAME
- -      PARAVIRT_ADJUST_EXCEPTION_FRAME
- -      pushq $-1
- -      CFI_ADJUST_CFA_OFFSET 8
- -      paranoidentry do_nmi, 0, 0
- -#ifdef CONFIG_TRACE_IRQFLAGS
- -      paranoidexit 0
- -#else
- -      jmp paranoid_exit1
- -      CFI_ENDPROC
- -#endif
- -KPROBE_END(nmi)
- -
- -KPROBE_ENTRY(int3)
- -      INTR_FRAME
- -      PARAVIRT_ADJUST_EXCEPTION_FRAME
- -      pushq $0
- -      CFI_ADJUST_CFA_OFFSET 8
- -      paranoidentry do_int3, DEBUG_STACK
- -      jmp paranoid_exit1
- -      CFI_ENDPROC
- -KPROBE_END(int3)
- -
- -ENTRY(overflow)
- -      zeroentry do_overflow
- -END(overflow)
- -
- -ENTRY(bounds)
- -      zeroentry do_bounds
- -END(bounds)
- -
- -ENTRY(invalid_op)
- -      zeroentry do_invalid_op 
- -END(invalid_op)
- -
- -ENTRY(coprocessor_segment_overrun)
- -      zeroentry do_coprocessor_segment_overrun
- -END(coprocessor_segment_overrun)
- -
- -      /* runs on exception stack */
- -ENTRY(double_fault)
- -      XCPT_FRAME
- -      PARAVIRT_ADJUST_EXCEPTION_FRAME
- -      paranoidentry do_double_fault
- -      jmp paranoid_exit1
- -      CFI_ENDPROC
- -END(double_fault)
- -
- -ENTRY(invalid_TSS)
- -      errorentry do_invalid_TSS
- -END(invalid_TSS)
- -
- -ENTRY(segment_not_present)
- -      errorentry do_segment_not_present
- -END(segment_not_present)
- -
- -      /* runs on exception stack */
- -ENTRY(stack_segment)
- -      XCPT_FRAME
- -      PARAVIRT_ADJUST_EXCEPTION_FRAME
- -      paranoidentry do_stack_segment
- -      jmp paranoid_exit1
- -      CFI_ENDPROC
- -END(stack_segment)
- -
- -KPROBE_ENTRY(general_protection)
- -      errorentry do_general_protection
- -KPROBE_END(general_protection)
- -
- -ENTRY(alignment_check)
- -      errorentry do_alignment_check
- -END(alignment_check)
- -
- -ENTRY(divide_error)
- -      zeroentry do_divide_error
- -END(divide_error)
- -
- -ENTRY(spurious_interrupt_bug)
- -      zeroentry do_spurious_interrupt_bug
- -END(spurious_interrupt_bug)
- -
- -#ifdef CONFIG_X86_MCE
- -      /* runs on exception stack */
- -ENTRY(machine_check)
- -      INTR_FRAME
- -      PARAVIRT_ADJUST_EXCEPTION_FRAME
- -      pushq $0
- -      CFI_ADJUST_CFA_OFFSET 8 
- -      paranoidentry do_machine_check
- -      jmp paranoid_exit1
- -      CFI_ENDPROC
- -END(machine_check)
- -#endif
+ +END(kernel_execve)
   
   /* Call softirq on interrupt stack. Interrupts are off. */
   ENTRY(call_softirq)
@@@ -1229,33 -1384,40 +1269,33 @@@
         decl %gs:pda_irqcount
         ret
         CFI_ENDPROC
- -ENDPROC(call_softirq)
- -
- -KPROBE_ENTRY(ignore_sysret)
- -      CFI_STARTPROC
- -      mov $-ENOSYS,%eax
- -      sysret
- -      CFI_ENDPROC
- -ENDPROC(ignore_sysret)
+ +END(call_softirq)
   
   #ifdef CONFIG_XEN
- -ENTRY(xen_hypervisor_callback)
- -      zeroentry xen_do_hypervisor_callback
- -END(xen_hypervisor_callback)
+ +zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
   
   /*
- -# A note on the "critical region" in our callback handler.
- -# We want to avoid stacking callback handlers due to events occurring
- -# during handling of the last event. To do this, we keep events disabled
- -# until we've done all processing. HOWEVER, we must enable events before
- -# popping the stack frame (can't be done atomically) and so it would still
- -# be possible to get enough handler activations to overflow the stack.
- -# Although unlikely, bugs of that kind are hard to track down, so we'd
- -# like to avoid the possibility.
- -# So, on entry to the handler we detect whether we interrupted an
- -# existing activation in its critical region -- if so, we pop the current
- -# activation and restart the handler using the previous one.
- -*/
+ + * A note on the "critical region" in our callback handler.
+ + * We want to avoid stacking callback handlers due to events occurring
+ + * during handling of the last event. To do this, we keep events disabled
+ + * until we've done all processing. HOWEVER, we must enable events before
+ + * popping the stack frame (can't be done atomically) and so it would still
+ + * be possible to get enough handler activations to overflow the stack.
+ + * Although unlikely, bugs of that kind are hard to track down, so we'd
+ + * like to avoid the possibility.
+ + * So, on entry to the handler we detect whether we interrupted an
+ + * existing activation in its critical region -- if so, we pop the current
+ + * activation and restart the handler using the previous one.
+ + */
   ENTRY(xen_do_hypervisor_callback)   # do_hypervisor_callback(struct *pt_regs)
         CFI_STARTPROC
- -/* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
- -   see the correct pointer to the pt_regs */
+ +/*
+ + * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
+ + * see the correct pointer to the pt_regs
+ + */
         movq %rdi, %rsp            # we don't return, adjust the stack frame
         CFI_ENDPROC
- -      CFI_DEFAULT_STACK
+ +      DEFAULT_FRAME
   11:   incl %gs:pda_irqcount
         movq %rsp,%rbp
         CFI_DEF_CFA_REGISTER rbp
@@@ -1270,26 -1432,23 +1310,26 @@@
   END(do_hypervisor_callback)
   
   /*
- -# Hypervisor uses this for application faults while it executes.
- -# We get here for two reasons:
- -#  1. Fault while reloading DS, ES, FS or GS
- -#  2. Fault while executing IRET
- -# Category 1 we do not need to fix up as Xen has already reloaded all segment
- -# registers that could be reloaded and zeroed the others.
- -# Category 2 we fix up by killing the current process. We cannot use the
- -# normal Linux return path in this case because if we use the IRET hypercall
- -# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
- -# We distinguish between categories by comparing each saved segment register
- -# with its current contents: any discrepancy means we in category 1.
- -*/
+ + * Hypervisor uses this for application faults while it executes.
+ + * We get here for two reasons:
+ + *  1. Fault while reloading DS, ES, FS or GS
+ + *  2. Fault while executing IRET
+ + * Category 1 we do not need to fix up as Xen has already reloaded all segment
+ + * registers that could be reloaded and zeroed the others.
+ + * Category 2 we fix up by killing the current process. We cannot use the
+ + * normal Linux return path in this case because if we use the IRET hypercall
+ + * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+ + * We distinguish between categories by comparing each saved segment register
+ + * with its current contents: any discrepancy means we are in category 1.
+ + */
   ENTRY(xen_failsafe_callback)
- -      framesz = (RIP-0x30)    /* workaround buggy gas */
- -      _frame framesz
- -      CFI_REL_OFFSET rcx, 0
- -      CFI_REL_OFFSET r11, 8
+ +      INTR_FRAME 1 (6*8)
+ +      /*CFI_REL_OFFSET gs,GS*/
+ +      /*CFI_REL_OFFSET fs,FS*/
+ +      /*CFI_REL_OFFSET es,ES*/
+ +      /*CFI_REL_OFFSET ds,DS*/
+ +      CFI_REL_OFFSET r11,8
+ +      CFI_REL_OFFSET rcx,0
         movw %ds,%cx
         cmpw %cx,0x10(%rsp)
         CFI_REMEMBER_STATE
@@@ -1310,9 -1469,12 +1350,9 @@@
         CFI_RESTORE r11
         addq $0x30,%rsp
         CFI_ADJUST_CFA_OFFSET -0x30
- -      pushq $0
- -      CFI_ADJUST_CFA_OFFSET 8
- -      pushq %r11
- -      CFI_ADJUST_CFA_OFFSET 8
- -      pushq %rcx
- -      CFI_ADJUST_CFA_OFFSET 8
+ +      pushq_cfi $0    /* RIP */
+ +      pushq_cfi %r11
+ +      pushq_cfi %rcx
         jmp general_protection
         CFI_RESTORE_STATE
   1:    /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
@@@ -1322,223 -1484,11 +1362,223 @@@
         CFI_RESTORE r11
         addq $0x30,%rsp
         CFI_ADJUST_CFA_OFFSET -0x30
- -      pushq $0
- -      CFI_ADJUST_CFA_OFFSET 8
+ +      pushq_cfi $0
         SAVE_ALL
         jmp error_exit
         CFI_ENDPROC
   END(xen_failsafe_callback)
   
   #endif /* CONFIG_XEN */
+ +
+ +/*
+ + * Some functions should be protected against kprobes
+ + */
+ +      .pushsection .kprobes.text, "ax"
+ +
+ +paranoidzeroentry_ist debug do_debug DEBUG_STACK
+ +paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
+ +paranoiderrorentry stack_segment do_stack_segment
+ +errorentry general_protection do_general_protection
+ +errorentry page_fault do_page_fault
+ +#ifdef CONFIG_X86_MCE
+ +paranoidzeroentry machine_check do_machine_check
+ +#endif
+ +
+ +      /*
+ +       * "Paranoid" exit path from exception stack.
+ +       * Paranoid because this is used by NMIs and cannot take
+ +       * any kernel state for granted.
+ +       * We don't do kernel preemption checks here, because only
+ +       * NMI should be common and it does not enable IRQs and
+ +       * cannot get reschedule ticks.
+ +       *
+ +       * The NMI handler does not use this path when irq-tracing is
+ +       * enabled (it open-codes an exit without the TRACE_IRQS calls),
+ +       * because irq-tracing is fundamentally NMI-unsafe. (we cannot
+ +       * change the soft and hard flags at once, atomically)
+ +       */
+ +
+ +      /* ebx: no swapgs flag */
+ +ENTRY(paranoid_exit)
+ +      INTR_FRAME
+ +      DISABLE_INTERRUPTS(CLBR_NONE)
+ +      TRACE_IRQS_OFF
+ +      testl %ebx,%ebx                         /* swapgs needed? */
+ +      jnz paranoid_restore
+ +      testl $3,CS(%rsp)
+ +      jnz   paranoid_userspace
+ +paranoid_swapgs:
+ +      TRACE_IRQS_IRETQ 0
+ +      SWAPGS_UNSAFE_STACK
+ +paranoid_restore:
+ +      RESTORE_ALL 8
+ +      jmp irq_return
+ +paranoid_userspace:
+ +      GET_THREAD_INFO(%rcx)
+ +      movl TI_flags(%rcx),%ebx
+ +      andl $_TIF_WORK_MASK,%ebx
+ +      jz paranoid_swapgs
+ +      movq %rsp,%rdi                  /* &pt_regs */
+ +      call sync_regs
+ +      movq %rax,%rsp                  /* switch stack for scheduling */
+ +      testl $_TIF_NEED_RESCHED,%ebx
+ +      jnz paranoid_schedule
+ +      movl %ebx,%edx                  /* arg3: thread flags */
+ +      TRACE_IRQS_ON
+ +      ENABLE_INTERRUPTS(CLBR_NONE)
+ +      xorl %esi,%esi                  /* arg2: oldset */
+ +      movq %rsp,%rdi                  /* arg1: &pt_regs */
+ +      call do_notify_resume
+ +      DISABLE_INTERRUPTS(CLBR_NONE)
+ +      TRACE_IRQS_OFF
+ +      jmp paranoid_userspace
+ +paranoid_schedule:
+ +      TRACE_IRQS_ON
+ +      ENABLE_INTERRUPTS(CLBR_ANY)
+ +      call schedule
+ +      DISABLE_INTERRUPTS(CLBR_ANY)
+ +      TRACE_IRQS_OFF
+ +      jmp paranoid_userspace
+ +      CFI_ENDPROC
+ +END(paranoid_exit)
+ +
+ +/*
+ + * Exception entry point. This expects an error code/orig_rax on the stack.
+ + * Returns the "no swapgs flag" in %ebx.
+ + */
+ +ENTRY(error_entry)
+ +      XCPT_FRAME
+ +      CFI_ADJUST_CFA_OFFSET 15*8
+ +      /* oldrax contains error code */
+ +      cld
+ +      movq_cfi rdi, RDI+8
+ +      movq_cfi rsi, RSI+8
+ +      movq_cfi rdx, RDX+8
+ +      movq_cfi rcx, RCX+8
+ +      movq_cfi rax, RAX+8
+ +      movq_cfi  r8,  R8+8
+ +      movq_cfi  r9,  R9+8
+ +      movq_cfi r10, R10+8
+ +      movq_cfi r11, R11+8
+ +      movq_cfi rbx, RBX+8
+ +      movq_cfi rbp, RBP+8
+ +      movq_cfi r12, R12+8
+ +      movq_cfi r13, R13+8
+ +      movq_cfi r14, R14+8
+ +      movq_cfi r15, R15+8
+ +      xorl %ebx,%ebx
+ +      testl $3,CS+8(%rsp)
+ +      je error_kernelspace
+ +error_swapgs:
+ +      SWAPGS
+ +error_sti:
+ +      TRACE_IRQS_OFF
+ +      ret
+ +      CFI_ENDPROC
+ +
+ +/*
+ + * There are two places in the kernel that can potentially fault with
+ + * usergs. Handle them here. The exception handlers after iret run with
+ + * kernel gs again, so don't set the user space flag. B stepping K8s
+ + * sometimes report a truncated RIP for IRET exceptions returning to
+ + * compat mode. Check for these here too.
+ + */
+ +error_kernelspace:
+ +      incl %ebx
+ +      leaq irq_return(%rip),%rcx
+ +      cmpq %rcx,RIP+8(%rsp)
+ +      je error_swapgs
+ +      movl %ecx,%ecx  /* zero extend */
+ +      cmpq %rcx,RIP+8(%rsp)
+ +      je error_swapgs
+ +      cmpq $gs_change,RIP+8(%rsp)
+ +      je error_swapgs
+ +      jmp error_sti
+ +END(error_entry)
+ +
+ +
+ +/* ebx:       no swapgs flag (1: don't need swapgs, 0: need it) */
+ +ENTRY(error_exit)
+ +      DEFAULT_FRAME
+ +      movl %ebx,%eax
+ +      RESTORE_REST
+ +      DISABLE_INTERRUPTS(CLBR_NONE)
+ +      TRACE_IRQS_OFF
+ +      GET_THREAD_INFO(%rcx)
+ +      testl %eax,%eax
+ +      jne retint_kernel
+ +      LOCKDEP_SYS_EXIT_IRQ
+ +      movl TI_flags(%rcx),%edx
+ +      movl $_TIF_WORK_MASK,%edi
+ +      andl %edi,%edx
+ +      jnz retint_careful
+ +      jmp retint_swapgs
+ +      CFI_ENDPROC
+ +END(error_exit)
+ +
+ +
+ +      /* runs on exception stack */
+ +ENTRY(nmi)
+ +      INTR_FRAME
+ +      PARAVIRT_ADJUST_EXCEPTION_FRAME
+ +      pushq_cfi $-1
+ +      subq $15*8, %rsp
+ +      CFI_ADJUST_CFA_OFFSET 15*8
+ +      call save_paranoid
+ +      DEFAULT_FRAME 0
+ +      /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
+ +      movq %rsp,%rdi
+ +      movq $-1,%rsi
+ +      call do_nmi
+ +#ifdef CONFIG_TRACE_IRQFLAGS
+ +      /* paranoidexit; without TRACE_IRQS_OFF */
+ +      /* ebx: no swapgs flag */
+ +      DISABLE_INTERRUPTS(CLBR_NONE)
+ +      testl %ebx,%ebx                         /* swapgs needed? */
+ +      jnz nmi_restore
+ +      testl $3,CS(%rsp)
+ +      jnz nmi_userspace
+ +nmi_swapgs:
+ +      SWAPGS_UNSAFE_STACK
+ +nmi_restore:
+ +      RESTORE_ALL 8
+ +      jmp irq_return
+ +nmi_userspace:
+ +      GET_THREAD_INFO(%rcx)
+ +      movl TI_flags(%rcx),%ebx
+ +      andl $_TIF_WORK_MASK,%ebx
+ +      jz nmi_swapgs
+ +      movq %rsp,%rdi                  /* &pt_regs */
+ +      call sync_regs
+ +      movq %rax,%rsp                  /* switch stack for scheduling */
+ +      testl $_TIF_NEED_RESCHED,%ebx
+ +      jnz nmi_schedule
+ +      movl %ebx,%edx                  /* arg3: thread flags */
+ +      ENABLE_INTERRUPTS(CLBR_NONE)
+ +      xorl %esi,%esi                  /* arg2: oldset */
+ +      movq %rsp,%rdi                  /* arg1: &pt_regs */
+ +      call do_notify_resume
+ +      DISABLE_INTERRUPTS(CLBR_NONE)
+ +      jmp nmi_userspace
+ +nmi_schedule:
+ +      ENABLE_INTERRUPTS(CLBR_ANY)
+ +      call schedule
+ +      DISABLE_INTERRUPTS(CLBR_ANY)
+ +      jmp nmi_userspace
+ +      CFI_ENDPROC
+ +#else
+ +      jmp paranoid_exit
+ +      CFI_ENDPROC
+ +#endif
+ +END(nmi)
+ +
+ +ENTRY(ignore_sysret)
+ +      CFI_STARTPROC
+ +      mov $-ENOSYS,%eax
+ +      sysret
+ +      CFI_ENDPROC
+ +END(ignore_sysret)
+ +
+ +/*
+ + * End of kprobes section
+ + */
+ +      .popsection
diff --combined arch/x86/kernel/irq_64.c

index 1d3d0e7,11c65e8..1df869e
--- 1/arch/x86/kernel/irq_64.c
--- 2/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@@ -13,11 -13,13 +13,12 @@@
   #include <linux/seq_file.h>
   #include <linux/module.h>
   #include <linux/delay.h>
+ #include <linux/ftrace.h>
   #include <asm/uaccess.h>
   #include <asm/io_apic.h>
   #include <asm/idle.h>
   #include <asm/smp.h>
   
- -#ifdef CONFIG_DEBUG_STACKOVERFLOW
   /*
    * Probabilistic stack overflow check:
    *
@@@ -27,25 -29,26 +28,25 @@@
    */
   static inline void stack_overflow_check(struct pt_regs *regs)
   {
+ +#ifdef CONFIG_DEBUG_STACKOVERFLOW
         u64 curbase = (u64)task_stack_page(current);
- -      static unsigned long warned = -60*HZ;
- -
- -      if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE &&
- -          regs->sp <  curbase + sizeof(struct thread_info) + 128 &&
- -          time_after(jiffies, warned + 60*HZ)) {
- -              printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
- -                     current->comm, curbase, regs->sp);
- -              show_stack(NULL,NULL);
- -              warned = jiffies;
- -      }
- -}
+ +
+ +      WARN_ONCE(regs->sp >= curbase &&
+ +                regs->sp <= curbase + THREAD_SIZE &&
+ +                regs->sp <  curbase + sizeof(struct thread_info) +
+ +                                      sizeof(struct pt_regs) + 128,
+ +
+ +                "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
+ +                      current->comm, curbase, regs->sp);
   #endif
+ +}
   
   /*
    * do_IRQ handles all normal device IRQ's (the special
    * SMP cross-CPU interrupts have their own specific
    * handlers).
    */
- asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
+ asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
   {
         struct pt_regs *old_regs = set_irq_regs(regs);
         struct irq_desc *desc;
@@@ -58,7 -61,9 +59,7 @@@
         irq_enter();
         irq = __get_cpu_var(vector_irq)[vector];
   
- -#ifdef CONFIG_DEBUG_STACKOVERFLOW
         stack_overflow_check(regs);
- -#endif
   
         desc = irq_to_desc(irq);
         if (likely(desc))
diff --combined arch/x86/kernel/process.c

index b8f3e9d,cff9a50..e68bb9e
--- 1/arch/x86/kernel/process.c
--- 2/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@@ -1,15 -1,14 +1,16 @@@
   #include <linux/errno.h>
   #include <linux/kernel.h>
   #include <linux/mm.h>
+ +#include <asm/idle.h>
   #include <linux/smp.h>
   #include <linux/slab.h>
   #include <linux/sched.h>
   #include <linux/module.h>
   #include <linux/pm.h>
   #include <linux/clockchips.h>
+ #include <linux/ftrace.h>
   #include <asm/system.h>
+ +#include <asm/apic.h>
   
   unsigned long idle_halt;
   EXPORT_SYMBOL(idle_halt);
@@@ -102,6 -101,9 +103,9 @@@ static inline int hlt_use_halt(void
   void default_idle(void)
   {
         if (hlt_use_halt()) {
+               struct power_trace it;
+ 
+               trace_power_start(&it, POWER_CSTATE, 1);
                 current_thread_info()->status &= ~TS_POLLING;
                 /*
                  * TS_POLLING-cleared state must be visible before we
@@@ -114,6 -116,7 +118,7 @@@
                 else
                         local_irq_enable();
                 current_thread_info()->status |= TS_POLLING;
+               trace_power_end(&it);
         } else {
                 local_irq_enable();
                 /* loop is done by the caller */
@@@ -124,21 -127,6 +129,21 @@@
   EXPORT_SYMBOL(default_idle);
   #endif
   
+ +void stop_this_cpu(void *dummy)
+ +{
+ +      local_irq_disable();
+ +      /*
+ +       * Remove this CPU:
+ +       */
+ +      cpu_clear(smp_processor_id(), cpu_online_map);
+ +      disable_local_APIC();
+ +
+ +      for (;;) {
+ +              if (hlt_works(smp_processor_id()))
+ +                      halt();
+ +      }
+ +}
+ +
   static void do_nothing(void *unused)
   {
   }
@@@ -171,24 -159,31 +176,31 @@@ EXPORT_SYMBOL_GPL(cpu_idle_wait)
    */
   void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
   {
+       struct power_trace it;
+ 
+       trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
         if (!need_resched()) {
                 __monitor((void *)&current_thread_info()->flags, 0, 0);
                 smp_mb();
                 if (!need_resched())
                         __mwait(ax, cx);
         }
+       trace_power_end(&it);
   }
   
   /* Default MONITOR/MWAIT with no hints, used for default C1 state */
   static void mwait_idle(void)
   {
+       struct power_trace it;
         if (!need_resched()) {
+               trace_power_start(&it, POWER_CSTATE, 1);
                 __monitor((void *)&current_thread_info()->flags, 0, 0);
                 smp_mb();
                 if (!need_resched())
                         __sti_mwait(0, 0);
                 else
                         local_irq_enable();
+               trace_power_end(&it);
         } else
                 local_irq_enable();
   }
@@@ -200,9 -195,13 +212,13 @@@
    */
   static void poll_idle(void)
   {
+       struct power_trace it;
+ 
+       trace_power_start(&it, POWER_CSTATE, 0);
         local_irq_enable();
         while (!need_resched())
                 cpu_relax();
+       trace_power_end(&it);
   }
   
   /*
diff --combined arch/x86/kernel/smpboot.c

index 7a430c4,f6174d2..f8500c9
--- 1/arch/x86/kernel/smpboot.c
--- 2/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@@ -62,7 -62,6 +62,7 @@@
   #include <asm/mtrr.h>
   #include <asm/vmi.h>
   #include <asm/genapic.h>
+ +#include <asm/setup.h>
   #include <linux/mc146818rtc.h>
   
   #include <mach_apic.h>
@@@ -288,7 -287,7 +288,7 @@@ static int __cpuinitdata unsafe_smp
   /*
    * Activate a secondary processor.
    */
- static void __cpuinit start_secondary(void *unused)
+ notrace static void __cpuinit start_secondary(void *unused)
   {
         /*
          * Don't put *anything* before cpu_init(), SMP booting is too
@@@ -535,7 -534,7 +535,7 @@@ static void impress_friends(void
         pr_debug("Before bogocount - setting activated=1.\n");
   }
   
- -static inline void __inquire_remote_apic(int apicid)
+ +void __inquire_remote_apic(int apicid)
   {
         unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
         char *names[] = { "ID", "VERSION", "SPIV" };
@@@ -574,13 -573,14 +574,13 @@@
         }
   }
   
- -#ifdef WAKE_SECONDARY_VIA_NMI
   /*
    * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
    * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
    * won't ... remember to clear down the APIC, etc later.
    */
- -static int __devinit
- -wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
+ +int __devinit
+ +wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
   {
         unsigned long send_status, accept_status = 0;
         int maxlvt;
@@@ -597,7 -597,7 +597,7 @@@
          * Give the other CPU some time to accept the IPI.
          */
         udelay(200);
- -      if (APIC_INTEGRATED(apic_version[phys_apicid])) {
+ +      if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
                 maxlvt = lapic_get_maxlvt();
                 if (maxlvt > 3)                 /* Due to the Pentium erratum 3AP.  */
                         apic_write(APIC_ESR, 0);
@@@ -612,9 -612,11 +612,9 @@@
   
         return (send_status | accept_status);
   }
- -#endif        /* WAKE_SECONDARY_VIA_NMI */
   
- -#ifdef WAKE_SECONDARY_VIA_INIT
- -static int __devinit
- -wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
+ +int __devinit
+ +wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
   {
         unsigned long send_status, accept_status = 0;
         int maxlvt, num_starts, j;
@@@ -733,6 -735,7 +733,6 @@@
   
         return (send_status | accept_status);
   }
- -#endif        /* WAKE_SECONDARY_VIA_INIT */
   
   struct create_idle {
         struct work_struct work;
@@@ -1081,10 -1084,8 +1081,10 @@@ static int __init smp_sanity_check(unsi
   #endif
   
         if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
- -              printk(KERN_WARNING "weird, boot CPU (#%d) not listed"
- -                                  "by the BIOS.\n", hard_smp_processor_id());
+ +              printk(KERN_WARNING
+ +                      "weird, boot CPU (#%d) not listed by the BIOS.\n",
+ +                      hard_smp_processor_id());
+ +
                 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
         }
   
diff --combined arch/x86/kernel/vsyscall_64.c

index ebf2f12,6f3d3d4..44153af
--- 1/arch/x86/kernel/vsyscall_64.c
--- 2/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@@ -17,6 -17,9 +17,9 @@@
    *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
    */
   
+ /* Disable profiling for userspace code: */
+ #define DISABLE_BRANCH_PROFILING
+ 
   #include <linux/time.h>
   #include <linux/init.h>
   #include <linux/kernel.h>
@@@ -128,16 -131,7 +131,16 @@@ static __always_inline void do_vgettime
                         gettimeofday(tv,NULL);
                         return;
                 }
+ +
+ +              /*
+ +               * Surround the RDTSC by barriers, to make sure it's not
+ +               * speculated to outside the seqlock critical section and
+ +               * does not cause time warps:
+ +               */
+ +              rdtsc_barrier();
                 now = vread();
+ +              rdtsc_barrier();
+ +
                 base = __vsyscall_gtod_data.clock.cycle_last;
                 mask = __vsyscall_gtod_data.clock.mask;
                 mult = __vsyscall_gtod_data.clock.mult;
diff --combined arch/x86/mm/fault.c

index 46b5f75,21e996a..57ec8c8
--- 1/arch/x86/mm/fault.c
--- 2/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@@ -53,7 -53,7 +53,7 @@@
   
   static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
   {
- #ifdef CONFIG_MMIOTRACE_HOOKS
+ #ifdef CONFIG_MMIOTRACE
         if (unlikely(is_kmmio_active()))
                 if (kmmio_handler(regs, addr) == 1)
                         return -1;
@@@ -393,7 -393,7 +393,7 @@@ static void show_fault_oops(struct pt_r
                 if (pte && pte_present(*pte) && !pte_exec(*pte))
                         printk(KERN_CRIT "kernel tried to execute "
                                 "NX-protected page - exploit attempt? "
- -                              "(uid: %d)\n", current->uid);
+ +                              "(uid: %d)\n", current_uid());
         }
   #endif
   
diff --combined include/linux/mm.h

index d3ddd73,9979d3f..aaa8b84
--- 1/include/linux/mm.h
--- 2/include/linux/mm.h
+++ b/include/linux/mm.h
@@@ -145,23 -145,6 +145,23 @@@ extern pgprot_t protection_map[16]
   #define FAULT_FLAG_WRITE      0x01    /* Fault was a write access */
   #define FAULT_FLAG_NONLINEAR  0x02    /* Fault was via a nonlinear mapping */
   
+ +/*
+ + * This interface is used by x86 PAT code to identify a pfn mapping that is
+ + * linear over entire vma. This is to optimize PAT code that deals with
+ + * marking the physical region with a particular prot. This is not for generic
+ + * mm use. Note also that this check will not work if the pfn mapping is
+ + * linear for a vma starting at physical address 0. In which case PAT code
+ + * falls back to slow path of reserving physical range page by page.
+ + */
+ +static inline int is_linear_pfn_mapping(struct vm_area_struct *vma)
+ +{
+ +      return ((vma->vm_flags & VM_PFNMAP) && vma->vm_pgoff);
+ +}
+ +
+ +static inline int is_pfn_mapping(struct vm_area_struct *vma)
+ +{
+ +      return (vma->vm_flags & VM_PFNMAP);
+ +}
   
   /*
    * vm_fault is filled by the the pagefault handler and passed to the vma's
@@@ -798,8 -781,6 +798,8 @@@ int copy_page_range(struct mm_struct *d
                         struct vm_area_struct *vma);
   void unmap_mapping_range(struct address_space *mapping,
                 loff_t const holebegin, loff_t const holelen, int even_cows);
+ +int follow_phys(struct vm_area_struct *vma, unsigned long address,
+ +              unsigned int flags, unsigned long *prot, resource_size_t *phys);
   int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
                         void *buf, int len, int write);
   
@@@ -1305,5 -1286,7 +1305,7 @@@ int vmemmap_populate_basepages(struct p
   int vmemmap_populate(struct page *start_page, unsigned long pages, int node);
   void vmemmap_populate_print_last(void);
   
+ extern void *alloc_locked_buffer(size_t size);
+ extern void free_locked_buffer(void *buffer, size_t size);
   #endif /* __KERNEL__ */
   #endif /* _LINUX_MM_H */
diff --combined include/linux/sched.h

index 9624e2c,dc5ea65..0a1094d
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -96,6 -96,7 +96,7 @@@ struct exec_domain
   struct futex_pi_state;
   struct robust_list_head;
   struct bio;
+ struct bts_tracer;
   
   /*
    * List of flags we want to share for kernel threads,
@@@ -572,6 -573,12 +573,6 @@@ struct signal_struct 
          */
         struct rlimit rlim[RLIM_NLIMITS];
   
- -      /* keep the process-shared keyrings here so that they do the right
- -       * thing in threads created with CLONE_THREAD */
- -#ifdef CONFIG_KEYS
- -      struct key *session_keyring;    /* keyring inherited over fork */
- -      struct key *process_keyring;    /* keyring private to this process */
- -#endif
   #ifdef CONFIG_BSD_PROCESS_ACCT
         struct pacct_struct pacct;      /* per-process accounting information */
   #endif
@@@ -642,7 -649,6 +643,7 @@@ struct user_struct 
         /* Hash table maintenance information */
         struct hlist_node uidhash_node;
         uid_t uid;
+ +      struct user_namespace *user_ns;
   
   #ifdef CONFIG_USER_SCHED
         struct task_group *tg;
@@@ -660,7 -666,6 +661,7 @@@ extern struct user_struct *find_user(ui
   extern struct user_struct root_user;
   #define INIT_USER (&root_user)
   
+ +
   struct backing_dev_info;
   struct reclaim_state;
   
@@@ -884,7 -889,38 +885,7 @@@ partition_sched_domains(int ndoms_new, 
   #endif        /* !CONFIG_SMP */
   
   struct io_context;                    /* See blkdev.h */
- -#define NGROUPS_SMALL         32
- -#define NGROUPS_PER_BLOCK     ((unsigned int)(PAGE_SIZE / sizeof(gid_t)))
- -struct group_info {
- -      int ngroups;
- -      atomic_t usage;
- -      gid_t small_block[NGROUPS_SMALL];
- -      int nblocks;
- -      gid_t *blocks[0];
- -};
   
- -/*
- - * get_group_info() must be called with the owning task locked (via task_lock())
- - * when task != current.  The reason being that the vast majority of callers are
- - * looking at current->group_info, which can not be changed except by the
- - * current task.  Changing current->group_info requires the task lock, too.
- - */
- -#define get_group_info(group_info) do { \
- -      atomic_inc(&(group_info)->usage); \
- -} while (0)
- -
- -#define put_group_info(group_info) do { \
- -      if (atomic_dec_and_test(&(group_info)->usage)) \
- -              groups_free(group_info); \
- -} while (0)
- -
- -extern struct group_info *groups_alloc(int gidsetsize);
- -extern void groups_free(struct group_info *group_info);
- -extern int set_current_groups(struct group_info *group_info);
- -extern int groups_search(struct group_info *group_info, gid_t grp);
- -/* access the groups "array" with this macro */
- -#define GROUP_AT(gi, i) \
- -    ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK])
   
   #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK
   extern void prefetch_stack(struct task_struct *t);
@@@ -1130,6 -1166,19 +1131,19 @@@ struct task_struct 
         struct list_head ptraced;
         struct list_head ptrace_entry;
   
+ #ifdef CONFIG_X86_PTRACE_BTS
+       /*
+        * This is the tracer handle for the ptrace BTS extension.
+        * This field actually belongs to the ptracer task.
+        */
+       struct bts_tracer *bts;
+       /*
+        * The buffer to hold the BTS data.
+        */
+       void *bts_buffer;
+       size_t bts_size;
+ #endif /* CONFIG_X86_PTRACE_BTS */
+ 
         /* PID/PID hash table linkage. */
         struct pid_link pids[PIDTYPE_MAX];
         struct list_head thread_group;
@@@ -1151,12 -1200,17 +1165,12 @@@
         struct list_head cpu_timers[3];
   
   /* process credentials */
- -      uid_t uid,euid,suid,fsuid;
- -      gid_t gid,egid,sgid,fsgid;
- -      struct group_info *group_info;
- -      kernel_cap_t   cap_effective, cap_inheritable, cap_permitted, cap_bset;
- -      struct user_struct *user;
- -      unsigned securebits;
- -#ifdef CONFIG_KEYS
- -      unsigned char jit_keyring;      /* default keyring to attach requested keys to */
- -      struct key *request_key_auth;   /* assumed request_key authority */
- -      struct key *thread_keyring;     /* keyring private to this thread */
- -#endif
+ +      const struct cred *real_cred;   /* objective and real subjective task
+ +                                       * credentials (COW) */
+ +      const struct cred *cred;        /* effective (overridable) subjective task
+ +                                       * credentials (COW) */
+ +      struct mutex cred_exec_mutex;   /* execve vs ptrace cred calculation mutex */
+ +
         char comm[TASK_COMM_LEN]; /* executable name excluding path
                                      - access with [gs]et_task_comm (which lock
                                        it with task_lock())
@@@ -1193,6 -1247,9 +1207,6 @@@
         int (*notifier)(void *priv);
         void *notifier_data;
         sigset_t *notifier_mask;
- -#ifdef CONFIG_SECURITY
- -      void *security;
- -#endif
         struct audit_context *audit_context;
   #ifdef CONFIG_AUDITSYSCALL
         uid_t loginuid;
@@@ -1313,6 -1370,23 +1327,23 @@@
         unsigned long default_timer_slack_ns;
   
         struct list_head        *scm_work_list;
+ #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       /* Index of current stored address in ret_stack */
+       int curr_ret_stack;
+       /* Stack of return addresses for return function tracing */
+       struct ftrace_ret_stack *ret_stack;
+       /*
+        * Number of functions that haven't been traced
+        * because of depth overrun.
+        */
+       atomic_t trace_overrun;
+       /* Pause for the tracing */
+       atomic_t tracing_graph_pause;
+ #endif
+ #ifdef CONFIG_TRACING
+       /* state flags for use by tracers */
+       unsigned long trace;
+ #endif
   };
   
   /*
@@@ -1732,6 -1806,7 +1763,6 @@@ static inline struct user_struct *get_u
         return u;
   }
   extern void free_uid(struct user_struct *);
- -extern void switch_uid(struct user_struct *);
   extern void release_uids(struct user_namespace *ns);
   
   #include <asm/current.h>
@@@ -1750,6 -1825,9 +1781,6 @@@ extern void wake_up_new_task(struct tas
   extern void sched_fork(struct task_struct *p, int clone_flags);
   extern void sched_dead(struct task_struct *p);
   
- -extern int in_group_p(gid_t);
- -extern int in_egroup_p(gid_t);
- -
   extern void proc_caches_init(void);
   extern void flush_signals(struct task_struct *);
   extern void ignore_signals(struct task_struct *);
@@@ -1881,8 -1959,6 +1912,8 @@@ static inline unsigned long wait_task_i
   #define for_each_process(p) \
         for (p = &init_task ; (p = next_task(p)) != &init_task ; )
   
+ +extern bool is_single_threaded(struct task_struct *);
+ +
   /*
    * Careful: do_each_thread/while_each_thread is a double loop so
    *          'break' will not work as expected - use goto instead.
diff --combined include/linux/tty.h

index 580700f,eaec37c..3f4954c
--- 1/include/linux/tty.h
--- 2/include/linux/tty.h
+++ b/include/linux/tty.h
@@@ -325,7 -325,7 +325,7 @@@ extern struct class *tty_class
    *    go away
    */
   
- extern inline struct tty_struct *tty_kref_get(struct tty_struct *tty)
+ static inline struct tty_struct *tty_kref_get(struct tty_struct *tty)
   {
         if (tty)
                 kref_get(&tty->kref);
@@@ -442,7 -442,6 +442,7 @@@ extern void tty_audit_add_data(struct t
                                size_t size);
   extern void tty_audit_exit(void);
   extern void tty_audit_fork(struct signal_struct *sig);
+ +extern void tty_audit_tiocsti(struct tty_struct *tty, char ch);
   extern void tty_audit_push(struct tty_struct *tty);
   extern void tty_audit_push_task(struct task_struct *tsk,
                                         uid_t loginuid, u32 sessionid);
@@@ -451,9 -450,6 +451,9 @@@ static inline void tty_audit_add_data(s
                                       unsigned char *data, size_t size)
   {
   }
+ +static inline void tty_audit_tiocsti(struct tty_struct *tty, char ch)
+ +{
+ +}
   static inline void tty_audit_exit(void)
   {
   }
diff --combined init/main.c

index db843bf,79213c0..17e9757
--- 1/init/main.c
--- 2/init/main.c
+++ b/init/main.c
@@@ -63,6 -63,7 +63,7 @@@
   #include <linux/signal.h>
   #include <linux/idr.h>
   #include <linux/ftrace.h>
+ #include <trace/boot.h>
   
   #include <asm/io.h>
   #include <asm/bugs.h>
@@@ -669,7 -670,6 +670,7 @@@ asmlinkage void __init start_kernel(voi
                 efi_enter_virtual_mode();
   #endif
         thread_info_cache_init();
+ +      cred_init();
         fork_init(num_physpages);
         proc_caches_init();
         buffer_init();
@@@ -704,31 -704,35 +705,35 @@@ core_param(initcall_debug, initcall_deb
   int do_one_initcall(initcall_t fn)
   {
         int count = preempt_count();
-       ktime_t delta;
+       ktime_t calltime, delta, rettime;
         char msgbuf[64];
-       struct boot_trace it;
+       struct boot_trace_call call;
+       struct boot_trace_ret ret;
   
         if (initcall_debug) {
-               it.caller = task_pid_nr(current);
-               printk("calling  %pF @ %i\n", fn, it.caller);
-               it.calltime = ktime_get();
+               call.caller = task_pid_nr(current);
+               printk("calling  %pF @ %i\n", fn, call.caller);
+               calltime = ktime_get();
+               trace_boot_call(&call, fn);
+               enable_boot_trace();
         }
   
-       it.result = fn();
+       ret.result = fn();
   
         if (initcall_debug) {
-               it.rettime = ktime_get();
-               delta = ktime_sub(it.rettime, it.calltime);
-               it.duration = (unsigned long long) delta.tv64 >> 10;
+               disable_boot_trace();
+               rettime = ktime_get();
+               delta = ktime_sub(rettime, calltime);
+               ret.duration = (unsigned long long) ktime_to_ns(delta) >> 10;
+               trace_boot_ret(&ret, fn);
                 printk("initcall %pF returned %d after %Ld usecs\n", fn,
-                       it.result, it.duration);
-               trace_boot(&it, fn);
+                       ret.result, ret.duration);
         }
   
         msgbuf[0] = 0;
   
-       if (it.result && it.result != -ENODEV && initcall_debug)
-               sprintf(msgbuf, "error code %d ", it.result);
+       if (ret.result && ret.result != -ENODEV && initcall_debug)
+               sprintf(msgbuf, "error code %d ", ret.result);
   
         if (preempt_count() != count) {
                 strlcat(msgbuf, "preemption imbalance ", sizeof(msgbuf));
@@@ -742,7 -746,7 +747,7 @@@
                 printk("initcall %pF returned with %s\n", fn, msgbuf);
         }
   
-       return it.result;
+       return ret.result;
   }
   
   
@@@ -883,7 -887,7 +888,7 @@@ static int __init kernel_init(void * un
          * we're essentially up and running. Get rid of the
          * initmem segments and start the user-mode stuff..
          */
-       stop_boot_trace();
+ 
         init_post();
         return 0;
   }
diff --combined kernel/exit.c

index ccb8716,e5ae36e..c7422ca
--- 1/kernel/exit.c
--- 2/kernel/exit.c
+++ b/kernel/exit.c
@@@ -46,15 -46,17 +46,19 @@@
   #include <linux/blkdev.h>
   #include <linux/task_io_accounting_ops.h>
   #include <linux/tracehook.h>
+ +#include <linux/init_task.h>
   #include <trace/sched.h>
   
   #include <asm/uaccess.h>
   #include <asm/unistd.h>
   #include <asm/pgtable.h>
   #include <asm/mmu_context.h>
+ +#include "cred-internals.h"
   
+ DEFINE_TRACE(sched_process_free);
+ DEFINE_TRACE(sched_process_exit);
+ DEFINE_TRACE(sched_process_wait);
+ 
   static void exit_mm(struct task_struct * tsk);
   
   static inline int task_detached(struct task_struct *p)
@@@ -166,10 -168,7 +170,10 @@@ void release_task(struct task_struct * 
         int zap_leader;
   repeat:
         tracehook_prepare_release_task(p);
- -      atomic_dec(&p->user->processes);
+ +      /* don't need to get the RCU readlock here - the process is dead and
+ +       * can't be modifying its own credentials */
+ +      atomic_dec(&__task_cred(p)->user->processes);
+ +
         proc_flush_task(p);
         write_lock_irq(&tasklist_lock);
         tracehook_finish_release_task(p);
@@@ -344,12 -343,12 +348,12 @@@ static void reparent_to_kthreadd(void
         /* cpus_allowed? */
         /* rt_priority? */
         /* signals? */
- -      security_task_reparent_to_init(current);
         memcpy(current->signal->rlim, init_task.signal->rlim,
                sizeof(current->signal->rlim));
- -      atomic_inc(&(INIT_USER->__count));
+ +
+ +      atomic_inc(&init_cred.usage);
+ +      commit_creds(&init_cred);
         write_unlock_irq(&tasklist_lock);
- -      switch_uid(INIT_USER);
   }
   
   void __set_special_pids(struct pid *pid)
@@@ -1083,6 -1082,7 +1087,6 @@@ NORET_TYPE void do_exit(long code
         check_stack_usage();
         exit_thread();
         cgroup_exit(tsk, 1);
- -      exit_keys(tsk);
   
         if (group_dead && tsk->signal->leader)
                 disassociate_ctty(1);
@@@ -1127,7 -1127,6 +1131,6 @@@
         preempt_disable();
         /* causes final put_task_struct in finish_task_switch(). */
         tsk->state = TASK_DEAD;
- 
         schedule();
         BUG();
         /* Avoid "noreturn function does return".  */
@@@ -1267,12 -1266,12 +1270,12 @@@ static int wait_task_zombie(struct task
         unsigned long state;
         int retval, status, traced;
         pid_t pid = task_pid_vnr(p);
+ +      uid_t uid = __task_cred(p)->uid;
   
         if (!likely(options & WEXITED))
                 return 0;
   
         if (unlikely(options & WNOWAIT)) {
- -              uid_t uid = p->uid;
                 int exit_code = p->exit_code;
                 int why, status;
   
@@@ -1393,7 -1392,7 +1396,7 @@@
         if (!retval && infop)
                 retval = put_user(pid, &infop->si_pid);
         if (!retval && infop)
- -              retval = put_user(p->uid, &infop->si_uid);
+ +              retval = put_user(uid, &infop->si_uid);
         if (!retval)
                 retval = pid;
   
@@@ -1458,8 -1457,7 +1461,8 @@@ static int wait_task_stopped(int ptrace
         if (!unlikely(options & WNOWAIT))
                 p->exit_code = 0;
   
- -      uid = p->uid;
+ +      /* don't need the RCU readlock here as we're holding a spinlock */
+ +      uid = __task_cred(p)->uid;
   unlock_sig:
         spin_unlock_irq(&p->sighand->siglock);
         if (!exit_code)
@@@ -1533,10 -1531,10 +1536,10 @@@ static int wait_task_continued(struct t
         }
         if (!unlikely(options & WNOWAIT))
                 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
+ +      uid = __task_cred(p)->uid;
         spin_unlock_irq(&p->sighand->siglock);
   
         pid = task_pid_vnr(p);
- -      uid = p->uid;
         get_task_struct(p);
         read_unlock(&tasklist_lock);
   
diff --combined kernel/fork.c

index 4e8ca23,65ce60a..6144b36
--- 1/kernel/fork.c
--- 2/kernel/fork.c
+++ b/kernel/fork.c
@@@ -47,6 -47,7 +47,7 @@@
   #include <linux/mount.h>
   #include <linux/audit.h>
   #include <linux/memcontrol.h>
+ #include <linux/ftrace.h>
   #include <linux/profile.h>
   #include <linux/rmap.h>
   #include <linux/acct.h>
@@@ -80,6 -81,8 +81,8 @@@ DEFINE_PER_CPU(unsigned long, process_c
   
   __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
   
+ DEFINE_TRACE(sched_process_fork);
+ 
   int nr_processes(void)
   {
         int cpu;
@@@ -137,6 -140,7 +140,7 @@@ void free_task(struct task_struct *tsk
         prop_local_destroy_single(&tsk->dirties);
         free_thread_info(tsk->stack);
         rt_mutex_debug_task_free(tsk);
+       ftrace_graph_exit_task(tsk);
         free_task_struct(tsk);
   }
   EXPORT_SYMBOL(free_task);
@@@ -147,8 -151,9 +151,8 @@@ void __put_task_struct(struct task_stru
         WARN_ON(atomic_read(&tsk->usage));
         WARN_ON(tsk == current);
   
- -      security_task_free(tsk);
- -      free_uid(tsk->user);
- -      put_group_info(tsk->group_info);
+ +      put_cred(tsk->real_cred);
+ +      put_cred(tsk->cred);
         delayacct_tsk_free(tsk);
   
         if (!profile_handoff_task(tsk))
@@@ -817,6 -822,12 +821,6 @@@ static int copy_signal(unsigned long cl
         if (!sig)
                 return -ENOMEM;
   
- -      ret = copy_thread_group_keys(tsk);
- -      if (ret < 0) {
- -              kmem_cache_free(signal_cachep, sig);
- -              return ret;
- -      }
- -
         atomic_set(&sig->count, 1);
         atomic_set(&sig->live, 1);
         init_waitqueue_head(&sig->wait_chldexit);
@@@ -861,6 -872,7 +865,6 @@@
   void __cleanup_signal(struct signal_struct *sig)
   {
         thread_group_cputime_free(sig);
- -      exit_thread_group_keys(sig);
         tty_kref_put(sig->tty);
         kmem_cache_free(signal_cachep, sig);
   }
@@@ -976,16 -988,16 +980,16 @@@ static struct task_struct *copy_process
         DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
   #endif
         retval = -EAGAIN;
- -      if (atomic_read(&p->user->processes) >=
+ +      if (atomic_read(&p->real_cred->user->processes) >=
                         p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
                 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
- -                  p->user != current->nsproxy->user_ns->root_user)
+ +                  p->real_cred->user != INIT_USER)
                         goto bad_fork_free;
         }
   
- -      atomic_inc(&p->user->__count);
- -      atomic_inc(&p->user->processes);
- -      get_group_info(p->group_info);
+ +      retval = copy_creds(p, clone_flags);
+ +      if (retval < 0)
+ +              goto bad_fork_free;
   
         /*
          * If multiple threads are within copy_process(), then this check
@@@ -1040,6 -1052,10 +1044,6 @@@
         do_posix_clock_monotonic_gettime(&p->start_time);
         p->real_start_time = p->start_time;
         monotonic_to_bootbased(&p->real_start_time);
- -#ifdef CONFIG_SECURITY
- -      p->security = NULL;
- -#endif
- -      p->cap_bset = current->cap_bset;
         p->io_context = NULL;
         p->audit_context = NULL;
         cgroup_fork(p);
@@@ -1080,12 -1096,16 +1084,14 @@@
   #ifdef CONFIG_DEBUG_MUTEXES
         p->blocked_on = NULL; /* not blocked yet */
   #endif
+       if (unlikely(ptrace_reparented(current)))
+               ptrace_fork(p, clone_flags);
   
         /* Perform scheduler related setup. Assign this task to a CPU. */
         sched_fork(p, clone_flags);
   
- -      if ((retval = security_task_alloc(p)))
- -              goto bad_fork_cleanup_policy;
         if ((retval = audit_alloc(p)))
- -              goto bad_fork_cleanup_security;
+ +              goto bad_fork_cleanup_policy;
         /* copy all the process information */
         if ((retval = copy_semundo(clone_flags, p)))
                 goto bad_fork_cleanup_audit;
@@@ -1099,8 -1119,10 +1105,8 @@@
                 goto bad_fork_cleanup_sighand;
         if ((retval = copy_mm(clone_flags, p)))
                 goto bad_fork_cleanup_signal;
- -      if ((retval = copy_keys(clone_flags, p)))
- -              goto bad_fork_cleanup_mm;
         if ((retval = copy_namespaces(clone_flags, p)))
- -              goto bad_fork_cleanup_keys;
+ +              goto bad_fork_cleanup_mm;
         if ((retval = copy_io(clone_flags, p)))
                 goto bad_fork_cleanup_namespaces;
         retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
@@@ -1120,6 -1142,8 +1126,8 @@@
                 }
         }
   
+       ftrace_graph_init_task(p);
+ 
         p->pid = pid_nr(pid);
         p->tgid = p->pid;
         if (clone_flags & CLONE_THREAD)
@@@ -1128,7 -1152,7 +1136,7 @@@
         if (current->nsproxy != p->nsproxy) {
                 retval = ns_cgroup_clone(p, pid);
                 if (retval)
-                       goto bad_fork_free_pid;
+                       goto bad_fork_free_graph;
         }
   
         p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
@@@ -1221,7 -1245,7 +1229,7 @@@
                 spin_unlock(&current->sighand->siglock);
                 write_unlock_irq(&tasklist_lock);
                 retval = -ERESTARTNOINTR;
-               goto bad_fork_free_pid;
+               goto bad_fork_free_graph;
         }
   
         if (clone_flags & CLONE_THREAD) {
@@@ -1258,6 -1282,8 +1266,8 @@@
         cgroup_post_fork(p);
         return p;
   
+ bad_fork_free_graph:
+       ftrace_graph_exit_task(p);
   bad_fork_free_pid:
         if (pid != &init_struct_pid)
                 free_pid(pid);
@@@ -1265,6 -1291,8 +1275,6 @@@ bad_fork_cleanup_io
         put_io_context(p->io_context);
   bad_fork_cleanup_namespaces:
         exit_task_namespaces(p);
- -bad_fork_cleanup_keys:
- -      exit_keys(p);
   bad_fork_cleanup_mm:
         if (p->mm)
                 mmput(p->mm);
@@@ -1280,6 -1308,8 +1290,6 @@@ bad_fork_cleanup_semundo
         exit_sem(p);
   bad_fork_cleanup_audit:
         audit_free(p);
- -bad_fork_cleanup_security:
- -      security_task_free(p);
   bad_fork_cleanup_policy:
   #ifdef CONFIG_NUMA
         mpol_put(p->mempolicy);
@@@ -1292,9 -1322,9 +1302,9 @@@ bad_fork_cleanup_cgroup
   bad_fork_cleanup_put_domain:
         module_put(task_thread_info(p)->exec_domain->module);
   bad_fork_cleanup_count:
- -      put_group_info(p->group_info);
- -      atomic_dec(&p->user->processes);
- -      free_uid(p->user);
+ +      atomic_dec(&p->cred->user->processes);
+ +      put_cred(p->real_cred);
+ +      put_cred(p->cred);
   bad_fork_free:
         free_task(p);
   fork_out:
@@@ -1338,21 -1368,6 +1348,21 @@@ long do_fork(unsigned long clone_flags
         long nr;
   
         /*
+ +       * Do some preliminary argument and permissions checking before we
+ +       * actually start allocating stuff
+ +       */
+ +      if (clone_flags & CLONE_NEWUSER) {
+ +              if (clone_flags & CLONE_THREAD)
+ +                      return -EINVAL;
+ +              /* hopefully this check will go away when userns support is
+ +               * complete
+ +               */
+ +              if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
+ +                              !capable(CAP_SETGID))
+ +                      return -EPERM;
+ +      }
+ +
+ +      /*
          * We hope to recycle these flags after 2.6.26
          */
         if (unlikely(clone_flags & CLONE_STOPPED)) {
@@@ -1600,7 -1615,8 +1610,7 @@@ asmlinkage long sys_unshare(unsigned lo
         err = -EINVAL;
         if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
                                 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
- -                              CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER|
- -                              CLONE_NEWNET))
+ +                              CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
                 goto bad_unshare_out;
   
         /*
diff --combined kernel/ptrace.c

index ca2df68,100a71c..29dc700
--- 1/kernel/ptrace.c
--- 2/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@@ -25,6 -25,17 +25,17 @@@
   #include <asm/pgtable.h>
   #include <asm/uaccess.h>
   
+ 
+ /*
+  * Initialize a new task whose father had been ptraced.
+  *
+  * Called from copy_process().
+  */
+ void ptrace_fork(struct task_struct *child, unsigned long clone_flags)
+ {
+       arch_ptrace_fork(child, clone_flags);
+ }
+ 
   /*
    * ptrace a task: make the debugger its new parent and
    * move it to the ptrace list.
@@@ -72,6 -83,7 +83,7 @@@ void __ptrace_unlink(struct task_struc
         child->parent = child->real_parent;
         list_del_init(&child->ptrace_entry);
   
+       arch_ptrace_untrace(child);
         if (task_is_traced(child))
                 ptrace_untrace(child);
   }
@@@ -115,8 -127,6 +127,8 @@@ int ptrace_check_attach(struct task_str
   
   int __ptrace_may_access(struct task_struct *task, unsigned int mode)
   {
+ +      const struct cred *cred = current_cred(), *tcred;
+ +
         /* May we inspect the given task?
          * This check is used both for attaching with ptrace
          * and for allowing access to sensitive information in /proc.
@@@ -129,19 -139,13 +141,19 @@@
         /* Don't let security modules deny introspection */
         if (task == current)
                 return 0;
- -      if (((current->uid != task->euid) ||
- -           (current->uid != task->suid) ||
- -           (current->uid != task->uid) ||
- -           (current->gid != task->egid) ||
- -           (current->gid != task->sgid) ||
- -           (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
+ +      rcu_read_lock();
+ +      tcred = __task_cred(task);
+ +      if ((cred->uid != tcred->euid ||
+ +           cred->uid != tcred->suid ||
+ +           cred->uid != tcred->uid  ||
+ +           cred->gid != tcred->egid ||
+ +           cred->gid != tcred->sgid ||
+ +           cred->gid != tcred->gid) &&
+ +          !capable(CAP_SYS_PTRACE)) {
+ +              rcu_read_unlock();
                 return -EPERM;
+ +      }
+ +      rcu_read_unlock();
         smp_rmb();
         if (task->mm)
                 dumpable = get_dumpable(task->mm);
@@@ -171,14 -175,6 +183,14 @@@ int ptrace_attach(struct task_struct *t
         if (same_thread_group(task, current))
                 goto out;
   
+ +      /* Protect exec's credential calculations against our interference;
+ +       * SUID, SGID and LSM creds get determined differently under ptrace.
+ +       */
+ +      retval = mutex_lock_interruptible(&current->cred_exec_mutex);
+ +      if (retval  < 0)
+ +              goto out;
+ +
+ +      retval = -EPERM;
   repeat:
         /*
          * Nasty, nasty.
@@@ -218,7 -214,6 +230,7 @@@
   bad:
         write_unlock_irqrestore(&tasklist_lock, flags);
         task_unlock(task);
+ +      mutex_unlock(&current->cred_exec_mutex);
   out:
         return retval;
   }
diff --combined kernel/sched.c

index 33cf4a1,dcb39bc..3798b95
--- 1/kernel/sched.c
--- 2/kernel/sched.c
+++ b/kernel/sched.c
@@@ -118,6 -118,12 +118,12 @@@
    */
   #define RUNTIME_INF   ((u64)~0ULL)
   
+ DEFINE_TRACE(sched_wait_task);
+ DEFINE_TRACE(sched_wakeup);
+ DEFINE_TRACE(sched_wakeup_new);
+ DEFINE_TRACE(sched_switch);
+ DEFINE_TRACE(sched_migrate_task);
+ 
   #ifdef CONFIG_SMP
   /*
    * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
@@@ -345,9 -351,7 +351,9 @@@ static inline struct task_group *task_g
         struct task_group *tg;
   
   #ifdef CONFIG_USER_SCHED
- -      tg = p->user->tg;
+ +      rcu_read_lock();
+ +      tg = __task_cred(p)->user->tg;
+ +      rcu_read_unlock();
   #elif defined(CONFIG_CGROUP_SCHED)
         tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
                                 struct task_group, css);
@@@ -1847,6 -1851,8 +1853,8 @@@ void set_task_cpu(struct task_struct *p
   
         clock_offset = old_rq->clock - new_rq->clock;
   
+       trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+ 
   #ifdef CONFIG_SCHEDSTATS
         if (p->se.wait_start)
                 p->se.wait_start -= clock_offset;
@@@ -2318,7 -2324,7 +2326,7 @@@ out_activate
         success = 1;
   
   out_running:
-       trace_sched_wakeup(rq, p);
+       trace_sched_wakeup(rq, p, success);
         check_preempt_curr(rq, p, sync);
   
         p->state = TASK_RUNNING;
@@@ -2451,7 -2457,7 +2459,7 @@@ void wake_up_new_task(struct task_struc
                 p->sched_class->task_new(rq, p);
                 inc_nr_running(rq);
         }
-       trace_sched_wakeup_new(rq, p);
+       trace_sched_wakeup_new(rq, p, 1);
         check_preempt_curr(rq, p, 0);
   #ifdef CONFIG_SMP
         if (p->sched_class->task_wake_up)
@@@ -2864,7 -2870,6 +2872,6 @@@ static void sched_migrate_task(struct t
             || unlikely(!cpu_active(dest_cpu)))
                 goto out;
   
-       trace_sched_migrate_task(rq, p, dest_cpu);
         /* force the process onto the specified CPU */
         if (migrate_task(p, dest_cpu, &req)) {
                 /* Need to wait for migration thread (might exit: take ref). */
@@@ -5136,22 -5141,6 +5143,22 @@@ __setscheduler(struct rq *rq, struct ta
         set_load_weight(p);
   }
   
+ +/*
+ + * check that the current process's EUID matches the target's EUID or UID
+ + */
+ +static bool check_same_owner(struct task_struct *p)
+ +{
+ +      const struct cred *cred = current_cred(), *pcred;
+ +      bool match;
+ +
+ +      rcu_read_lock();
+ +      pcred = __task_cred(p);
+ +      match = (cred->euid == pcred->euid ||
+ +               cred->euid == pcred->uid);
+ +      rcu_read_unlock();
+ +      return match;
+ +}
+ +
   static int __sched_setscheduler(struct task_struct *p, int policy,
                                 struct sched_param *param, bool user)
   {
@@@ -5211,7 -5200,8 +5218,7 @@@ recheck
                         return -EPERM;
   
                 /* can't change other user's priorities */
- -              if ((current->euid != p->euid) &&
- -                  (current->euid != p->uid))
+ +              if (!check_same_owner(p))
                         return -EPERM;
         }
   
@@@ -5443,7 -5433,8 +5450,7 @@@ long sched_setaffinity(pid_t pid, cons
         read_unlock(&tasklist_lock);
   
         retval = -EPERM;
- -      if ((current->euid != p->euid) && (current->euid != p->uid) &&
- -                      !capable(CAP_SYS_NICE))
+ +      if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
                 goto out_unlock;
   
         retval = security_task_setscheduler(p, 0, NULL);
@@@ -5912,6 -5903,7 +5919,7 @@@ void __cpuinit init_idle(struct task_st
          * The idle tasks have their own, simple scheduling class:
          */
         idle->sched_class = &idle_sched_class;
+       ftrace_graph_init_task(idle);
   }
   
   /*
diff --combined kernel/signal.c

index 2a64304,e9afe63..8e95855
--- 1/kernel/signal.c
--- 2/kernel/signal.c
+++ b/kernel/signal.c
@@@ -41,6 -41,8 +41,8 @@@
   
   static struct kmem_cache *sigqueue_cachep;
   
+ DEFINE_TRACE(sched_signal_send);
+ 
   static void __user *sig_handler(struct task_struct *t, int sig)
   {
         return t->sighand->action[sig - 1].sa.sa_handler;
@@@ -177,11 -179,6 +179,11 @@@ int next_signal(struct sigpending *pend
         return sig;
   }
   
+ +/*
+ + * allocate a new signal queue record
+ + * - this may be called without locks if and only if t == current, otherwise an
+ + *   appropriate lock must be held to stop the target task from exiting
+ + */
   static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
                                          int override_rlimit)
   {
@@@ -189,12 -186,11 +191,12 @@@
         struct user_struct *user;
   
         /*
- -       * In order to avoid problems with "switch_user()", we want to make
- -       * sure that the compiler doesn't re-load "t->user"
+ +       * We won't get problems with the target's UID changing under us
+ +       * because changing it requires RCU be used, and if t != current, the
+ +       * caller must be holding the RCU readlock (by way of a spinlock) and
+ +       * we use RCU protection here
          */
- -      user = t->user;
- -      barrier();
+ +      user = get_uid(__task_cred(t)->user);
         atomic_inc(&user->sigpending);
         if (override_rlimit ||
             atomic_read(&user->sigpending) <=
@@@ -202,14 -198,12 +204,14 @@@
                 q = kmem_cache_alloc(sigqueue_cachep, flags);
         if (unlikely(q == NULL)) {
                 atomic_dec(&user->sigpending);
+ +              free_uid(user);
         } else {
                 INIT_LIST_HEAD(&q->list);
                 q->flags = 0;
- -              q->user = get_uid(user);
+ +              q->user = user;
         }
- -      return(q);
+ +
+ +      return q;
   }
   
   static void __sigqueue_free(struct sigqueue *q)
@@@ -570,12 -564,10 +572,12 @@@ static int rm_from_queue(unsigned long 
   
   /*
    * Bad permissions for sending the signal
+ + * - the caller must hold at least the RCU read lock
    */
   static int check_kill_permission(int sig, struct siginfo *info,
                                  struct task_struct *t)
   {
+ +      const struct cred *cred = current_cred(), *tcred;
         struct pid *sid;
         int error;
   
@@@ -589,11 -581,8 +591,11 @@@
         if (error)
                 return error;
   
- -      if ((current->euid ^ t->suid) && (current->euid ^ t->uid) &&
- -          (current->uid  ^ t->suid) && (current->uid  ^ t->uid) &&
+ +      tcred = __task_cred(t);
+ +      if ((cred->euid ^ tcred->suid) &&
+ +          (cred->euid ^ tcred->uid) &&
+ +          (cred->uid  ^ tcred->suid) &&
+ +          (cred->uid  ^ tcred->uid) &&
             !capable(CAP_KILL)) {
                 switch (sig) {
                 case SIGCONT:
@@@ -857,7 -846,7 +859,7 @@@ static int send_signal(int sig, struct 
                         q->info.si_errno = 0;
                         q->info.si_code = SI_USER;
                         q->info.si_pid = task_pid_vnr(current);
- -                      q->info.si_uid = current->uid;
+ +                      q->info.si_uid = current_uid();
                         break;
                 case (unsigned long) SEND_SIG_PRIV:
                         q->info.si_signo = sig;
@@@ -1021,10 -1010,6 +1023,10 @@@ struct sighand_struct *lock_task_sighan
         return sighand;
   }
   
+ +/*
+ + * send signal info to all the members of a group
+ + * - the caller must hold the RCU read lock at least
+ + */
   int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
   {
         unsigned long flags;
@@@ -1046,8 -1031,8 +1048,8 @@@
   /*
    * __kill_pgrp_info() sends a signal to a process group: this is what the tty
    * control characters do (^C, ^Z etc)
+ + * - the caller must hold at least a readlock on tasklist_lock
    */
- -
   int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp)
   {
         struct task_struct *p = NULL;
@@@ -1103,7 -1088,6 +1105,7 @@@ int kill_pid_info_as_uid(int sig, struc
   {
         int ret = -EINVAL;
         struct task_struct *p;
+ +      const struct cred *pcred;
   
         if (!valid_signal(sig))
                 return ret;
@@@ -1114,11 -1098,9 +1116,11 @@@
                 ret = -ESRCH;
                 goto out_unlock;
         }
- -      if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info)))
- -          && (euid != p->suid) && (euid != p->uid)
- -          && (uid != p->suid) && (uid != p->uid)) {
+ +      pcred = __task_cred(p);
+ +      if ((info == SEND_SIG_NOINFO ||
+ +           (!is_si_special(info) && SI_FROMUSER(info))) &&
+ +          euid != pcred->suid && euid != pcred->uid &&
+ +          uid  != pcred->suid && uid  != pcred->uid) {
                 ret = -EPERM;
                 goto out_unlock;
         }
@@@ -1389,9 -1371,10 +1391,9 @@@ int do_notify_parent(struct task_struc
          */
         rcu_read_lock();
         info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
+ +      info.si_uid = __task_cred(tsk)->uid;
         rcu_read_unlock();
   
- -      info.si_uid = tsk->uid;
- -
         thread_group_cputime(tsk, &cputime);
         info.si_utime = cputime_to_jiffies(cputime.utime);
         info.si_stime = cputime_to_jiffies(cputime.stime);
@@@ -1459,9 -1442,10 +1461,9 @@@ static void do_notify_parent_cldstop(st
          */
         rcu_read_lock();
         info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
+ +      info.si_uid = __task_cred(tsk)->uid;
         rcu_read_unlock();
   
- -      info.si_uid = tsk->uid;
- -
         info.si_utime = cputime_to_clock_t(tsk->utime);
         info.si_stime = cputime_to_clock_t(tsk->stime);
   
@@@ -1616,7 -1600,7 +1618,7 @@@ void ptrace_notify(int exit_code
         info.si_signo = SIGTRAP;
         info.si_code = exit_code;
         info.si_pid = task_pid_vnr(current);
- -      info.si_uid = current->uid;
+ +      info.si_uid = current_uid();
   
         /* Let the debugger run.  */
         spin_lock_irq(&current->sighand->siglock);
@@@ -1728,7 -1712,7 +1730,7 @@@ static int ptrace_signal(int signr, sig
                 info->si_errno = 0;
                 info->si_code = SI_USER;
                 info->si_pid = task_pid_vnr(current->parent);
- -              info->si_uid = current->parent->uid;
+ +              info->si_uid = task_uid(current->parent);
         }
   
         /* If the (new) signal is now blocked, requeue it.  */
@@@ -2229,7 -2213,7 +2231,7 @@@ sys_kill(pid_t pid, int sig
         info.si_errno = 0;
         info.si_code = SI_USER;
         info.si_pid = task_tgid_vnr(current);
- -      info.si_uid = current->uid;
+ +      info.si_uid = current_uid();
   
         return kill_something_info(sig, &info, pid);
   }
@@@ -2246,7 -2230,7 +2248,7 @@@ static int do_tkill(pid_t tgid, pid_t p
         info.si_errno = 0;
         info.si_code = SI_TKILL;
         info.si_pid = task_tgid_vnr(current);
- -      info.si_uid = current->uid;
+ +      info.si_uid = current_uid();
   
         rcu_read_lock();
         p = find_task_by_vpid(pid);
diff --combined kernel/sysctl.c

index 9d52b57,6ac501a..0b627d9
--- 1/kernel/sysctl.c
--- 2/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@@ -487,6 -487,26 +487,26 @@@ static struct ctl_table kern_table[] = 
                 .proc_handler   = &ftrace_enable_sysctl,
         },
   #endif
+ #ifdef CONFIG_STACK_TRACER
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "stack_tracer_enabled",
+               .data           = &stack_tracer_enabled,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &stack_trace_sysctl,
+       },
+ #endif
+ #ifdef CONFIG_TRACING
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "ftrace_dump_on_oops",
+               .data           = &ftrace_dump_on_oops,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
+ #endif
   #ifdef CONFIG_MODULES
         {
                 .ctl_name       = KERN_MODPROBE,
@@@ -1651,7 -1671,7 +1671,7 @@@ out
   
   static int test_perm(int mode, int op)
   {
- -      if (!current->euid)
+ +      if (!current_euid())
                 mode >>= 6;
         else if (in_egroup_p(0))
                 mode >>= 3;
diff --combined kernel/trace/trace.c

index 1ee9e4e,79db26e..f4bb380
--- 1/kernel/trace/trace.c
--- 2/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@@ -30,6 -30,7 +30,7 @@@
   #include <linux/gfp.h>
   #include <linux/fs.h>
   #include <linux/kprobes.h>
+ #include <linux/seq_file.h>
   #include <linux/writeback.h>
   
   #include <linux/stacktrace.h>
@@@ -43,6 -44,38 +44,38 @@@
   unsigned long __read_mostly   tracing_max_latency = (cycle_t)ULONG_MAX;
   unsigned long __read_mostly   tracing_thresh;
   
+ /*
+  * We need to change this state when a selftest is running.
+  * A selftest will look into the ring-buffer to count the
+  * entries inserted during the selftest, but concurrent
+  * insertions into the ring-buffer (such as ftrace_printk) could occur
+  * at the same time, giving false positive or negative results.
+  */
+ static bool __read_mostly tracing_selftest_running;
+ 
+ /* For tracers that don't implement custom flags */
+ static struct tracer_opt dummy_tracer_opt[] = {
+       { }
+ };
+ 
+ static struct tracer_flags dummy_tracer_flags = {
+       .val = 0,
+       .opts = dummy_tracer_opt
+ };
+ 
+ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
+ {
+       return 0;
+ }
+ 
+ /*
+  * Kill all tracing for good (never come back).
+  * It is initialized to 1 but will turn to zero if the initialization
+  * of the tracer is successful. But that is the only place that sets
+  * this back to zero.
+  */
+ int tracing_disabled = 1;
+ 
   static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
   
   static inline void ftrace_disable_cpu(void)
@@@ -62,7 -95,36 +95,36 @@@ static cpumask_t __read_mostly              tracing
   #define for_each_tracing_cpu(cpu)     \
         for_each_cpu_mask(cpu, tracing_buffer_mask)
   
- static int tracing_disabled = 1;
+ /*
+  * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
+  *
+  * If there is an oops (or kernel panic) and the ftrace_dump_on_oops
+  * is set, then ftrace_dump is called. This will output the contents
+  * of the ftrace buffers to the console.  This is very useful for
+  * capturing traces that lead to crashes and outputting them to a
+  * serial console.
+  *
+  * It is default off, but you can enable it with either specifying
+  * "ftrace_dump_on_oops" in the kernel command line, or setting
+  * /proc/sys/kernel/ftrace_dump_on_oops to true.
+  */
+ int ftrace_dump_on_oops;
+ 
+ static int tracing_set_tracer(char *buf);
+ 
+ /*
+  * Boot-time handler for the "ftrace=<tracer>" kernel parameter.
+  *
+  * The __setup() key has to include the trailing '=': the key is matched
+  * as a prefix of the command line word and the handler is handed
+  * whatever follows it.  With a bare "ftrace" key the handler would
+  * receive "=<tracer>" (leading '=' included), which names no tracer,
+  * and the key would also match "ftrace_dump_on_oops".
+  */
+ static int __init set_ftrace(char *str)
+ {
+       tracing_set_tracer(str);
+       return 1;
+ }
+ __setup("ftrace=", set_ftrace);
+ 
+ static int __init set_ftrace_dump_on_oops(char *str)
+ {
+       ftrace_dump_on_oops = 1;
+       return 1;
+ }
+ __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
   
   long
   ns2usecs(cycle_t nsec)
@@@ -112,6 -174,19 +174,19 @@@ static DEFINE_PER_CPU(struct trace_arra
   /* tracer_enabled is used to toggle activation of a tracer */
   static int                    tracer_enabled = 1;
   
+ /**
+  * tracing_is_enabled - return tracer_enabled status
+  *
+  * This function is used by other tracers to know the status
+  * of the tracer_enabled flag.  Tracers may use this function
+  * to know if it should enable their features when starting
+  * up. See irqsoff tracer for an example (start_irqsoff_tracer).
+  */
+ int tracing_is_enabled(void)
+ {
+       return tracer_enabled;
+ }
+ 
   /* function tracing enabled */
   int                           ftrace_function_enabled;
   
@@@ -153,8 -228,9 +228,9 @@@ static DEFINE_MUTEX(trace_types_lock)
   /* trace_wait is a waitqueue for tasks blocked on trace_poll */
   static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
   
- /* trace_flags holds iter_ctrl options */
- unsigned long trace_flags = TRACE_ITER_PRINT_PARENT;
+ /* trace_flags holds trace_options default values */
+ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
+       TRACE_ITER_ANNOTATE;
   
   /**
    * trace_wake_up - wake up tasks waiting for trace input
@@@ -193,13 -269,6 +269,6 @@@ unsigned long nsecs_to_usecs(unsigned l
         return nsecs / 1000;
   }
   
- /*
-  * TRACE_ITER_SYM_MASK masks the options in trace_flags that
-  * control the output of kernel symbols.
-  */
- #define TRACE_ITER_SYM_MASK \
-       (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
- 
   /* These must match the bit positions in trace_iterator_flags */
   static const char *trace_options[] = {
         "print-parent",
@@@ -213,6 -282,12 +282,12 @@@
         "stacktrace",
         "sched-tree",
         "ftrace_printk",
+       "ftrace_preempt",
+       "branch",
+       "annotate",
+       "userstacktrace",
+       "sym-userobj",
+       "printk-msg-only",
         NULL
   };
   
@@@ -246,7 -321,7 +321,7 @@@ __update_max_tr(struct trace_array *tr
   
         memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
         data->pid = tsk->pid;
- -      data->uid = tsk->uid;
+ +      data->uid = task_uid(tsk);
         data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
         data->policy = tsk->policy;
         data->rt_priority = tsk->rt_priority;
@@@ -359,6 -434,28 +434,28 @@@ trace_seq_putmem_hex(struct trace_seq *
         return trace_seq_putmem(s, hex, j);
   }
   
+ /*
+  * Append the pathname of @path to the trace_seq @s.
+  *
+  * d_path() is given the unused tail of the buffer to render the name
+  * into, and its result is then passed through mangle_path() with "\n"
+  * as the escape set; s->len is advanced to where mangle_path() stopped.
+  * If d_path() fails, a single '?' is stored instead.
+  *
+  * Returns 1 if something was appended, 0 if the buffer had no room
+  * left on entry or mangle_path() returned NULL.
+  */
+ static int
+ trace_seq_path(struct trace_seq *s, struct path *path)
+ {
+       unsigned char *p;
+ 
+       /* no room left for even one character */
+       if (s->len >= (PAGE_SIZE - 1))
+               return 0;
+       p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
+       if (!IS_ERR(p)) {
+               p = mangle_path(s->buffer + s->len, p, "\n");
+               if (p) {
+                       /* p points past the last byte written */
+                       s->len = p - s->buffer;
+                       return 1;
+               }
+       } else {
+               /* path could not be resolved: emit a placeholder */
+               s->buffer[s->len++] = '?';
+               return 1;
+       }
+ 
+       return 0;
+ }
+ 
   static void
   trace_seq_reset(struct trace_seq *s)
   {
@@@ -470,7 -567,17 +567,17 @@@ int register_tracer(struct tracer *type
                 return -1;
         }
   
+       /*
+        * When this gets called we hold the BKL which means that
+        * preemption is disabled. Various trace selftests however
+        * need to disable and enable preemption for successful tests.
+        * So we drop the BKL here and grab it after the tests again.
+        */
+       unlock_kernel();
         mutex_lock(&trace_types_lock);
+ 
+       tracing_selftest_running = true;
+ 
         for (t = trace_types; t; t = t->next) {
                 if (strcmp(type->name, t->name) == 0) {
                         /* already found */
@@@ -481,12 -588,20 +588,20 @@@
                 }
         }
   
+       if (!type->set_flag)
+               type->set_flag = &dummy_set_flag;
+       if (!type->flags)
+               type->flags = &dummy_tracer_flags;
+       else
+               if (!type->flags->opts)
+                       type->flags->opts = dummy_tracer_opt;
+ 
   #ifdef CONFIG_FTRACE_STARTUP_TEST
         if (type->selftest) {
                 struct tracer *saved_tracer = current_trace;
                 struct trace_array *tr = &global_trace;
-               int saved_ctrl = tr->ctrl;
                 int i;
+ 
                 /*
                  * Run a selftest on this tracer.
                  * Here we reset the trace buffer, and set the current
@@@ -494,25 -609,23 +609,23 @@@
                  * internal tracing to verify that everything is in order.
                  * If we fail, we do not register this tracer.
                  */
-               for_each_tracing_cpu(i) {
+               for_each_tracing_cpu(i)
                         tracing_reset(tr, i);
-               }
+ 
                 current_trace = type;
-               tr->ctrl = 0;
                 /* the test is responsible for initializing and enabling */
                 pr_info("Testing tracer %s: ", type->name);
                 ret = type->selftest(type, tr);
                 /* the test is responsible for resetting too */
                 current_trace = saved_tracer;
-               tr->ctrl = saved_ctrl;
                 if (ret) {
                         printk(KERN_CONT "FAILED!\n");
                         goto out;
                 }
                 /* Only reset on passing, to avoid touching corrupted buffers */
-               for_each_tracing_cpu(i) {
+               for_each_tracing_cpu(i)
                         tracing_reset(tr, i);
-               }
+ 
                 printk(KERN_CONT "PASSED\n");
         }
   #endif
@@@ -524,7 -637,9 +637,9 @@@
                 max_tracer_type_len = len;
   
    out:
+       tracing_selftest_running = false;
         mutex_unlock(&trace_types_lock);
+       lock_kernel();
   
         return ret;
   }
@@@ -564,6 -679,16 +679,16 @@@ void tracing_reset(struct trace_array *
         ftrace_enable_cpu();
   }
   
+ /*
+  * Restart a trace: stamp tr->time_start with ftrace_now(tr->cpu) and
+  * call tracing_reset() on @tr for every online CPU.
+  */
+ void tracing_reset_online_cpus(struct trace_array *tr)
+ {
+       int cpu;
+ 
+       tr->time_start = ftrace_now(tr->cpu);
+ 
+       for_each_online_cpu(cpu)
+               tracing_reset(tr, cpu);
+ }
+ 
   #define SAVED_CMDLINES 128
   static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
   static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
@@@ -581,6 -706,91 +706,91 @@@ static void trace_init_cmdlines(void
         cmdline_idx = 0;
   }
   
+ static int trace_stop_count;
+ static DEFINE_SPINLOCK(tracing_start_lock);
+ 
+ /**
+  * ftrace_off_permanent - disable all ftrace code permanently
+  *
+  * This should only be called when a serious anomaly has
+  * been detected.  This will turn off the function tracing,
+  * ring buffers, and other tracing utilities. It takes no
+  * locks and can be called from any context.
+  */
+ void ftrace_off_permanent(void)
+ {
+       tracing_disabled = 1;
+       ftrace_stop();
+       tracing_off_permanent();
+ }
+ 
+ /**
+  * tracing_start - quick start of the tracer
+  *
+  * If tracing is enabled but was stopped by tracing_stop,
+  * this will start the tracer back up.
+  *
+  * Calls nest through trace_stop_count: recording is only re-enabled
+  * once the count drops back to zero.  If the count goes negative
+  * (more starts than stops) a one-time warning is issued and the
+  * count is reset to zero without touching the buffers.
+  */
+ void tracing_start(void)
+ {
+       struct ring_buffer *buffer;
+       unsigned long flags;
+ 
+       if (tracing_disabled)
+               return;
+ 
+       spin_lock_irqsave(&tracing_start_lock, flags);
+       if (--trace_stop_count) {
+               /*
+                * The underflow check must sit inside this branch: a
+                * negative count is non-zero, so a check placed after
+                * an unconditional "goto out" would never be reached.
+                */
+               if (trace_stop_count < 0) {
+                       /* Someone screwed up their debugging */
+                       WARN_ON_ONCE(1);
+                       trace_stop_count = 0;
+               }
+               goto out;
+       }
+ 
+       buffer = global_trace.buffer;
+       if (buffer)
+               ring_buffer_record_enable(buffer);
+ 
+       buffer = max_tr.buffer;
+       if (buffer)
+               ring_buffer_record_enable(buffer);
+ 
+       ftrace_start();
+  out:
+       spin_unlock_irqrestore(&tracing_start_lock, flags);
+ }
+ 
+ /**
+  * tracing_stop - quick stop of the tracer
+  *
+  * Light weight way to stop tracing. Use in conjunction with
+  * tracing_start.
+  *
+  * Calls nest through trace_stop_count: only the call that finds the
+  * count at zero disables recording on the ring buffers; later calls
+  * just bump the count.
+  */
+ void tracing_stop(void)
+ {
+       struct ring_buffer *buffer;
+       unsigned long flags;
+ 
+       ftrace_stop();
+       spin_lock_irqsave(&tracing_start_lock, flags);
+       /* count was already non-zero: nothing more to disable */
+       if (trace_stop_count++)
+               goto out;
+ 
+       buffer = global_trace.buffer;
+       if (buffer)
+               ring_buffer_record_disable(buffer);
+ 
+       buffer = max_tr.buffer;
+       if (buffer)
+               ring_buffer_record_disable(buffer);
+ 
+  out:
+       spin_unlock_irqrestore(&tracing_start_lock, flags);
+ }
+ 
   void trace_stop_cmdline_recording(void);
   
   static void trace_save_cmdline(struct task_struct *tsk)
@@@ -618,7 -828,7 +828,7 @@@
         spin_unlock(&trace_cmdline_lock);
   }
   
- static char *trace_find_cmdline(int pid)
+ char *trace_find_cmdline(int pid)
   {
         char *cmdline = "<...>";
         unsigned map;
@@@ -655,6 -865,7 +865,7 @@@ tracing_generic_entry_update(struct tra
   
         entry->preempt_count            = pc & 0xff;
         entry->pid                      = (tsk) ? tsk->pid : 0;
+       entry->tgid                     = (tsk) ? tsk->tgid : 0;
         entry->flags =
   #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
                 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@@ -691,6 -902,56 +902,56 @@@ trace_function(struct trace_array *tr, 
         ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
   }
   
+ #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ /*
+  * Write a TRACE_GRAPH_ENT event carrying a copy of *@trace.
+  *
+  * Nothing is recorded when this CPU's ftrace_cpu_disabled counter is
+  * non-zero or when no ring buffer space can be reserved.
+  *
+  * NOTE(review): @tr and @data are not used; the event always goes to
+  * global_trace.buffer.
+  */
+ static void __trace_graph_entry(struct trace_array *tr,
+                               struct trace_array_cpu *data,
+                               struct ftrace_graph_ent *trace,
+                               unsigned long flags,
+                               int pc)
+ {
+       struct ring_buffer_event *event;
+       struct ftrace_graph_ent_entry *entry;
+       unsigned long irq_flags;
+ 
+       if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+               return;
+ 
+       event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
+                                        &irq_flags);
+       if (!event)
+               return;
+       entry   = ring_buffer_event_data(event);
+       tracing_generic_entry_update(&entry->ent, flags, pc);
+       entry->ent.type                 = TRACE_GRAPH_ENT;
+       entry->graph_ent                        = *trace;
+       ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
+ }
+ 
+ /*
+  * Write a TRACE_GRAPH_RET event carrying a copy of *@trace.
+  *
+  * Nothing is recorded when this CPU's ftrace_cpu_disabled counter is
+  * non-zero or when no ring buffer space can be reserved.
+  *
+  * NOTE(review): @tr and @data are not used; the event always goes to
+  * global_trace.buffer.
+  */
+ static void __trace_graph_return(struct trace_array *tr,
+                               struct trace_array_cpu *data,
+                               struct ftrace_graph_ret *trace,
+                               unsigned long flags,
+                               int pc)
+ {
+       struct ring_buffer_event *event;
+       struct ftrace_graph_ret_entry *entry;
+       unsigned long irq_flags;
+ 
+       if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+               return;
+ 
+       event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
+                                        &irq_flags);
+       if (!event)
+               return;
+       entry   = ring_buffer_event_data(event);
+       tracing_generic_entry_update(&entry->ent, flags, pc);
+       entry->ent.type                 = TRACE_GRAPH_RET;
+       entry->ret                              = *trace;
+       ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
+ }
+ #endif
+ 
   void
   ftrace(struct trace_array *tr, struct trace_array_cpu *data,
          unsigned long ip, unsigned long parent_ip, unsigned long flags,
@@@ -742,6 -1003,46 +1003,46 @@@ void __trace_stack(struct trace_array *
         ftrace_trace_stack(tr, data, flags, skip, preempt_count());
   }
   
+ /*
+  * Record a TRACE_USER_STACK event in @tr's ring buffer, with the
+  * entry's caller array filled in by save_stack_trace_user().
+  *
+  * Does nothing unless TRACE_ITER_USERSTACKTRACE is set in trace_flags,
+  * and the whole body is compiled out without CONFIG_STACKTRACE.
+  * @data is not used here.
+  */
+ static void ftrace_trace_userstack(struct trace_array *tr,
+                  struct trace_array_cpu *data,
+                  unsigned long flags, int pc)
+ {
+ #ifdef CONFIG_STACKTRACE
+       struct ring_buffer_event *event;
+       struct userstack_entry *entry;
+       struct stack_trace trace;
+       unsigned long irq_flags;
+ 
+       if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
+               return;
+ 
+       event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+                                        &irq_flags);
+       if (!event)
+               return;
+       entry   = ring_buffer_event_data(event);
+       tracing_generic_entry_update(&entry->ent, flags, pc);
+       entry->ent.type         = TRACE_USER_STACK;
+ 
+       /* clear every caller slot before the stack walker fills them */
+       memset(&entry->caller, 0, sizeof(entry->caller));
+ 
+       /* have the stack walker store directly into the reserved event */
+       trace.nr_entries        = 0;
+       trace.max_entries       = FTRACE_STACK_ENTRIES;
+       trace.skip              = 0;
+       trace.entries           = entry->caller;
+ 
+       save_stack_trace_user(&trace);
+       ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+ #endif
+ }
+ 
+ void __trace_userstack(struct trace_array *tr,
+                  struct trace_array_cpu *data,
+                  unsigned long flags)
+ {
+       ftrace_trace_userstack(tr, data, flags, preempt_count());
+ }
+ 
   static void
   ftrace_trace_special(void *__tr, void *__data,
                      unsigned long arg1, unsigned long arg2, unsigned long arg3,
@@@ -765,6 -1066,7 +1066,7 @@@
         entry->arg3                     = arg3;
         ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
         ftrace_trace_stack(tr, data, irq_flags, 4, pc);
+       ftrace_trace_userstack(tr, data, irq_flags, pc);
   
         trace_wake_up();
   }
@@@ -803,6 -1105,7 +1105,7 @@@ tracing_sched_switch_trace(struct trace
         entry->next_cpu = task_cpu(next);
         ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
         ftrace_trace_stack(tr, data, flags, 5, pc);
+       ftrace_trace_userstack(tr, data, flags, pc);
   }
   
   void
@@@ -832,6 -1135,7 +1135,7 @@@ tracing_sched_wakeup_trace(struct trace
         entry->next_cpu                 = task_cpu(wakee);
         ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
         ftrace_trace_stack(tr, data, flags, 6, pc);
+       ftrace_trace_userstack(tr, data, flags, pc);
   
         trace_wake_up();
   }
@@@ -841,26 -1145,28 +1145,28 @@@ ftrace_special(unsigned long arg1, unsi
   {
         struct trace_array *tr = &global_trace;
         struct trace_array_cpu *data;
+       unsigned long flags;
         int cpu;
         int pc;
   
-       if (tracing_disabled || !tr->ctrl)
+       if (tracing_disabled)
                 return;
   
         pc = preempt_count();
-       preempt_disable_notrace();
+       local_irq_save(flags);
         cpu = raw_smp_processor_id();
         data = tr->data[cpu];
   
-       if (likely(!atomic_read(&data->disabled)))
+       if (likely(atomic_inc_return(&data->disabled) == 1))
                 ftrace_trace_special(tr, data, arg1, arg2, arg3, pc);
   
-       preempt_enable_notrace();
+       atomic_dec(&data->disabled);
+       local_irq_restore(flags);
   }
   
   #ifdef CONFIG_FUNCTION_TRACER
   static void
- function_trace_call(unsigned long ip, unsigned long parent_ip)
+ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
   {
         struct trace_array *tr = &global_trace;
         struct trace_array_cpu *data;
@@@ -873,8 -1179,7 +1179,7 @@@
                 return;
   
         pc = preempt_count();
-       resched = need_resched();
-       preempt_disable_notrace();
+       resched = ftrace_preempt_disable();
         local_save_flags(flags);
         cpu = raw_smp_processor_id();
         data = tr->data[cpu];
@@@ -884,12 -1189,97 +1189,97 @@@
                 trace_function(tr, data, ip, parent_ip, flags, pc);
   
         atomic_dec(&data->disabled);
-       if (resched)
-               preempt_enable_no_resched_notrace();
-       else
-               preempt_enable_notrace();
+       ftrace_preempt_enable(resched);
   }
   
+ /*
+  * Function-trace callback that records with interrupts disabled.
+  *
+  * Returns immediately while ftrace_function_enabled is clear.
+  * Otherwise @ip/@parent_ip are passed to trace_function() only when
+  * this call raised the per-cpu data->disabled counter from 0 to 1;
+  * the counter is dropped again before interrupts are restored.
+  */
+ static void
+ function_trace_call(unsigned long ip, unsigned long parent_ip)
+ {
+       struct trace_array *tr = &global_trace;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       long disabled;
+       int cpu;
+       int pc;
+ 
+       if (unlikely(!ftrace_function_enabled))
+               return;
+ 
+       /*
+        * Need to use raw, since this must be called before the
+        * recursive protection is performed.
+        */
+       local_irq_save(flags);
+       cpu = raw_smp_processor_id();
+       data = tr->data[cpu];
+       disabled = atomic_inc_return(&data->disabled);
+ 
+       /* record only if nobody else held the counter on this CPU */
+       if (likely(disabled == 1)) {
+               pc = preempt_count();
+               trace_function(tr, data, ip, parent_ip, flags, pc);
+       }
+ 
+       atomic_dec(&data->disabled);
+       local_irq_restore(flags);
+ }
+ 
+ #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ /*
+  * Function-graph entry hook.
+  *
+  * Returns 0 without recording when ftrace_trace_task(current) or
+  * ftrace_graph_addr(trace->func) rejects the call, and 1 otherwise.
+  * On the accepted path the entry event is written with interrupts
+  * disabled, provided this call raised the per-cpu data->disabled
+  * counter from 0 to 1, and the current task is marked with
+  * set_tsk_trace_graph().
+  */
+ int trace_graph_entry(struct ftrace_graph_ent *trace)
+ {
+       struct trace_array *tr = &global_trace;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       long disabled;
+       int cpu;
+       int pc;
+ 
+       if (!ftrace_trace_task(current))
+               return 0;
+ 
+       if (!ftrace_graph_addr(trace->func))
+               return 0;
+ 
+       local_irq_save(flags);
+       cpu = raw_smp_processor_id();
+       data = tr->data[cpu];
+       disabled = atomic_inc_return(&data->disabled);
+       if (likely(disabled == 1)) {
+               pc = preempt_count();
+               __trace_graph_entry(tr, data, trace, flags, pc);
+       }
+       /* Only do the atomic if it is not already set */
+       if (!test_tsk_trace_graph(current))
+               set_tsk_trace_graph(current);
+       atomic_dec(&data->disabled);
+       local_irq_restore(flags);
+ 
+       return 1;
+ }
+ 
+ /*
+  * Function-graph return hook.
+  *
+  * Writes the return event with interrupts disabled, provided this call
+  * raised the per-cpu data->disabled counter from 0 to 1, and clears the
+  * current task's trace-graph mark once trace->depth is zero.
+  */
+ void trace_graph_return(struct ftrace_graph_ret *trace)
+ {
+       struct trace_array *tr = &global_trace;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       long disabled;
+       int cpu;
+       int pc;
+ 
+       local_irq_save(flags);
+       cpu = raw_smp_processor_id();
+       data = tr->data[cpu];
+       disabled = atomic_inc_return(&data->disabled);
+       if (likely(disabled == 1)) {
+               pc = preempt_count();
+               __trace_graph_return(tr, data, trace, flags, pc);
+       }
+       /* depth 0: presumably the outermost traced call is returning */
+       if (!trace->depth)
+               clear_tsk_trace_graph(current);
+       atomic_dec(&data->disabled);
+       local_irq_restore(flags);
+ }
+ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+ 
   static struct ftrace_ops trace_ops __read_mostly =
   {
         .func = function_trace_call,
@@@ -898,9 -1288,14 +1288,14 @@@
   void tracing_start_function_trace(void)
   {
         ftrace_function_enabled = 0;
+ 
+       if (trace_flags & TRACE_ITER_PREEMPTONLY)
+               trace_ops.func = function_trace_call_preempt_only;
+       else
+               trace_ops.func = function_trace_call;
+ 
         register_ftrace_function(&trace_ops);
-       if (tracer_enabled)
-               ftrace_function_enabled = 1;
+       ftrace_function_enabled = 1;
   }
   
   void tracing_stop_function_trace(void)
@@@ -912,6 -1307,7 +1307,7 @@@
   
   enum trace_file_type {
         TRACE_FILE_LAT_FMT      = 1,
+       TRACE_FILE_ANNOTATE     = 2,
   };
   
   static void trace_iterator_increment(struct trace_iterator *iter, int cpu)
@@@ -1047,10 -1443,6 +1443,6 @@@ static void *s_start(struct seq_file *m
   
         atomic_inc(&trace_record_cmdline_disabled);
   
-       /* let the tracer grab locks here if needed */
-       if (current_trace->start)
-               current_trace->start(iter);
- 
         if (*pos != iter->pos) {
                 iter->ent = NULL;
                 iter->cpu = 0;
@@@ -1077,14 -1469,7 +1469,7 @@@
   
   static void s_stop(struct seq_file *m, void *p)
   {
-       struct trace_iterator *iter = m->private;
- 
         atomic_dec(&trace_record_cmdline_disabled);
- 
-       /* let the tracer release locks here if needed */
-       if (current_trace && current_trace == iter->trace && iter->trace->stop)
-               iter->trace->stop(iter);
- 
         mutex_unlock(&trace_types_lock);
   }
   
@@@ -1143,7 -1528,7 +1528,7 @@@ seq_print_sym_offset(struct trace_seq *
   # define IP_FMT "%016lx"
   #endif
   
- static int
+ int
   seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
   {
         int ret;
@@@ -1164,6 -1549,78 +1549,78 @@@
         return ret;
   }
   
+ static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
+                                   unsigned long ip, unsigned long sym_flags)
+ {
+       struct file *file = NULL;
+       unsigned long vmstart = 0;
+       int ret = 1;
+ 
+       if (mm) {
+               const struct vm_area_struct *vma;
+ 
+               down_read(&mm->mmap_sem);
+               vma = find_vma(mm, ip);
+               if (vma) {
+                       file = vma->vm_file;
+                       vmstart = vma->vm_start;
+               }
+               if (file) {
+                       ret = trace_seq_path(s, &file->f_path);
+                       if (ret)
+                               ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart);
+               }
+               up_read(&mm->mmap_sem);
+       }
+       if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
+               ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
+       return ret;
+ }
+ 
+ static int
+ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
+                     unsigned long sym_flags)
+ {
+       struct mm_struct *mm = NULL;
+       int ret = 1;
+       unsigned int i;
+ 
+       if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
+               struct task_struct *task;
+               /*
+                * we do the lookup on the thread group leader,
+                * since individual threads might have already quit!
+                */
+               rcu_read_lock();
+               task = find_task_by_vpid(entry->ent.tgid);
+               if (task)
+                       mm = get_task_mm(task);
+               rcu_read_unlock();
+       }
+ 
+       for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+               unsigned long ip = entry->caller[i];
+ 
+               if (ip == ULONG_MAX || !ret)
+                       break;
+               if (i && ret)
+                       ret = trace_seq_puts(s, " <- ");
+               if (!ip) {
+                       if (ret)
+                               ret = trace_seq_puts(s, "??");
+                       continue;
+               }
+               if (!ret)
+                       break;
+               if (ret)
+                       ret = seq_print_user_ip(s, mm, ip, sym_flags);
+       }
+ 
+       if (mm)
+               mmput(mm);
+       return ret;
+ }
+ 
   static void print_lat_help_header(struct seq_file *m)
   {
         seq_puts(m, "#                  _------=> CPU#            \n");
@@@ -1301,6 -1758,13 +1758,13 @@@ lat_print_timestamp(struct trace_seq *s
   
   static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
   
+ static int task_state_char(unsigned long state)
+ {
+       int bit = state ? __ffs(state) + 1 : 0;
+ 
+       return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
+ }
+ 
   /*
    * The message is supposed to contain an ending newline.
    * If the printing stops prematurely, try to add a newline of our own.
@@@ -1338,6 -1802,23 +1802,23 @@@ void trace_seq_print_cont(struct trace_
                 trace_seq_putc(s, '\n');
   }
   
+ static void test_cpu_buff_start(struct trace_iterator *iter)
+ {
+       struct trace_seq *s = &iter->seq;
+ 
+       if (!(trace_flags & TRACE_ITER_ANNOTATE))
+               return;
+ 
+       if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
+               return;
+ 
+       if (cpu_isset(iter->cpu, iter->started))
+               return;
+ 
+       cpu_set(iter->cpu, iter->started);
+       trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu);
+ }
+ 
   static enum print_line_t
   print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
   {
@@@ -1352,11 -1833,12 +1833,12 @@@
         char *comm;
         int S, T;
         int i;
-       unsigned state;
   
         if (entry->type == TRACE_CONT)
                 return TRACE_TYPE_HANDLED;
   
+       test_cpu_buff_start(iter);
+ 
         next_entry = find_next_entry(iter, NULL, &next_ts);
         if (!next_entry)
                 next_ts = iter->ts;
@@@ -1396,12 -1878,8 +1878,8 @@@
   
                 trace_assign_type(field, entry);
   
-               T = field->next_state < sizeof(state_to_char) ?
-                       state_to_char[field->next_state] : 'X';
- 
-               state = field->prev_state ?
-                       __ffs(field->prev_state) + 1 : 0;
-               S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X';
+               T = task_state_char(field->next_state);
+               S = task_state_char(field->prev_state);
                 comm = trace_find_cmdline(field->next_pid);
                 trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
                                  field->prev_pid,
@@@ -1448,6 -1926,27 +1926,27 @@@
                         trace_seq_print_cont(s, iter);
                 break;
         }
+       case TRACE_BRANCH: {
+               struct trace_branch *field;
+ 
+               trace_assign_type(field, entry);
+ 
+               trace_seq_printf(s, "[%s] %s:%s:%d\n",
+                                field->correct ? "  ok  " : " MISS ",
+                                field->func,
+                                field->file,
+                                field->line);
+               break;
+       }
+       case TRACE_USER_STACK: {
+               struct userstack_entry *field;
+ 
+               trace_assign_type(field, entry);
+ 
+               seq_print_userip_objs(field, s, sym_flags);
+               trace_seq_putc(s, '\n');
+               break;
+       }
         default:
                 trace_seq_printf(s, "Unknown type %d\n", entry->type);
         }
@@@ -1472,6 -1971,8 +1971,8 @@@ static enum print_line_t print_trace_fm
         if (entry->type == TRACE_CONT)
                 return TRACE_TYPE_HANDLED;
   
+       test_cpu_buff_start(iter);
+ 
         comm = trace_find_cmdline(iter->ent->pid);
   
         t = ns2usecs(iter->ts);
@@@ -1519,10 -2020,8 +2020,8 @@@
   
                 trace_assign_type(field, entry);
   
-               S = field->prev_state < sizeof(state_to_char) ?
-                       state_to_char[field->prev_state] : 'X';
-               T = field->next_state < sizeof(state_to_char) ?
-                       state_to_char[field->next_state] : 'X';
+               T = task_state_char(field->next_state);
+               S = task_state_char(field->prev_state);
                 ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n",
                                        field->prev_pid,
                                        field->prev_prio,
@@@ -1581,6 -2080,37 +2080,37 @@@
                         trace_seq_print_cont(s, iter);
                 break;
         }
+       case TRACE_GRAPH_RET: {
+               return print_graph_function(iter);
+       }
+       case TRACE_GRAPH_ENT: {
+               return print_graph_function(iter);
+       }
+       case TRACE_BRANCH: {
+               struct trace_branch *field;
+ 
+               trace_assign_type(field, entry);
+ 
+               trace_seq_printf(s, "[%s] %s:%s:%d\n",
+                                field->correct ? "  ok  " : " MISS ",
+                                field->func,
+                                field->file,
+                                field->line);
+               break;
+       }
+       case TRACE_USER_STACK: {
+               struct userstack_entry *field;
+ 
+               trace_assign_type(field, entry);
+ 
+               ret = seq_print_userip_objs(field, s, sym_flags);
+               if (!ret)
+                       return TRACE_TYPE_PARTIAL_LINE;
+               ret = trace_seq_putc(s, '\n');
+               if (!ret)
+                       return TRACE_TYPE_PARTIAL_LINE;
+               break;
+       }
         }
         return TRACE_TYPE_HANDLED;
   }
@@@ -1621,12 -2151,9 +2151,9 @@@ static enum print_line_t print_raw_fmt(
   
                 trace_assign_type(field, entry);
   
-               S = field->prev_state < sizeof(state_to_char) ?
-                       state_to_char[field->prev_state] : 'X';
-               T = field->next_state < sizeof(state_to_char) ?
-                       state_to_char[field->next_state] : 'X';
-               if (entry->type == TRACE_WAKE)
-                       S = '+';
+               T = task_state_char(field->next_state);
+               S = entry->type == TRACE_WAKE ? '+' :
+                       task_state_char(field->prev_state);
                 ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n",
                                        field->prev_pid,
                                        field->prev_prio,
@@@ -1640,6 -2167,7 +2167,7 @@@
                 break;
         }
         case TRACE_SPECIAL:
+       case TRACE_USER_STACK:
         case TRACE_STACK: {
                 struct special_entry *field;
   
@@@ -1712,12 -2240,9 +2240,9 @@@ static enum print_line_t print_hex_fmt(
   
                 trace_assign_type(field, entry);
   
-               S = field->prev_state < sizeof(state_to_char) ?
-                       state_to_char[field->prev_state] : 'X';
-               T = field->next_state < sizeof(state_to_char) ?
-                       state_to_char[field->next_state] : 'X';
-               if (entry->type == TRACE_WAKE)
-                       S = '+';
+               T = task_state_char(field->next_state);
+               S = entry->type == TRACE_WAKE ? '+' :
+                       task_state_char(field->prev_state);
                 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
                 SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
                 SEQ_PUT_HEX_FIELD_RET(s, S);
@@@ -1728,6 -2253,7 +2253,7 @@@
                 break;
         }
         case TRACE_SPECIAL:
+       case TRACE_USER_STACK:
         case TRACE_STACK: {
                 struct special_entry *field;
   
@@@ -1744,6 -2270,25 +2270,25 @@@
         return TRACE_TYPE_HANDLED;
   }
   
+ static enum print_line_t print_printk_msg_only(struct trace_iterator *iter)
+ {
+       struct trace_seq *s = &iter->seq;
+       struct trace_entry *entry = iter->ent;
+       struct print_entry *field;
+       int ret;
+ 
+       trace_assign_type(field, entry);
+ 
+       ret = trace_seq_printf(s, field->buf);
+       if (!ret)
+               return TRACE_TYPE_PARTIAL_LINE;
+ 
+       if (entry->flags & TRACE_FLAG_CONT)
+               trace_seq_print_cont(s, iter);
+ 
+       return TRACE_TYPE_HANDLED;
+ }
+ 
   static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
   {
         struct trace_seq *s = &iter->seq;
@@@ -1782,6 -2327,7 +2327,7 @@@
                 break;
         }
         case TRACE_SPECIAL:
+       case TRACE_USER_STACK:
         case TRACE_STACK: {
                 struct special_entry *field;
   
@@@ -1823,6 -2369,11 +2369,11 @@@ static enum print_line_t print_trace_li
                         return ret;
         }
   
+       if (iter->ent->type == TRACE_PRINT &&
+                       trace_flags & TRACE_ITER_PRINTK &&
+                       trace_flags & TRACE_ITER_PRINTK_MSGONLY)
+               return print_printk_msg_only(iter);
+ 
         if (trace_flags & TRACE_ITER_BIN)
                 return print_bin_fmt(iter);
   
@@@ -1847,7 -2398,9 +2398,9 @@@ static int s_show(struct seq_file *m, v
                         seq_printf(m, "# tracer: %s\n", iter->trace->name);
                         seq_puts(m, "#\n");
                 }
-               if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
+               if (iter->trace && iter->trace->print_header)
+                       iter->trace->print_header(m);
+               else if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
                         /* print nothing if the buffers are empty */
                         if (trace_empty(iter))
                                 return 0;
@@@ -1899,6 -2452,15 +2452,15 @@@ __tracing_open(struct inode *inode, str
         iter->trace = current_trace;
         iter->pos = -1;
   
+       /* Notify the tracer early; before we stop tracing. */
+       if (iter->trace && iter->trace->open)
+               iter->trace->open(iter);
+ 
+       /* Annotate start of buffers if we had overruns */
+       if (ring_buffer_overruns(iter->tr->buffer))
+               iter->iter_flags |= TRACE_FILE_ANNOTATE;
+ 
+ 
         for_each_tracing_cpu(cpu) {
   
                 iter->buffer_iter[cpu] =
@@@ -1917,13 -2479,7 +2479,7 @@@
         m->private = iter;
   
         /* stop the trace while dumping */
-       if (iter->tr->ctrl) {
-               tracer_enabled = 0;
-               ftrace_function_enabled = 0;
-       }
- 
-       if (iter->trace && iter->trace->open)
-                       iter->trace->open(iter);
+       tracing_stop();
   
         mutex_unlock(&trace_types_lock);
   
@@@ -1966,14 -2522,7 +2522,7 @@@ int tracing_release(struct inode *inode
                 iter->trace->close(iter);
   
         /* reenable tracing if it was previously enabled */
-       if (iter->tr->ctrl) {
-               tracer_enabled = 1;
-               /*
-                * It is safe to enable function tracing even if it
-                * isn't used
-                */
-               ftrace_function_enabled = 1;
-       }
+       tracing_start();
         mutex_unlock(&trace_types_lock);
   
         seq_release(inode, file);
@@@ -2151,7 -2700,7 +2700,7 @@@ tracing_cpumask_write(struct file *filp
         if (err)
                 goto err_unlock;
   
-       raw_local_irq_disable();
+       local_irq_disable();
         __raw_spin_lock(&ftrace_max_lock);
         for_each_tracing_cpu(cpu) {
                 /*
@@@ -2168,7 -2717,7 +2717,7 @@@
                 }
         }
         __raw_spin_unlock(&ftrace_max_lock);
-       raw_local_irq_enable();
+       local_irq_enable();
   
         tracing_cpumask = tracing_cpumask_new;
   
@@@ -2189,13 -2738,16 +2738,16 @@@ static struct file_operations tracing_c
   };
   
   static ssize_t
- tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
+ tracing_trace_options_read(struct file *filp, char __user *ubuf,
                        size_t cnt, loff_t *ppos)
   {
+       int i;
         char *buf;
         int r = 0;
         int len = 0;
-       int i;
+       u32 tracer_flags = current_trace->flags->val;
+       struct tracer_opt *trace_opts = current_trace->flags->opts;
+ 
   
         /* calulate max size */
         for (i = 0; trace_options[i]; i++) {
@@@ -2203,6 -2755,15 +2755,15 @@@
                 len += 3; /* "no" and space */
         }
   
+       /*
+        * Increase the size with names of options specific
+        * of the current tracer.
+        */
+       for (i = 0; trace_opts[i].name; i++) {
+               len += strlen(trace_opts[i].name);
+               len += 3; /* "no" and space */
+       }
+ 
         /* +2 for \n and \0 */
         buf = kmalloc(len + 2, GFP_KERNEL);
         if (!buf)
@@@ -2215,6 -2776,15 +2776,15 @@@
                         r += sprintf(buf + r, "no%s ", trace_options[i]);
         }
   
+       for (i = 0; trace_opts[i].name; i++) {
+               if (tracer_flags & trace_opts[i].bit)
+                       r += sprintf(buf + r, "%s ",
+                               trace_opts[i].name);
+               else
+                       r += sprintf(buf + r, "no%s ",
+                               trace_opts[i].name);
+       }
+ 
         r += sprintf(buf + r, "\n");
         WARN_ON(r >= len + 2);
   
@@@ -2225,13 -2795,48 +2795,48 @@@
         return r;
   }
   
+ /* Try to assign a tracer specific option */
+ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
+ {
+       struct tracer_flags *trace_flags = trace->flags;
+       struct tracer_opt *opts = NULL;
+       int ret = 0, i = 0;
+       int len;
+ 
+       for (i = 0; trace_flags->opts[i].name; i++) {
+               opts = &trace_flags->opts[i];
+               len = strlen(opts->name);
+ 
+               if (strncmp(cmp, opts->name, len) == 0) {
+                       ret = trace->set_flag(trace_flags->val,
+                               opts->bit, !neg);
+                       break;
+               }
+       }
+       /* Not found */
+       if (!trace_flags->opts[i].name)
+               return -EINVAL;
+ 
+       /* Refused to handle */
+       if (ret)
+               return ret;
+ 
+       if (neg)
+               trace_flags->val &= ~opts->bit;
+       else
+               trace_flags->val |= opts->bit;
+ 
+       return 0;
+ }
+ 
   static ssize_t
- tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
+ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
                         size_t cnt, loff_t *ppos)
   {
         char buf[64];
         char *cmp = buf;
         int neg = 0;
+       int ret;
         int i;
   
         if (cnt >= sizeof(buf))
@@@ -2258,11 -2863,13 +2863,13 @@@
                         break;
                 }
         }
-       /*
-        * If no option could be set, return an error:
-        */
-       if (!trace_options[i])
-               return -EINVAL;
+ 
+       /* If no option could be set, test the specific tracer options */
+       if (!trace_options[i]) {
+               ret = set_tracer_option(current_trace, cmp, neg);
+               if (ret)
+                       return ret;
+       }
   
         filp->f_pos += cnt;
   
@@@ -2271,8 -2878,8 +2878,8 @@@
   
   static struct file_operations tracing_iter_fops = {
         .open           = tracing_open_generic,
-       .read           = tracing_iter_ctrl_read,
-       .write          = tracing_iter_ctrl_write,
+       .read           = tracing_trace_options_read,
+       .write          = tracing_trace_options_write,
   };
   
   static const char readme_msg[] =
@@@ -2286,9 -2893,9 +2893,9 @@@
         "# echo sched_switch > /debug/tracing/current_tracer\n"
         "# cat /debug/tracing/current_tracer\n"
         "sched_switch\n"
-       "# cat /debug/tracing/iter_ctrl\n"
+       "# cat /debug/tracing/trace_options\n"
         "noprint-parent nosym-offset nosym-addr noverbose\n"
-       "# echo print-parent > /debug/tracing/iter_ctrl\n"
+       "# echo print-parent > /debug/tracing/trace_options\n"
         "# echo 1 > /debug/tracing/tracing_enabled\n"
         "# cat /debug/tracing/trace > /tmp/trace.txt\n"
         "echo 0 > /debug/tracing/tracing_enabled\n"
@@@ -2311,11 -2918,10 +2918,10 @@@ static ssize_
   tracing_ctrl_read(struct file *filp, char __user *ubuf,
                   size_t cnt, loff_t *ppos)
   {
         char buf[64];
         int r;
   
-       r = sprintf(buf, "%ld\n", tr->ctrl);
+       r = sprintf(buf, "%u\n", tracer_enabled);
         return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
   }
   
@@@ -2343,16 -2949,18 +2949,18 @@@ tracing_ctrl_write(struct file *filp, c
         val = !!val;
   
         mutex_lock(&trace_types_lock);
-       if (tr->ctrl ^ val) {
-               if (val)
+       if (tracer_enabled ^ val) {
+               if (val) {
                         tracer_enabled = 1;
-               else
+                       if (current_trace->start)
+                               current_trace->start(tr);
+                       tracing_start();
+               } else {
                         tracer_enabled = 0;
- 
-               tr->ctrl = val;
- 
-               if (current_trace && current_trace->ctrl_update)
-                       current_trace->ctrl_update(tr);
+                       tracing_stop();
+                       if (current_trace->stop)
+                               current_trace->stop(tr);
+               }
         }
         mutex_unlock(&trace_types_lock);
   
@@@ -2378,29 -2986,11 +2986,11 @@@ tracing_set_trace_read(struct file *fil
         return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
   }
   
- static ssize_t
- tracing_set_trace_write(struct file *filp, const char __user *ubuf,
-                       size_t cnt, loff_t *ppos)
+ static int tracing_set_tracer(char *buf)
   {
         struct trace_array *tr = &global_trace;
         struct tracer *t;
-       char buf[max_tracer_type_len+1];
-       int i;
-       size_t ret;
- 
-       ret = cnt;
- 
-       if (cnt > max_tracer_type_len)
-               cnt = max_tracer_type_len;
- 
-       if (copy_from_user(&buf, ubuf, cnt))
-               return -EFAULT;
- 
-       buf[cnt] = 0;
- 
-       /* strip ending whitespace. */
-       for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
-               buf[i] = 0;
+       int ret = 0;
   
         mutex_lock(&trace_types_lock);
         for (t = trace_types; t; t = t->next) {
@@@ -2414,18 -3004,52 +3004,52 @@@
         if (t == current_trace)
                 goto out;
   
+       trace_branch_disable();
         if (current_trace && current_trace->reset)
                 current_trace->reset(tr);
   
         current_trace = t;
-       if (t->init)
-               t->init(tr);
+       if (t->init) {
+               ret = t->init(tr);
+               if (ret)
+                       goto out;
+       }
   
+       trace_branch_enable(tr);
    out:
         mutex_unlock(&trace_types_lock);
   
-       if (ret > 0)
-               filp->f_pos += ret;
+       return ret;
+ }
+ 
+ static ssize_t
+ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
+                       size_t cnt, loff_t *ppos)
+ {
+       char buf[max_tracer_type_len+1];
+       int i;
+       size_t ret;
+       int err;
+ 
+       ret = cnt;
+ 
+       if (cnt > max_tracer_type_len)
+               cnt = max_tracer_type_len;
+ 
+       if (copy_from_user(&buf, ubuf, cnt))
+               return -EFAULT;
+ 
+       buf[cnt] = 0;
+ 
+       /* strip ending whitespace. */
+       for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
+               buf[i] = 0;
+ 
+       err = tracing_set_tracer(buf);
+       if (err)
+               return err;
+ 
+       filp->f_pos += ret;
   
         return ret;
   }
@@@ -2492,6 -3116,10 +3116,10 @@@ static int tracing_open_pipe(struct ino
                 return -ENOMEM;
   
         mutex_lock(&trace_types_lock);
+ 
+       /* trace pipe does not show start of buffer */
+       cpus_setall(iter->started);
+ 
         iter->tr = &global_trace;
         iter->trace = current_trace;
         filp->private_data = iter;
@@@ -2667,7 -3295,7 +3295,7 @@@ tracing_entries_read(struct file *filp
         char buf[64];
         int r;
   
-       r = sprintf(buf, "%lu\n", tr->entries);
+       r = sprintf(buf, "%lu\n", tr->entries >> 10);
         return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
   }
   
@@@ -2678,7 -3306,6 +3306,6 @@@ tracing_entries_write(struct file *filp
         unsigned long val;
         char buf[64];
         int ret, cpu;
-       struct trace_array *tr = filp->private_data;
   
         if (cnt >= sizeof(buf))
                 return -EINVAL;
@@@ -2698,12 -3325,7 +3325,7 @@@
   
         mutex_lock(&trace_types_lock);
   
-       if (tr->ctrl) {
-               cnt = -EBUSY;
-               pr_info("ftrace: please disable tracing"
-                       " before modifying buffer size\n");
-               goto out;
-       }
+       tracing_stop();
   
         /* disable all cpu buffers */
         for_each_tracing_cpu(cpu) {
@@@ -2713,6 -3335,9 +3335,9 @@@
                         atomic_inc(&max_tr.data[cpu]->disabled);
         }
   
+       /* value is in KB */
+       val <<= 10;
+ 
         if (val != global_trace.entries) {
                 ret = ring_buffer_resize(global_trace.buffer, val);
                 if (ret < 0) {
@@@ -2751,6 -3376,7 +3376,7 @@@
                         atomic_dec(&max_tr.data[cpu]->disabled);
         }
   
+       tracing_start();
         max_tr.entries = global_trace.entries;
         mutex_unlock(&trace_types_lock);
   
@@@ -2762,7 -3388,7 +3388,7 @@@ static int mark_printk(const char *fmt
         int ret;
         va_list args;
         va_start(args, fmt);
-       ret = trace_vprintk(0, fmt, args);
+       ret = trace_vprintk(0, -1, fmt, args);
         va_end(args);
         return ret;
   }
@@@ -2773,9 -3399,8 +3399,8 @@@ tracing_mark_write(struct file *filp, c
   {
         char *buf;
         char *end;
-       struct trace_array *tr = &global_trace;
   
-       if (!tr->ctrl || tracing_disabled)
+       if (tracing_disabled)
                 return -EINVAL;
   
         if (cnt > TRACE_BUF_SIZE)
@@@ -2841,22 -3466,38 +3466,38 @@@ static struct file_operations tracing_m
   
   #ifdef CONFIG_DYNAMIC_FTRACE
   
+ int __weak ftrace_arch_read_dyn_info(char *buf, int size)
+ {
+       return 0;
+ }
+ 
   static ssize_t
- tracing_read_long(struct file *filp, char __user *ubuf,
+ tracing_read_dyn_info(struct file *filp, char __user *ubuf,
                   size_t cnt, loff_t *ppos)
   {
+       static char ftrace_dyn_info_buffer[1024];
+       static DEFINE_MUTEX(dyn_info_mutex);
         unsigned long *p = filp->private_data;
-       char buf[64];
+       char *buf = ftrace_dyn_info_buffer;
+       int size = ARRAY_SIZE(ftrace_dyn_info_buffer);
         int r;
   
-       r = sprintf(buf, "%ld\n", *p);
+       mutex_lock(&dyn_info_mutex);
+       r = sprintf(buf, "%ld ", *p);
   
-       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+       r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r);
+       buf[r++] = '\n';
+ 
+       r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+ 
+       mutex_unlock(&dyn_info_mutex);
+ 
+       return r;
   }
   
- static struct file_operations tracing_read_long_fops = {
+ static struct file_operations tracing_dyn_info_fops = {
         .open           = tracing_open_generic,
-       .read           = tracing_read_long,
+       .read           = tracing_read_dyn_info,
   };
   #endif
   
@@@ -2897,10 -3538,10 +3538,10 @@@ static __init int tracer_init_debugfs(v
         if (!entry)
                 pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
   
-       entry = debugfs_create_file("iter_ctrl", 0644, d_tracer,
+       entry = debugfs_create_file("trace_options", 0644, d_tracer,
                                     NULL, &tracing_iter_fops);
         if (!entry)
-               pr_warning("Could not create debugfs 'iter_ctrl' entry\n");
+               pr_warning("Could not create debugfs 'trace_options' entry\n");
   
         entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
                                     NULL, &tracing_cpumask_fops);
@@@ -2950,11 -3591,11 +3591,11 @@@
                 pr_warning("Could not create debugfs "
                            "'trace_pipe' entry\n");
   
-       entry = debugfs_create_file("trace_entries", 0644, d_tracer,
+       entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer,
                                     &global_trace, &tracing_entries_fops);
         if (!entry)
                 pr_warning("Could not create debugfs "
-                          "'trace_entries' entry\n");
+                          "'buffer_size_kb' entry\n");
   
         entry = debugfs_create_file("trace_marker", 0220, d_tracer,
                                     NULL, &tracing_mark_fops);
@@@ -2965,7 -3606,7 +3606,7 @@@
   #ifdef CONFIG_DYNAMIC_FTRACE
         entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
                                     &ftrace_update_tot_cnt,
-                                   &tracing_read_long_fops);
+                                   &tracing_dyn_info_fops);
         if (!entry)
                 pr_warning("Could not create debugfs "
                            "'dyn_ftrace_total_info' entry\n");
@@@ -2976,7 -3617,7 +3617,7 @@@
         return 0;
   }
   
- int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
+ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
   {
         static DEFINE_SPINLOCK(trace_buf_lock);
         static char trace_buf[TRACE_BUF_SIZE];
@@@ -2984,11 -3625,11 +3625,11 @@@
         struct ring_buffer_event *event;
         struct trace_array *tr = &global_trace;
         struct trace_array_cpu *data;
-       struct print_entry *entry;
-       unsigned long flags, irq_flags;
         int cpu, len = 0, size, pc;
+       struct print_entry *entry;
+       unsigned long irq_flags;
   
-       if (!tr->ctrl || tracing_disabled)
+       if (tracing_disabled || tracing_selftest_running)
                 return 0;
   
         pc = preempt_count();
@@@ -2999,7 -3640,8 +3640,8 @@@
         if (unlikely(atomic_read(&data->disabled)))
                 goto out;
   
-       spin_lock_irqsave(&trace_buf_lock, flags);
+       pause_graph_tracing();
+       spin_lock_irqsave(&trace_buf_lock, irq_flags);
         len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
   
         len = min(len, TRACE_BUF_SIZE-1);
@@@ -3010,17 -3652,18 +3652,18 @@@
         if (!event)
                 goto out_unlock;
         entry = ring_buffer_event_data(event);
-       tracing_generic_entry_update(&entry->ent, flags, pc);
+       tracing_generic_entry_update(&entry->ent, irq_flags, pc);
         entry->ent.type                 = TRACE_PRINT;
         entry->ip                       = ip;
+       entry->depth                    = depth;
   
         memcpy(&entry->buf, trace_buf, len);
         entry->buf[len] = 0;
         ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
   
    out_unlock:
-       spin_unlock_irqrestore(&trace_buf_lock, flags);
- 
+       spin_unlock_irqrestore(&trace_buf_lock, irq_flags);
+       unpause_graph_tracing();
    out:
         preempt_enable_notrace();
   
@@@ -3037,7 -3680,7 +3680,7 @@@ int __ftrace_printk(unsigned long ip, c
                 return 0;
   
         va_start(ap, fmt);
-       ret = trace_vprintk(ip, fmt, ap);
+       ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
         va_end(ap);
         return ret;
   }
@@@ -3046,7 -3689,8 +3689,8 @@@ EXPORT_SYMBOL_GPL(__ftrace_printk)
   static int trace_panic_handler(struct notifier_block *this,
                                unsigned long event, void *unused)
   {
-       ftrace_dump();
+       if (ftrace_dump_on_oops)
+               ftrace_dump();
         return NOTIFY_OK;
   }
   
@@@ -3062,7 -3706,8 +3706,8 @@@ static int trace_die_handler(struct not
   {
         switch (val) {
         case DIE_OOPS:
-               ftrace_dump();
+               if (ftrace_dump_on_oops)
+                       ftrace_dump();
                 break;
         default:
                 break;
@@@ -3103,7 -3748,6 +3748,6 @@@ trace_printk_seq(struct trace_seq *s
         trace_seq_reset(s);
   }
   
- 
   void ftrace_dump(void)
   {
         static DEFINE_SPINLOCK(ftrace_dump_lock);
@@@ -3128,6 -3772,9 +3772,9 @@@
                 atomic_inc(&global_trace.data[cpu]->disabled);
         }
   
+       /* don't look at user memory in panic mode */
+       trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
+ 
         printk(KERN_TRACE "Dumping ftrace buffer:\n");
   
         iter.tr = &global_trace;
@@@ -3221,7 -3868,6 +3868,6 @@@ __init static int tracer_alloc_buffers(
   #endif
   
         /* All seems OK, enable tracing */
-       global_trace.ctrl = tracer_enabled;
         tracing_disabled = 0;
   
         atomic_notifier_chain_register(&panic_notifier_list,
author	Linus Torvalds <torvalds@linux-foundation.org>
	Sun, 28 Dec 2008 20:21:10 +0000 (12:21 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sun, 28 Dec 2008 20:21:10 +0000 (12:21 -0800)
		1	2
Documentation/kernel-parameters.txt	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/Kconfig.debug	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/msr.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/processor.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/thread_info.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/Makefile	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/apic.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/cpu/Makefile	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/entry_32.S	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/entry_64.S	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/irq_64.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/process.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/smpboot.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/vsyscall_64.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/mm/fault.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/mm.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/tty.h	patch \|	diff1 \|	diff2 \|	blob \| history
init/main.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/exit.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/fork.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/ptrace.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/signal.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sysctl.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/trace/trace.c	patch \|	diff1 \|	diff2 \|	blob \| history