* interrupt-retrigger: should we handle this via lost interrupts and IPIs
* or should we not care like we do now ? --BenH.
*/
- struct hw_interrupt_type;
+ struct irq_chip;
+#ifdef CONFIG_PERF_COUNTERS
+static inline unsigned long get_perf_counter_pending(void)
+{
+ unsigned long x;
+
+ asm volatile("lbz %0,%1(13)"
+ : "=r" (x)
+ : "i" (offsetof(struct paca_struct, perf_counter_pending)));
+ return x;
+}
+
+static inline void set_perf_counter_pending(int x)
+{
+ asm volatile("stb %0,%1(13)" : :
+ "r" (x),
+ "i" (offsetof(struct paca_struct, perf_counter_pending)));
+}
+
+extern void perf_counter_do_pending(void);
+
+#else
+
+static inline unsigned long get_perf_counter_pending(void)
+{
+ return 0;
+}
+
+static inline void set_perf_counter_pending(int x) {}
+static inline void perf_counter_do_pending(void) {}
+#endif /* CONFIG_PERF_COUNTERS */
+
#endif /* __KERNEL__ */
#endif /* _ASM_POWERPC_HW_IRQ_H */
SYSX(sys_ni_syscall,sys_olduname, sys_olduname)
COMPAT_SYS_SPU(umask)
SYSCALL_SPU(chroot)
- SYSCALL(ustat)
+ COMPAT_SYS(ustat)
SYSCALL_SPU(dup2)
SYSCALL_SPU(getppid)
SYSCALL_SPU(getpgrp)
SYSCALL_SPU(dup3)
SYSCALL_SPU(pipe2)
SYSCALL(inotify_init1)
+SYSCALL_SPU(perf_counter_open)
CFLAGS_REMOVE_prom_init.o = -pg -mno-sched-epilog
CFLAGS_REMOVE_btext.o = -pg -mno-sched-epilog
CFLAGS_REMOVE_prom.o = -pg -mno-sched-epilog
-
- ifdef CONFIG_DYNAMIC_FTRACE
- # dynamic ftrace setup.
+ # do not trace tracer code
CFLAGS_REMOVE_ftrace.o = -pg -mno-sched-epilog
- endif
-
+ # timers used by tracing
+ CFLAGS_REMOVE_time.o = -pg -mno-sched-epilog
endif
obj-y := cputable.o ptrace.o syscalls.o \
obj64-$(CONFIG_HIBERNATION) += swsusp_asm64.o
obj-$(CONFIG_MODULES) += module.o module_$(CONFIG_WORD_SIZE).o
obj-$(CONFIG_44x) += cpu_setup_44x.o
+ obj-$(CONFIG_FSL_BOOKE) += cpu_setup_fsl_booke.o dbell.o
extra-$(CONFIG_PPC_STD_MMU) := head_32.o
extra-$(CONFIG_PPC64) := head_64.o
obj-$(CONFIG_PPC32) += entry_32.o setup_32.o
obj-$(CONFIG_PPC64) += dma-iommu.o iommu.o
obj-$(CONFIG_KGDB) += kgdb.o
- obj-$(CONFIG_PPC_MULTIPLATFORM) += prom_init.o
+ obj-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE) += prom_init.o
obj-$(CONFIG_MODULES) += ppc_ksyms.o
obj-$(CONFIG_BOOTX_TEXT) += btext.o
obj-$(CONFIG_SMP) += smp.o
obj64-$(CONFIG_AUDIT) += compat_audit.o
obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
+ obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
+obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o power4-pmu.o ppc970-pmu.o \
+ power5-pmu.o power5+-pmu.o power6-pmu.o
obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
#include <asm/iseries/alpaca.h>
#endif
#ifdef CONFIG_KVM
- #include <asm/kvm_44x.h>
+ #include <linux/kvm_host.h>
#endif
#if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr));
DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled));
DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled));
+ DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_counter_pending));
DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache));
DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr));
DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
#endif /* ! CONFIG_PPC64 */
/* About the CPU features table */
- DEFINE(CPU_SPEC_ENTRY_SIZE, sizeof(struct cpu_spec));
- DEFINE(CPU_SPEC_PVR_MASK, offsetof(struct cpu_spec, pvr_mask));
- DEFINE(CPU_SPEC_PVR_VALUE, offsetof(struct cpu_spec, pvr_value));
DEFINE(CPU_SPEC_FEATURES, offsetof(struct cpu_spec, cpu_features));
DEFINE(CPU_SPEC_SETUP, offsetof(struct cpu_spec, cpu_setup));
DEFINE(CPU_SPEC_RESTORE, offsetof(struct cpu_spec, cpu_restore));
DEFINE(PTE_SIZE, sizeof(pte_t));
#ifdef CONFIG_KVM
- DEFINE(TLBE_BYTES, sizeof(struct kvmppc_44x_tlbe));
-
DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack));
DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
2:
TRACE_AND_RESTORE_IRQ(r5);
+#ifdef CONFIG_PERF_COUNTERS
+ /* check paca->perf_counter_pending if we're enabling ints */
+ lbz r3,PACAPERFPEND(r13)
+ and. r3,r3,r5
+ beq 27f
+ bl .perf_counter_do_pending
+27:
+#endif /* CONFIG_PERF_COUNTERS */
+
/* extract EE bit and use it to restore paca->hard_enabled */
ld r3,_MSR(r1)
rldicl r4,r3,49,63 /* r0 = (r3 >> 15) & 1 */
ftrace_call:
bl ftrace_stub
nop
+ #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ .globl ftrace_graph_call
+ ftrace_graph_call:
+ b ftrace_graph_stub
+ _GLOBAL(ftrace_graph_stub)
+ #endif
ld r0, 128(r1)
mtlr r0
addi r1, r1, 112
ld r5,0(r5)
mtctr r5
bctrl
-
nop
+
+
+ #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ b ftrace_graph_caller
+ #endif
ld r0, 128(r1)
mtlr r0
addi r1, r1, 112
_GLOBAL(ftrace_stub)
blr
- #endif
- #endif
+ #endif /* CONFIG_DYNAMIC_FTRACE */
+
+ #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ _GLOBAL(ftrace_graph_caller)
+ /* load r4 with local address */
+ ld r4, 128(r1)
+ subi r4, r4, MCOUNT_INSN_SIZE
+
+ /* get the parent address */
+ ld r11, 112(r1)
+ addi r3, r11, 16
+
+ bl .prepare_ftrace_return
+ nop
+
+ ld r0, 128(r1)
+ mtlr r0
+ addi r1, r1, 112
+ blr
+
+ _GLOBAL(return_to_handler)
+ /* need to save return values */
+ std r4, -24(r1)
+ std r3, -16(r1)
+ std r31, -8(r1)
+ mr r31, r1
+ stdu r1, -112(r1)
+
+ bl .ftrace_return_to_handler
+ nop
+
+ /* return value has real return address */
+ mtlr r3
+
+ ld r1, 0(r1)
+ ld r4, -24(r1)
+ ld r3, -16(r1)
+ ld r31, -8(r1)
+
+ /* Jump back to real return address */
+ blr
+
+ _GLOBAL(mod_return_to_handler)
+ /* need to save return values */
+ std r4, -32(r1)
+ std r3, -24(r1)
+ /* save TOC */
+ std r2, -16(r1)
+ std r31, -8(r1)
+ mr r31, r1
+ stdu r1, -112(r1)
+
+ /*
+ * We are in a module using the module's TOC.
+ * Switch to our TOC to run inside the core kernel.
+ */
+ LOAD_REG_IMMEDIATE(r4,ftrace_return_to_handler)
+ ld r2, 8(r4)
+
+ bl .ftrace_return_to_handler
+ nop
+
+ /* return value has real return address */
+ mtlr r3
+
+ ld r1, 0(r1)
+ ld r4, -32(r1)
+ ld r3, -24(r1)
+ ld r2, -16(r1)
+ ld r31, -8(r1)
+
+ /* Jump back to real return address */
+ blr
+ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+ #endif /* CONFIG_FUNCTION_TRACER */
: : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled)));
}
+#ifdef CONFIG_PERF_COUNTERS
+notrace void __weak perf_counter_do_pending(void)
+{
+ set_perf_counter_pending(0);
+}
+#endif
+
notrace void raw_local_irq_restore(unsigned long en)
{
/*
iseries_handle_interrupts();
}
+ if (get_perf_counter_pending())
+ perf_counter_do_pending();
+
/*
* if (get_paca()->hard_enabled) return;
* But again we need to take care that gcc gets hard_enabled directly
{
int i = *(loff_t *)v, j;
struct irqaction *action;
- irq_desc_t *desc;
+ struct irq_desc *desc;
unsigned long flags;
if (i == 0) {
seq_printf(p, "%3d: ", i);
#ifdef CONFIG_SMP
for_each_online_cpu(j)
- seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+ seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
#else
seq_printf(p, "%10u ", kstat_irqs(i));
#endif /* CONFIG_SMP */
static int virq_debug_show(struct seq_file *m, void *private)
{
unsigned long flags;
- irq_desc_t *desc;
+ struct irq_desc *desc;
const char *p;
char none[] = "none";
int i;
config PPC64
bool "64-bit kernel"
default n
+ select HAVE_PERF_COUNTERS
help
This option selects whether a 32-bit or a 64-bit kernel
will be built.
endchoice
+ # Until we have a choice of exclusive CPU types on 64-bit, we always
+ # use PPC_BOOK3S. On 32-bit, this is equivalent to 6xx which is
+ # "classic" MMU
+
+ config PPC_BOOK3S
+ def_bool y
+ depends on PPC64 || 6xx
+
config POWER4_ONLY
bool "Optimize for POWER4"
- depends on PPC64
+ depends on PPC64 && PPC_BOOK3S
default n
---help---
Cause the compiler to optimize for POWER4/POWER5/PPC970 processors.
config POWER3
bool
- depends on PPC64
+ depends on PPC64 && PPC_BOOK3S
default y if !POWER4_ONLY
config POWER4
- depends on PPC64
+ depends on PPC64 && PPC_BOOK3S
def_bool y
config TUNE_CELL
bool "Optimize for Cell Broadband Engine"
- depends on PPC64
+ depends on PPC64 && PPC_BOOK3S
help
Cause the compiler to optimize for the PPE of the Cell Broadband
Engine. This will make the code run considerably faster on Cell
config ALTIVEC
bool "AltiVec Support"
- depends on CLASSIC32 || POWER4
+ depends on 6xx || POWER4
---help---
This option enables kernel support for the Altivec extensions to the
PowerPC processor. The kernel currently supports saving and restoring
def_bool y
depends on !PPC_STD_MMU
+ config PPC_BOOK3E_MMU
+ def_bool y
+ depends on FSL_BOOKE
+
config PPC_MM_SLICES
bool
default y if HUGETLB_PAGE || (PPC_STD_MMU_64 && PPC_64K_PAGES)
select HAVE_FUNCTION_TRACER
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACE_MCOUNT_TEST
+ select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE
+ select HAVE_FTRACE_SYSCALLS
select HAVE_KVM
select HAVE_ARCH_KGDB
select HAVE_ARCH_TRACEHOOK
select HAVE_GENERIC_DMA_COHERENT if X86_32
select HAVE_EFFICIENT_UNALIGNED_ACCESS
select USER_STACKTRACE_SUPPORT
+ select HAVE_DMA_API_DEBUG
+ select HAVE_KERNEL_GZIP
+ select HAVE_KERNEL_BZIP2
+ select HAVE_KERNEL_LZMA
config ARCH_DEFCONFIG
string
config HAVE_SETUP_PER_CPU_AREA
def_bool y
+ config HAVE_DYNAMIC_PER_CPU_AREA
+ def_bool y
+
config HAVE_CPUMASK_OF_CPU_MAP
def_bool X86_64_SMP
config ARCH_SUPPORTS_OPTIMIZED_INLINING
def_bool y
+ config ARCH_SUPPORTS_DEBUG_PAGEALLOC
+ def_bool y
+
# Use the generic interrupt handling code in kernel/irq/:
config GENERIC_HARDIRQS
bool
default y
+ config GENERIC_HARDIRQS_NO__DO_IRQ
+ def_bool y
+
config GENERIC_IRQ_PROBE
bool
default y
config X86_LOCAL_APIC
def_bool y
depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
+ select HAVE_PERF_COUNTERS if (!M386 && !M486)
config X86_IO_APIC
def_bool y
Additional support for AMD specific MCE features such as
the DRAM Error Threshold.
+ config X86_MCE_THRESHOLD
+ depends on X86_MCE_AMD || X86_MCE_INTEL
+ bool
+ default y
+
config X86_MCE_NONFATAL
tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
depends on X86_32 && X86_MCE
with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
/dev/cpu/31/cpuid.
+ config X86_CPU_DEBUG
+ tristate "/sys/kernel/debug/x86/cpu/* - CPU Debug support"
+ ---help---
+ If you select this option, this will provide various x86 CPUs
+ information through debugfs.
+
choice
prompt "High Memory Support"
default HIGHMEM4G if !X86_NUMAQ
config NODES_SHIFT
int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP
- range 1 9 if X86_64
+ range 1 9
default "9" if MAXSMP
default "6" if X86_64
default "4" if X86_NUMAQ
depends on NEED_MULTIPLE_NODES
---help---
Specify the maximum number of NUMA Nodes available on the target
- system. Increases memory reserved to accomodate various tables.
+ system. Increases memory reserved to accommodate various tables.
- config HAVE_ARCH_BOOTMEM_NODE
+ config HAVE_ARCH_BOOTMEM
def_bool y
depends on X86_32 && NUMA
add writeback entries.
Can be disabled with disable_mtrr_cleanup on the kernel command line.
- The largest mtrr entry size for a continous block can be set with
+ The largest mtrr entry size for a continuous block can be set with
mtrr_chunk_size.
If unsure, say Y.
config KEXEC_JUMP
bool "kexec jump (EXPERIMENTAL)"
depends on EXPERIMENTAL
- depends on KEXEC && HIBERNATION && X86_32
+ depends on KEXEC && HIBERNATION
---help---
Jump between original kernel and kexeced kernel and invoke
code in physical address mode via KEXEC
config DMAR
bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
- depends on X86_64 && PCI_MSI && ACPI && EXPERIMENTAL
- ---help---
+ depends on PCI_MSI && ACPI && EXPERIMENTAL
+ help
DMA remapping (DMAR) devices support enables independent address
translations for Direct Memory Access (DMA) from devices.
These DMA remapping devices are reported via ACPI tables
.quad sys32_olduname
.quad sys_umask /* 60 */
.quad sys_chroot
- .quad sys32_ustat
+ .quad compat_sys_ustat
.quad sys_dup2
.quad sys_getppid
.quad sys_getpgrp /* 65 */
.quad compat_sys_signalfd4
.quad sys_eventfd2
.quad sys_epoll_create1
- .quad sys_dup3 /* 330 */
+ .quad sys_dup3 /* 330 */
.quad sys_pipe2
.quad sys_inotify_init1
+ .quad compat_sys_preadv
+ .quad compat_sys_pwritev
+ .quad sys_perf_counter_open
ia32_syscall_end:
unsigned int apic_timer_irqs; /* arch dependent */
unsigned int irq_spurious_count;
#endif
+ unsigned int generic_irqs; /* arch dependent */
+ unsigned int apic_perf_irqs;
#ifdef CONFIG_SMP
unsigned int irq_resched_count;
unsigned int irq_call_count;
/* Interrupt handlers registered during init_IRQ */
extern void apic_timer_interrupt(void);
+ extern void generic_interrupt(void);
extern void error_interrupt(void);
+extern void perf_counter_interrupt(void);
+
extern void spurious_interrupt(void);
extern void thermal_interrupt(void);
extern void reschedule_interrupt(void);
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SECCOMP 8 /* secure computing */
#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
+#define TIF_PERF_COUNTERS 11 /* notify perf counter work */
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
#define TIF_IA32 17 /* 32bit process */
#define TIF_FORK 18 /* ret_from_fork */
#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
+ #define TIF_SYSCALL_FTRACE 27 /* for ftrace syscall instrumentation */
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
+#define _TIF_PERF_COUNTERS (1 << TIF_PERF_COUNTERS)
#define _TIF_NOTSC (1 << TIF_NOTSC)
#define _TIF_IA32 (1 << TIF_IA32)
#define _TIF_FORK (1 << TIF_FORK)
#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
+ #define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE)
/* work to do in syscall_trace_enter() */
#define _TIF_WORK_SYSCALL_ENTRY \
- (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | \
+ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_FTRACE | \
_TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP)
/* work to do in syscall_trace_leave() */
#define _TIF_WORK_SYSCALL_EXIT \
- (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP)
+ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \
+ _TIF_SYSCALL_FTRACE)
/* work to do on interrupt/exception return */
#define _TIF_WORK_MASK \
_TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU))
/* work to do on any return to user space */
- #define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP)
+ #define _TIF_ALLWORK_MASK ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_FTRACE)
/* Only used for 64 bit */
#define _TIF_DO_NOTIFY_MASK \
- (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
+ (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERF_COUNTERS|_TIF_NOTIFY_RESUME)
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW \
#define __NR_dup3 330
#define __NR_pipe2 331
#define __NR_inotify_init1 332
+ #define __NR_preadv 333
+ #define __NR_pwritev 334
+#define __NR_perf_counter_open 333
#ifdef __KERNEL__
__SYSCALL(__NR_pipe2, sys_pipe2)
#define __NR_inotify_init1 294
__SYSCALL(__NR_inotify_init1, sys_inotify_init1)
-
+ #define __NR_preadv 295
+ __SYSCALL(__NR_preadv, sys_preadv)
+ #define __NR_pwritev 296
+ __SYSCALL(__NR_pwritev, sys_pwritev)
+#define __NR_perf_counter_open 295
+__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
#ifndef __NO_STUBS
#define __ARCH_WANT_OLD_READDIR
#include <linux/smp.h>
#include <linux/mm.h>
+#include <asm/perf_counter.h>
#include <asm/pgalloc.h>
#include <asm/atomic.h>
#include <asm/mpspec.h>
#include <asm/idle.h>
#include <asm/mtrr.h>
#include <asm/smp.h>
+ #include <asm/mce.h>
unsigned int num_processors;
inc_irq_stat(apic_timer_irqs);
evt->event_handler(evt);
+
+ perf_counter_unthrottle();
}
/*
u32 v;
/* APIC hasn't been mapped yet */
- if (!apic_phys)
+ if (!x2apic && !apic_phys)
return;
maxlvt = lapic_get_maxlvt();
apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
}
#endif
+ #ifdef CONFIG_X86_MCE_INTEL
+ if (maxlvt >= 6) {
+ v = apic_read(APIC_LVTCMCI);
+ if (!(v & APIC_LVT_MASKED))
+ apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED);
+ }
+ #endif
+
/*
* Clean APIC state for other OSs:
*/
apic_write(APIC_ESR, 0);
}
#endif
+ perf_counters_lapic_init(0);
preempt_disable();
apic_write(APIC_LVT1, value);
preempt_enable();
+
+ #ifdef CONFIG_X86_MCE_INTEL
+ /* Recheck CMCI information after local APIC is up on CPU #0 */
+ if (smp_processor_id() == 0)
+ cmci_recheck();
+ #endif
}
void __cpuinit end_local_APIC_setup(void)
return;
}
- local_irq_save(flags);
- mask_8259A();
-
- ret = save_mask_IO_APIC_setup();
+ ret = save_IO_APIC_setup();
if (ret) {
pr_info("Saving IO-APIC state failed: %d\n", ret);
goto end;
}
+ local_irq_save(flags);
+ mask_IO_APIC_setup();
+ mask_8259A();
+
ret = enable_intr_remapping(1);
if (ret && x2apic_preenabled) {
else
reinit_intr_remapped_IO_APIC(x2apic_preenabled);
unmask_8259A();
local_irq_restore(flags);
+ end:
if (!ret) {
if (!x2apic_preenabled)
pr_info("Enabled x2apic and interrupt-remapping\n");
*/
void __init init_apic_mappings(void)
{
- #ifdef CONFIG_X86_X2APIC
if (x2apic) {
boot_cpu_physical_apicid = read_apic_id();
return;
}
- #endif
/*
* If no local APIC can be found then set up a fake all
local_irq_save(flags);
- #ifdef CONFIG_X86_X2APIC
if (x2apic)
enable_x2apic();
- else
- #endif
- {
+ else {
/*
* Make sure the APICBASE points to the right address
*
#
-# Makefile for x86-compatible CPU details and quirks
+# Makefile for x86-compatible CPU details, features and quirks
#
# Don't trace early stages of a secondary CPU boot
obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
obj-$(CONFIG_X86_64) += bugs_64.o
+ obj-$(CONFIG_X86_CPU_DEBUG) += cpu_debug.o
+
obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
obj-$(CONFIG_CPU_SUP_AMD) += amd.o
obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
- obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o
- obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o
+ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
-obj-$(CONFIG_X86_MCE) += mcheck/
-obj-$(CONFIG_MTRR) += mtrr/
-obj-$(CONFIG_CPU_FREQ) += cpufreq/
+obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
-obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
+obj-$(CONFIG_X86_MCE) += mcheck/
+obj-$(CONFIG_MTRR) += mtrr/
+obj-$(CONFIG_CPU_FREQ) += cpufreq/
+
+obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
quiet_cmd_mkcapflags = MKCAP $@
cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/apic.h>
+ #include <asm/cpu.h>
#ifdef CONFIG_X86_64
# include <asm/numa_64.h>
}
}
+ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
+ {
+ #ifdef CONFIG_SMP
+ /* calling is from identify_secondary_cpu() ? */
+ if (c->cpu_index == boot_cpu_id)
+ return;
+
+ /*
+ * Certain Athlons might work (for various values of 'work') in SMP
+ * but they are not certified as MP capable.
+ */
+ /* Athlon 660/661 is valid. */
+ if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
+ (c->x86_mask == 1)))
+ goto valid_k7;
+
+ /* Duron 670 is valid */
+ if ((c->x86_model == 7) && (c->x86_mask == 0))
+ goto valid_k7;
+
+ /*
+ * Athlon 662, Duron 671, and Athlon >model 7 have capability
+ * bit. It's worth noting that the A5 stepping (662) of some
+ * Athlon XP's have the MP bit set.
+ * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
+ * more.
+ */
+ if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
+ ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
+ (c->x86_model > 7))
+ if (cpu_has_mp)
+ goto valid_k7;
+
+ /* If we get here, not a certified SMP capable AMD system. */
+
+ /*
+ * Don't taint if we are running SMP kernel on a single non-MP
+ * approved Athlon
+ */
+ WARN_ONCE(1, "WARNING: This combination of AMD"
+ "processors is not suitable for SMP.\n");
+ if (!test_taint(TAINT_UNSAFE_SMP))
+ add_taint(TAINT_UNSAFE_SMP);
+
+ valid_k7:
+ ;
+ #endif
+ }
+
static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
{
u32 l, h;
}
set_cpu_cap(c, X86_FEATURE_K7);
+
+ amd_k7_smp_check(c);
}
#endif
if (c->x86 >= 6)
set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
+ /* Enable Performance counter for K7 and later */
+ if (c->x86 > 6 && c->x86 <= 0x11)
+ set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
+
if (!c->x86_model_id[0]) {
switch (c->x86) {
case 0xf:
}
#endif
- static struct cpu_dev amd_cpu_dev __cpuinitdata = {
+ static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
.c_vendor = "AMD",
.c_ident = { "AuthenticAMD" },
#ifdef CONFIG_X86_32
- #include <linux/init.h>
- #include <linux/kernel.h>
- #include <linux/sched.h>
- #include <linux/string.h>
#include <linux/bootmem.h>
+ #include <linux/linkage.h>
#include <linux/bitops.h>
+ #include <linux/kernel.h>
#include <linux/module.h>
- #include <linux/kgdb.h>
- #include <linux/topology.h>
+ #include <linux/percpu.h>
+ #include <linux/string.h>
#include <linux/delay.h>
+ #include <linux/sched.h>
+ #include <linux/init.h>
+ #include <linux/kgdb.h>
#include <linux/smp.h>
- #include <linux/percpu.h>
- #include <asm/i387.h>
- #include <asm/msr.h>
- #include <asm/io.h>
- #include <asm/linkage.h>
+ #include <linux/io.h>
+
+ #include <asm/stackprotector.h>
++#include <asm/perf_counter.h>
#include <asm/mmu_context.h>
+ #include <asm/hypervisor.h>
+ #include <asm/processor.h>
+ #include <asm/sections.h>
+ #include <asm/topology.h>
+ #include <asm/cpumask.h>
+ #include <asm/pgtable.h>
+ #include <asm/atomic.h>
+ #include <asm/proto.h>
+ #include <asm/setup.h>
+ #include <asm/apic.h>
+ #include <asm/desc.h>
+ #include <asm/i387.h>
#include <asm/mtrr.h>
+ #include <asm/numa.h>
+ #include <asm/asm.h>
+ #include <asm/cpu.h>
#include <asm/mce.h>
- #include <asm/perf_counter.h>
+ #include <asm/msr.h>
#include <asm/pat.h>
- #include <asm/asm.h>
- #include <asm/numa.h>
#include <asm/smp.h>
- #include <asm/cpu.h>
- #include <asm/cpumask.h>
- #include <asm/apic.h>
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/uv/uv.h>
#endif
- #include <asm/pgtable.h>
- #include <asm/processor.h>
- #include <asm/desc.h>
- #include <asm/atomic.h>
- #include <asm/proto.h>
- #include <asm/sections.h>
- #include <asm/setup.h>
- #include <asm/hypervisor.h>
- #include <asm/stackprotector.h>
-
#include "cpu.h"
- #ifdef CONFIG_X86_64
-
/* all of these masks are initialized in setup_cpu_local_masks() */
- cpumask_var_t cpu_callin_mask;
- cpumask_var_t cpu_callout_mask;
cpumask_var_t cpu_initialized_mask;
+ cpumask_var_t cpu_callout_mask;
+ cpumask_var_t cpu_callin_mask;
/* representing cpus for which sibling maps can be computed */
cpumask_var_t cpu_sibling_setup_mask;
alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
}
- #else /* CONFIG_X86_32 */
-
- cpumask_t cpu_callin_map;
- cpumask_t cpu_callout_map;
- cpumask_t cpu_initialized;
- cpumask_t cpu_sibling_setup_map;
-
- #endif /* CONFIG_X86_32 */
-
-
- static struct cpu_dev *this_cpu __cpuinitdata;
+ static const struct cpu_dev *this_cpu __cpuinitdata;
DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
#ifdef CONFIG_X86_64
* IRET will check the segment types kkeil 2000/10/28
* Also sysret mandates a special GDT layout
*
- * The TLS descriptors are currently at a different place compared to i386.
+ * TLS descriptors are currently at a different place compared to i386.
* Hopefully nobody expects them at a fixed place (Wine?)
*/
- [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
- [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
- [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
- [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
- [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
- [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
+ [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
+ [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
+ [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
+ [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
+ [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
+ [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
#else
- [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
- [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
- [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
- [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
+ [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
+ [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
+ [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
+ [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
/*
* Segments used for calling PnP BIOS have byte granularity.
* They code segments and data segments have fixed 64k limits,
* the transfer segment sizes are set at run time.
*/
/* 32-bit code */
- [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
+ [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
/* 16-bit code */
- [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
+ [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
/* 16-bit data */
- [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
+ [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
/* 16-bit data */
- [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
+ [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
/* 16-bit data */
- [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
+ [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
/*
* The APM segments have byte granularity and their bases
* are set at run time. All have 64k limits.
*/
/* 32-bit code */
- [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
+ [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
/* 16-bit code */
- [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
+ [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
/* data */
- [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
+ [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
- [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
- [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
+ [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
+ [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
GDT_STACK_CANARY_INIT
#endif
} };
* the CPUID. Add "volatile" to not allow gcc to
* optimize the subsequent calls to this function.
*/
- asm volatile ("pushfl\n\t"
- "pushfl\n\t"
- "popl %0\n\t"
- "movl %0,%1\n\t"
- "xorl %2,%0\n\t"
- "pushl %0\n\t"
- "popfl\n\t"
- "pushfl\n\t"
- "popl %0\n\t"
- "popfl\n\t"
+ asm volatile ("pushfl \n\t"
+ "pushfl \n\t"
+ "popl %0 \n\t"
+ "movl %0, %1 \n\t"
+ "xorl %2, %0 \n\t"
+ "pushl %0 \n\t"
+ "popfl \n\t"
+ "pushfl \n\t"
+ "popl %0 \n\t"
+ "popfl \n\t"
+
: "=&r" (f1), "=&r" (f2)
: "ir" (flag));
static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
{
- if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
- /* Disable processor serial number */
- unsigned long lo, hi;
- rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
- lo |= 0x200000;
- wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
- printk(KERN_NOTICE "CPU serial number disabled.\n");
- clear_cpu_cap(c, X86_FEATURE_PN);
-
- /* Disabling the serial number may affect the cpuid level */
- c->cpuid_level = cpuid_eax(0);
- }
+ unsigned long lo, hi;
+
+ if (!cpu_has(c, X86_FEATURE_PN) || !disable_x86_serial_nr)
+ return;
+
+ /* Disable processor serial number: */
+
+ rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+ lo |= 0x200000;
+ wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+
+ printk(KERN_NOTICE "CPU serial number disabled.\n");
+ clear_cpu_cap(c, X86_FEATURE_PN);
+
+ /* Disabling the serial number may affect the cpuid level */
+ c->cpuid_level = cpuid_eax(0);
}
static int __init x86_serial_nr_setup(char *s)
u32 feature;
u32 level;
};
+
static const struct cpuid_dependent_feature __cpuinitconst
cpuid_dependent_features[] = {
{ X86_FEATURE_MWAIT, 0x00000005 },
static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
{
const struct cpuid_dependent_feature *df;
+
for (df = cpuid_dependent_features; df->feature; df++) {
+
+ if (!cpu_has(c, df->feature))
+ continue;
/*
* Note: cpuid_level is set to -1 if unavailable, but
* extended_extended_level is set to 0 if unavailable
* when signed; hence the weird messing around with
* signs here...
*/
- if (cpu_has(c, df->feature) &&
- ((s32)df->level < 0 ?
+ if (!((s32)df->level < 0 ?
(u32)df->level > (u32)c->extended_cpuid_level :
- (s32)df->level > (s32)c->cpuid_level)) {
- clear_cpu_cap(c, df->feature);
- if (warn)
- printk(KERN_WARNING
- "CPU: CPU feature %s disabled "
- "due to lack of CPUID level 0x%x\n",
- x86_cap_flags[df->feature],
- df->level);
- }
+ (s32)df->level > (s32)c->cpuid_level))
+ continue;
+
+ clear_cpu_cap(c, df->feature);
+ if (!warn)
+ continue;
+
+ printk(KERN_WARNING
+ "CPU: CPU feature %s disabled, no CPUID level 0x%x\n",
+ x86_cap_flags[df->feature], df->level);
}
}
/*
* Naming convention should be: <Name> [(<Codename>)]
* This table only is used unless init_<vendor>() below doesn't set it;
- * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
- *
+ * in particular, if CPUID levels 0x80000002..4 are supported, this
+ * isn't used
*/
/* Look up CPU names by table lookup. */
- static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
+ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
{
- struct cpu_model_info *info;
+ const struct cpu_model_info *info;
if (c->x86_model >= 16)
return NULL; /* Range check */
load_stack_canary_segment();
}
- /* Current gdt points %fs at the "master" per-cpu area: after this,
- * it's on the real one. */
+ /*
+ * Current gdt points %fs at the "master" per-cpu area: after this,
+ * it's on the real one.
+ */
void switch_to_new_gdt(int cpu)
{
struct desc_ptr gdt_descr;
load_percpu_segment(cpu);
}
- static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
+ static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {};
static void __cpuinit default_init(struct cpuinfo_x86 *c)
{
#endif
}
- static struct cpu_dev __cpuinitdata default_cpu = {
+ static const struct cpu_dev __cpuinitconst default_cpu = {
.c_init = default_init,
.c_vendor = "Unknown",
.c_x86_vendor = X86_VENDOR_UNKNOWN,
if (c->extended_cpuid_level < 0x80000004)
return;
- v = (unsigned int *) c->x86_model_id;
+ v = (unsigned int *)c->x86_model_id;
cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
c->x86_model_id[48] = 0;
- /* Intel chips right-justify this string for some dumb reason;
- undo that brain damage */
+ /*
+ * Intel chips right-justify this string for some dumb reason;
+ * undo that brain damage:
+ */
p = q = &c->x86_model_id[0];
while (*p == ' ')
- p++;
+ p++;
if (p != q) {
- while (*p)
- *q++ = *p++;
- while (q <= &c->x86_model_id[48])
- *q++ = '\0'; /* Zero-pad the rest */
+ while (*p)
+ *q++ = *p++;
+ while (q <= &c->x86_model_id[48])
+ *q++ = '\0'; /* Zero-pad the rest */
}
}
if (smp_num_siblings == 1) {
printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
- } else if (smp_num_siblings > 1) {
+ goto out;
+ }
- if (smp_num_siblings > nr_cpu_ids) {
- printk(KERN_WARNING "CPU: Unsupported number of siblings %d",
- smp_num_siblings);
- smp_num_siblings = 1;
- return;
- }
+ if (smp_num_siblings <= 1)
+ goto out;
- index_msb = get_count_order(smp_num_siblings);
- c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
+ if (smp_num_siblings > nr_cpu_ids) {
+ pr_warning("CPU: Unsupported number of siblings %d",
+ smp_num_siblings);
+ smp_num_siblings = 1;
+ return;
+ }
- smp_num_siblings = smp_num_siblings / c->x86_max_cores;
+ index_msb = get_count_order(smp_num_siblings);
+ c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
- index_msb = get_count_order(smp_num_siblings);
+ smp_num_siblings = smp_num_siblings / c->x86_max_cores;
- core_bits = get_count_order(c->x86_max_cores);
+ index_msb = get_count_order(smp_num_siblings);
- c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
- ((1 << core_bits) - 1);
- }
+ core_bits = get_count_order(c->x86_max_cores);
+
+ c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
+ ((1 << core_bits) - 1);
out:
if ((c->x86_max_cores * smp_num_siblings) > 1) {
static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
{
char *v = c->x86_vendor_id;
- int i;
static int printed;
+ int i;
for (i = 0; i < X86_VENDOR_NUM; i++) {
if (!cpu_devs[i])
if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
(cpu_devs[i]->c_ident[1] &&
!strcmp(v, cpu_devs[i]->c_ident[1]))) {
+
this_cpu = cpu_devs[i];
c->x86_vendor = this_cpu->c_x86_vendor;
return;
if (!printed) {
printed++;
- printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v);
+ printk(KERN_ERR
+ "CPU: vendor_id '%s' unknown, using generic init.\n", v);
+
printk(KERN_ERR "CPU: Your system may be unstable.\n");
}
/* Intel-defined flags: level 0x00000001 */
if (c->cpuid_level >= 0x00000001) {
u32 junk, tfms, cap0, misc;
+
cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
c->x86 = (tfms >> 8) & 0xf;
c->x86_model = (tfms >> 4) & 0xf;
c->x86_mask = tfms & 0xf;
+
if (c->x86 == 0xf)
c->x86 += (tfms >> 20) & 0xff;
if (c->x86 >= 0x6)
c->x86_model += ((tfms >> 16) & 0xf) << 4;
+
if (cap0 & (1<<19)) {
c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
c->x86_cache_alignment = c->x86_clflush_size;
/* Intel-defined flags: level 0x00000001 */
if (c->cpuid_level >= 0x00000001) {
u32 capability, excap;
+
cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
c->x86_capability[0] = capability;
c->x86_capability[4] = excap;
/* AMD-defined flags: level 0x80000001 */
xlvl = cpuid_eax(0x80000000);
c->extended_cpuid_level = xlvl;
+
if ((xlvl & 0xffff0000) == 0x80000000) {
if (xlvl >= 0x80000001) {
c->x86_capability[1] = cpuid_edx(0x80000001);
}
}
- #ifdef CONFIG_X86_64
if (c->extended_cpuid_level >= 0x80000008) {
u32 eax = cpuid_eax(0x80000008);
c->x86_virt_bits = (eax >> 8) & 0xff;
c->x86_phys_bits = eax & 0xff;
}
+ #ifdef CONFIG_X86_32
+ else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
+ c->x86_phys_bits = 36;
#endif
if (c->extended_cpuid_level >= 0x80000007)
{
#ifdef CONFIG_X86_64
c->x86_clflush_size = 64;
+ c->x86_phys_bits = 36;
+ c->x86_virt_bits = 48;
#else
c->x86_clflush_size = 32;
+ c->x86_phys_bits = 32;
+ c->x86_virt_bits = 32;
#endif
c->x86_cache_alignment = c->x86_clflush_size;
void __init early_cpu_init(void)
{
- struct cpu_dev **cdev;
+ const struct cpu_dev *const *cdev;
int count = 0;
- printk("KERNEL supported cpus:\n");
+ printk(KERN_INFO "KERNEL supported cpus:\n");
for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
- struct cpu_dev *cpudev = *cdev;
+ const struct cpu_dev *cpudev = *cdev;
unsigned int j;
if (count >= X86_VENDOR_NUM)
for (j = 0; j < 2; j++) {
if (!cpudev->c_ident[j])
continue;
- printk(" %s %s\n", cpudev->c_vendor,
+ printk(KERN_INFO " %s %s\n", cpudev->c_vendor,
cpudev->c_ident[j]);
}
}
c->x86_coreid_bits = 0;
#ifdef CONFIG_X86_64
c->x86_clflush_size = 64;
+ c->x86_phys_bits = 36;
+ c->x86_virt_bits = 48;
#else
c->cpuid_level = -1; /* CPUID not detected */
c->x86_clflush_size = 32;
+ c->x86_phys_bits = 32;
+ c->x86_virt_bits = 32;
#endif
c->x86_cache_alignment = c->x86_clflush_size;
memset(&c->x86_capability, 0, sizeof c->x86_capability);
squash_the_stupid_serial_number(c);
/*
- * The vendor-specific functions might have changed features. Now
- * we do "generic changes."
+ * The vendor-specific functions might have changed features.
+ * Now we do "generic changes."
*/
/* Filter out anything that depends on CPUID levels we don't have */
/* If the model name is still unset, do table lookup. */
if (!c->x86_model_id[0]) {
- char *p;
+ const char *p;
p = table_lookup_model(c);
if (p)
strcpy(c->x86_model_id, p);
void __init identify_boot_cpu(void)
{
identify_cpu(&boot_cpu_data);
+ init_c1e_mask();
#ifdef CONFIG_X86_32
sysenter_setup();
enable_sep_cpu();
#else
vgetcpu_set_mode();
#endif
+ init_hw_perf_counters();
}
void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
}
struct msr_range {
- unsigned min;
- unsigned max;
+ unsigned min;
+ unsigned max;
};
- static struct msr_range msr_range_array[] __cpuinitdata = {
+ static const struct msr_range msr_range_array[] __cpuinitconst = {
{ 0x00000000, 0x00000418},
{ 0xc0000000, 0xc000040b},
{ 0xc0010000, 0xc0010142},
static void __cpuinit print_cpu_msr(void)
{
+ unsigned index_min, index_max;
unsigned index;
u64 val;
int i;
- unsigned index_min, index_max;
for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
index_min = msr_range_array[i].min;
index_max = msr_range_array[i].max;
+
for (index = index_min; index < index_max; index++) {
if (rdmsrl_amd_safe(index, &val))
continue;
}
static int show_msr __cpuinitdata;
+
static __init int setup_show_msr(char *arg)
{
int num;
void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
{
- char *vendor = NULL;
+ const char *vendor = NULL;
- if (c->x86_vendor < X86_VENDOR_NUM)
+ if (c->x86_vendor < X86_VENDOR_NUM) {
vendor = this_cpu->c_vendor;
- else if (c->cpuid_level >= 0)
- vendor = c->x86_vendor_id;
+ } else {
+ if (c->cpuid_level >= 0)
+ vendor = c->x86_vendor_id;
+ }
if (vendor && !strstr(c->x86_model_id, vendor))
printk(KERN_CONT "%s ", vendor);
static __init int setup_disablecpuid(char *arg)
{
int bit;
+
if (get_option(&arg, &bit) && bit < NCAPINTS*32)
setup_clear_cpu_cap(bit);
else
return 0;
+
return 1;
}
__setup("clearcpuid=", setup_disablecpuid);
DEFINE_PER_CPU_FIRST(union irq_stack_union,
irq_stack_union) __aligned(PAGE_SIZE);
+
DEFINE_PER_CPU(char *, irq_stack_ptr) =
init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
DEFINE_PER_CPU(unsigned int, irq_count) = -1;
+ /*
+ * Special IST stacks which the CPU switches to when it calls
+ * an IST-marked descriptor entry. Up to 7 stacks (hardware
+ * limit), all of them are 4K, except the debug stack which
+ * is 8K.
+ */
+ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
+ [DEBUG_STACK - 1] = DEBUG_STKSZ
+ };
+
static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
__aligned(PAGE_SIZE);
- extern asmlinkage void ignore_sysret(void);
-
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
*/
DEFINE_PER_CPU(struct orig_ist, orig_ist);
- #else /* x86_64 */
+ #else /* CONFIG_X86_64 */
#ifdef CONFIG_CC_STACKPROTECTOR
DEFINE_PER_CPU(unsigned long, stack_canary);
memset(regs, 0, sizeof(struct pt_regs));
regs->fs = __KERNEL_PERCPU;
regs->gs = __KERNEL_STACK_CANARY;
+
return regs;
}
- #endif /* x86_64 */
+ #endif /* CONFIG_X86_64 */
+
+ /*
+ * Clear all 6 debug registers:
+ */
+ static void clear_all_debug_regs(void)
+ {
+ int i;
+
+ for (i = 0; i < 8; i++) {
+ /* Ignore db4, db5 */
+ if ((i == 4) || (i == 5))
+ continue;
+
+ set_debugreg(0, i);
+ }
+ }
/*
* cpu_init() initializes state that is per-CPU. Some data is already
* A lot of state is already set up in PDA init for 64 bit
*/
#ifdef CONFIG_X86_64
+
void __cpuinit cpu_init(void)
{
- int cpu = stack_smp_processor_id();
- struct tss_struct *t = &per_cpu(init_tss, cpu);
- struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
- unsigned long v;
+ struct orig_ist *orig_ist;
struct task_struct *me;
+ struct tss_struct *t;
+ unsigned long v;
+ int cpu;
int i;
+ cpu = stack_smp_processor_id();
+ t = &per_cpu(init_tss, cpu);
+ orig_ist = &per_cpu(orig_ist, cpu);
+
#ifdef CONFIG_NUMA
if (cpu != 0 && percpu_read(node_number) == 0 &&
cpu_to_node(cpu) != NUMA_NO_NODE)
* set up and load the per-CPU TSS
*/
if (!orig_ist->ist[0]) {
- static const unsigned int sizes[N_EXCEPTION_STACKS] = {
- [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
- [DEBUG_STACK - 1] = DEBUG_STKSZ
- };
char *estacks = per_cpu(exception_stacks, cpu);
+
for (v = 0; v < N_EXCEPTION_STACKS; v++) {
- estacks += sizes[v];
+ estacks += exception_stack_sizes[v];
orig_ist->ist[v] = t->x86_tss.ist[v] =
(unsigned long)estacks;
}
}
t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+
/*
* <= is required because the CPU will access up to
* 8 bits beyond the end of the IO permission bitmap.
atomic_inc(&init_mm.mm_count);
me->active_mm = &init_mm;
- if (me->mm)
- BUG();
+ BUG_ON(me->mm);
enter_lazy_tlb(&init_mm, me);
load_sp0(t, ¤t->thread);
arch_kgdb_ops.correct_hw_break();
else
#endif
- {
- /*
- * Clear all 6 debug registers:
- */
- set_debugreg(0UL, 0);
- set_debugreg(0UL, 1);
- set_debugreg(0UL, 2);
- set_debugreg(0UL, 3);
- set_debugreg(0UL, 6);
- set_debugreg(0UL, 7);
- }
+ clear_all_debug_regs();
fpu_init();
if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
- for (;;) local_irq_enable();
+ for (;;)
+ local_irq_enable();
}
printk(KERN_INFO "Initializing CPU#%d\n", cpu);
*/
atomic_inc(&init_mm.mm_count);
curr->active_mm = &init_mm;
- if (curr->mm)
- BUG();
+ BUG_ON(curr->mm);
enter_lazy_tlb(&init_mm, curr);
load_sp0(t, thread);
__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
#endif
- /* Clear all 6 debug registers: */
- set_debugreg(0, 0);
- set_debugreg(0, 1);
- set_debugreg(0, 2);
- set_debugreg(0, 3);
- set_debugreg(0, 6);
- set_debugreg(0, 7);
+ clear_all_debug_regs();
/*
* Force FPU initialization:
xsave_init();
}
-
-
#endif
END(save_rest)
/* save complete stack frame */
+ .pushsection .kprobes.text, "ax"
ENTRY(save_paranoid)
XCPT_FRAME 1 RDI+8
cld
1: ret
CFI_ENDPROC
END(save_paranoid)
+ .popsection
/*
* A newly forked process directly context switches into this address.
GET_THREAD_INFO(%rcx)
- CFI_REMEMBER_STATE
RESTORE_REST
testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
jmp ret_from_sys_call # go to the SYSRET fastpath
- CFI_RESTORE_STATE
CFI_ENDPROC
END(ret_from_fork)
#endif
apicinterrupt LOCAL_TIMER_VECTOR \
apic_timer_interrupt smp_apic_timer_interrupt
+ apicinterrupt GENERIC_INTERRUPT_VECTOR \
+ generic_interrupt smp_generic_interrupt
#ifdef CONFIG_SMP
apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
apicinterrupt SPURIOUS_APIC_VECTOR \
spurious_interrupt smp_spurious_interrupt
+#ifdef CONFIG_PERF_COUNTERS
+apicinterrupt LOCAL_PERF_VECTOR \
+ perf_counter_interrupt smp_perf_counter_interrupt
+#endif
+
/*
* Exception entry points.
*/
atomic_t irq_err_count;
+ /* Function pointer for generic interrupt vector handling */
+ void (*generic_interrupt_extension)(void) = NULL;
+
/*
* 'what should we do if we get a hw irq event on an illegal vector'.
* each architecture has to answer this themselves.
/*
* /proc/interrupts printing:
*/
- static int show_other_interrupts(struct seq_file *p)
+ static int show_other_interrupts(struct seq_file *p, int prec)
{
int j;
- seq_printf(p, "NMI: ");
+ seq_printf(p, "%*s: ", prec, "NMI");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
seq_printf(p, " Non-maskable interrupts\n");
#ifdef CONFIG_X86_LOCAL_APIC
- seq_printf(p, "LOC: ");
+ seq_printf(p, "%*s: ", prec, "LOC");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
seq_printf(p, " Local timer interrupts\n");
+
+ seq_printf(p, "%*s: ", prec, "SPU");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
+ seq_printf(p, " Spurious interrupts\n");
+ seq_printf(p, "CNT: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
+ seq_printf(p, " Performance counter interrupts\n");
#endif
+ if (generic_interrupt_extension) {
+ seq_printf(p, "PLT: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->generic_irqs);
+ seq_printf(p, " Platform interrupts\n");
+ }
#ifdef CONFIG_SMP
- seq_printf(p, "RES: ");
+ seq_printf(p, "%*s: ", prec, "RES");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
seq_printf(p, " Rescheduling interrupts\n");
- seq_printf(p, "CAL: ");
+ seq_printf(p, "%*s: ", prec, "CAL");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
seq_printf(p, " Function call interrupts\n");
- seq_printf(p, "TLB: ");
+ seq_printf(p, "%*s: ", prec, "TLB");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
seq_printf(p, " TLB shootdowns\n");
#endif
#ifdef CONFIG_X86_MCE
- seq_printf(p, "TRM: ");
+ seq_printf(p, "%*s: ", prec, "TRM");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
seq_printf(p, " Thermal event interrupts\n");
# ifdef CONFIG_X86_64
- seq_printf(p, "THR: ");
+ seq_printf(p, "%*s: ", prec, "THR");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
seq_printf(p, " Threshold APIC interrupts\n");
# endif
#endif
- #ifdef CONFIG_X86_LOCAL_APIC
- seq_printf(p, "SPU: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
- seq_printf(p, " Spurious interrupts\n");
- #endif
- seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
+ seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
#if defined(CONFIG_X86_IO_APIC)
- seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
+ seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
#endif
return 0;
}
int show_interrupts(struct seq_file *p, void *v)
{
unsigned long flags, any_count = 0;
- int i = *(loff_t *) v, j;
+ int i = *(loff_t *) v, j, prec;
struct irqaction *action;
struct irq_desc *desc;
if (i > nr_irqs)
return 0;
+ for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
+ j *= 10;
+
if (i == nr_irqs)
- return show_other_interrupts(p);
+ return show_other_interrupts(p, prec);
/* print header */
if (i == 0) {
- seq_printf(p, " ");
+ seq_printf(p, "%*s", prec + 8, "");
for_each_online_cpu(j)
seq_printf(p, "CPU%-8d", j);
seq_putc(p, '\n');
return 0;
spin_lock_irqsave(&desc->lock, flags);
- #ifndef CONFIG_SMP
- any_count = kstat_irqs(i);
- #else
for_each_online_cpu(j)
any_count |= kstat_irqs_cpu(i, j);
- #endif
action = desc->action;
if (!action && !any_count)
goto out;
- seq_printf(p, "%3d: ", i);
- #ifndef CONFIG_SMP
- seq_printf(p, "%10u ", kstat_irqs(i));
- #else
+ seq_printf(p, "%*d: ", prec, i);
for_each_online_cpu(j)
seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
- #endif
seq_printf(p, " %8s", desc->chip->name);
seq_printf(p, "-%-8s", desc->name);
#ifdef CONFIG_X86_LOCAL_APIC
sum += irq_stats(cpu)->apic_timer_irqs;
+ sum += irq_stats(cpu)->irq_spurious_count;
+ sum += irq_stats(cpu)->apic_perf_irqs;
#endif
+ if (generic_interrupt_extension)
+ sum += irq_stats(cpu)->generic_irqs;
#ifdef CONFIG_SMP
sum += irq_stats(cpu)->irq_resched_count;
sum += irq_stats(cpu)->irq_call_count;
sum += irq_stats(cpu)->irq_threshold_count;
#endif
#endif
- #ifdef CONFIG_X86_LOCAL_APIC
- sum += irq_stats(cpu)->irq_spurious_count;
- #endif
return sum;
}
return 1;
}
+ /*
+ * Handler for GENERIC_INTERRUPT_VECTOR.
+ */
+ void smp_generic_interrupt(struct pt_regs *regs)
+ {
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ ack_APIC_irq();
+
+ exit_idle();
+
+ irq_enter();
+
+ inc_irq_stat(generic_irqs);
+
+ if (generic_interrupt_extension)
+ generic_interrupt_extension();
+
+ irq_exit();
+
+ set_irq_regs(old_regs);
+ }
+
EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
*/
static struct irqaction fpu_irq = {
.handler = math_error_irq,
- .mask = CPU_MASK_NONE,
.name = "fpu",
};
*/
static struct irqaction irq2 = {
.handler = no_action,
- .mask = CPU_MASK_NONE,
.name = "cascade",
};
return 0;
}
-/* Overridden in paravirt.c */
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-
-void __init native_init_IRQ(void)
+static void __init smp_intr_init(void)
{
- int i;
-
- /* Execute any quirks before the call gates are initialised: */
- x86_quirk_pre_intr_init();
-
- /*
- * Cover the whole vector space, no vector can escape
- * us. (some of these will be overridden and become
- * 'special' SMP interrupts)
- */
- for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
- /* SYSCALL_VECTOR was reserved in trap_init. */
- if (i != SYSCALL_VECTOR)
- set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
- }
-
-
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
/*
* The reschedule interrupt is a CPU-to-CPU reschedule-helper
set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
#endif
+}
+
+static void __init apic_intr_init(void)
+{
+ smp_intr_init();
#ifdef CONFIG_X86_LOCAL_APIC
/* self generated IPI for local APIC timer */
alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+ /* generic IPI for platform specific use */
+ alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
+
/* IPI vectors for APIC spurious and error interrupts */
alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-#endif
+# ifdef CONFIG_PERF_COUNTERS
+ alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
+# endif
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
+# ifdef CONFIG_X86_MCE_P4THERMAL
/* thermal monitor LVT interrupt */
alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+# endif
#endif
+}
+
+/* Overridden in paravirt.c */
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
+
+void __init native_init_IRQ(void)
+{
+ int i;
+
+ /* Execute any quirks before the call gates are initialised: */
+ x86_quirk_pre_intr_init();
+
+ apic_intr_init();
+
+ /*
+ * Cover the whole vector space, no vector can escape
+ * us. (some of these will be overridden and become
+ * 'special' SMP interrupts)
+ */
+ for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
+ int vector = FIRST_EXTERNAL_VECTOR + i;
+ /* SYSCALL_VECTOR was reserved in trap_init. */
+ if (!test_bit(vector, used_vectors))
+ set_intr_gate(vector, interrupt[i]);
+ }
if (!acpi_ioapic)
setup_irq(2, &irq2);
static struct irqaction irq2 = {
.handler = no_action,
- .mask = CPU_MASK_NONE,
.name = "cascade",
};
DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
/* self generated IPI for local APIC timer */
alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+ /* generic IPI for platform specific use */
+ alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
+
/* IPI vectors for APIC spurious and error interrupts */
alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+
+ /* Performance monitoring interrupt: */
+#ifdef CONFIG_PERF_COUNTERS
+ alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
+#endif
}
void __init native_init_IRQ(void)
int i;
init_ISA_irqs();
+
+ apic_intr_init();
+
/*
* Cover the whole vector space, no vector can escape
* us. (some of these will be overridden and become
*/
for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
int vector = FIRST_EXTERNAL_VECTOR + i;
- if (vector != IA32_SYSCALL_VECTOR)
+ if (!test_bit(vector, used_vectors))
set_intr_gate(vector, interrupt[i]);
}
- apic_intr_init();
-
if (!acpi_ioapic)
setup_irq(2, &irq2);
}
* 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
* 2000-2002 x86-64 support by Andi Kleen
*/
-
+#include <linux/perf_counter.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/smp.h>
/*
* Set up a signal frame.
*/
+
+ /*
+ * Determine which stack to use..
+ */
+ static unsigned long align_sigframe(unsigned long sp)
+ {
+ #ifdef CONFIG_X86_32
+ /*
+ * Align the stack pointer according to the i386 ABI,
+ * i.e. so that on function entry ((sp + 4) & 15) == 0.
+ */
+ sp = ((sp + 4) & -16ul) - 4;
+ #else /* !CONFIG_X86_32 */
+ sp = round_down(sp, 16) - 8;
+ #endif
+ return sp;
+ }
+
+ static inline void __user *
+ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
+ void __user **fpstate)
+ {
+ /* Default to using normal stack */
+ unsigned long sp = regs->sp;
+ int onsigstack = on_sig_stack(sp);
+
+ #ifdef CONFIG_X86_64
+ /* redzone */
+ sp -= 128;
+ #endif /* CONFIG_X86_64 */
+
+ if (!onsigstack) {
+ /* This is the X/Open sanctioned signal stack switching. */
+ if (ka->sa.sa_flags & SA_ONSTACK) {
+ if (current->sas_ss_size)
+ sp = current->sas_ss_sp + current->sas_ss_size;
+ } else {
+ #ifdef CONFIG_X86_32
+ /* This is the legacy signal stack switching. */
+ if ((regs->ss & 0xffff) != __USER_DS &&
+ !(ka->sa.sa_flags & SA_RESTORER) &&
+ ka->sa.sa_restorer)
+ sp = (unsigned long) ka->sa.sa_restorer;
+ #endif /* CONFIG_X86_32 */
+ }
+ }
+
+ if (used_math()) {
+ sp -= sig_xstate_size;
+ #ifdef CONFIG_X86_64
+ sp = round_down(sp, 64);
+ #endif /* CONFIG_X86_64 */
+ *fpstate = (void __user *)sp;
+ }
+
+ sp = align_sigframe(sp - frame_size);
+
+ /*
+ * If we are on the alternate signal stack and would overflow it, don't.
+ * Return an always-bogus address instead so we will die with SIGSEGV.
+ */
+ if (onsigstack && !likely(on_sig_stack(sp)))
+ return (void __user *)-1L;
+
+ /* save i387 state */
+ if (used_math() && save_i387_xstate(*fpstate) < 0)
+ return (void __user *)-1L;
+
+ return (void __user *)sp;
+ }
+
#ifdef CONFIG_X86_32
static const struct {
u16 poplmovl;
0
};
- /*
- * Determine which stack to use..
- */
- static inline void __user *
- get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
- void **fpstate)
- {
- unsigned long sp;
-
- /* Default to using normal stack */
- sp = regs->sp;
-
- /*
- * If we are on the alternate signal stack and would overflow it, don't.
- * Return an always-bogus address instead so we will die with SIGSEGV.
- */
- if (on_sig_stack(sp) && !likely(on_sig_stack(sp - frame_size)))
- return (void __user *) -1L;
-
- /* This is the X/Open sanctioned signal stack switching. */
- if (ka->sa.sa_flags & SA_ONSTACK) {
- if (sas_ss_flags(sp) == 0)
- sp = current->sas_ss_sp + current->sas_ss_size;
- } else {
- /* This is the legacy signal stack switching. */
- if ((regs->ss & 0xffff) != __USER_DS &&
- !(ka->sa.sa_flags & SA_RESTORER) &&
- ka->sa.sa_restorer)
- sp = (unsigned long) ka->sa.sa_restorer;
- }
-
- if (used_math()) {
- sp = sp - sig_xstate_size;
- *fpstate = (struct _fpstate *) sp;
- if (save_i387_xstate(*fpstate) < 0)
- return (void __user *)-1L;
- }
-
- sp -= frame_size;
- /*
- * Align the stack pointer according to the i386 ABI,
- * i.e. so that on function entry ((sp + 4) & 15) == 0.
- */
- sp = ((sp + 4) & -16ul) - 4;
-
- return (void __user *) sp;
- }
-
static int
__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
struct pt_regs *regs)
return 0;
}
#else /* !CONFIG_X86_32 */
- /*
- * Determine which stack to use..
- */
- static void __user *
- get_stack(struct k_sigaction *ka, unsigned long sp, unsigned long size)
- {
- /* Default to using normal stack - redzone*/
- sp -= 128;
-
- /* This is the X/Open sanctioned signal stack switching. */
- if (ka->sa.sa_flags & SA_ONSTACK) {
- if (sas_ss_flags(sp) == 0)
- sp = current->sas_ss_sp + current->sas_ss_size;
- }
-
- return (void __user *)round_down(sp - size, 64);
- }
-
static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
sigset_t *set, struct pt_regs *regs)
{
int err = 0;
struct task_struct *me = current;
- if (used_math()) {
- fp = get_stack(ka, regs->sp, sig_xstate_size);
- frame = (void __user *)round_down(
- (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
-
- if (save_i387_xstate(fp) < 0)
- return -EFAULT;
- } else
- frame = get_stack(ka, regs->sp, sizeof(struct rt_sigframe)) - 8;
+ frame = get_sigframe(ka, regs, sizeof(struct rt_sigframe), &fp);
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
return -EFAULT;
tracehook_notify_resume(regs);
}
+ if (thread_info_flags & _TIF_PERF_COUNTERS) {
+ clear_thread_flag(TIF_PERF_COUNTERS);
+ perf_counter_notify(regs);
+ }
+
#ifdef CONFIG_X86_32
clear_thread_flag(TIF_IRET);
#endif /* CONFIG_X86_32 */
.long sys_dup3 /* 330 */
.long sys_pipe2
.long sys_inotify_init1
+ .long sys_perf_counter_open
+ .long sys_preadv
+ .long sys_pwritev
if (!user_mode_vm(regs))
die(str, regs, err);
}
-
- /*
- * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
- * invalid offset set (the LAZY one) and the faulting thread has
- * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS,
- * we set the offset field correctly and return 1.
- */
- static int lazy_iobitmap_copy(void)
- {
- struct thread_struct *thread;
- struct tss_struct *tss;
- int cpu;
-
- cpu = get_cpu();
- tss = &per_cpu(init_tss, cpu);
- thread = ¤t->thread;
-
- if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
- thread->io_bitmap_ptr) {
- memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
- thread->io_bitmap_max);
- /*
- * If the previously set map was extending to higher ports
- * than the current one, pad extra space with 0xff (no access).
- */
- if (thread->io_bitmap_max < tss->io_bitmap_max) {
- memset((char *) tss->io_bitmap +
- thread->io_bitmap_max, 0xff,
- tss->io_bitmap_max - thread->io_bitmap_max);
- }
- tss->io_bitmap_max = thread->io_bitmap_max;
- tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
- tss->io_bitmap_owner = thread;
- put_cpu();
-
- return 1;
- }
- put_cpu();
-
- return 0;
- }
#endif
static void __kprobes
conditional_sti(regs);
#ifdef CONFIG_X86_32
- if (lazy_iobitmap_copy()) {
- /* restart the faulting instruction */
- return;
- }
-
if (regs->flags & X86_VM_MASK)
goto gp_in_vm86;
#endif
#endif
set_intr_gate(19, &simd_coprocessor_error);
+ /* Reserve all the builtin and the syscall vector: */
+ for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
+ set_bit(i, used_vectors);
+
#ifdef CONFIG_IA32_EMULATION
set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
+ set_bit(IA32_SYSCALL_VECTOR, used_vectors);
#endif
#ifdef CONFIG_X86_32
}
set_system_trap_gate(SYSCALL_VECTOR, &system_call);
-#endif
-
- /* Reserve all the builtin and the syscall vector: */
- for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
- set_bit(i, used_vectors);
-
-#ifdef CONFIG_X86_64
- set_bit(IA32_SYSCALL_VECTOR, used_vectors);
-#else
set_bit(SYSCALL_VECTOR, used_vectors);
#endif
+
/*
* Should be a barrier for any external CPU state:
*/
#define _COMPONENT ACPI_PROCESSOR_COMPONENT
ACPI_MODULE_NAME("processor_idle");
#define ACPI_PROCESSOR_FILE_POWER "power"
- #define US_TO_PM_TIMER_TICKS(t) ((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
#define PM_TIMER_TICK_NS (1000000000ULL/PM_TIMER_FREQUENCY)
#define C2_OVERHEAD 1 /* 1us */
#define C3_OVERHEAD 1 /* 1us */
static unsigned int latency_factor __read_mostly = 2;
module_param(latency_factor, uint, 0644);
+ static s64 us_to_pm_timer_ticks(s64 t)
+ {
+ return div64_u64(t * PM_TIMER_FREQUENCY, 1000000);
+ }
/*
* IBM ThinkPad R40e crashes mysteriously when going into C2 or C3.
* For now disable this. Probably a bug somewhere else.
/* Actually this shouldn't be __cpuinitdata, would be better to fix the
callers to only run once -AK */
static struct dmi_system_id __cpuinitdata processor_power_dmi_table[] = {
- { set_max_cstate, "IBM ThinkPad R40e", {
- DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
- DMI_MATCH(DMI_BIOS_VERSION,"1SET70WW")}, (void *)1},
- { set_max_cstate, "IBM ThinkPad R40e", {
- DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
- DMI_MATCH(DMI_BIOS_VERSION,"1SET60WW")}, (void *)1},
- { set_max_cstate, "IBM ThinkPad R40e", {
- DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
- DMI_MATCH(DMI_BIOS_VERSION,"1SET43WW") }, (void*)1},
- { set_max_cstate, "IBM ThinkPad R40e", {
- DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
- DMI_MATCH(DMI_BIOS_VERSION,"1SET45WW") }, (void*)1},
- { set_max_cstate, "IBM ThinkPad R40e", {
- DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
- DMI_MATCH(DMI_BIOS_VERSION,"1SET47WW") }, (void*)1},
- { set_max_cstate, "IBM ThinkPad R40e", {
- DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
- DMI_MATCH(DMI_BIOS_VERSION,"1SET50WW") }, (void*)1},
- { set_max_cstate, "IBM ThinkPad R40e", {
- DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
- DMI_MATCH(DMI_BIOS_VERSION,"1SET52WW") }, (void*)1},
- { set_max_cstate, "IBM ThinkPad R40e", {
- DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
- DMI_MATCH(DMI_BIOS_VERSION,"1SET55WW") }, (void*)1},
- { set_max_cstate, "IBM ThinkPad R40e", {
- DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
- DMI_MATCH(DMI_BIOS_VERSION,"1SET56WW") }, (void*)1},
- { set_max_cstate, "IBM ThinkPad R40e", {
- DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
- DMI_MATCH(DMI_BIOS_VERSION,"1SET59WW") }, (void*)1},
- { set_max_cstate, "IBM ThinkPad R40e", {
- DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
- DMI_MATCH(DMI_BIOS_VERSION,"1SET60WW") }, (void*)1},
- { set_max_cstate, "IBM ThinkPad R40e", {
- DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
- DMI_MATCH(DMI_BIOS_VERSION,"1SET61WW") }, (void*)1},
- { set_max_cstate, "IBM ThinkPad R40e", {
- DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
- DMI_MATCH(DMI_BIOS_VERSION,"1SET62WW") }, (void*)1},
- { set_max_cstate, "IBM ThinkPad R40e", {
- DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
- DMI_MATCH(DMI_BIOS_VERSION,"1SET64WW") }, (void*)1},
- { set_max_cstate, "IBM ThinkPad R40e", {
- DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
- DMI_MATCH(DMI_BIOS_VERSION,"1SET65WW") }, (void*)1},
- { set_max_cstate, "IBM ThinkPad R40e", {
- DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
- DMI_MATCH(DMI_BIOS_VERSION,"1SET68WW") }, (void*)1},
- { set_max_cstate, "Medion 41700", {
- DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
- DMI_MATCH(DMI_BIOS_VERSION,"R01-A1J")}, (void *)1},
{ set_max_cstate, "Clevo 5600D", {
DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
DMI_MATCH(DMI_BIOS_VERSION,"SHE845M0.86C.0013.D.0302131307")},
{},
};
- static inline u32 ticks_elapsed(u32 t1, u32 t2)
- {
- if (t2 >= t1)
- return (t2 - t1);
- else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER))
- return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
- else
- return ((0xFFFFFFFF - t1) + t2);
- }
-
- static inline u32 ticks_elapsed_in_us(u32 t1, u32 t2)
- {
- if (t2 >= t1)
- return PM_TIMER_TICKS_TO_US(t2 - t1);
- else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER))
- return PM_TIMER_TICKS_TO_US(((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
- else
- return PM_TIMER_TICKS_TO_US((0xFFFFFFFF - t1) + t2);
- }
/*
* Callers should disable interrupts before the call and enable
* In either case, the proper way to
* handle BM_RLD is to set it and leave it set.
*/
- acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1);
+ acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, 1);
return;
}
{
u32 bm_status = 0;
- acpi_get_register_unlocked(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status);
+ acpi_read_bit_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status);
if (bm_status)
- acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
+ acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
/*
* PIIX4 Erratum #18: Note that BM_STS doesn't always reflect
* the true state of bus mastering activity; forcing us to
*/
static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
{
+ u64 perf_flags;
+
/* Don't trace irqs off for idle */
stop_critical_timings();
+ perf_flags = hw_perf_save_disable();
if (cx->entry_method == ACPI_CSTATE_FFH) {
/* Call into architectural FFH based C-state */
acpi_processor_ffh_cstate_enter(cx);
gets asserted in time to freeze execution properly. */
unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
}
+ hw_perf_restore(perf_flags);
start_critical_timings();
}
static int acpi_idle_enter_c1(struct cpuidle_device *dev,
struct cpuidle_state *state)
{
- u32 t1, t2;
+ ktime_t kt1, kt2;
+ s64 idle_time;
struct acpi_processor *pr;
struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
return 0;
}
- t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+ kt1 = ktime_get_real();
acpi_idle_do_entry(cx);
- t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+ kt2 = ktime_get_real();
+ idle_time = ktime_to_us(ktime_sub(kt2, kt1));
local_irq_enable();
cx->usage++;
- return ticks_elapsed_in_us(t1, t2);
+ return idle_time;
}
/**
{
struct acpi_processor *pr;
struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
- u32 t1, t2;
- int sleep_ticks = 0;
+ ktime_t kt1, kt2;
+ s64 idle_time;
+ s64 sleep_ticks = 0;
pr = __get_cpu_var(processors);
if (cx->type == ACPI_STATE_C3)
ACPI_FLUSH_CPU_CACHE();
- t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+ kt1 = ktime_get_real();
/* Tell the scheduler that we are going deep-idle: */
sched_clock_idle_sleep_event();
acpi_idle_do_entry(cx);
- t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+ kt2 = ktime_get_real();
+ idle_time = ktime_to_us(ktime_sub(kt2, kt1));
#if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
/* TSC could halt in idle, so notify users */
if (tsc_halts_in_c(cx->type))
mark_tsc_unstable("TSC halts in idle");;
#endif
- sleep_ticks = ticks_elapsed(t1, t2);
+ sleep_ticks = us_to_pm_timer_ticks(idle_time);
/* Tell the scheduler how much we idled: */
sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
acpi_state_timer_broadcast(pr, cx, 0);
cx->time += sleep_ticks;
- return ticks_elapsed_in_us(t1, t2);
+ return idle_time;
}
static int c3_cpu_count;
{
struct acpi_processor *pr;
struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
- u32 t1, t2;
- int sleep_ticks = 0;
+ ktime_t kt1, kt2;
+ s64 idle_time;
+ s64 sleep_ticks = 0;
+
pr = __get_cpu_var(processors);
c3_cpu_count++;
/* Disable bus master arbitration when all CPUs are in C3 */
if (c3_cpu_count == num_online_cpus())
- acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1);
+ acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 1);
spin_unlock(&c3_lock);
} else if (!pr->flags.bm_check) {
ACPI_FLUSH_CPU_CACHE();
}
- t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+ kt1 = ktime_get_real();
acpi_idle_do_entry(cx);
- t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+ kt2 = ktime_get_real();
+ idle_time = ktime_to_us(ktime_sub(kt2, kt1));
/* Re-enable bus master arbitration */
if (pr->flags.bm_check && pr->flags.bm_control) {
spin_lock(&c3_lock);
- acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
+ acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 0);
c3_cpu_count--;
spin_unlock(&c3_lock);
}
if (tsc_halts_in_c(ACPI_STATE_C3))
mark_tsc_unstable("TSC halts in idle");
#endif
- sleep_ticks = ticks_elapsed(t1, t2);
+ sleep_ticks = us_to_pm_timer_ticks(idle_time);
/* Tell the scheduler how much we idled: */
sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
acpi_state_timer_broadcast(pr, cx, 0);
cx->time += sleep_ticks;
- return ticks_elapsed_in_us(t1, t2);
+ return idle_time;
}
struct cpuidle_driver acpi_idle_driver = {
#include <linux/kbd_kern.h>
#include <linux/proc_fs.h>
#include <linux/quotaops.h>
+#include <linux/perf_counter.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/vt_kern.h>
#include <linux/workqueue.h>
#include <linux/kexec.h>
- #include <linux/irq.h>
+ #include <linux/interrupt.h>
#include <linux/hrtimer.h>
#include <linux/oom.h>
struct pt_regs *regs = get_irq_regs();
if (regs)
show_regs(regs);
+ perf_counter_print_debug();
}
static struct sysrq_key_op sysrq_showregs_op = {
.handler = sysrq_handle_showregs,
}
static struct sysrq_key_op sysrq_ftrace_dump_op = {
.handler = sysrq_ftrace_dump,
- .help_msg = "dumpZ-ftrace-buffer",
+ .help_msg = "dump-ftrace-buffer(Z)",
.action_msg = "Dump ftrace buffer",
.enable_mask = SYSRQ_ENABLE_DUMP,
};
.enable_mask = SYSRQ_ENABLE_SIGNAL,
};
+ #ifdef CONFIG_BLOCK
+ static void sysrq_handle_thaw(int key, struct tty_struct *tty)
+ {
+ emergency_thaw_all();
+ }
+ static struct sysrq_key_op sysrq_thaw_op = {
+ .handler = sysrq_handle_thaw,
+ .help_msg = "thaw-filesystems(J)",
+ .action_msg = "Emergency Thaw of all frozen filesystems",
+ .enable_mask = SYSRQ_ENABLE_SIGNAL,
+ };
+ #endif
+
static void sysrq_handle_kill(int key, struct tty_struct *tty)
{
send_sig_all(SIGKILL);
&sysrq_moom_op, /* f */
/* g: May be registered by ppc for kgdb */
NULL, /* g */
- NULL, /* h */
+ NULL, /* h - reserved for help */
&sysrq_kill_op, /* i */
+ #ifdef CONFIG_BLOCK
+ &sysrq_thaw_op, /* j */
+ #else
NULL, /* j */
+ #endif
&sysrq_SAK_op, /* k */
#ifdef CONFIG_SMP
&sysrq_showallcpus_op, /* l */
#include <linux/string.h>
#include <linux/init.h>
#include <linux/pagemap.h>
+#include <linux/perf_counter.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/key.h>
#include <linux/proc_fs.h>
#include <linux/mount.h>
#include <linux/security.h>
+ #include <linux/ima.h>
#include <linux/syscalls.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/tracehook.h>
#include <linux/kmod.h>
#include <linux/fsnotify.h>
+ #include <linux/fs_struct.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
MAY_READ | MAY_EXEC | MAY_OPEN);
if (error)
goto exit;
+ error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN);
+ if (error)
+ goto exit;
file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE);
error = PTR_ERR(file);
err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN);
if (err)
goto out_path_put;
+ err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN);
+ if (err)
+ goto out_path_put;
file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE);
if (IS_ERR(file))
current->personality &= ~bprm->per_clear;
+ /*
+ * Flush performance counters when crossing a
+ * security domain:
+ */
+ if (!get_dumpable(current->mm))
+ perf_counter_exit_task(current);
+
/* An exec changes our domain. We are no longer part of the thread
group */
* - the caller must hold current->cred_exec_mutex to protect against
* PTRACE_ATTACH
*/
- void check_unsafe_exec(struct linux_binprm *bprm, struct files_struct *files)
+ int check_unsafe_exec(struct linux_binprm *bprm)
{
struct task_struct *p = current, *t;
unsigned long flags;
- unsigned n_fs, n_files, n_sighand;
+ unsigned n_fs;
+ int res = 0;
bprm->unsafe = tracehook_unsafe_exec(p);
n_fs = 1;
- n_files = 1;
- n_sighand = 1;
+ write_lock(&p->fs->lock);
lock_task_sighand(p, &flags);
for (t = next_thread(p); t != p; t = next_thread(t)) {
if (t->fs == p->fs)
n_fs++;
- if (t->files == files)
- n_files++;
- n_sighand++;
}
- if (atomic_read(&p->fs->count) > n_fs ||
- atomic_read(&p->files->count) > n_files ||
- atomic_read(&p->sighand->count) > n_sighand)
+ if (p->fs->users > n_fs) {
bprm->unsafe |= LSM_UNSAFE_SHARE;
+ } else {
+ if (p->fs->in_exec)
+ res = -EAGAIN;
+ p->fs->in_exec = 1;
+ }
unlock_task_sighand(p, &flags);
+ write_unlock(&p->fs->lock);
+
+ return res;
}
/*
retval = security_bprm_check(bprm);
if (retval)
return retval;
+ retval = ima_bprm_check(bprm);
+ if (retval)
+ return retval;
/* kernel module loader fixup */
/* so we don't try to load run modprobe in kernel space. */
retval = mutex_lock_interruptible(¤t->cred_exec_mutex);
if (retval < 0)
goto out_free;
+ current->in_execve = 1;
retval = -ENOMEM;
bprm->cred = prepare_exec_creds();
if (!bprm->cred)
goto out_unlock;
- check_unsafe_exec(bprm, displaced);
+
+ retval = check_unsafe_exec(bprm);
+ if (retval)
+ goto out_unlock;
file = open_exec(filename);
retval = PTR_ERR(file);
if (IS_ERR(file))
- goto out_unlock;
+ goto out_unmark;
sched_exec();
goto out;
/* execve succeeded */
+ write_lock(¤t->fs->lock);
+ current->fs->in_exec = 0;
+ write_unlock(¤t->fs->lock);
+ current->in_execve = 0;
mutex_unlock(¤t->cred_exec_mutex);
acct_update_integrals(current);
free_bprm(bprm);
fput(bprm->file);
}
+ out_unmark:
+ write_lock(¤t->fs->lock);
+ current->fs->in_exec = 0;
+ write_unlock(¤t->fs->lock);
+
out_unlock:
+ current->in_execve = 0;
mutex_unlock(¤t->cred_exec_mutex);
out_free:
extern struct cred init_cred;
+#ifdef CONFIG_PERF_COUNTERS
+# define INIT_PERF_COUNTERS(tsk) \
+ .perf_counter_ctx.counter_list = \
+ LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), \
+ .perf_counter_ctx.lock = \
+ __SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock),
+#else
+# define INIT_PERF_COUNTERS(tsk)
+#endif
+
/*
* INIT_TASK is used to set up the first task table, touch at
* your own risk!. Base=0, limit=0x1fffff (=2MB)
.nr_cpus_allowed = NR_CPUS, \
}, \
.tasks = LIST_HEAD_INIT(tsk.tasks), \
+ .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \
.ptraced = LIST_HEAD_INIT(tsk.ptraced), \
.ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \
.real_parent = &tsk, \
INIT_IDS \
INIT_TRACE_IRQFLAGS \
INIT_LOCKDEP \
+ INIT_PERF_COUNTERS(tsk) \
}
struct kernel_stat {
struct cpu_usage_stat cpustat;
- #ifndef CONFIG_SPARSE_IRQ
+ #ifndef CONFIG_GENERIC_HARDIRQS
unsigned int irqs[NR_IRQS];
#endif
};
extern unsigned long long nr_context_switches(void);
- #ifndef CONFIG_SPARSE_IRQ
+ #ifndef CONFIG_GENERIC_HARDIRQS
#define kstat_irqs_this_cpu(irq) \
(kstat_this_cpu.irqs[irq])
{
kstat_this_cpu.irqs[irq]++;
}
- #endif
-
- #ifndef CONFIG_SPARSE_IRQ
static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
{
return kstat_cpu(cpu).irqs[irq];
}
#else
+ #include <linux/irq.h>
extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu);
+ #define kstat_irqs_this_cpu(DESC) \
+ ((DESC)->kstat_irqs[smp_processor_id()])
+ #define kstat_incr_irqs_this_cpu(irqno, DESC) \
+ ((DESC)->kstat_irqs[smp_processor_id()]++)
+
#endif
/*
return sum;
}
+
+/*
+ * Lock/unlock the current runqueue - to extract task statistics:
+ */
+extern void curr_rq_lock_irq_save(unsigned long *flags);
+extern void curr_rq_unlock_irq_restore(unsigned long *flags);
+extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update);
extern unsigned long long task_delta_exec(struct task_struct *);
+
extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);
extern void account_steal_time(cputime_t);
#include <linux/smp.h>
#include <linux/sem.h>
#include <linux/signal.h>
- #include <linux/fs_struct.h>
+ #include <linux/path.h>
#include <linux/compiler.h>
#include <linux/completion.h>
+#include <linux/perf_counter.h>
#include <linux/pid.h>
#include <linux/percpu.h>
#include <linux/topology.h>
struct robust_list_head;
struct bio;
struct bts_tracer;
+ struct fs_struct;
/*
* List of flags we want to share for kernel threads,
extern unsigned long nr_uninterruptible(void);
extern unsigned long nr_active(void);
extern unsigned long nr_iowait(void);
+extern u64 cpu_nr_switches(int cpu);
+extern u64 cpu_nr_migrations(int cpu);
+ extern unsigned long get_parent_ip(unsigned long addr);
+
struct seq_file;
struct cfs_rq;
struct task_group;
extern signed long schedule_timeout_interruptible(signed long timeout);
extern signed long schedule_timeout_killable(signed long timeout);
extern signed long schedule_timeout_uninterruptible(signed long timeout);
+ asmlinkage void __schedule(void);
asmlinkage void schedule(void);
+ extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner);
struct nsproxy;
struct user_namespace;
(mm)->hiwater_vm = (mm)->total_vm; \
} while (0)
- #define get_mm_hiwater_rss(mm) max((mm)->hiwater_rss, get_mm_rss(mm))
- #define get_mm_hiwater_vm(mm) max((mm)->hiwater_vm, (mm)->total_vm)
+ static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
+ {
+ return max(mm->hiwater_rss, get_mm_rss(mm));
+ }
+
+ static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm)
+ {
+ return max(mm->hiwater_vm, mm->total_vm);
+ }
extern void set_dumpable(struct mm_struct *mm, int value);
extern int get_dumpable(struct mm_struct *mm);
struct list_head cpu_timers[3];
- /* job control IDs */
-
- /*
- * pgrp and session fields are deprecated.
- * use the task_session_Xnr and task_pgrp_Xnr routines below
- */
-
- union {
- pid_t pgrp __deprecated;
- pid_t __pgrp;
- };
-
struct pid *tty_old_pgrp;
- union {
- pid_t session __deprecated;
- pid_t __session;
- };
-
/* boolean value for session group leader */
int leader;
struct rq *busiest, struct sched_domain *sd,
enum cpu_idle_type idle);
void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
+ int (*needs_post_schedule) (struct rq *this_rq);
void (*post_schedule) (struct rq *this_rq);
void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
u64 last_wakeup;
u64 avg_overlap;
- u64 nr_migrations;
+ u64 nr_migrations;
+
+ u64 start_runtime;
+ u64 avg_wakeup;
+
#ifdef CONFIG_SCHEDSTATS
u64 wait_start;
u64 wait_max;
#endif
struct list_head tasks;
+ struct plist_node pushable_tasks;
struct mm_struct *mm, *active_mm;
/* ??? */
unsigned int personality;
unsigned did_exec:1;
+ unsigned in_execve:1; /* Tell the LSMs that the process is doing an
+ * execve */
pid_t pid;
pid_t tgid;
int lockdep_depth;
unsigned int lockdep_recursion;
struct held_lock held_locks[MAX_LOCK_DEPTH];
+ gfp_t lockdep_reclaim_gfp;
#endif
/* journalling filesystem info */
struct list_head pi_state_list;
struct futex_pi_state *pi_state_cache;
#endif
+ struct perf_counter_context perf_counter_ctx;
#ifdef CONFIG_NUMA
struct mempolicy *mempolicy;
short il_next;
int curr_ret_stack;
/* Stack of return addresses for return function tracing */
struct ftrace_ret_stack *ret_stack;
+ /* time stamp for last schedule */
+ unsigned long long ftrace_timestamp;
/*
* Number of functions that haven't been traced
* because of depth overrun.
#endif
};
+ /* Future-safe accessor for struct task_struct's cpus_allowed. */
+ #define tsk_cpumask(tsk) (&(tsk)->cpus_allowed)
+
/*
* Priority of a process goes from 0..MAX_PRIO-1, valid RT
* priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
return rt_prio(p->prio);
}
- static inline void set_task_session(struct task_struct *tsk, pid_t session)
- {
- tsk->signal->__session = session;
- }
-
- static inline void set_task_pgrp(struct task_struct *tsk, pid_t pgrp)
- {
- tsk->signal->__pgrp = pgrp;
- }
-
static inline struct pid *task_pid(struct task_struct *task)
{
return task->pids[PIDTYPE_PID].pid;
return task->group_leader->pids[PIDTYPE_PID].pid;
}
+ /*
+ * Without tasklist or rcu lock it is not safe to dereference
+ * the result of task_pgrp/task_session even if task == current,
+ * we can race with another thread doing sys_setsid/sys_setpgid.
+ */
static inline struct pid *task_pgrp(struct task_struct *task)
{
return task->group_leader->pids[PIDTYPE_PGID].pid;
*
* see also pid_nr() etc in include/linux/pid.h
*/
+ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
+ struct pid_namespace *ns);
static inline pid_t task_pid_nr(struct task_struct *tsk)
{
return tsk->pid;
}
- pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
+ static inline pid_t task_pid_nr_ns(struct task_struct *tsk,
+ struct pid_namespace *ns)
+ {
+ return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
+ }
static inline pid_t task_pid_vnr(struct task_struct *tsk)
{
- return pid_vnr(task_pid(tsk));
+ return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
}
}
- static inline pid_t task_pgrp_nr(struct task_struct *tsk)
+ static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk,
+ struct pid_namespace *ns)
{
- return tsk->signal->__pgrp;
+ return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
}
- pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
-
static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
{
- return pid_vnr(task_pgrp(tsk));
+ return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
}
- static inline pid_t task_session_nr(struct task_struct *tsk)
+ static inline pid_t task_session_nr_ns(struct task_struct *tsk,
+ struct pid_namespace *ns)
{
- return tsk->signal->__session;
+ return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
}
- pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
-
static inline pid_t task_session_vnr(struct task_struct *tsk)
{
- return pid_vnr(task_session(tsk));
+ return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
}
+ /* obsolete, do not use */
+ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
+ {
+ return task_pgrp_nr_ns(tsk, &init_pid_ns);
+ }
/**
* pid_alive - check that a task structure is not stale
return set_cpus_allowed_ptr(p, &new_mask);
}
+ /*
+ * Architectures can set this to 1 if they have specified
+ * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
+ * but then during bootup it turns out that sched_clock()
+ * is reliable after all:
+ */
+ #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+ extern int sched_clock_stable;
+ #endif
+
extern unsigned long long sched_clock(void);
extern void sched_clock_init(void);
/* Allocate a new mm structure and copy contents from tsk->mm */
extern struct mm_struct *dup_mm(struct task_struct *tsk);
- extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
+ extern int copy_thread(unsigned long, unsigned long, unsigned long,
+ struct task_struct *, struct pt_regs *);
extern void flush_thread(void);
extern void exit_thread(void);
#define delay_group_leader(p) \
(thread_group_leader(p) && !thread_group_empty(p))
+ static inline int task_detached(struct task_struct *p)
+ {
+ return p->exit_signal == -1;
+ }
+
/*
* Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
* subscriptions and synchronises with wait4(). Also used in procfs. Also
#define TASK_SIZE_OF(tsk) TASK_SIZE
#endif
+/*
+ * Call the function if the target task is executing on a CPU right now:
+ */
+extern void task_oncpu_function_call(struct task_struct *p,
+ void (*func) (void *info), void *info);
+
+
#ifdef CONFIG_MM_OWNER
extern void mm_update_next_owner(struct mm_struct *mm);
extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
struct robust_list_head;
struct getcpu_cache;
struct old_linux_dirent;
+struct perf_counter_hw_event;
#include <linux/types.h>
#include <linux/aio_abi.h>
#include <asm/signal.h>
#include <linux/quota.h>
#include <linux/key.h>
+ #include <linux/ftrace.h>
#define __SC_DECL1(t1, a1) t1 a1
#define __SC_DECL2(t2, a2, ...) t2 a2, __SC_DECL1(__VA_ARGS__)
#define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__)
#define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__)
+ #ifdef CONFIG_FTRACE_SYSCALLS
+ #define __SC_STR_ADECL1(t, a) #a
+ #define __SC_STR_ADECL2(t, a, ...) #a, __SC_STR_ADECL1(__VA_ARGS__)
+ #define __SC_STR_ADECL3(t, a, ...) #a, __SC_STR_ADECL2(__VA_ARGS__)
+ #define __SC_STR_ADECL4(t, a, ...) #a, __SC_STR_ADECL3(__VA_ARGS__)
+ #define __SC_STR_ADECL5(t, a, ...) #a, __SC_STR_ADECL4(__VA_ARGS__)
+ #define __SC_STR_ADECL6(t, a, ...) #a, __SC_STR_ADECL5(__VA_ARGS__)
+
+ #define __SC_STR_TDECL1(t, a) #t
+ #define __SC_STR_TDECL2(t, a, ...) #t, __SC_STR_TDECL1(__VA_ARGS__)
+ #define __SC_STR_TDECL3(t, a, ...) #t, __SC_STR_TDECL2(__VA_ARGS__)
+ #define __SC_STR_TDECL4(t, a, ...) #t, __SC_STR_TDECL3(__VA_ARGS__)
+ #define __SC_STR_TDECL5(t, a, ...) #t, __SC_STR_TDECL4(__VA_ARGS__)
+ #define __SC_STR_TDECL6(t, a, ...) #t, __SC_STR_TDECL5(__VA_ARGS__)
+
+ #define SYSCALL_METADATA(sname, nb) \
+ static const struct syscall_metadata __used \
+ __attribute__((__aligned__(4))) \
+ __attribute__((section("__syscalls_metadata"))) \
+ __syscall_meta_##sname = { \
+ .name = "sys"#sname, \
+ .nb_args = nb, \
+ .types = types_##sname, \
+ .args = args_##sname, \
+ }
+
+ #define SYSCALL_DEFINE0(sname) \
+ static const struct syscall_metadata __used \
+ __attribute__((__aligned__(4))) \
+ __attribute__((section("__syscalls_metadata"))) \
+ __syscall_meta_##sname = { \
+ .name = "sys_"#sname, \
+ .nb_args = 0, \
+ }; \
+ asmlinkage long sys_##sname(void)
+
+ #else
#define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void)
+ #endif
+
#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#endif
#endif
+ #ifdef CONFIG_FTRACE_SYSCALLS
+ #define SYSCALL_DEFINEx(x, sname, ...) \
+ static const char *types_##sname[] = { \
+ __SC_STR_TDECL##x(__VA_ARGS__) \
+ }; \
+ static const char *args_##sname[] = { \
+ __SC_STR_ADECL##x(__VA_ARGS__) \
+ }; \
+ SYSCALL_METADATA(sname, x); \
+ __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
+ #else
+ #define SYSCALL_DEFINEx(x, sname, ...) \
+ __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
+ #endif
+
#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
#define SYSCALL_DEFINE(name) static inline long SYSC_##name
- #define SYSCALL_DEFINEx(x, name, ...) \
+
+ #define __SYSCALL_DEFINEx(x, name, ...) \
asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)); \
static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__)); \
asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__)) \
#else /* CONFIG_HAVE_SYSCALL_WRAPPERS */
#define SYSCALL_DEFINE(name) asmlinkage long sys_##name
- #define SYSCALL_DEFINEx(x, name, ...) \
+ #define __SYSCALL_DEFINEx(x, name, ...) \
asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__))
#endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */
size_t count, loff_t pos);
asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf,
size_t count, loff_t pos);
+ asmlinkage long sys_preadv(unsigned long fd, const struct iovec __user *vec,
+ unsigned long vlen, unsigned long pos_l, unsigned long pos_h);
+ asmlinkage long sys_pwritev(unsigned long fd, const struct iovec __user *vec,
+ unsigned long vlen, unsigned long pos_l, unsigned long pos_h);
asmlinkage long sys_getcwd(char __user *buf, unsigned long size);
asmlinkage long sys_mkdir(const char __user *pathname, int mode);
asmlinkage long sys_chdir(const char __user *filename);
int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
+
+asmlinkage long sys_perf_counter_open(
+ const struct perf_counter_hw_event __user *hw_event_uptr,
+ pid_t pid, int cpu, int group_fd, unsigned long flags);
#endif
which is done within the script "scripts/setlocalversion".)
+ config HAVE_KERNEL_GZIP
+ bool
+
+ config HAVE_KERNEL_BZIP2
+ bool
+
+ config HAVE_KERNEL_LZMA
+ bool
+
+ choice
+ prompt "Kernel compression mode"
+ default KERNEL_GZIP
+ depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA
+ help
+ The linux kernel is a kind of self-extracting executable.
+ Several compression algorithms are available, which differ
+ in efficiency, compression and decompression speed.
+ Compression speed is only relevant when building a kernel.
+ Decompression speed is relevant at each boot.
+
+ If you have any problems with bzip2 or lzma compressed
+ kernels, mail me (Alain Knaff) <alain@knaff.lu>. (An older
+ version of this functionality (bzip2 only), for 2.4, was
+ supplied by Christian Ludwig)
+
+ High compression options are mostly useful for users, who
+ are low on disk space (embedded systems), but for whom ram
+ size matters less.
+
+ If in doubt, select 'gzip'
+
+ config KERNEL_GZIP
+ bool "Gzip"
+ depends on HAVE_KERNEL_GZIP
+ help
+ The old and tried gzip compression. Its compression ratio is
+ the poorest among the 3 choices; however its speed (both
+ compression and decompression) is the fastest.
+
+ config KERNEL_BZIP2
+ bool "Bzip2"
+ depends on HAVE_KERNEL_BZIP2
+ help
+ Its compression ratio and speed is intermediate.
+ Decompression speed is slowest among the three. The kernel
+ size is about 10% smaller with bzip2, in comparison to gzip.
+ Bzip2 uses a large amount of memory. For modern kernels you
+ will need at least 8MB RAM or more for booting.
+
+ config KERNEL_LZMA
+ bool "LZMA"
+ depends on HAVE_KERNEL_LZMA
+ help
+ The most recent compression algorithm.
+ Its ratio is best, decompression speed is between the other
+ two. Compression is slowest. The kernel size is about 33%
+ smaller with LZMA in comparison to gzip.
+
+ endchoice
+
config SWAP
bool "Support for paging of anonymous memory (swap)"
depends on MMU && BLOCK
config CPUSETS
bool "Cpuset support"
- depends on SMP && CGROUPS
+ depends on CGROUPS
help
This option will let you create and manage CPUSETs which
allow dynamically partitioning a system into sets of CPUs and
select MM_OWNER
help
Provides a memory resource controller that manages both anonymous
- memory and page cache. (See Documentation/controllers/memory.txt)
+ memory and page cache. (See Documentation/cgroups/memory.txt)
Note that setting this option increases fixed memory overhead
associated with each page of memory in the system. By this,
is disabled by boot option, this will be automatically disabled and
there will be no overhead from this. Even when you set this config=y,
if boot option "noswapaccount" is set, swap will not be accounted.
+ Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
+ size is 4096bytes, 512k per 1Gbytes of swap.
endif # CGROUPS
depends on NAMESPACES && EXPERIMENTAL
help
Support process id namespaces. This allows having multiple
- process with the same pid as long as they are in different
+ processes with the same pid as long as they are in different
pid namespaces. This is a building block of containers.
Unless you want to work with an experimental feature
config SYSCTL
bool
+ config ANON_INODES
+ bool
+
menuconfig EMBEDDED
bool "Configure standard kernel features (for small systems)"
help
This option allows to disable the internal PC-Speaker
support, saving some memory.
- config COMPAT_BRK
- bool "Disable heap randomization"
- default y
- help
- Randomizing heap placement makes heap exploits harder, but it
- also breaks ancient binaries (including anything libc5 based).
- This option changes the bootup default to heap randomization
- disabled, and can be overriden runtime by setting
- /proc/sys/kernel/randomize_va_space to 2.
-
- On non-ancient distros (post-2000 ones) N is usually a safe choice.
-
config BASE_FULL
default y
bool "Enable full-sized data structures for core" if EMBEDDED
support for "fast userspace mutexes". The resulting kernel may not
run glibc-based applications correctly.
- config ANON_INODES
- bool
-
config EPOLL
bool "Enable eventpoll support" if EMBEDDED
default y
by some high performance threaded applications. Disabling
this option saves about 7k.
+config HAVE_PERF_COUNTERS
+ bool
+
+menu "Performance Counters"
+
+config PERF_COUNTERS
+ bool "Kernel Performance Counters"
+ depends on HAVE_PERF_COUNTERS
+ default y
+ select ANON_INODES
+ help
+ Enable kernel support for performance counter hardware.
+
+ Performance counters are special hardware registers available
+ on most modern CPUs. These registers count the number of certain
+ types of hw events: such as instructions executed, cachemisses
+ suffered, or branches mis-predicted - without slowing down the
+ kernel or applications. These registers can also trigger interrupts
+ when a threshold number of events have passed - and can thus be
+ used to profile the code that runs on that CPU.
+
+ The Linux Performance Counter subsystem provides an abstraction of
+ these hardware capabilities, available via a system call. It
+ provides per task and per CPU counters, and it provides event
+ capabilities on top of those.
+
+ Say Y if unsure.
+
+endmenu
+
config VM_EVENT_COUNTERS
default y
bool "Enable VM event counters for /proc/vmstat" if EMBEDDED
SLUB sysfs support. /sys/slab will not exist and there will be
no support for cache validation etc.
+ config COMPAT_BRK
+ bool "Disable heap randomization"
+ default y
+ help
+ Randomizing heap placement makes heap exploits harder, but it
+ also breaks ancient binaries (including anything libc5 based).
+ This option changes the bootup default to heap randomization
+ disabled, and can be overridden at runtime by setting
+ /proc/sys/kernel/randomize_va_space to 2.
+
+ On non-ancient distros (post-2000 ones) N is usually a safe choice.
+
choice
prompt "Choose SLAB allocator"
default SLUB
config MARKERS
bool "Activate markers"
- depends on TRACEPOINTS
+ select TRACEPOINTS
help
Place an empty function call at each marker site. Can be
dynamically changed for a probe function.
source "arch/Kconfig"
+ config SLOW_WORK
+ default n
+ bool "Enable slow work thread pool"
+ help
+ The slow work thread pool provides a number of dynamically allocated
+ threads that can be used by the kernel to perform operations that
+ take a relatively long time.
+
+ An example of this would be CacheFiles doing a path lookup followed
+ by a series of mkdirs and a create call, all of which have to touch
+ disk.
+
endmenu # General setup
config HAVE_GENERIC_DMA_COHERENT
config RT_MUTEXES
boolean
- select PLIST
config BASE_SMALL
int
cpu_possible_map, some of them chose to initialize cpu_possible_map
with all 1s, and others with all 0s. When they were centralised,
it was better to provide this option than to break all the archs
- and have several arch maintainers persuing me down dark alleys.
+ and have several arch maintainers pursuing me down dark alleys.
config STOP_MACHINE
bool
obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o
+ obj-$(CONFIG_SLOW_WORK) += slow-work.o
+obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
#include <linux/blkdev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/tracehook.h>
+ #include <linux/fs_struct.h>
#include <linux/init_task.h>
#include <trace/sched.h>
static void exit_mm(struct task_struct * tsk);
- static inline int task_detached(struct task_struct *p)
- {
- return p->exit_signal == -1;
- }
-
static void __unhash_process(struct task_struct *p)
{
nr_threads--;
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
+#ifdef CONFIG_PERF_COUNTERS
+ WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list));
+#endif
trace_sched_process_free(tsk);
put_task_struct(tsk);
}
void __set_special_pids(struct pid *pid)
{
struct task_struct *curr = current->group_leader;
- pid_t nr = pid_nr(pid);
- if (task_session(curr) != pid) {
+ if (task_session(curr) != pid)
change_pid(curr, PIDTYPE_SID, pid);
- set_task_session(curr, nr);
- }
- if (task_pgrp(curr) != pid) {
+
+ if (task_pgrp(curr) != pid)
change_pid(curr, PIDTYPE_PGID, pid);
- set_task_pgrp(curr, nr);
- }
}
static void set_special_pids(struct pid *pid)
void daemonize(const char *name, ...)
{
va_list args;
- struct fs_struct *fs;
sigset_t blocked;
va_start(args, name);
/* Become as one with the init task */
- exit_fs(current); /* current->fs->count--; */
- fs = init_task.fs;
- current->fs = fs;
- atomic_inc(&fs->count);
-
+ daemonize_fs_struct();
exit_files(current);
current->files = init_task.files;
atomic_inc(¤t->files->count);
}
}
- void put_fs_struct(struct fs_struct *fs)
- {
- /* No need to hold fs->lock if we are killing it */
- if (atomic_dec_and_test(&fs->count)) {
- path_put(&fs->root);
- path_put(&fs->pwd);
- kmem_cache_free(fs_cachep, fs);
- }
- }
-
- void exit_fs(struct task_struct *tsk)
- {
- struct fs_struct * fs = tsk->fs;
-
- if (fs) {
- task_lock(tsk);
- tsk->fs = NULL;
- task_unlock(tsk);
- put_fs_struct(fs);
- }
- }
-
- EXPORT_SYMBOL_GPL(exit_fs);
-
#ifdef CONFIG_MM_OWNER
/*
* Task p is exiting and it owned mm, lets find a new owner for it
}
/*
- * Return nonzero if @parent's children should reap themselves.
- *
- * Called with write_lock_irq(&tasklist_lock) held.
- */
- static int ignoring_children(struct task_struct *parent)
- {
- int ret;
- struct sighand_struct *psig = parent->sighand;
- unsigned long flags;
- spin_lock_irqsave(&psig->siglock, flags);
- ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
- (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT));
- spin_unlock_irqrestore(&psig->siglock, flags);
- return ret;
- }
-
- /*
- * Detach all tasks we were using ptrace on.
- * Any that need to be release_task'd are put on the @dead list.
- *
- * Called with write_lock(&tasklist_lock) held.
- */
- static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
- {
- struct task_struct *p, *n;
- int ign = -1;
-
- list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) {
- __ptrace_unlink(p);
-
- if (p->exit_state != EXIT_ZOMBIE)
- continue;
-
- /*
- * If it's a zombie, our attachedness prevented normal
- * parent notification or self-reaping. Do notification
- * now if it would have happened earlier. If it should
- * reap itself, add it to the @dead list. We can't call
- * release_task() here because we already hold tasklist_lock.
- *
- * If it's our own child, there is no notification to do.
- * But if our normal children self-reap, then this child
- * was prevented by ptrace and we must reap it now.
- */
- if (!task_detached(p) && thread_group_empty(p)) {
- if (!same_thread_group(p->real_parent, parent))
- do_notify_parent(p, p->exit_signal);
- else {
- if (ign < 0)
- ign = ignoring_children(parent);
- if (ign)
- p->exit_signal = -1;
- }
- }
-
- if (task_detached(p)) {
- /*
- * Mark it as in the process of being reaped.
- */
- p->exit_state = EXIT_DEAD;
- list_add(&p->ptrace_entry, dead);
- }
- }
- }
-
- /*
- * Finish up exit-time ptrace cleanup.
- *
- * Called without locks.
- */
- static void ptrace_exit_finish(struct task_struct *parent,
- struct list_head *dead)
- {
- struct task_struct *p, *n;
-
- BUG_ON(!list_empty(&parent->ptraced));
-
- list_for_each_entry_safe(p, n, dead, ptrace_entry) {
- list_del_init(&p->ptrace_entry);
- release_task(p);
- }
- }
-
- static void reparent_thread(struct task_struct *p, struct task_struct *father)
- {
- if (p->pdeath_signal)
- /* We already hold the tasklist_lock here. */
- group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
-
- list_move_tail(&p->sibling, &p->real_parent->children);
-
- /* If this is a threaded reparent there is no need to
- * notify anyone anything has happened.
- */
- if (same_thread_group(p->real_parent, father))
- return;
-
- /* We don't want people slaying init. */
- if (!task_detached(p))
- p->exit_signal = SIGCHLD;
-
- /* If we'd notified the old parent about this child's death,
- * also notify the new parent.
- */
- if (!ptrace_reparented(p) &&
- p->exit_state == EXIT_ZOMBIE &&
- !task_detached(p) && thread_group_empty(p))
- do_notify_parent(p, p->exit_signal);
-
- kill_orphaned_pgrp(p, father);
- }
-
- /*
* When we die, we re-parent all our children.
* Try to give them to another thread in our thread
* group, and if no such member exists, give it to
return pid_ns->child_reaper;
}
+ /*
+ * Any that need to be release_task'd are put on the @dead list.
+ */
+ static void reparent_thread(struct task_struct *father, struct task_struct *p,
+ struct list_head *dead)
+ {
+ if (p->pdeath_signal)
+ group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
+
+ list_move_tail(&p->sibling, &p->real_parent->children);
+
+ if (task_detached(p))
+ return;
+ /*
+ * If this is a threaded reparent there is no need to
+ * notify anyone anything has happened.
+ */
+ if (same_thread_group(p->real_parent, father))
+ return;
+
+ /* We don't want people slaying init. */
+ p->exit_signal = SIGCHLD;
+
+ /* If it has exited notify the new parent about this child's death. */
+ if (!p->ptrace &&
+ p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
+ do_notify_parent(p, p->exit_signal);
+ if (task_detached(p)) {
+ p->exit_state = EXIT_DEAD;
+ list_move_tail(&p->sibling, dead);
+ }
+ }
+
+ kill_orphaned_pgrp(p, father);
+ }
+
static void forget_original_parent(struct task_struct *father)
{
struct task_struct *p, *n, *reaper;
- LIST_HEAD(ptrace_dead);
+ LIST_HEAD(dead_children);
+
+ exit_ptrace(father);
write_lock_irq(&tasklist_lock);
reaper = find_new_reaper(father);
- /*
- * First clean up ptrace if we were using it.
- */
- ptrace_exit(father, &ptrace_dead);
list_for_each_entry_safe(p, n, &father->children, sibling) {
p->real_parent = reaper;
BUG_ON(p->ptrace);
p->parent = p->real_parent;
}
- reparent_thread(p, father);
+ reparent_thread(father, p, &dead_children);
}
-
write_unlock_irq(&tasklist_lock);
+
BUG_ON(!list_empty(&father->children));
- ptrace_exit_finish(father, &ptrace_dead);
+ list_for_each_entry_safe(p, n, &dead_children, sibling) {
+ list_del_init(&p->sibling);
+ release_task(p);
+ }
}
/*
tsk->mempolicy = NULL;
#endif
#ifdef CONFIG_FUTEX
- /*
- * This must happen late, after the PID is not
- * hashed anymore:
- */
if (unlikely(!list_empty(&tsk->pi_state_list)))
exit_pi_state_list(tsk);
if (unlikely(current->pi_state_cache))
*/
read_unlock(&tasklist_lock);
+ /*
+ * Flush inherited counters to the parent - before the parent
+ * gets woken up by child-exit notifications.
+ */
+ perf_counter_exit_task(p);
+
retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
status = (p->signal->flags & SIGNAL_GROUP_EXIT)
? p->signal->group_exit_code : p->exit_code;
return retval;
}
+ static int *task_stopped_code(struct task_struct *p, bool ptrace)
+ {
+ if (ptrace) {
+ if (task_is_stopped_or_traced(p))
+ return &p->exit_code;
+ } else {
+ if (p->signal->flags & SIGNAL_STOP_STOPPED)
+ return &p->signal->group_exit_code;
+ }
+ return NULL;
+ }
+
/*
* Handle sys_wait4 work for one task in state TASK_STOPPED. We hold
* read_lock(&tasklist_lock) on entry. If we return zero, we still hold
int options, struct siginfo __user *infop,
int __user *stat_addr, struct rusage __user *ru)
{
- int retval, exit_code, why;
+ int retval, exit_code, *p_code, why;
uid_t uid = 0; /* unneeded, required by compiler */
pid_t pid;
exit_code = 0;
spin_lock_irq(&p->sighand->siglock);
- if (unlikely(!task_is_stopped_or_traced(p)))
- goto unlock_sig;
-
- if (!ptrace && p->signal->group_stop_count > 0)
- /*
- * A group stop is in progress and this is the group leader.
- * We won't report until all threads have stopped.
- */
+ p_code = task_stopped_code(p, ptrace);
+ if (unlikely(!p_code))
goto unlock_sig;
- exit_code = p->exit_code;
+ exit_code = *p_code;
if (!exit_code)
goto unlock_sig;
if (!unlikely(options & WNOWAIT))
- p->exit_code = 0;
+ *p_code = 0;
/* don't need the RCU readlock here as we're holding a spinlock */
uid = __task_cred(p)->uid;
*/
*notask_error = 0;
- if (task_is_stopped_or_traced(p))
+ if (task_stopped_code(p, ptrace))
return wait_task_stopped(ptrace, p, options,
infop, stat_addr, ru);
pid = find_get_pid(-upid);
} else if (upid == 0) {
type = PIDTYPE_PGID;
- pid = get_pid(task_pgrp(current));
+ pid = get_task_pid(current, PIDTYPE_PGID);
} else /* upid > 0 */ {
type = PIDTYPE_PID;
pid = find_get_pid(upid);
#include <linux/tty.h>
#include <linux/proc_fs.h>
#include <linux/blkdev.h>
+ #include <linux/fs_struct.h>
#include <trace/sched.h>
#include <linux/magic.h>
mm->free_area_cache = oldmm->mmap_base;
mm->cached_hole_size = ~0UL;
mm->map_count = 0;
- cpus_clear(mm->cpu_vm_mask);
+ cpumask_clear(mm_cpumask(mm));
mm->mm_rb = RB_ROOT;
rb_link = &mm->mm_rb.rb_node;
rb_parent = NULL;
return retval;
}
- static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
- {
- struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
- /* We don't need to lock fs - think why ;-) */
- if (fs) {
- atomic_set(&fs->count, 1);
- rwlock_init(&fs->lock);
- fs->umask = old->umask;
- read_lock(&old->lock);
- fs->root = old->root;
- path_get(&old->root);
- fs->pwd = old->pwd;
- path_get(&old->pwd);
- read_unlock(&old->lock);
- }
- return fs;
- }
-
- struct fs_struct *copy_fs_struct(struct fs_struct *old)
- {
- return __copy_fs_struct(old);
- }
-
- EXPORT_SYMBOL_GPL(copy_fs_struct);
-
static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
{
+ struct fs_struct *fs = current->fs;
if (clone_flags & CLONE_FS) {
- atomic_inc(¤t->fs->count);
+ /* tsk->fs is already what we want */
+ write_lock(&fs->lock);
+ if (fs->in_exec) {
+ write_unlock(&fs->lock);
+ return -EAGAIN;
+ }
+ fs->users++;
+ write_unlock(&fs->lock);
return 0;
}
- tsk->fs = __copy_fs_struct(current->fs);
+ tsk->fs = copy_fs_struct(fs);
if (!tsk->fs)
return -ENOMEM;
return 0;
atomic_set(&sig->live, 1);
init_waitqueue_head(&sig->wait_chldexit);
sig->flags = 0;
+ if (clone_flags & CLONE_NEWPID)
+ sig->flags |= SIGNAL_UNKILLABLE;
sig->group_exit_code = 0;
sig->group_exit_task = NULL;
sig->group_stop_count = 0;
goto fork_out;
rt_mutex_init_task(p);
+ perf_counter_init_task(p);
#ifdef CONFIG_PROVE_LOCKING
DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
goto bad_fork_cleanup_mm;
if ((retval = copy_io(clone_flags, p)))
goto bad_fork_cleanup_namespaces;
- retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
+ retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
if (retval)
goto bad_fork_cleanup_io;
#endif
clear_all_latency_tracing(p);
- /* Our parent execution domain becomes current domain
- These must match for thread signalling to apply */
- p->parent_exec_id = p->self_exec_id;
-
/* ok, now we should be set up.. */
p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
p->pdeath_signal = 0;
set_task_cpu(p, smp_processor_id());
/* CLONE_PARENT re-uses the old parent */
- if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
+ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
p->real_parent = current->real_parent;
- else
+ p->parent_exec_id = current->parent_exec_id;
+ } else {
p->real_parent = current;
+ p->parent_exec_id = current->self_exec_id;
+ }
spin_lock(¤t->sighand->siglock);
p->signal->leader_pid = pid;
tty_kref_put(p->signal->tty);
p->signal->tty = tty_kref_get(current->signal->tty);
- set_task_pgrp(p, task_pgrp_nr(current));
- set_task_session(p, task_session_nr(current));
attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
attach_pid(p, PIDTYPE_SID, task_session(current));
list_add_tail_rcu(&p->tasks, &init_task.tasks);
mm_cachep = kmem_cache_create("mm_struct",
sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
mmap_init();
}
{
struct fs_struct *fs = current->fs;
- if ((unshare_flags & CLONE_FS) &&
- (fs && atomic_read(&fs->count) > 1)) {
- *new_fsp = __copy_fs_struct(current->fs);
- if (!*new_fsp)
- return -ENOMEM;
- }
+ if (!(unshare_flags & CLONE_FS) || !fs)
+ return 0;
+
+ /* don't need lock here; in the worst case we'll do useless copy */
+ if (fs->users == 1)
+ return 0;
+
+ *new_fsp = copy_fs_struct(fs);
+ if (!*new_fsp)
+ return -ENOMEM;
return 0;
}
if (new_fs) {
fs = current->fs;
+ write_lock(&fs->lock);
current->fs = new_fs;
- new_fs = fs;
+ if (--fs->users)
+ new_fs = NULL;
+ else
+ new_fs = fs;
+ write_unlock(&fs->lock);
}
if (new_mm) {
bad_unshare_cleanup_fs:
if (new_fs)
- put_fs_struct(new_fs);
+ free_fs_struct(new_fs);
bad_unshare_cleanup_thread:
bad_unshare_out:
*/
static DEFINE_SPINLOCK(task_group_lock);
+ #ifdef CONFIG_SMP
+ static int root_task_group_empty(void)
+ {
+ return list_empty(&root_task_group.children);
+ }
+ #endif
+
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_USER_SCHED
# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
#else
+ #ifdef CONFIG_SMP
+ static int root_task_group_empty(void)
+ {
+ return 1;
+ }
+ #endif
+
static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
static inline struct task_group *task_group(struct task_struct *p)
{
struct rt_prio_array active;
unsigned long rt_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
- int highest_prio; /* highest queued rt task prio */
+ struct {
+ int curr; /* highest queued rt task prio */
+ #ifdef CONFIG_SMP
+ int next; /* next highest */
+ #endif
+ } highest_prio;
#endif
#ifdef CONFIG_SMP
unsigned long rt_nr_migratory;
int overloaded;
+ struct plist_head pushable_tasks;
#endif
int rt_throttled;
u64 rt_time;
unsigned long nr_running;
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
- unsigned char idle_at_tick;
#ifdef CONFIG_NO_HZ
unsigned long last_tick_seen;
unsigned char in_nohz_recently;
struct load_weight load;
unsigned long nr_load_updates;
u64 nr_switches;
+ u64 nr_migrations_in;
struct cfs_rq cfs;
struct rt_rq rt;
struct root_domain *rd;
struct sched_domain *sd;
+ unsigned char idle_at_tick;
/* For active balancing */
int active_balance;
int push_cpu;
/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
/* sys_sched_yield() stats */
- unsigned int yld_exp_empty;
- unsigned int yld_act_empty;
- unsigned int yld_both_empty;
unsigned int yld_count;
/* schedule() stats */
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
-static inline void update_rq_clock(struct rq *rq)
+inline void update_rq_clock(struct rq *rq)
{
rq->clock = sched_clock_cpu(cpu_of(rq));
}
}
}
+void curr_rq_lock_irq_save(unsigned long *flags)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ local_irq_save(*flags);
+ rq = cpu_rq(smp_processor_id());
+ spin_lock(&rq->lock);
+}
+
+void curr_rq_unlock_irq_restore(unsigned long *flags)
+ __releases(rq->lock)
+{
+ struct rq *rq;
+
+ rq = cpu_rq(smp_processor_id());
+ spin_unlock(&rq->lock);
+ local_irq_restore(*flags);
+}
+
void task_rq_unlock_wait(struct task_struct *p)
{
struct rq *rq = task_rq(p);
if (rq == this_rq()) {
hrtimer_restart(timer);
} else if (!rq->hrtick_csd_pending) {
- __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
+ __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
rq->hrtick_csd_pending = 1;
}
}
assert_spin_locked(&task_rq(p)->lock);
- if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
+ if (test_tsk_need_resched(p))
return;
- set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+ set_tsk_need_resched(p);
cpu = task_cpu(p);
if (cpu == smp_processor_id())
* lockless. The worst case is that the other CPU runs the
* idle task through an additional NOOP schedule()
*/
- set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
+ set_tsk_need_resched(rq->idle);
/* NEED_RESCHED must be visible before we test polling */
smp_mb();
#endif
+ #ifdef CONFIG_PREEMPT
+
/*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ * fair double_lock_balance: Safely acquires both rq->locks in a fair
+ * way at the expense of forcing extra atomic operations in all
+ * invocations. This assures that the double_lock is acquired using the
+ * same underlying policy as the spinlock_t on this architecture, which
+ * reduces latency compared to the unfair variant below. However, it
+ * also adds more overhead and therefore may reduce throughput.
*/
- static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ __releases(this_rq->lock)
+ __acquires(busiest->lock)
+ __acquires(this_rq->lock)
+ {
+ spin_unlock(&this_rq->lock);
+ double_rq_lock(this_rq, busiest);
+
+ return 1;
+ }
+
+ #else
+ /*
+ * Unfair double_lock_balance: Optimizes throughput at the expense of
+ * latency by eliminating extra atomic operations when the locks are
+ * already in proper order on entry. This favors lower cpu-ids and will
+ * grant the double lock to lower cpus over higher ids under contention,
+ * regardless of entry order into the function.
+ */
+ static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
__releases(this_rq->lock)
__acquires(busiest->lock)
__acquires(this_rq->lock)
{
int ret = 0;
- if (unlikely(!irqs_disabled())) {
- /* printk() doesn't work good under rq->lock */
- spin_unlock(&this_rq->lock);
- BUG_ON(1);
- }
if (unlikely(!spin_trylock(&busiest->lock))) {
if (busiest < this_rq) {
spin_unlock(&this_rq->lock);
return ret;
}
+ #endif /* CONFIG_PREEMPT */
+
+ /*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ {
+ if (unlikely(!irqs_disabled())) {
+ /* printk() doesn't work good under rq->lock */
+ spin_unlock(&this_rq->lock);
+ BUG_ON(1);
+ }
+
+ return _double_lock_balance(this_rq, busiest);
+ }
+
static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
__releases(busiest->lock)
{
static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
{
+ if (wakeup)
+ p->se.start_runtime = p->se.sum_exec_runtime;
+
sched_info_queued(p);
p->sched_class->enqueue_task(rq, p, wakeup);
p->se.on_rq = 1;
static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
{
- if (sleep && p->se.last_wakeup) {
- update_avg(&p->se.avg_overlap,
- p->se.sum_exec_runtime - p->se.last_wakeup);
- p->se.last_wakeup = 0;
+ if (sleep) {
+ if (p->se.last_wakeup) {
+ update_avg(&p->se.avg_overlap,
+ p->se.sum_exec_runtime - p->se.last_wakeup);
+ p->se.last_wakeup = 0;
+ } else {
+ update_avg(&p->se.avg_wakeup,
+ sysctl_sched_wakeup_granularity);
+ }
}
sched_info_dequeued(p);
p->se.sleep_start -= clock_offset;
if (p->se.block_start)
p->se.block_start -= clock_offset;
+#endif
if (old_cpu != new_cpu) {
- schedstat_inc(p, se.nr_migrations);
+ p->se.nr_migrations++;
+ new_rq->nr_migrations_in++;
+#ifdef CONFIG_SCHEDSTATS
if (task_hot(p, old_rq->clock, NULL))
schedstat_inc(p, se.nr_forced2_migrations);
- }
#endif
+ }
p->se.vruntime -= old_cfsrq->min_vruntime -
new_cfsrq->min_vruntime;
* it must be off the runqueue _entirely_, and not
* preempted!
*
- * So if it wa still runnable (but just not actively
+ * So if it was still runnable (but just not actively
* running right now), it's preempted, and we should
* yield - it could be a while.
*/
#endif /* CONFIG_SMP */
+/**
+ * task_oncpu_function_call - call a function on the cpu on which a task runs
+ * @p: the task to evaluate
+ * @func: the function to be called
+ * @info: the function call argument
+ *
+ * Calls the function @func when the task is currently running. This might
+ * be on the current CPU, which just calls the function directly
+ */
+void task_oncpu_function_call(struct task_struct *p,
+ void (*func) (void *info), void *info)
+{
+ int cpu;
+
+ preempt_disable();
+ cpu = task_cpu(p);
+ if (task_curr(p))
+ smp_call_function_single(cpu, func, info, 1);
+ preempt_enable();
+}
+
/***
* try_to_wake_up - wake up a thread
* @p: the to-be-woken-up thread
sync = 0;
#ifdef CONFIG_SMP
- if (sched_feat(LB_WAKEUP_UPDATE)) {
+ if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
struct sched_domain *sd;
this_cpu = raw_smp_processor_id();
activate_task(rq, p, 1);
success = 1;
+ /*
+ * Only attribute actual wakeups done by this task.
+ */
+ if (!in_interrupt()) {
+ struct sched_entity *se = ¤t->se;
+ u64 sample = se->sum_exec_runtime;
+
+ if (se->last_wakeup)
+ sample -= se->last_wakeup;
+ else
+ sample -= se->start_runtime;
+ update_avg(&se->avg_wakeup, sample);
+
+ se->last_wakeup = se->sum_exec_runtime;
+ }
+
out_running:
trace_sched_wakeup(rq, p, success);
check_preempt_curr(rq, p, sync);
p->sched_class->task_wake_up(rq, p);
#endif
out:
- current->se.last_wakeup = current->se.sum_exec_runtime;
-
task_rq_unlock(rq, &flags);
return success;
p->se.exec_start = 0;
p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0;
+ p->se.nr_migrations = 0;
p->se.last_wakeup = 0;
p->se.avg_overlap = 0;
+ p->se.start_runtime = 0;
+ p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
#ifdef CONFIG_SCHEDSTATS
p->se.wait_start = 0;
/* Want to start with kernel preemption disabled. */
task_thread_info(p)->preempt_count = 1;
#endif
+ plist_node_init(&p->pushable_tasks, MAX_PRIO);
+
put_cpu();
}
#ifdef CONFIG_PREEMPT_NOTIFIERS
/**
- * preempt_notifier_register - tell me when current is being being preempted & rescheduled
+ * preempt_notifier_register - tell me when current is being preempted & rescheduled
* @notifier: notifier struct to register
*/
void preempt_notifier_register(struct preempt_notifier *notifier)
{
struct mm_struct *mm = rq->prev_mm;
long prev_state;
+ #ifdef CONFIG_SMP
+ int post_schedule = 0;
+
+ if (current->sched_class->needs_post_schedule)
+ post_schedule = current->sched_class->needs_post_schedule(rq);
+ #endif
rq->prev_mm = NULL;
*/
prev_state = prev->state;
finish_arch_switch(prev);
+ perf_counter_task_sched_in(current, cpu_of(rq));
finish_lock_switch(rq, prev);
#ifdef CONFIG_SMP
- if (current->sched_class->post_schedule)
+ if (post_schedule)
current->sched_class->post_schedule(rq);
#endif
}
/*
+ * Externally visible per-cpu scheduler statistics:
+ * cpu_nr_switches(cpu) - number of context switches on that cpu
+ * cpu_nr_migrations(cpu) - number of migrations into that cpu
+ */
+u64 cpu_nr_switches(int cpu)
+{
+ return cpu_rq(cpu)->nr_switches;
+}
+
+u64 cpu_nr_migrations(int cpu)
+{
+ return cpu_rq(cpu)->nr_migrations_in;
+}
+
+/*
* Update rq->cpu_load[] statistics. This function is usually called every
* scheduler tick (TICK_NSEC).
*/
struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned)
{
+ int tsk_cache_hot = 0;
/*
* We do not migrate tasks that are:
* 1) running (obviously), or
* 2) too many balance attempts have failed.
*/
- if (!task_hot(p, rq->clock, sd) ||
- sd->nr_balance_failed > sd->cache_nice_tries) {
+ tsk_cache_hot = task_hot(p, rq->clock, sd);
+ if (!tsk_cache_hot ||
+ sd->nr_balance_failed > sd->cache_nice_tries) {
#ifdef CONFIG_SCHEDSTATS
- if (task_hot(p, rq->clock, sd)) {
+ if (tsk_cache_hot) {
schedstat_inc(sd, lb_hot_gained[idle]);
schedstat_inc(p, se.nr_forced_migrations);
}
return 1;
}
- if (task_hot(p, rq->clock, sd)) {
+ if (tsk_cache_hot) {
schedstat_inc(p, se.nr_failed_migrations_hot);
return 0;
}
pulled++;
rem_load_move -= p->se.load.weight;
+ #ifdef CONFIG_PREEMPT
+ /*
+ * NEWIDLE balancing is a source of latency, so preemptible kernels
+ * will stop after the first task is pulled to minimize the critical
+ * section.
+ */
+ if (idle == CPU_NEWLY_IDLE)
+ goto out;
+ #endif
+
/*
* We only want to steal up to the prescribed amount of weighted load.
*/
sd, idle, all_pinned, &this_best_prio);
class = class->next;
+ #ifdef CONFIG_PREEMPT
+ /*
+ * NEWIDLE balancing is a source of latency, so preemptible
+ * kernels will stop after the first task is pulled to minimize
+ * the critical section.
+ */
if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
break;
-
+ #endif
} while (class && max_load_move > total_load_moved);
return total_load_moved > 0;
return 0;
}
-
+ /********** Helpers for find_busiest_group ************************/
/*
- * find_busiest_group finds and returns the busiest CPU group within the
- * domain. It calculates and returns the amount of weighted load which
- * should be moved to restore balance via the imbalance parameter.
+ * sd_lb_stats - Structure to store the statistics of a sched_domain
+ * during load balancing.
*/
- static struct sched_group *
- find_busiest_group(struct sched_domain *sd, int this_cpu,
- unsigned long *imbalance, enum cpu_idle_type idle,
- int *sd_idle, const struct cpumask *cpus, int *balance)
- {
- struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
- unsigned long max_load, avg_load, total_load, this_load, total_pwr;
- unsigned long max_pull;
- unsigned long busiest_load_per_task, busiest_nr_running;
- unsigned long this_load_per_task, this_nr_running;
- int load_idx, group_imb = 0;
+ struct sd_lb_stats {
+ struct sched_group *busiest; /* Busiest group in this sd */
+ struct sched_group *this; /* Local group in this sd */
+ unsigned long total_load; /* Total load of all groups in sd */
+ unsigned long total_pwr; /* Total power of all groups in sd */
+ unsigned long avg_load; /* Average load across all groups in sd */
+
+ /** Statistics of this group */
+ unsigned long this_load;
+ unsigned long this_load_per_task;
+ unsigned long this_nr_running;
+
+ /* Statistics of the busiest group */
+ unsigned long max_load;
+ unsigned long busiest_load_per_task;
+ unsigned long busiest_nr_running;
+
+ int group_imb; /* Is there imbalance in this sd */
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- int power_savings_balance = 1;
- unsigned long leader_nr_running = 0, min_load_per_task = 0;
- unsigned long min_nr_running = ULONG_MAX;
- struct sched_group *group_min = NULL, *group_leader = NULL;
+ int power_savings_balance; /* Is powersave balance needed for this sd */
+ struct sched_group *group_min; /* Least loaded group in sd */
+ struct sched_group *group_leader; /* Group which relieves group_min */
+ unsigned long min_load_per_task; /* load_per_task in group_min */
+ unsigned long leader_nr_running; /* Nr running of group_leader */
+ unsigned long min_nr_running; /* Nr running of group_min */
#endif
+ };
- max_load = this_load = total_load = total_pwr = 0;
- busiest_load_per_task = busiest_nr_running = 0;
- this_load_per_task = this_nr_running = 0;
+ /*
+ * sg_lb_stats - stats of a sched_group required for load_balancing
+ */
+ struct sg_lb_stats {
+ unsigned long avg_load; /*Avg load across the CPUs of the group */
+ unsigned long group_load; /* Total load over the CPUs of the group */
+ unsigned long sum_nr_running; /* Nr tasks running in the group */
+ unsigned long sum_weighted_load; /* Weighted load of group's tasks */
+ unsigned long group_capacity;
+ int group_imb; /* Is there an imbalance in the group ? */
+ };
+
+ /**
+ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
+ * @group: The group whose first cpu is to be returned.
+ */
+ static inline unsigned int group_first_cpu(struct sched_group *group)
+ {
+ return cpumask_first(sched_group_cpus(group));
+ }
+
+ /**
+ * get_sd_load_idx - Obtain the load index for a given sched domain.
+ * @sd: The sched_domain whose load_idx is to be obtained.
+ * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
+ */
+ static inline int get_sd_load_idx(struct sched_domain *sd,
+ enum cpu_idle_type idle)
+ {
+ int load_idx;
- if (idle == CPU_NOT_IDLE)
+ switch (idle) {
+ case CPU_NOT_IDLE:
load_idx = sd->busy_idx;
- else if (idle == CPU_NEWLY_IDLE)
+ break;
+
+ case CPU_NEWLY_IDLE:
load_idx = sd->newidle_idx;
- else
+ break;
+ default:
load_idx = sd->idle_idx;
+ break;
+ }
- do {
- unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
- int local_group;
- int i;
- int __group_imb = 0;
- unsigned int balance_cpu = -1, first_idle_cpu = 0;
- unsigned long sum_nr_running, sum_weighted_load;
- unsigned long sum_avg_load_per_task;
- unsigned long avg_load_per_task;
+ return load_idx;
+ }
- local_group = cpumask_test_cpu(this_cpu,
- sched_group_cpus(group));
- if (local_group)
- balance_cpu = cpumask_first(sched_group_cpus(group));
+ #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+ /**
+ * init_sd_power_savings_stats - Initialize power savings statistics for
+ * the given sched_domain, during load balancing.
+ *
+ * @sd: Sched domain whose power-savings statistics are to be initialized.
+ * @sds: Variable containing the statistics for sd.
+ * @idle: Idle status of the CPU at which we're performing load-balancing.
+ */
+ static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+ struct sd_lb_stats *sds, enum cpu_idle_type idle)
+ {
+ /*
+ * Busy processors will not participate in power savings
+ * balance.
+ */
+ if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+ sds->power_savings_balance = 0;
+ else {
+ sds->power_savings_balance = 1;
+ sds->min_nr_running = ULONG_MAX;
+ sds->leader_nr_running = 0;
+ }
+ }
- /* Tally up the load of all CPUs in the group */
- sum_weighted_load = sum_nr_running = avg_load = 0;
- sum_avg_load_per_task = avg_load_per_task = 0;
+ /**
+ * update_sd_power_savings_stats - Update the power saving stats for a
+ * sched_domain while performing load balancing.
+ *
+ * @group: sched_group belonging to the sched_domain under consideration.
+ * @sds: Variable containing the statistics of the sched_domain
+ * @local_group: Does group contain the CPU for which we're performing
+ * load balancing ?
+ * @sgs: Variable containing the statistics of the group.
+ */
+ static inline void update_sd_power_savings_stats(struct sched_group *group,
+ struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+ {
- max_cpu_load = 0;
- min_cpu_load = ~0UL;
+ if (!sds->power_savings_balance)
+ return;
- for_each_cpu_and(i, sched_group_cpus(group), cpus) {
- struct rq *rq = cpu_rq(i);
+ /*
+ * If the local group is idle or completely loaded
+ * no need to do power savings balance at this domain
+ */
+ if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
+ !sds->this_nr_running))
+ sds->power_savings_balance = 0;
- if (*sd_idle && rq->nr_running)
- *sd_idle = 0;
+ /*
+ * If a group is already running at full capacity or idle,
+ * don't include that group in power savings calculations
+ */
+ if (!sds->power_savings_balance ||
+ sgs->sum_nr_running >= sgs->group_capacity ||
+ !sgs->sum_nr_running)
+ return;
- /* Bias balancing toward cpus of our domain */
- if (local_group) {
- if (idle_cpu(i) && !first_idle_cpu) {
- first_idle_cpu = 1;
- balance_cpu = i;
- }
+ /*
+ * Calculate the group which has the least non-idle load.
+ * This is the group from where we need to pick up the load
+ * for saving power
+ */
+ if ((sgs->sum_nr_running < sds->min_nr_running) ||
+ (sgs->sum_nr_running == sds->min_nr_running &&
+ group_first_cpu(group) > group_first_cpu(sds->group_min))) {
+ sds->group_min = group;
+ sds->min_nr_running = sgs->sum_nr_running;
+ sds->min_load_per_task = sgs->sum_weighted_load /
+ sgs->sum_nr_running;
+ }
- load = target_load(i, load_idx);
- } else {
- load = source_load(i, load_idx);
- if (load > max_cpu_load)
- max_cpu_load = load;
- if (min_cpu_load > load)
- min_cpu_load = load;
- }
+ /*
+ * Calculate the group which is almost near its
+ * capacity but still has some space to pick up some load
+ * from other group and save more power
+ */
+ if (sgs->sum_nr_running > sgs->group_capacity - 1)
+ return;
- avg_load += load;
- sum_nr_running += rq->nr_running;
- sum_weighted_load += weighted_cpuload(i);
+ if (sgs->sum_nr_running > sds->leader_nr_running ||
+ (sgs->sum_nr_running == sds->leader_nr_running &&
+ group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
+ sds->group_leader = group;
+ sds->leader_nr_running = sgs->sum_nr_running;
+ }
+ }
- sum_avg_load_per_task += cpu_avg_load_per_task(i);
- }
+ /**
+ * check_power_save_busiest_group - see if there is potential for some power-savings balance
+ * @sds: Variable containing the statistics of the sched_domain
+ * under consideration.
+ * @this_cpu: Cpu at which we're currently performing load-balancing.
+ * @imbalance: Variable to store the imbalance.
+ *
+ * Description:
+ * Check if we have potential to perform some power-savings balance.
+ * If yes, set the busiest group to be the least loaded group in the
+ * sched_domain, so that it's CPUs can be put to idle.
+ *
+ * Returns 1 if there is potential to perform power-savings balance.
+ * Else returns 0.
+ */
+ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+ int this_cpu, unsigned long *imbalance)
+ {
+ if (!sds->power_savings_balance)
+ return 0;
- /*
- * First idle cpu or the first cpu(busiest) in this sched group
- * is eligible for doing load balancing at this and above
- * domains. In the newly idle case, we will allow all the cpu's
- * to do the newly idle load balance.
- */
- if (idle != CPU_NEWLY_IDLE && local_group &&
- balance_cpu != this_cpu && balance) {
- *balance = 0;
- goto ret;
- }
+ if (sds->this != sds->group_leader ||
+ sds->group_leader == sds->group_min)
+ return 0;
- total_load += avg_load;
- total_pwr += group->__cpu_power;
+ *imbalance = sds->min_load_per_task;
+ sds->busiest = sds->group_min;
- /* Adjust by relative CPU power of the group */
- avg_load = sg_div_cpu_power(group,
- avg_load * SCHED_LOAD_SCALE);
+ if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
+ cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
+ group_first_cpu(sds->group_leader);
+ }
+ return 1;
- /*
- * Consider the group unbalanced when the imbalance is larger
- * than the average weight of two tasks.
- *
- * APZ: with cgroup the avg task weight can vary wildly and
- * might not be a suitable number - should we keep a
- * normalized nr_running number somewhere that negates
- * the hierarchy?
- */
- avg_load_per_task = sg_div_cpu_power(group,
- sum_avg_load_per_task * SCHED_LOAD_SCALE);
+ }
+ #else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+ static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+ struct sd_lb_stats *sds, enum cpu_idle_type idle)
+ {
+ return;
+ }
+
+ static inline void update_sd_power_savings_stats(struct sched_group *group,
+ struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+ {
+ return;
+ }
+
+ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+ int this_cpu, unsigned long *imbalance)
+ {
+ return 0;
+ }
+ #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+
+
+ /**
+ * update_sg_lb_stats - Update sched_group's statistics for load balancing.
+ * @group: sched_group whose statistics are to be updated.
+ * @this_cpu: Cpu for which load balance is currently performed.
+ * @idle: Idle status of this_cpu
+ * @load_idx: Load index of sched_domain of this_cpu for load calc.
+ * @sd_idle: Idle status of the sched_domain containing group.
+ * @local_group: Does group contain this_cpu.
+ * @cpus: Set of cpus considered for load balancing.
+ * @balance: Should we balance.
+ * @sgs: variable to hold the statistics for this group.
+ */
+ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
+ enum cpu_idle_type idle, int load_idx, int *sd_idle,
+ int local_group, const struct cpumask *cpus,
+ int *balance, struct sg_lb_stats *sgs)
+ {
+ unsigned long load, max_cpu_load, min_cpu_load;
+ int i;
+ unsigned int balance_cpu = -1, first_idle_cpu = 0;
+ unsigned long sum_avg_load_per_task;
+ unsigned long avg_load_per_task;
+
+ if (local_group)
+ balance_cpu = group_first_cpu(group);
- if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
- __group_imb = 1;
+ /* Tally up the load of all CPUs in the group */
+ sum_avg_load_per_task = avg_load_per_task = 0;
+ max_cpu_load = 0;
+ min_cpu_load = ~0UL;
- group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
+ for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+ struct rq *rq = cpu_rq(i);
+ if (*sd_idle && rq->nr_running)
+ *sd_idle = 0;
+
+ /* Bias balancing toward cpus of our domain */
if (local_group) {
- this_load = avg_load;
- this = group;
- this_nr_running = sum_nr_running;
- this_load_per_task = sum_weighted_load;
- } else if (avg_load > max_load &&
- (sum_nr_running > group_capacity || __group_imb)) {
- max_load = avg_load;
- busiest = group;
- busiest_nr_running = sum_nr_running;
- busiest_load_per_task = sum_weighted_load;
- group_imb = __group_imb;
+ if (idle_cpu(i) && !first_idle_cpu) {
+ first_idle_cpu = 1;
+ balance_cpu = i;
+ }
+
+ load = target_load(i, load_idx);
+ } else {
+ load = source_load(i, load_idx);
+ if (load > max_cpu_load)
+ max_cpu_load = load;
+ if (min_cpu_load > load)
+ min_cpu_load = load;
}
- #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- /*
- * Busy processors will not participate in power savings
- * balance.
- */
- if (idle == CPU_NOT_IDLE ||
- !(sd->flags & SD_POWERSAVINGS_BALANCE))
- goto group_next;
+ sgs->group_load += load;
+ sgs->sum_nr_running += rq->nr_running;
+ sgs->sum_weighted_load += weighted_cpuload(i);
- /*
- * If the local group is idle or completely loaded
- * no need to do power savings balance at this domain
- */
- if (local_group && (this_nr_running >= group_capacity ||
- !this_nr_running))
- power_savings_balance = 0;
+ sum_avg_load_per_task += cpu_avg_load_per_task(i);
+ }
- /*
- * If a group is already running at full capacity or idle,
- * don't include that group in power savings calculations
- */
- if (!power_savings_balance || sum_nr_running >= group_capacity
- || !sum_nr_running)
- goto group_next;
+ /*
+ * First idle cpu or the first cpu(busiest) in this sched group
+ * is eligible for doing load balancing at this and above
+ * domains. In the newly idle case, we will allow all the cpu's
+ * to do the newly idle load balance.
+ */
+ if (idle != CPU_NEWLY_IDLE && local_group &&
+ balance_cpu != this_cpu && balance) {
+ *balance = 0;
+ return;
+ }
- /*
- * Calculate the group which has the least non-idle load.
- * This is the group from where we need to pick up the load
- * for saving power
- */
- if ((sum_nr_running < min_nr_running) ||
- (sum_nr_running == min_nr_running &&
- cpumask_first(sched_group_cpus(group)) >
- cpumask_first(sched_group_cpus(group_min)))) {
- group_min = group;
- min_nr_running = sum_nr_running;
- min_load_per_task = sum_weighted_load /
- sum_nr_running;
- }
+ /* Adjust by relative CPU power of the group */
+ sgs->avg_load = sg_div_cpu_power(group,
+ sgs->group_load * SCHED_LOAD_SCALE);
- /*
- * Calculate the group which is almost near its
- * capacity but still has some space to pick up some load
- * from other group and save more power
- */
- if (sum_nr_running <= group_capacity - 1) {
- if (sum_nr_running > leader_nr_running ||
- (sum_nr_running == leader_nr_running &&
- cpumask_first(sched_group_cpus(group)) <
- cpumask_first(sched_group_cpus(group_leader)))) {
- group_leader = group;
- leader_nr_running = sum_nr_running;
- }
+
+ /*
+ * Consider the group unbalanced when the imbalance is larger
+ * than the average weight of two tasks.
+ *
+ * APZ: with cgroup the avg task weight can vary wildly and
+ * might not be a suitable number - should we keep a
+ * normalized nr_running number somewhere that negates
+ * the hierarchy?
+ */
+ avg_load_per_task = sg_div_cpu_power(group,
+ sum_avg_load_per_task * SCHED_LOAD_SCALE);
+
+ if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+ sgs->group_imb = 1;
+
+ sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
+
+ }
+
+ /**
+ * update_sd_lb_stats - Update sched_group's statistics for load balancing.
+ * @sd: sched_domain whose statistics are to be updated.
+ * @this_cpu: Cpu for which load balance is currently performed.
+ * @idle: Idle status of this_cpu
+ * @sd_idle: Idle status of the sched_domain containing group.
+ * @cpus: Set of cpus considered for load balancing.
+ * @balance: Should we balance.
+ * @sds: variable to hold the statistics for this sched_domain.
+ */
+ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
+ enum cpu_idle_type idle, int *sd_idle,
+ const struct cpumask *cpus, int *balance,
+ struct sd_lb_stats *sds)
+ {
+ struct sched_group *group = sd->groups;
+ struct sg_lb_stats sgs;
+ int load_idx;
+
+ init_sd_power_savings_stats(sd, sds, idle);
+ load_idx = get_sd_load_idx(sd, idle);
+
+ do {
+ int local_group;
+
+ local_group = cpumask_test_cpu(this_cpu,
+ sched_group_cpus(group));
+ memset(&sgs, 0, sizeof(sgs));
+ update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
+ local_group, cpus, balance, &sgs);
+
+ if (local_group && balance && !(*balance))
+ return;
+
+ sds->total_load += sgs.group_load;
+ sds->total_pwr += group->__cpu_power;
+
+ if (local_group) {
+ sds->this_load = sgs.avg_load;
+ sds->this = group;
+ sds->this_nr_running = sgs.sum_nr_running;
+ sds->this_load_per_task = sgs.sum_weighted_load;
+ } else if (sgs.avg_load > sds->max_load &&
+ (sgs.sum_nr_running > sgs.group_capacity ||
+ sgs.group_imb)) {
+ sds->max_load = sgs.avg_load;
+ sds->busiest = group;
+ sds->busiest_nr_running = sgs.sum_nr_running;
+ sds->busiest_load_per_task = sgs.sum_weighted_load;
+ sds->group_imb = sgs.group_imb;
}
- group_next:
- #endif
+
+ update_sd_power_savings_stats(group, sds, local_group, &sgs);
group = group->next;
} while (group != sd->groups);
- if (!busiest || this_load >= max_load || busiest_nr_running == 0)
- goto out_balanced;
-
- avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
+ }
- if (this_load >= avg_load ||
- 100*max_load <= sd->imbalance_pct*this_load)
- goto out_balanced;
+ /**
+ * fix_small_imbalance - Calculate the minor imbalance that exists
+ * amongst the groups of a sched_domain, during
+ * load balancing.
+ * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
+ * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ * @imbalance: Variable to store the imbalance.
+ */
+ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
+ int this_cpu, unsigned long *imbalance)
+ {
+ unsigned long tmp, pwr_now = 0, pwr_move = 0;
+ unsigned int imbn = 2;
+
+ if (sds->this_nr_running) {
+ sds->this_load_per_task /= sds->this_nr_running;
+ if (sds->busiest_load_per_task >
+ sds->this_load_per_task)
+ imbn = 1;
+ } else
+ sds->this_load_per_task =
+ cpu_avg_load_per_task(this_cpu);
- busiest_load_per_task /= busiest_nr_running;
- if (group_imb)
- busiest_load_per_task = min(busiest_load_per_task, avg_load);
+ if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
+ sds->busiest_load_per_task * imbn) {
+ *imbalance = sds->busiest_load_per_task;
+ return;
+ }
/*
- * We're trying to get all the cpus to the average_load, so we don't
- * want to push ourselves above the average load, nor do we wish to
- * reduce the max loaded cpu below the average load, as either of these
- * actions would just result in more rebalancing later, and ping-pong
- * tasks around. Thus we look for the minimum possible imbalance.
- * Negative imbalances (*we* are more loaded than anyone else) will
- * be counted as no imbalance for these purposes -- we can't fix that
- * by pulling tasks to us. Be careful of negative numbers as they'll
- * appear as very large values with unsigned longs.
+ * OK, we don't have enough imbalance to justify moving tasks,
+ * however we may be able to increase total CPU power used by
+ * moving them.
*/
- if (max_load <= busiest_load_per_task)
- goto out_balanced;
+ pwr_now += sds->busiest->__cpu_power *
+ min(sds->busiest_load_per_task, sds->max_load);
+ pwr_now += sds->this->__cpu_power *
+ min(sds->this_load_per_task, sds->this_load);
+ pwr_now /= SCHED_LOAD_SCALE;
+
+ /* Amount of load we'd subtract */
+ tmp = sg_div_cpu_power(sds->busiest,
+ sds->busiest_load_per_task * SCHED_LOAD_SCALE);
+ if (sds->max_load > tmp)
+ pwr_move += sds->busiest->__cpu_power *
+ min(sds->busiest_load_per_task, sds->max_load - tmp);
+
+ /* Amount of load we'd add */
+ if (sds->max_load * sds->busiest->__cpu_power <
+ sds->busiest_load_per_task * SCHED_LOAD_SCALE)
+ tmp = sg_div_cpu_power(sds->this,
+ sds->max_load * sds->busiest->__cpu_power);
+ else
+ tmp = sg_div_cpu_power(sds->this,
+ sds->busiest_load_per_task * SCHED_LOAD_SCALE);
+ pwr_move += sds->this->__cpu_power *
+ min(sds->this_load_per_task, sds->this_load + tmp);
+ pwr_move /= SCHED_LOAD_SCALE;
+
+ /* Move if we gain throughput */
+ if (pwr_move > pwr_now)
+ *imbalance = sds->busiest_load_per_task;
+ }
+
+ /**
+ * calculate_imbalance - Calculate the amount of imbalance present within the
+ * groups of a given sched_domain during load balance.
+ * @sds: statistics of the sched_domain whose imbalance is to be calculated.
+ * @this_cpu: Cpu for which currently load balance is being performed.
+ * @imbalance: The variable to store the imbalance.
+ */
+ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
+ unsigned long *imbalance)
+ {
+ unsigned long max_pull;
/*
* In the presence of smp nice balancing, certain scenarios can have
* max load less than avg load(as we skip the groups at or below
* its cpu_power, while calculating max_load..)
*/
- if (max_load < avg_load) {
+ if (sds->max_load < sds->avg_load) {
*imbalance = 0;
- goto small_imbalance;
+ return fix_small_imbalance(sds, this_cpu, imbalance);
}
/* Don't want to pull so many tasks that a group would go idle */
- max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
+ max_pull = min(sds->max_load - sds->avg_load,
+ sds->max_load - sds->busiest_load_per_task);
/* How much load to actually move to equalise the imbalance */
- *imbalance = min(max_pull * busiest->__cpu_power,
- (avg_load - this_load) * this->__cpu_power)
+ *imbalance = min(max_pull * sds->busiest->__cpu_power,
+ (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
/ SCHED_LOAD_SCALE;
/*
* a think about bumping its value to force at least one task to be
* moved
*/
- if (*imbalance < busiest_load_per_task) {
- unsigned long tmp, pwr_now, pwr_move;
- unsigned int imbn;
-
- small_imbalance:
- pwr_move = pwr_now = 0;
- imbn = 2;
- if (this_nr_running) {
- this_load_per_task /= this_nr_running;
- if (busiest_load_per_task > this_load_per_task)
- imbn = 1;
- } else
- this_load_per_task = cpu_avg_load_per_task(this_cpu);
+ if (*imbalance < sds->busiest_load_per_task)
+ return fix_small_imbalance(sds, this_cpu, imbalance);
- if (max_load - this_load + busiest_load_per_task >=
- busiest_load_per_task * imbn) {
- *imbalance = busiest_load_per_task;
- return busiest;
- }
+ }
+ /******* find_busiest_group() helpers end here *********************/
- /*
- * OK, we don't have enough imbalance to justify moving tasks,
- * however we may be able to increase total CPU power used by
- * moving them.
- */
+ /**
+ * find_busiest_group - Returns the busiest group within the sched_domain
+ * if there is an imbalance. If there isn't an imbalance, and
+ * the user has opted for power-savings, it returns a group whose
+ * CPUs can be put to idle by rebalancing those tasks elsewhere, if
+ * such a group exists.
+ *
+ * Also calculates the amount of weighted load which should be moved
+ * to restore balance.
+ *
+ * @sd: The sched_domain whose busiest group is to be returned.
+ * @this_cpu: The cpu for which load balancing is currently being performed.
+ * @imbalance: Variable which stores amount of weighted load which should
+ * be moved to restore balance/put a group to idle.
+ * @idle: The idle status of this_cpu.
+ * @sd_idle: The idleness of sd
+ * @cpus: The set of CPUs under consideration for load-balancing.
+ * @balance: Pointer to a variable indicating if this_cpu
+ * is the appropriate cpu to perform load balancing at this_level.
+ *
+ * Returns: - the busiest group if imbalance exists.
+ * - If no imbalance and user has opted for power-savings balance,
+ * return the least loaded group whose CPUs can be
+ * put to idle by rebalancing its tasks onto our group.
+ */
+ static struct sched_group *
+ find_busiest_group(struct sched_domain *sd, int this_cpu,
+ unsigned long *imbalance, enum cpu_idle_type idle,
+ int *sd_idle, const struct cpumask *cpus, int *balance)
+ {
+ struct sd_lb_stats sds;
- pwr_now += busiest->__cpu_power *
- min(busiest_load_per_task, max_load);
- pwr_now += this->__cpu_power *
- min(this_load_per_task, this_load);
- pwr_now /= SCHED_LOAD_SCALE;
-
- /* Amount of load we'd subtract */
- tmp = sg_div_cpu_power(busiest,
- busiest_load_per_task * SCHED_LOAD_SCALE);
- if (max_load > tmp)
- pwr_move += busiest->__cpu_power *
- min(busiest_load_per_task, max_load - tmp);
-
- /* Amount of load we'd add */
- if (max_load * busiest->__cpu_power <
- busiest_load_per_task * SCHED_LOAD_SCALE)
- tmp = sg_div_cpu_power(this,
- max_load * busiest->__cpu_power);
- else
- tmp = sg_div_cpu_power(this,
- busiest_load_per_task * SCHED_LOAD_SCALE);
- pwr_move += this->__cpu_power *
- min(this_load_per_task, this_load + tmp);
- pwr_move /= SCHED_LOAD_SCALE;
+ memset(&sds, 0, sizeof(sds));
- /* Move if we gain throughput */
- if (pwr_move > pwr_now)
- *imbalance = busiest_load_per_task;
- }
+ /*
+ * Compute the various statistics relavent for load balancing at
+ * this level.
+ */
+ update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
+ balance, &sds);
+
+ /* Cases where imbalance does not exist from POV of this_cpu */
+ /* 1) this_cpu is not the appropriate cpu to perform load balancing
+ * at this level.
+ * 2) There is no busy sibling group to pull from.
+ * 3) This group is the busiest group.
+ * 4) This group is more busy than the avg busieness at this
+ * sched_domain.
+ * 5) The imbalance is within the specified limit.
+ * 6) Any rebalance would lead to ping-pong
+ */
+ if (balance && !(*balance))
+ goto ret;
- return busiest;
+ if (!sds.busiest || sds.busiest_nr_running == 0)
+ goto out_balanced;
- out_balanced:
- #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
- goto ret;
+ if (sds.this_load >= sds.max_load)
+ goto out_balanced;
- if (this == group_leader && group_leader != group_min) {
- *imbalance = min_load_per_task;
- if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
- cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
- cpumask_first(sched_group_cpus(group_leader));
- }
- return group_min;
- }
- #endif
+ sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
+
+ if (sds.this_load >= sds.avg_load)
+ goto out_balanced;
+
+ if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+ goto out_balanced;
+
+ sds.busiest_load_per_task /= sds.busiest_nr_running;
+ if (sds.group_imb)
+ sds.busiest_load_per_task =
+ min(sds.busiest_load_per_task, sds.avg_load);
+
+ /*
+ * We're trying to get all the cpus to the average_load, so we don't
+ * want to push ourselves above the average load, nor do we wish to
+ * reduce the max loaded cpu below the average load, as either of these
+ * actions would just result in more rebalancing later, and ping-pong
+ * tasks around. Thus we look for the minimum possible imbalance.
+ * Negative imbalances (*we* are more loaded than anyone else) will
+ * be counted as no imbalance for these purposes -- we can't fix that
+ * by pulling tasks to us. Be careful of negative numbers as they'll
+ * appear as very large values with unsigned longs.
+ */
+ if (sds.max_load <= sds.busiest_load_per_task)
+ goto out_balanced;
+
+ /* Looks like there is an imbalance. Compute it */
+ calculate_imbalance(&sds, this_cpu, imbalance);
+ return sds.busiest;
+
+ out_balanced:
+ /*
+ * There is no obvious imbalance. But check if we can do some balancing
+ * to save power.
+ */
+ if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
+ return sds.busiest;
ret:
*imbalance = 0;
return NULL;
*/
#define MAX_PINNED_INTERVAL 512
+ /* Working cpumask for load_balance and load_balance_newidle. */
+ static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*/
static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
- int *balance, struct cpumask *cpus)
+ int *balance)
{
int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
struct sched_group *group;
unsigned long imbalance;
struct rq *busiest;
unsigned long flags;
+ struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
cpumask_setall(cpus);
* this_rq is locked.
*/
static int
- load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
- struct cpumask *cpus)
+ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
{
struct sched_group *group;
struct rq *busiest = NULL;
int ld_moved = 0;
int sd_idle = 0;
int all_pinned = 0;
+ struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
cpumask_setall(cpus);
struct sched_domain *sd;
int pulled_task = 0;
unsigned long next_balance = jiffies + HZ;
- cpumask_var_t tmpmask;
-
- if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
- return;
for_each_domain(this_cpu, sd) {
unsigned long interval;
if (sd->flags & SD_BALANCE_NEWIDLE)
/* If we've pulled tasks over stop searching: */
pulled_task = load_balance_newidle(this_cpu, this_rq,
- sd, tmpmask);
+ sd);
interval = msecs_to_jiffies(sd->balance_interval);
if (time_after(next_balance, sd->last_balance + interval))
*/
this_rq->next_balance = next_balance;
}
- free_cpumask_var(tmpmask);
}
/*
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
int need_serialize;
- cpumask_var_t tmp;
-
- /* Fails alloc? Rebalancing probably not a priority right now. */
- if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
- return;
for_each_domain(cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
}
if (time_after_eq(jiffies, sd->last_balance + interval)) {
- if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
+ if (load_balance(cpu, rq, sd, idle, &balance)) {
/*
* We've pulled tasks over so either we're no
* longer idle, or one of our SMT siblings is
*/
if (likely(update_next_balance))
rq->next_balance = next_balance;
-
- free_cpumask_var(tmp);
}
/*
#endif
}
+ static inline int on_null_domain(int cpu)
+ {
+ return !rcu_dereference(cpu_rq(cpu)->sd);
+ }
+
/*
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
*
cpumask_test_cpu(cpu, nohz.cpu_mask))
return;
#endif
- if (time_after_eq(jiffies, rq->next_balance))
+ /* Don't need to rebalance while attached to NULL domain */
+ if (time_after_eq(jiffies, rq->next_balance) &&
+ likely(!on_null_domain(cpu)))
raise_softirq(SCHED_SOFTIRQ);
}
* Return any ns on the sched_clock that have not yet been banked in
* @p in case that task is currently running.
*/
+unsigned long long __task_delta_exec(struct task_struct *p, int update)
+{
+ s64 delta_exec;
+ struct rq *rq;
+
+ rq = task_rq(p);
+ WARN_ON_ONCE(!runqueue_is_locked());
+ WARN_ON_ONCE(!task_current(rq, p));
+
+ if (update)
+ update_rq_clock(rq);
+
+ delta_exec = rq->clock - p->se.exec_start;
+
+ WARN_ON_ONCE(delta_exec < 0);
+
+ return delta_exec;
+}
+
+/*
+ * Return any ns on the sched_clock that have not yet been banked in
+ * @p in case that task is currently running.
+ */
unsigned long long task_delta_exec(struct task_struct *p)
{
unsigned long flags;
update_rq_clock(rq);
update_cpu_load(rq);
curr->sched_class->task_tick(rq, curr, 0);
+ perf_counter_task_tick(curr, cpu);
spin_unlock(&rq->lock);
#ifdef CONFIG_SMP
#endif
}
- #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
- defined(CONFIG_PREEMPT_TRACER))
-
- static inline unsigned long get_parent_ip(unsigned long addr)
+ unsigned long get_parent_ip(unsigned long addr)
{
if (in_lock_functions(addr)) {
addr = CALLER_ADDR2;
return addr;
}
+ #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+ defined(CONFIG_PREEMPT_TRACER))
+
void __kprobes add_preempt_count(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
#endif
}
+ static void put_prev_task(struct rq *rq, struct task_struct *prev)
+ {
+ if (prev->state == TASK_RUNNING) {
+ u64 runtime = prev->se.sum_exec_runtime;
+
+ runtime -= prev->se.prev_sum_exec_runtime;
+ runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
+
+ /*
+ * In order to avoid avg_overlap growing stale when we are
+ * indeed overlapping and hence not getting put to sleep, grow
+ * the avg_overlap on preemption.
+ *
+ * We use the average preemption runtime because that
+ * correlates to the amount of cache footprint a task can
+ * build up.
+ */
+ update_avg(&prev->se.avg_overlap, runtime);
+ }
+ prev->sched_class->put_prev_task(rq, prev);
+ }
+
/*
* Pick up the highest-prio task:
*/
static inline struct task_struct *
- pick_next_task(struct rq *rq, struct task_struct *prev)
+ pick_next_task(struct rq *rq)
{
const struct sched_class *class;
struct task_struct *p;
/*
* schedule() is the main scheduler function.
*/
- asmlinkage void __sched schedule(void)
+ asmlinkage void __sched __schedule(void)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
struct rq *rq;
int cpu;
- need_resched:
- preempt_disable();
cpu = smp_processor_id();
rq = cpu_rq(cpu);
rcu_qsctr_inc(cpu);
if (unlikely(!rq->nr_running))
idle_balance(cpu, rq);
- prev->sched_class->put_prev_task(rq, prev);
- next = pick_next_task(rq, prev);
+ put_prev_task(rq, prev);
+ next = pick_next_task(rq);
if (likely(prev != next)) {
sched_info_switch(prev, next);
+ perf_counter_task_sched_out(prev, cpu);
rq->nr_switches++;
rq->curr = next;
if (unlikely(reacquire_kernel_lock(current) < 0))
goto need_resched_nonpreemptible;
+ }
+ asmlinkage void __sched schedule(void)
+ {
+ need_resched:
+ preempt_disable();
+ __schedule();
preempt_enable_no_resched();
if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
goto need_resched;
}
EXPORT_SYMBOL(schedule);
+ #ifdef CONFIG_SMP
+ /*
+ * Look out! "owner" is an entirely speculative pointer
+ * access and not reliable.
+ */
+ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
+ {
+ unsigned int cpu;
+ struct rq *rq;
+
+ if (!sched_feat(OWNER_SPIN))
+ return 0;
+
+ #ifdef CONFIG_DEBUG_PAGEALLOC
+ /*
+ * Need to access the cpu field knowing that
+ * DEBUG_PAGEALLOC could have unmapped it if
+ * the mutex owner just released it and exited.
+ */
+ if (probe_kernel_address(&owner->cpu, cpu))
+ goto out;
+ #else
+ cpu = owner->cpu;
+ #endif
+
+ /*
+ * Even if the access succeeded (likely case),
+ * the cpu field may no longer be valid.
+ */
+ if (cpu >= nr_cpumask_bits)
+ goto out;
+
+ /*
+ * We need to validate that we can do a
+ * get_cpu() and that we have the percpu area.
+ */
+ if (!cpu_online(cpu))
+ goto out;
+
+ rq = cpu_rq(cpu);
+
+ for (;;) {
+ /*
+ * Owner changed, break to re-assess state.
+ */
+ if (lock->owner != owner)
+ break;
+
+ /*
+ * Is that owner really running on that cpu?
+ */
+ if (task_thread_info(rq->curr) != owner || need_resched())
+ return 0;
+
+ cpu_relax();
+ }
+ out:
+ return 1;
+ }
+ #endif
+
#ifdef CONFIG_PREEMPT
/*
* this is the entry point to schedule() from in-kernel preemption
* between schedule and now.
*/
barrier();
- } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
+ } while (need_resched());
}
EXPORT_SYMBOL(preempt_schedule);
* between schedule and now.
*/
barrier();
- } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
+ } while (need_resched());
}
#endif /* CONFIG_PREEMPT */
__wake_up_common(q, mode, 1, 0, NULL);
}
+ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+ {
+ __wake_up_common(q, mode, 1, 0, key);
+ }
+
/**
- * __wake_up_sync - wake up threads blocked on a waitqueue.
+ * __wake_up_sync_key - wake up threads blocked on a waitqueue.
* @q: the waitqueue
* @mode: which threads
* @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: opaque value to be passed to wakeup targets
*
* The sync wakeup differs that the waker knows that it will schedule
* away soon, so while the target thread will be woken up, it will not
*
* On UP it can prevent extra preemption.
*/
- void
- __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, void *key)
{
unsigned long flags;
int sync = 1;
sync = 0;
spin_lock_irqsave(&q->lock, flags);
- __wake_up_common(q, mode, nr_exclusive, sync, NULL);
+ __wake_up_common(q, mode, nr_exclusive, sync, key);
spin_unlock_irqrestore(&q->lock, flags);
}
+ EXPORT_SYMBOL_GPL(__wake_up_sync_key);
+
+ /*
+ * __wake_up_sync - see __wake_up_sync_key()
+ */
+ void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+ {
+ __wake_up_sync_key(q, mode, nr_exclusive, NULL);
+ }
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
/**
if (increment > 40)
increment = 40;
- nice = PRIO_TO_NICE(current->static_prio) + increment;
+ nice = TASK_NICE(current) + increment;
if (nice < -20)
nice = -20;
if (nice > 19)
if (!rq->nr_running)
break;
update_rq_clock(rq);
- next = pick_next_task(rq, rq->curr);
+ next = pick_next_task(rq);
if (!next)
break;
next->sched_class->put_prev_task(rq, next);
{
int group;
- cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+ cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
group = cpumask_first(mask);
if (sg)
*sg = &per_cpu(sched_group_core, group).sg;
cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
group = cpumask_first(mask);
#elif defined(CONFIG_SCHED_SMT)
- cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+ cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
group = cpumask_first(mask);
#else
group = cpu;
SD_INIT(sd, SIBLING);
set_domain_attribute(sd, attr);
cpumask_and(sched_domain_span(sd),
- &per_cpu(cpu_sibling_map, i), cpu_map);
+ topology_thread_cpumask(i), cpu_map);
sd->parent = p;
p->child = sd;
cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
/* Set up CPU (sibling) groups */
for_each_cpu(i, cpu_map) {
cpumask_and(this_sibling_map,
- &per_cpu(cpu_sibling_map, i), cpu_map);
+ topology_thread_cpumask(i), cpu_map);
if (i != cpumask_first(this_sibling_map))
continue;
__set_bit(MAX_RT_PRIO, array->bitmap);
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
- rt_rq->highest_prio = MAX_RT_PRIO;
+ rt_rq->highest_prio.curr = MAX_RT_PRIO;
+ #ifdef CONFIG_SMP
+ rt_rq->highest_prio.next = MAX_RT_PRIO;
+ #endif
#endif
#ifdef CONFIG_SMP
rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
+ plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
#endif
rt_rq->rt_time = 0;
#ifdef CONFIG_USER_SCHED
alloc_size *= 2;
#endif
+ #ifdef CONFIG_CPUMASK_OFFSTACK
+ alloc_size += num_possible_cpus() * cpumask_size();
+ #endif
/*
* As sched_init() is called before page_alloc is setup,
* we use alloc_bootmem().
ptr += nr_cpu_ids * sizeof(void **);
#endif /* CONFIG_USER_SCHED */
#endif /* CONFIG_RT_GROUP_SCHED */
+ #ifdef CONFIG_CPUMASK_OFFSTACK
+ for_each_possible_cpu(i) {
+ per_cpu(load_balance_tmpmask, i) = (void *)ptr;
+ ptr += cpumask_size();
+ }
+ #endif /* CONFIG_CPUMASK_OFFSTACK */
}
#ifdef CONFIG_SMP
static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
{
- u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
u64 data;
#ifndef CONFIG_64BIT
static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
{
- u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
#ifndef CONFIG_64BIT
/*
struct cpuacct *ca;
int cpu;
- if (!cpuacct_subsys.active)
+ if (unlikely(!cpuacct_subsys.active))
return;
cpu = task_cpu(tsk);
ca = task_ca(tsk);
for (; ca; ca = ca->parent) {
- u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
*cpuusage += cputime;
}
}
#include <linux/prctl.h>
#include <linux/highuid.h>
#include <linux/fs.h>
+#include <linux/perf_counter.h>
#include <linux/resource.h>
#include <linux/kernel.h>
#include <linux/kexec.h>
#include <linux/seccomp.h>
#include <linux/cpu.h>
#include <linux/ptrace.h>
+ #include <linux/fs_struct.h>
#include <linux/compat.h>
#include <linux/syscalls.h>
if (err)
goto out;
- if (task_pgrp(p) != pgrp) {
+ if (task_pgrp(p) != pgrp)
change_pid(p, PIDTYPE_PGID, pgrp);
- set_task_pgrp(p, pid_nr(pgrp));
- }
err = 0;
out:
case PR_SET_TSC:
error = SET_TSC_CTL(arg2);
break;
+ case PR_TASK_PERF_COUNTERS_DISABLE:
+ error = perf_counter_task_disable();
+ break;
+ case PR_TASK_PERF_COUNTERS_ENABLE:
+ error = perf_counter_task_enable();
+ break;
case PR_GET_TIMERSLACK:
error = current->timer_slack_ns;
break;