* Rusty Russell : Hacked into shape for new "hotplug" boot process. */
#include <linux/module.h>
-#include <linux/config.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
#include <asm/arch_hooks.h>
+#include <asm/nmi.h>
#include <mach_apic.h>
#include <mach_wakecpu.h>
EXPORT_SYMBOL(smp_num_siblings);
#endif
-/* Package ID of each logical CPU */
-int phys_proc_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
-EXPORT_SYMBOL(phys_proc_id);
-
-/* Core ID of each logical CPU */
-int cpu_core_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
-EXPORT_SYMBOL(cpu_core_id);
+/* Last level cache ID of each logical CPU */
+int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
+/* representing HT siblings of each logical CPU */
cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(cpu_sibling_map);
+/* representing HT and core siblings of each logical CPU */
cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(cpu_core_map);
cpumask_t cpu_callin_map;
cpumask_t cpu_callout_map;
EXPORT_SYMBOL(cpu_callout_map);
-#ifdef CONFIG_HOTPLUG_CPU
-cpumask_t cpu_possible_map = CPU_MASK_ALL;
-#else
cpumask_t cpu_possible_map;
-#endif
EXPORT_SYMBOL(cpu_possible_map);
static cpumask_t smp_commenced_mask;
{ [0 ... NR_CPUS-1] = 0xff };
EXPORT_SYMBOL(x86_cpu_to_apicid);
+u8 apicid_2_node[MAX_APICID];
+
/*
* Trampoline 80x86 program as an array.
*/
*/
if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
+ if (num_possible_cpus() == 1)
+ goto valid_k7;
+
/* Athlon 660/661 is valid. */
if ((c->x86_model==6) && ((c->x86_mask==0) || (c->x86_mask==1)))
goto valid_k7;
* then we print a warning if not, and always resync.
*/
-static atomic_t tsc_start_flag = ATOMIC_INIT(0);
-static atomic_t tsc_count_start = ATOMIC_INIT(0);
-static atomic_t tsc_count_stop = ATOMIC_INIT(0);
-static unsigned long long tsc_values[NR_CPUS];
+static struct {
+ atomic_t start_flag;
+ atomic_t count_start;
+ atomic_t count_stop;
+ unsigned long long values[NR_CPUS];
+} tsc __initdata = {
+ .start_flag = ATOMIC_INIT(0),
+ .count_start = ATOMIC_INIT(0),
+ .count_stop = ATOMIC_INIT(0),
+};
#define NR_LOOPS 5
-static void __init synchronize_tsc_bp (void)
+static void __init synchronize_tsc_bp(void)
{
int i;
unsigned long long t0;
/* convert from kcyc/sec to cyc/usec */
one_usec = cpu_khz / 1000;
- atomic_set(&tsc_start_flag, 1);
+ atomic_set(&tsc.start_flag, 1);
wmb();
/*
/*
* all APs synchronize but they loop on '== num_cpus'
*/
- while (atomic_read(&tsc_count_start) != num_booting_cpus()-1)
- mb();
- atomic_set(&tsc_count_stop, 0);
+ while (atomic_read(&tsc.count_start) != num_booting_cpus()-1)
+ cpu_relax();
+ atomic_set(&tsc.count_stop, 0);
wmb();
/*
* this lets the APs save their current TSC:
*/
- atomic_inc(&tsc_count_start);
+ atomic_inc(&tsc.count_start);
- rdtscll(tsc_values[smp_processor_id()]);
+ rdtscll(tsc.values[smp_processor_id()]);
/*
* We clear the TSC in the last loop:
*/
/*
* Wait for all APs to leave the synchronization point:
*/
- while (atomic_read(&tsc_count_stop) != num_booting_cpus()-1)
- mb();
- atomic_set(&tsc_count_start, 0);
+ while (atomic_read(&tsc.count_stop) != num_booting_cpus()-1)
+ cpu_relax();
+ atomic_set(&tsc.count_start, 0);
wmb();
- atomic_inc(&tsc_count_stop);
+ atomic_inc(&tsc.count_stop);
}
sum = 0;
for (i = 0; i < NR_CPUS; i++) {
if (cpu_isset(i, cpu_callout_map)) {
- t0 = tsc_values[i];
+ t0 = tsc.values[i];
sum += t0;
}
}
avg = sum;
do_div(avg, num_booting_cpus());
- sum = 0;
for (i = 0; i < NR_CPUS; i++) {
if (!cpu_isset(i, cpu_callout_map))
continue;
- delta = tsc_values[i] - avg;
+ delta = tsc.values[i] - avg;
if (delta < 0)
delta = -delta;
/*
* We report clock differences bigger than 2 microseconds.
*/
if (delta > 2*one_usec) {
- long realdelta;
+ long long realdelta;
+
if (!buggy) {
buggy = 1;
printk("\n");
}
realdelta = delta;
do_div(realdelta, one_usec);
- if (tsc_values[i] < avg)
+ if (tsc.values[i] < avg)
realdelta = -realdelta;
- printk(KERN_INFO "CPU#%d had %ld usecs TSC skew, fixed it up.\n", i, realdelta);
+ if (realdelta)
+ printk(KERN_INFO "CPU#%d had %Ld usecs TSC "
+ "skew, fixed it up.\n", i, realdelta);
}
-
- sum += delta;
}
if (!buggy)
printk("passed.\n");
}
-static void __init synchronize_tsc_ap (void)
+static void __init synchronize_tsc_ap(void)
{
int i;
* this gets called, so we first wait for the BP to
* finish SMP initialization:
*/
- while (!atomic_read(&tsc_start_flag)) mb();
+ while (!atomic_read(&tsc.start_flag))
+ cpu_relax();
for (i = 0; i < NR_LOOPS; i++) {
- atomic_inc(&tsc_count_start);
- while (atomic_read(&tsc_count_start) != num_booting_cpus())
- mb();
+ atomic_inc(&tsc.count_start);
+ while (atomic_read(&tsc.count_start) != num_booting_cpus())
+ cpu_relax();
- rdtscll(tsc_values[smp_processor_id()]);
+ rdtscll(tsc.values[smp_processor_id()]);
if (i == NR_LOOPS-1)
write_tsc(0, 0);
- atomic_inc(&tsc_count_stop);
- while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb();
+ atomic_inc(&tsc.count_stop);
+ while (atomic_read(&tsc.count_stop) != num_booting_cpus())
+ cpu_relax();
}
}
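/*
 * In short, the handshake implemented by synchronize_tsc_bp()/_ap() above
 * works like this: the BP raises tsc.start_flag once it is ready, releasing
 * the APs from their initial wait.  Then, on each of the NR_LOOPS passes,
 * every CPU increments tsc.count_start, the BP waits for the others and
 * clears tsc.count_stop, all CPUs sample their TSC into tsc.values[]
 * (zeroing the TSC on the final pass), and the pass ends with the same
 * handshake on tsc.count_stop.  Afterwards the BP averages the samples and
 * reports any CPU whose value is more than 2 usecs away from the average.
 */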
#undef NR_LOOPS
static int cpucount;
+/* maps the cpu to the sched domain representing multi-core */
+cpumask_t cpu_coregroup_map(int cpu)
+{
+ struct cpuinfo_x86 *c = cpu_data + cpu;
+ /*
+	 * For performance, we return the last level cache shared map;
+	 * for power savings, we return cpu_core_map.
+ */
+ if (sched_mc_power_savings || sched_smt_power_savings)
+ return cpu_core_map[cpu];
+ else
+ return c->llc_shared_map;
+}
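/*
 * A rough sketch of the consumer side, assuming the CONFIG_SCHED_MC domain
 * setup in kernel/sched.c of this era (core_domains and SD_MC_INIT are names
 * taken from there; treat this as illustrative rather than part of this
 * patch):
 *
 *	struct sched_domain *sd = &per_cpu(core_domains, i);
 *	*sd = SD_MC_INIT;
 *	sd->span = cpu_coregroup_map(i);
 *
 * With the power-savings knobs off, the multi-core domain thus spans only
 * the CPUs that share a last level cache; with sched_mc_power_savings or
 * sched_smt_power_savings set, it widens to the whole package.
 */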
+
+/* representing cpus for which sibling maps can be computed */
+static cpumask_t cpu_sibling_setup_map;
+
static inline void
set_cpu_sibling_map(int cpu)
{
int i;
+ struct cpuinfo_x86 *c = cpu_data;
+
+ cpu_set(cpu, cpu_sibling_setup_map);
if (smp_num_siblings > 1) {
- for (i = 0; i < NR_CPUS; i++) {
- if (!cpu_isset(i, cpu_callout_map))
- continue;
- if (cpu_core_id[cpu] == cpu_core_id[i]) {
+ for_each_cpu_mask(i, cpu_sibling_setup_map) {
+ if (c[cpu].phys_proc_id == c[i].phys_proc_id &&
+ c[cpu].cpu_core_id == c[i].cpu_core_id) {
cpu_set(i, cpu_sibling_map[cpu]);
cpu_set(cpu, cpu_sibling_map[i]);
+ cpu_set(i, cpu_core_map[cpu]);
+ cpu_set(cpu, cpu_core_map[i]);
+ cpu_set(i, c[cpu].llc_shared_map);
+ cpu_set(cpu, c[i].llc_shared_map);
}
}
} else {
cpu_set(cpu, cpu_sibling_map[cpu]);
}
- if (current_cpu_data.x86_num_cores > 1) {
- for (i = 0; i < NR_CPUS; i++) {
- if (!cpu_isset(i, cpu_callout_map))
- continue;
- if (phys_proc_id[cpu] == phys_proc_id[i]) {
- cpu_set(i, cpu_core_map[cpu]);
- cpu_set(cpu, cpu_core_map[i]);
- }
- }
- } else {
+ cpu_set(cpu, c[cpu].llc_shared_map);
+
+ if (current_cpu_data.x86_max_cores == 1) {
cpu_core_map[cpu] = cpu_sibling_map[cpu];
+ c[cpu].booted_cores = 1;
+ return;
+ }
+
+ for_each_cpu_mask(i, cpu_sibling_setup_map) {
+ if (cpu_llc_id[cpu] != BAD_APICID &&
+ cpu_llc_id[cpu] == cpu_llc_id[i]) {
+ cpu_set(i, c[cpu].llc_shared_map);
+ cpu_set(cpu, c[i].llc_shared_map);
+ }
+ if (c[cpu].phys_proc_id == c[i].phys_proc_id) {
+ cpu_set(i, cpu_core_map[cpu]);
+ cpu_set(cpu, cpu_core_map[i]);
+ /*
+			 * Does this new cpu bring up a new core?
+ */
+ if (cpus_weight(cpu_sibling_map[cpu]) == 1) {
+ /*
+ * for each core in package, increment
+ * the booted_cores for this new cpu
+ */
+ if (first_cpu(cpu_sibling_map[i]) == i)
+ c[cpu].booted_cores++;
+ /*
+ * increment the core count for all
+ * the other cpus in this package
+ */
+ if (i != cpu)
+ c[i].booted_cores++;
+ } else if (i != cpu && !c[cpu].booted_cores)
+ c[cpu].booted_cores = c[i].booted_cores;
+ }
}
}
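/*
 * Worked example of the booted_cores accounting above, for one package with
 * two cores of two HT siblings each, booting CPUs 0,1 (core 0) and then
 * CPUs 2,3 (core 1) in that order:
 *
 *	after CPU0:  c[0].booted_cores = 1		(first core seen)
 *	after CPU1:  c[1].booted_cores = 1		(copied from CPU0)
 *	after CPU2:  c[0] = c[1] = c[2] = 2		(a second core came up)
 *	after CPU3:  c[3].booted_cores = 2		(copied from CPU0)
 *
 * i.e. every CPU in the package ends up with booted_cores equal to the
 * number of cores that have at least one thread online.
 */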
* things done here to the most necessary things.
*/
cpu_init();
+ preempt_disable();
smp_callin();
while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
rep_nop();
/* which logical CPUs are on which nodes */
cpumask_t node_2_cpu_mask[MAX_NUMNODES] __read_mostly =
{ [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE };
+EXPORT_SYMBOL(node_2_cpu_mask);
/* which node each logical CPU is on */
int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
EXPORT_SYMBOL(cpu_2_node);
{
int cpu = smp_processor_id();
int apicid = logical_smp_processor_id();
+ int node = apicid_to_node(apicid);
+
+ if (!node_online(node))
+ node = first_online_node;
cpu_2_logical_apicid[cpu] = apicid;
- map_cpu_to_node(cpu, apicid_to_node(apicid));
+ map_cpu_to_node(cpu, node);
}
static void unmap_cpu_to_logical_apicid(int cpu)
/* Initialize thread_struct.  We really want to avoid destroying
 * the idle thread.
 */
- idle->thread.esp = (unsigned long)(((struct pt_regs *)
- (THREAD_SIZE + (unsigned long) idle->thread_info)) - 1);
+ idle->thread.esp = (unsigned long)task_pt_regs(idle);
init_idle(idle, cpu);
return idle;
}
unsigned short nmi_high = 0, nmi_low = 0;
++cpucount;
+ alternatives_smp_switch(1);
/*
* We can't use kernel_thread since we must avoid to
irq_ctx_init(cpu);
+ x86_cpu_to_apicid[cpu] = apicid;
/*
* This grunge runs the startup process for
* the targeted processor.
cpu_clear(cpu, cpu_callout_map);
cpu_clear(cpu, cpu_callin_map);
- cpu_clear(cpu, cpu_present_map);
cpu_clear(cpu, smp_commenced_mask);
unmap_cpu_to_logical_apicid(cpu);
struct warm_boot_cpu_info {
struct completion *complete;
+ struct work_struct task;
int apicid;
int cpu;
};
-static void __devinit do_warm_boot_cpu(void *p)
+static void __cpuinit do_warm_boot_cpu(struct work_struct *work)
{
- struct warm_boot_cpu_info *info = p;
+ struct warm_boot_cpu_info *info =
+ container_of(work, struct warm_boot_cpu_info, task);
do_boot_cpu(info->apicid, info->cpu);
complete(info->complete);
}
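/*
 * A minimal sketch of the work_struct pattern used above (struct foo and
 * foo_fn are hypothetical names, for illustration only): the callback now
 * receives the struct work_struct pointer, so the work item is embedded in
 * the data it needs and container_of() recovers the enclosing structure:
 *
 *	struct foo { struct work_struct task; int arg; };
 *
 *	static void foo_fn(struct work_struct *work)
 *	{
 *		struct foo *f = container_of(work, struct foo, task);
 *		... use f->arg ...
 *	}
 *
 *	INIT_WORK(&f->task, foo_fn);
 *	schedule_work(&f->task);
 */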
-int __devinit smp_prepare_cpu(int cpu)
+static int __cpuinit __smp_prepare_cpu(int cpu)
{
- DECLARE_COMPLETION(done);
+ DECLARE_COMPLETION_ONSTACK(done);
struct warm_boot_cpu_info info;
- struct work_struct task;
int apicid, ret;
+ struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
- lock_cpu_hotplug();
apicid = x86_cpu_to_apicid[cpu];
if (apicid == BAD_APICID) {
ret = -ENODEV;
goto exit;
}
+ /*
+ * The CPU wasn't initialized at boot time, so allocate its GDT table here;
+ * cpu_init() will initialize it.
+ */
+	if (!cpu_gdt_descr->address) {
+		cpu_gdt_descr->address = get_zeroed_page(GFP_KERNEL);
+		if (!cpu_gdt_descr->address) {
+			printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
+			ret = -ENOMEM;
+			goto exit;
+		}
+	}
+
info.complete = &done;
info.apicid = apicid;
info.cpu = cpu;
- INIT_WORK(&task, do_warm_boot_cpu, &info);
+ INIT_WORK(&info.task, do_warm_boot_cpu);
tsc_sync_disabled = 1;
clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
KERNEL_PGD_PTRS);
flush_tlb_all();
- schedule_work(&task);
+ schedule_work(&info.task);
wait_for_completion(&done);
tsc_sync_disabled = 0;
zap_low_mappings();
ret = 0;
exit:
- unlock_cpu_hotplug();
return ret;
}
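/*
 * Note on the warm boot path above: clone_pgd_range() copies the kernel
 * mappings back into the low slots of swapper_pg_dir, restoring the
 * identity-style mapping of low memory that the real-mode startup
 * trampoline relies on; once the AP has called in, zap_low_mappings()
 * tears them down again.
 */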
#endif
cachesize = 16; /* Pentiums, 2x8kB cache */
bandwidth = 100;
}
+ max_cache_size = cachesize * 1024;
}
}
current_thread_info()->cpu = 0;
smp_tune_scheduling();
- cpus_clear(cpu_sibling_map[0]);
- cpu_set(0, cpu_sibling_map[0]);
- cpus_clear(cpu_core_map[0]);
- cpu_set(0, cpu_core_map[0]);
+ set_cpu_sibling_map(0);
/*
* If we couldn't find an SMP configuration at boot time,
remove_siblinginfo(int cpu)
{
int sibling;
+ struct cpuinfo_x86 *c = cpu_data;
+ for_each_cpu_mask(sibling, cpu_core_map[cpu]) {
+ cpu_clear(cpu, cpu_core_map[sibling]);
+ /*
+ * last thread sibling in this cpu core going down
+ */
+ if (cpus_weight(cpu_sibling_map[cpu]) == 1)
+ c[sibling].booted_cores--;
+ }
+
for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
cpu_clear(cpu, cpu_sibling_map[sibling]);
- for_each_cpu_mask(sibling, cpu_core_map[cpu])
- cpu_clear(cpu, cpu_core_map[sibling]);
cpus_clear(cpu_sibling_map[cpu]);
cpus_clear(cpu_core_map[cpu]);
- phys_proc_id[cpu] = BAD_APICID;
- cpu_core_id[cpu] = BAD_APICID;
+ c[cpu].phys_proc_id = 0;
+ c[cpu].cpu_core_id = 0;
+ cpu_clear(cpu, cpu_sibling_setup_map);
}
int __cpu_disable(void)
*/
if (cpu == 0)
return -EBUSY;
-
- /* We enable the timer again on the exit path of the death loop */
- disable_APIC_timer();
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ stop_apic_nmi_watchdog(NULL);
+ clear_local_APIC();
/* Allow any queued timer interrupts to get serviced */
local_irq_enable();
mdelay(1);
/* They ack this in play_dead by setting CPU_DEAD */
if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
printk ("CPU %d is now offline\n", cpu);
+		if (num_online_cpus() == 1)
+ alternatives_smp_switch(0);
return;
}
msleep(100);
int __devinit __cpu_up(unsigned int cpu)
{
+#ifdef CONFIG_HOTPLUG_CPU
+	int ret = 0;
+
+	/*
+	 * We do a warm boot only on cpus that booted earlier;
+	 * cold boot is all handled from smp_boot_cpus().
+	 * cpu_callin_map is set during the AP kickstart process and is
+	 * cleared again when a cpu is taken offline in cpu_exit_clear().
+	 */
+ if (!cpu_isset(cpu, cpu_callin_map))
+ ret = __smp_prepare_cpu(cpu);
+
+ if (ret)
+ return -EIO;
+#endif
+
/* In case one didn't come up */
if (!cpu_isset(cpu, cpu_callin_map)) {
printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu);
/* Unleash the CPU! */
cpu_set(cpu, smp_commenced_mask);
while (!cpu_isset(cpu, cpu_online_map))
- mb();
+ cpu_relax();
return 0;
}
/* IPI for generic function call */
set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
}
+
+/*
+ * If the BIOS enumerates physical processors before logical,
+ * maxcpus=N at enumeration-time can be used to disable HT.
+ */
+static int __init parse_maxcpus(char *arg)
+{
+ extern unsigned int maxcpus;
+
+ maxcpus = simple_strtoul(arg, NULL, 0);
+ return 0;
+}
+early_param("maxcpus", parse_maxcpus);
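/*
 * Example: on a two-socket HyperThreaded box whose BIOS lists both physical
 * processors before any logical siblings, booting with "maxcpus=2" on the
 * kernel command line brings up only the two physical processors and leaves
 * their HT siblings unused.
 */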