X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=arch%2Fi386%2Fkernel%2Fsmpboot.c;h=02a9b66b6ac38ae95aa199e8d91e7576d10daeba;hb=c4028958b6ecad064b1a6303a6a5906d4fe48d73;hp=e6488ffc1f7928cfca08ec35a5968c46ae614008;hpb=38e548ee1a79c8da7b3d9e26f2adce9b61413f84;p=safe%2Fjmp%2Flinux-2.6 diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index e6488ff..02a9b66 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -34,7 +34,6 @@ * Rusty Russell : Hacked into shape for new "hotplug" boot process. */ #include -#include #include #include @@ -52,6 +51,7 @@ #include #include #include +#include #include #include @@ -66,17 +66,14 @@ int smp_num_siblings = 1; EXPORT_SYMBOL(smp_num_siblings); #endif -/* Package ID of each logical CPU */ -int phys_proc_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID}; -EXPORT_SYMBOL(phys_proc_id); - -/* Core ID of each logical CPU */ -int cpu_core_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID}; -EXPORT_SYMBOL(cpu_core_id); +/* Last level cache ID of each logical CPU */ +int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID}; +/* representing HT siblings of each logical CPU */ cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly; EXPORT_SYMBOL(cpu_sibling_map); +/* representing HT and core siblings of each logical CPU */ cpumask_t cpu_core_map[NR_CPUS] __read_mostly; EXPORT_SYMBOL(cpu_core_map); @@ -87,11 +84,7 @@ EXPORT_SYMBOL(cpu_online_map); cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; EXPORT_SYMBOL(cpu_callout_map); -#ifdef CONFIG_HOTPLUG_CPU -cpumask_t cpu_possible_map = CPU_MASK_ALL; -#else cpumask_t cpu_possible_map; -#endif EXPORT_SYMBOL(cpu_possible_map); static cpumask_t smp_commenced_mask; @@ -109,6 +102,8 @@ u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0xff }; EXPORT_SYMBOL(x86_cpu_to_apicid); +u8 apicid_2_node[MAX_APICID]; + /* * Trampoline 80x86 program as an array. */ @@ -184,6 +179,9 @@ static void __devinit smp_store_cpu_info(int id) */ if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) { + if (num_possible_cpus() == 1) + goto valid_k7; + /* Athlon 660/661 is valid. */ if ((c->x86_model==6) && ((c->x86_mask==0) || (c->x86_mask==1))) goto valid_k7; @@ -219,14 +217,20 @@ valid_k7: * then we print a warning if not, and always resync. 
*/ -static atomic_t tsc_start_flag = ATOMIC_INIT(0); -static atomic_t tsc_count_start = ATOMIC_INIT(0); -static atomic_t tsc_count_stop = ATOMIC_INIT(0); -static unsigned long long tsc_values[NR_CPUS]; +static struct { + atomic_t start_flag; + atomic_t count_start; + atomic_t count_stop; + unsigned long long values[NR_CPUS]; +} tsc __initdata = { + .start_flag = ATOMIC_INIT(0), + .count_start = ATOMIC_INIT(0), + .count_stop = ATOMIC_INIT(0), +}; #define NR_LOOPS 5 -static void __init synchronize_tsc_bp (void) +static void __init synchronize_tsc_bp(void) { int i; unsigned long long t0; @@ -240,7 +244,7 @@ static void __init synchronize_tsc_bp (void) /* convert from kcyc/sec to cyc/usec */ one_usec = cpu_khz / 1000; - atomic_set(&tsc_start_flag, 1); + atomic_set(&tsc.start_flag, 1); wmb(); /* @@ -257,16 +261,16 @@ static void __init synchronize_tsc_bp (void) /* * all APs synchronize but they loop on '== num_cpus' */ - while (atomic_read(&tsc_count_start) != num_booting_cpus()-1) - mb(); - atomic_set(&tsc_count_stop, 0); + while (atomic_read(&tsc.count_start) != num_booting_cpus()-1) + cpu_relax(); + atomic_set(&tsc.count_stop, 0); wmb(); /* * this lets the APs save their current TSC: */ - atomic_inc(&tsc_count_start); + atomic_inc(&tsc.count_start); - rdtscll(tsc_values[smp_processor_id()]); + rdtscll(tsc.values[smp_processor_id()]); /* * We clear the TSC in the last loop: */ @@ -276,54 +280,54 @@ static void __init synchronize_tsc_bp (void) /* * Wait for all APs to leave the synchronization point: */ - while (atomic_read(&tsc_count_stop) != num_booting_cpus()-1) - mb(); - atomic_set(&tsc_count_start, 0); + while (atomic_read(&tsc.count_stop) != num_booting_cpus()-1) + cpu_relax(); + atomic_set(&tsc.count_start, 0); wmb(); - atomic_inc(&tsc_count_stop); + atomic_inc(&tsc.count_stop); } sum = 0; for (i = 0; i < NR_CPUS; i++) { if (cpu_isset(i, cpu_callout_map)) { - t0 = tsc_values[i]; + t0 = tsc.values[i]; sum += t0; } } avg = sum; do_div(avg, num_booting_cpus()); - sum = 0; for (i = 0; i < NR_CPUS; i++) { if (!cpu_isset(i, cpu_callout_map)) continue; - delta = tsc_values[i] - avg; + delta = tsc.values[i] - avg; if (delta < 0) delta = -delta; /* * We report bigger than 2 microseconds clock differences. 
*/ if (delta > 2*one_usec) { - long realdelta; + long long realdelta; + if (!buggy) { buggy = 1; printk("\n"); } realdelta = delta; do_div(realdelta, one_usec); - if (tsc_values[i] < avg) + if (tsc.values[i] < avg) realdelta = -realdelta; - printk(KERN_INFO "CPU#%d had %ld usecs TSC skew, fixed it up.\n", i, realdelta); + if (realdelta) + printk(KERN_INFO "CPU#%d had %Ld usecs TSC " + "skew, fixed it up.\n", i, realdelta); } - - sum += delta; } if (!buggy) printk("passed.\n"); } -static void __init synchronize_tsc_ap (void) +static void __init synchronize_tsc_ap(void) { int i; @@ -332,19 +336,21 @@ static void __init synchronize_tsc_ap (void) * this gets called, so we first wait for the BP to * finish SMP initialization: */ - while (!atomic_read(&tsc_start_flag)) mb(); + while (!atomic_read(&tsc.start_flag)) + cpu_relax(); for (i = 0; i < NR_LOOPS; i++) { - atomic_inc(&tsc_count_start); - while (atomic_read(&tsc_count_start) != num_booting_cpus()) - mb(); + atomic_inc(&tsc.count_start); + while (atomic_read(&tsc.count_start) != num_booting_cpus()) + cpu_relax(); - rdtscll(tsc_values[smp_processor_id()]); + rdtscll(tsc.values[smp_processor_id()]); if (i == NR_LOOPS-1) write_tsc(0, 0); - atomic_inc(&tsc_count_stop); - while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb(); + atomic_inc(&tsc.count_stop); + while (atomic_read(&tsc.count_stop) != num_booting_cpus()) + cpu_relax(); } } #undef NR_LOOPS @@ -444,35 +450,83 @@ static void __devinit smp_callin(void) static int cpucount; +/* maps the cpu to the sched domain representing multi-core */ +cpumask_t cpu_coregroup_map(int cpu) +{ + struct cpuinfo_x86 *c = cpu_data + cpu; + /* + * For perf, we return last level cache shared map. + * And for power savings, we return cpu_core_map + */ + if (sched_mc_power_savings || sched_smt_power_savings) + return cpu_core_map[cpu]; + else + return c->llc_shared_map; +} + +/* representing cpus for which sibling maps can be computed */ +static cpumask_t cpu_sibling_setup_map; + static inline void set_cpu_sibling_map(int cpu) { int i; + struct cpuinfo_x86 *c = cpu_data; + + cpu_set(cpu, cpu_sibling_setup_map); if (smp_num_siblings > 1) { - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_isset(i, cpu_callout_map)) - continue; - if (cpu_core_id[cpu] == cpu_core_id[i]) { + for_each_cpu_mask(i, cpu_sibling_setup_map) { + if (c[cpu].phys_proc_id == c[i].phys_proc_id && + c[cpu].cpu_core_id == c[i].cpu_core_id) { cpu_set(i, cpu_sibling_map[cpu]); cpu_set(cpu, cpu_sibling_map[i]); + cpu_set(i, cpu_core_map[cpu]); + cpu_set(cpu, cpu_core_map[i]); + cpu_set(i, c[cpu].llc_shared_map); + cpu_set(cpu, c[i].llc_shared_map); } } } else { cpu_set(cpu, cpu_sibling_map[cpu]); } - if (current_cpu_data.x86_num_cores > 1) { - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_isset(i, cpu_callout_map)) - continue; - if (phys_proc_id[cpu] == phys_proc_id[i]) { - cpu_set(i, cpu_core_map[cpu]); - cpu_set(cpu, cpu_core_map[i]); - } - } - } else { + cpu_set(cpu, c[cpu].llc_shared_map); + + if (current_cpu_data.x86_max_cores == 1) { cpu_core_map[cpu] = cpu_sibling_map[cpu]; + c[cpu].booted_cores = 1; + return; + } + + for_each_cpu_mask(i, cpu_sibling_setup_map) { + if (cpu_llc_id[cpu] != BAD_APICID && + cpu_llc_id[cpu] == cpu_llc_id[i]) { + cpu_set(i, c[cpu].llc_shared_map); + cpu_set(cpu, c[i].llc_shared_map); + } + if (c[cpu].phys_proc_id == c[i].phys_proc_id) { + cpu_set(i, cpu_core_map[cpu]); + cpu_set(cpu, cpu_core_map[i]); + /* + * Does this new cpu bringup a new core? 
+ */ + if (cpus_weight(cpu_sibling_map[cpu]) == 1) { + /* + * for each core in package, increment + * the booted_cores for this new cpu + */ + if (first_cpu(cpu_sibling_map[i]) == i) + c[cpu].booted_cores++; + /* + * increment the core count for all + * the other cpus in this package + */ + if (i != cpu) + c[i].booted_cores++; + } else if (i != cpu && !c[cpu].booted_cores) + c[cpu].booted_cores = c[i].booted_cores; + } } } @@ -487,6 +541,7 @@ static void __devinit start_secondary(void *unused) * things done here to the most necessary things. */ cpu_init(); + preempt_disable(); smp_callin(); while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) rep_nop(); @@ -557,6 +612,7 @@ extern struct { /* which logical CPUs are on which nodes */ cpumask_t node_2_cpu_mask[MAX_NUMNODES] __read_mostly = { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE }; +EXPORT_SYMBOL(node_2_cpu_mask); /* which node each logical CPU is on */ int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 }; EXPORT_SYMBOL(cpu_2_node); @@ -592,9 +648,13 @@ static void map_cpu_to_logical_apicid(void) { int cpu = smp_processor_id(); int apicid = logical_smp_processor_id(); + int node = apicid_to_node(apicid); + + if (!node_online(node)) + node = first_online_node; cpu_2_logical_apicid[cpu] = apicid; - map_cpu_to_node(cpu, apicid_to_node(apicid)); + map_cpu_to_node(cpu, node); } static void unmap_cpu_to_logical_apicid(int cpu) @@ -849,8 +909,7 @@ static inline struct task_struct * alloc_idle_task(int cpu) /* initialize thread_struct. we really want to avoid destroy * idle tread */ - idle->thread.esp = (unsigned long)(((struct pt_regs *) - (THREAD_SIZE + (unsigned long) idle->thread_info)) - 1); + idle->thread.esp = (unsigned long)task_pt_regs(idle); init_idle(idle, cpu); return idle; } @@ -878,6 +937,7 @@ static int __devinit do_boot_cpu(int apicid, int cpu) unsigned short nmi_high = 0, nmi_low = 0; ++cpucount; + alternatives_smp_switch(1); /* * We can't use kernel_thread since we must avoid to @@ -897,6 +957,7 @@ static int __devinit do_boot_cpu(int apicid, int cpu) irq_ctx_init(cpu); + x86_cpu_to_apicid[cpu] = apicid; /* * This grunge runs the startup process for * the targeted processor. @@ -981,7 +1042,6 @@ void cpu_exit_clear(void) cpu_clear(cpu, cpu_callout_map); cpu_clear(cpu, cpu_callin_map); - cpu_clear(cpu, cpu_present_map); cpu_clear(cpu, smp_commenced_mask); unmap_cpu_to_logical_apicid(cpu); @@ -989,35 +1049,48 @@ void cpu_exit_clear(void) struct warm_boot_cpu_info { struct completion *complete; + struct work_struct task; int apicid; int cpu; }; -static void __devinit do_warm_boot_cpu(void *p) +static void __cpuinit do_warm_boot_cpu(struct work_struct *work) { - struct warm_boot_cpu_info *info = p; + struct warm_boot_cpu_info *info = + container_of(work, struct warm_boot_cpu_info, task); do_boot_cpu(info->apicid, info->cpu); complete(info->complete); } -int __devinit smp_prepare_cpu(int cpu) +static int __cpuinit __smp_prepare_cpu(int cpu) { - DECLARE_COMPLETION(done); + DECLARE_COMPLETION_ONSTACK(done); struct warm_boot_cpu_info info; - struct work_struct task; int apicid, ret; + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); - lock_cpu_hotplug(); apicid = x86_cpu_to_apicid[cpu]; if (apicid == BAD_APICID) { ret = -ENODEV; goto exit; } + /* + * the CPU isn't initialized at boot time, allocate gdt table here. 
+ * cpu_init will initialize it + */ + if (!cpu_gdt_descr->address) { + cpu_gdt_descr->address = get_zeroed_page(GFP_KERNEL); + if (!cpu_gdt_descr->address) + printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu); + ret = -ENOMEM; + goto exit; + } + info.complete = &done; info.apicid = apicid; info.cpu = cpu; - INIT_WORK(&task, do_warm_boot_cpu, &info); + INIT_WORK(&info.task, do_warm_boot_cpu); tsc_sync_disabled = 1; @@ -1025,14 +1098,13 @@ int __devinit smp_prepare_cpu(int cpu) clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, KERNEL_PGD_PTRS); flush_tlb_all(); - schedule_work(&task); + schedule_work(&info.task); wait_for_completion(&done); tsc_sync_disabled = 0; zap_low_mappings(); ret = 0; exit: - unlock_cpu_hotplug(); return ret; } #endif @@ -1064,6 +1136,7 @@ static void smp_tune_scheduling (void) cachesize = 16; /* Pentiums, 2x8kB cache */ bandwidth = 100; } + max_cache_size = cachesize * 1024; } } @@ -1096,11 +1169,8 @@ static void __init smp_boot_cpus(unsigned int max_cpus) current_thread_info()->cpu = 0; smp_tune_scheduling(); - cpus_clear(cpu_sibling_map[0]); - cpu_set(0, cpu_sibling_map[0]); - cpus_clear(cpu_core_map[0]); - cpu_set(0, cpu_core_map[0]); + set_cpu_sibling_map(0); /* * If we couldn't find an SMP configuration at boot time, @@ -1279,15 +1349,24 @@ static void remove_siblinginfo(int cpu) { int sibling; + struct cpuinfo_x86 *c = cpu_data; + for_each_cpu_mask(sibling, cpu_core_map[cpu]) { + cpu_clear(cpu, cpu_core_map[sibling]); + /* + * last thread sibling in this cpu core going down + */ + if (cpus_weight(cpu_sibling_map[cpu]) == 1) + c[sibling].booted_cores--; + } + for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) cpu_clear(cpu, cpu_sibling_map[sibling]); - for_each_cpu_mask(sibling, cpu_core_map[cpu]) - cpu_clear(cpu, cpu_core_map[sibling]); cpus_clear(cpu_sibling_map[cpu]); cpus_clear(cpu_core_map[cpu]); - phys_proc_id[cpu] = BAD_APICID; - cpu_core_id[cpu] = BAD_APICID; + c[cpu].phys_proc_id = 0; + c[cpu].cpu_core_id = 0; + cpu_clear(cpu, cpu_sibling_setup_map); } int __cpu_disable(void) @@ -1305,9 +1384,9 @@ int __cpu_disable(void) */ if (cpu == 0) return -EBUSY; - - /* We enable the timer again on the exit path of the death loop */ - disable_APIC_timer(); + if (nmi_watchdog == NMI_LOCAL_APIC) + stop_apic_nmi_watchdog(NULL); + clear_local_APIC(); /* Allow any queued timer interrupts to get serviced */ local_irq_enable(); mdelay(1); @@ -1331,6 +1410,8 @@ void __cpu_die(unsigned int cpu) /* They ack this in play_dead by setting CPU_DEAD */ if (per_cpu(cpu_state, cpu) == CPU_DEAD) { printk ("CPU %d is now offline\n", cpu); + if (1 == num_online_cpus()) + alternatives_smp_switch(0); return; } msleep(100); @@ -1352,6 +1433,22 @@ void __cpu_die(unsigned int cpu) int __devinit __cpu_up(unsigned int cpu) { +#ifdef CONFIG_HOTPLUG_CPU + int ret=0; + + /* + * We do warm boot only on cpus that had booted earlier + * Otherwise cold boot is all handled from smp_boot_cpus(). + * cpu_callin_map is set during AP kickstart process. Its reset + * when a cpu is taken offline from cpu_exit_clear(). + */ + if (!cpu_isset(cpu, cpu_callin_map)) + ret = __smp_prepare_cpu(cpu); + + if (ret) + return -EIO; +#endif + /* In case one didn't come up */ if (!cpu_isset(cpu, cpu_callin_map)) { printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu); @@ -1364,7 +1461,7 @@ int __devinit __cpu_up(unsigned int cpu) /* Unleash the CPU! 
*/ cpu_set(cpu, smp_commenced_mask); while (!cpu_isset(cpu, cpu_online_map)) - mb(); + cpu_relax(); return 0; } @@ -1402,3 +1499,16 @@ void __init smp_intr_init(void) /* IPI for generic function call */ set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); } + +/* + * If the BIOS enumerates physical processors before logical, + * maxcpus=N at enumeration-time can be used to disable HT. + */ +static int __init parse_maxcpus(char *arg) +{ + extern unsigned int maxcpus; + + maxcpus = simple_strtoul(arg, NULL, 0); + return 0; +} +early_param("maxcpus", parse_maxcpus);
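
The synchronize_tsc_bp()/synchronize_tsc_ap() pair in the patch above implements a counter-based rendezvous: every booting CPU bumps an atomic counter, spins until all of them have checked in, samples its TSC, then meets again on a second counter before starting the next round. The following user-space sketch (illustrative only, not part of the patch) reproduces the two-counter handshake with C11 atomics and pthreads; it drops the kernel's counter resets (each round compares against NTHREADS * (round + 1) instead) and the skew-averaging pass, and all names in it are made up for the example.

	#define _POSIX_C_SOURCE 200809L
	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>
	#include <time.h>

	enum { NTHREADS = 4, NR_LOOPS = 5 };

	static atomic_int count_start;
	static atomic_int count_stop;
	static long long values[NTHREADS];

	/* stand-in for rdtscll(); any per-thread timestamp source works */
	static long long sample_clock(void)
	{
		struct timespec ts;
		clock_gettime(CLOCK_MONOTONIC, &ts);
		return ts.tv_sec * 1000000000LL + ts.tv_nsec;
	}

	static void *rendezvous(void *arg)
	{
		int id = (int)(long)arg;

		for (int round = 0; round < NR_LOOPS; round++) {
			/* check in, then wait until everyone has */
			atomic_fetch_add(&count_start, 1);
			while (atomic_load(&count_start) != NTHREADS * (round + 1))
				;	/* the kernel spins with cpu_relax() here */

			values[id] = sample_clock();	/* all threads sample "at once" */

			/* second counter: nobody may start the next round early */
			atomic_fetch_add(&count_stop, 1);
			while (atomic_load(&count_stop) != NTHREADS * (round + 1))
				;
		}
		return NULL;
	}

	int main(void)
	{
		pthread_t t[NTHREADS];

		for (long i = 0; i < NTHREADS; i++)
			pthread_create(&t[i], NULL, rendezvous, (void *)i);
		for (int i = 0; i < NTHREADS; i++)
			pthread_join(t[i], NULL);
		for (int i = 0; i < NTHREADS; i++)
			printf("thread %d sampled %lld\n", i, values[i]);
		return 0;
	}

The patch's replacement of mb() with cpu_relax() in these spin loops follows from the same shape: the loop only polls an atomic counter, so a pause hint to the CPU (and to a hyperthreaded sibling) is enough, and a full memory barrier on every iteration is wasted work.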
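The set_cpu_sibling_map()/remove_siblinginfo() changes maintain per-CPU bitmasks (cpumask_t) plus a booted_cores count per package, with the rule that a core is counted exactly once, at its lowest-numbered online thread (the first_cpu(cpu_sibling_map[i]) == i test). Below is a toy, self-contained illustration of that counting rule; the helpers are simplified stand-ins for the real cpumask API, assuming at most 64 CPUs in a 64-bit word.

	#include <stdio.h>

	typedef unsigned long long cpumask_t;		/* toy mask, <= 64 cpus */

	#define cpu_set(cpu, mask)	((mask) |= 1ULL << (cpu))
	#define cpu_isset(cpu, mask)	(((mask) >> (cpu)) & 1ULL)
	#define cpus_weight(mask)	__builtin_popcountll(mask)
	#define first_cpu(mask)		__builtin_ctzll(mask)	/* mask must be nonzero */

	enum { NCPUS = 4 };

	int main(void)
	{
		/* two cores x two HT threads: cpus {0,2} share a core, {1,3} the other */
		cpumask_t sibling_map[NCPUS] = { 0 };
		cpumask_t core_map = 0;
		int booted_cores = 0;

		cpu_set(0, sibling_map[0]); cpu_set(2, sibling_map[0]);
		cpu_set(0, sibling_map[2]); cpu_set(2, sibling_map[2]);
		cpu_set(1, sibling_map[1]); cpu_set(3, sibling_map[1]);
		cpu_set(1, sibling_map[3]); cpu_set(3, sibling_map[3]);

		/* all four threads sit in one physical package */
		for (int cpu = 0; cpu < NCPUS; cpu++)
			cpu_set(cpu, core_map);

		/* count each core once, at its lowest-numbered thread --
		 * the same rule set_cpu_sibling_map() uses for booted_cores */
		for (int cpu = 0; cpu < NCPUS; cpu++)
			if (cpu_isset(cpu, core_map) &&
			    first_cpu(sibling_map[cpu]) == cpu)
				booted_cores++;

		printf("threads online: %d, booted cores: %d\n",
		       cpus_weight(core_map), booted_cores);	/* prints 4 and 2 */
		return 0;
	}

remove_siblinginfo() applies the same rule in reverse: booted_cores is decremented only when the last thread of a core goes down, i.e. when the departing CPU's sibling mask has weight 1.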
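The do_warm_boot_cpu() hunk reflects the workqueue API change between the two blob versions: the handler no longer receives an opaque void *, it receives the struct work_struct itself, which is now embedded in struct warm_boot_cpu_info, and it recovers its context with container_of(). Here is a standalone sketch of that embed-and-recover pattern, with a stubbed-out work_struct rather than the kernel's real definition:

	#include <stddef.h>
	#include <stdio.h>

	/* same trick as the kernel's container_of(): step back from a
	 * member pointer to the structure that embeds it */
	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct work_struct {
		void (*func)(struct work_struct *work);
	};

	struct warm_boot_cpu_info {
		int apicid;
		int cpu;
		struct work_struct task;	/* embedded, not passed as void * */
	};

	/* new-style handler: given the work item, derive the context */
	static void do_warm_boot_cpu(struct work_struct *work)
	{
		struct warm_boot_cpu_info *info =
			container_of(work, struct warm_boot_cpu_info, task);

		printf("would boot cpu %d (apicid %d)\n", info->cpu, info->apicid);
	}

	int main(void)
	{
		struct warm_boot_cpu_info info = {
			.apicid = 1,
			.cpu = 1,
			.task = { .func = do_warm_boot_cpu },
		};

		/* what the workqueue ultimately does: call func with the item */
		info.task.func(&info.task);
		return 0;
	}

Tying the context to the work item this way is what lets INIT_WORK() drop its third argument in the patch: the handler can always find its data, because the data contains the work.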