X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=arch%2Fx86_64%2Fkernel%2Fsmpboot.c;h=daf19332f0dd02ed7ab4257e5c887989dfeb0ef8;hb=4c3cbf75b262433afc90b5c35510d1e5744d3b94;hp=fa25e39fe54dfc4aab4390c95b45039da0202558;hpb=1eecd73cce4e11ba9d67ad767f92069cfba7b589;p=safe%2Fjmp%2Flinux-2.6 diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index fa25e39..daf1933 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -38,19 +38,18 @@ */ -#include #include #include #include #include -#include #include #include #include - #include #include +#include + #include #include #include @@ -58,17 +57,21 @@ #include #include #include +#include +#include +#include +#include /* Number of siblings per CPU package */ int smp_num_siblings = 1; -/* Package ID of each logical CPU */ -u8 phys_proc_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; -u8 cpu_core_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; -EXPORT_SYMBOL(phys_proc_id); -EXPORT_SYMBOL(cpu_core_id); +EXPORT_SYMBOL(smp_num_siblings); + +/* Last level cache ID of each logical CPU */ +u8 cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID}; +EXPORT_SYMBOL(cpu_llc_id); /* Bitmask of currently online CPUs */ -cpumask_t cpu_online_map; +cpumask_t cpu_online_map __read_mostly; EXPORT_SYMBOL(cpu_online_map); @@ -78,18 +81,24 @@ EXPORT_SYMBOL(cpu_online_map); */ cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; +EXPORT_SYMBOL(cpu_callout_map); cpumask_t cpu_possible_map; EXPORT_SYMBOL(cpu_possible_map); /* Per CPU bogomips and other parameters */ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; +EXPORT_SYMBOL(cpu_data); /* Set when the idlers are all forked */ int smp_threads_ready; -cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; -cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned; +/* representing HT siblings of each logical CPU */ +cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(cpu_sibling_map); + +/* representing HT and core siblings of each logical CPU */ +cpumask_t cpu_core_map[NR_CPUS] __read_mostly; EXPORT_SYMBOL(cpu_core_map); /* @@ -332,7 +341,13 @@ static __cpuinit void sync_tsc(unsigned int master) static void __cpuinit tsc_sync_wait(void) { - if (notscsync || !cpu_has_tsc) + /* + * When the CPU has synchronized TSCs assume the BIOS + * or the hardware already synced. Otherwise we could + * mess up a possible perfect synchronization with a + * not-quite-perfect algorithm. + */ + if (notscsync || !cpu_has_tsc || !unsynchronized_tsc()) return; sync_tsc(0); } @@ -340,7 +355,7 @@ static void __cpuinit tsc_sync_wait(void) static __init int notscsync_setup(char *s) { notscsync = 1; - return 0; + return 1; } __setup("notscsync", notscsync_setup); @@ -413,8 +428,13 @@ void __cpuinit smp_callin(void) /* * Get our bogomips. + * + * Need to enable IRQs because it can take longer and then + * the NMI watchdog might kill us. */ + local_irq_enable(); calibrate_delay(); + local_irq_disable(); Dprintk("Stack at about %p\n",&cpuid); disable_APIC_timer(); @@ -430,30 +450,82 @@ void __cpuinit smp_callin(void) cpu_set(cpuid, cpu_callin_map); } +/* maps the cpu to the sched domain representing multi-core */ +cpumask_t cpu_coregroup_map(int cpu) +{ + struct cpuinfo_x86 *c = cpu_data + cpu; + /* + * For perf, we return last level cache shared map. + * And for power savings, we return cpu_core_map + */ + if (sched_mc_power_savings || sched_smt_power_savings) + return cpu_core_map[cpu]; + else + return c->llc_shared_map; +} + +/* representing cpus for which sibling maps can be computed */ +static cpumask_t cpu_sibling_setup_map; + static inline void set_cpu_sibling_map(int cpu) { int i; + struct cpuinfo_x86 *c = cpu_data; + + cpu_set(cpu, cpu_sibling_setup_map); if (smp_num_siblings > 1) { - for_each_cpu(i) { - if (cpu_core_id[cpu] == cpu_core_id[i]) { + for_each_cpu_mask(i, cpu_sibling_setup_map) { + if (c[cpu].phys_proc_id == c[i].phys_proc_id && + c[cpu].cpu_core_id == c[i].cpu_core_id) { cpu_set(i, cpu_sibling_map[cpu]); cpu_set(cpu, cpu_sibling_map[i]); + cpu_set(i, cpu_core_map[cpu]); + cpu_set(cpu, cpu_core_map[i]); + cpu_set(i, c[cpu].llc_shared_map); + cpu_set(cpu, c[i].llc_shared_map); } } } else { cpu_set(cpu, cpu_sibling_map[cpu]); } - if (current_cpu_data.x86_num_cores > 1) { - for_each_cpu(i) { - if (phys_proc_id[cpu] == phys_proc_id[i]) { - cpu_set(i, cpu_core_map[cpu]); - cpu_set(cpu, cpu_core_map[i]); - } - } - } else { + cpu_set(cpu, c[cpu].llc_shared_map); + + if (current_cpu_data.x86_max_cores == 1) { cpu_core_map[cpu] = cpu_sibling_map[cpu]; + c[cpu].booted_cores = 1; + return; + } + + for_each_cpu_mask(i, cpu_sibling_setup_map) { + if (cpu_llc_id[cpu] != BAD_APICID && + cpu_llc_id[cpu] == cpu_llc_id[i]) { + cpu_set(i, c[cpu].llc_shared_map); + cpu_set(cpu, c[i].llc_shared_map); + } + if (c[cpu].phys_proc_id == c[i].phys_proc_id) { + cpu_set(i, cpu_core_map[cpu]); + cpu_set(cpu, cpu_core_map[i]); + /* + * Does this new cpu bringup a new core? + */ + if (cpus_weight(cpu_sibling_map[cpu]) == 1) { + /* + * for each core in package, increment + * the booted_cores for this new cpu + */ + if (first_cpu(cpu_sibling_map[i]) == i) + c[cpu].booted_cores++; + /* + * increment the core count for all + * the other cpus in this package + */ + if (i != cpu) + c[i].booted_cores++; + } else if (i != cpu && !c[cpu].booted_cores) + c[cpu].booted_cores = c[i].booted_cores; + } } } @@ -468,6 +540,7 @@ void __cpuinit start_secondary(void) * things done here to the most necessary things. */ cpu_init(); + preempt_disable(); smp_callin(); /* otherwise gcc will move up the smp_processor_id before the cpu_init */ @@ -509,12 +582,16 @@ void __cpuinit start_secondary(void) * smp_call_function(). */ lock_ipi_call_lock(); + spin_lock(&vector_lock); + /* Setup the per cpu irq handling data structures */ + __setup_vector_irq(smp_processor_id()); /* * Allow the master to continue. */ cpu_set(smp_processor_id(), cpu_online_map); per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; + spin_unlock(&vector_lock); unlock_ipi_call_lock(); cpu_idle(); @@ -540,8 +617,8 @@ static void inquire_remote_apic(int apicid) */ apic_wait_icr_idle(); - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); - apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); + apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]); timeout = 0; do { @@ -574,12 +651,12 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta /* * Turn INIT on target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); /* * Send IPI */ - apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT + apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT); Dprintk("Waiting for send to finish...\n"); @@ -595,10 +672,10 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta Dprintk("Deasserting INIT.\n"); /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); /* Send IPI */ - apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); + apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); Dprintk("Waiting for send to finish...\n"); timeout = 0; @@ -608,18 +685,10 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; } while (send_status && (timeout++ < 1000)); + mb(); atomic_set(&init_deasserted, 1); - /* - * Should we send STARTUP IPIs ? - * - * Determine this based on the APIC version. - * If we don't have an integrated APIC, don't send the STARTUP IPIs. - */ - if (APIC_INTEGRATED(apic_version[phys_apicid])) - num_starts = 2; - else - num_starts = 0; + num_starts = 2; /* * Run STARTUP IPI loop. @@ -630,7 +699,6 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta for (j = 1; j <= num_starts; j++) { Dprintk("Sending STARTUP #%d.\n",j); - apic_read_around(APIC_SPIV); apic_write(APIC_ESR, 0); apic_read(APIC_ESR); Dprintk("After apic_write.\n"); @@ -640,12 +708,11 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta */ /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); /* Boot on the stack */ /* Kick the second */ - apic_write_around(APIC_ICR, APIC_DM_STARTUP - | (start_rip >> 12)); + apic_write(APIC_ICR, APIC_DM_STARTUP | (start_rip >> 12)); /* * Give the other CPU some time to accept the IPI. @@ -670,7 +737,6 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta * Due to the Pentium erratum 3AP. */ if (maxlvt > 3) { - apic_read_around(APIC_SPIV); apic_write(APIC_ESR, 0); } accept_status = (apic_read(APIC_ESR) & 0xEF); @@ -688,14 +754,16 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta } struct create_idle { + struct work_struct work; struct task_struct *idle; struct completion done; int cpu; }; -void do_fork_idle(void *_c_idle) +void do_fork_idle(struct work_struct *work) { - struct create_idle *c_idle = _c_idle; + struct create_idle *c_idle = + container_of(work, struct create_idle, work); c_idle->idle = fork_idle(c_idle->cpu); complete(&c_idle->done); @@ -710,16 +778,41 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) int timeout; unsigned long start_rip; struct create_idle c_idle = { + .work = __WORK_INITIALIZER(c_idle.work, do_fork_idle), .cpu = cpu, - .done = COMPLETION_INITIALIZER(c_idle.done), + .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), }; - DECLARE_WORK(work, do_fork_idle, &c_idle); + + /* allocate memory for gdts of secondary cpus. Hotplug is considered */ + if (!cpu_gdt_descr[cpu].address && + !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) { + printk(KERN_ERR "Failed to allocate GDT for CPU %d\n", cpu); + return -1; + } + + /* Allocate node local memory for AP pdas */ + if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) { + struct x8664_pda *newpda, *pda; + int node = cpu_to_node(cpu); + pda = cpu_pda(cpu); + newpda = kmalloc_node(sizeof (struct x8664_pda), GFP_ATOMIC, + node); + if (newpda) { + memcpy(newpda, pda, sizeof (struct x8664_pda)); + cpu_pda(cpu) = newpda; + } else + printk(KERN_ERR + "Could not allocate node local PDA for CPU %d on node %d\n", + cpu, node); + } + + alternatives_smp_switch(1); c_idle.idle = get_idle_for_cpu(cpu); if (c_idle.idle) { c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *) - (THREAD_SIZE + (unsigned long) c_idle.idle->thread_info)) - 1); + (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1); init_idle(c_idle.idle, cpu); goto do_rest; } @@ -735,9 +828,9 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) * thread. */ if (!keventd_up() || current_is_keventd()) - work.func(work.data); + c_idle.work.func(&c_idle.work); else { - schedule_work(&work); + schedule_work(&c_idle.work); wait_for_completion(&c_idle.done); } @@ -750,14 +843,14 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) do_rest: - cpu_pda[cpu].pcurrent = c_idle.idle; + cpu_pda(cpu)->pcurrent = c_idle.idle; start_rip = setup_trampoline(); init_rsp = c_idle.idle->thread.rsp; per_cpu(init_tss,cpu).rsp0 = init_rsp; initial_code = start_secondary; - clear_ti_thread_flag(c_idle.idle->thread_info, TIF_FORK); + clear_tsk_thread_flag(c_idle.idle, TIF_FORK); printk(KERN_INFO "Booting processor %d/%d APIC 0x%x\n", cpu, cpus_weight(cpu_present_map), @@ -783,11 +876,8 @@ do_rest: /* * Be paranoid about clearing APIC errors. */ - if (APIC_INTEGRATED(apic_version[apicid])) { - apic_read_around(APIC_SPIV); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - } + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); /* * Status is now clean @@ -836,6 +926,7 @@ do_rest: if (boot_error) { cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ + clear_node_cpumask(cpu); /* was set by numa_add_cpu */ cpu_clear(cpu, cpu_present_map); cpu_clear(cpu, cpu_possible_map); x86_cpu_to_apicid[cpu] = BAD_APICID; @@ -864,17 +955,6 @@ static __cpuinit void smp_cleanup_boot(void) * Reset trampoline flag */ *((volatile int *) phys_to_virt(0x467)) = 0; - -#ifndef CONFIG_HOTPLUG_CPU - /* - * Free pages reserved for SMP bootup. - * When you add hotplug CPU support later remove this - * Note there is more work to be done for later CPU bootup. - */ - - free_page((unsigned long) __va(PAGE_SIZE)); - free_page((unsigned long) __va(SMP_TRAMPOLINE_BASE)); -#endif } /* @@ -894,24 +974,10 @@ static __init void disable_smp(void) cpu_set(0, cpu_core_map[0]); } -/* - * Handle user cpus=... parameter. - */ -static __init void enforce_max_cpus(unsigned max_cpus) -{ - int i, k; - k = 0; - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_possible(i)) - continue; - if (++k > max_cpus) { - cpu_clear(i, cpu_possible_map); - cpu_clear(i, cpu_present_map); - } - } -} - #ifdef CONFIG_HOTPLUG_CPU + +int additional_cpus __initdata = -1; + /* * cpu_possible_map should be static, it cannot change as cpu's * are onlined, or offlined. The reason is per-cpu data-structures @@ -920,14 +986,35 @@ static __init void enforce_max_cpus(unsigned max_cpus) * cpu_present_map on the other hand can change dynamically. * In case when cpu_hotplug is not compiled, then we resort to current * behaviour, which is cpu_possible == cpu_present. - * If cpu-hotplug is supported, then we need to preallocate for all - * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range. * - Ashok Raj + * + * Three ways to find out the number of additional hotplug CPUs: + * - If the BIOS specified disabled CPUs in ACPI/mptables use that. + * - The user can overwrite it with additional_cpus=NUM + * - Otherwise don't reserve additional CPUs. + * We do this because additional CPUs waste a lot of memory. + * -AK */ -static void prefill_possible_map(void) +__init void prefill_possible_map(void) { int i; - for (i = 0; i < NR_CPUS; i++) + int possible; + + if (additional_cpus == -1) { + if (disabled_cpus > 0) + additional_cpus = disabled_cpus; + else + additional_cpus = 0; + } + possible = num_processors + additional_cpus; + if (possible > NR_CPUS) + possible = NR_CPUS; + + printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", + possible, + max_t(int, possible - num_processors, 0)); + + for (i = 0; i < possible; i++) cpu_set(i, cpu_possible_map); } #endif @@ -969,7 +1056,7 @@ static int __init smp_sanity_check(unsigned max_cpus) /* * If we couldn't find a local APIC, then get out of here now! */ - if (APIC_INTEGRATED(apic_version[boot_cpu_id]) && !cpu_has_apic) { + if (!cpu_has_apic) { printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", boot_cpu_id); printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n"); @@ -998,12 +1085,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) nmi_watchdog_default(); current_cpu_data = boot_cpu_data; current_thread_info()->cpu = 0; /* needed? */ - - enforce_max_cpus(max_cpus); - -#ifdef CONFIG_HOTPLUG_CPU - prefill_possible_map(); -#endif + set_cpu_sibling_map(0); if (smp_sanity_check(max_cpus) < 0) { printk(KERN_INFO "SMP disabled\n"); @@ -1015,7 +1097,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus) /* * Switch from PIC to APIC mode. */ - connect_bsp_APIC(); setup_local_APIC(); if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) { @@ -1047,8 +1128,6 @@ void __init smp_prepare_boot_cpu(void) int me = smp_processor_id(); cpu_set(me, cpu_online_map); cpu_set(me, cpu_callout_map); - cpu_set(0, cpu_sibling_map[0]); - cpu_set(0, cpu_core_map[0]); per_cpu(cpu_state, me) = CPU_ONLINE; } @@ -1091,6 +1170,13 @@ int __cpuinit __cpu_up(unsigned int cpu) while (!cpu_isset(cpu, cpu_online_map)) cpu_relax(); + + if (num_online_cpus() > 8 && genapic == &apic_flat) { + printk(KERN_WARNING + "flat APIC routing can't be used with > 8 cpus\n"); + BUG(); + } + err = 0; return err; @@ -1101,18 +1187,10 @@ int __cpuinit __cpu_up(unsigned int cpu) */ void __init smp_cpus_done(unsigned int max_cpus) { -#ifndef CONFIG_HOTPLUG_CPU - zap_low_mappings(); -#endif smp_cleanup_boot(); - -#ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); -#endif - - time_init_gtod(); - check_nmi_watchdog(); + time_init_gtod(); } #ifdef CONFIG_HOTPLUG_CPU @@ -1120,15 +1198,24 @@ void __init smp_cpus_done(unsigned int max_cpus) static void remove_siblinginfo(int cpu) { int sibling; + struct cpuinfo_x86 *c = cpu_data; + for_each_cpu_mask(sibling, cpu_core_map[cpu]) { + cpu_clear(cpu, cpu_core_map[sibling]); + /* + * last thread sibling in this cpu core going down + */ + if (cpus_weight(cpu_sibling_map[cpu]) == 1) + c[sibling].booted_cores--; + } + for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) cpu_clear(cpu, cpu_sibling_map[sibling]); - for_each_cpu_mask(sibling, cpu_core_map[cpu]) - cpu_clear(cpu, cpu_core_map[sibling]); cpus_clear(cpu_sibling_map[cpu]); cpus_clear(cpu_core_map[cpu]); - phys_proc_id[cpu] = BAD_APICID; - cpu_core_id[cpu] = BAD_APICID; + c[cpu].phys_proc_id = 0; + c[cpu].cpu_core_id = 0; + cpu_clear(cpu, cpu_sibling_setup_map); } void remove_cpu_from_maps(void) @@ -1138,6 +1225,7 @@ void remove_cpu_from_maps(void) cpu_clear(cpu, cpu_callout_map); cpu_clear(cpu, cpu_callin_map); clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ + clear_node_cpumask(cpu); } int __cpu_disable(void) @@ -1155,7 +1243,9 @@ int __cpu_disable(void) if (cpu == 0) return -EBUSY; - disable_APIC_timer(); + if (nmi_watchdog == NMI_LOCAL_APIC) + stop_apic_nmi_watchdog(NULL); + clear_local_APIC(); /* * HACK: @@ -1169,8 +1259,10 @@ int __cpu_disable(void) local_irq_disable(); remove_siblinginfo(cpu); + spin_lock(&vector_lock); /* It's now safe to remove this processor from the online map */ cpu_clear(cpu, cpu_online_map); + spin_unlock(&vector_lock); remove_cpu_from_maps(); fixup_irqs(cpu_online_map); return 0; @@ -1185,6 +1277,8 @@ void __cpu_die(unsigned int cpu) /* They ack this in play_dead by setting CPU_DEAD */ if (per_cpu(cpu_state, cpu) == CPU_DEAD) { printk ("CPU %d is now offline\n", cpu); + if (1 == num_online_cpus()) + alternatives_smp_switch(0); return; } msleep(100); @@ -1192,6 +1286,12 @@ void __cpu_die(unsigned int cpu) printk(KERN_ERR "CPU %u didn't die...\n", cpu); } +static __init int setup_additional_cpus(char *s) +{ + return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL; +} +early_param("additional_cpus", setup_additional_cpus); + #else /* ... !CONFIG_HOTPLUG_CPU */ int __cpu_disable(void)