X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=arch%2Fi386%2Fkernel%2Fsmpboot.c;h=02a9b66b6ac38ae95aa199e8d91e7576d10daeba;hb=c4028958b6ecad064b1a6303a6a5906d4fe48d73;hp=e6488ffc1f7928cfca08ec35a5968c46ae614008;hpb=38e548ee1a79c8da7b3d9e26f2adce9b61413f84;p=safe%2Fjmp%2Flinux-2.6 diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index e6488ff..02a9b66 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -34,7 +34,6 @@ * Rusty Russell : Hacked into shape for new "hotplug" boot process. */ #include -#include #include #include @@ -52,6 +51,7 @@ #include #include #include +#include #include #include @@ -66,17 +66,14 @@ int smp_num_siblings = 1; EXPORT_SYMBOL(smp_num_siblings); #endif -/* Package ID of each logical CPU */ -int phys_proc_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID}; -EXPORT_SYMBOL(phys_proc_id); - -/* Core ID of each logical CPU */ -int cpu_core_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID}; -EXPORT_SYMBOL(cpu_core_id); +/* Last level cache ID of each logical CPU */ +int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID}; +/* representing HT siblings of each logical CPU */ cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly; EXPORT_SYMBOL(cpu_sibling_map); +/* representing HT and core siblings of each logical CPU */ cpumask_t cpu_core_map[NR_CPUS] __read_mostly; EXPORT_SYMBOL(cpu_core_map); @@ -87,11 +84,7 @@ EXPORT_SYMBOL(cpu_online_map); cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; EXPORT_SYMBOL(cpu_callout_map); -#ifdef CONFIG_HOTPLUG_CPU -cpumask_t cpu_possible_map = CPU_MASK_ALL; -#else cpumask_t cpu_possible_map; -#endif EXPORT_SYMBOL(cpu_possible_map); static cpumask_t smp_commenced_mask; @@ -109,6 +102,8 @@ u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0xff }; EXPORT_SYMBOL(x86_cpu_to_apicid); +u8 apicid_2_node[MAX_APICID]; + /* * Trampoline 80x86 program as an array. */ @@ -184,6 +179,9 @@ static void __devinit smp_store_cpu_info(int id) */ if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) { + if (num_possible_cpus() == 1) + goto valid_k7; + /* Athlon 660/661 is valid. */ if ((c->x86_model==6) && ((c->x86_mask==0) || (c->x86_mask==1))) goto valid_k7; @@ -219,14 +217,20 @@ valid_k7: * then we print a warning if not, and always resync. 
*/ -static atomic_t tsc_start_flag = ATOMIC_INIT(0); -static atomic_t tsc_count_start = ATOMIC_INIT(0); -static atomic_t tsc_count_stop = ATOMIC_INIT(0); -static unsigned long long tsc_values[NR_CPUS]; +static struct { + atomic_t start_flag; + atomic_t count_start; + atomic_t count_stop; + unsigned long long values[NR_CPUS]; +} tsc __initdata = { + .start_flag = ATOMIC_INIT(0), + .count_start = ATOMIC_INIT(0), + .count_stop = ATOMIC_INIT(0), +}; #define NR_LOOPS 5 -static void __init synchronize_tsc_bp (void) +static void __init synchronize_tsc_bp(void) { int i; unsigned long long t0; @@ -240,7 +244,7 @@ static void __init synchronize_tsc_bp (void) /* convert from kcyc/sec to cyc/usec */ one_usec = cpu_khz / 1000; - atomic_set(&tsc_start_flag, 1); + atomic_set(&tsc.start_flag, 1); wmb(); /* @@ -257,16 +261,16 @@ static void __init synchronize_tsc_bp (void) /* * all APs synchronize but they loop on '== num_cpus' */ - while (atomic_read(&tsc_count_start) != num_booting_cpus()-1) - mb(); - atomic_set(&tsc_count_stop, 0); + while (atomic_read(&tsc.count_start) != num_booting_cpus()-1) + cpu_relax(); + atomic_set(&tsc.count_stop, 0); wmb(); /* * this lets the APs save their current TSC: */ - atomic_inc(&tsc_count_start); + atomic_inc(&tsc.count_start); - rdtscll(tsc_values[smp_processor_id()]); + rdtscll(tsc.values[smp_processor_id()]); /* * We clear the TSC in the last loop: */ @@ -276,54 +280,54 @@ static void __init synchronize_tsc_bp (void) /* * Wait for all APs to leave the synchronization point: */ - while (atomic_read(&tsc_count_stop) != num_booting_cpus()-1) - mb(); - atomic_set(&tsc_count_start, 0); + while (atomic_read(&tsc.count_stop) != num_booting_cpus()-1) + cpu_relax(); + atomic_set(&tsc.count_start, 0); wmb(); - atomic_inc(&tsc_count_stop); + atomic_inc(&tsc.count_stop); } sum = 0; for (i = 0; i < NR_CPUS; i++) { if (cpu_isset(i, cpu_callout_map)) { - t0 = tsc_values[i]; + t0 = tsc.values[i]; sum += t0; } } avg = sum; do_div(avg, num_booting_cpus()); - sum = 0; for (i = 0; i < NR_CPUS; i++) { if (!cpu_isset(i, cpu_callout_map)) continue; - delta = tsc_values[i] - avg; + delta = tsc.values[i] - avg; if (delta < 0) delta = -delta; /* * We report bigger than 2 microseconds clock differences. 
*/ if (delta > 2*one_usec) { - long realdelta; + long long realdelta; + if (!buggy) { buggy = 1; printk("\n"); } realdelta = delta; do_div(realdelta, one_usec); - if (tsc_values[i] < avg) + if (tsc.values[i] < avg) realdelta = -realdelta; - printk(KERN_INFO "CPU#%d had %ld usecs TSC skew, fixed it up.\n", i, realdelta); + if (realdelta) + printk(KERN_INFO "CPU#%d had %Ld usecs TSC " + "skew, fixed it up.\n", i, realdelta); } - - sum += delta; } if (!buggy) printk("passed.\n"); } -static void __init synchronize_tsc_ap (void) +static void __init synchronize_tsc_ap(void) { int i; @@ -332,19 +336,21 @@ static void __init synchronize_tsc_ap (void) * this gets called, so we first wait for the BP to * finish SMP initialization: */ - while (!atomic_read(&tsc_start_flag)) mb(); + while (!atomic_read(&tsc.start_flag)) + cpu_relax(); for (i = 0; i < NR_LOOPS; i++) { - atomic_inc(&tsc_count_start); - while (atomic_read(&tsc_count_start) != num_booting_cpus()) - mb(); + atomic_inc(&tsc.count_start); + while (atomic_read(&tsc.count_start) != num_booting_cpus()) + cpu_relax(); - rdtscll(tsc_values[smp_processor_id()]); + rdtscll(tsc.values[smp_processor_id()]); if (i == NR_LOOPS-1) write_tsc(0, 0); - atomic_inc(&tsc_count_stop); - while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb(); + atomic_inc(&tsc.count_stop); + while (atomic_read(&tsc.count_stop) != num_booting_cpus()) + cpu_relax(); } } #undef NR_LOOPS @@ -444,35 +450,83 @@ static void __devinit smp_callin(void) static int cpucount; +/* maps the cpu to the sched domain representing multi-core */ +cpumask_t cpu_coregroup_map(int cpu) +{ + struct cpuinfo_x86 *c = cpu_data + cpu; + /* + * For perf, we return last level cache shared map. + * And for power savings, we return cpu_core_map + */ + if (sched_mc_power_savings || sched_smt_power_savings) + return cpu_core_map[cpu]; + else + return c->llc_shared_map; +} + +/* representing cpus for which sibling maps can be computed */ +static cpumask_t cpu_sibling_setup_map; + static inline void set_cpu_sibling_map(int cpu) { int i; + struct cpuinfo_x86 *c = cpu_data; + + cpu_set(cpu, cpu_sibling_setup_map); if (smp_num_siblings > 1) { - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_isset(i, cpu_callout_map)) - continue; - if (cpu_core_id[cpu] == cpu_core_id[i]) { + for_each_cpu_mask(i, cpu_sibling_setup_map) { + if (c[cpu].phys_proc_id == c[i].phys_proc_id && + c[cpu].cpu_core_id == c[i].cpu_core_id) { cpu_set(i, cpu_sibling_map[cpu]); cpu_set(cpu, cpu_sibling_map[i]); + cpu_set(i, cpu_core_map[cpu]); + cpu_set(cpu, cpu_core_map[i]); + cpu_set(i, c[cpu].llc_shared_map); + cpu_set(cpu, c[i].llc_shared_map); } } } else { cpu_set(cpu, cpu_sibling_map[cpu]); } - if (current_cpu_data.x86_num_cores > 1) { - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_isset(i, cpu_callout_map)) - continue; - if (phys_proc_id[cpu] == phys_proc_id[i]) { - cpu_set(i, cpu_core_map[cpu]); - cpu_set(cpu, cpu_core_map[i]); - } - } - } else { + cpu_set(cpu, c[cpu].llc_shared_map); + + if (current_cpu_data.x86_max_cores == 1) { cpu_core_map[cpu] = cpu_sibling_map[cpu]; + c[cpu].booted_cores = 1; + return; + } + + for_each_cpu_mask(i, cpu_sibling_setup_map) { + if (cpu_llc_id[cpu] != BAD_APICID && + cpu_llc_id[cpu] == cpu_llc_id[i]) { + cpu_set(i, c[cpu].llc_shared_map); + cpu_set(cpu, c[i].llc_shared_map); + } + if (c[cpu].phys_proc_id == c[i].phys_proc_id) { + cpu_set(i, cpu_core_map[cpu]); + cpu_set(cpu, cpu_core_map[i]); + /* + * Does this new cpu bringup a new core? 
+ */ + if (cpus_weight(cpu_sibling_map[cpu]) == 1) { + /* + * for each core in package, increment + * the booted_cores for this new cpu + */ + if (first_cpu(cpu_sibling_map[i]) == i) + c[cpu].booted_cores++; + /* + * increment the core count for all + * the other cpus in this package + */ + if (i != cpu) + c[i].booted_cores++; + } else if (i != cpu && !c[cpu].booted_cores) + c[cpu].booted_cores = c[i].booted_cores; + } } } @@ -487,6 +541,7 @@ static void __devinit start_secondary(void *unused) * things done here to the most necessary things. */ cpu_init(); + preempt_disable(); smp_callin(); while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) rep_nop(); @@ -557,6 +612,7 @@ extern struct { /* which logical CPUs are on which nodes */ cpumask_t node_2_cpu_mask[MAX_NUMNODES] __read_mostly = { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE }; +EXPORT_SYMBOL(node_2_cpu_mask); /* which node each logical CPU is on */ int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 }; EXPORT_SYMBOL(cpu_2_node); @@ -592,9 +648,13 @@ static void map_cpu_to_logical_apicid(void) { int cpu = smp_processor_id(); int apicid = logical_smp_processor_id(); + int node = apicid_to_node(apicid); + + if (!node_online(node)) + node = first_online_node; cpu_2_logical_apicid[cpu] = apicid; - map_cpu_to_node(cpu, apicid_to_node(apicid)); + map_cpu_to_node(cpu, node); } static void unmap_cpu_to_logical_apicid(int cpu) @@ -849,8 +909,7 @@ static inline struct task_struct * alloc_idle_task(int cpu) /* initialize thread_struct. we really want to avoid destroy * idle tread */ - idle->thread.esp = (unsigned long)(((struct pt_regs *) - (THREAD_SIZE + (unsigned long) idle->thread_info)) - 1); + idle->thread.esp = (unsigned long)task_pt_regs(idle); init_idle(idle, cpu); return idle; } @@ -878,6 +937,7 @@ static int __devinit do_boot_cpu(int apicid, int cpu) unsigned short nmi_high = 0, nmi_low = 0; ++cpucount; + alternatives_smp_switch(1); /* * We can't use kernel_thread since we must avoid to @@ -897,6 +957,7 @@ static int __devinit do_boot_cpu(int apicid, int cpu) irq_ctx_init(cpu); + x86_cpu_to_apicid[cpu] = apicid; /* * This grunge runs the startup process for * the targeted processor. @@ -981,7 +1042,6 @@ void cpu_exit_clear(void) cpu_clear(cpu, cpu_callout_map); cpu_clear(cpu, cpu_callin_map); - cpu_clear(cpu, cpu_present_map); cpu_clear(cpu, smp_commenced_mask); unmap_cpu_to_logical_apicid(cpu); @@ -989,35 +1049,48 @@ void cpu_exit_clear(void) struct warm_boot_cpu_info { struct completion *complete; + struct work_struct task; int apicid; int cpu; }; -static void __devinit do_warm_boot_cpu(void *p) +static void __cpuinit do_warm_boot_cpu(struct work_struct *work) { - struct warm_boot_cpu_info *info = p; + struct warm_boot_cpu_info *info = + container_of(work, struct warm_boot_cpu_info, task); do_boot_cpu(info->apicid, info->cpu); complete(info->complete); } -int __devinit smp_prepare_cpu(int cpu) +static int __cpuinit __smp_prepare_cpu(int cpu) { - DECLARE_COMPLETION(done); + DECLARE_COMPLETION_ONSTACK(done); struct warm_boot_cpu_info info; - struct work_struct task; int apicid, ret; + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); - lock_cpu_hotplug(); apicid = x86_cpu_to_apicid[cpu]; if (apicid == BAD_APICID) { ret = -ENODEV; goto exit; } + /* + * the CPU isn't initialized at boot time, allocate gdt table here. 
+ * cpu_init will initialize it + */ + if (!cpu_gdt_descr->address) { + cpu_gdt_descr->address = get_zeroed_page(GFP_KERNEL); + if (!cpu_gdt_descr->address) + printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu); + ret = -ENOMEM; + goto exit; + } + info.complete = &done; info.apicid = apicid; info.cpu = cpu; - INIT_WORK(&task, do_warm_boot_cpu, &info); + INIT_WORK(&info.task, do_warm_boot_cpu); tsc_sync_disabled = 1; @@ -1025,14 +1098,13 @@ int __devinit smp_prepare_cpu(int cpu) clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, KERNEL_PGD_PTRS); flush_tlb_all(); - schedule_work(&task); + schedule_work(&info.task); wait_for_completion(&done); tsc_sync_disabled = 0; zap_low_mappings(); ret = 0; exit: - unlock_cpu_hotplug(); return ret; } #endif @@ -1064,6 +1136,7 @@ static void smp_tune_scheduling (void) cachesize = 16; /* Pentiums, 2x8kB cache */ bandwidth = 100; } + max_cache_size = cachesize * 1024; } } @@ -1096,11 +1169,8 @@ static void __init smp_boot_cpus(unsigned int max_cpus) current_thread_info()->cpu = 0; smp_tune_scheduling(); - cpus_clear(cpu_sibling_map[0]); - cpu_set(0, cpu_sibling_map[0]); - cpus_clear(cpu_core_map[0]); - cpu_set(0, cpu_core_map[0]); + set_cpu_sibling_map(0); /* * If we couldn't find an SMP configuration at boot time, @@ -1279,15 +1349,24 @@ static void remove_siblinginfo(int cpu) { int sibling; + struct cpuinfo_x86 *c = cpu_data; + for_each_cpu_mask(sibling, cpu_core_map[cpu]) { + cpu_clear(cpu, cpu_core_map[sibling]); + /* + * last thread sibling in this cpu core going down + */ + if (cpus_weight(cpu_sibling_map[cpu]) == 1) + c[sibling].booted_cores--; + } + for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) cpu_clear(cpu, cpu_sibling_map[sibling]); - for_each_cpu_mask(sibling, cpu_core_map[cpu]) - cpu_clear(cpu, cpu_core_map[sibling]); cpus_clear(cpu_sibling_map[cpu]); cpus_clear(cpu_core_map[cpu]); - phys_proc_id[cpu] = BAD_APICID; - cpu_core_id[cpu] = BAD_APICID; + c[cpu].phys_proc_id = 0; + c[cpu].cpu_core_id = 0; + cpu_clear(cpu, cpu_sibling_setup_map); } int __cpu_disable(void) @@ -1305,9 +1384,9 @@ int __cpu_disable(void) */ if (cpu == 0) return -EBUSY; - - /* We enable the timer again on the exit path of the death loop */ - disable_APIC_timer(); + if (nmi_watchdog == NMI_LOCAL_APIC) + stop_apic_nmi_watchdog(NULL); + clear_local_APIC(); /* Allow any queued timer interrupts to get serviced */ local_irq_enable(); mdelay(1); @@ -1331,6 +1410,8 @@ void __cpu_die(unsigned int cpu) /* They ack this in play_dead by setting CPU_DEAD */ if (per_cpu(cpu_state, cpu) == CPU_DEAD) { printk ("CPU %d is now offline\n", cpu); + if (1 == num_online_cpus()) + alternatives_smp_switch(0); return; } msleep(100); @@ -1352,6 +1433,22 @@ void __cpu_die(unsigned int cpu) int __devinit __cpu_up(unsigned int cpu) { +#ifdef CONFIG_HOTPLUG_CPU + int ret=0; + + /* + * We do warm boot only on cpus that had booted earlier + * Otherwise cold boot is all handled from smp_boot_cpus(). + * cpu_callin_map is set during AP kickstart process. Its reset + * when a cpu is taken offline from cpu_exit_clear(). + */ + if (!cpu_isset(cpu, cpu_callin_map)) + ret = __smp_prepare_cpu(cpu); + + if (ret) + return -EIO; +#endif + /* In case one didn't come up */ if (!cpu_isset(cpu, cpu_callin_map)) { printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu); @@ -1364,7 +1461,7 @@ int __devinit __cpu_up(unsigned int cpu) /* Unleash the CPU! 
*/ cpu_set(cpu, smp_commenced_mask); while (!cpu_isset(cpu, cpu_online_map)) - mb(); + cpu_relax(); return 0; } @@ -1402,3 +1499,16 @@ void __init smp_intr_init(void) /* IPI for generic function call */ set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); } + +/* + * If the BIOS enumerates physical processors before logical, + * maxcpus=N at enumeration-time can be used to disable HT. + */ +static int __init parse_maxcpus(char *arg) +{ + extern unsigned int maxcpus; + + maxcpus = simple_strtoul(arg, NULL, 0); + return 0; +} +early_param("maxcpus", parse_maxcpus);
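
The synchronize_tsc_bp()/synchronize_tsc_ap() pair in the patch above implements a counter-based rendezvous: every booting CPU bumps an atomic counter, spins until all of them have checked in, samples its TSC, then meets again on a second counter before starting the next round. The following user-space sketch (illustrative only, not part of the patch) reproduces the two-counter handshake with C11 atomics and pthreads; it drops the kernel's counter resets (each round compares against NTHREADS * (round + 1) instead) and the skew-averaging pass, and all names in it are made up for the example.

	#define _POSIX_C_SOURCE 200809L
	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>
	#include <time.h>

	enum { NTHREADS = 4, NR_LOOPS = 5 };

	static atomic_int count_start;
	static atomic_int count_stop;
	static long long values[NTHREADS];

	/* stand-in for rdtscll(); any per-thread timestamp source works */
	static long long sample_clock(void)
	{
		struct timespec ts;
		clock_gettime(CLOCK_MONOTONIC, &ts);
		return ts.tv_sec * 1000000000LL + ts.tv_nsec;
	}

	static void *rendezvous(void *arg)
	{
		int id = (int)(long)arg;

		for (int round = 0; round < NR_LOOPS; round++) {
			/* check in, then wait until everyone has */
			atomic_fetch_add(&count_start, 1);
			while (atomic_load(&count_start) != NTHREADS * (round + 1))
				;	/* the kernel spins with cpu_relax() here */

			values[id] = sample_clock();	/* all threads sample "at once" */

			/* second counter: nobody may start the next round early */
			atomic_fetch_add(&count_stop, 1);
			while (atomic_load(&count_stop) != NTHREADS * (round + 1))
				;
		}
		return NULL;
	}

	int main(void)
	{
		pthread_t t[NTHREADS];

		for (long i = 0; i < NTHREADS; i++)
			pthread_create(&t[i], NULL, rendezvous, (void *)i);
		for (int i = 0; i < NTHREADS; i++)
			pthread_join(t[i], NULL);
		for (int i = 0; i < NTHREADS; i++)
			printf("thread %d sampled %lld\n", i, values[i]);
		return 0;
	}

The patch's replacement of mb() with cpu_relax() in these spin loops follows from the same shape: the loop only polls an atomic counter, so a pause hint to the CPU (and to a hyperthreaded sibling) is enough, and a full memory barrier on every iteration is wasted work.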
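The set_cpu_sibling_map()/remove_siblinginfo() changes maintain per-CPU bitmasks (cpumask_t) plus a booted_cores count per package, with the rule that a core is counted exactly once, at its lowest-numbered online thread (the first_cpu(cpu_sibling_map[i]) == i test). Below is a toy, self-contained illustration of that counting rule; the helpers are simplified stand-ins for the real cpumask API, assuming at most 64 CPUs in a 64-bit word.

	#include <stdio.h>

	typedef unsigned long long cpumask_t;		/* toy mask, <= 64 cpus */

	#define cpu_set(cpu, mask)	((mask) |= 1ULL << (cpu))
	#define cpu_isset(cpu, mask)	(((mask) >> (cpu)) & 1ULL)
	#define cpus_weight(mask)	__builtin_popcountll(mask)
	#define first_cpu(mask)		__builtin_ctzll(mask)	/* mask must be nonzero */

	enum { NCPUS = 4 };

	int main(void)
	{
		/* two cores x two HT threads: cpus {0,2} share a core, {1,3} the other */
		cpumask_t sibling_map[NCPUS] = { 0 };
		cpumask_t core_map = 0;
		int booted_cores = 0;

		cpu_set(0, sibling_map[0]); cpu_set(2, sibling_map[0]);
		cpu_set(0, sibling_map[2]); cpu_set(2, sibling_map[2]);
		cpu_set(1, sibling_map[1]); cpu_set(3, sibling_map[1]);
		cpu_set(1, sibling_map[3]); cpu_set(3, sibling_map[3]);

		/* all four threads sit in one physical package */
		for (int cpu = 0; cpu < NCPUS; cpu++)
			cpu_set(cpu, core_map);

		/* count each core once, at its lowest-numbered thread --
		 * the same rule set_cpu_sibling_map() uses for booted_cores */
		for (int cpu = 0; cpu < NCPUS; cpu++)
			if (cpu_isset(cpu, core_map) &&
			    first_cpu(sibling_map[cpu]) == cpu)
				booted_cores++;

		printf("threads online: %d, booted cores: %d\n",
		       cpus_weight(core_map), booted_cores);	/* prints 4 and 2 */
		return 0;
	}

remove_siblinginfo() applies the same rule in reverse: booted_cores is decremented only when the last thread of a core goes down, i.e. when the departing CPU's sibling mask has weight 1.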
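The do_warm_boot_cpu() hunk reflects the workqueue API change between the two blob versions: the handler no longer receives an opaque void *, it receives the struct work_struct itself, which is now embedded in struct warm_boot_cpu_info, and it recovers its context with container_of(). Here is a standalone sketch of that embed-and-recover pattern, with a stubbed-out work_struct rather than the kernel's real definition:

	#include <stddef.h>
	#include <stdio.h>

	/* same trick as the kernel's container_of(): step back from a
	 * member pointer to the structure that embeds it */
	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct work_struct {
		void (*func)(struct work_struct *work);
	};

	struct warm_boot_cpu_info {
		int apicid;
		int cpu;
		struct work_struct task;	/* embedded, not passed as void * */
	};

	/* new-style handler: given the work item, derive the context */
	static void do_warm_boot_cpu(struct work_struct *work)
	{
		struct warm_boot_cpu_info *info =
			container_of(work, struct warm_boot_cpu_info, task);

		printf("would boot cpu %d (apicid %d)\n", info->cpu, info->apicid);
	}

	int main(void)
	{
		struct warm_boot_cpu_info info = {
			.apicid = 1,
			.cpu = 1,
			.task = { .func = do_warm_boot_cpu },
		};

		/* what the workqueue ultimately does: call func with the item */
		info.task.func(&info.task);
		return 0;
	}

Tying the context to the work item this way is what lets INIT_WORK() drop its third argument in the patch: the handler can always find its data, because the data contains the work.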