sparc64: Make %pil level 15 a pseudo-NMI.

[safe/jmp/linux-2.6] / arch / sparc64 / kernel / smp.c
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c

index c73b7a4..b5225c8 100644 (file)
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -1,6 +1,6 @@
  /* smp.c: Sparc64 SMP support.
   *
- * Copyright (C) 1997, 2007 David S. Miller (davem@davemloft.net)
+ * Copyright (C) 1997, 2007, 2008 David S. Miller (davem@davemloft.net)
   */
  
  #include <linux/module.h>
@@ -20,7 +20,8 @@
  #include <linux/cache.h>
  #include <linux/jiffies.h>
  #include <linux/profile.h>
-#include <linux/bootmem.h>
+#include <linux/lmb.h>
+#include <linux/cpu.h>
  
  #include <asm/head.h>
  #include <asm/ptrace.h>
@@ -30,6 +31,7 @@
  #include <asm/cpudata.h>
  #include <asm/hvtramp.h>
  #include <asm/io.h>
+#include <asm/timer.h>
  
  #include <asm/irq.h>
  #include <asm/irq_regs.h>
@@ -37,7 +39,6 @@
  #include <asm/pgtable.h>
  #include <asm/oplib.h>
  #include <asm/uaccess.h>
-#include <asm/timer.h>
  #include <asm/starfire.h>
  #include <asm/tlb.h>
  #include <asm/sections.h>
@@ -46,20 +47,17 @@
  #include <asm/ldc.h>
  #include <asm/hypervisor.h>
  
-extern void calibrate_delay(void);
-
  int sparc64_multi_core __read_mostly;
  
  cpumask_t cpu_possible_map __read_mostly = CPU_MASK_NONE;
  cpumask_t cpu_online_map __read_mostly = CPU_MASK_NONE;
-cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly =
-       { [0 ... NR_CPUS-1] = CPU_MASK_NONE };
+DEFINE_PER_CPU(cpumask_t, cpu_sibling_map) = CPU_MASK_NONE;
  cpumask_t cpu_core_map[NR_CPUS] __read_mostly =
         { [0 ... NR_CPUS-1] = CPU_MASK_NONE };
  
  EXPORT_SYMBOL(cpu_possible_map);
  EXPORT_SYMBOL(cpu_online_map);
-EXPORT_SYMBOL(cpu_sibling_map);
+EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
  EXPORT_SYMBOL(cpu_core_map);
  
  static cpumask_t smp_commenced_mask;
@@ -83,13 +81,11 @@ void smp_bogo(struct seq_file *m)
                            i, cpu_data(i).clock_tick);
  }
  
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(call_lock);
-
  extern void setup_sparc64_timer(void);
  
  static volatile unsigned long callin_flag = 0;
  
-void __devinit smp_callin(void)
+void __cpuinit smp_callin(void)
  {
         int cpuid = hard_smp_processor_id();
  
@@ -120,12 +116,15 @@ void __devinit smp_callin(void)
         atomic_inc(&init_mm.mm_count);
         current->active_mm = &init_mm;
  
+       /* inform the notifiers about the new cpu */
+       notify_cpu_starting(cpuid);
+
         while (!cpu_isset(cpuid, smp_commenced_mask))
                 rmb();
  
-       spin_lock(&call_lock);
+       ipi_call_lock();
         cpu_set(cpuid, cpu_online_map);
-       spin_unlock(&call_lock);
+       ipi_call_unlock();
  
         /* idle thread is expected to have preempt disabled */
         preempt_disable();
@@ -164,7 +163,7 @@ static inline long get_delta (long *rt, long *master)
         for (i = 0; i < NUM_ITERS; i++) {
                 t0 = tick_ops->get_tick();
                 go[MASTER] = 1;
-               membar_storeload();
+               membar_safe("#StoreLoad");
                 while (!(tm = go[SLAVE]))
                         rmb();
                 go[SLAVE] = 0;
@@ -237,8 +236,9 @@ void smp_synchronize_tick_client(void)
                        t[i].rt, t[i].master, t[i].diff, t[i].lat);
  #endif
  
-       printk(KERN_INFO "CPU %d: synchronized TICK with master CPU (last diff %ld cycles,"
-              "maxerr %lu cycles)\n", smp_processor_id(), delta, rt);
+       printk(KERN_INFO "CPU %d: synchronized TICK with master CPU "
+              "(last diff %ld cycles, maxerr %lu cycles)\n",
+              smp_processor_id(), delta, rt);
  }
  
  static void smp_start_sync_tick_client(int cpu);
@@ -257,7 +257,7 @@ static void smp_synchronize_one_tick(int cpu)
  
         /* now let the client proceed into his loop */
         go[MASTER] = 0;
-       membar_storeload();
+       membar_safe("#StoreLoad");
  
         spin_lock_irqsave(&itc_sync_lock, flags);
         {
@@ -267,7 +267,7 @@ static void smp_synchronize_one_tick(int cpu)
                         go[MASTER] = 0;
                         wmb();
                         go[SLAVE] = tick_ops->get_tick();
-                       membar_storeload();
+                       membar_safe("#StoreLoad");
                 }
         }
         spin_unlock_irqrestore(&itc_sync_lock, flags);
@@ -282,18 +282,21 @@ static unsigned long kimage_addr_to_ra(void *p)
         return kern_base + (val - KERNBASE);
  }
  
-static void ldom_startcpu_cpuid(unsigned int cpu, unsigned long thread_reg)
+static void __cpuinit ldom_startcpu_cpuid(unsigned int cpu, unsigned long thread_reg)
  {
         extern unsigned long sparc64_ttable_tl0;
         extern unsigned long kern_locked_tte_data;
-       extern int bigkernel;
         struct hvtramp_descr *hdesc;
         unsigned long trampoline_ra;
         struct trap_per_cpu *tb;
         u64 tte_vaddr, tte_data;
         unsigned long hv_err;
+       int i;
  
-       hdesc = kzalloc(sizeof(*hdesc), GFP_KERNEL);
+       /* Assumes *hdesc already holds one maps[] slot; add N-1 more. */
+       hdesc = kzalloc(sizeof(*hdesc) +
+                       (sizeof(struct hvtramp_mapping) *
+                        (num_kernel_image_mappings - 1)), GFP_KERNEL);
         if (!hdesc) {
                 printk(KERN_ERR "ldom_startcpu_cpuid: Cannot allocate "
                        "hvtramp_descr.\n");
@@ -301,7 +304,7 @@ static void ldom_startcpu_cpuid(unsigned int cpu, unsigned long thread_reg)
         }
  
         hdesc->cpu = cpu;
-       hdesc->num_mappings = (bigkernel ? 2 : 1);
+       hdesc->num_mappings = num_kernel_image_mappings;
  
         tb = &trap_block[cpu];
         tb->hdesc = hdesc;
@@ -314,13 +317,11 @@ static void ldom_startcpu_cpuid(unsigned int cpu, unsigned long thread_reg)
         tte_vaddr = (unsigned long) KERNBASE;
         tte_data = kern_locked_tte_data;
  
-       hdesc->maps[0].vaddr = tte_vaddr;
-       hdesc->maps[0].tte   = tte_data;
-       if (bigkernel) {
+       for (i = 0; i < hdesc->num_mappings; i++) {
+               hdesc->maps[i].vaddr = tte_vaddr;
+               hdesc->maps[i].tte   = tte_data;
                 tte_vaddr += 0x400000;
                 tte_data  += 0x400000;
-               hdesc->maps[1].vaddr = tte_vaddr;
-               hdesc->maps[1].tte   = tte_data;
         }
  
         trampoline_ra = kimage_addr_to_ra(hv_cpu_startup);
@@ -342,7 +343,7 @@ extern unsigned long sparc64_cpu_startup;
   */
  static struct thread_info *cpu_new_thread = NULL;
  
-static int __devinit smp_boot_one_cpu(unsigned int cpu)
+static int __cpuinit smp_boot_one_cpu(unsigned int cpu)
  {
         struct trap_per_cpu *tb = &trap_block[cpu];
         unsigned long entry =
@@ -460,27 +461,35 @@ again:
         }
  }
  
-static __inline__ void spitfire_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mask)
+static void spitfire_xcall_deliver(struct trap_per_cpu *tb, int cnt)
  {
+       u64 *mondo, data0, data1, data2;
+       u16 *cpu_list;
         u64 pstate;
         int i;
  
         __asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate));
-       for_each_cpu_mask(i, mask)
-               spitfire_xcall_helper(data0, data1, data2, pstate, i);
+       cpu_list = __va(tb->cpu_list_pa);
+       mondo = __va(tb->cpu_mondo_block_pa);
+       data0 = mondo[0];
+       data1 = mondo[1];
+       data2 = mondo[2];
+       for (i = 0; i < cnt; i++)
+               spitfire_xcall_helper(data0, data1, data2, pstate, cpu_list[i]);
  }
  
  /* Cheetah now allows to send the whole 64-bytes of data in the interrupt
   * packet, but we have no use for that.  However we do take advantage of
   * the new pipelining feature (ie. dispatch to multiple cpus simultaneously).
   */
-static void cheetah_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mask)
+static void cheetah_xcall_deliver(struct trap_per_cpu *tb, int cnt)
  {
-       u64 pstate, ver;
         int nack_busy_id, is_jbus, need_more;
+       u64 *mondo, pstate, ver, busy_mask;
+       u16 *cpu_list;
  
-       if (cpus_empty(mask))
-               return;
+       cpu_list = __va(tb->cpu_list_pa);
+       mondo = __va(tb->cpu_mondo_block_pa);
  
         /* Unfortunately, someone at Sun had the brilliant idea to make the
          * busy/nack fields hard-coded by ITID number for this Ultra-III
@@ -503,19 +512,30 @@ retry:
                              "stxa      %2, [%5] %6\n\t"
                              "membar    #Sync\n\t"
                              : /* no outputs */
-                            : "r" (data0), "r" (data1), "r" (data2),
+                            : "r" (mondo[0]), "r" (mondo[1]), "r" (mondo[2]),
                                "r" (0x40), "r" (0x50), "r" (0x60),
                                "i" (ASI_INTR_W));
  
         nack_busy_id = 0;
+       busy_mask = 0;
         {
                 int i;
  
-               for_each_cpu_mask(i, mask) {
-                       u64 target = (i << 14) | 0x70;
+               for (i = 0; i < cnt; i++) {
+                       u64 target, nr;
+
+                       nr = cpu_list[i];
+                       if (nr == 0xffff)
+                               continue;
  
-                       if (!is_jbus)
+                       target = (nr << 14) | 0x70;
+                       if (is_jbus) {
+                               busy_mask |= (0x1UL << (nr * 2));
+                       } else {
                                 target |= (nack_busy_id << 24);
+                               busy_mask |= (0x1UL <<
+                                             (nack_busy_id * 2));
+                       }
                         __asm__ __volatile__(
                                 "stxa   %%g0, [%0] %1\n\t"
                                 "membar #Sync\n\t"
@@ -531,23 +551,26 @@ retry:
  
         /* Now, poll for completion. */
         {
-               u64 dispatch_stat;
+               u64 dispatch_stat, nack_mask;
                 long stuck;
  
                 stuck = 100000 * nack_busy_id;
+               nack_mask = busy_mask << 1;
                 do {
                         __asm__ __volatile__("ldxa      [%%g0] %1, %0"
                                              : "=r" (dispatch_stat)
                                              : "i" (ASI_INTR_DISPATCH_STAT));
-                       if (dispatch_stat == 0UL) {
+                       if (!(dispatch_stat & (busy_mask | nack_mask))) {
                                 __asm__ __volatile__("wrpr %0, 0x0, %%pstate"
                                                      : : "r" (pstate));
                                 if (unlikely(need_more)) {
-                                       int i, cnt = 0;
-                                       for_each_cpu_mask(i, mask) {
-                                               cpu_clear(i, mask);
-                                               cnt++;
-                                               if (cnt == 32)
+                                       int i, this_cnt = 0;
+                                       for (i = 0; i < cnt; i++) {
+                                               if (cpu_list[i] == 0xffff)
+                                                       continue;
+                                               cpu_list[i] = 0xffff;
+                                               this_cnt++;
+                                               if (this_cnt == 32)
                                                         break;
                                         }
                                         goto retry;
@@ -556,12 +579,12 @@ retry:
                         }
                         if (!--stuck)
                                 break;
-               } while (dispatch_stat & 0x5555555555555555UL);
+               } while (dispatch_stat & busy_mask);
  
                 __asm__ __volatile__("wrpr %0, 0x0, %%pstate"
                                      : : "r" (pstate));
  
-               if ((dispatch_stat & ~(0x5555555555555555UL)) == 0) {
+               if (dispatch_stat & busy_mask) {
                         /* Busy bits will not clear, continue instead
                          * of freezing up on this cpu.
                          */
@@ -578,16 +601,20 @@ retry:
                         /* Clear out the mask bits for cpus which did not
                          * NACK us.
                          */
-                       for_each_cpu_mask(i, mask) {
-                               u64 check_mask;
+                       for (i = 0; i < cnt; i++) {
+                               u64 check_mask, nr;
+
+                               nr = cpu_list[i];
+                               if (nr == 0xffff)
+                                       continue;
  
                                 if (is_jbus)
-                                       check_mask = (0x2UL << (2*i));
+                                       check_mask = (0x2UL << (2*nr));
                                 else
                                         check_mask = (0x2UL <<
                                                       this_busy_nack);
                                 if ((dispatch_stat & check_mask) == 0)
-                                       cpu_clear(i, mask);
+                                       cpu_list[i] = 0xffff;
                                 this_busy_nack += 2;
                                 if (this_busy_nack == 64)
                                         break;
@@ -599,47 +626,17 @@ retry:
  }
  
  /* Multi-cpu list version.  */
-static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mask)
+static void hypervisor_xcall_deliver(struct trap_per_cpu *tb, int cnt)
  {
-       struct trap_per_cpu *tb;
+       int retries, this_cpu, prev_sent, i, saw_cpu_error;
+       unsigned long status;
         u16 *cpu_list;
-       u64 *mondo;
-       cpumask_t error_mask;
-       unsigned long flags, status;
-       int cnt, retries, this_cpu, prev_sent, i;
-
-       if (cpus_empty(mask))
-               return;
-
-       /* We have to do this whole thing with interrupts fully disabled.
-        * Otherwise if we send an xcall from interrupt context it will
-        * corrupt both our mondo block and cpu list state.
-        *
-        * One consequence of this is that we cannot use timeout mechanisms
-        * that depend upon interrupts being delivered locally.  So, for
-        * example, we cannot sample jiffies and expect it to advance.
-        *
-        * Fortunately, udelay() uses %stick/%tick so we can use that.
-        */
-       local_irq_save(flags);
  
         this_cpu = smp_processor_id();
-       tb = &trap_block[this_cpu];
-
-       mondo = __va(tb->cpu_mondo_block_pa);
-       mondo[0] = data0;
-       mondo[1] = data1;
-       mondo[2] = data2;
-       wmb();
  
         cpu_list = __va(tb->cpu_list_pa);
  
-       /* Setup the initial cpu list.  */
-       cnt = 0;
-       for_each_cpu_mask(i, mask)
-               cpu_list[cnt++] = i;
-
-       cpus_clear(error_mask);
+       saw_cpu_error = 0;
         retries = 0;
         prev_sent = 0;
         do {
@@ -684,10 +681,9 @@ static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t
                                         continue;
  
                                 err = sun4v_cpu_state(cpu);
-                               if (err >= 0 &&
-                                   err == HV_CPU_STATE_ERROR) {
+                               if (err == HV_CPU_STATE_ERROR) {
+                                       saw_cpu_error = (cpu + 1);
                                         cpu_list[i] = 0xffff;
-                                       cpu_set(cpu, error_mask);
                                 }
                         }
                 } else if (unlikely(status != HV_EWOULDBLOCK))
@@ -711,32 +707,24 @@ static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t
                 }
         } while (1);
  
-       local_irq_restore(flags);
-
-       if (unlikely(!cpus_empty(error_mask)))
+       if (unlikely(saw_cpu_error))
                 goto fatal_mondo_cpu_error;
  
         return;
  
  fatal_mondo_cpu_error:
         printk(KERN_CRIT "CPU[%d]: SUN4V mondo cpu error, some target cpus "
-              "were in error state\n",
-              this_cpu);
-       printk(KERN_CRIT "CPU[%d]: Error mask [ ", this_cpu);
-       for_each_cpu_mask(i, error_mask)
-               printk("%d ", i);
-       printk("]\n");
+              "(including %d) were in error state\n",
+              this_cpu, saw_cpu_error - 1);
         return;
  
  fatal_mondo_timeout:
-       local_irq_restore(flags);
         printk(KERN_CRIT "CPU[%d]: SUN4V mondo timeout, no forward "
                " progress after %d retries.\n",
                this_cpu, retries);
         goto dump_cpu_list_and_out;
  
  fatal_mondo_error:
-       local_irq_restore(flags);
         printk(KERN_CRIT "CPU[%d]: Unexpected SUN4V mondo error %lu\n",
                this_cpu, status);
         printk(KERN_CRIT "CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) "
@@ -750,124 +738,103 @@ dump_cpu_list_and_out:
         printk("]\n");
  }
  
-/* Send cross call to all processors mentioned in MASK
- * except self.
- */
-static void smp_cross_call_masked(unsigned long *func, u32 ctx, u64 data1, u64 data2, cpumask_t mask)
-{
-       u64 data0 = (((u64)ctx)<<32 | (((u64)func) & 0xffffffff));
-       int this_cpu = get_cpu();
-
-       cpus_and(mask, mask, cpu_online_map);
-       cpu_clear(this_cpu, mask);
-
-       if (tlb_type == spitfire)
-               spitfire_xcall_deliver(data0, data1, data2, mask);
-       else if (tlb_type == cheetah || tlb_type == cheetah_plus)
-               cheetah_xcall_deliver(data0, data1, data2, mask);
-       else
-               hypervisor_xcall_deliver(data0, data1, data2, mask);
-       /* NOTE: Caller runs local copy on master. */
+static void (*xcall_deliver_impl)(struct trap_per_cpu *, int);
  
-       put_cpu();
-}
+static void xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask_t *mask)
+{
+       struct trap_per_cpu *tb;
+       int this_cpu, i, cnt;
+       unsigned long flags;
+       u16 *cpu_list;
+       u64 *mondo;
  
-extern unsigned long xcall_sync_tick;
+       /* We have to do this whole thing with interrupts fully disabled.
+        * Otherwise if we send an xcall from interrupt context it will
+        * corrupt both our mondo block and cpu list state.
+        *
+        * One consequence of this is that we cannot use timeout mechanisms
+        * that depend upon interrupts being delivered locally.  So, for
+        * example, we cannot sample jiffies and expect it to advance.
+        *
+        * Fortunately, udelay() uses %stick/%tick so we can use that.
+        */
+       local_irq_save(flags);
  
-static void smp_start_sync_tick_client(int cpu)
-{
-       cpumask_t mask = cpumask_of_cpu(cpu);
+       this_cpu = smp_processor_id();
+       tb = &trap_block[this_cpu];
  
-       smp_cross_call_masked(&xcall_sync_tick,
-                             0, 0, 0, mask);
-}
+       mondo = __va(tb->cpu_mondo_block_pa);
+       mondo[0] = data0;
+       mondo[1] = data1;
+       mondo[2] = data2;
+       wmb();
  
-/* Send cross call to all processors except self. */
-#define smp_cross_call(func, ctx, data1, data2) \
-       smp_cross_call_masked(func, ctx, data1, data2, cpu_online_map)
+       cpu_list = __va(tb->cpu_list_pa);
  
-struct call_data_struct {
-       void (*func) (void *info);
-       void *info;
-       atomic_t finished;
-       int wait;
-};
+       /* Setup the initial cpu list.  */
+       cnt = 0;
+       for_each_cpu_mask_nr(i, *mask) {
+               if (i == this_cpu || !cpu_online(i))
+                       continue;
+               cpu_list[cnt++] = i;
+       }
  
-static struct call_data_struct *call_data;
+       if (cnt)
+               xcall_deliver_impl(tb, cnt);
  
-extern unsigned long xcall_call_function;
+       local_irq_restore(flags);
+}
  
-/**
- * smp_call_function(): Run a function on all other CPUs.
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @nonatomic: currently unused.
- * @wait: If true, wait (atomically) until function has completed on other CPUs.
- *
- * Returns 0 on success, else a negative status code. Does not return until
- * remote CPUs are nearly ready to execute <<func>> or are or have executed.
- *
- * You must not call this function with disabled interrupts or from a
- * hardware interrupt handler or from a bottom half handler.
+/* Send cross call to all online processors set in *mask,
+ * except self.  Really, there are only two cases currently,
+ * "&cpu_online_map" and "&mm->cpu_vm_mask".
   */
-static int smp_call_function_mask(void (*func)(void *info), void *info,
-                                 int nonatomic, int wait, cpumask_t mask)
+static void smp_cross_call_masked(unsigned long *func, u32 ctx, u64 data1, u64 data2, const cpumask_t *mask)
  {
-       struct call_data_struct data;
-       int cpus;
-
-       /* Can deadlock when called with interrupts disabled */
-       WARN_ON(irqs_disabled());
-
-       data.func = func;
-       data.info = info;
-       atomic_set(&data.finished, 0);
-       data.wait = wait;
-
-       spin_lock(&call_lock);
+       u64 data0 = (((u64)ctx)<<32 | (((u64)func) & 0xffffffff));
  
-       cpu_clear(smp_processor_id(), mask);
-       cpus = cpus_weight(mask);
-       if (!cpus)
-               goto out_unlock;
+       xcall_deliver(data0, data1, data2, mask);
+}
  
-       call_data = &data;
-       mb();
+/* Send cross call to all processors except self. */
+static void smp_cross_call(unsigned long *func, u32 ctx, u64 data1, u64 data2)
+{
+       smp_cross_call_masked(func, ctx, data1, data2, &cpu_online_map);
+}
  
-       smp_cross_call_masked(&xcall_call_function, 0, 0, 0, mask);
+extern unsigned long xcall_sync_tick;
  
-       /* Wait for response */
-       while (atomic_read(&data.finished) != cpus)
-               cpu_relax();
+static void smp_start_sync_tick_client(int cpu)
+{
+       xcall_deliver((u64) &xcall_sync_tick, 0, 0,
+                     &cpumask_of_cpu(cpu));
+}
  
-out_unlock:
-       spin_unlock(&call_lock);
+extern unsigned long xcall_call_function;
  
-       return 0;
+void arch_send_call_function_ipi(cpumask_t mask)
+{
+       xcall_deliver((u64) &xcall_call_function, 0, 0, &mask);
  }
  
-int smp_call_function(void (*func)(void *info), void *info,
-                     int nonatomic, int wait)
+extern unsigned long xcall_call_function_single;
+
+void arch_send_call_function_single_ipi(int cpu)
  {
-       return smp_call_function_mask(func, info, nonatomic, wait,
-                                     cpu_online_map);
+       xcall_deliver((u64) &xcall_call_function_single, 0, 0,
+                     &cpumask_of_cpu(cpu));
  }
  
  void smp_call_function_client(int irq, struct pt_regs *regs)
  {
-       void (*func) (void *info) = call_data->func;
-       void *info = call_data->info;
+       clear_softint(1 << irq);
+       generic_smp_call_function_interrupt();
+}
  
+void smp_call_function_single_client(int irq, struct pt_regs *regs)
+{
         clear_softint(1 << irq);
-       if (call_data->wait) {
-               /* let initiator proceed only after completion */
-               func(info);
-               atomic_inc(&call_data->finished);
-       } else {
-               /* let initiator proceed after getting data */
-               atomic_inc(&call_data->finished);
-               func(info);
-       }
+       generic_smp_call_function_single_interrupt();
  }
  
  static void tsb_sync(void *info)
@@ -887,15 +854,18 @@ static void tsb_sync(void *info)
  
  void smp_tsb_sync(struct mm_struct *mm)
  {
-       smp_call_function_mask(tsb_sync, mm, 0, 1, mm->cpu_vm_mask);
+       smp_call_function_mask(mm->cpu_vm_mask, tsb_sync, mm, 1);
  }
  
  extern unsigned long xcall_flush_tlb_mm;
  extern unsigned long xcall_flush_tlb_pending;
  extern unsigned long xcall_flush_tlb_kernel_range;
-extern unsigned long xcall_report_regs;
+extern unsigned long xcall_fetch_glob_regs;
  extern unsigned long xcall_receive_signal;
  extern unsigned long xcall_new_mmu_context_version;
+#ifdef CONFIG_KGDB
+extern unsigned long xcall_kgdb_capture;
+#endif
  
  #ifdef DCACHE_ALIASING_POSSIBLE
  extern unsigned long xcall_flush_dcache_page_cheetah;
@@ -907,7 +877,7 @@ extern atomic_t dcpage_flushes;
  extern atomic_t dcpage_flushes_xcall;
  #endif
  
-static __inline__ void __local_flush_dcache_page(struct page *page)
+static inline void __local_flush_dcache_page(struct page *page)
  {
  #ifdef DCACHE_ALIASING_POSSIBLE
         __flush_dcache_page(page_address(page),
@@ -922,7 +892,6 @@ static __inline__ void __local_flush_dcache_page(struct page *page)
  
  void smp_flush_dcache_page_impl(struct page *page, int cpu)
  {
-       cpumask_t mask = cpumask_of_cpu(cpu);
         int this_cpu;
  
         if (tlb_type == hypervisor)
@@ -938,29 +907,24 @@ void smp_flush_dcache_page_impl(struct page *page, int cpu)
                 __local_flush_dcache_page(page);
         } else if (cpu_online(cpu)) {
                 void *pg_addr = page_address(page);
-               u64 data0;
+               u64 data0 = 0;
  
                 if (tlb_type == spitfire) {
-                       data0 =
-                               ((u64)&xcall_flush_dcache_page_spitfire);
+                       data0 = ((u64)&xcall_flush_dcache_page_spitfire);
                         if (page_mapping(page) != NULL)
                                 data0 |= ((u64)1 << 32);
-                       spitfire_xcall_deliver(data0,
-                                              __pa(pg_addr),
-                                              (u64) pg_addr,
-                                              mask);
                 } else if (tlb_type == cheetah || tlb_type == cheetah_plus) {
  #ifdef DCACHE_ALIASING_POSSIBLE
-                       data0 =
-                               ((u64)&xcall_flush_dcache_page_cheetah);
-                       cheetah_xcall_deliver(data0,
-                                             __pa(pg_addr),
-                                             0, mask);
+                       data0 = ((u64)&xcall_flush_dcache_page_cheetah);
  #endif
                 }
+               if (data0) {
+                       xcall_deliver(data0, __pa(pg_addr),
+                                     (u64) pg_addr, &cpumask_of_cpu(cpu));
  #ifdef CONFIG_DEBUG_DCFLUSH
-               atomic_inc(&dcpage_flushes_xcall);
+                       atomic_inc(&dcpage_flushes_xcall);
  #endif
+               }
         }
  
         put_cpu();
@@ -968,66 +932,41 @@ void smp_flush_dcache_page_impl(struct page *page, int cpu)
  
  void flush_dcache_page_all(struct mm_struct *mm, struct page *page)
  {
-       void *pg_addr = page_address(page);
-       cpumask_t mask = cpu_online_map;
-       u64 data0;
+       void *pg_addr;
         int this_cpu;
+       u64 data0;
  
         if (tlb_type == hypervisor)
                 return;
  
         this_cpu = get_cpu();
  
-       cpu_clear(this_cpu, mask);
-
  #ifdef CONFIG_DEBUG_DCFLUSH
         atomic_inc(&dcpage_flushes);
  #endif
-       if (cpus_empty(mask))
-               goto flush_self;
+       data0 = 0;
+       pg_addr = page_address(page);
         if (tlb_type == spitfire) {
                 data0 = ((u64)&xcall_flush_dcache_page_spitfire);
                 if (page_mapping(page) != NULL)
                         data0 |= ((u64)1 << 32);
-               spitfire_xcall_deliver(data0,
-                                      __pa(pg_addr),
-                                      (u64) pg_addr,
-                                      mask);
         } else if (tlb_type == cheetah || tlb_type == cheetah_plus) {
  #ifdef DCACHE_ALIASING_POSSIBLE
                 data0 = ((u64)&xcall_flush_dcache_page_cheetah);
-               cheetah_xcall_deliver(data0,
-                                     __pa(pg_addr),
-                                     0, mask);
  #endif
         }
+       if (data0) {
+               xcall_deliver(data0, __pa(pg_addr),
+                             (u64) pg_addr, &cpu_online_map);
  #ifdef CONFIG_DEBUG_DCFLUSH
-       atomic_inc(&dcpage_flushes_xcall);
+               atomic_inc(&dcpage_flushes_xcall);
  #endif
- flush_self:
+       }
         __local_flush_dcache_page(page);
  
         put_cpu();
  }
  
-static void __smp_receive_signal_mask(cpumask_t mask)
-{
-       smp_cross_call_masked(&xcall_receive_signal, 0, 0, 0, mask);
-}
-
-void smp_receive_signal(int cpu)
-{
-       cpumask_t mask = cpumask_of_cpu(cpu);
-
-       if (cpu_online(cpu))
-               __smp_receive_signal_mask(mask);
-}
-
-void smp_receive_signal_client(int irq, struct pt_regs *regs)
-{
-       clear_softint(1 << irq);
-}
-
  void smp_new_mmu_context_version_client(int irq, struct pt_regs *regs)
  {
         struct mm_struct *mm;
@@ -1059,9 +998,16 @@ void smp_new_mmu_context_version(void)
         smp_cross_call(&xcall_new_mmu_context_version, 0, 0, 0);
  }
  
-void smp_report_regs(void)
+#ifdef CONFIG_KGDB
+void kgdb_roundup_cpus(unsigned long flags)
  {
-       smp_cross_call(&xcall_report_regs, 0, 0, 0);
+       smp_cross_call(&xcall_kgdb_capture, 0, 0, 0);
+}
+#endif
+
+void smp_fetch_global_regs(void)
+{
+       smp_cross_call(&xcall_fetch_glob_regs, 0, 0, 0);
  }
  
  /* We know that the window frames of the user have been flushed
@@ -1119,7 +1065,7 @@ void smp_flush_tlb_mm(struct mm_struct *mm)
  
         smp_cross_call_masked(&xcall_flush_tlb_mm,
                               ctx, 0, 0,
-                             mm->cpu_vm_mask);
+                             &mm->cpu_vm_mask);
  
  local_flush_and_out:
         __flush_tlb_mm(ctx, SECONDARY_CONTEXT);
@@ -1137,7 +1083,7 @@ void smp_flush_tlb_pending(struct mm_struct *mm, unsigned long nr, unsigned long
         else
                 smp_cross_call_masked(&xcall_flush_tlb_pending,
                                       ctx, nr, (unsigned long) vaddrs,
-                                     mm->cpu_vm_mask);
+                                     &mm->cpu_vm_mask);
  
         __flush_tlb_pending(ctx, nr, vaddrs);
  
@@ -1176,7 +1122,6 @@ void smp_capture(void)
                        smp_processor_id());
  #endif
                 penguins_are_doing_time = 1;
-               membar_storestore_loadstore();
                 atomic_inc(&smp_capture_registry);
                 smp_cross_call(&xcall_capture, 0, 0, 0);
                 while (atomic_read(&smp_capture_registry) != ncpus)
@@ -1196,13 +1141,13 @@ void smp_release(void)
                        smp_processor_id());
  #endif
                 penguins_are_doing_time = 0;
-               membar_storeload_storestore();
+               membar_safe("#StoreLoad");
                 atomic_dec(&smp_capture_registry);
         }
  }
  
-/* Imprisoned penguins run with %pil == 15, but PSTATE_IE set, so they
- * can service tlb flush xcalls...
+/* Imprisoned penguins run with %pil == PIL_NORMAL_MAX, but PSTATE_IE
+ * set, so they can service tlb flush xcalls...
   */
  extern void prom_world(int);
  
@@ -1215,7 +1160,7 @@ void smp_penguin_jailcell(int irq, struct pt_regs *regs)
         __asm__ __volatile__("flushw");
         prom_world(1);
         atomic_inc(&smp_capture_registry);
-       membar_storeload_storestore();
+       membar_safe("#StoreLoad");
         while (penguins_are_doing_time)
                 rmb();
         atomic_dec(&smp_capture_registry);
@@ -1238,6 +1183,16 @@ void __devinit smp_prepare_boot_cpu(void)
  {
  }
  
+void __init smp_setup_processor_id(void)
+{
+       if (tlb_type == spitfire)
+               xcall_deliver_impl = spitfire_xcall_deliver;
+       else if (tlb_type == cheetah || tlb_type == cheetah_plus)
+               xcall_deliver_impl = cheetah_xcall_deliver;
+       else
+               xcall_deliver_impl = hypervisor_xcall_deliver;
+}
+
  void __devinit smp_fill_in_sib_core_maps(void)
  {
         unsigned int i;
@@ -1261,16 +1216,16 @@ void __devinit smp_fill_in_sib_core_maps(void)
         for_each_present_cpu(i) {
                 unsigned int j;
  
-               cpus_clear(cpu_sibling_map[i]);
+               cpus_clear(per_cpu(cpu_sibling_map, i));
                 if (cpu_data(i).proc_id == -1) {
-                       cpu_set(i, cpu_sibling_map[i]);
+                       cpu_set(i, per_cpu(cpu_sibling_map, i));
                         continue;
                 }
  
                 for_each_present_cpu(j) {
                         if (cpu_data(i).proc_id ==
                             cpu_data(j).proc_id)
-                               cpu_set(j, cpu_sibling_map[i]);
+                               cpu_set(j, per_cpu(cpu_sibling_map, i));
                 }
         }
  }
@@ -1342,19 +1297,15 @@ int __cpu_disable(void)
                 cpu_clear(cpu, cpu_core_map[i]);
         cpus_clear(cpu_core_map[cpu]);
  
-       for_each_cpu_mask(i, cpu_sibling_map[cpu])
-               cpu_clear(cpu, cpu_sibling_map[i]);
-       cpus_clear(cpu_sibling_map[cpu]);
+       for_each_cpu_mask(i, per_cpu(cpu_sibling_map, cpu))
+               cpu_clear(cpu, per_cpu(cpu_sibling_map, i));
+       cpus_clear(per_cpu(cpu_sibling_map, cpu));
  
         c = &cpu_data(cpu);
  
         c->core_id = 0;
         c->proc_id = -1;
  
-       spin_lock(&call_lock);
-       cpu_clear(cpu, cpu_online_map);
-       spin_unlock(&call_lock);
-
         smp_wmb();
  
         /* Make sure no interrupts point to this cpu.  */
@@ -1364,6 +1315,10 @@ int __cpu_disable(void)
         mdelay(1);
         local_irq_disable();
  
+       ipi_call_lock();
+       cpu_clear(cpu, cpu_online_map);
+       ipi_call_unlock();
+
         return 0;
  }
  
@@ -1406,7 +1361,13 @@ void __init smp_cpus_done(unsigned int max_cpus)
  
  void smp_send_reschedule(int cpu)
  {
-       smp_receive_signal(cpu);
+       xcall_deliver((u64) &xcall_receive_signal, 0, 0,
+                     &cpumask_of_cpu(cpu));
+}
+
+void smp_receive_signal_client(int irq, struct pt_regs *regs)
+{
+       clear_softint(1 << irq);
  }
  
  /* This is a nop because we capture all other cpus
@@ -1424,7 +1385,7 @@ EXPORT_SYMBOL(__per_cpu_shift);
  
  void __init real_setup_per_cpu_areas(void)
  {
-       unsigned long goal, size, i;
+       unsigned long paddr, goal, size, i;
         char *ptr;
  
         /* Copy section for each CPU (we discard the original) */
@@ -1434,8 +1395,13 @@ void __init real_setup_per_cpu_areas(void)
         for (size = PAGE_SIZE; size < goal; size <<= 1UL)
                 __per_cpu_shift++;
  
-       ptr = alloc_bootmem_pages(size * NR_CPUS);
+       paddr = lmb_alloc(size * NR_CPUS, PAGE_SIZE);
+       if (!paddr) {
+               prom_printf("Cannot allocate per-cpu memory.\n");
+               prom_halt();
+       }
  
+       ptr = __va(paddr);
         __per_cpu_base = ptr - __per_cpu_start;
  
         for (i = 0; i < NR_CPUS; i++, ptr += size)