Merge branches 'misc', 'eeepc-laptop' and 'bugzilla-14445' into release

[safe/jmp/linux-2.6] / arch / x86 / kernel / tlb_uv.c
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c

index d8705e9..1740c85 100644 (file)
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -6,29 +6,60 @@
   *     This code is released under the GNU General Public License version 2 or
   *     later.
   */
-#include <linux/mc146818rtc.h>
+#include <linux/seq_file.h>
  #include <linux/proc_fs.h>
  #include <linux/kernel.h>
  
  #include <asm/mmu_context.h>
-#include <asm/idle.h>
-#include <asm/genapic.h>
-#include <asm/uv/uv_hub.h>
+#include <asm/uv/uv.h>
  #include <asm/uv/uv_mmrs.h>
+#include <asm/uv/uv_hub.h>
  #include <asm/uv/uv_bau.h>
+#include <asm/apic.h>
+#include <asm/idle.h>
  #include <asm/tsc.h>
+#include <asm/irq_vectors.h>
  
-#include <mach_apic.h>
+static struct bau_control      **uv_bau_table_bases __read_mostly;
+static int                     uv_bau_retry_limit __read_mostly;
  
-static struct bau_control **uv_bau_table_bases __read_mostly;
-static int uv_bau_retry_limit __read_mostly;
-static int uv_nshift __read_mostly; /* position of pnode (which is nasid>>1) */
-static unsigned long uv_mmask __read_mostly;
+/* base pnode in this partition */
+static int                     uv_partition_base_pnode __read_mostly;
+
+static unsigned long           uv_mmask __read_mostly;
  
  static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
  static DEFINE_PER_CPU(struct bau_control, bau_control);
  
  /*
+ * Determine the first node on a blade.
+ */
+static int __init blade_to_first_node(int blade)
+{
+       int node, b;
+
+       for_each_online_node(node) {
+               b = uv_node_to_blade_id(node);
+               if (blade == b)
+                       return node;
+       }
+       return -1; /* shouldn't happen */
+}
+
+/*
+ * Determine the apicid of the first cpu on a blade.
+ */
+static int __init blade_to_first_apicid(int blade)
+{
+       int cpu;
+
+       for_each_present_cpu(cpu)
+               if (blade == uv_cpu_to_blade_id(cpu))
+                       return per_cpu(x86_cpu_to_apicid, cpu);
+       return -1;
+}
+
+/*
   * Free a software acknowledge hardware resource by clearing its Pending
   * bit. This will return a reply to the sender.
   * If the message has timed out, a reply has already been sent by the
@@ -37,8 +68,8 @@ static DEFINE_PER_CPU(struct bau_control, bau_control);
   * be sent (the hardware will only do one reply per message).
   */
  static void uv_reply_to_message(int resource,
-                   struct bau_payload_queue_entry *msg,
-                   struct bau_msg_status *msp)
+                               struct bau_payload_queue_entry *msg,
+                               struct bau_msg_status *msp)
  {
         unsigned long dw;
  
@@ -55,16 +86,16 @@ static void uv_reply_to_message(int resource,
   * Other cpu's may come here at the same time for this message.
   */
  static void uv_bau_process_message(struct bau_payload_queue_entry *msg,
-                      int msg_slot, int sw_ack_slot)
+                                  int msg_slot, int sw_ack_slot)
  {
-       int cpu;
         unsigned long this_cpu_mask;
         struct bau_msg_status *msp;
+       int cpu;
  
         msp = __get_cpu_var(bau_control).msg_statuses + msg_slot;
         cpu = uv_blade_processor_id();
         msg->number_of_cpus =
-           uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id()));
+               uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id()));
         this_cpu_mask = 1UL << cpu;
         if (msp->seen_by.bits & this_cpu_mask)
                 return;
@@ -96,11 +127,11 @@ static void uv_bau_process_message(struct bau_payload_queue_entry *msg,
   */
  static int uv_examine_destination(struct bau_control *bau_tablesp, int sender)
  {
-       int i;
-       int j;
-       int count = 0;
         struct bau_payload_queue_entry *msg;
         struct bau_msg_status *msp;
+       int count = 0;
+       int i;
+       int j;
  
         for (msg = bau_tablesp->va_queue_first, i = 0; i < DEST_Q_SIZE;
              msg++, i++) {
@@ -111,7 +142,7 @@ static int uv_examine_destination(struct bau_control *bau_tablesp, int sender)
                                i, msg->address, msg->acknowledge_count,
                                msg->number_of_cpus);
                         for (j = 0; j < msg->number_of_cpus; j++) {
-                               if (!((long)1 << j & msp-> seen_by.bits)) {
+                               if (!((1L << j) & msp->seen_by.bits)) {
                                         count++;
                                         printk("%d ", j);
                                 }
@@ -135,8 +166,7 @@ static int uv_examine_destinations(struct bau_target_nodemask *distribution)
         int count = 0;
  
         sender = smp_processor_id();
-       for (i = 0; i < (sizeof(struct bau_target_nodemask) * BITSPERBYTE);
-            i++) {
+       for (i = 0; i < sizeof(struct bau_target_nodemask) * BITSPERBYTE; i++) {
                 if (!bau_node_isset(i, distribution))
                         continue;
                 count += uv_examine_destination(uv_bau_table_bases[i], sender);
@@ -197,6 +227,7 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
                                 destination_timeouts = 0;
                         }
                 }
+               cpu_relax();
         }
         return FLUSH_COMPLETE;
  }
@@ -206,22 +237,23 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
   *
   * Send a broadcast and wait for a broadcast message to complete.
   *
- * The cpumaskp mask contains the cpus the broadcast was sent to.
+ * The flush_mask contains the cpus the broadcast was sent to.
   *
- * Returns 1 if all remote flushing was done. The mask is zeroed.
- * Returns 0 if some remote flushing remains to be done. The mask is left
- * unchanged.
+ * Returns NULL if all remote flushing was done. The mask is zeroed.
+ * Returns @flush_mask if some remote flushing remains to be done. The
+ * mask will have some bits still set.
   */
-int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc,
-                          cpumask_t *cpumaskp)
+const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
+                                            struct bau_desc *bau_desc,
+                                            struct cpumask *flush_mask)
  {
         int completion_status = 0;
         int right_shift;
-       int bit;
-       int blade;
         int tries = 0;
-       unsigned long index;
+       int pnode;
+       int bit;
         unsigned long mmr_offset;
+       unsigned long index;
         cycles_t time1;
         cycles_t time2;
  
@@ -253,73 +285,83 @@ int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc,
                  * the cpu's, all of which are still in the mask.
                  */
                 __get_cpu_var(ptcstats).ptc_i++;
-               return 0;
+               return flush_mask;
         }
  
         /*
          * Success, so clear the remote cpu's from the mask so we don't
          * use the IPI method of shootdown on them.
          */
-       for_each_cpu_mask(bit, *cpumaskp) {
-               blade = uv_cpu_to_blade_id(bit);
-               if (blade == this_blade)
+       for_each_cpu(bit, flush_mask) {
+               pnode = uv_cpu_to_pnode(bit);
+               if (pnode == this_pnode)
                         continue;
-               cpu_clear(bit, *cpumaskp);
+               cpumask_clear_cpu(bit, flush_mask);
         }
-       if (!cpus_empty(*cpumaskp))
-               return 0;
-       return 1;
+       if (!cpumask_empty(flush_mask))
+               return flush_mask;
+       return NULL;
  }
  
+static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
+
  /**
   * uv_flush_tlb_others - globally purge translation cache of a virtual
   * address or all TLB's
- * @cpumaskp: mask of all cpu's in which the address is to be removed
+ * @cpumask: mask of all cpu's in which the address is to be removed
   * @mm: mm_struct containing virtual address range
   * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
+ * @cpu: the current cpu
   *
   * This is the entry point for initiating any UV global TLB shootdown.
   *
   * Purges the translation caches of all specified processors of the given
   * virtual address, or purges all TLB's on specified processors.
   *
- * The caller has derived the cpumaskp from the mm_struct and has subtracted
- * the local cpu from the mask.  This function is called only if there
- * are bits set in the mask. (e.g. flush_tlb_page())
+ * The caller has derived the cpumask from the mm_struct.  This function
+ * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
   *
- * The cpumaskp is converted into a nodemask of the nodes containing
+ * The cpumask is converted into a nodemask of the nodes containing
   * the cpus.
   *
- * Returns 1 if all remote flushing was done.
- * Returns 0 if some remote flushing remains to be done.
+ * Note that this function should be called with preemption disabled.
+ *
+ * Returns NULL if all remote flushing was done.
+ * Returns pointer to cpumask if some remote flushing remains to be
+ * done.  The returned pointer is valid till preemption is re-enabled.
   */
-int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm,
-       unsigned long va)
+const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
+                                         struct mm_struct *mm,
+                                         unsigned long va, unsigned int cpu)
  {
+       struct cpumask *flush_mask = __get_cpu_var(uv_flush_tlb_mask);
         int i;
         int bit;
-       int blade;
-       int cpu;
-       int this_blade;
+       int pnode;
+       int uv_cpu;
+       int this_pnode;
         int locals = 0;
         struct bau_desc *bau_desc;
  
-       cpu = uv_blade_processor_id();
-       this_blade = uv_numa_blade_id();
+       cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
+
+       uv_cpu = uv_blade_processor_id();
+       this_pnode = uv_hub_info->pnode;
         bau_desc = __get_cpu_var(bau_control).descriptor_base;
-       bau_desc += UV_ITEMS_PER_DESCRIPTOR * cpu;
+       bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu;
  
         bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
  
         i = 0;
-       for_each_cpu_mask(bit, *cpumaskp) {
-               blade = uv_cpu_to_blade_id(bit);
-               BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1));
-               if (blade == this_blade) {
+       for_each_cpu(bit, flush_mask) {
+               pnode = uv_cpu_to_pnode(bit);
+               BUG_ON(pnode > (UV_DISTRIBUTION_SIZE - 1));
+               if (pnode == this_pnode) {
                         locals++;
                         continue;
                 }
-               bau_node_set(blade, &bau_desc->distribution);
+               bau_node_set(pnode - uv_partition_base_pnode,
+                               &bau_desc->distribution);
                 i++;
         }
         if (i == 0) {
@@ -327,17 +369,17 @@ int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm,
                  * no off_node flushing; return status for local node
                  */
                 if (locals)
-                       return 0;
+                       return flush_mask;
                 else
-                       return 1;
+                       return NULL;
         }
         __get_cpu_var(ptcstats).requestor++;
         __get_cpu_var(ptcstats).ntargeted += i;
  
         bau_desc->payload.address = va;
-       bau_desc->payload.sending_cpu = smp_processor_id();
+       bau_desc->payload.sending_cpu = cpu;
  
-       return uv_flush_send_and_wait(cpu, this_blade, bau_desc, cpumaskp);
+       return uv_flush_send_and_wait(uv_cpu, this_pnode, bau_desc, flush_mask);
  }
  
  /*
@@ -356,12 +398,12 @@ int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm,
   */
  void uv_bau_message_interrupt(struct pt_regs *regs)
  {
-       struct bau_payload_queue_entry *pqp;
-       struct bau_payload_queue_entry *msg;
         struct bau_payload_queue_entry *va_queue_first;
         struct bau_payload_queue_entry *va_queue_last;
+       struct bau_payload_queue_entry *msg;
         struct pt_regs *old_regs = set_irq_regs(regs);
-       cycles_t time1, time2;
+       cycles_t time1;
+       cycles_t time2;
         int msg_slot;
         int sw_ack_slot;
         int fw;
@@ -376,13 +418,14 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
  
         local_pnode = uv_blade_to_pnode(uv_numa_blade_id());
  
-       pqp = va_queue_first = __get_cpu_var(bau_control).va_queue_first;
+       va_queue_first = __get_cpu_var(bau_control).va_queue_first;
         va_queue_last = __get_cpu_var(bau_control).va_queue_last;
+
         msg = __get_cpu_var(bau_control).bau_msg_head;
         while (msg->sw_ack_vector) {
                 count++;
                 fw = msg->sw_ack_vector;
-               msg_slot = msg - pqp;
+               msg_slot = msg - va_queue_first;
                 sw_ack_slot = ffs(fw) - 1;
  
                 uv_bau_process_message(msg, msg_slot, sw_ack_slot);
@@ -404,24 +447,58 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
         set_irq_regs(old_regs);
  }
  
+/*
+ * uv_enable_timeouts
+ *
+ * Each target blade (i.e. blades that have cpu's) needs to have
+ * shootdown message timeouts enabled.  The timeout does not cause
+ * an interrupt, but causes an error message to be returned to
+ * the sender.
+ */
  static void uv_enable_timeouts(void)
  {
-       int i;
         int blade;
-       int last_blade;
+       int nblades;
         int pnode;
-       int cur_cpu = 0;
-       unsigned long apicid;
+       unsigned long mmr_image;
+
+       nblades = uv_num_possible_blades();
  
-       last_blade = -1;
-       for_each_online_node(i) {
-               blade = uv_node_to_blade_id(i);
-               if (blade == last_blade)
+       for (blade = 0; blade < nblades; blade++) {
+               if (!uv_blade_nr_possible_cpus(blade))
                         continue;
-               last_blade = blade;
-               apicid = per_cpu(x86_cpu_to_apicid, cur_cpu);
+
                 pnode = uv_blade_to_pnode(blade);
-               cur_cpu += uv_blade_nr_possible_cpus(i);
+               mmr_image =
+                   uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL);
+               /*
+                * Set the timeout period and then lock it in, in three
+                * steps; captures and locks in the period.
+                *
+                * To program the period, the SOFT_ACK_MODE must be off.
+                */
+               mmr_image &= ~((unsigned long)1 <<
+                              UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT);
+               uv_write_global_mmr64
+                   (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
+               /*
+                * Set the 4-bit period.
+                */
+               mmr_image &= ~((unsigned long)0xf <<
+                       UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT);
+               mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD <<
+                            UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT);
+               uv_write_global_mmr64
+                   (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
+               /*
+                * Subsequent reversals of the timebase bit (3) cause an
+                * immediate timeout of one or all INTD resources as
+                * indicated in bits 2:0 (7 causes all of them to timeout).
+                */
+               mmr_image |= ((unsigned long)1 <<
+                             UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT);
+               uv_write_global_mmr64
+                   (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
         }
  }
  
@@ -468,8 +545,7 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
                            stat->requestee, stat->onetlb, stat->alltlb,
                            stat->s_retry, stat->d_retry, stat->ptc_i);
                 seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld\n",
-                          uv_read_global_mmr64(uv_blade_to_pnode
-                                       (uv_cpu_to_blade_id(cpu)),
+                          uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
                                         UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
                            stat->sflush, stat->dflush,
                            stat->retriesok, stat->nomsg,
@@ -484,11 +560,13 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
   * >0: retry limit
   */
  static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
-                 size_t count, loff_t *data)
+                                size_t count, loff_t *data)
  {
         long newmode;
         char optstr[64];
  
+       if (count == 0 || count > sizeof(optstr))
+               return -EINVAL;
         if (copy_from_user(optstr, user, count))
                 return -EFAULT;
         optstr[count - 1] = '\0';
@@ -560,17 +638,13 @@ static int __init uv_ptc_init(void)
         if (!is_uv_system())
                 return 0;
  
-       if (!proc_mkdir("sgi_uv", NULL))
-               return -EINVAL;
-
-       proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL);
+       proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL,
+                                 &proc_uv_ptc_operations);
         if (!proc_uv_ptc) {
                 printk(KERN_ERR "unable to create %s proc entry\n",
                        UV_PTC_BASENAME);
-               remove_proc_entry("sgi_uv", NULL);
                 return -EINVAL;
         }
-       proc_uv_ptc->proc_fops = &proc_uv_ptc_operations;
         return 0;
  }
  
@@ -580,49 +654,48 @@ static int __init uv_ptc_init(void)
  static struct bau_control * __init uv_table_bases_init(int blade, int node)
  {
         int i;
-       int *ip;
         struct bau_msg_status *msp;
         struct bau_control *bau_tabp;
  
         bau_tabp =
             kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, node);
         BUG_ON(!bau_tabp);
+
         bau_tabp->msg_statuses =
             kmalloc_node(sizeof(struct bau_msg_status) *
                          DEST_Q_SIZE, GFP_KERNEL, node);
         BUG_ON(!bau_tabp->msg_statuses);
+
         for (i = 0, msp = bau_tabp->msg_statuses; i < DEST_Q_SIZE; i++, msp++)
                 bau_cpubits_clear(&msp->seen_by, (int)
                                   uv_blade_nr_possible_cpus(blade));
-       bau_tabp->watching =
-           kmalloc_node(sizeof(int) * DEST_NUM_RESOURCES, GFP_KERNEL, node);
-       BUG_ON(!bau_tabp->watching);
-       for (i = 0, ip = bau_tabp->watching; i < DEST_Q_SIZE; i++, ip++) {
-               *ip = 0;
-       }
+
         uv_bau_table_bases[blade] = bau_tabp;
-       return bau_tabsp;
+
+       return bau_tabp;
  }
  
  /*
   * finish the initialization of the per-blade control structures
   */
-static void __init uv_table_bases_finish(int blade, int node, int cur_cpu,
-                                 struct bau_control *bau_tablesp,
-                                 struct bau_desc *adp)
+static void __init
+uv_table_bases_finish(int blade,
+                     struct bau_control *bau_tablesp,
+                     struct bau_desc *adp)
  {
-       int i;
         struct bau_control *bcp;
+       int cpu;
  
-       for (i = cur_cpu; i < (cur_cpu + uv_blade_nr_possible_cpus(blade));
-            i++) {
-               bcp = (struct bau_control *)&per_cpu(bau_control, i);
-               bcp->bau_msg_head = bau_tablesp->va_queue_first;
-               bcp->va_queue_first = bau_tablesp->va_queue_first;
-               bcp->va_queue_last = bau_tablesp->va_queue_last;
-               bcp->watching = bau_tablesp->watching;
-               bcp->msg_statuses = bau_tablesp->msg_statuses;
-               bcp->descriptor_base = adp;
+       for_each_present_cpu(cpu) {
+               if (blade != uv_cpu_to_blade_id(cpu))
+                       continue;
+
+               bcp = (struct bau_control *)&per_cpu(bau_control, cpu);
+               bcp->bau_msg_head       = bau_tablesp->va_queue_first;
+               bcp->va_queue_first     = bau_tablesp->va_queue_first;
+               bcp->va_queue_last      = bau_tablesp->va_queue_last;
+               bcp->msg_statuses       = bau_tablesp->msg_statuses;
+               bcp->descriptor_base    = adp;
         }
  }
  
@@ -636,26 +709,40 @@ uv_activation_descriptor_init(int node, int pnode)
         unsigned long pa;
         unsigned long m;
         unsigned long n;
-       unsigned long mmr_image;
         struct bau_desc *adp;
         struct bau_desc *ad2;
  
-       adp = (struct bau_desc *)
-           kmalloc_node(16384, GFP_KERNEL, node);
+       /*
+        * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
+        * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade
+        */
+       adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
+               UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
         BUG_ON(!adp);
-       pa = __pa((unsigned long)adp);
-       n = pa >> uv_nshift;
+
+       pa = uv_gpa(adp); /* need the real nasid*/
+       n = uv_gpa_to_pnode(pa);
         m = pa & uv_mmask;
-       mmr_image = uv_read_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE);
-       if (mmr_image)
-               uv_write_global_mmr64(pnode, (unsigned long)
-                                     UVH_LB_BAU_SB_DESCRIPTOR_BASE,
-                                     (n << UV_DESC_BASE_PNODE_SHIFT | m));
-       for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) {
+
+       uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
+                             (n << UV_DESC_BASE_PNODE_SHIFT | m));
+
+       /*
+        * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
+        * cpu even though we only use the first one; one descriptor can
+        * describe a broadcast to 256 nodes.
+        */
+       for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
+               i++, ad2++) {
                 memset(ad2, 0, sizeof(struct bau_desc));
                 ad2->header.sw_ack_flag = 1;
-               ad2->header.base_dest_nodeid =
-                   uv_blade_to_pnode(uv_cpu_to_blade_id(0));
+               /*
+                * base_dest_nodeid is the first node in the partition, so
+                * the bit map will indicate partition-relative node numbers.
+                * note that base_dest_nodeid is actually a nasid.
+                */
+               ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
+               ad2->header.dest_subnodeid = 0x10; /* the LB */
                 ad2->header.command = UV_NET_ENDPOINT_INTD;
                 ad2->header.int_both = 1;
                 /*
@@ -669,23 +756,30 @@ uv_activation_descriptor_init(int node, int pnode)
  /*
   * initialize the destination side's receiving buffers
   */
-static struct bau_payload_queue_entry * __init uv_payload_queue_init(int node,
-                               int pnode, struct bau_control *bau_tablesp)
+static struct bau_payload_queue_entry * __init
+uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp)
  {
-       char *cp;
         struct bau_payload_queue_entry *pqp;
+       unsigned long pa;
+       int pn;
+       char *cp;
  
         pqp = (struct bau_payload_queue_entry *) kmalloc_node(
                 (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry),
                 GFP_KERNEL, node);
         BUG_ON(!pqp);
+
         cp = (char *)pqp + 31;
         pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
         bau_tablesp->va_queue_first = pqp;
+       /*
+        * need the pnode of where the memory was really allocated
+        */
+       pa = uv_gpa(pqp);
+       pn = uv_gpa_to_pnode(pa);
         uv_write_global_mmr64(pnode,
                               UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
-                             ((unsigned long)pnode <<
-                              UV_PAYLOADQ_PNODE_SHIFT) |
+                             ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
                               uv_physnodeaddr(pqp));
         uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
                               uv_physnodeaddr(pqp));
@@ -694,14 +788,16 @@ static struct bau_payload_queue_entry * __init uv_payload_queue_init(int node,
                               (unsigned long)
                               uv_physnodeaddr(bau_tablesp->va_queue_last));
         memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE);
+
         return pqp;
  }
  
  /*
   * Initialization of each UV blade's structures
   */
-static int __init uv_init_blade(int blade, int node, int cur_cpu)
+static int __init uv_init_blade(int blade)
  {
+       int node;
         int pnode;
         unsigned long pa;
         unsigned long apicid;
@@ -709,16 +805,17 @@ static int __init uv_init_blade(int blade, int node, int cur_cpu)
         struct bau_payload_queue_entry *pqp;
         struct bau_control *bau_tablesp;
  
+       node = blade_to_first_node(blade);
         bau_tablesp = uv_table_bases_init(blade, node);
         pnode = uv_blade_to_pnode(blade);
         adp = uv_activation_descriptor_init(node, pnode);
         pqp = uv_payload_queue_init(node, pnode, bau_tablesp);
-       uv_table_bases_finish(blade, node, cur_cpu, bau_tablesp, adp);
+       uv_table_bases_finish(blade, bau_tablesp, adp);
         /*
          * the below initialization can't be in firmware because the
          * messaging IRQ will be determined by the OS
          */
-       apicid = per_cpu(x86_cpu_to_apicid, cur_cpu);
+       apicid = blade_to_first_apicid(blade);
         pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
         if ((pa & 0xff) != UV_BAU_MESSAGE) {
                 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
@@ -733,40 +830,36 @@ static int __init uv_init_blade(int blade, int node, int cur_cpu)
  static int __init uv_bau_init(void)
  {
         int blade;
-       int node;
         int nblades;
-       int last_blade;
-       int cur_cpu = 0;
+       int cur_cpu;
  
         if (!is_uv_system())
                 return 0;
  
+       for_each_possible_cpu(cur_cpu)
+               zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
+                                      GFP_KERNEL, cpu_to_node(cur_cpu));
+
         uv_bau_retry_limit = 1;
-       uv_nshift = uv_hub_info->n_val;
-       uv_mmask = (1UL << uv_hub_info->n_val) - 1;
-       nblades = 0;
-       last_blade = -1;
-       for_each_online_node(node) {
-               blade = uv_node_to_blade_id(node);
-               if (blade == last_blade)
-                       continue;
-               last_blade = blade;
-               nblades++;
-       }
+       uv_mmask = (1UL << uv_hub_info->m_val) - 1;
+       nblades = uv_num_possible_blades();
+
         uv_bau_table_bases = (struct bau_control **)
             kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL);
         BUG_ON(!uv_bau_table_bases);
-       last_blade = -1;
-       for_each_online_node(node) {
-               blade = uv_node_to_blade_id(node);
-               if (blade == last_blade)
-                       continue;
-               last_blade = blade;
-               uv_init_blade(blade, node, cur_cpu);
-               cur_cpu += uv_blade_nr_possible_cpus(blade);
-       }
-       set_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1);
+
+       uv_partition_base_pnode = 0x7fffffff;
+       for (blade = 0; blade < nblades; blade++)
+               if (uv_blade_nr_possible_cpus(blade) &&
+                       (uv_blade_to_pnode(blade) < uv_partition_base_pnode))
+                       uv_partition_base_pnode = uv_blade_to_pnode(blade);
+       for (blade = 0; blade < nblades; blade++)
+               if (uv_blade_nr_possible_cpus(blade))
+                       uv_init_blade(blade);
+
+       alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1);
         uv_enable_timeouts();
+
         return 0;
  }
  __initcall(uv_bau_init);