Merge ../linux-2.6-x86
author Rusty Russell <rusty@rustcorp.com.au>
Sat, 13 Dec 2008 11:25:51 +0000 (21:55 +1030)
committer Rusty Russell <rusty@rustcorp.com.au>
Sat, 13 Dec 2008 11:25:51 +0000 (21:55 +1030)
Conflicts:

	arch/x86/kernel/io_apic.c
	kernel/sched.c
	kernel/sched_stats.h

15 files changed:
arch/m32r/Kconfig
arch/x86/kernel/io_apic.c
arch/x86/kernel/irq_32.c
arch/x86/kernel/irq_64.c
arch/x86/kernel/smpboot.c
drivers/xen/events.c
include/linux/interrupt.h
include/linux/irq.h
init/Kconfig
kernel/irq/chip.c
kernel/irq/proc.c
kernel/profile.c
kernel/sched.c
kernel/sched_stats.h
kernel/trace/trace.c

diff --combined arch/m32r/Kconfig
@@@ -10,7 -10,6 +10,7 @@@ config M32
        default y
        select HAVE_IDE
        select HAVE_OPROFILE
 +      select INIT_ALL_POSSIBLE
  
  config SBUS
        bool
@@@ -274,7 -273,7 +274,7 @@@ config GENERIC_CALIBRATE_DELA
        bool
        default y
  
- config SCHED_NO_NO_OMIT_FRAME_POINTER
+ config SCHED_OMIT_FRAME_POINTER
          bool
          default y
  
@@@ -108,8 -108,33 +108,33 @@@ static int __init parse_noapic(char *st
  early_param("noapic", parse_noapic);
  
  struct irq_pin_list;
+ /*
+  * This is performance-critical, we want to do it O(1)
+  *
+  * the indexing order of this array favors 1:1 mappings
+  * between pins and IRQs.
+  */
+ struct irq_pin_list {
+       int apic, pin;
+       struct irq_pin_list *next;
+ };
+ static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
+ {
+       struct irq_pin_list *pin;
+       int node;
+       node = cpu_to_node(cpu);
+       pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
+       printk(KERN_DEBUG "  alloc irq_2_pin on cpu %d node %d\n", cpu, node);
+       return pin;
+ }
  struct irq_cfg {
-       unsigned int irq;
        struct irq_pin_list *irq_2_pin;
        cpumask_t domain;
        cpumask_t old_domain;
  };
  
  /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
+ #ifdef CONFIG_SPARSE_IRQ
+ static struct irq_cfg irq_cfgx[] = {
+ #else
  static struct irq_cfg irq_cfgx[NR_IRQS] = {
-       [0]  = { .irq =  0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
-       [1]  = { .irq =  1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
-       [2]  = { .irq =  2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
-       [3]  = { .irq =  3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
-       [4]  = { .irq =  4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
-       [5]  = { .irq =  5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
-       [6]  = { .irq =  6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
-       [7]  = { .irq =  7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
-       [8]  = { .irq =  8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
-       [9]  = { .irq =  9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
-       [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
-       [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
-       [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
-       [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
-       [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
-       [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
+ #endif
+       [0]  = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
+       [1]  = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
+       [2]  = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
+       [3]  = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
+       [4]  = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
+       [5]  = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
+       [6]  = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
+       [7]  = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
+       [8]  = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
+       [9]  = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
+       [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
+       [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
+       [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
+       [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
+       [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
+       [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
  };
  
- #define for_each_irq_cfg(irq, cfg)            \
-       for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++)
- static struct irq_cfg *irq_cfg(unsigned int irq)
+ void __init arch_early_irq_init(void)
  {
-       return irq < nr_irqs ? irq_cfgx + irq : NULL;
+       struct irq_cfg *cfg;
+       struct irq_desc *desc;
+       int count;
+       int i;
+       cfg = irq_cfgx;
+       count = ARRAY_SIZE(irq_cfgx);
+       for (i = 0; i < count; i++) {
+               desc = irq_to_desc(i);
+               desc->chip_data = &cfg[i];
+       }
  }
  
- static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
+ #ifdef CONFIG_SPARSE_IRQ
+ static struct irq_cfg *irq_cfg(unsigned int irq)
  {
-       return irq_cfg(irq);
+       struct irq_cfg *cfg = NULL;
+       struct irq_desc *desc;
+       desc = irq_to_desc(irq);
+       if (desc)
+               cfg = desc->chip_data;
+       return cfg;
  }
  
- /*
-  * Rough estimation of how many shared IRQs there are, can be changed
-  * anytime.
-  */
- #define MAX_PLUS_SHARED_IRQS NR_IRQS
- #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
+ static struct irq_cfg *get_one_free_irq_cfg(int cpu)
+ {
+       struct irq_cfg *cfg;
+       int node;
  
- /*
-  * This is performance-critical, we want to do it O(1)
-  *
-  * the indexing order of this array favors 1:1 mappings
-  * between pins and IRQs.
-  */
+       node = cpu_to_node(cpu);
  
- struct irq_pin_list {
-       int apic, pin;
-       struct irq_pin_list *next;
- };
+       cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
+       printk(KERN_DEBUG "  alloc irq_cfg on cpu %d node %d\n", cpu, node);
  
- static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE];
- static struct irq_pin_list *irq_2_pin_ptr;
      return cfg;
+ }
  
- static void __init irq_2_pin_init(void)
+ void arch_init_chip_data(struct irq_desc *desc, int cpu)
  {
-       struct irq_pin_list *pin = irq_2_pin_head;
-       int i;
-       for (i = 1; i < PIN_MAP_SIZE; i++)
-               pin[i-1].next = &pin[i];
+       struct irq_cfg *cfg;
  
-       irq_2_pin_ptr = &pin[0];
+       cfg = desc->chip_data;
+       if (!cfg) {
+               desc->chip_data = get_one_free_irq_cfg(cpu);
+               if (!desc->chip_data) {
+                       printk(KERN_ERR "can not alloc irq_cfg\n");
+                       BUG_ON(1);
+               }
+       }
  }
  
- static struct irq_pin_list *get_one_free_irq_2_pin(void)
+ #else
+ static struct irq_cfg *irq_cfg(unsigned int irq)
  {
-       struct irq_pin_list *pin = irq_2_pin_ptr;
+       return irq < nr_irqs ? irq_cfgx + irq : NULL;
+ }
  
-       if (!pin)
-               panic("can not get more irq_2_pin\n");
+ #endif
  
-       irq_2_pin_ptr = pin->next;
-       pin->next = NULL;
-       return pin;
+ static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
+ {
  }
  
  struct io_apic {
@@@ -237,11 -276,10 +276,10 @@@ static inline void io_apic_modify(unsig
        writel(value, &io_apic->data);
  }
  
- static bool io_apic_level_ack_pending(unsigned int irq)
+ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
  {
        struct irq_pin_list *entry;
        unsigned long flags;
-       struct irq_cfg *cfg = irq_cfg(irq);
  
        spin_lock_irqsave(&ioapic_lock, flags);
        entry = cfg->irq_2_pin;
@@@ -323,13 -361,12 +361,12 @@@ static void ioapic_mask_entry(int apic
  }
  
  #ifdef CONFIG_SMP
- static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
+ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
  {
        int apic, pin;
-       struct irq_cfg *cfg;
        struct irq_pin_list *entry;
+       u8 vector = cfg->vector;
  
-       cfg = irq_cfg(irq);
        entry = cfg->irq_2_pin;
        for (;;) {
                unsigned int reg;
        }
  }
  
- static int assign_irq_vector(int irq, cpumask_t mask);
+ static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask);
  
- static void set_ioapic_affinity_irq(unsigned int irq,
-                                   const struct cpumask *mask)
 -static void set_ioapic_affinity_irq_desc(struct irq_desc *desc, cpumask_t mask)
++static void set_ioapic_affinity_irq_desc(struct irq_desc *desc,
++                                       const struct cpumask *mask)
  {
        struct irq_cfg *cfg;
        unsigned long flags;
        unsigned int dest;
        cpumask_t tmp;
-       struct irq_desc *desc;
+       unsigned int irq;
  
 -      cpus_and(tmp, mask, cpu_online_map);
 -      if (cpus_empty(tmp))
 +      if (!cpumask_intersects(mask, cpu_online_mask))
                return;
  
-       cfg = irq_cfg(irq);
-       if (assign_irq_vector(irq, *mask))
+       irq = desc->irq;
+       cfg = desc->chip_data;
 -      if (assign_irq_vector(irq, cfg, mask))
++      if (assign_irq_vector(irq, cfg, *mask))
                return;
  
 -      set_extra_move_desc(desc, mask);
++      set_extra_move_desc(desc, *mask);
 -      cpus_and(tmp, cfg->domain, mask);
 +      cpumask_and(&tmp, &cfg->domain, mask);
        dest = cpu_mask_to_apicid(tmp);
        /*
         * Only the high 8 bits are valid.
         */
        dest = SET_APIC_LOGICAL_ID(dest);
  
-       desc = irq_to_desc(irq);
        spin_lock_irqsave(&ioapic_lock, flags);
-       __target_IO_APIC_irq(irq, dest, cfg->vector);
+       __target_IO_APIC_irq(irq, dest, cfg);
 -      desc->affinity = mask;
 +      cpumask_copy(&desc->affinity, mask);
        spin_unlock_irqrestore(&ioapic_lock, flags);
  }
 -static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
++static void set_ioapic_affinity_irq(unsigned int irq,
++                                  const struct cpumask *mask)
+ {
+       struct irq_desc *desc;
+       desc = irq_to_desc(irq);
+       set_ioapic_affinity_irq_desc(desc, mask);
+ }
  #endif /* CONFIG_SMP */
  
  /*
   * shared ISA-space IRQs, so we have to support them. We are super
   * fast in the common case, and fast for shared ISA-space IRQs.
   */
- static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
  {
-       struct irq_cfg *cfg;
        struct irq_pin_list *entry;
  
-       /* first time to refer irq_cfg, so with new */
-       cfg = irq_cfg_alloc(irq);
        entry = cfg->irq_2_pin;
        if (!entry) {
-               entry = get_one_free_irq_2_pin();
+               entry = get_one_free_irq_2_pin(cpu);
+               if (!entry) {
+                       printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
+                                       apic, pin);
+                       return;
+               }
                cfg->irq_2_pin = entry;
                entry->apic = apic;
                entry->pin = pin;
                entry = entry->next;
        }
  
-       entry->next = get_one_free_irq_2_pin();
+       entry->next = get_one_free_irq_2_pin(cpu);
        entry = entry->next;
        entry->apic = apic;
        entry->pin = pin;
  /*
   * Reroute an IRQ to a different pin.
   */
- static void __init replace_pin_at_irq(unsigned int irq,
+ static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
                                      int oldapic, int oldpin,
                                      int newapic, int newpin)
  {
-       struct irq_cfg *cfg = irq_cfg(irq);
        struct irq_pin_list *entry = cfg->irq_2_pin;
        int replaced = 0;
  
  
        /* why? call replace before add? */
        if (!replaced)
-               add_pin_to_irq(irq, newapic, newpin);
+               add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
  }
  
- static inline void io_apic_modify_irq(unsigned int irq,
+ static inline void io_apic_modify_irq(struct irq_cfg *cfg,
                                int mask_and, int mask_or,
                                void (*final)(struct irq_pin_list *entry))
  {
        int pin;
-       struct irq_cfg *cfg;
        struct irq_pin_list *entry;
  
-       cfg = irq_cfg(irq);
        for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
                unsigned int reg;
                pin = entry->pin;
        }
  }
  
- static void __unmask_IO_APIC_irq(unsigned int irq)
+ static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
  {
-       io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL);
+       io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
  }
  
  #ifdef CONFIG_X86_64
@@@ -492,47 -539,64 +540,64 @@@ void io_apic_sync(struct irq_pin_list *
        readl(&io_apic->data);
  }
  
- static void __mask_IO_APIC_irq(unsigned int irq)
+ static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
  {
-       io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+       io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
  }
  #else /* CONFIG_X86_32 */
- static void __mask_IO_APIC_irq(unsigned int irq)
+ static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
  {
-       io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL);
+       io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL);
  }
  
- static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
+ static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
  {
-       io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER,
+       io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
                        IO_APIC_REDIR_MASKED, NULL);
  }
  
- static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
+ static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
  {
-       io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED,
+       io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
                        IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
  }
  #endif /* CONFIG_X86_32 */
  
- static void mask_IO_APIC_irq (unsigned int irq)
+ static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
  {
+       struct irq_cfg *cfg = desc->chip_data;
        unsigned long flags;
  
+       BUG_ON(!cfg);
        spin_lock_irqsave(&ioapic_lock, flags);
-       __mask_IO_APIC_irq(irq);
+       __mask_IO_APIC_irq(cfg);
        spin_unlock_irqrestore(&ioapic_lock, flags);
  }
  
- static void unmask_IO_APIC_irq (unsigned int irq)
+ static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
  {
+       struct irq_cfg *cfg = desc->chip_data;
        unsigned long flags;
  
        spin_lock_irqsave(&ioapic_lock, flags);
-       __unmask_IO_APIC_irq(irq);
+       __unmask_IO_APIC_irq(cfg);
        spin_unlock_irqrestore(&ioapic_lock, flags);
  }
  
+ static void mask_IO_APIC_irq(unsigned int irq)
+ {
+       struct irq_desc *desc = irq_to_desc(irq);
+       mask_IO_APIC_irq_desc(desc);
+ }
+ static void unmask_IO_APIC_irq(unsigned int irq)
+ {
+       struct irq_desc *desc = irq_to_desc(irq);
+       unmask_IO_APIC_irq_desc(desc);
+ }
  static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
  {
        struct IO_APIC_route_entry entry;
@@@ -809,7 -873,7 +874,7 @@@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vecto
   */
  static int EISA_ELCR(unsigned int irq)
  {
-       if (irq < 16) {
+       if (irq < NR_IRQS_LEGACY) {
                unsigned int port = 0x4d0 + (irq >> 3);
                return (inb(port) >> (irq & 7)) & 1;
        }
@@@ -1034,7 -1098,7 +1099,7 @@@ void unlock_vector_lock(void
        spin_unlock(&vector_lock);
  }
  
- static int __assign_irq_vector(int irq, cpumask_t mask)
+ static int __assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
  {
        /*
         * NOTE! The local APIC isn't very good at handling
        static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
        unsigned int old_vector;
        int cpu;
-       struct irq_cfg *cfg;
  
-       cfg = irq_cfg(irq);
+       if ((cfg->move_in_progress) || cfg->move_cleanup_count)
+               return -EBUSY;
  
        /* Only try and allocate irqs on cpus that are present */
        cpus_and(mask, mask, cpu_online_map);
  
-       if ((cfg->move_in_progress) || cfg->move_cleanup_count)
-               return -EBUSY;
        old_vector = cfg->vector;
        if (old_vector) {
                cpumask_t tmp;
@@@ -1113,24 -1174,22 +1175,22 @@@ next
        return -ENOSPC;
  }
  
- static int assign_irq_vector(int irq, cpumask_t mask)
+ static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
  {
        int err;
        unsigned long flags;
  
        spin_lock_irqsave(&vector_lock, flags);
-       err = __assign_irq_vector(irq, mask);
+       err = __assign_irq_vector(irq, cfg, mask);
        spin_unlock_irqrestore(&vector_lock, flags);
        return err;
  }
  
- static void __clear_irq_vector(int irq)
+ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
  {
        cpumask_t mask;
        int cpu, vector;
  
-       cfg = irq_cfg(irq);
        BUG_ON(!cfg->vector);
  
        vector = cfg->vector;
@@@ -1162,9 -1221,13 +1222,13 @@@ void __setup_vector_irq(int cpu
        /* This function must be called with vector_lock held */
        int irq, vector;
        struct irq_cfg *cfg;
+       struct irq_desc *desc;
  
        /* Mark the inuse vectors */
-       for_each_irq_cfg(irq, cfg) {
+       for_each_irq_desc(irq, desc) {
+               if (!desc)
+                       continue;
+               cfg = desc->chip_data;
                if (!cpu_isset(cpu, cfg->domain))
                        continue;
                vector = cfg->vector;
@@@ -1215,11 -1278,8 +1279,8 @@@ static inline int IO_APIC_irq_trigger(i
  }
  #endif
  
- static void ioapic_register_intr(int irq, unsigned long trigger)
+ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger)
  {
-       struct irq_desc *desc;
-       desc = irq_to_desc(irq);
  
        if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
            trigger == IOAPIC_LEVEL)
@@@ -1311,7 -1371,7 +1372,7 @@@ static int setup_ioapic_entry(int apic
        return 0;
  }
  
- static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
+ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc,
                              int trigger, int polarity)
  {
        struct irq_cfg *cfg;
        if (!IO_APIC_IRQ(irq))
                return;
  
-       cfg = irq_cfg(irq);
+       cfg = desc->chip_data;
  
        mask = TARGET_CPUS;
-       if (assign_irq_vector(irq, mask))
+       if (assign_irq_vector(irq, cfg, mask))
                return;
  
        cpus_and(mask, cfg->domain, mask);
                               cfg->vector)) {
                printk("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
                       mp_ioapics[apic].mp_apicid, pin);
-               __clear_irq_vector(irq);
+               __clear_irq_vector(irq, cfg);
                return;
        }
  
-       ioapic_register_intr(irq, trigger);
-       if (irq < 16)
+       ioapic_register_intr(irq, desc, trigger);
+       if (irq < NR_IRQS_LEGACY)
                disable_8259A_irq(irq);
  
        ioapic_write_entry(apic, pin, entry);
@@@ -1356,6 -1416,9 +1417,9 @@@ static void __init setup_IO_APIC_irqs(v
  {
        int apic, pin, idx, irq;
        int notcon = 0;
+       struct irq_desc *desc;
+       struct irq_cfg *cfg;
+       int cpu = boot_cpu_id;
  
        apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
  
                        if (multi_timer_check(apic, irq))
                                continue;
  #endif
-                       add_pin_to_irq(irq, apic, pin);
+                       desc = irq_to_desc_alloc_cpu(irq, cpu);
+                       if (!desc) {
+                               printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+                               continue;
+                       }
+                       cfg = desc->chip_data;
+                       add_pin_to_irq_cpu(cfg, cpu, apic, pin);
  
-                       setup_IO_APIC_irq(apic, pin, irq,
+                       setup_IO_APIC_irq(apic, pin, irq, desc,
                                        irq_trigger(idx), irq_polarity(idx));
                }
        }
@@@ -1448,6 -1517,7 +1518,7 @@@ __apicdebuginit(void) print_IO_APIC(voi
        union IO_APIC_reg_03 reg_03;
        unsigned long flags;
        struct irq_cfg *cfg;
+       struct irq_desc *desc;
        unsigned int irq;
  
        if (apic_verbosity == APIC_QUIET)
        }
        }
        printk(KERN_DEBUG "IRQ to pin mappings:\n");
-       for_each_irq_cfg(irq, cfg) {
-               struct irq_pin_list *entry = cfg->irq_2_pin;
+       for_each_irq_desc(irq, desc) {
+               struct irq_pin_list *entry;
+               if (!desc)
+                       continue;
+               cfg = desc->chip_data;
+               entry = cfg->irq_2_pin;
                if (!entry)
                        continue;
                printk(KERN_DEBUG "IRQ%d ", irq);
@@@ -2022,14 -2097,16 +2098,16 @@@ static unsigned int startup_ioapic_irq(
  {
        int was_pending = 0;
        unsigned long flags;
+       struct irq_cfg *cfg;
  
        spin_lock_irqsave(&ioapic_lock, flags);
-       if (irq < 16) {
+       if (irq < NR_IRQS_LEGACY) {
                disable_8259A_irq(irq);
                if (i8259A_irq_pending(irq))
                        was_pending = 1;
        }
-       __unmask_IO_APIC_irq(irq);
+       cfg = irq_cfg(irq);
+       __unmask_IO_APIC_irq(cfg);
        spin_unlock_irqrestore(&ioapic_lock, flags);
  
        return was_pending;
@@@ -2092,35 -2169,37 +2170,37 @@@ static DECLARE_DELAYED_WORK(ir_migratio
   * as simple as edge triggered migration and we can do the irq migration
   * with a simple atomic update to IO-APIC RTE.
   */
- static void migrate_ioapic_irq(int irq, cpumask_t mask)
+ static void migrate_ioapic_irq_desc(struct irq_desc *desc, cpumask_t mask)
  {
        struct irq_cfg *cfg;
-       struct irq_desc *desc;
        cpumask_t tmp, cleanup_mask;
        struct irte irte;
        int modify_ioapic_rte;
        unsigned int dest;
        unsigned long flags;
+       unsigned int irq;
  
        cpus_and(tmp, mask, cpu_online_map);
        if (cpus_empty(tmp))
                return;
  
+       irq = desc->irq;
        if (get_irte(irq, &irte))
                return;
  
-       if (assign_irq_vector(irq, mask))
+       cfg = desc->chip_data;
+       if (assign_irq_vector(irq, cfg, mask))
                return;
  
-       cfg = irq_cfg(irq);
+       set_extra_move_desc(desc, mask);
        cpus_and(tmp, cfg->domain, mask);
        dest = cpu_mask_to_apicid(tmp);
  
        modify_ioapic_rte = desc->status & IRQ_LEVEL;
        if (modify_ioapic_rte) {
                spin_lock_irqsave(&ioapic_lock, flags);
-               __target_IO_APIC_irq(irq, dest, cfg->vector);
+               __target_IO_APIC_irq(irq, dest, cfg);
                spin_unlock_irqrestore(&ioapic_lock, flags);
        }
  
        desc->affinity = mask;
  }
  
- static int migrate_irq_remapped_level(int irq)
+ static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
  {
        int ret = -1;
-       struct irq_desc *desc = irq_to_desc(irq);
+       struct irq_cfg *cfg = desc->chip_data;
  
-       mask_IO_APIC_irq(irq);
+       mask_IO_APIC_irq_desc(desc);
  
-       if (io_apic_level_ack_pending(irq)) {
+       if (io_apic_level_ack_pending(cfg)) {
                /*
                 * Interrupt in progress. Migrating irq now will change the
                 * vector information in the IO-APIC RTE and that will confuse
        }
  
        /* everthing is clear. we have right of way */
-       migrate_ioapic_irq(irq, desc->pending_mask);
+       migrate_ioapic_irq_desc(desc, desc->pending_mask);
  
        ret = 0;
        desc->status &= ~IRQ_MOVE_PENDING;
        cpus_clear(desc->pending_mask);
  
  unmask:
-       unmask_IO_APIC_irq(irq);
+       unmask_IO_APIC_irq_desc(desc);
        return ret;
  }
  
@@@ -2178,6 -2258,9 +2259,9 @@@ static void ir_irq_migration(struct wor
        struct irq_desc *desc;
  
        for_each_irq_desc(irq, desc) {
+               if (!desc)
+                       continue;
                if (desc->status & IRQ_MOVE_PENDING) {
                        unsigned long flags;
  
                                continue;
                        }
  
 -                      desc->chip->set_affinity(irq, desc->pending_mask);
 +                      desc->chip->set_affinity(irq, &desc->pending_mask);
                        spin_unlock_irqrestore(&desc->lock, flags);
                }
        }
  /*
   * Migrates the IRQ destination in the process context.
   */
- static void set_ir_ioapic_affinity_irq(unsigned int irq,
-                                      const struct cpumask *mask)
 -static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, cpumask_t mask)
++static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
++                                          const struct cpumask *mask)
  {
-       struct irq_desc *desc = irq_to_desc(irq);
        if (desc->status & IRQ_LEVEL) {
                desc->status |= IRQ_MOVE_PENDING;
 -              desc->pending_mask = mask;
 +              cpumask_copy(&desc->pending_mask, mask);
-               migrate_irq_remapped_level(irq);
+               migrate_irq_remapped_level_desc(desc);
                return;
        }
  
-       migrate_ioapic_irq(irq, *mask);
+       migrate_ioapic_irq_desc(desc, mask);
+ }
 -static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
++static void set_ir_ioapic_affinity_irq(unsigned int irq,
++                                     const struct cpumask *mask)
+ {
+       struct irq_desc *desc = irq_to_desc(irq);
+       set_ir_ioapic_affinity_irq_desc(desc, mask);
  }
  #endif
  
@@@ -2230,6 -2316,9 +2319,9 @@@ asmlinkage void smp_irq_move_cleanup_in
                struct irq_cfg *cfg;
                irq = __get_cpu_var(vector_irq)[vector];
  
+               if (irq == -1)
+                       continue;
                desc = irq_to_desc(irq);
                if (!desc)
                        continue;
@@@ -2251,9 -2340,10 +2343,10 @@@ unlock
        irq_exit();
  }
  
- static void irq_complete_move(unsigned int irq)
+ static void irq_complete_move(struct irq_desc **descp)
  {
-       struct irq_cfg *cfg = irq_cfg(irq);
+       struct irq_desc *desc = *descp;
+       struct irq_cfg *cfg = desc->chip_data;
        unsigned vector, me;
  
        if (likely(!cfg->move_in_progress))
        }
  }
  #else
- static inline void irq_complete_move(unsigned int irq) {}
+ static inline void irq_complete_move(struct irq_desc **descp) {}
  #endif
  #ifdef CONFIG_INTR_REMAP
  static void ack_x2apic_level(unsigned int irq)
  {
@@@ -2283,11 -2374,14 +2377,14 @@@ static void ack_x2apic_edge(unsigned in
  {
        ack_x2APIC_irq();
  }
  #endif
  
  static void ack_apic_edge(unsigned int irq)
  {
-       irq_complete_move(irq);
+       struct irq_desc *desc = irq_to_desc(irq);
+       irq_complete_move(&desc);
        move_native_irq(irq);
        ack_APIC_irq();
  }
@@@ -2296,18 -2390,21 +2393,21 @@@ atomic_t irq_mis_count
  
  static void ack_apic_level(unsigned int irq)
  {
+       struct irq_desc *desc = irq_to_desc(irq);
  #ifdef CONFIG_X86_32
        unsigned long v;
        int i;
  #endif
+       struct irq_cfg *cfg;
        int do_unmask_irq = 0;
  
-       irq_complete_move(irq);
+       irq_complete_move(&desc);
  #ifdef CONFIG_GENERIC_PENDING_IRQ
        /* If we are moving the irq we need to mask it */
-       if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
+       if (unlikely(desc->status & IRQ_MOVE_PENDING)) {
                do_unmask_irq = 1;
-               mask_IO_APIC_irq(irq);
+               mask_IO_APIC_irq_desc(desc);
        }
  #endif
  
        * operation to prevent an edge-triggered interrupt escaping meanwhile.
        * The idea is from Manfred Spraul.  --macro
        */
-       i = irq_cfg(irq)->vector;
+       cfg = desc->chip_data;
+       i = cfg->vector;
  
        v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
  #endif
                 * accurate and is causing problems then it is a hardware bug
                 * and you can go talk to the chipset vendor about it.
                 */
-               if (!io_apic_level_ack_pending(irq))
+               cfg = desc->chip_data;
+               if (!io_apic_level_ack_pending(cfg))
                        move_masked_irq(irq);
-               unmask_IO_APIC_irq(irq);
+               unmask_IO_APIC_irq_desc(desc);
        }
  
  #ifdef CONFIG_X86_32
        if (!(v & (1 << (i & 0x1f)))) {
                atomic_inc(&irq_mis_count);
                spin_lock(&ioapic_lock);
-               __mask_and_edge_IO_APIC_irq(irq);
-               __unmask_and_level_IO_APIC_irq(irq);
+               __mask_and_edge_IO_APIC_irq(cfg);
+               __unmask_and_level_IO_APIC_irq(cfg);
                spin_unlock(&ioapic_lock);
        }
  #endif
@@@ -2431,20 -2530,22 +2533,22 @@@ static inline void init_IO_APIC_traps(v
         * Also, we've got to be careful not to trash gate
         * 0x80, because int 0x80 is hm, kind of importantish. ;)
         */
-       for_each_irq_cfg(irq, cfg) {
-               if (IO_APIC_IRQ(irq) && !cfg->vector) {
+       for_each_irq_desc(irq, desc) {
+               if (!desc)
+                       continue;
+               cfg = desc->chip_data;
+               if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
                        /*
                         * Hmm.. We don't have an entry for this,
                         * so default to an old-fashioned 8259
                         * interrupt if we can..
                         */
-                       if (irq < 16)
+                       if (irq < NR_IRQS_LEGACY)
                                make_8259A_irq(irq);
-                       else {
-                               desc = irq_to_desc(irq);
+                       else
                                /* Strange. Oh, well.. */
                                desc->chip = &no_irq_chip;
-                       }
                }
        }
  }
@@@ -2469,7 -2570,7 +2573,7 @@@ static void unmask_lapic_irq(unsigned i
        apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
  }
  
- static void ack_lapic_irq (unsigned int irq)
+ static void ack_lapic_irq(unsigned int irq)
  {
        ack_APIC_irq();
  }
@@@ -2481,11 -2582,8 +2585,8 @@@ static struct irq_chip lapic_chip __rea
        .ack            = ack_lapic_irq,
  };
  
- static void lapic_register_intr(int irq)
+ static void lapic_register_intr(int irq, struct irq_desc *desc)
  {
-       struct irq_desc *desc;
-       desc = irq_to_desc(irq);
        desc->status &= ~IRQ_LEVEL;
        set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
                                      "edge");
@@@ -2589,7 -2687,9 +2690,9 @@@ int timer_through_8259 __initdata
   */
  static inline void __init check_timer(void)
  {
-       struct irq_cfg *cfg = irq_cfg(0);
+       struct irq_desc *desc = irq_to_desc(0);
+       struct irq_cfg *cfg = desc->chip_data;
+       int cpu = boot_cpu_id;
        int apic1, pin1, apic2, pin2;
        unsigned long flags;
        unsigned int ver;
         * get/set the timer IRQ vector:
         */
        disable_8259A_irq(0);
-       assign_irq_vector(0, TARGET_CPUS);
+       assign_irq_vector(0, cfg, TARGET_CPUS);
  
        /*
         * As IRQ0 is to be enabled in the 8259A, the virtual
                 * Ok, does IRQ0 through the IOAPIC work?
                 */
                if (no_pin1) {
-                       add_pin_to_irq(0, apic1, pin1);
+                       add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
                        setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
                }
-               unmask_IO_APIC_irq(0);
+               unmask_IO_APIC_irq_desc(desc);
                if (timer_irq_works()) {
                        if (nmi_watchdog == NMI_IO_APIC) {
                                setup_nmi();
                /*
                 * legacy devices should be connected to IO APIC #0
                 */
-               replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+               replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
                setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
-               unmask_IO_APIC_irq(0);
+               unmask_IO_APIC_irq_desc(desc);
                enable_8259A_irq(0);
                if (timer_irq_works()) {
                        apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
        apic_printk(APIC_QUIET, KERN_INFO
                    "...trying to set up timer as Virtual Wire IRQ...\n");
  
-       lapic_register_intr(0);
+       lapic_register_intr(0, desc);
        apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector);     /* Fixed mode */
        enable_8259A_irq(0);
  
@@@ -2903,22 -3003,26 +3006,26 @@@ unsigned int create_irq_nr(unsigned in
        unsigned int irq;
        unsigned int new;
        unsigned long flags;
-       struct irq_cfg *cfg_new;
-       irq_want = nr_irqs - 1;
+       struct irq_cfg *cfg_new = NULL;
+       int cpu = boot_cpu_id;
+       struct irq_desc *desc_new = NULL;
  
        irq = 0;
        spin_lock_irqsave(&vector_lock, flags);
-       for (new = irq_want; new > 0; new--) {
+       for (new = irq_want; new < NR_IRQS; new++) {
                if (platform_legacy_irq(new))
                        continue;
-               cfg_new = irq_cfg(new);
-               if (cfg_new && cfg_new->vector != 0)
+               desc_new = irq_to_desc_alloc_cpu(new, cpu);
+               if (!desc_new) {
+                       printk(KERN_INFO "can not get irq_desc for %d\n", new);
+                       continue;
+               }
+               cfg_new = desc_new->chip_data;
+               if (cfg_new->vector != 0)
                        continue;
-               /* check if need to create one */
-               if (!cfg_new)
-                       cfg_new = irq_cfg_alloc(new);
-               if (__assign_irq_vector(new, TARGET_CPUS) == 0)
+               if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0)
                        irq = new;
                break;
        }
  
        if (irq > 0) {
                dynamic_irq_init(irq);
+               /* restore it, in case dynamic_irq_init clear it */
+               if (desc_new)
+                       desc_new->chip_data = cfg_new;
        }
        return irq;
  }
  
+ static int nr_irqs_gsi = NR_IRQS_LEGACY;
  int create_irq(void)
  {
+       unsigned int irq_want;
        int irq;
  
-       irq = create_irq_nr(nr_irqs - 1);
+       irq_want = nr_irqs_gsi;
+       irq = create_irq_nr(irq_want);
  
        if (irq == 0)
                irq = -1;
  void destroy_irq(unsigned int irq)
  {
        unsigned long flags;
+       struct irq_cfg *cfg;
+       struct irq_desc *desc;
  
+       /* store it, in case dynamic_irq_cleanup clear it */
+       desc = irq_to_desc(irq);
+       cfg = desc->chip_data;
        dynamic_irq_cleanup(irq);
+       /* connect back irq_cfg */
+       if (desc)
+               desc->chip_data = cfg;
  
  #ifdef CONFIG_INTR_REMAP
        free_irte(irq);
  #endif
        spin_lock_irqsave(&vector_lock, flags);
-       __clear_irq_vector(irq);
+       __clear_irq_vector(irq, cfg);
        spin_unlock_irqrestore(&vector_lock, flags);
  }
  
@@@ -2967,12 -3085,12 +3088,12 @@@ static int msi_compose_msg(struct pci_d
        unsigned dest;
        cpumask_t tmp;
  
+       cfg = irq_cfg(irq);
        tmp = TARGET_CPUS;
-       err = assign_irq_vector(irq, tmp);
+       err = assign_irq_vector(irq, cfg, tmp);
        if (err)
                return err;
  
-       cfg = irq_cfg(irq);
        cpus_and(tmp, cfg->domain, tmp);
        dest = cpu_mask_to_apicid(tmp);
  
  }
  
  #ifdef CONFIG_SMP
 -static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 +static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
  {
+       struct irq_desc *desc = irq_to_desc(irq);
        struct irq_cfg *cfg;
        struct msi_msg msg;
        unsigned int dest;
        cpumask_t tmp;
-       struct irq_desc *desc;
  
 -      cpus_and(tmp, mask, cpu_online_map);
 -      if (cpus_empty(tmp))
 +      if (!cpumask_intersects(mask, cpu_online_mask))
                return;
  
-       if (assign_irq_vector(irq, *mask))
+       cfg = desc->chip_data;
 -      if (assign_irq_vector(irq, cfg, mask))
++      if (assign_irq_vector(irq, cfg, *mask))
                return;
  
-       cfg = irq_cfg(irq);
 -      set_extra_move_desc(desc, mask);
++      set_extra_move_desc(desc, *mask);
 -      cpus_and(tmp, cfg->domain, mask);
 +      cpumask_and(&tmp, &cfg->domain, mask);
        dest = cpu_mask_to_apicid(tmp);
  
-       read_msi_msg(irq, &msg);
+       read_msi_msg_desc(desc, &msg);
  
        msg.data &= ~MSI_DATA_VECTOR_MASK;
        msg.data |= MSI_DATA_VECTOR(cfg->vector);
        msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
        msg.address_lo |= MSI_ADDR_DEST_ID(dest);
  
-       write_msi_msg(irq, &msg);
-       desc = irq_to_desc(irq);
+       write_msi_msg_desc(desc, &msg);
 -      desc->affinity = mask;
 +      cpumask_copy(&desc->affinity, mask);
  }
  #ifdef CONFIG_INTR_REMAP
  /*
   * Migrate the MSI irq to another cpumask. This migration is
   * done in the process context using interrupt-remapping hardware.
   */
 -static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 +static void ir_set_msi_irq_affinity(unsigned int irq,
 +                                  const struct cpumask *mask)
  {
+       struct irq_desc *desc = irq_to_desc(irq);
        struct irq_cfg *cfg;
        unsigned int dest;
        cpumask_t tmp, cleanup_mask;
        struct irte irte;
-       struct irq_desc *desc;
  
 -      cpus_and(tmp, mask, cpu_online_map);
 -      if (cpus_empty(tmp))
 +      if (!cpumask_intersects(mask, cpu_online_mask))
                return;
  
        if (get_irte(irq, &irte))
                return;
  
-       if (assign_irq_vector(irq, *mask))
+       cfg = desc->chip_data;
 -      if (assign_irq_vector(irq, cfg, mask))
++      if (assign_irq_vector(irq, cfg, *mask))
                return;
  
-       cfg = irq_cfg(irq);
+       set_extra_move_desc(desc, mask);
 -      cpus_and(tmp, cfg->domain, mask);
 +      cpumask_and(&tmp, &cfg->domain, mask);
        dest = cpu_mask_to_apicid(tmp);
  
        irte.vector = cfg->vector;
                cfg->move_in_progress = 0;
        }
  
-       desc = irq_to_desc(irq);
 -      desc->affinity = mask;
 +      cpumask_copy(&desc->affinity, mask);
  }
  #endif
  #endif /* CONFIG_SMP */
  
@@@ -3166,7 -3287,7 +3289,7 @@@ static int msi_alloc_irte(struct pci_de
  }
  #endif
  
- static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
+ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
  {
        int ret;
        struct msi_msg msg;
        if (ret < 0)
                return ret;
  
-       set_irq_msi(irq, desc);
+       set_irq_msi(irq, msidesc);
        write_msi_msg(irq, &msg);
  
  #ifdef CONFIG_INTR_REMAP
        return 0;
  }
  
- static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
- {
-       unsigned int irq;
-       irq = dev->bus->number;
-       irq <<= 8;
-       irq |= dev->devfn;
-       irq <<= 12;
-       return irq;
- }
- int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc)
  {
        unsigned int irq;
        int ret;
        unsigned int irq_want;
  
-       irq_want = build_irq_for_pci_dev(dev) + 0x100;
+       irq_want = nr_irqs_gsi;
        irq = create_irq_nr(irq_want);
        if (irq == 0)
                return -1;
                goto error;
  no_ir:
  #endif
-       ret = setup_msi_irq(dev, desc, irq);
+       ret = setup_msi_irq(dev, msidesc, irq);
        if (ret < 0) {
                destroy_irq(irq);
                return ret;
@@@ -3246,7 -3354,7 +3356,7 @@@ int arch_setup_msi_irqs(struct pci_dev 
  {
        unsigned int irq;
        int ret, sub_handle;
-       struct msi_desc *desc;
+       struct msi_desc *msidesc;
        unsigned int irq_want;
  
  #ifdef CONFIG_INTR_REMAP
        int index = 0;
  #endif
  
-       irq_want = build_irq_for_pci_dev(dev) + 0x100;
+       irq_want = nr_irqs_gsi;
        sub_handle = 0;
-       list_for_each_entry(desc, &dev->msi_list, list) {
-               irq = create_irq_nr(irq_want--);
+       list_for_each_entry(msidesc, &dev->msi_list, list) {
+               irq = create_irq_nr(irq_want);
+               irq_want++;
                if (irq == 0)
                        return -1;
  #ifdef CONFIG_INTR_REMAP
                }
  no_ir:
  #endif
-               ret = setup_msi_irq(dev, desc, irq);
+               ret = setup_msi_irq(dev, msidesc, irq);
                if (ret < 0)
                        goto error;
                sub_handle++;
@@@ -3308,22 -3417,25 +3419,24 @@@ void arch_teardown_msi_irq(unsigned in
  
  #ifdef CONFIG_DMAR
  #ifdef CONFIG_SMP
 -static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
 +static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
  {
+       struct irq_desc *desc = irq_to_desc(irq);
        struct irq_cfg *cfg;
        struct msi_msg msg;
        unsigned int dest;
        cpumask_t tmp;
-       struct irq_desc *desc;
  
 -      cpus_and(tmp, mask, cpu_online_map);
 -      if (cpus_empty(tmp))
 +      if (!cpumask_intersects(mask, cpu_online_mask))
                return;
  
-       if (assign_irq_vector(irq, *mask))
+       cfg = desc->chip_data;
 -      if (assign_irq_vector(irq, cfg, mask))
++      if (assign_irq_vector(irq, cfg, *mask))
                return;
  
-       cfg = irq_cfg(irq);
 -      set_extra_move_desc(desc, mask);
++      set_extra_move_desc(desc, *mask);
 -      cpus_and(tmp, cfg->domain, mask);
 +      cpumask_and(&tmp, &cfg->domain, mask);
        dest = cpu_mask_to_apicid(tmp);
  
        dmar_msi_read(irq, &msg);
        msg.address_lo |= MSI_ADDR_DEST_ID(dest);
  
        dmar_msi_write(irq, &msg);
-       desc = irq_to_desc(irq);
 -      desc->affinity = mask;
 +      cpumask_copy(&desc->affinity, mask);
  }
  #endif /* CONFIG_SMP */
  
  struct irq_chip dmar_msi_type = {
@@@ -3368,22 -3480,25 +3481,24 @@@ int arch_setup_dmar_msi(unsigned int ir
  #ifdef CONFIG_HPET_TIMER
  
  #ifdef CONFIG_SMP
 -static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
 +static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
  {
+       struct irq_desc *desc = irq_to_desc(irq);
        struct irq_cfg *cfg;
-       struct irq_desc *desc;
        struct msi_msg msg;
        unsigned int dest;
        cpumask_t tmp;
  
 -      cpus_and(tmp, mask, cpu_online_map);
 -      if (cpus_empty(tmp))
 +      if (!cpumask_intersects(mask, cpu_online_mask))
                return;
  
-       if (assign_irq_vector(irq, *mask))
+       cfg = desc->chip_data;
 -      if (assign_irq_vector(irq, cfg, mask))
++      if (assign_irq_vector(irq, cfg, *mask))
                return;
  
-       cfg = irq_cfg(irq);
 -      set_extra_move_desc(desc, mask);
++      set_extra_move_desc(desc, *mask);
 -      cpus_and(tmp, cfg->domain, mask);
 +      cpumask_and(&tmp, &cfg->domain, mask);
        dest = cpu_mask_to_apicid(tmp);
  
        hpet_msi_read(irq, &msg);
        msg.address_lo |= MSI_ADDR_DEST_ID(dest);
  
        hpet_msi_write(irq, &msg);
-       desc = irq_to_desc(irq);
 -      desc->affinity = mask;
 +      cpumask_copy(&desc->affinity, mask);
  }
  #endif /* CONFIG_SMP */
  
  struct irq_chip hpet_msi_type = {
@@@ -3449,27 -3564,30 +3564,29 @@@ static void target_ht_irq(unsigned int 
        write_ht_irq_msg(irq, &msg);
  }
  
 -static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
 +static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
  {
+       struct irq_desc *desc = irq_to_desc(irq);
        struct irq_cfg *cfg;
        unsigned int dest;
        cpumask_t tmp;
-       struct irq_desc *desc;
  
 -      cpus_and(tmp, mask, cpu_online_map);
 -      if (cpus_empty(tmp))
 +      if (!cpumask_intersects(mask, cpu_online_mask))
                return;
  
-       if (assign_irq_vector(irq, *mask))
+       cfg = desc->chip_data;
 -      if (assign_irq_vector(irq, cfg, mask))
++      if (assign_irq_vector(irq, cfg, *mask))
                return;
  
-       cfg = irq_cfg(irq);
 -      set_extra_move_desc(desc, mask);
++      set_extra_move_desc(desc, *mask);
 -      cpus_and(tmp, cfg->domain, mask);
 +      cpumask_and(&tmp, &cfg->domain, mask);
        dest = cpu_mask_to_apicid(tmp);
  
        target_ht_irq(irq, dest, cfg->vector);
-       desc = irq_to_desc(irq);
 -      desc->affinity = mask;
 +      cpumask_copy(&desc->affinity, mask);
  }
  #endif
  
  static struct irq_chip ht_irq_chip = {
@@@ -3489,13 -3607,13 +3606,13 @@@ int arch_setup_ht_irq(unsigned int irq
        int err;
        cpumask_t tmp;
  
+       cfg = irq_cfg(irq);
        tmp = TARGET_CPUS;
-       err = assign_irq_vector(irq, tmp);
+       err = assign_irq_vector(irq, cfg, tmp);
        if (!err) {
                struct ht_irq_msg msg;
                unsigned dest;
  
-               cfg = irq_cfg(irq);
                cpus_and(tmp, cfg->domain, tmp);
                dest = cpu_mask_to_apicid(tmp);
  
@@@ -3541,7 -3659,9 +3658,9 @@@ int arch_enable_uv_irq(char *irq_name, 
        unsigned long flags;
        int err;
  
-       err = assign_irq_vector(irq, *eligible_cpu);
+       cfg = irq_cfg(irq);
+       err = assign_irq_vector(irq, cfg, *eligible_cpu);
        if (err != 0)
                return err;
  
                                      irq_name);
        spin_unlock_irqrestore(&vector_lock, flags);
  
-       cfg = irq_cfg(irq);
        mmr_value = 0;
        entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
        BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
@@@ -3603,9 -3721,16 +3720,16 @@@ int __init io_apic_get_redir_entries (i
        return reg_01.bits.entries;
  }
  
- int __init probe_nr_irqs(void)
+ void __init probe_nr_irqs_gsi(void)
  {
-       return NR_IRQS;
+       int idx;
+       int nr = 0;
+       for (idx = 0; idx < nr_ioapics; idx++)
+               nr += io_apic_get_redir_entries(idx) + 1;
+       if (nr > nr_irqs_gsi)
+               nr_irqs_gsi = nr;
  }
  
  /* --------------------------------------------------------------------------
@@@ -3704,19 -3829,31 +3828,31 @@@ int __init io_apic_get_version(int ioap
  
  int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
  {
+       struct irq_desc *desc;
+       struct irq_cfg *cfg;
+       int cpu = boot_cpu_id;
        if (!IO_APIC_IRQ(irq)) {
                apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
                        ioapic);
                return -EINVAL;
        }
  
+       desc = irq_to_desc_alloc_cpu(irq, cpu);
+       if (!desc) {
+               printk(KERN_INFO "can not get irq_desc %d\n", irq);
+               return 0;
+       }
        /*
         * IRQs < 16 are already in the irq_2_pin[] map
         */
-       if (irq >= 16)
-               add_pin_to_irq(irq, ioapic, pin);
+       if (irq >= NR_IRQS_LEGACY) {
+               cfg = desc->chip_data;
+               add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
+       }
  
-       setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity);
+       setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
  
        return 0;
  }
@@@ -3770,9 -3907,10 +3906,10 @@@ void __init setup_ioapic_dest(void
                         * when you have too many devices, because at that time only boot
                         * cpu is online.
                         */
-                       cfg = irq_cfg(irq);
+                       desc = irq_to_desc(irq);
+                       cfg = desc->chip_data;
                        if (!cfg->vector) {
-                               setup_IO_APIC_irq(ioapic, pin, irq,
+                               setup_IO_APIC_irq(ioapic, pin, irq, desc,
                                                  irq_trigger(irq_entry),
                                                  irq_polarity(irq_entry));
                                continue;
                        /*
                         * Honour affinities which have been set in early boot
                         */
-                       desc = irq_to_desc(irq);
                        if (desc->status &
                            (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
                                mask = desc->affinity;
  
  #ifdef CONFIG_INTR_REMAP
                        if (intr_remapping_enabled)
-                               set_ir_ioapic_affinity_irq(irq, &mask);
 -                              set_ir_ioapic_affinity_irq_desc(desc, mask);
++                              set_ir_ioapic_affinity_irq_desc(desc, &mask);
                        else
  #endif
-                               set_ioapic_affinity_irq(irq, &mask);
 -                              set_ioapic_affinity_irq_desc(desc, mask);
++                              set_ioapic_affinity_irq_desc(desc, &mask);
                }
  
        }
@@@ -3843,7 -3980,6 +3979,6 @@@ void __init ioapic_init_mappings(void
        struct resource *ioapic_res;
        int i;
  
-       irq_2_pin_init();
        ioapic_res = ioapic_setup_resources();
        for (i = 0; i < nr_ioapics; i++) {
                if (smp_found_config) {
diff --combined arch/x86/kernel/irq_32.c
@@@ -242,6 -242,8 +242,8 @@@ void fixup_irqs(cpumask_t map
        for_each_irq_desc(irq, desc) {
                cpumask_t mask;
  
+               if (!desc)
+                       continue;
                if (irq == 2)
                        continue;
  
                        mask = map;
                }
                if (desc->chip->set_affinity)
 -                      desc->chip->set_affinity(irq, mask);
 +                      desc->chip->set_affinity(irq, &mask);
                else if (desc->action && !(warned++))
                        printk("Cannot set affinity for irq %i\n", irq);
        }
diff --combined arch/x86/kernel/irq_64.c
@@@ -94,6 -94,8 +94,8 @@@ void fixup_irqs(cpumask_t map
                int break_affinity = 0;
                int set_affinity = 1;
  
+               if (!desc)
+                       continue;
                if (irq == 2)
                        continue;
  
                        desc->chip->mask(irq);
  
                if (desc->chip->set_affinity)
 -                      desc->chip->set_affinity(irq, mask);
 +                      desc->chip->set_affinity(irq, &mask);
                else if (!(warned++))
                        set_affinity = 0;
  
@@@ -62,6 -62,7 +62,7 @@@
  #include <asm/mtrr.h>
  #include <asm/vmi.h>
  #include <asm/genapic.h>
+ #include <asm/setup.h>
  #include <linux/mc146818rtc.h>
  
  #include <mach_apic.h>
@@@ -101,8 -102,14 +102,8 @@@ EXPORT_SYMBOL(smp_num_siblings)
  /* Last level cache ID of each logical CPU */
  DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
  
 -/* bitmap of online cpus */
 -cpumask_t cpu_online_map __read_mostly;
 -EXPORT_SYMBOL(cpu_online_map);
 -
  cpumask_t cpu_callin_map;
  cpumask_t cpu_callout_map;
 -cpumask_t cpu_possible_map;
 -EXPORT_SYMBOL(cpu_possible_map);
  
  /* representing HT siblings of each logical CPU */
  DEFINE_PER_CPU(cpumask_t, cpu_sibling_map);
@@@ -530,7 -537,7 +531,7 @@@ static void impress_friends(void
        pr_debug("Before bogocount - setting activated=1.\n");
  }
  
- static inline void __inquire_remote_apic(int apicid)
+ void __inquire_remote_apic(int apicid)
  {
        unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
        char *names[] = { "ID", "VERSION", "SPIV" };
        }
  }
  
- #ifdef WAKE_SECONDARY_VIA_NMI
  /*
   * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
   * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
   * won't ... remember to clear down the APIC, etc later.
   */
- static int __devinit
- wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
+ int __devinit
+ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
  {
        unsigned long send_status, accept_status = 0;
        int maxlvt;
         * Give the other CPU some time to accept the IPI.
         */
        udelay(200);
-       if (APIC_INTEGRATED(apic_version[phys_apicid])) {
+       if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
                maxlvt = lapic_get_maxlvt();
                if (maxlvt > 3)                 /* Due to the Pentium erratum 3AP.  */
                        apic_write(APIC_ESR, 0);
  
        return (send_status | accept_status);
  }
- #endif        /* WAKE_SECONDARY_VIA_NMI */
  
- #ifdef WAKE_SECONDARY_VIA_INIT
- static int __devinit
- wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
+ int __devinit
+ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
  {
        unsigned long send_status, accept_status = 0;
        int maxlvt, num_starts, j;
  
        return (send_status | accept_status);
  }
- #endif        /* WAKE_SECONDARY_VIA_INIT */
  
  struct create_idle {
        struct work_struct work;
diff --combined drivers/xen/events.c
@@@ -141,8 -141,12 +141,12 @@@ static void init_evtchn_cpu_bindings(vo
        int i;
  
        /* By default all event channels notify CPU#0. */
-       for_each_irq_desc(i, desc)
+       for_each_irq_desc(i, desc) {
+               if (!desc)
+                       continue;
                desc->affinity = cpumask_of_cpu(0);
+       }
  #endif
  
        memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
@@@ -231,7 -235,7 +235,7 @@@ static int find_unbound_irq(void
        int irq;
  
        /* Only allocate from dynirq range */
-       for_each_irq_nr(irq)
+       for (irq = 0; irq < nr_irqs; irq++)
                if (irq_bindcount[irq] == 0)
                        break;
  
@@@ -579,7 -583,7 +583,7 @@@ void rebind_evtchn_irq(int evtchn, int 
        spin_unlock(&irq_mapping_update_lock);
  
        /* new event channels are always bound to cpu 0 */
 -      irq_set_affinity(irq, cpumask_of_cpu(0));
 +      irq_set_affinity(irq, cpumask_of(0));
  
        /* Unmask the event channel. */
        enable_irq(irq);
@@@ -608,9 -612,9 +612,9 @@@ static void rebind_irq_to_cpu(unsigned 
  }
  
  
 -static void set_affinity_irq(unsigned irq, cpumask_t dest)
 +static void set_affinity_irq(unsigned irq, const struct cpumask *dest)
  {
 -      unsigned tcpu = first_cpu(dest);
 +      unsigned tcpu = cpumask_first(dest);
        rebind_irq_to_cpu(irq, tcpu);
  }
  
@@@ -792,7 -796,7 +796,7 @@@ void xen_irq_resume(void
                mask_evtchn(evtchn);
  
        /* No IRQ <-> event-channel mappings. */
-       for_each_irq_nr(irq)
+       for (irq = 0; irq < nr_irqs; irq++)
                irq_info[irq].evtchn = 0; /* zap event-channel binding */
  
        for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
@@@ -824,7 -828,7 +828,7 @@@ void __init xen_init_IRQ(void
                mask_evtchn(i);
  
        /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
-       for_each_irq_nr(i)
+       for (i = 0; i < nr_irqs; i++)
                irq_bindcount[i] = 0;
  
        irq_ctx_init(smp_processor_id());
@@@ -14,6 -14,8 +14,8 @@@
  #include <linux/irqflags.h>
  #include <linux/smp.h>
  #include <linux/percpu.h>
+ #include <linux/irqnr.h>
  #include <asm/atomic.h>
  #include <asm/ptrace.h>
  #include <asm/system.h>
@@@ -109,13 -111,13 +111,13 @@@ extern void enable_irq(unsigned int irq
  
  extern cpumask_t irq_default_affinity;
  
 -extern int irq_set_affinity(unsigned int irq, cpumask_t cpumask);
 +extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask);
  extern int irq_can_set_affinity(unsigned int irq);
  extern int irq_select_affinity(unsigned int irq);
  
  #else /* CONFIG_SMP */
  
 -static inline int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
 +static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m)
  {
        return -EINVAL;
  }
diff --combined include/linux/irq.h
@@@ -113,8 -113,7 +113,8 @@@ struct irq_chip 
        void            (*eoi)(unsigned int irq);
  
        void            (*end)(unsigned int irq);
 -      void            (*set_affinity)(unsigned int irq, cpumask_t dest);
 +      void            (*set_affinity)(unsigned int irq,
 +                                      const struct cpumask *dest);
        int             (*retrigger)(unsigned int irq);
        int             (*set_type)(unsigned int irq, unsigned int flow_type);
        int             (*set_wake)(unsigned int irq, unsigned int on);
        const char      *typename;
  };
  
+ struct timer_rand_state;
+ struct irq_2_iommu;
  /**
   * struct irq_desc - interrupt descriptor
   * @irq:              interrupt number for this descriptor
   */
  struct irq_desc {
        unsigned int            irq;
+ #ifdef CONFIG_SPARSE_IRQ
+       struct timer_rand_state *timer_rand_state;
+       unsigned int            *kstat_irqs;
+ # ifdef CONFIG_INTR_REMAP
+       struct irq_2_iommu      *irq_2_iommu;
+ # endif
+ #endif
        irq_flow_handler_t      handle_irq;
        struct irq_chip         *chip;
        struct msi_desc         *msi_desc;
        const char              *name;
  } ____cacheline_internodealigned_in_smp;
  
+ extern void early_irq_init(void);
+ extern void arch_early_irq_init(void);
+ extern void arch_init_chip_data(struct irq_desc *desc, int cpu);
+ extern void arch_init_copy_chip_data(struct irq_desc *old_desc,
+                                       struct irq_desc *desc, int cpu);
+ extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc);
  
+ #ifndef CONFIG_SPARSE_IRQ
  extern struct irq_desc irq_desc[NR_IRQS];
  
  static inline struct irq_desc *irq_to_desc(unsigned int irq)
  {
-       return (irq < nr_irqs) ? irq_desc + irq : NULL;
+       return (irq < NR_IRQS) ? irq_desc + irq : NULL;
+ }
+ static inline struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+ {
+       return irq_to_desc(irq);
  }
  
+ #else
+ extern struct irq_desc *irq_to_desc(unsigned int irq);
+ extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu);
+ extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu);
+ # define for_each_irq_desc(irq, desc)         \
+       for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs; irq++, desc = irq_to_desc(irq))
+ # define for_each_irq_desc_reverse(irq, desc)                          \
+       for (irq = nr_irqs - 1, desc = irq_to_desc(irq); irq >= 0; irq--, desc = irq_to_desc(irq))
+ #define kstat_irqs_this_cpu(DESC) \
+       ((DESC)->kstat_irqs[smp_processor_id()])
+ #define kstat_incr_irqs_this_cpu(irqno, DESC) \
+       ((DESC)->kstat_irqs[smp_processor_id()]++)
+ #endif
  /*
   * Migration helpers for obsolete names, they will go away:
   */
@@@ -381,6 -418,11 +419,11 @@@ extern int set_irq_msi(unsigned int irq
  #define get_irq_data(irq)     (irq_to_desc(irq)->handler_data)
  #define get_irq_msi(irq)      (irq_to_desc(irq)->msi_desc)
  
+ #define get_irq_desc_chip(desc)               ((desc)->chip)
+ #define get_irq_desc_chip_data(desc)  ((desc)->chip_data)
+ #define get_irq_desc_data(desc)               ((desc)->handler_data)
+ #define get_irq_desc_msi(desc)                ((desc)->msi_desc)
  #endif /* CONFIG_GENERIC_HARDIRQS */
  
  #endif /* !CONFIG_S390 */
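
With CONFIG_SPARSE_IRQ, irq_to_desc() can return NULL for a slot that has no descriptor allocated, which is why the for_each_irq_desc() callers touched elsewhere in this merge (drivers/xen/events.c, kernel/irq/proc.c) gain an explicit NULL check. A minimal sketch of the resulting iteration idiom; the function name is illustrative and not part of this merge:

	#include <linux/irq.h>

	static void walk_irq_descs(void)
	{
		struct irq_desc *desc;
		int irq;

		for_each_irq_desc(irq, desc) {
			/* sparse IRQ: unallocated slots show up as NULL */
			if (!desc)
				continue;
			/* ... inspect desc, e.g. desc->name ... */
		}
	}
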
diff --combined init/Kconfig
@@@ -808,6 -808,7 +808,7 @@@ config TRACEPOINT
  
  config MARKERS
        bool "Activate markers"
+       depends on TRACEPOINTS
        help
          Place an empty function call at each marker site. Can be
          dynamically changed for a probe function.
@@@ -916,15 -917,6 +917,15 @@@ config KMO
  
  endif # MODULES
  
 +config INIT_ALL_POSSIBLE
 +      bool
 +      help
 +        Back when each arch used to define their own cpu_online_map and
 +        cpu_possible_map, some of them chose to initialize cpu_possible_map
 +        with all 1s, and others with all 0s.  When they were centralised,
 +        it was better to provide this option than to break all the archs
 +        and have several arch maintainers pursuing me down dark alleys.
 +
  config STOP_MACHINE
        bool
        default y
diff --combined kernel/irq/chip.c
   */
  void dynamic_irq_init(unsigned int irq)
  {
-       struct irq_desc *desc = irq_to_desc(irq);
+       struct irq_desc *desc;
        unsigned long flags;
  
+       desc = irq_to_desc(irq);
        if (!desc) {
                WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
                return;
@@@ -45,7 -46,7 +46,7 @@@
        desc->irq_count = 0;
        desc->irqs_unhandled = 0;
  #ifdef CONFIG_SMP
 -      cpus_setall(desc->affinity);
 +      cpumask_setall(&desc->affinity);
  #endif
        spin_unlock_irqrestore(&desc->lock, flags);
  }
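
For reference, the pointer-based cpumask calls that this merge substitutes for the old by-value API, collected from the conversions in the surrounding hunks (cpus_setall() above, the scheduler and Xen hunks below):

	/*
	 * old (cpumask_t by value)        new (struct cpumask pointer)
	 * cpus_setall(mask)               cpumask_setall(&mask)
	 * cpus_clear(mask)                cpumask_clear(&mask)
	 * cpu_set(cpu, mask)              cpumask_set_cpu(cpu, &mask)
	 * cpu_clear(cpu, mask)            cpumask_clear_cpu(cpu, &mask)
	 * cpu_isset(cpu, mask)            cpumask_test_cpu(cpu, &mask)
	 * cpus_and(dst, a, b)             cpumask_and(&dst, &a, &b)
	 * cpus_intersects(a, b)           cpumask_intersects(&a, &b)
	 * cpus_weight(mask)               cpumask_weight(&mask)
	 * first_cpu(mask)                 cpumask_first(&mask)
	 * for_each_cpu_mask_nr(i, mask)   for_each_cpu(i, &mask)
	 */
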
diff --combined kernel/irq/proc.c
@@@ -40,42 -40,33 +40,42 @@@ static ssize_t irq_affinity_proc_write(
                const char __user *buffer, size_t count, loff_t *pos)
  {
        unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
 -      cpumask_t new_value;
 +      cpumask_var_t new_value;
        int err;
  
        if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity ||
            irq_balancing_disabled(irq))
                return -EIO;
  
 +      if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
 +              return -ENOMEM;
 +
        err = cpumask_parse_user(buffer, count, new_value);
        if (err)
 -              return err;
 +              goto free_cpumask;
  
 -      if (!is_affinity_mask_valid(new_value))
 -              return -EINVAL;
 +      if (!is_affinity_mask_valid(*new_value)) {
 +              err = -EINVAL;
 +              goto free_cpumask;
 +      }
  
        /*
         * Do not allow disabling IRQs completely - it's a too easy
         * way to make the system unusable accidentally :-) At least
         * one online CPU still has to be targeted.
         */
 -      if (!cpus_intersects(new_value, cpu_online_map))
 +      if (!cpumask_intersects(new_value, cpu_online_mask)) {
                /* Special case for empty set - allow the architecture
                   code to set default SMP affinity. */
 -              return irq_select_affinity_usr(irq) ? -EINVAL : count;
 -
 -      irq_set_affinity(irq, new_value);
 -
 -      return count;
 +              err = irq_select_affinity_usr(irq) ? -EINVAL : count;
 +      } else {
 +              irq_set_affinity(irq, new_value);
 +              err = count;
 +      }
 +
 +free_cpumask:
 +      free_cpumask_var(new_value);
 +      return err;
  }
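
The hunk above is the conversion this merge repeats throughout: an on-stack cpumask_t becomes a cpumask_var_t that must be allocated before use and freed on every exit path. A minimal sketch of the pattern in isolation, using only the APIs already present above; the function name and GFP flag are illustrative:

	#include <linux/cpumask.h>
	#include <linux/interrupt.h>

	static int apply_user_affinity(unsigned int irq,
				       const char __user *buf, size_t count)
	{
		cpumask_var_t mask;
		int err;

		/* with CONFIG_CPUMASK_OFFSTACK=y this really allocates and can fail */
		if (!alloc_cpumask_var(&mask, GFP_KERNEL))
			return -ENOMEM;

		err = cpumask_parse_user(buf, count, mask);
		if (!err)
			err = irq_set_affinity(irq, mask);

		free_cpumask_var(mask);	/* a no-op when the mask is embedded */
		return err;
	}
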
  
  static int irq_affinity_proc_open(struct inode *inode, struct file *file)
@@@ -104,7 -95,7 +104,7 @@@ static ssize_t default_affinity_write(s
        cpumask_t new_value;
        int err;
  
 -      err = cpumask_parse_user(buffer, count, new_value);
 +      err = cpumask_parse_user(buffer, count, &new_value);
        if (err)
                return err;
  
@@@ -252,7 -243,11 +252,11 @@@ void init_irq_proc(void
        /*
         * Create entries for all existing IRQs.
         */
-       for_each_irq_desc(irq, desc)
+       for_each_irq_desc(irq, desc) {
+               if (!desc)
+                       continue;
                register_irq_proc(irq, desc);
+       }
  }
  
diff --combined kernel/profile.c
@@@ -442,7 -442,7 +442,7 @@@ void profile_tick(int type
  static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
                        int count, int *eof, void *data)
  {
 -      int len = cpumask_scnprintf(page, count, *(cpumask_t *)data);
 +      int len = cpumask_scnprintf(page, count, (cpumask_t *)data);
        if (count - len < 2)
                return -EINVAL;
        len += sprintf(page + len, "\n");
@@@ -456,7 -456,7 +456,7 @@@ static int prof_cpu_mask_write_proc(str
        unsigned long full_count = count, err;
        cpumask_t new_value;
  
 -      err = cpumask_parse_user(buffer, count, new_value);
 +      err = cpumask_parse_user(buffer, count, &new_value);
        if (err)
                return err;
  
@@@ -544,7 -544,7 +544,7 @@@ static const struct file_operations pro
  };
  
  #ifdef CONFIG_SMP
- static inline void profile_nop(void *unused)
+ static void profile_nop(void *unused)
  {
  }
  
diff --combined kernel/sched.c
   */
  #define RUNTIME_INF   ((u64)~0ULL)
  
+ DEFINE_TRACE(sched_wait_task);
+ DEFINE_TRACE(sched_wakeup);
+ DEFINE_TRACE(sched_wakeup_new);
+ DEFINE_TRACE(sched_switch);
+ DEFINE_TRACE(sched_migrate_task);
  #ifdef CONFIG_SMP
  /*
   * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
@@@ -261,6 -267,10 +267,10 @@@ struct task_group 
        struct cgroup_subsys_state css;
  #endif
  
+ #ifdef CONFIG_USER_SCHED
+       uid_t uid;
+ #endif
  #ifdef CONFIG_FAIR_GROUP_SCHED
        /* schedulable entities of this group on each cpu */
        struct sched_entity **se;
  
  #ifdef CONFIG_USER_SCHED
  
+ /* Helper function to pass uid information to create_sched_user() */
+ void set_tg_uid(struct user_struct *user)
+ {
+       user->tg->uid = user->uid;
+ }
  /*
   * Root task group.
   *    Every UID task group (including init_task_group aka UID-0) will
@@@ -481,14 -497,14 +497,14 @@@ struct rt_rq 
   */
  struct root_domain {
        atomic_t refcount;
-       cpumask_t span;
-       cpumask_t online;
+       cpumask_var_t span;
+       cpumask_var_t online;
  
        /*
         * The "RT overload" flag: it gets set if a CPU has more than
         * one runnable RT task.
         */
-       cpumask_t rto_mask;
+       cpumask_var_t rto_mask;
        atomic_t rto_count;
  #ifdef CONFIG_SMP
        struct cpupri cpupri;
@@@ -703,45 -719,18 +719,18 @@@ static __read_mostly char *sched_feat_n
  
  #undef SCHED_FEAT
  
- static int sched_feat_open(struct inode *inode, struct file *filp)
+ static int sched_feat_show(struct seq_file *m, void *v)
  {
        int i;
  
        for (i = 0; sched_feat_names[i]; i++) {
-               len += strlen(sched_feat_names[i]);
-               len += 4;
-       }
-       buf = kmalloc(len + 2, GFP_KERNEL);
-       if (!buf)
-               return -ENOMEM;
-       for (i = 0; sched_feat_names[i]; i++) {
-               if (sysctl_sched_features & (1UL << i))
-                       r += sprintf(buf + r, "%s ", sched_feat_names[i]);
-               else
-                       r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
+               if (!(sysctl_sched_features & (1UL << i)))
+                       seq_puts(m, "NO_");
+               seq_printf(m, "%s ", sched_feat_names[i]);
        }
+       seq_puts(m, "\n");
  
-       r += sprintf(buf + r, "\n");
-       WARN_ON(r >= len + 2);
-       r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-       kfree(buf);
-       return r;
+       return 0;
  }
  
  static ssize_t
@@@ -786,10 -775,17 +775,17 @@@ sched_feat_write(struct file *filp, con
        return cnt;
  }
  
+ static int sched_feat_open(struct inode *inode, struct file *filp)
+ {
+       return single_open(filp, sched_feat_show, NULL);
+ }
  static struct file_operations sched_feat_fops = {
-       .open   = sched_feat_open,
-       .read   = sched_feat_read,
-       .write  = sched_feat_write,
+       .open           = sched_feat_open,
+       .write          = sched_feat_write,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
  };
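
The conversion above drops the hand-rolled read buffer in favour of the single_open() seq_file idiom: the open method binds a show callback, and seq_read/seq_lseek/single_release handle buffering, offsets and teardown. A minimal sketch of the same idiom for a hypothetical read-only file (all names illustrative):

	#include <linux/fs.h>
	#include <linux/seq_file.h>

	static int example_show(struct seq_file *m, void *v)
	{
		seq_printf(m, "state %d\n", 42);	/* emit the whole file in one pass */
		return 0;
	}

	static int example_open(struct inode *inode, struct file *filp)
	{
		return single_open(filp, example_show, NULL);
	}

	static const struct file_operations example_fops = {
		.open		= example_open,
		.read		= seq_read,
		.llseek		= seq_lseek,
		.release	= single_release,
	};
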
  
  static __init int sched_init_debug(void)
@@@ -1474,27 -1470,13 +1470,13 @@@ static voi
  update_group_shares_cpu(struct task_group *tg, int cpu,
                        unsigned long sd_shares, unsigned long sd_rq_weight)
  {
-       int boost = 0;
        unsigned long shares;
        unsigned long rq_weight;
  
        if (!tg->se[cpu])
                return;
  
-       rq_weight = tg->cfs_rq[cpu]->load.weight;
-       /*
-        * If there are currently no tasks on the cpu pretend there is one of
-        * average load so that when a new task gets to run here it will not
-        * get delayed by group starvation.
-        */
-       if (!rq_weight) {
-               boost = 1;
-               rq_weight = NICE_0_LOAD;
-       }
-       if (unlikely(rq_weight > sd_rq_weight))
-               rq_weight = sd_rq_weight;
+       rq_weight = tg->cfs_rq[cpu]->rq_weight;
  
        /*
         *           \Sum shares * rq_weight
         *               \Sum rq_weight
         *
         */
-       shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+       shares = (sd_shares * rq_weight) / sd_rq_weight;
        shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
  
        if (abs(shares - tg->se[cpu]->load.weight) >
                unsigned long flags;
  
                spin_lock_irqsave(&rq->lock, flags);
-               /*
-                * record the actual number of shares, not the boosted amount.
-                */
-               tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-               tg->cfs_rq[cpu]->rq_weight = rq_weight;
+               tg->cfs_rq[cpu]->shares = shares;
  
                __set_se_shares(tg->se[cpu], shares);
                spin_unlock_irqrestore(&rq->lock, flags);
   */
  static int tg_shares_up(struct task_group *tg, void *data)
  {
-       unsigned long rq_weight = 0;
+       unsigned long weight, rq_weight = 0;
        unsigned long shares = 0;
        struct sched_domain *sd = data;
        int i;
  
-       for_each_cpu_mask(i, sd->span) {
-               rq_weight += tg->cfs_rq[i]->load.weight;
+       for_each_cpu(i, sched_domain_span(sd)) {
+               /*
+                * If there are currently no tasks on the cpu pretend there
+                * is one of average load so that when a new task gets to
+                * run here it will not get delayed by group starvation.
+                */
+               weight = tg->cfs_rq[i]->load.weight;
+               if (!weight)
+                       weight = NICE_0_LOAD;
+               tg->cfs_rq[i]->rq_weight = weight;
+               rq_weight += weight;
                shares += tg->cfs_rq[i]->shares;
        }
  
        if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
                shares = tg->shares;
  
-       if (!rq_weight)
-               rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
-       for_each_cpu_mask(i, sd->span)
+       for_each_cpu(i, sched_domain_span(sd))
                update_group_shares_cpu(tg, i, shares, rq_weight);
  
        return 0;
@@@ -1612,6 -1597,39 +1597,39 @@@ static inline void update_shares_locked
  
  #endif
  
+ /*
+  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+  */
+ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+       __releases(this_rq->lock)
+       __acquires(busiest->lock)
+       __acquires(this_rq->lock)
+ {
+       int ret = 0;
+       if (unlikely(!irqs_disabled())) {
+               /* printk() doesn't work well under rq->lock */
+               spin_unlock(&this_rq->lock);
+               BUG_ON(1);
+       }
+       if (unlikely(!spin_trylock(&busiest->lock))) {
+               if (busiest < this_rq) {
+                       spin_unlock(&this_rq->lock);
+                       spin_lock(&busiest->lock);
+                       spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
+                       ret = 1;
+               } else
+                       spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
+       }
+       return ret;
+ }
+ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+       __releases(busiest->lock)
+ {
+       spin_unlock(&busiest->lock);
+       lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+ }
  #endif
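
The comment above states the contract tersely: double_lock_balance() may drop this_rq->lock in order to take the two locks in ascending address order, and a nonzero return tells the caller that this_rq was unlocked and re-locked in the process. A minimal caller sketch (names illustrative), assuming this_rq->lock is already held:

	static void example_pull(struct rq *this_rq, struct rq *busiest)
	{
		if (double_lock_balance(this_rq, busiest)) {
			/*
			 * this_rq->lock was released to honour the
			 * address-order rule; anything read from this_rq
			 * before this point may be stale and should be
			 * revalidated here.
			 */
		}

		/* ... pull tasks from busiest onto this_rq ... */

		double_unlock_balance(this_rq, busiest);
	}
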
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
@@@ -2079,15 -2097,17 +2097,17 @@@ find_idlest_group(struct sched_domain *
                int i;
  
                /* Skip over this group if it has no CPUs allowed */
-               if (!cpus_intersects(group->cpumask, p->cpus_allowed))
+               if (!cpumask_intersects(sched_group_cpus(group),
+                                       &p->cpus_allowed))
                        continue;
  
-               local_group = cpu_isset(this_cpu, group->cpumask);
+               local_group = cpumask_test_cpu(this_cpu,
+                                              sched_group_cpus(group));
  
                /* Tally up the load of all CPUs in the group */
                avg_load = 0;
  
-               for_each_cpu_mask_nr(i, group->cpumask) {
+               for_each_cpu(i, sched_group_cpus(group)) {
                        /* Bias balancing toward cpus of our domain */
                        if (local_group)
                                load = source_load(i, load_idx);
   * find_idlest_cpu - find the idlest cpu among the cpus in group.
   */
  static int
- find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
-               cpumask_t *tmp)
+ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  {
        unsigned long load, min_load = ULONG_MAX;
        int idlest = -1;
        int i;
  
        /* Traverse only the allowed CPUs */
-       cpus_and(*tmp, group->cpumask, p->cpus_allowed);
-       for_each_cpu_mask_nr(i, *tmp) {
+       for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
                load = weighted_cpuload(i);
  
                if (load < min_load || (load == min_load && i == this_cpu)) {
@@@ -2171,7 -2188,6 +2188,6 @@@ static int sched_balance_self(int cpu, 
                update_shares(sd);
  
        while (sd) {
-               cpumask_t span, tmpmask;
                struct sched_group *group;
                int new_cpu, weight;
  
                        continue;
                }
  
-               span = sd->span;
                group = find_idlest_group(sd, t, cpu);
                if (!group) {
                        sd = sd->child;
                        continue;
                }
  
-               new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask);
+               new_cpu = find_idlest_cpu(group, t, cpu);
                if (new_cpu == -1 || new_cpu == cpu) {
                        /* Now try balancing at a lower domain level of cpu */
                        sd = sd->child;
  
                /* Now try balancing at a lower domain level of new_cpu */
                cpu = new_cpu;
+               weight = cpumask_weight(sched_domain_span(sd));
                sd = NULL;
-               weight = cpus_weight(span);
                for_each_domain(cpu, tmp) {
-                       if (weight <= cpus_weight(tmp->span))
+                       if (weight <= cpumask_weight(sched_domain_span(tmp)))
                                break;
                        if (tmp->flags & flag)
                                sd = tmp;
@@@ -2244,7 -2259,7 +2259,7 @@@ static int try_to_wake_up(struct task_s
                cpu = task_cpu(p);
  
                for_each_domain(this_cpu, sd) {
-                       if (cpu_isset(cpu, sd->span)) {
+                       if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
                                update_shares(sd);
                                break;
                        }
        else {
                struct sched_domain *sd;
                for_each_domain(this_cpu, sd) {
-                       if (cpu_isset(cpu, sd->span)) {
+                       if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
                                schedstat_inc(sd, ttwu_wake_remote);
                                break;
                        }
@@@ -2812,40 -2827,6 +2827,6 @@@ static void double_rq_unlock(struct rq 
  }
  
  /*
-  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
-  */
- static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
-       __releases(this_rq->lock)
-       __acquires(busiest->lock)
-       __acquires(this_rq->lock)
- {
-       int ret = 0;
-       if (unlikely(!irqs_disabled())) {
-               /* printk() doesn't work good under rq->lock */
-               spin_unlock(&this_rq->lock);
-               BUG_ON(1);
-       }
-       if (unlikely(!spin_trylock(&busiest->lock))) {
-               if (busiest < this_rq) {
-                       spin_unlock(&this_rq->lock);
-                       spin_lock(&busiest->lock);
-                       spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
-                       ret = 1;
-               } else
-                       spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
-       }
-       return ret;
- }
- static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
-       __releases(busiest->lock)
- {
-       spin_unlock(&busiest->lock);
-       lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
- }
- /*
   * If dest_cpu is allowed for this process, migrate the task to it.
   * This is accomplished by forcing the cpu_allowed mask to only
   * allow dest_cpu, which will force the cpu onto dest_cpu. Then
@@@ -2858,7 -2839,7 +2839,7 @@@ static void sched_migrate_task(struct t
        struct rq *rq;
  
        rq = task_rq_lock(p, &flags);
-       if (!cpu_isset(dest_cpu, p->cpus_allowed)
+       if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
            || unlikely(!cpu_active(dest_cpu)))
                goto out;
  
@@@ -2924,7 -2905,7 +2905,7 @@@ int can_migrate_task(struct task_struc
         * 2) cannot be migrated to this CPU due to cpus_allowed, or
         * 3) are cache-hot on their current CPU.
         */
-       if (!cpu_isset(this_cpu, p->cpus_allowed)) {
+       if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
                schedstat_inc(p, se.nr_failed_migrations_affine);
                return 0;
        }
@@@ -3099,7 -3080,7 +3080,7 @@@ static int move_one_task(struct rq *thi
  static struct sched_group *
  find_busiest_group(struct sched_domain *sd, int this_cpu,
                   unsigned long *imbalance, enum cpu_idle_type idle,
-                  int *sd_idle, const cpumask_t *cpus, int *balance)
+                  int *sd_idle, const struct cpumask *cpus, int *balance)
  {
        struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
        unsigned long max_load, avg_load, total_load, this_load, total_pwr;
                unsigned long sum_avg_load_per_task;
                unsigned long avg_load_per_task;
  
-               local_group = cpu_isset(this_cpu, group->cpumask);
+               local_group = cpumask_test_cpu(this_cpu,
+                                              sched_group_cpus(group));
  
                if (local_group)
-                       balance_cpu = first_cpu(group->cpumask);
+                       balance_cpu = cpumask_first(sched_group_cpus(group));
  
                /* Tally up the load of all CPUs in the group */
                sum_weighted_load = sum_nr_running = avg_load = 0;
                max_cpu_load = 0;
                min_cpu_load = ~0UL;
  
-               for_each_cpu_mask_nr(i, group->cpumask) {
-                       struct rq *rq;
-                       if (!cpu_isset(i, *cpus))
-                               continue;
-                       rq = cpu_rq(i);
+               for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+                       struct rq *rq = cpu_rq(i);
  
                        if (*sd_idle && rq->nr_running)
                                *sd_idle = 0;
                 */
                if ((sum_nr_running < min_nr_running) ||
                    (sum_nr_running == min_nr_running &&
-                    first_cpu(group->cpumask) <
-                    first_cpu(group_min->cpumask))) {
+                    cpumask_first(sched_group_cpus(group)) <
+                    cpumask_first(sched_group_cpus(group_min)))) {
                        group_min = group;
                        min_nr_running = sum_nr_running;
                        min_load_per_task = sum_weighted_load /
                if (sum_nr_running <= group_capacity - 1) {
                        if (sum_nr_running > leader_nr_running ||
                            (sum_nr_running == leader_nr_running &&
-                            first_cpu(group->cpumask) >
-                             first_cpu(group_leader->cpumask))) {
+                            cpumask_first(sched_group_cpus(group)) >
+                            cpumask_first(sched_group_cpus(group_leader)))) {
                                group_leader = group;
                                leader_nr_running = sum_nr_running;
                        }
   */
  static struct rq *
  find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
-                  unsigned long imbalance, const cpumask_t *cpus)
+                  unsigned long imbalance, const struct cpumask *cpus)
  {
        struct rq *busiest = NULL, *rq;
        unsigned long max_load = 0;
        int i;
  
-       for_each_cpu_mask_nr(i, group->cpumask) {
+       for_each_cpu(i, sched_group_cpus(group)) {
                unsigned long wl;
  
-               if (!cpu_isset(i, *cpus))
+               if (!cpumask_test_cpu(i, cpus))
                        continue;
  
                rq = cpu_rq(i);
   */
  static int load_balance(int this_cpu, struct rq *this_rq,
                        struct sched_domain *sd, enum cpu_idle_type idle,
-                       int *balance, cpumask_t *cpus)
+                       int *balance, struct cpumask *cpus)
  {
        int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
        struct sched_group *group;
        struct rq *busiest;
        unsigned long flags;
  
-       cpus_setall(*cpus);
+       cpumask_setall(cpus);
  
        /*
         * When power savings policy is enabled for the parent domain, idle
@@@ -3527,8 -3504,8 +3504,8 @@@ redo
  
                /* All tasks on this runqueue were pinned by CPU affinity */
                if (unlikely(all_pinned)) {
-                       cpu_clear(cpu_of(busiest), *cpus);
-                       if (!cpus_empty(*cpus))
+                       cpumask_clear_cpu(cpu_of(busiest), cpus);
+                       if (!cpumask_empty(cpus))
                                goto redo;
                        goto out_balanced;
                }
                        /* don't kick the migration_thread, if the curr
                         * task on busiest cpu can't be moved to this_cpu
                         */
-                       if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+                       if (!cpumask_test_cpu(this_cpu,
+                                             &busiest->curr->cpus_allowed)) {
                                spin_unlock_irqrestore(&busiest->lock, flags);
                                all_pinned = 1;
                                goto out_one_pinned;
@@@ -3620,7 -3598,7 +3598,7 @@@ out
   */
  static int
  load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
-                       cpumask_t *cpus)
+                       struct cpumask *cpus)
  {
        struct sched_group *group;
        struct rq *busiest = NULL;
        int sd_idle = 0;
        int all_pinned = 0;
  
-       cpus_setall(*cpus);
+       cpumask_setall(cpus);
  
        /*
         * When power savings policy is enabled for the parent domain, idle
@@@ -3673,8 -3651,8 +3651,8 @@@ redo
                double_unlock_balance(this_rq, busiest);
  
                if (unlikely(all_pinned)) {
-                       cpu_clear(cpu_of(busiest), *cpus);
-                       if (!cpus_empty(*cpus))
+                       cpumask_clear_cpu(cpu_of(busiest), cpus);
+                       if (!cpumask_empty(cpus))
                                goto redo;
                }
        }
@@@ -3707,9 -3685,12 +3685,12 @@@ out_balanced
  static void idle_balance(int this_cpu, struct rq *this_rq)
  {
        struct sched_domain *sd;
-       int pulled_task = -1;
+       int pulled_task = 0;
        unsigned long next_balance = jiffies + HZ;
-       cpumask_t tmpmask;
+       cpumask_var_t tmpmask;
+       if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
+               return;
  
        for_each_domain(this_cpu, sd) {
                unsigned long interval;
                if (sd->flags & SD_BALANCE_NEWIDLE)
                        /* If we've pulled tasks over stop searching: */
                        pulled_task = load_balance_newidle(this_cpu, this_rq,
-                                                          sd, &tmpmask);
+                                                          sd, tmpmask);
  
                interval = msecs_to_jiffies(sd->balance_interval);
                if (time_after(next_balance, sd->last_balance + interval))
                 */
                this_rq->next_balance = next_balance;
        }
+       free_cpumask_var(tmpmask);
  }
  
  /*
@@@ -3772,7 -3754,7 +3754,7 @@@ static void active_load_balance(struct 
        /* Search for an sd spanning us and the target CPU. */
        for_each_domain(target_cpu, sd) {
                if ((sd->flags & SD_LOAD_BALANCE) &&
-                   cpu_isset(busiest_cpu, sd->span))
+                   cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
                                break;
        }
  
  #ifdef CONFIG_NO_HZ
  static struct {
        atomic_t load_balancer;
-       cpumask_t cpu_mask;
+       cpumask_var_t cpu_mask;
  } nohz ____cacheline_aligned = {
        .load_balancer = ATOMIC_INIT(-1),
-       .cpu_mask = CPU_MASK_NONE,
  };
  
  /*
@@@ -3822,7 -3803,7 +3803,7 @@@ int select_nohz_load_balancer(int stop_
        int cpu = smp_processor_id();
  
        if (stop_tick) {
-               cpu_set(cpu, nohz.cpu_mask);
+               cpumask_set_cpu(cpu, nohz.cpu_mask);
                cpu_rq(cpu)->in_nohz_recently = 1;
  
                /*
                }
  
                /* time for ilb owner also to sleep */
-               if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+               if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
                        if (atomic_read(&nohz.load_balancer) == cpu)
                                atomic_set(&nohz.load_balancer, -1);
                        return 0;
                } else if (atomic_read(&nohz.load_balancer) == cpu)
                        return 1;
        } else {
-               if (!cpu_isset(cpu, nohz.cpu_mask))
+               if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
                        return 0;
  
-               cpu_clear(cpu, nohz.cpu_mask);
+               cpumask_clear_cpu(cpu, nohz.cpu_mask);
  
                if (atomic_read(&nohz.load_balancer) == cpu)
                        if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
@@@ -3880,7 -3861,11 +3861,11 @@@ static void rebalance_domains(int cpu, 
        unsigned long next_balance = jiffies + 60*HZ;
        int update_next_balance = 0;
        int need_serialize;
-       cpumask_t tmp;
+       cpumask_var_t tmp;
+       /* Fails alloc?  Rebalancing probably not a priority right now. */
+       if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
+               return;
  
        for_each_domain(cpu, sd) {
                if (!(sd->flags & SD_LOAD_BALANCE))
                }
  
                if (time_after_eq(jiffies, sd->last_balance + interval)) {
-                       if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) {
+                       if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
                                /*
                                 * We've pulled tasks over so either we're no
                                 * longer idle, or one of our SMT siblings is
@@@ -3939,6 -3924,8 +3924,8 @@@ out
         */
        if (likely(update_next_balance))
                rq->next_balance = next_balance;
+       free_cpumask_var(tmp);
  }
  
  /*
@@@ -3963,12 -3950,13 +3950,13 @@@ static void run_rebalance_domains(struc
         */
        if (this_rq->idle_at_tick &&
            atomic_read(&nohz.load_balancer) == this_cpu) {
                struct rq *rq;
                int balance_cpu;
  
-               cpu_clear(this_cpu, cpus);
-               for_each_cpu_mask_nr(balance_cpu, cpus) {
+               for_each_cpu(balance_cpu, nohz.cpu_mask) {
+                       if (balance_cpu == this_cpu)
+                               continue;
                        /*
                         * If this cpu gets work to do, stop the load balancing
                         * work being done for other cpus. Next load
@@@ -4006,7 -3994,7 +3994,7 @@@ static inline void trigger_load_balance
                rq->in_nohz_recently = 0;
  
                if (atomic_read(&nohz.load_balancer) == cpu) {
-                       cpu_clear(cpu, nohz.cpu_mask);
+                       cpumask_clear_cpu(cpu, nohz.cpu_mask);
                        atomic_set(&nohz.load_balancer, -1);
                }
  
                         * TBD: Traverse the sched domains and nominate
                         * the nearest cpu in the nohz.cpu_mask.
                         */
-                       int ilb = first_cpu(nohz.cpu_mask);
+                       int ilb = cpumask_first(nohz.cpu_mask);
  
                        if (ilb < nr_cpu_ids)
                                resched_cpu(ilb);
         * cpus with ticks stopped, is it time for that to stop?
         */
        if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
-           cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+           cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
                resched_cpu(cpu);
                return;
        }
         * someone else, then no need raise the SCHED_SOFTIRQ
         */
        if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
-           cpu_isset(cpu, nohz.cpu_mask))
+           cpumask_test_cpu(cpu, nohz.cpu_mask))
                return;
  #endif
        if (time_after_eq(jiffies, rq->next_balance))
@@@ -4203,7 -4191,6 +4191,6 @@@ void account_steal_time(struct task_str
  
        if (p == rq->idle) {
                p->stime = cputime_add(p->stime, steal);
-               account_group_system_time(p, steal);
                if (atomic_read(&rq->nr_iowait) > 0)
                        cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
                else
@@@ -4339,7 -4326,7 +4326,7 @@@ void __kprobes sub_preempt_count(int va
        /*
         * Underflow?
         */
-       if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+        if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
                return;
        /*
         * Is the spinlock portion underflowing?
@@@ -5400,10 -5387,9 +5387,9 @@@ out_unlock
        return retval;
  }
  
- long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
+ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
  {
-       cpumask_t cpus_allowed;
-       cpumask_t new_mask = *in_mask;
+       cpumask_var_t cpus_allowed, new_mask;
        struct task_struct *p;
        int retval;
  
        get_task_struct(p);
        read_unlock(&tasklist_lock);
  
+       if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
+               retval = -ENOMEM;
+               goto out_put_task;
+       }
+       if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+               retval = -ENOMEM;
+               goto out_free_cpus_allowed;
+       }
        retval = -EPERM;
        if ((current->euid != p->euid) && (current->euid != p->uid) &&
                        !capable(CAP_SYS_NICE))
        if (retval)
                goto out_unlock;
  
-       cpuset_cpus_allowed(p, &cpus_allowed);
-       cpus_and(new_mask, new_mask, cpus_allowed);
+       cpuset_cpus_allowed(p, cpus_allowed);
+       cpumask_and(new_mask, in_mask, cpus_allowed);
   again:
-       retval = set_cpus_allowed_ptr(p, &new_mask);
+       retval = set_cpus_allowed_ptr(p, new_mask);
  
        if (!retval) {
-               cpuset_cpus_allowed(p, &cpus_allowed);
-               if (!cpus_subset(new_mask, cpus_allowed)) {
+               cpuset_cpus_allowed(p, cpus_allowed);
+               if (!cpumask_subset(new_mask, cpus_allowed)) {
                        /*
                         * We must have raced with a concurrent cpuset
                         * update. Just reset the cpus_allowed to the
                         * cpuset's cpus_allowed
                         */
-                       new_mask = cpus_allowed;
+                       cpumask_copy(new_mask, cpus_allowed);
                        goto again;
                }
        }
  out_unlock:
+       free_cpumask_var(new_mask);
+ out_free_cpus_allowed:
+       free_cpumask_var(cpus_allowed);
+ out_put_task:
        put_task_struct(p);
        put_online_cpus();
        return retval;
  }
  
  static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
-                            cpumask_t *new_mask)
+                            struct cpumask *new_mask)
  {
-       if (len < sizeof(cpumask_t)) {
-               memset(new_mask, 0, sizeof(cpumask_t));
-       } else if (len > sizeof(cpumask_t)) {
-               len = sizeof(cpumask_t);
-       }
+       if (len < cpumask_size())
+               cpumask_clear(new_mask);
+       else if (len > cpumask_size())
+               len = cpumask_size();
        return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
  }
  
  asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
                                      unsigned long __user *user_mask_ptr)
  {
-       cpumask_t new_mask;
+       cpumask_var_t new_mask;
        int retval;
  
-       retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
-       if (retval)
-               return retval;
+       if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+               return -ENOMEM;
  
-       return sched_setaffinity(pid, &new_mask);
+       retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
+       if (retval == 0)
+               retval = sched_setaffinity(pid, new_mask);
+       free_cpumask_var(new_mask);
+       return retval;
  }
  
- long sched_getaffinity(pid_t pid, cpumask_t *mask)
+ long sched_getaffinity(pid_t pid, struct cpumask *mask)
  {
        struct task_struct *p;
        int retval;
        if (retval)
                goto out_unlock;
  
-       cpus_and(*mask, p->cpus_allowed, cpu_online_map);
+       cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
  
  out_unlock:
        read_unlock(&tasklist_lock);
@@@ -5523,19 -5524,24 +5524,24 @@@ asmlinkage long sys_sched_getaffinity(p
                                      unsigned long __user *user_mask_ptr)
  {
        int ret;
-       cpumask_t mask;
+       cpumask_var_t mask;
  
-       if (len < sizeof(cpumask_t))
+       if (len < cpumask_size())
                return -EINVAL;
  
-       ret = sched_getaffinity(pid, &mask);
-       if (ret < 0)
-               return ret;
+       if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+               return -ENOMEM;
  
-       if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
-               return -EFAULT;
+       ret = sched_getaffinity(pid, mask);
+       if (ret == 0) {
+               if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
+                       ret = -EFAULT;
+               else
+                       ret = cpumask_size();
+       }
+       free_cpumask_var(mask);
  
-       return sizeof(cpumask_t);
+       return ret;
  }
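
From userspace the contract is unchanged by the cpumask_var_t conversion: buffers shorter than cpumask_size() are still rejected with -EINVAL and the syscall still returns the number of bytes written on success (glibc hides that and returns 0). A minimal sketch of a caller using the glibc wrappers, not part of this patch:

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		cpu_set_t set;

		CPU_ZERO(&set);
		if (sched_getaffinity(0, sizeof(set), &set) == 0)
			printf("CPU0 allowed: %d\n", CPU_ISSET(0, &set));

		CPU_SET(0, &set);		/* pin ourselves to CPU 0 */
		return sched_setaffinity(0, sizeof(set), &set);
	}
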
  
  /**
@@@ -5877,7 -5883,7 +5883,7 @@@ void __cpuinit init_idle(struct task_st
        idle->se.exec_start = sched_clock();
  
        idle->prio = idle->normal_prio = MAX_PRIO;
-       idle->cpus_allowed = cpumask_of_cpu(cpu);
+       cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
        __set_task_cpu(idle, cpu);
  
        rq->curr = rq->idle = idle;
         * The idle tasks have their own, simple scheduling class:
         */
        idle->sched_class = &idle_sched_class;
+       ftrace_graph_init_task(idle);
  }
  
  /*
   * indicates which cpus entered this state. This is used
   * in the rcu update to wait only for active cpus. For system
   * which do not switch off the HZ timer nohz_cpu_mask should
-  * always be CPU_MASK_NONE.
+  * always be CPU_BITS_NONE.
   */
- cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
+ cpumask_var_t nohz_cpu_mask;
  
  /*
   * Increase the granularity value when there are more CPUs,
@@@ -5960,7 -5967,7 +5967,7 @@@ static inline void sched_init_granulari
   * task must not exit() & deallocate itself prematurely. The
   * call is not atomic; no spinlocks may be held.
   */
- int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
+ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
  {
        struct migration_req req;
        unsigned long flags;
        int ret = 0;
  
        rq = task_rq_lock(p, &flags);
-       if (!cpus_intersects(*new_mask, cpu_online_map)) {
+       if (!cpumask_intersects(new_mask, cpu_online_mask)) {
                ret = -EINVAL;
                goto out;
        }
  
        if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
-                    !cpus_equal(p->cpus_allowed, *new_mask))) {
+                    !cpumask_equal(&p->cpus_allowed, new_mask))) {
                ret = -EINVAL;
                goto out;
        }
        if (p->sched_class->set_cpus_allowed)
                p->sched_class->set_cpus_allowed(p, new_mask);
        else {
-               p->cpus_allowed = *new_mask;
-               p->rt.nr_cpus_allowed = cpus_weight(*new_mask);
+               cpumask_copy(&p->cpus_allowed, new_mask);
+               p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
        }
  
        /* Can the task run on the task's current CPU? If so, we're done */
-       if (cpu_isset(task_cpu(p), *new_mask))
+       if (cpumask_test_cpu(task_cpu(p), new_mask))
                goto out;
  
-       if (migrate_task(p, any_online_cpu(*new_mask), &req)) {
+       if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
                /* Need help from migration thread: drop lock and wait. */
                task_rq_unlock(rq, &flags);
                wake_up_process(rq->migration_thread);
@@@ -6032,7 -6039,7 +6039,7 @@@ static int __migrate_task(struct task_s
        if (task_cpu(p) != src_cpu)
                goto done;
        /* Affinity changed (again). */
-       if (!cpu_isset(dest_cpu, p->cpus_allowed))
+       if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
                goto fail;
  
        on_rq = p->se.on_rq;
@@@ -6126,54 -6133,46 +6133,46 @@@ static int __migrate_task_irq(struct ta
  
  /*
   * Figure out where task on dead CPU should go, use force if necessary.
   */
  static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
  {
-       unsigned long flags;
-       cpumask_t mask;
-       struct rq *rq;
        int dest_cpu;
+       /* FIXME: Use cpumask_of_node here. */
+       cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu));
+       const struct cpumask *nodemask = &_nodemask;
+ again:
+       /* Look for allowed, online CPU in same node. */
+       for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
+               if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+                       goto move;
+       /* Any allowed, online CPU? */
+       dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
+       if (dest_cpu < nr_cpu_ids)
+               goto move;
+       /* No more Mr. Nice Guy. */
+       if (dest_cpu >= nr_cpu_ids) {
+               cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
+               dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
  
-       do {
-               /* On same node? */
-               mask = node_to_cpumask(cpu_to_node(dead_cpu));
-               cpus_and(mask, mask, p->cpus_allowed);
-               dest_cpu = any_online_cpu(mask);
-               /* On any allowed CPU? */
-               if (dest_cpu >= nr_cpu_ids)
-                       dest_cpu = any_online_cpu(p->cpus_allowed);
-               /* No more Mr. Nice Guy. */
-               if (dest_cpu >= nr_cpu_ids) {
-                       cpumask_t cpus_allowed;
-                       cpuset_cpus_allowed_locked(p, &cpus_allowed);
-                       /*
-                        * Try to stay on the same cpuset, where the
-                        * current cpuset may be a subset of all cpus.
-                        * The cpuset_cpus_allowed_locked() variant of
-                        * cpuset_cpus_allowed() will not block. It must be
-                        * called within calls to cpuset_lock/cpuset_unlock.
-                        */
-                       rq = task_rq_lock(p, &flags);
-                       p->cpus_allowed = cpus_allowed;
-                       dest_cpu = any_online_cpu(p->cpus_allowed);
-                       task_rq_unlock(rq, &flags);
-                       /*
-                        * Don't tell them about moving exiting tasks or
-                        * kernel threads (both mm NULL), since they never
-                        * leave kernel.
-                        */
-                       if (p->mm && printk_ratelimit()) {
-                               printk(KERN_INFO "process %d (%s) no "
-                                      "longer affine to cpu%d\n",
-                                       task_pid_nr(p), p->comm, dead_cpu);
-                       }
+               /*
+                * Don't tell them about moving exiting tasks or
+                * kernel threads (both mm NULL), since they never
+                * leave kernel.
+                */
+               if (p->mm && printk_ratelimit()) {
+                       printk(KERN_INFO "process %d (%s) no "
+                              "longer affine to cpu%d\n",
+                              task_pid_nr(p), p->comm, dead_cpu);
                }
-       } while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
+       }
+ move:
+       /* It can have affinity changed while we were choosing. */
+       if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
+               goto again;
  }
  
  /*
   */
  static void migrate_nr_uninterruptible(struct rq *rq_src)
  {
-       struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR));
+       struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
        unsigned long flags;
  
        local_irq_save(flags);
@@@ -6475,7 -6474,7 +6474,7 @@@ static void set_rq_online(struct rq *rq
        if (!rq->online) {
                const struct sched_class *class;
  
-               cpu_set(rq->cpu, rq->rd->online);
+               cpumask_set_cpu(rq->cpu, rq->rd->online);
                rq->online = 1;
  
                for_each_class(class) {
@@@ -6495,7 -6494,7 +6494,7 @@@ static void set_rq_offline(struct rq *r
                                class->rq_offline(rq);
                }
  
-               cpu_clear(rq->cpu, rq->rd->online);
+               cpumask_clear_cpu(rq->cpu, rq->rd->online);
                rq->online = 0;
        }
  }
@@@ -6536,7 -6535,7 +6535,7 @@@ migration_call(struct notifier_block *n
                rq = cpu_rq(cpu);
                spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
-                       BUG_ON(!cpu_isset(cpu, rq->rd->span));
+                       BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
  
                        set_rq_online(rq);
                }
                        break;
                /* Unbind it from offline cpu so it can run. Fall thru. */
                kthread_bind(cpu_rq(cpu)->migration_thread,
-                            any_online_cpu(cpu_online_map));
+                            cpumask_any(cpu_online_mask));
                kthread_stop(cpu_rq(cpu)->migration_thread);
                cpu_rq(cpu)->migration_thread = NULL;
                break;
                rq = cpu_rq(cpu);
                spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
-                       BUG_ON(!cpu_isset(cpu, rq->rd->span));
+                       BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                        set_rq_offline(rq);
                }
                spin_unlock_irqrestore(&rq->lock, flags);
@@@ -6638,36 -6637,14 +6637,14 @@@ early_initcall(migration_init)
  
  #ifdef CONFIG_SCHED_DEBUG
  
- static inline const char *sd_level_to_string(enum sched_domain_level lvl)
- {
-       switch (lvl) {
-       case SD_LV_NONE:
-                       return "NONE";
-       case SD_LV_SIBLING:
-                       return "SIBLING";
-       case SD_LV_MC:
-                       return "MC";
-       case SD_LV_CPU:
-                       return "CPU";
-       case SD_LV_NODE:
-                       return "NODE";
-       case SD_LV_ALLNODES:
-                       return "ALLNODES";
-       case SD_LV_MAX:
-                       return "MAX";
-       }
-       return "MAX";
- }
  static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
-                                 cpumask_t *groupmask)
+                                 struct cpumask *groupmask)
  {
        struct sched_group *group = sd->groups;
        char str[256];
  
-       cpulist_scnprintf(str, sizeof(str), &sd->span);
-       cpus_clear(*groupmask);
 -      cpulist_scnprintf(str, sizeof(str), *sched_domain_span(sd));
++      cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
+       cpumask_clear(groupmask);
  
        printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
  
                return -1;
        }
  
-       printk(KERN_CONT "span %s level %s\n",
-               str, sd_level_to_string(sd->level));
+       printk(KERN_CONT "span %s level %s\n", str, sd->name);
  
-       if (!cpu_isset(cpu, sd->span)) {
+       if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
                printk(KERN_ERR "ERROR: domain->span does not contain "
                                "CPU%d\n", cpu);
        }
-       if (!cpu_isset(cpu, group->cpumask)) {
+       if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
                printk(KERN_ERR "ERROR: domain->groups does not contain"
                                " CPU%d\n", cpu);
        }
                        break;
                }
  
-               if (!cpus_weight(group->cpumask)) {
+               if (!cpumask_weight(sched_group_cpus(group))) {
                        printk(KERN_CONT "\n");
                        printk(KERN_ERR "ERROR: empty group\n");
                        break;
                }
  
-               if (cpus_intersects(*groupmask, group->cpumask)) {
+               if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
                        printk(KERN_CONT "\n");
                        printk(KERN_ERR "ERROR: repeated CPUs\n");
                        break;
                }
  
-               cpus_or(*groupmask, *groupmask, group->cpumask);
+               cpumask_or(groupmask, groupmask, sched_group_cpus(group));
  
-               cpulist_scnprintf(str, sizeof(str), &group->cpumask);
 -              cpulist_scnprintf(str, sizeof(str), *sched_group_cpus(group));
++              cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
                printk(KERN_CONT " %s", str);
  
                group = group->next;
        } while (group != sd->groups);
        printk(KERN_CONT "\n");
  
-       if (!cpus_equal(sd->span, *groupmask))
+       if (!cpumask_equal(sched_domain_span(sd), groupmask))
                printk(KERN_ERR "ERROR: groups don't span domain->span\n");
  
-       if (sd->parent && !cpus_subset(*groupmask, sd->parent->span))
+       if (sd->parent &&
+           !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
                printk(KERN_ERR "ERROR: parent span is not a superset "
                        "of domain->span\n");
        return 0;
  
  static void sched_domain_debug(struct sched_domain *sd, int cpu)
  {
-       cpumask_t *groupmask;
+       cpumask_var_t groupmask;
        int level = 0;
  
        if (!sd) {
  
        printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
  
-       groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
-       if (!groupmask) {
+       if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
                printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
                return;
        }
                if (!sd)
                        break;
        }
-       kfree(groupmask);
+       free_cpumask_var(groupmask);
  }
  #else /* !CONFIG_SCHED_DEBUG */
  # define sched_domain_debug(sd, cpu) do { } while (0)
  
  static int sd_degenerate(struct sched_domain *sd)
  {
-       if (cpus_weight(sd->span) == 1)
+       if (cpumask_weight(sched_domain_span(sd)) == 1)
                return 1;
  
        /* Following flags need at least 2 groups */
@@@ -6801,7 -6777,7 +6777,7 @@@ sd_parent_degenerate(struct sched_domai
        if (sd_degenerate(parent))
                return 1;
  
-       if (!cpus_equal(sd->span, parent->span))
+       if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
                return 0;
  
        /* Does parent contain flags not in child? */
                                SD_BALANCE_EXEC |
                                SD_SHARE_CPUPOWER |
                                SD_SHARE_PKG_RESOURCES);
+               if (nr_node_ids == 1)
+                       pflags &= ~SD_SERIALIZE;
        }
        if (~cflags & pflags)
                return 0;
        return 1;
  }
  
+ static void free_rootdomain(struct root_domain *rd)
+ {
+       cpupri_cleanup(&rd->cpupri);
+       free_cpumask_var(rd->rto_mask);
+       free_cpumask_var(rd->online);
+       free_cpumask_var(rd->span);
+       kfree(rd);
+ }
  static void rq_attach_root(struct rq *rq, struct root_domain *rd)
  {
        unsigned long flags;
        if (rq->rd) {
                struct root_domain *old_rd = rq->rd;
  
-               if (cpu_isset(rq->cpu, old_rd->online))
+               if (cpumask_test_cpu(rq->cpu, old_rd->online))
                        set_rq_offline(rq);
  
-               cpu_clear(rq->cpu, old_rd->span);
+               cpumask_clear_cpu(rq->cpu, old_rd->span);
  
                if (atomic_dec_and_test(&old_rd->refcount))
-                       kfree(old_rd);
+                       free_rootdomain(old_rd);
        }
  
        atomic_inc(&rd->refcount);
        rq->rd = rd;
  
-       cpu_set(rq->cpu, rd->span);
-       if (cpu_isset(rq->cpu, cpu_online_map))
+       cpumask_set_cpu(rq->cpu, rd->span);
+       if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
                set_rq_online(rq);
  
        spin_unlock_irqrestore(&rq->lock, flags);
  }
  
- static void init_rootdomain(struct root_domain *rd)
+ static int init_rootdomain(struct root_domain *rd, bool bootmem)
  {
        memset(rd, 0, sizeof(*rd));
  
-       cpus_clear(rd->span);
-       cpus_clear(rd->online);
+       if (bootmem) {
+               alloc_bootmem_cpumask_var(&def_root_domain.span);
+               alloc_bootmem_cpumask_var(&def_root_domain.online);
+               alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
+               cpupri_init(&rd->cpupri, true);
+               return 0;
+       }
+       if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+               goto free_rd;
+       if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+               goto free_span;
+       if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+               goto free_online;
+       if (cpupri_init(&rd->cpupri, false) != 0)
+               goto free_rto_mask;
+       return 0;
  
-       cpupri_init(&rd->cpupri);
+ free_rto_mask:
+       free_cpumask_var(rd->rto_mask);
+ free_online:
+       free_cpumask_var(rd->online);
+ free_span:
+       free_cpumask_var(rd->span);
+ free_rd:
+       kfree(rd);
+       return -ENOMEM;
  }
  
  static void init_defrootdomain(void)
  {
-       init_rootdomain(&def_root_domain);
+       init_rootdomain(&def_root_domain, true);
        atomic_set(&def_root_domain.refcount, 1);
  }
  
@@@ -6875,7 -6888,10 +6888,10 @@@ static struct root_domain *alloc_rootdo
        if (!rd)
                return NULL;
  
-       init_rootdomain(rd);
+       if (init_rootdomain(rd, false) != 0) {
+               kfree(rd);
+               return NULL;
+       }
  
        return rd;
  }
@@@ -6917,19 -6933,12 +6933,12 @@@ cpu_attach_domain(struct sched_domain *
  }
  
  /* cpus with isolated domains */
- static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
+ static cpumask_var_t cpu_isolated_map;
  
  /* Setup the mask of cpus configured for isolated domains */
  static int __init isolated_cpu_setup(char *str)
  {
-       static int __initdata ints[NR_CPUS];
-       int i;
-       str = get_options(str, ARRAY_SIZE(ints), ints);
-       cpus_clear(cpu_isolated_map);
-       for (i = 1; i <= ints[0]; i++)
-               if (ints[i] < NR_CPUS)
-                       cpu_set(ints[i], cpu_isolated_map);
 -      cpulist_parse(str, *cpu_isolated_map);
++      cpulist_parse(str, cpu_isolated_map);
        return 1;
  }
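
/*
 * Illustrative sketch, not part of the patch: what a cpulist_parse()-style
 * parser does with a string such as "1,3-5", reduced to a user-space
 * function that sets bits in a 64-bit mask (the kernel version fills an
 * arbitrarily sized cpumask instead).  parse_cpulist is an invented name.
 */
#include <stdio.h>
#include <stdlib.h>

static int parse_cpulist(const char *s, unsigned long long *mask)
{
	*mask = 0;
	while (*s) {
		char *end;
		long first = strtol(s, &end, 10);
		long last = first;

		if (end == s || first < 0 || first > 63)
			return -1;
		if (*end == '-') {
			s = end + 1;
			last = strtol(s, &end, 10);
			if (end == s || last < first || last > 63)
				return -1;
		}
		while (first <= last)
			*mask |= 1ULL << first++;
		s = (*end == ',') ? end + 1 : end;
	}
	return 0;
}

int main(void)
{
	unsigned long long mask;

	if (parse_cpulist("1,3-5", &mask) == 0)
		printf("mask = 0x%llx\n", mask);	/* prints 0x3a */
	return 0;
}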
  
@@@ -6938,42 -6947,43 +6947,43 @@@ __setup("isolcpus=", isolated_cpu_setup
  /*
   * init_sched_build_groups takes the cpumask we wish to span, and a pointer
   * to a function which identifies what group (along with sched group) a CPU
-  * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS
-  * (due to the fact that we keep track of groups covered with a cpumask_t).
+  * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids
+  * (due to the fact that we keep track of groups covered with a struct cpumask).
   *
   * init_sched_build_groups will build a circular linked list of the groups
   * covered by the given span, and will set each group's ->cpumask correctly,
   * and ->cpu_power to 0.
   */
  static void
- init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
-                       int (*group_fn)(int cpu, const cpumask_t *cpu_map,
+ init_sched_build_groups(const struct cpumask *span,
+                       const struct cpumask *cpu_map,
+                       int (*group_fn)(int cpu, const struct cpumask *cpu_map,
                                        struct sched_group **sg,
-                                       cpumask_t *tmpmask),
-                       cpumask_t *covered, cpumask_t *tmpmask)
+                                       struct cpumask *tmpmask),
+                       struct cpumask *covered, struct cpumask *tmpmask)
  {
        struct sched_group *first = NULL, *last = NULL;
        int i;
  
-       cpus_clear(*covered);
+       cpumask_clear(covered);
  
-       for_each_cpu_mask_nr(i, *span) {
+       for_each_cpu(i, span) {
                struct sched_group *sg;
                int group = group_fn(i, cpu_map, &sg, tmpmask);
                int j;
  
-               if (cpu_isset(i, *covered))
+               if (cpumask_test_cpu(i, covered))
                        continue;
  
-               cpus_clear(sg->cpumask);
+               cpumask_clear(sched_group_cpus(sg));
                sg->__cpu_power = 0;
  
-               for_each_cpu_mask_nr(j, *span) {
+               for_each_cpu(j, span) {
                        if (group_fn(j, cpu_map, NULL, tmpmask) != group)
                                continue;
  
-                       cpu_set(j, *covered);
-                       cpu_set(j, sg->cpumask);
+                       cpumask_set_cpu(j, covered);
+                       cpumask_set_cpu(j, sched_group_cpus(sg));
                }
                if (!first)
                        first = sg;
@@@ -7037,9 -7047,10 +7047,10 @@@ static int find_next_best_node(int node
   * should be one that prevents unnecessary balancing, but also spreads tasks
   * out optimally.
   */
- static void sched_domain_node_span(int node, cpumask_t *span)
+ static void sched_domain_node_span(int node, struct cpumask *span)
  {
        nodemask_t used_nodes;
+       /* FIXME: use cpumask_of_node() */
        node_to_cpumask_ptr(nodemask, node);
        int i;
  
  int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
  
  /*
+  * The cpus mask in sched_group and sched_domain hangs off the end.
+  * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
+  * for nr_cpu_ids < CONFIG_NR_CPUS.
+  */
+ struct static_sched_group {
+       struct sched_group sg;
+       DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
+ };
+ struct static_sched_domain {
+       struct sched_domain sd;
+       DECLARE_BITMAP(span, CONFIG_NR_CPUS);
+ };
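+ 
/*
 * Illustrative sketch, not part of the patch: the "mask hangs off the end"
 * layout the comment above refers to, in user-space C.  One allocation
 * carries the structure plus a bitmap sized for the CPUs actually present,
 * which is what kmalloc(sizeof(struct sched_group) + cpumask_size()) does
 * later in this patch.  struct group and group_alloc are invented names.
 */
#include <limits.h>
#include <stdlib.h>
#include <string.h>

struct group {
	struct group *next;
	unsigned int power;
	unsigned long cpus[];		/* flexible array member: bitmap */
};

static struct group *group_alloc(unsigned int nr_cpus)
{
	size_t bits_per_word = CHAR_BIT * sizeof(unsigned long);
	size_t words = (nr_cpus + bits_per_word - 1) / bits_per_word;
	struct group *g = malloc(sizeof(*g) + words * sizeof(unsigned long));

	if (g)
		memset(g->cpus, 0, words * sizeof(unsigned long));
	return g;
}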
+ /*
   * SMT sched-domains:
   */
  #ifdef CONFIG_SCHED_SMT
- static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
- static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
+ static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
+ static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
  
  static int
- cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-                cpumask_t *unused)
+ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
+                struct sched_group **sg, struct cpumask *unused)
  {
        if (sg)
-               *sg = &per_cpu(sched_group_cpus, cpu);
+               *sg = &per_cpu(sched_group_cpus, cpu).sg;
        return cpu;
  }
  #endif /* CONFIG_SCHED_SMT */
   * multi-core sched-domains:
   */
  #ifdef CONFIG_SCHED_MC
- static DEFINE_PER_CPU(struct sched_domain, core_domains);
- static DEFINE_PER_CPU(struct sched_group, sched_group_core);
+ static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
+ static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
  #endif /* CONFIG_SCHED_MC */
  
  #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
  static int
- cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-                 cpumask_t *mask)
+ cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
+                 struct sched_group **sg, struct cpumask *mask)
  {
        int group;
  
-       *mask = per_cpu(cpu_sibling_map, cpu);
-       cpus_and(*mask, *mask, *cpu_map);
-       group = first_cpu(*mask);
+       cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+       group = cpumask_first(mask);
        if (sg)
-               *sg = &per_cpu(sched_group_core, group);
+               *sg = &per_cpu(sched_group_core, group).sg;
        return group;
  }
  #elif defined(CONFIG_SCHED_MC)
  static int
- cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-                 cpumask_t *unused)
+ cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
+                 struct sched_group **sg, struct cpumask *unused)
  {
        if (sg)
-               *sg = &per_cpu(sched_group_core, cpu);
+               *sg = &per_cpu(sched_group_core, cpu).sg;
        return cpu;
  }
  #endif
  
- static DEFINE_PER_CPU(struct sched_domain, phys_domains);
- static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
+ static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
+ static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
  
  static int
- cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-                 cpumask_t *mask)
+ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
+                 struct sched_group **sg, struct cpumask *mask)
  {
        int group;
  #ifdef CONFIG_SCHED_MC
+       /* FIXME: Use cpu_coregroup_mask. */
        *mask = cpu_coregroup_map(cpu);
        cpus_and(*mask, *mask, *cpu_map);
-       group = first_cpu(*mask);
+       group = cpumask_first(mask);
  #elif defined(CONFIG_SCHED_SMT)
-       *mask = per_cpu(cpu_sibling_map, cpu);
-       cpus_and(*mask, *mask, *cpu_map);
-       group = first_cpu(*mask);
+       cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+       group = cpumask_first(mask);
  #else
        group = cpu;
  #endif
        if (sg)
-               *sg = &per_cpu(sched_group_phys, group);
+               *sg = &per_cpu(sched_group_phys, group).sg;
        return group;
  }
  
@@@ -7144,19 -7169,21 +7169,21 @@@ static DEFINE_PER_CPU(struct sched_doma
  static struct sched_group ***sched_group_nodes_bycpu;
  
  static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
- static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
+ static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
  
- static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
-                                struct sched_group **sg, cpumask_t *nodemask)
+ static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
+                                struct sched_group **sg,
+                                struct cpumask *nodemask)
  {
        int group;
+       /* FIXME: use cpumask_of_node */
+       node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu));
  
-       *nodemask = node_to_cpumask(cpu_to_node(cpu));
-       cpus_and(*nodemask, *nodemask, *cpu_map);
-       group = first_cpu(*nodemask);
+       cpumask_and(nodemask, pnodemask, cpu_map);
+       group = cpumask_first(nodemask);
  
        if (sg)
-               *sg = &per_cpu(sched_group_allnodes, group);
+               *sg = &per_cpu(sched_group_allnodes, group).sg;
        return group;
  }
  
@@@ -7168,11 -7195,11 +7195,11 @@@ static void init_numa_sched_groups_powe
        if (!sg)
                return;
        do {
-               for_each_cpu_mask_nr(j, sg->cpumask) {
+               for_each_cpu(j, sched_group_cpus(sg)) {
                        struct sched_domain *sd;
  
-                       sd = &per_cpu(phys_domains, j);
-                       if (j != first_cpu(sd->groups->cpumask)) {
+                       sd = &per_cpu(phys_domains, j).sd;
+                       if (j != cpumask_first(sched_group_cpus(sd->groups))) {
                                /*
                                 * Only add "power" once for each
                                 * physical package.
  
  #ifdef CONFIG_NUMA
  /* Free memory allocated for various sched_group structures */
- static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
+ static void free_sched_groups(const struct cpumask *cpu_map,
+                             struct cpumask *nodemask)
  {
        int cpu, i;
  
-       for_each_cpu_mask_nr(cpu, *cpu_map) {
+       for_each_cpu(cpu, cpu_map) {
                struct sched_group **sched_group_nodes
                        = sched_group_nodes_bycpu[cpu];
  
  
                for (i = 0; i < nr_node_ids; i++) {
                        struct sched_group *oldsg, *sg = sched_group_nodes[i];
+                       /* FIXME: Use cpumask_of_node */
+                       node_to_cpumask_ptr(pnodemask, i);
  
-                       *nodemask = node_to_cpumask(i);
-                       cpus_and(*nodemask, *nodemask, *cpu_map);
-                       if (cpus_empty(*nodemask))
+                       cpus_and(*nodemask, *pnodemask, *cpu_map);
+                       if (cpumask_empty(nodemask))
                                continue;
  
                        if (sg == NULL)
@@@ -7223,7 -7252,8 +7252,8 @@@ next_sg
        }
  }
  #else /* !CONFIG_NUMA */
- static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
+ static void free_sched_groups(const struct cpumask *cpu_map,
+                             struct cpumask *nodemask)
  {
  }
  #endif /* CONFIG_NUMA */
@@@ -7249,7 -7279,7 +7279,7 @@@ static void init_sched_groups_power(in
  
        WARN_ON(!sd || !sd->groups);
  
-       if (cpu != first_cpu(sd->groups->cpumask))
+       if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
                return;
  
        child = sd->child;
@@@ -7314,40 -7344,6 +7344,6 @@@ SD_INIT_FUNC(CPU
   SD_INIT_FUNC(MC)
  #endif
  
- /*
-  * To minimize stack usage kmalloc room for cpumasks and share the
-  * space as the usage in build_sched_domains() dictates.  Used only
-  * if the amount of space is significant.
-  */
- struct allmasks {
-       cpumask_t tmpmask;                      /* make this one first */
-       union {
-               cpumask_t nodemask;
-               cpumask_t this_sibling_map;
-               cpumask_t this_core_map;
-       };
-       cpumask_t send_covered;
- #ifdef CONFIG_NUMA
-       cpumask_t domainspan;
-       cpumask_t covered;
-       cpumask_t notcovered;
- #endif
- };
- #if   NR_CPUS > 128
- #define       SCHED_CPUMASK_ALLOC             1
- #define       SCHED_CPUMASK_FREE(v)           kfree(v)
- #define       SCHED_CPUMASK_DECLARE(v)        struct allmasks *v
- #else
- #define       SCHED_CPUMASK_ALLOC             0
- #define       SCHED_CPUMASK_FREE(v)
- #define       SCHED_CPUMASK_DECLARE(v)        struct allmasks _v, *v = &_v
- #endif
- #define       SCHED_CPUMASK_VAR(v, a)         cpumask_t *v = (cpumask_t *) \
-                       ((unsigned long)(a) + offsetof(struct allmasks, v))
  static int default_relax_domain_level = -1;
  
  static int __init setup_relax_domain_level(char *str)
@@@ -7387,17 -7383,38 +7383,38 @@@ static void set_domain_attribute(struc
   * Build sched domains for a given set of cpus and attach the sched domains
   * to the individual cpus
   */
- static int __build_sched_domains(const cpumask_t *cpu_map,
+ static int __build_sched_domains(const struct cpumask *cpu_map,
                                 struct sched_domain_attr *attr)
  {
-       int i;
+       int i, err = -ENOMEM;
        struct root_domain *rd;
-       SCHED_CPUMASK_DECLARE(allmasks);
-       cpumask_t *tmpmask;
+       cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
+               tmpmask;
  #ifdef CONFIG_NUMA
+       cpumask_var_t domainspan, covered, notcovered;
        struct sched_group **sched_group_nodes = NULL;
        int sd_allnodes = 0;
  
+       if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
+               goto out;
+       if (!alloc_cpumask_var(&covered, GFP_KERNEL))
+               goto free_domainspan;
+       if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
+               goto free_covered;
+ #endif
+       if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
+               goto free_notcovered;
+       if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
+               goto free_nodemask;
+       if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
+               goto free_this_sibling_map;
+       if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
+               goto free_this_core_map;
+       if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
+               goto free_send_covered;
+ #ifdef CONFIG_NUMA
        /*
         * Allocate the per-node list of sched groups
         */
                                    GFP_KERNEL);
        if (!sched_group_nodes) {
                printk(KERN_WARNING "Can not alloc sched group node list\n");
-               return -ENOMEM;
+               goto free_tmpmask;
        }
  #endif
  
        rd = alloc_rootdomain();
        if (!rd) {
                printk(KERN_WARNING "Cannot alloc root domain\n");
- #ifdef CONFIG_NUMA
-               kfree(sched_group_nodes);
- #endif
-               return -ENOMEM;
+               goto free_sched_groups;
        }
  
- #if SCHED_CPUMASK_ALLOC
-       /* get space for all scratch cpumask variables */
-       allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
-       if (!allmasks) {
-               printk(KERN_WARNING "Cannot alloc cpumask array\n");
-               kfree(rd);
  #ifdef CONFIG_NUMA
-               kfree(sched_group_nodes);
- #endif
-               return -ENOMEM;
-       }
- #endif
-       tmpmask = (cpumask_t *)allmasks;
- #ifdef CONFIG_NUMA
-       sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
+       sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
  #endif
  
        /*
         * Set up domains for cpus specified by the cpu_map.
         */
-       for_each_cpu_mask_nr(i, *cpu_map) {
+       for_each_cpu(i, cpu_map) {
                struct sched_domain *sd = NULL, *p;
-               SCHED_CPUMASK_VAR(nodemask, allmasks);
  
+               /* FIXME: use cpumask_of_node */
                *nodemask = node_to_cpumask(cpu_to_node(i));
                cpus_and(*nodemask, *nodemask, *cpu_map);
  
  #ifdef CONFIG_NUMA
-               if (cpus_weight(*cpu_map) >
-                               SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) {
+               if (cpumask_weight(cpu_map) >
+                               SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
                        sd = &per_cpu(allnodes_domains, i);
                        SD_INIT(sd, ALLNODES);
                        set_domain_attribute(sd, attr);
-                       sd->span = *cpu_map;
+                       cpumask_copy(sched_domain_span(sd), cpu_map);
                        cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
                        p = sd;
                        sd_allnodes = 1;
                sd = &per_cpu(node_domains, i);
                SD_INIT(sd, NODE);
                set_domain_attribute(sd, attr);
-               sched_domain_node_span(cpu_to_node(i), &sd->span);
+               sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
                sd->parent = p;
                if (p)
                        p->child = sd;
-               cpus_and(sd->span, sd->span, *cpu_map);
+               cpumask_and(sched_domain_span(sd),
+                           sched_domain_span(sd), cpu_map);
  #endif
  
                p = sd;
-               sd = &per_cpu(phys_domains, i);
+               sd = &per_cpu(phys_domains, i).sd;
                SD_INIT(sd, CPU);
                set_domain_attribute(sd, attr);
-               sd->span = *nodemask;
+               cpumask_copy(sched_domain_span(sd), nodemask);
                sd->parent = p;
                if (p)
                        p->child = sd;
  
  #ifdef CONFIG_SCHED_MC
                p = sd;
-               sd = &per_cpu(core_domains, i);
+               sd = &per_cpu(core_domains, i).sd;
                SD_INIT(sd, MC);
                set_domain_attribute(sd, attr);
-               sd->span = cpu_coregroup_map(i);
-               cpus_and(sd->span, sd->span, *cpu_map);
+               *sched_domain_span(sd) = cpu_coregroup_map(i);
+               cpumask_and(sched_domain_span(sd),
+                           sched_domain_span(sd), cpu_map);
                sd->parent = p;
                p->child = sd;
                cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
  
  #ifdef CONFIG_SCHED_SMT
                p = sd;
-               sd = &per_cpu(cpu_domains, i);
+               sd = &per_cpu(cpu_domains, i).sd;
                SD_INIT(sd, SIBLING);
                set_domain_attribute(sd, attr);
-               sd->span = per_cpu(cpu_sibling_map, i);
-               cpus_and(sd->span, sd->span, *cpu_map);
+               cpumask_and(sched_domain_span(sd),
+                           &per_cpu(cpu_sibling_map, i), cpu_map);
                sd->parent = p;
                p->child = sd;
                cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
  
  #ifdef CONFIG_SCHED_SMT
        /* Set up CPU (sibling) groups */
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
-               SCHED_CPUMASK_VAR(send_covered, allmasks);
-               *this_sibling_map = per_cpu(cpu_sibling_map, i);
-               cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
-               if (i != first_cpu(*this_sibling_map))
+       for_each_cpu(i, cpu_map) {
+               cpumask_and(this_sibling_map,
+                           &per_cpu(cpu_sibling_map, i), cpu_map);
+               if (i != cpumask_first(this_sibling_map))
                        continue;
  
                init_sched_build_groups(this_sibling_map, cpu_map,
  
  #ifdef CONFIG_SCHED_MC
        /* Set up multi-core groups */
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               SCHED_CPUMASK_VAR(this_core_map, allmasks);
-               SCHED_CPUMASK_VAR(send_covered, allmasks);
+       for_each_cpu(i, cpu_map) {
+               /* FIXME: Use cpu_coregroup_mask */
                *this_core_map = cpu_coregroup_map(i);
                cpus_and(*this_core_map, *this_core_map, *cpu_map);
-               if (i != first_cpu(*this_core_map))
+               if (i != cpumask_first(this_core_map))
                        continue;
  
                init_sched_build_groups(this_core_map, cpu_map,
  
        /* Set up physical groups */
        for (i = 0; i < nr_node_ids; i++) {
-               SCHED_CPUMASK_VAR(nodemask, allmasks);
-               SCHED_CPUMASK_VAR(send_covered, allmasks);
+               /* FIXME: Use cpumask_of_node */
                *nodemask = node_to_cpumask(i);
                cpus_and(*nodemask, *nodemask, *cpu_map);
-               if (cpus_empty(*nodemask))
+               if (cpumask_empty(nodemask))
                        continue;
  
                init_sched_build_groups(nodemask, cpu_map,
  #ifdef CONFIG_NUMA
        /* Set up node groups */
        if (sd_allnodes) {
-               SCHED_CPUMASK_VAR(send_covered, allmasks);
                init_sched_build_groups(cpu_map, cpu_map,
                                        &cpu_to_allnodes_group,
                                        send_covered, tmpmask);
        for (i = 0; i < nr_node_ids; i++) {
                /* Set up node groups */
                struct sched_group *sg, *prev;
-               SCHED_CPUMASK_VAR(nodemask, allmasks);
-               SCHED_CPUMASK_VAR(domainspan, allmasks);
-               SCHED_CPUMASK_VAR(covered, allmasks);
                int j;
  
+               /* FIXME: Use cpumask_of_node */
                *nodemask = node_to_cpumask(i);
-               cpus_clear(*covered);
+               cpumask_clear(covered);
  
                cpus_and(*nodemask, *nodemask, *cpu_map);
-               if (cpus_empty(*nodemask)) {
+               if (cpumask_empty(nodemask)) {
                        sched_group_nodes[i] = NULL;
                        continue;
                }
  
                sched_domain_node_span(i, domainspan);
-               cpus_and(*domainspan, *domainspan, *cpu_map);
+               cpumask_and(domainspan, domainspan, cpu_map);
  
-               sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
+               sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
+                                 GFP_KERNEL, i);
                if (!sg) {
                        printk(KERN_WARNING "Can not alloc domain group for "
                                "node %d\n", i);
                        goto error;
                }
                sched_group_nodes[i] = sg;
-               for_each_cpu_mask_nr(j, *nodemask) {
+               for_each_cpu(j, nodemask) {
                        struct sched_domain *sd;
  
                        sd = &per_cpu(node_domains, j);
                        sd->groups = sg;
                }
                sg->__cpu_power = 0;
-               sg->cpumask = *nodemask;
+               cpumask_copy(sched_group_cpus(sg), nodemask);
                sg->next = sg;
-               cpus_or(*covered, *covered, *nodemask);
+               cpumask_or(covered, covered, nodemask);
                prev = sg;
  
                for (j = 0; j < nr_node_ids; j++) {
-                       SCHED_CPUMASK_VAR(notcovered, allmasks);
                        int n = (i + j) % nr_node_ids;
+                       /* FIXME: Use cpumask_of_node */
                        node_to_cpumask_ptr(pnodemask, n);
  
-                       cpus_complement(*notcovered, *covered);
-                       cpus_and(*tmpmask, *notcovered, *cpu_map);
-                       cpus_and(*tmpmask, *tmpmask, *domainspan);
-                       if (cpus_empty(*tmpmask))
+                       cpumask_complement(notcovered, covered);
+                       cpumask_and(tmpmask, notcovered, cpu_map);
+                       cpumask_and(tmpmask, tmpmask, domainspan);
+                       if (cpumask_empty(tmpmask))
                                break;
  
-                       cpus_and(*tmpmask, *tmpmask, *pnodemask);
-                       if (cpus_empty(*tmpmask))
+                       cpumask_and(tmpmask, tmpmask, pnodemask);
+                       if (cpumask_empty(tmpmask))
                                continue;
  
-                       sg = kmalloc_node(sizeof(struct sched_group),
+                       sg = kmalloc_node(sizeof(struct sched_group) +
+                                         cpumask_size(),
                                          GFP_KERNEL, i);
                        if (!sg) {
                                printk(KERN_WARNING
                                goto error;
                        }
                        sg->__cpu_power = 0;
-                       sg->cpumask = *tmpmask;
+                       cpumask_copy(sched_group_cpus(sg), tmpmask);
                        sg->next = prev->next;
-                       cpus_or(*covered, *covered, *tmpmask);
+                       cpumask_or(covered, covered, tmpmask);
                        prev->next = sg;
                        prev = sg;
                }
  
        /* Calculate CPU power for physical packages and nodes */
  #ifdef CONFIG_SCHED_SMT
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               struct sched_domain *sd = &per_cpu(cpu_domains, i);
+       for_each_cpu(i, cpu_map) {
+               struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
  
                init_sched_groups_power(i, sd);
        }
  #endif
  #ifdef CONFIG_SCHED_MC
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               struct sched_domain *sd = &per_cpu(core_domains, i);
+       for_each_cpu(i, cpu_map) {
+               struct sched_domain *sd = &per_cpu(core_domains, i).sd;
  
                init_sched_groups_power(i, sd);
        }
  #endif
  
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               struct sched_domain *sd = &per_cpu(phys_domains, i);
+       for_each_cpu(i, cpu_map) {
+               struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
  
                init_sched_groups_power(i, sd);
        }
        if (sd_allnodes) {
                struct sched_group *sg;
  
-               cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg,
+               cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
                                                                tmpmask);
                init_numa_sched_groups_power(sg);
        }
  #endif
  
        /* Attach the domains */
-       for_each_cpu_mask_nr(i, *cpu_map) {
+       for_each_cpu(i, cpu_map) {
                struct sched_domain *sd;
  #ifdef CONFIG_SCHED_SMT
-               sd = &per_cpu(cpu_domains, i);
+               sd = &per_cpu(cpu_domains, i).sd;
  #elif defined(CONFIG_SCHED_MC)
-               sd = &per_cpu(core_domains, i);
+               sd = &per_cpu(core_domains, i).sd;
  #else
-               sd = &per_cpu(phys_domains, i);
+               sd = &per_cpu(phys_domains, i).sd;
  #endif
                cpu_attach_domain(sd, rd, i);
        }
  
-       SCHED_CPUMASK_FREE((void *)allmasks);
-       return 0;
+       err = 0;
+ free_tmpmask:
+       free_cpumask_var(tmpmask);
+ free_send_covered:
+       free_cpumask_var(send_covered);
+ free_this_core_map:
+       free_cpumask_var(this_core_map);
+ free_this_sibling_map:
+       free_cpumask_var(this_sibling_map);
+ free_nodemask:
+       free_cpumask_var(nodemask);
+ free_notcovered:
+ #ifdef CONFIG_NUMA
+       free_cpumask_var(notcovered);
+ free_covered:
+       free_cpumask_var(covered);
+ free_domainspan:
+       free_cpumask_var(domainspan);
+ out:
+ #endif
+       return err;
+ free_sched_groups:
+ #ifdef CONFIG_NUMA
+       kfree(sched_group_nodes);
+ #endif
+       goto free_tmpmask;
  
  #ifdef CONFIG_NUMA
  error:
        free_sched_groups(cpu_map, tmpmask);
-       SCHED_CPUMASK_FREE((void *)allmasks);
-       kfree(rd);
-       return -ENOMEM;
+       free_rootdomain(rd);
+       goto free_tmpmask;
  #endif
  }
  
- static int build_sched_domains(const cpumask_t *cpu_map)
+ static int build_sched_domains(const struct cpumask *cpu_map)
  {
        return __build_sched_domains(cpu_map, NULL);
  }
  
- static cpumask_t *doms_cur;   /* current sched domains */
+ static struct cpumask *doms_cur;      /* current sched domains */
  static int ndoms_cur;         /* number of sched domains in 'doms_cur' */
  static struct sched_domain_attr *dattr_cur;
                                /* attribues of custom domains in 'doms_cur' */
  
  /*
   * Special case: If a kmalloc of a doms_cur partition (array of
-  * cpumask_t) fails, then fallback to a single sched domain,
-  * as determined by the single cpumask_t fallback_doms.
+  * cpumask) fails, then fall back to a single sched domain,
+  * as determined by the single cpumask fallback_doms.
   */
- static cpumask_t fallback_doms;
+ static cpumask_var_t fallback_doms;
  
- void __attribute__((weak)) arch_update_cpu_topology(void)
+ /*
+  * arch_update_cpu_topology lets virtualized architectures update the
+  * cpu core maps. It is supposed to return 1 if the topology changed
+  * or 0 if it stayed the same.
+  */
+ int __attribute__((weak)) arch_update_cpu_topology(void)
  {
+       return 0;
  }
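
/*
 * Illustrative sketch, not part of the patch: how the __attribute__((weak))
 * default above behaves.  The weak definition is used unless some other
 * object file supplies a strong definition of the same symbol, in which
 * case the linker silently prefers the strong one (GCC/ELF behaviour).
 * update_topology is an invented name standing in for the arch hook.
 */

/* generic code: weak fallback that reports "nothing changed" */
int __attribute__((weak)) update_topology(void)
{
	return 0;
}

/*
 * An architecture that wants different behaviour simply defines
 * int update_topology(void) again, with no registration step needed,
 * and its version wins at link time.
 */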
  
  /*
   * For now this just excludes isolated cpus, but could be used to
   * exclude other special cases in the future.
   */
- static int arch_init_sched_domains(const cpumask_t *cpu_map)
+ static int arch_init_sched_domains(const struct cpumask *cpu_map)
  {
        int err;
  
        arch_update_cpu_topology();
        ndoms_cur = 1;
-       doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+       doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
        if (!doms_cur)
-               doms_cur = &fallback_doms;
-       cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
+               doms_cur = fallback_doms;
+       cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
        dattr_cur = NULL;
        err = build_sched_domains(doms_cur);
        register_sched_domain_sysctl();
        return err;
  }
  
- static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
-                                      cpumask_t *tmpmask)
+ static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
+                                      struct cpumask *tmpmask)
  {
        free_sched_groups(cpu_map, tmpmask);
  }
   * Detach sched domains from a group of cpus specified in cpu_map
   * These cpus will now be attached to the NULL domain
   */
- static void detach_destroy_domains(const cpumask_t *cpu_map)
+ static void detach_destroy_domains(const struct cpumask *cpu_map)
  {
-       cpumask_t tmpmask;
+       /* Save because hotplug lock held. */
+       static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
        int i;
  
-       unregister_sched_domain_sysctl();
-       for_each_cpu_mask_nr(i, *cpu_map)
+       for_each_cpu(i, cpu_map)
                cpu_attach_domain(NULL, &def_root_domain, i);
        synchronize_sched();
-       arch_destroy_sched_domains(cpu_map, &tmpmask);
+       arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
  }
  
  /* handle null as "default" */
@@@ -7783,7 -7805,7 +7805,7 @@@ static int dattrs_equal(struct sched_do
   * doms_new[] to the current sched domain partitioning, doms_cur[].
   * It destroys each deleted domain and builds each new domain.
   *
-  * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
+  * 'doms_new' is an array of cpumasks of length 'ndoms_new'.
   * The masks don't intersect (don't overlap). We should set up one
   * sched domain for each mask. CPUs not in any of the cpumasks will
   * not be load balanced. If the same cpumask appears both in the
   * the single partition 'fallback_doms', it also forces the domains
   * to be rebuilt.
   *
-  * If doms_new == NULL it will be replaced with cpu_online_map.
+  * If doms_new == NULL it will be replaced with cpu_online_mask.
   * ndoms_new == 0 is a special case for destroying existing domains,
   * and it will not create the default domain.
   *
   * Call with hotplug lock held
   */
- void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+ /* FIXME: Change to struct cpumask *doms_new[] */
+ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
                             struct sched_domain_attr *dattr_new)
  {
        int i, j, n;
+       int new_topology;
  
        mutex_lock(&sched_domains_mutex);
  
        /* always unregister in case we don't destroy any domains */
        unregister_sched_domain_sysctl();
  
+       /* Let architecture update cpu core mappings. */
+       new_topology = arch_update_cpu_topology();
        n = doms_new ? ndoms_new : 0;
  
        /* Destroy deleted domains */
        for (i = 0; i < ndoms_cur; i++) {
-               for (j = 0; j < n; j++) {
-                       if (cpus_equal(doms_cur[i], doms_new[j])
+               for (j = 0; j < n && !new_topology; j++) {
+                       if (cpumask_equal(&doms_cur[i], &doms_new[j])
                            && dattrs_equal(dattr_cur, i, dattr_new, j))
                                goto match1;
                }
@@@ -7830,15 -7857,15 +7857,15 @@@ match1
  
        if (doms_new == NULL) {
                ndoms_cur = 0;
-               doms_new = &fallback_doms;
-               cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
-               dattr_new = NULL;
+               doms_new = fallback_doms;
+               cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
+               WARN_ON_ONCE(dattr_new);
        }
  
        /* Build new domains */
        for (i = 0; i < ndoms_new; i++) {
-               for (j = 0; j < ndoms_cur; j++) {
-                       if (cpus_equal(doms_new[i], doms_cur[j])
+               for (j = 0; j < ndoms_cur && !new_topology; j++) {
+                       if (cpumask_equal(&doms_new[i], &doms_cur[j])
                            && dattrs_equal(dattr_new, i, dattr_cur, j))
                                goto match2;
                }
@@@ -7850,7 -7877,7 +7877,7 @@@ match2
        }
  
        /* Remember the new sched domains */
-       if (doms_cur != &fallback_doms)
+       if (doms_cur != fallback_doms)
                kfree(doms_cur);
        kfree(dattr_cur);       /* kfree(NULL) is safe */
        doms_cur = doms_new;
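
/*
 * Illustrative sketch, not part of the patch: the two-pass update that
 * partition_sched_domains() performs, with integers standing in for
 * cpumask partitions.  Pass 1 destroys every old entry that has no
 * identical new entry, pass 2 builds every new entry that has no identical
 * old entry, and entries present in both sets are left untouched.  When
 * new_topology is set, the matching loops are skipped and everything is
 * torn down and rebuilt.
 */
#include <stdio.h>

static void update_partition(const int *cur, int ncur,
			     const int *next, int nnext, int new_topology)
{
	int i, j;

	for (i = 0; i < ncur; i++) {
		for (j = 0; j < nnext && !new_topology; j++)
			if (cur[i] == next[j])
				goto match1;
		printf("destroy domain %d\n", cur[i]);
match1:
		;
	}

	for (i = 0; i < nnext; i++) {
		for (j = 0; j < ncur && !new_topology; j++)
			if (next[i] == cur[j])
				goto match2;
		printf("build domain %d\n", next[i]);
match2:
		;
	}
}

int main(void)
{
	int cur[] = { 1, 2, 3 }, next[] = { 2, 4 };

	/* prints: destroy domain 1, destroy domain 3, build domain 4 */
	update_partition(cur, 3, next, 2, 0);
	return 0;
}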
@@@ -7990,7 -8017,9 +8017,9 @@@ static int update_runtime(struct notifi
  
  void __init sched_init_smp(void)
  {
-       cpumask_t non_isolated_cpus;
+       cpumask_var_t non_isolated_cpus;
+       alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
  
  #if defined(CONFIG_NUMA)
        sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
  #endif
        get_online_cpus();
        mutex_lock(&sched_domains_mutex);
-       arch_init_sched_domains(&cpu_online_map);
-       cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
-       if (cpus_empty(non_isolated_cpus))
-               cpu_set(smp_processor_id(), non_isolated_cpus);
+       arch_init_sched_domains(cpu_online_mask);
+       cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
+       if (cpumask_empty(non_isolated_cpus))
+               cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
        mutex_unlock(&sched_domains_mutex);
        put_online_cpus();
  
        init_hrtick();
  
        /* Move init over to a non-isolated CPU */
-       if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
+       if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
                BUG();
        sched_init_granularity();
+       free_cpumask_var(non_isolated_cpus);
+       alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+       init_sched_rt_class();
  }
  #else
  void __init sched_init_smp(void)
@@@ -8334,6 -8367,15 +8367,15 @@@ void __init sched_init(void
         */
        current->sched_class = &fair_sched_class;
  
+       /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
+       alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+ #ifdef CONFIG_SMP
+ #ifdef CONFIG_NO_HZ
+       alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+ #endif
+       alloc_bootmem_cpumask_var(&cpu_isolated_map);
+ #endif /* SMP */
        scheduler_running = 1;
  }
  
@@@ -8492,7 -8534,7 +8534,7 @@@ stati
  int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
  {
        struct cfs_rq *cfs_rq;
-       struct sched_entity *se, *parent_se;
+       struct sched_entity *se;
        struct rq *rq;
        int i;
  
        for_each_possible_cpu(i) {
                rq = cpu_rq(i);
  
-               cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
-                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+               cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
+                                     GFP_KERNEL, cpu_to_node(i));
                if (!cfs_rq)
                        goto err;
  
-               se = kmalloc_node(sizeof(struct sched_entity),
-                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+               se = kzalloc_node(sizeof(struct sched_entity),
+                                 GFP_KERNEL, cpu_to_node(i));
                if (!se)
                        goto err;
  
-               parent_se = parent ? parent->se[i] : NULL;
-               init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
+               init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
        }
  
        return 1;
@@@ -8580,7 -8621,7 +8621,7 @@@ stati
  int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
  {
        struct rt_rq *rt_rq;
-       struct sched_rt_entity *rt_se, *parent_se;
+       struct sched_rt_entity *rt_se;
        struct rq *rq;
        int i;
  
        for_each_possible_cpu(i) {
                rq = cpu_rq(i);
  
-               rt_rq = kmalloc_node(sizeof(struct rt_rq),
-                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+               rt_rq = kzalloc_node(sizeof(struct rt_rq),
+                                    GFP_KERNEL, cpu_to_node(i));
                if (!rt_rq)
                        goto err;
  
-               rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
-                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+               rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
+                                    GFP_KERNEL, cpu_to_node(i));
                if (!rt_se)
                        goto err;
  
-               parent_se = parent ? parent->rt_se[i] : NULL;
-               init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
+               init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
        }
  
        return 1;
@@@ -9251,11 -9291,12 +9291,12 @@@ struct cgroup_subsys cpu_cgroup_subsys 
   * (balbir@in.ibm.com).
   */
  
- /* track cpu usage of a group of tasks */
+ /* track cpu usage of a group of tasks and its child groups */
  struct cpuacct {
        struct cgroup_subsys_state css;
        /* cpuusage holds pointer to a u64-type object on every cpu */
        u64 *cpuusage;
+       struct cpuacct *parent;
  };
  
  struct cgroup_subsys cpuacct_subsys;
@@@ -9289,6 -9330,9 +9330,9 @@@ static struct cgroup_subsys_state *cpua
                return ERR_PTR(-ENOMEM);
        }
  
+       if (cgrp->parent)
+               ca->parent = cgroup_ca(cgrp->parent);
        return &ca->css;
  }
  
@@@ -9368,14 -9412,16 +9412,16 @@@ static int cpuacct_populate(struct cgro
  static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
  {
        struct cpuacct *ca;
+       int cpu;
  
        if (!cpuacct_subsys.active)
                return;
  
+       cpu = task_cpu(tsk);
        ca = task_ca(tsk);
-       if (ca) {
-               u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
  
+       for (; ca; ca = ca->parent) {
+               u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
                *cpuusage += cputime;
        }
  }
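
/*
 * Illustrative sketch, not part of the patch: the hierarchical charging
 * walk that cpuacct_charge() now performs.  Time charged to a group is
 * also added to every ancestor via the new ->parent pointer, so a parent's
 * total covers its own tasks plus all of its children.  The per-CPU u64
 * counters of the real code are collapsed into a single counter here, and
 * the names are invented for the example.
 */
#include <stdint.h>

struct acct_group {
	uint64_t usage;
	struct acct_group *parent;	/* NULL for the root group */
};

static void charge(struct acct_group *grp, uint64_t cputime)
{
	for (; grp; grp = grp->parent)
		grp->usage += cputime;
}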
diff --combined kernel/sched_stats.h
@@@ -42,7 -42,8 +42,8 @@@ static int show_schedstat(struct seq_fi
                for_each_domain(cpu, sd) {
                        enum cpu_idle_type itype;
  
-                       cpumask_scnprintf(mask_str, mask_len, &sd->span);
+                       cpumask_scnprintf(mask_str, mask_len,
 -                                        *sched_domain_span(sd));
++                                        sched_domain_span(sd));
                        seq_printf(seq, "domain%d %s", dcount++, mask_str);
                        for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
                                        itype++) {
diff --combined kernel/trace/trace.c
@@@ -30,6 -30,7 +30,7 @@@
  #include <linux/gfp.h>
  #include <linux/fs.h>
  #include <linux/kprobes.h>
+ #include <linux/seq_file.h>
  #include <linux/writeback.h>
  
  #include <linux/stacktrace.h>
  unsigned long __read_mostly   tracing_max_latency = (cycle_t)ULONG_MAX;
  unsigned long __read_mostly   tracing_thresh;
  
+ /*
+  * We need to change this state when a selftest is running.
+  * A selftest will look into the ring-buffer to count the
+  * entries inserted during the selftest, although concurrent
+  * insertions into the ring-buffer, such as ftrace_printk, could occur
+  * at the same time, giving false positive or negative results.
+  */
+ static bool __read_mostly tracing_selftest_running;
+ /* For tracers that don't implement custom flags */
+ static struct tracer_opt dummy_tracer_opt[] = {
+       { }
+ };
+ static struct tracer_flags dummy_tracer_flags = {
+       .val = 0,
+       .opts = dummy_tracer_opt
+ };
+ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
+ {
+       return 0;
+ }
+ /*
+  * Kill all tracing for good (never come back).
+  * It is initialized to 1 but will turn to zero if the initialization
+  * of the tracer is successful. But that is the only place that sets
+  * this back to zero.
+  */
+ int tracing_disabled = 1;
  static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
  
  static inline void ftrace_disable_cpu(void)
@@@ -62,7 -95,36 +95,36 @@@ static cpumask_t __read_mostly              tracing
  #define for_each_tracing_cpu(cpu)     \
        for_each_cpu_mask(cpu, tracing_buffer_mask)
  
- static int tracing_disabled = 1;
+ /*
+  * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
+  *
+  * If there is an oops (or kernel panic) and the ftrace_dump_on_oops
+  * is set, then ftrace_dump is called. This will output the contents
+  * of the ftrace buffers to the console.  This is very useful for
+  * capturing traces that lead to crashes and outputting them to a
+  * serial console.
+  *
+  * It is off by default, but you can enable it either by specifying
+  * "ftrace_dump_on_oops" on the kernel command line, or by setting
+  * /proc/sys/kernel/ftrace_dump_on_oops to true.
+  */
+ int ftrace_dump_on_oops;
+ static int tracing_set_tracer(char *buf);
+ static int __init set_ftrace(char *str)
+ {
+       tracing_set_tracer(str);
+       return 1;
+ }
+ __setup("ftrace", set_ftrace);
+ static int __init set_ftrace_dump_on_oops(char *str)
+ {
+       ftrace_dump_on_oops = 1;
+       return 1;
+ }
+ __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
  
  long
  ns2usecs(cycle_t nsec)
@@@ -112,6 -174,19 +174,19 @@@ static DEFINE_PER_CPU(struct trace_arra
  /* tracer_enabled is used to toggle activation of a tracer */
  static int                    tracer_enabled = 1;
  
+ /**
+  * tracing_is_enabled - return tracer_enabled status
+  *
+  * This function is used by other tracers to know the status
+  * of the tracer_enabled flag.  Tracers may use this function
+  * to know if it should enable their features when starting
+  * up. See irqsoff tracer for an example (start_irqsoff_tracer).
+  */
+ int tracing_is_enabled(void)
+ {
+       return tracer_enabled;
+ }
  /* function tracing enabled */
  int                           ftrace_function_enabled;
  
@@@ -153,8 -228,9 +228,9 @@@ static DEFINE_MUTEX(trace_types_lock)
  /* trace_wait is a waitqueue for tasks blocked on trace_poll */
  static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
  
- /* trace_flags holds iter_ctrl options */
- unsigned long trace_flags = TRACE_ITER_PRINT_PARENT;
+ /* trace_flags holds trace_options default values */
+ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
+       TRACE_ITER_ANNOTATE;
  
  /**
   * trace_wake_up - wake up tasks waiting for trace input
@@@ -193,13 -269,6 +269,6 @@@ unsigned long nsecs_to_usecs(unsigned l
        return nsecs / 1000;
  }
  
- /*
-  * TRACE_ITER_SYM_MASK masks the options in trace_flags that
-  * control the output of kernel symbols.
-  */
- #define TRACE_ITER_SYM_MASK \
-       (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
  /* These must match the bit positions in trace_iterator_flags */
  static const char *trace_options[] = {
        "print-parent",
        "stacktrace",
        "sched-tree",
        "ftrace_printk",
+       "ftrace_preempt",
+       "branch",
+       "annotate",
+       "userstacktrace",
+       "sym-userobj",
        NULL
  };
  
@@@ -359,6 -433,28 +433,28 @@@ trace_seq_putmem_hex(struct trace_seq *
        return trace_seq_putmem(s, hex, j);
  }
  
+ static int
+ trace_seq_path(struct trace_seq *s, struct path *path)
+ {
+       unsigned char *p;
+       if (s->len >= (PAGE_SIZE - 1))
+               return 0;
+       p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
+       if (!IS_ERR(p)) {
+               p = mangle_path(s->buffer + s->len, p, "\n");
+               if (p) {
+                       s->len = p - s->buffer;
+                       return 1;
+               }
+       } else {
+               s->buffer[s->len++] = '?';
+               return 1;
+       }
+       return 0;
+ }
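
/*
 * Illustrative sketch, not part of the patch: the bounded "seq buffer"
 * append pattern that trace_seq_path() and the other trace_seq helpers
 * follow -- write at offset ->len into a fixed buffer, report failure when
 * the data will not fit, and advance ->len only on success.  seq_buf,
 * SEQ_SIZE and seq_buf_puts are invented names for the example.
 */
#include <string.h>

#define SEQ_SIZE 4096

struct seq_buf {
	char buffer[SEQ_SIZE];
	size_t len;
};

/* returns 1 if the whole string was appended, 0 if it would not fit */
static int seq_buf_puts(struct seq_buf *s, const char *str)
{
	size_t n = strlen(str);

	if (s->len + n >= SEQ_SIZE)	/* leave room; never overflow */
		return 0;
	memcpy(s->buffer + s->len, str, n);
	s->len += n;
	return 1;
}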
  static void
  trace_seq_reset(struct trace_seq *s)
  {
@@@ -470,7 -566,17 +566,17 @@@ int register_tracer(struct tracer *type
                return -1;
        }
  
+       /*
+        * When this gets called we hold the BKL which means that
+        * preemption is disabled. Various trace selftests however
+        * need to disable and enable preemption for successful tests.
+        * So we drop the BKL here and grab it after the tests again.
+        */
+       unlock_kernel();
        mutex_lock(&trace_types_lock);
+       tracing_selftest_running = true;
        for (t = trace_types; t; t = t->next) {
                if (strcmp(type->name, t->name) == 0) {
                        /* already found */
                }
        }
  
+       if (!type->set_flag)
+               type->set_flag = &dummy_set_flag;
+       if (!type->flags)
+               type->flags = &dummy_tracer_flags;
+       else
+               if (!type->flags->opts)
+                       type->flags->opts = dummy_tracer_opt;
  #ifdef CONFIG_FTRACE_STARTUP_TEST
        if (type->selftest) {
                struct tracer *saved_tracer = current_trace;
                struct trace_array *tr = &global_trace;
-               int saved_ctrl = tr->ctrl;
                int i;
                /*
                 * Run a selftest on this tracer.
                 * Here we reset the trace buffer, and set the current
                 * internal tracing to verify that everything is in order.
                 * If we fail, we do not register this tracer.
                 */
-               for_each_tracing_cpu(i) {
+               for_each_tracing_cpu(i)
                        tracing_reset(tr, i);
-               }
                current_trace = type;
-               tr->ctrl = 0;
                /* the test is responsible for initializing and enabling */
                pr_info("Testing tracer %s: ", type->name);
                ret = type->selftest(type, tr);
                /* the test is responsible for resetting too */
                current_trace = saved_tracer;
-               tr->ctrl = saved_ctrl;
                if (ret) {
                        printk(KERN_CONT "FAILED!\n");
                        goto out;
                }
                /* Only reset on passing, to avoid touching corrupted buffers */
-               for_each_tracing_cpu(i) {
+               for_each_tracing_cpu(i)
                        tracing_reset(tr, i);
-               }
                printk(KERN_CONT "PASSED\n");
        }
  #endif
                max_tracer_type_len = len;
  
   out:
+       tracing_selftest_running = false;
        mutex_unlock(&trace_types_lock);
+       lock_kernel();
  
        return ret;
  }
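
/*
 * Illustrative sketch, not part of the patch: the "fill in harmless
 * defaults" idiom register_tracer() uses above for tracers that supply no
 * ->set_flag or ->flags, so later code never has to test for NULL before
 * calling them.  struct plugin and the function names are invented.
 */
struct plugin {
	const char *name;
	int (*set_flag)(unsigned int bit, int set);
};

static int dummy_set_flag(unsigned int bit, int set)
{
	(void)bit;
	(void)set;
	return 0;		/* accept and ignore unknown flags */
}

static void plugin_register(struct plugin *p)
{
	if (!p->set_flag)
		p->set_flag = dummy_set_flag;
	/* ...link p into the plugin list; p->set_flag() is always callable */
}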
@@@ -581,6 -695,91 +695,91 @@@ static void trace_init_cmdlines(void
        cmdline_idx = 0;
  }
  
+ static int trace_stop_count;
+ static DEFINE_SPINLOCK(tracing_start_lock);
+ /**
+  * ftrace_off_permanent - disable all ftrace code permanently
+  *
+  * This should only be called when a serious anomaly has
+  * been detected.  This will turn off the function tracing,
+  * ring buffers, and other tracing utilities. It takes no
+  * locks and can be called from any context.
+  */
+ void ftrace_off_permanent(void)
+ {
+       tracing_disabled = 1;
+       ftrace_stop();
+       tracing_off_permanent();
+ }
+ /**
+  * tracing_start - quick start of the tracer
+  *
+  * If tracing is enabled but was stopped by tracing_stop,
+  * this will start the tracer back up.
+  */
+ void tracing_start(void)
+ {
+       struct ring_buffer *buffer;
+       unsigned long flags;
+       if (tracing_disabled)
+               return;
+       spin_lock_irqsave(&tracing_start_lock, flags);
+       if (--trace_stop_count)
+               goto out;
+       if (trace_stop_count < 0) {
+               /* Someone screwed up their debugging */
+               WARN_ON_ONCE(1);
+               trace_stop_count = 0;
+               goto out;
+       }
+       buffer = global_trace.buffer;
+       if (buffer)
+               ring_buffer_record_enable(buffer);
+       buffer = max_tr.buffer;
+       if (buffer)
+               ring_buffer_record_enable(buffer);
+       ftrace_start();
+  out:
+       spin_unlock_irqrestore(&tracing_start_lock, flags);
+ }
+ /**
+  * tracing_stop - quick stop of the tracer
+  *
+  * Light weight way to stop tracing. Use in conjunction with
+  * tracing_start.
+  */
+ void tracing_stop(void)
+ {
+       struct ring_buffer *buffer;
+       unsigned long flags;
+       ftrace_stop();
+       spin_lock_irqsave(&tracing_start_lock, flags);
+       if (trace_stop_count++)
+               goto out;
+       buffer = global_trace.buffer;
+       if (buffer)
+               ring_buffer_record_disable(buffer);
+       buffer = max_tr.buffer;
+       if (buffer)
+               ring_buffer_record_disable(buffer);
+  out:
+       spin_unlock_irqrestore(&tracing_start_lock, flags);
+ }
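
/*
 * Illustrative sketch, not part of the patch: the nesting counter that
 * pairs tracing_stop()/tracing_start() above.  Only the first stop really
 * disables the resource and only the matching last start re-enables it, so
 * nested callers compose safely.  The spinlock that serializes the counter
 * in the real code is omitted here for brevity; the names are invented.
 */
#include <stdio.h>

static int stop_count;

static void resource_stop(void)
{
	if (stop_count++ == 0)
		printf("disabling\n");	/* outermost stopper does the work */
}

static void resource_start(void)
{
	/* ignore unbalanced starts, like the trace_stop_count < 0 check */
	if (stop_count > 0 && --stop_count == 0)
		printf("enabling\n");	/* outermost starter re-enables */
}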
  void trace_stop_cmdline_recording(void);
  
  static void trace_save_cmdline(struct task_struct *tsk)
        spin_unlock(&trace_cmdline_lock);
  }
  
- static char *trace_find_cmdline(int pid)
+ char *trace_find_cmdline(int pid)
  {
        char *cmdline = "<...>";
        unsigned map;
@@@ -655,6 -854,7 +854,7 @@@ tracing_generic_entry_update(struct tra
  
        entry->preempt_count            = pc & 0xff;
        entry->pid                      = (tsk) ? tsk->pid : 0;
+       entry->tgid                     = (tsk) ? tsk->tgid : 0;
        entry->flags =
  #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
                (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@@ -691,6 -891,56 +891,56 @@@ trace_function(struct trace_array *tr, 
        ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
  }
  
+ #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ static void __trace_graph_entry(struct trace_array *tr,
+                               struct trace_array_cpu *data,
+                               struct ftrace_graph_ent *trace,
+                               unsigned long flags,
+                               int pc)
+ {
+       struct ring_buffer_event *event;
+       struct ftrace_graph_ent_entry *entry;
+       unsigned long irq_flags;
+       if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+               return;
+       event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
+                                        &irq_flags);
+       if (!event)
+               return;
+       entry   = ring_buffer_event_data(event);
+       tracing_generic_entry_update(&entry->ent, flags, pc);
+       entry->ent.type                 = TRACE_GRAPH_ENT;
+       entry->graph_ent                        = *trace;
+       ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
+ }
+ static void __trace_graph_return(struct trace_array *tr,
+                               struct trace_array_cpu *data,
+                               struct ftrace_graph_ret *trace,
+                               unsigned long flags,
+                               int pc)
+ {
+       struct ring_buffer_event *event;
+       struct ftrace_graph_ret_entry *entry;
+       unsigned long irq_flags;
+       if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+               return;
+       event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
+                                        &irq_flags);
+       if (!event)
+               return;
+       entry   = ring_buffer_event_data(event);
+       tracing_generic_entry_update(&entry->ent, flags, pc);
+       entry->ent.type                 = TRACE_GRAPH_RET;
+       entry->ret                              = *trace;
+       ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
+ }
+ #endif
  void
  ftrace(struct trace_array *tr, struct trace_array_cpu *data,
         unsigned long ip, unsigned long parent_ip, unsigned long flags,
@@@ -742,6 -992,46 +992,46 @@@ void __trace_stack(struct trace_array *
        ftrace_trace_stack(tr, data, flags, skip, preempt_count());
  }
  
+ static void ftrace_trace_userstack(struct trace_array *tr,
+                  struct trace_array_cpu *data,
+                  unsigned long flags, int pc)
+ {
+ #ifdef CONFIG_STACKTRACE
+       struct ring_buffer_event *event;
+       struct userstack_entry *entry;
+       struct stack_trace trace;
+       unsigned long irq_flags;
+       if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
+               return;
+       event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+                                        &irq_flags);
+       if (!event)
+               return;
+       entry   = ring_buffer_event_data(event);
+       tracing_generic_entry_update(&entry->ent, flags, pc);
+       entry->ent.type         = TRACE_USER_STACK;
+       memset(&entry->caller, 0, sizeof(entry->caller));
+       trace.nr_entries        = 0;
+       trace.max_entries       = FTRACE_STACK_ENTRIES;
+       trace.skip              = 0;
+       trace.entries           = entry->caller;
+       save_stack_trace_user(&trace);
+       ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+ #endif
+ }
+ void __trace_userstack(struct trace_array *tr,
+                  struct trace_array_cpu *data,
+                  unsigned long flags)
+ {
+       ftrace_trace_userstack(tr, data, flags, preempt_count());
+ }
  static void
  ftrace_trace_special(void *__tr, void *__data,
                     unsigned long arg1, unsigned long arg2, unsigned long arg3,
        entry->arg3                     = arg3;
        ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
        ftrace_trace_stack(tr, data, irq_flags, 4, pc);
+       ftrace_trace_userstack(tr, data, irq_flags, pc);
  
        trace_wake_up();
  }
@@@ -803,6 -1094,7 +1094,7 @@@ tracing_sched_switch_trace(struct trace
        entry->next_cpu = task_cpu(next);
        ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
        ftrace_trace_stack(tr, data, flags, 5, pc);
+       ftrace_trace_userstack(tr, data, flags, pc);
  }
  
  void
@@@ -832,6 -1124,7 +1124,7 @@@ tracing_sched_wakeup_trace(struct trace
        entry->next_cpu                 = task_cpu(wakee);
        ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
        ftrace_trace_stack(tr, data, flags, 6, pc);
+       ftrace_trace_userstack(tr, data, flags, pc);
  
        trace_wake_up();
  }
@@@ -841,26 -1134,28 +1134,28 @@@ ftrace_special(unsigned long arg1, unsi
  {
        struct trace_array *tr = &global_trace;
        struct trace_array_cpu *data;
+       unsigned long flags;
        int cpu;
        int pc;
  
-       if (tracing_disabled || !tr->ctrl)
+       if (tracing_disabled)
                return;
  
        pc = preempt_count();
-       preempt_disable_notrace();
+       local_irq_save(flags);
        cpu = raw_smp_processor_id();
        data = tr->data[cpu];
  
-       if (likely(!atomic_read(&data->disabled)))
+       if (likely(atomic_inc_return(&data->disabled) == 1))
                ftrace_trace_special(tr, data, arg1, arg2, arg3, pc);
  
-       preempt_enable_notrace();
+       atomic_dec(&data->disabled);
+       local_irq_restore(flags);
  }
  
  #ifdef CONFIG_FUNCTION_TRACER
  static void
- function_trace_call(unsigned long ip, unsigned long parent_ip)
+ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
  {
        struct trace_array *tr = &global_trace;
        struct trace_array_cpu *data;
                return;
  
        pc = preempt_count();
-       resched = need_resched();
-       preempt_disable_notrace();
+       resched = ftrace_preempt_disable();
        local_save_flags(flags);
        cpu = raw_smp_processor_id();
        data = tr->data[cpu];
                trace_function(tr, data, ip, parent_ip, flags, pc);
  
        atomic_dec(&data->disabled);
-       if (resched)
-               preempt_enable_no_resched_notrace();
-       else
-               preempt_enable_notrace();
+       ftrace_preempt_enable(resched);
+ }
+
+ static void
+ function_trace_call(unsigned long ip, unsigned long parent_ip)
+ {
+       struct trace_array *tr = &global_trace;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       long disabled;
+       int cpu;
+       int pc;
+       if (unlikely(!ftrace_function_enabled))
+               return;
+       /*
+        * Need to use raw, since this must be called before the
+        * recursive protection is performed.
+        */
+       local_irq_save(flags);
+       cpu = raw_smp_processor_id();
+       data = tr->data[cpu];
+       disabled = atomic_inc_return(&data->disabled);
+       if (likely(disabled == 1)) {
+               pc = preempt_count();
+               trace_function(tr, data, ip, parent_ip, flags, pc);
+       }
+       atomic_dec(&data->disabled);
+       local_irq_restore(flags);
+ }
+
+ #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ int trace_graph_entry(struct ftrace_graph_ent *trace)
+ {
+       struct trace_array *tr = &global_trace;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       long disabled;
+       int cpu;
+       int pc;
+       if (!ftrace_trace_task(current))
+               return 0;
+       if (!ftrace_graph_addr(trace->func))
+               return 0;
+       local_irq_save(flags);
+       cpu = raw_smp_processor_id();
+       data = tr->data[cpu];
+       disabled = atomic_inc_return(&data->disabled);
+       if (likely(disabled == 1)) {
+               pc = preempt_count();
+               __trace_graph_entry(tr, data, trace, flags, pc);
+       }
+       /* Only do the atomic if it is not already set */
+       if (!test_tsk_trace_graph(current))
+               set_tsk_trace_graph(current);
+       atomic_dec(&data->disabled);
+       local_irq_restore(flags);
+       return 1;
+ }
+
+ void trace_graph_return(struct ftrace_graph_ret *trace)
+ {
+       struct trace_array *tr = &global_trace;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       long disabled;
+       int cpu;
+       int pc;
+       local_irq_save(flags);
+       cpu = raw_smp_processor_id();
+       data = tr->data[cpu];
+       disabled = atomic_inc_return(&data->disabled);
+       if (likely(disabled == 1)) {
+               pc = preempt_count();
+               __trace_graph_return(tr, data, trace, flags, pc);
+       }
+       if (!trace->depth)
+               clear_tsk_trace_graph(current);
+       atomic_dec(&data->disabled);
+       local_irq_restore(flags);
  }
+ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
  
  static struct ftrace_ops trace_ops __read_mostly =
  {
  void tracing_start_function_trace(void)
  {
        ftrace_function_enabled = 0;
+       if (trace_flags & TRACE_ITER_PREEMPTONLY)
+               trace_ops.func = function_trace_call_preempt_only;
+       else
+               trace_ops.func = function_trace_call;
        register_ftrace_function(&trace_ops);
-       if (tracer_enabled)
-               ftrace_function_enabled = 1;
+       ftrace_function_enabled = 1;
  }
  
  void tracing_stop_function_trace(void)
  
  enum trace_file_type {
        TRACE_FILE_LAT_FMT      = 1,
+       TRACE_FILE_ANNOTATE     = 2,
  };
  
  static void trace_iterator_increment(struct trace_iterator *iter, int cpu)
@@@ -1047,10 -1432,6 +1432,6 @@@ static void *s_start(struct seq_file *m
  
        atomic_inc(&trace_record_cmdline_disabled);
  
-       /* let the tracer grab locks here if needed */
-       if (current_trace->start)
-               current_trace->start(iter);
        if (*pos != iter->pos) {
                iter->ent = NULL;
                iter->cpu = 0;
  
  static void s_stop(struct seq_file *m, void *p)
  {
-       struct trace_iterator *iter = m->private;
        atomic_dec(&trace_record_cmdline_disabled);
-       /* let the tracer release locks here if needed */
-       if (current_trace && current_trace == iter->trace && iter->trace->stop)
-               iter->trace->stop(iter);
        mutex_unlock(&trace_types_lock);
  }
  
@@@ -1143,7 -1517,7 +1517,7 @@@ seq_print_sym_offset(struct trace_seq *
  # define IP_FMT "%016lx"
  #endif
  
- static int
+ int
  seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
  {
        int ret;
        return ret;
  }
  
+ static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
+                                   unsigned long ip, unsigned long sym_flags)
+ {
+       struct file *file = NULL;
+       unsigned long vmstart = 0;
+       int ret = 1;
+       if (mm) {
+               const struct vm_area_struct *vma;
+               down_read(&mm->mmap_sem);
+               vma = find_vma(mm, ip);
+               if (vma) {
+                       file = vma->vm_file;
+                       vmstart = vma->vm_start;
+               }
+               if (file) {
+                       ret = trace_seq_path(s, &file->f_path);
+                       if (ret)
+                               ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart);
+               }
+               up_read(&mm->mmap_sem);
+       }
+       if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
+               ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
+       return ret;
+ }
+
+ static int
+ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
+                     unsigned long sym_flags)
+ {
+       struct mm_struct *mm = NULL;
+       int ret = 1;
+       unsigned int i;
+       if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
+               struct task_struct *task;
+               /*
+                * we do the lookup on the thread group leader,
+                * since individual threads might have already quit!
+                */
+               rcu_read_lock();
+               task = find_task_by_vpid(entry->ent.tgid);
+               if (task)
+                       mm = get_task_mm(task);
+               rcu_read_unlock();
+       }
+       for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+               unsigned long ip = entry->caller[i];
+               if (ip == ULONG_MAX || !ret)
+                       break;
+               if (i && ret)
+                       ret = trace_seq_puts(s, " <- ");
+               if (!ip) {
+                       if (ret)
+                               ret = trace_seq_puts(s, "??");
+                       continue;
+               }
+               if (!ret)
+                       break;
+               if (ret)
+                       ret = seq_print_user_ip(s, mm, ip, sym_flags);
+       }
+       if (mm)
+               mmput(mm);
+       return ret;
+ }
+
  static void print_lat_help_header(struct seq_file *m)
  {
        seq_puts(m, "#                  _------=> CPU#            \n");
@@@ -1338,6 -1784,23 +1784,23 @@@ void trace_seq_print_cont(struct trace_
                trace_seq_putc(s, '\n');
  }
  
+ static void test_cpu_buff_start(struct trace_iterator *iter)
+ {
+       struct trace_seq *s = &iter->seq;
+       if (!(trace_flags & TRACE_ITER_ANNOTATE))
+               return;
+       if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
+               return;
+       if (cpu_isset(iter->cpu, iter->started))
+               return;
+       cpu_set(iter->cpu, iter->started);
+       trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu);
+ }
+
  static enum print_line_t
  print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
  {
        if (entry->type == TRACE_CONT)
                return TRACE_TYPE_HANDLED;
  
+       test_cpu_buff_start(iter);
        next_entry = find_next_entry(iter, NULL, &next_ts);
        if (!next_entry)
                next_ts = iter->ts;
                        trace_seq_print_cont(s, iter);
                break;
        }
+       case TRACE_BRANCH: {
+               struct trace_branch *field;
+               trace_assign_type(field, entry);
+               trace_seq_printf(s, "[%s] %s:%s:%d\n",
+                                field->correct ? "  ok  " : " MISS ",
+                                field->func,
+                                field->file,
+                                field->line);
+               break;
+       }
+       case TRACE_USER_STACK: {
+               struct userstack_entry *field;
+               trace_assign_type(field, entry);
+               seq_print_userip_objs(field, s, sym_flags);
+               trace_seq_putc(s, '\n');
+               break;
+       }
        default:
                trace_seq_printf(s, "Unknown type %d\n", entry->type);
        }
@@@ -1472,6 -1958,8 +1958,8 @@@ static enum print_line_t print_trace_fm
        if (entry->type == TRACE_CONT)
                return TRACE_TYPE_HANDLED;
  
+       test_cpu_buff_start(iter);
        comm = trace_find_cmdline(iter->ent->pid);
  
        t = ns2usecs(iter->ts);
                        trace_seq_print_cont(s, iter);
                break;
        }
+       case TRACE_GRAPH_RET: {
+               return print_graph_function(iter);
+       }
+       case TRACE_GRAPH_ENT: {
+               return print_graph_function(iter);
+       }
+       case TRACE_BRANCH: {
+               struct trace_branch *field;
+               trace_assign_type(field, entry);
+               trace_seq_printf(s, "[%s] %s:%s:%d\n",
+                                field->correct ? "  ok  " : " MISS ",
+                                field->func,
+                                field->file,
+                                field->line);
+               break;
+       }
+       case TRACE_USER_STACK: {
+               struct userstack_entry *field;
+               trace_assign_type(field, entry);
+               ret = seq_print_userip_objs(field, s, sym_flags);
+               if (!ret)
+                       return TRACE_TYPE_PARTIAL_LINE;
+               ret = trace_seq_putc(s, '\n');
+               if (!ret)
+                       return TRACE_TYPE_PARTIAL_LINE;
+               break;
+       }
        }
        return TRACE_TYPE_HANDLED;
  }
@@@ -1640,6 -2159,7 +2159,7 @@@ static enum print_line_t print_raw_fmt(
                break;
        }
        case TRACE_SPECIAL:
+       case TRACE_USER_STACK:
        case TRACE_STACK: {
                struct special_entry *field;
  
@@@ -1728,6 -2248,7 +2248,7 @@@ static enum print_line_t print_hex_fmt(
                break;
        }
        case TRACE_SPECIAL:
+       case TRACE_USER_STACK:
        case TRACE_STACK: {
                struct special_entry *field;
  
@@@ -1782,6 -2303,7 +2303,7 @@@ static enum print_line_t print_bin_fmt(
                break;
        }
        case TRACE_SPECIAL:
+       case TRACE_USER_STACK:
        case TRACE_STACK: {
                struct special_entry *field;
  
@@@ -1847,7 -2369,9 +2369,9 @@@ static int s_show(struct seq_file *m, v
                        seq_printf(m, "# tracer: %s\n", iter->trace->name);
                        seq_puts(m, "#\n");
                }
-               if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
+               if (iter->trace && iter->trace->print_header)
+                       iter->trace->print_header(m);
+               else if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
                        /* print nothing if the buffers are empty */
                        if (trace_empty(iter))
                                return 0;
@@@ -1899,6 -2423,15 +2423,15 @@@ __tracing_open(struct inode *inode, str
        iter->trace = current_trace;
        iter->pos = -1;
  
+       /* Notify the tracer early; before we stop tracing. */
+       if (iter->trace && iter->trace->open)
+                       iter->trace->open(iter);
+       /* Annotate start of buffers if we had overruns */
+       if (ring_buffer_overruns(iter->tr->buffer))
+               iter->iter_flags |= TRACE_FILE_ANNOTATE;
        for_each_tracing_cpu(cpu) {
  
                iter->buffer_iter[cpu] =
        m->private = iter;
  
        /* stop the trace while dumping */
-       if (iter->tr->ctrl) {
-               tracer_enabled = 0;
-               ftrace_function_enabled = 0;
-       }
-       if (iter->trace && iter->trace->open)
-                       iter->trace->open(iter);
+       tracing_stop();
  
        mutex_unlock(&trace_types_lock);
  
@@@ -1966,14 -2493,7 +2493,7 @@@ int tracing_release(struct inode *inode
                iter->trace->close(iter);
  
        /* reenable tracing if it was previously enabled */
-       if (iter->tr->ctrl) {
-               tracer_enabled = 1;
-               /*
-                * It is safe to enable function tracing even if it
-                * isn't used
-                */
-               ftrace_function_enabled = 1;
-       }
+       tracing_start();
        mutex_unlock(&trace_types_lock);
  
        seq_release(inode, file);
@@@ -2126,7 -2646,7 +2646,7 @@@ tracing_cpumask_read(struct file *filp
  
        mutex_lock(&tracing_cpumask_update_lock);
  
 -      len = cpumask_scnprintf(mask_str, count, tracing_cpumask);
 +      len = cpumask_scnprintf(mask_str, count, &tracing_cpumask);
        if (count - len < 2) {
                count = -EINVAL;
                goto out_err;
@@@ -2147,11 -2667,11 +2667,11 @@@ tracing_cpumask_write(struct file *filp
        int err, cpu;
  
        mutex_lock(&tracing_cpumask_update_lock);
 -      err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
 +      err = cpumask_parse_user(ubuf, count, &tracing_cpumask_new);
        if (err)
                goto err_unlock;
  
-       raw_local_irq_disable();
+       local_irq_disable();
        __raw_spin_lock(&ftrace_max_lock);
        for_each_tracing_cpu(cpu) {
                /*
                }
        }
        __raw_spin_unlock(&ftrace_max_lock);
-       raw_local_irq_enable();
+       local_irq_enable();
  
        tracing_cpumask = tracing_cpumask_new;
  
@@@ -2189,13 -2709,16 +2709,16 @@@ static struct file_operations tracing_c
  };
  
  static ssize_t
- tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
+ tracing_trace_options_read(struct file *filp, char __user *ubuf,
                       size_t cnt, loff_t *ppos)
  {
+       int i;
        char *buf;
        int r = 0;
        int len = 0;
-       int i;
+       u32 tracer_flags = current_trace->flags->val;
+       struct tracer_opt *trace_opts = current_trace->flags->opts;
  
        /* calculate max size */
        for (i = 0; trace_options[i]; i++) {
                len += 3; /* "no" and space */
        }
  
+       /*
+        * Increase the size with the names of options specific
+        * to the current tracer.
+        */
+       for (i = 0; trace_opts[i].name; i++) {
+               len += strlen(trace_opts[i].name);
+               len += 3; /* "no" and space */
+       }
        /* +2 for \n and \0 */
        buf = kmalloc(len + 2, GFP_KERNEL);
        if (!buf)
                        r += sprintf(buf + r, "no%s ", trace_options[i]);
        }
  
+       for (i = 0; trace_opts[i].name; i++) {
+               if (tracer_flags & trace_opts[i].bit)
+                       r += sprintf(buf + r, "%s ",
+                               trace_opts[i].name);
+               else
+                       r += sprintf(buf + r, "no%s ",
+                               trace_opts[i].name);
+       }
        r += sprintf(buf + r, "\n");
        WARN_ON(r >= len + 2);
  
        return r;
  }
  
+ /* Try to assign a tracer specific option */
+ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
+ {
+       struct tracer_flags *trace_flags = trace->flags;
+       struct tracer_opt *opts = NULL;
+       int ret = 0, i = 0;
+       int len;
+       for (i = 0; trace_flags->opts[i].name; i++) {
+               opts = &trace_flags->opts[i];
+               len = strlen(opts->name);
+               if (strncmp(cmp, opts->name, len) == 0) {
+                       ret = trace->set_flag(trace_flags->val,
+                               opts->bit, !neg);
+                       break;
+               }
+       }
+       /* Not found */
+       if (!trace_flags->opts[i].name)
+               return -EINVAL;
+       /* Refused to handle */
+       if (ret)
+               return ret;
+       if (neg)
+               trace_flags->val &= ~opts->bit;
+       else
+               trace_flags->val |= opts->bit;
+       return 0;
+ }
+
  static ssize_t
- tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
+ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
                        size_t cnt, loff_t *ppos)
  {
        char buf[64];
        char *cmp = buf;
        int neg = 0;
+       int ret;
        int i;
  
        if (cnt >= sizeof(buf))
                        break;
                }
        }
-       /*
-        * If no option could be set, return an error:
-        */
-       if (!trace_options[i])
-               return -EINVAL;
+       /* If no option could be set, test the specific tracer options */
+       if (!trace_options[i]) {
+               ret = set_tracer_option(current_trace, cmp, neg);
+               if (ret)
+                       return ret;
+       }
  
        filp->f_pos += cnt;
  
  
  static struct file_operations tracing_iter_fops = {
        .open           = tracing_open_generic,
-       .read           = tracing_iter_ctrl_read,
-       .write          = tracing_iter_ctrl_write,
+       .read           = tracing_trace_options_read,
+       .write          = tracing_trace_options_write,
  };
  
  static const char readme_msg[] =
        "# echo sched_switch > /debug/tracing/current_tracer\n"
        "# cat /debug/tracing/current_tracer\n"
        "sched_switch\n"
-       "# cat /debug/tracing/iter_ctrl\n"
+       "# cat /debug/tracing/trace_options\n"
        "noprint-parent nosym-offset nosym-addr noverbose\n"
-       "# echo print-parent > /debug/tracing/iter_ctrl\n"
+       "# echo print-parent > /debug/tracing/trace_options\n"
        "# echo 1 > /debug/tracing/tracing_enabled\n"
        "# cat /debug/tracing/trace > /tmp/trace.txt\n"
        "echo 0 > /debug/tracing/tracing_enabled\n"
@@@ -2311,11 -2889,10 +2889,10 @@@ static ssize_
  tracing_ctrl_read(struct file *filp, char __user *ubuf,
                  size_t cnt, loff_t *ppos)
  {
        char buf[64];
        int r;
  
-       r = sprintf(buf, "%ld\n", tr->ctrl);
+       r = sprintf(buf, "%u\n", tracer_enabled);
        return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
  }
  
@@@ -2343,16 -2920,18 +2920,18 @@@ tracing_ctrl_write(struct file *filp, c
        val = !!val;
  
        mutex_lock(&trace_types_lock);
-       if (tr->ctrl ^ val) {
-               if (val)
+       if (tracer_enabled ^ val) {
+               if (val) {
                        tracer_enabled = 1;
-               else
+                       if (current_trace->start)
+                               current_trace->start(tr);
+                       tracing_start();
+               } else {
                        tracer_enabled = 0;
-               tr->ctrl = val;
-               if (current_trace && current_trace->ctrl_update)
-                       current_trace->ctrl_update(tr);
+                       tracing_stop();
+                       if (current_trace->stop)
+                               current_trace->stop(tr);
+               }
        }
        mutex_unlock(&trace_types_lock);
  
@@@ -2378,29 -2957,11 +2957,11 @@@ tracing_set_trace_read(struct file *fil
        return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
  }
  
- static ssize_t
- tracing_set_trace_write(struct file *filp, const char __user *ubuf,
-                       size_t cnt, loff_t *ppos)
+ static int tracing_set_tracer(char *buf)
  {
        struct trace_array *tr = &global_trace;
        struct tracer *t;
-       char buf[max_tracer_type_len+1];
-       int i;
-       size_t ret;
-       ret = cnt;
-       if (cnt > max_tracer_type_len)
-               cnt = max_tracer_type_len;
-       if (copy_from_user(&buf, ubuf, cnt))
-               return -EFAULT;
-       buf[cnt] = 0;
-       /* strip ending whitespace. */
-       for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
-               buf[i] = 0;
+       int ret = 0;
  
        mutex_lock(&trace_types_lock);
        for (t = trace_types; t; t = t->next) {
        if (t == current_trace)
                goto out;
  
+       trace_branch_disable();
        if (current_trace && current_trace->reset)
                current_trace->reset(tr);
  
        current_trace = t;
-       if (t->init)
-               t->init(tr);
+       if (t->init) {
+               ret = t->init(tr);
+               if (ret)
+                       goto out;
+       }
  
+       trace_branch_enable(tr);
   out:
        mutex_unlock(&trace_types_lock);
  
-       if (ret > 0)
-               filp->f_pos += ret;
+       return ret;
+ }
+
+ static ssize_t
+ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
+                       size_t cnt, loff_t *ppos)
+ {
+       char buf[max_tracer_type_len+1];
+       int i;
+       size_t ret;
+       int err;
+       ret = cnt;
+       if (cnt > max_tracer_type_len)
+               cnt = max_tracer_type_len;
+       if (copy_from_user(&buf, ubuf, cnt))
+               return -EFAULT;
+       buf[cnt] = 0;
+       /* strip ending whitespace. */
+       for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
+               buf[i] = 0;
+       err = tracing_set_tracer(buf);
+       if (err)
+               return err;
+       filp->f_pos += ret;
  
        return ret;
  }
@@@ -2492,6 -3087,10 +3087,10 @@@ static int tracing_open_pipe(struct ino
                return -ENOMEM;
  
        mutex_lock(&trace_types_lock);
+       /* trace pipe does not show start of buffer */
+       cpus_setall(iter->started);
        iter->tr = &global_trace;
        iter->trace = current_trace;
        filp->private_data = iter;
@@@ -2667,7 -3266,7 +3266,7 @@@ tracing_entries_read(struct file *filp
        char buf[64];
        int r;
  
-       r = sprintf(buf, "%lu\n", tr->entries);
+       r = sprintf(buf, "%lu\n", tr->entries >> 10);
        return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
  }
  
@@@ -2678,7 -3277,6 +3277,6 @@@ tracing_entries_write(struct file *filp
        unsigned long val;
        char buf[64];
        int ret, cpu;
-       struct trace_array *tr = filp->private_data;
  
        if (cnt >= sizeof(buf))
                return -EINVAL;
  
        mutex_lock(&trace_types_lock);
  
-       if (tr->ctrl) {
-               cnt = -EBUSY;
-               pr_info("ftrace: please disable tracing"
-                       " before modifying buffer size\n");
-               goto out;
-       }
+       tracing_stop();
  
        /* disable all cpu buffers */
        for_each_tracing_cpu(cpu) {
                        atomic_inc(&max_tr.data[cpu]->disabled);
        }
  
+       /* value is in KB */
+       val <<= 10;
        if (val != global_trace.entries) {
                ret = ring_buffer_resize(global_trace.buffer, val);
                if (ret < 0) {
                        atomic_dec(&max_tr.data[cpu]->disabled);
        }
  
+       tracing_start();
        max_tr.entries = global_trace.entries;
        mutex_unlock(&trace_types_lock);
  
@@@ -2762,7 -3359,7 +3359,7 @@@ static int mark_printk(const char *fmt
        int ret;
        va_list args;
        va_start(args, fmt);
-       ret = trace_vprintk(0, fmt, args);
+       ret = trace_vprintk(0, -1, fmt, args);
        va_end(args);
        return ret;
  }
@@@ -2773,9 -3370,8 +3370,8 @@@ tracing_mark_write(struct file *filp, c
  {
        char *buf;
        char *end;
-       struct trace_array *tr = &global_trace;
  
-       if (!tr->ctrl || tracing_disabled)
+       if (tracing_disabled)
                return -EINVAL;
  
        if (cnt > TRACE_BUF_SIZE)
@@@ -2841,22 -3437,38 +3437,38 @@@ static struct file_operations tracing_m
  
  #ifdef CONFIG_DYNAMIC_FTRACE
  
+ int __weak ftrace_arch_read_dyn_info(char *buf, int size)
+ {
+       return 0;
+ }
+
  static ssize_t
- tracing_read_long(struct file *filp, char __user *ubuf,
+ tracing_read_dyn_info(struct file *filp, char __user *ubuf,
                  size_t cnt, loff_t *ppos)
  {
+       static char ftrace_dyn_info_buffer[1024];
+       static DEFINE_MUTEX(dyn_info_mutex);
        unsigned long *p = filp->private_data;
-       char buf[64];
+       char *buf = ftrace_dyn_info_buffer;
+       int size = ARRAY_SIZE(ftrace_dyn_info_buffer);
        int r;
  
-       r = sprintf(buf, "%ld\n", *p);
+       mutex_lock(&dyn_info_mutex);
+       r = sprintf(buf, "%ld ", *p);
  
-       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+       r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r);
+       buf[r++] = '\n';
+       r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+       mutex_unlock(&dyn_info_mutex);
+       return r;
  }
  
- static struct file_operations tracing_read_long_fops = {
+ static struct file_operations tracing_dyn_info_fops = {
        .open           = tracing_open_generic,
-       .read           = tracing_read_long,
+       .read           = tracing_read_dyn_info,
  };
  #endif
  
@@@ -2897,10 -3509,10 +3509,10 @@@ static __init int tracer_init_debugfs(v
        if (!entry)
                pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
  
-       entry = debugfs_create_file("iter_ctrl", 0644, d_tracer,
+       entry = debugfs_create_file("trace_options", 0644, d_tracer,
                                    NULL, &tracing_iter_fops);
        if (!entry)
-               pr_warning("Could not create debugfs 'iter_ctrl' entry\n");
+               pr_warning("Could not create debugfs 'trace_options' entry\n");
  
        entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
                                    NULL, &tracing_cpumask_fops);
                pr_warning("Could not create debugfs "
                           "'trace_pipe' entry\n");
  
-       entry = debugfs_create_file("trace_entries", 0644, d_tracer,
+       entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer,
                                    &global_trace, &tracing_entries_fops);
        if (!entry)
                pr_warning("Could not create debugfs "
-                          "'trace_entries' entry\n");
+                          "'buffer_size_kb' entry\n");
  
        entry = debugfs_create_file("trace_marker", 0220, d_tracer,
                                    NULL, &tracing_mark_fops);
  #ifdef CONFIG_DYNAMIC_FTRACE
        entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
                                    &ftrace_update_tot_cnt,
-                                   &tracing_read_long_fops);
+                                   &tracing_dyn_info_fops);
        if (!entry)
                pr_warning("Could not create debugfs "
                           "'dyn_ftrace_total_info' entry\n");
        return 0;
  }
  
- int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
+ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
  {
        static DEFINE_SPINLOCK(trace_buf_lock);
        static char trace_buf[TRACE_BUF_SIZE];
        struct ring_buffer_event *event;
        struct trace_array *tr = &global_trace;
        struct trace_array_cpu *data;
-       struct print_entry *entry;
-       unsigned long flags, irq_flags;
        int cpu, len = 0, size, pc;
+       struct print_entry *entry;
+       unsigned long irq_flags;
  
-       if (!tr->ctrl || tracing_disabled)
+       if (tracing_disabled || tracing_selftest_running)
                return 0;
  
        pc = preempt_count();
        if (unlikely(atomic_read(&data->disabled)))
                goto out;
  
-       spin_lock_irqsave(&trace_buf_lock, flags);
+       pause_graph_tracing();
+       spin_lock_irqsave(&trace_buf_lock, irq_flags);
        len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
  
        len = min(len, TRACE_BUF_SIZE-1);
        if (!event)
                goto out_unlock;
        entry = ring_buffer_event_data(event);
-       tracing_generic_entry_update(&entry->ent, flags, pc);
+       tracing_generic_entry_update(&entry->ent, irq_flags, pc);
        entry->ent.type                 = TRACE_PRINT;
        entry->ip                       = ip;
+       entry->depth                    = depth;
  
        memcpy(&entry->buf, trace_buf, len);
        entry->buf[len] = 0;
        ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
  
   out_unlock:
-       spin_unlock_irqrestore(&trace_buf_lock, flags);
+       spin_unlock_irqrestore(&trace_buf_lock, irq_flags);
+       unpause_graph_tracing();
   out:
        preempt_enable_notrace();
  
@@@ -3037,7 -3651,7 +3651,7 @@@ int __ftrace_printk(unsigned long ip, c
                return 0;
  
        va_start(ap, fmt);
-       ret = trace_vprintk(ip, fmt, ap);
+       ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
        va_end(ap);
        return ret;
  }
@@@ -3046,7 -3660,8 +3660,8 @@@ EXPORT_SYMBOL_GPL(__ftrace_printk)
  static int trace_panic_handler(struct notifier_block *this,
                               unsigned long event, void *unused)
  {
-       ftrace_dump();
+       if (ftrace_dump_on_oops)
+               ftrace_dump();
        return NOTIFY_OK;
  }
  
@@@ -3062,7 -3677,8 +3677,8 @@@ static int trace_die_handler(struct not
  {
        switch (val) {
        case DIE_OOPS:
-               ftrace_dump();
+               if (ftrace_dump_on_oops)
+                       ftrace_dump();
                break;
        default:
                break;
@@@ -3103,7 -3719,6 +3719,6 @@@ trace_printk_seq(struct trace_seq *s
        trace_seq_reset(s);
  }
  
  void ftrace_dump(void)
  {
        static DEFINE_SPINLOCK(ftrace_dump_lock);
                atomic_inc(&global_trace.data[cpu]->disabled);
        }
  
+       /* don't look at user memory in panic mode */
+       trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
        printk(KERN_TRACE "Dumping ftrace buffer:\n");
  
        iter.tr = &global_trace;
@@@ -3221,7 -3839,6 +3839,6 @@@ __init static int tracer_alloc_buffers(
  #endif
  
        /* All seems OK, enable tracing */
-       global_trace.ctrl = tracer_enabled;
        tracing_disabled = 0;
  
        atomic_notifier_chain_register(&panic_notifier_list,