Merge ../linux-2.6-x86
author Rusty Russell <rusty@rustcorp.com.au>
Sat, 13 Dec 2008 11:25:51 +0000 (21:55 +1030)
committer Rusty Russell <rusty@rustcorp.com.au>
Sat, 13 Dec 2008 11:25:51 +0000 (21:55 +1030)
Conflicts:

	arch/x86/kernel/io_apic.c
	kernel/sched.c
	kernel/sched_stats.h

15 files changed:
arch/m32r/Kconfig
arch/x86/kernel/io_apic.c
arch/x86/kernel/irq_32.c
arch/x86/kernel/irq_64.c
arch/x86/kernel/smpboot.c
drivers/xen/events.c
include/linux/interrupt.h
include/linux/irq.h
init/Kconfig
kernel/irq/chip.c
kernel/irq/proc.c
kernel/profile.c
kernel/sched.c
kernel/sched_stats.h
kernel/trace/trace.c

diff --combined arch/m32r/Kconfig
@@@ -10,7 -10,6 +10,7 @@@ config M32
        default y
        select HAVE_IDE
        select HAVE_OPROFILE
 +      select INIT_ALL_POSSIBLE
  
  config SBUS
        bool
@@@ -274,7 -273,7 +274,7 @@@ config GENERIC_CALIBRATE_DELA
        bool
        default y
  
- config SCHED_NO_NO_OMIT_FRAME_POINTER
+ config SCHED_OMIT_FRAME_POINTER
          bool
          default y
  
@@@ -108,8 -108,33 +108,33 @@@ static int __init parse_noapic(char *st
  early_param("noapic", parse_noapic);
  
  struct irq_pin_list;
+ /*
+  * This is performance-critical, we want to do it O(1)
+  *
+  * the indexing order of this array favors 1:1 mappings
+  * between pins and IRQs.
+  */
+ struct irq_pin_list {
+       int apic, pin;
+       struct irq_pin_list *next;
+ };
+ static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
+ {
+       struct irq_pin_list *pin;
+       int node;
+       node = cpu_to_node(cpu);
+       pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
+       printk(KERN_DEBUG "  alloc irq_2_pin on cpu %d node %d\n", cpu, node);
+       return pin;
+ }
  struct irq_cfg {
-       unsigned int irq;
        struct irq_pin_list *irq_2_pin;
        cpumask_t domain;
        cpumask_t old_domain;
  };
  
  /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
+ #ifdef CONFIG_SPARSE_IRQ
+ static struct irq_cfg irq_cfgx[] = {
+ #else
  static struct irq_cfg irq_cfgx[NR_IRQS] = {
-       [0]  = { .irq =  0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
-       [1]  = { .irq =  1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
-       [2]  = { .irq =  2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
-       [3]  = { .irq =  3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
-       [4]  = { .irq =  4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
-       [5]  = { .irq =  5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
-       [6]  = { .irq =  6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
-       [7]  = { .irq =  7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
-       [8]  = { .irq =  8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
-       [9]  = { .irq =  9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
-       [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
-       [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
-       [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
-       [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
-       [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
-       [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
+ #endif
+       [0]  = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
+       [1]  = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
+       [2]  = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
+       [3]  = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
+       [4]  = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
+       [5]  = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
+       [6]  = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
+       [7]  = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
+       [8]  = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
+       [9]  = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
+       [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
+       [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
+       [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
+       [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
+       [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
+       [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
  };
  
- #define for_each_irq_cfg(irq, cfg)            \
-       for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++)
- static struct irq_cfg *irq_cfg(unsigned int irq)
+ void __init arch_early_irq_init(void)
  {
-       return irq < nr_irqs ? irq_cfgx + irq : NULL;
+       struct irq_cfg *cfg;
+       struct irq_desc *desc;
+       int count;
+       int i;
+       cfg = irq_cfgx;
+       count = ARRAY_SIZE(irq_cfgx);
+       for (i = 0; i < count; i++) {
+               desc = irq_to_desc(i);
+               desc->chip_data = &cfg[i];
+       }
  }
  
- static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
+ #ifdef CONFIG_SPARSE_IRQ
+ static struct irq_cfg *irq_cfg(unsigned int irq)
  {
-       return irq_cfg(irq);
+       struct irq_cfg *cfg = NULL;
+       struct irq_desc *desc;
+       desc = irq_to_desc(irq);
+       if (desc)
+               cfg = desc->chip_data;
+       return cfg;
  }
  
- /*
-  * Rough estimation of how many shared IRQs there are, can be changed
-  * anytime.
-  */
- #define MAX_PLUS_SHARED_IRQS NR_IRQS
- #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
+ static struct irq_cfg *get_one_free_irq_cfg(int cpu)
+ {
+       struct irq_cfg *cfg;
+       int node;
  
- /*
-  * This is performance-critical, we want to do it O(1)
-  *
-  * the indexing order of this array favors 1:1 mappings
-  * between pins and IRQs.
-  */
+       node = cpu_to_node(cpu);
  
- struct irq_pin_list {
-       int apic, pin;
-       struct irq_pin_list *next;
- };
+       cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
+       printk(KERN_DEBUG "  alloc irq_cfg on cpu %d node %d\n", cpu, node);
  
- static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE];
- static struct irq_pin_list *irq_2_pin_ptr;
      return cfg;
+ }
  
- static void __init irq_2_pin_init(void)
+ void arch_init_chip_data(struct irq_desc *desc, int cpu)
  {
-       struct irq_pin_list *pin = irq_2_pin_head;
-       int i;
-       for (i = 1; i < PIN_MAP_SIZE; i++)
-               pin[i-1].next = &pin[i];
+       struct irq_cfg *cfg;
  
-       irq_2_pin_ptr = &pin[0];
+       cfg = desc->chip_data;
+       if (!cfg) {
+               desc->chip_data = get_one_free_irq_cfg(cpu);
+               if (!desc->chip_data) {
+                       printk(KERN_ERR "can not alloc irq_cfg\n");
+                       BUG_ON(1);
+               }
+       }
  }
  
- static struct irq_pin_list *get_one_free_irq_2_pin(void)
+ #else
+ static struct irq_cfg *irq_cfg(unsigned int irq)
  {
-       struct irq_pin_list *pin = irq_2_pin_ptr;
+       return irq < nr_irqs ? irq_cfgx + irq : NULL;
+ }
  
-       if (!pin)
-               panic("can not get more irq_2_pin\n");
+ #endif
  
-       irq_2_pin_ptr = pin->next;
-       pin->next = NULL;
-       return pin;
+ static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
+ {
  }
  
  struct io_apic {
@@@ -237,11 -276,10 +276,10 @@@ static inline void io_apic_modify(unsig
        writel(value, &io_apic->data);
  }
  
- static bool io_apic_level_ack_pending(unsigned int irq)
+ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
  {
        struct irq_pin_list *entry;
        unsigned long flags;
-       struct irq_cfg *cfg = irq_cfg(irq);
  
        spin_lock_irqsave(&ioapic_lock, flags);
        entry = cfg->irq_2_pin;
@@@ -323,13 -361,12 +361,12 @@@ static void ioapic_mask_entry(int apic
  }
  
  #ifdef CONFIG_SMP
- static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
+ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
  {
        int apic, pin;
-       struct irq_cfg *cfg;
        struct irq_pin_list *entry;
+       u8 vector = cfg->vector;
  
-       cfg = irq_cfg(irq);
        entry = cfg->irq_2_pin;
        for (;;) {
                unsigned int reg;
        }
  }
  
- static int assign_irq_vector(int irq, cpumask_t mask);
+ static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask);
  
- static void set_ioapic_affinity_irq(unsigned int irq,
-                                   const struct cpumask *mask)
 -static void set_ioapic_affinity_irq_desc(struct irq_desc *desc, cpumask_t mask)
++static void set_ioapic_affinity_irq_desc(struct irq_desc *desc,
++                                       const struct cpumask *mask)
  {
        struct irq_cfg *cfg;
        unsigned long flags;
        unsigned int dest;
        cpumask_t tmp;
-       struct irq_desc *desc;
+       unsigned int irq;
  
 -      cpus_and(tmp, mask, cpu_online_map);
 -      if (cpus_empty(tmp))
 +      if (!cpumask_intersects(mask, cpu_online_mask))
                return;
  
-       cfg = irq_cfg(irq);
-       if (assign_irq_vector(irq, *mask))
+       irq = desc->irq;
+       cfg = desc->chip_data;
 -      if (assign_irq_vector(irq, cfg, mask))
++      if (assign_irq_vector(irq, cfg, *mask))
                return;
  
 -      set_extra_move_desc(desc, mask);
++      set_extra_move_desc(desc, *mask);
 -      cpus_and(tmp, cfg->domain, mask);
 +      cpumask_and(&tmp, &cfg->domain, mask);
        dest = cpu_mask_to_apicid(tmp);
        /*
         * Only the high 8 bits are valid.
         */
        dest = SET_APIC_LOGICAL_ID(dest);
  
-       desc = irq_to_desc(irq);
        spin_lock_irqsave(&ioapic_lock, flags);
-       __target_IO_APIC_irq(irq, dest, cfg->vector);
+       __target_IO_APIC_irq(irq, dest, cfg);
 -      desc->affinity = mask;
 +      cpumask_copy(&desc->affinity, mask);
        spin_unlock_irqrestore(&ioapic_lock, flags);
  }
 -static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
++static void set_ioapic_affinity_irq(unsigned int irq,
++                                  const struct cpumask *mask)
+ {
+       struct irq_desc *desc;
+       desc = irq_to_desc(irq);
+       set_ioapic_affinity_irq_desc(desc, mask);
+ }
  #endif /* CONFIG_SMP */
  
  /*
   * shared ISA-space IRQs, so we have to support them. We are super
   * fast in the common case, and fast for shared ISA-space IRQs.
   */
- static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
  {
-       struct irq_cfg *cfg;
        struct irq_pin_list *entry;
  
-       /* first time to refer irq_cfg, so with new */
-       cfg = irq_cfg_alloc(irq);
        entry = cfg->irq_2_pin;
        if (!entry) {
-               entry = get_one_free_irq_2_pin();
+               entry = get_one_free_irq_2_pin(cpu);
+               if (!entry) {
+                       printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
+                                       apic, pin);
+                       return;
+               }
                cfg->irq_2_pin = entry;
                entry->apic = apic;
                entry->pin = pin;
                entry = entry->next;
        }
  
-       entry->next = get_one_free_irq_2_pin();
+       entry->next = get_one_free_irq_2_pin(cpu);
        entry = entry->next;
        entry->apic = apic;
        entry->pin = pin;
  /*
   * Reroute an IRQ to a different pin.
   */
- static void __init replace_pin_at_irq(unsigned int irq,
+ static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
                                      int oldapic, int oldpin,
                                      int newapic, int newpin)
  {
-       struct irq_cfg *cfg = irq_cfg(irq);
        struct irq_pin_list *entry = cfg->irq_2_pin;
        int replaced = 0;
  
  
        /* why? call replace before add? */
        if (!replaced)
-               add_pin_to_irq(irq, newapic, newpin);
+               add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
  }
  
- static inline void io_apic_modify_irq(unsigned int irq,
+ static inline void io_apic_modify_irq(struct irq_cfg *cfg,
                                int mask_and, int mask_or,
                                void (*final)(struct irq_pin_list *entry))
  {
        int pin;
-       struct irq_cfg *cfg;
        struct irq_pin_list *entry;
  
-       cfg = irq_cfg(irq);
        for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
                unsigned int reg;
                pin = entry->pin;
        }
  }
  
- static void __unmask_IO_APIC_irq(unsigned int irq)
+ static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
  {
-       io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL);
+       io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
  }
  
  #ifdef CONFIG_X86_64
@@@ -492,47 -539,64 +540,64 @@@ void io_apic_sync(struct irq_pin_list *
        readl(&io_apic->data);
  }
  
- static void __mask_IO_APIC_irq(unsigned int irq)
+ static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
  {
-       io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+       io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
  }
  #else /* CONFIG_X86_32 */
- static void __mask_IO_APIC_irq(unsigned int irq)
+ static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
  {
-       io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL);
+       io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL);
  }
  
- static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
+ static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
  {
-       io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER,
+       io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
                        IO_APIC_REDIR_MASKED, NULL);
  }
  
- static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
+ static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
  {
-       io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED,
+       io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
                        IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
  }
  #endif /* CONFIG_X86_32 */
  
- static void mask_IO_APIC_irq (unsigned int irq)
+ static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
  {
+       struct irq_cfg *cfg = desc->chip_data;
        unsigned long flags;
  
+       BUG_ON(!cfg);
        spin_lock_irqsave(&ioapic_lock, flags);
-       __mask_IO_APIC_irq(irq);
+       __mask_IO_APIC_irq(cfg);
        spin_unlock_irqrestore(&ioapic_lock, flags);
  }
  
- static void unmask_IO_APIC_irq (unsigned int irq)
+ static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
  {
+       struct irq_cfg *cfg = desc->chip_data;
        unsigned long flags;
  
        spin_lock_irqsave(&ioapic_lock, flags);
-       __unmask_IO_APIC_irq(irq);
+       __unmask_IO_APIC_irq(cfg);
        spin_unlock_irqrestore(&ioapic_lock, flags);
  }
  
+ static void mask_IO_APIC_irq(unsigned int irq)
+ {
+       struct irq_desc *desc = irq_to_desc(irq);
+       mask_IO_APIC_irq_desc(desc);
+ }
+ static void unmask_IO_APIC_irq(unsigned int irq)
+ {
+       struct irq_desc *desc = irq_to_desc(irq);
+       unmask_IO_APIC_irq_desc(desc);
+ }
  static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
  {
        struct IO_APIC_route_entry entry;
@@@ -809,7 -873,7 +874,7 @@@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vecto
   */
  static int EISA_ELCR(unsigned int irq)
  {
-       if (irq < 16) {
+       if (irq < NR_IRQS_LEGACY) {
                unsigned int port = 0x4d0 + (irq >> 3);
                return (inb(port) >> (irq & 7)) & 1;
        }
@@@ -1034,7 -1098,7 +1099,7 @@@ void unlock_vector_lock(void
        spin_unlock(&vector_lock);
  }
  
- static int __assign_irq_vector(int irq, cpumask_t mask)
+ static int __assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
  {
        /*
         * NOTE! The local APIC isn't very good at handling
        static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
        unsigned int old_vector;
        int cpu;
-       struct irq_cfg *cfg;
  
-       cfg = irq_cfg(irq);
+       if ((cfg->move_in_progress) || cfg->move_cleanup_count)
+               return -EBUSY;
  
        /* Only try and allocate irqs on cpus that are present */
        cpus_and(mask, mask, cpu_online_map);
  
-       if ((cfg->move_in_progress) || cfg->move_cleanup_count)
-               return -EBUSY;
        old_vector = cfg->vector;
        if (old_vector) {
                cpumask_t tmp;
@@@ -1113,24 -1174,22 +1175,22 @@@ next
        return -ENOSPC;
  }
  
- static int assign_irq_vector(int irq, cpumask_t mask)
+ static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
  {
        int err;
        unsigned long flags;
  
        spin_lock_irqsave(&vector_lock, flags);
-       err = __assign_irq_vector(irq, mask);
+       err = __assign_irq_vector(irq, cfg, mask);
        spin_unlock_irqrestore(&vector_lock, flags);
        return err;
  }
  
- static void __clear_irq_vector(int irq)
+ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
  {
        cpumask_t mask;
        int cpu, vector;
  
-       cfg = irq_cfg(irq);
        BUG_ON(!cfg->vector);
  
        vector = cfg->vector;
@@@ -1162,9 -1221,13 +1222,13 @@@ void __setup_vector_irq(int cpu
        /* This function must be called with vector_lock held */
        int irq, vector;
        struct irq_cfg *cfg;
+       struct irq_desc *desc;
  
        /* Mark the inuse vectors */
-       for_each_irq_cfg(irq, cfg) {
+       for_each_irq_desc(irq, desc) {
+               if (!desc)
+                       continue;
+               cfg = desc->chip_data;
                if (!cpu_isset(cpu, cfg->domain))
                        continue;
                vector = cfg->vector;
@@@ -1215,11 -1278,8 +1279,8 @@@ static inline int IO_APIC_irq_trigger(i
  }
  #endif
  
- static void ioapic_register_intr(int irq, unsigned long trigger)
+ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger)
  {
-       struct irq_desc *desc;
-       desc = irq_to_desc(irq);
  
        if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
            trigger == IOAPIC_LEVEL)
@@@ -1311,7 -1371,7 +1372,7 @@@ static int setup_ioapic_entry(int apic
        return 0;
  }
  
- static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
+ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc,
                              int trigger, int polarity)
  {
        struct irq_cfg *cfg;
        if (!IO_APIC_IRQ(irq))
                return;
  
-       cfg = irq_cfg(irq);
+       cfg = desc->chip_data;
  
        mask = TARGET_CPUS;
-       if (assign_irq_vector(irq, mask))
+       if (assign_irq_vector(irq, cfg, mask))
                return;
  
        cpus_and(mask, cfg->domain, mask);
                               cfg->vector)) {
                printk("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
                       mp_ioapics[apic].mp_apicid, pin);
-               __clear_irq_vector(irq);
+               __clear_irq_vector(irq, cfg);
                return;
        }
  
-       ioapic_register_intr(irq, trigger);
-       if (irq < 16)
+       ioapic_register_intr(irq, desc, trigger);
+       if (irq < NR_IRQS_LEGACY)
                disable_8259A_irq(irq);
  
        ioapic_write_entry(apic, pin, entry);
@@@ -1356,6 -1416,9 +1417,9 @@@ static void __init setup_IO_APIC_irqs(v
  {
        int apic, pin, idx, irq;
        int notcon = 0;
+       struct irq_desc *desc;
+       struct irq_cfg *cfg;
+       int cpu = boot_cpu_id;
  
        apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
  
                        if (multi_timer_check(apic, irq))
                                continue;
  #endif
-                       add_pin_to_irq(irq, apic, pin);
+                       desc = irq_to_desc_alloc_cpu(irq, cpu);
+                       if (!desc) {
+                               printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+                               continue;
+                       }
+                       cfg = desc->chip_data;
+                       add_pin_to_irq_cpu(cfg, cpu, apic, pin);
  
-                       setup_IO_APIC_irq(apic, pin, irq,
+                       setup_IO_APIC_irq(apic, pin, irq, desc,
                                        irq_trigger(idx), irq_polarity(idx));
                }
        }
@@@ -1448,6 -1517,7 +1518,7 @@@ __apicdebuginit(void) print_IO_APIC(voi
        union IO_APIC_reg_03 reg_03;
        unsigned long flags;
        struct irq_cfg *cfg;
+       struct irq_desc *desc;
        unsigned int irq;
  
        if (apic_verbosity == APIC_QUIET)
        }
        }
        printk(KERN_DEBUG "IRQ to pin mappings:\n");
-       for_each_irq_cfg(irq, cfg) {
-               struct irq_pin_list *entry = cfg->irq_2_pin;
+       for_each_irq_desc(irq, desc) {
+               struct irq_pin_list *entry;
+               if (!desc)
+                       continue;
+               cfg = desc->chip_data;
+               entry = cfg->irq_2_pin;
                if (!entry)
                        continue;
                printk(KERN_DEBUG "IRQ%d ", irq);
@@@ -2022,14 -2097,16 +2098,16 @@@ static unsigned int startup_ioapic_irq(
  {
        int was_pending = 0;
        unsigned long flags;
+       struct irq_cfg *cfg;
  
        spin_lock_irqsave(&ioapic_lock, flags);
-       if (irq < 16) {
+       if (irq < NR_IRQS_LEGACY) {
                disable_8259A_irq(irq);
                if (i8259A_irq_pending(irq))
                        was_pending = 1;
        }
-       __unmask_IO_APIC_irq(irq);
+       cfg = irq_cfg(irq);
+       __unmask_IO_APIC_irq(cfg);
        spin_unlock_irqrestore(&ioapic_lock, flags);
  
        return was_pending;
@@@ -2092,35 -2169,37 +2170,37 @@@ static DECLARE_DELAYED_WORK(ir_migratio
   * as simple as edge triggered migration and we can do the irq migration
   * with a simple atomic update to IO-APIC RTE.
   */
- static void migrate_ioapic_irq(int irq, cpumask_t mask)
+ static void migrate_ioapic_irq_desc(struct irq_desc *desc, cpumask_t mask)
  {
        struct irq_cfg *cfg;
-       struct irq_desc *desc;
        cpumask_t tmp, cleanup_mask;
        struct irte irte;
        int modify_ioapic_rte;
        unsigned int dest;
        unsigned long flags;
+       unsigned int irq;
  
        cpus_and(tmp, mask, cpu_online_map);
        if (cpus_empty(tmp))
                return;
  
+       irq = desc->irq;
        if (get_irte(irq, &irte))
                return;
  
-       if (assign_irq_vector(irq, mask))
+       cfg = desc->chip_data;
+       if (assign_irq_vector(irq, cfg, mask))
                return;
  
-       cfg = irq_cfg(irq);
+       set_extra_move_desc(desc, mask);
        cpus_and(tmp, cfg->domain, mask);
        dest = cpu_mask_to_apicid(tmp);
  
        modify_ioapic_rte = desc->status & IRQ_LEVEL;
        if (modify_ioapic_rte) {
                spin_lock_irqsave(&ioapic_lock, flags);
-               __target_IO_APIC_irq(irq, dest, cfg->vector);
+               __target_IO_APIC_irq(irq, dest, cfg);
                spin_unlock_irqrestore(&ioapic_lock, flags);
        }
  
        desc->affinity = mask;
  }
  
- static int migrate_irq_remapped_level(int irq)
+ static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
  {
        int ret = -1;
-       struct irq_desc *desc = irq_to_desc(irq);
+       struct irq_cfg *cfg = desc->chip_data;
  
-       mask_IO_APIC_irq(irq);
+       mask_IO_APIC_irq_desc(desc);
  
-       if (io_apic_level_ack_pending(irq)) {
+       if (io_apic_level_ack_pending(cfg)) {
                /*
                 * Interrupt in progress. Migrating irq now will change the
                 * vector information in the IO-APIC RTE and that will confuse
        }
  
        /* everthing is clear. we have right of way */
-       migrate_ioapic_irq(irq, desc->pending_mask);
+       migrate_ioapic_irq_desc(desc, desc->pending_mask);
  
        ret = 0;
        desc->status &= ~IRQ_MOVE_PENDING;
        cpus_clear(desc->pending_mask);
  
  unmask:
-       unmask_IO_APIC_irq(irq);
+       unmask_IO_APIC_irq_desc(desc);
        return ret;
  }
  
@@@ -2178,6 -2258,9 +2259,9 @@@ static void ir_irq_migration(struct wor
        struct irq_desc *desc;
  
        for_each_irq_desc(irq, desc) {
+               if (!desc)
+                       continue;
                if (desc->status & IRQ_MOVE_PENDING) {
                        unsigned long flags;
  
                                continue;
                        }
  
 -                      desc->chip->set_affinity(irq, desc->pending_mask);
 +                      desc->chip->set_affinity(irq, &desc->pending_mask);
                        spin_unlock_irqrestore(&desc->lock, flags);
                }
        }
  /*
   * Migrates the IRQ destination in the process context.
   */
- static void set_ir_ioapic_affinity_irq(unsigned int irq,
-                                      const struct cpumask *mask)
 -static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, cpumask_t mask)
++static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
++                                          const struct cpumask *mask)
  {
-       struct irq_desc *desc = irq_to_desc(irq);
        if (desc->status & IRQ_LEVEL) {
                desc->status |= IRQ_MOVE_PENDING;
 -              desc->pending_mask = mask;
 +              cpumask_copy(&desc->pending_mask, mask);
-               migrate_irq_remapped_level(irq);
+               migrate_irq_remapped_level_desc(desc);
                return;
        }
  
-       migrate_ioapic_irq(irq, *mask);
+       migrate_ioapic_irq_desc(desc, mask);
+ }
 -static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
++static void set_ir_ioapic_affinity_irq(unsigned int irq,
++                                     const struct cpumask *mask)
+ {
+       struct irq_desc *desc = irq_to_desc(irq);
+       set_ir_ioapic_affinity_irq_desc(desc, mask);
  }
  #endif
  
@@@ -2230,6 -2316,9 +2319,9 @@@ asmlinkage void smp_irq_move_cleanup_in
                struct irq_cfg *cfg;
                irq = __get_cpu_var(vector_irq)[vector];
  
+               if (irq == -1)
+                       continue;
                desc = irq_to_desc(irq);
                if (!desc)
                        continue;
@@@ -2251,9 -2340,10 +2343,10 @@@ unlock
        irq_exit();
  }
  
- static void irq_complete_move(unsigned int irq)
+ static void irq_complete_move(struct irq_desc **descp)
  {
-       struct irq_cfg *cfg = irq_cfg(irq);
+       struct irq_desc *desc = *descp;
+       struct irq_cfg *cfg = desc->chip_data;
        unsigned vector, me;
  
        if (likely(!cfg->move_in_progress))
        }
  }
  #else
- static inline void irq_complete_move(unsigned int irq) {}
+ static inline void irq_complete_move(struct irq_desc **descp) {}
  #endif
  #ifdef CONFIG_INTR_REMAP
  static void ack_x2apic_level(unsigned int irq)
  {
@@@ -2283,11 -2374,14 +2377,14 @@@ static void ack_x2apic_edge(unsigned in
  {
        ack_x2APIC_irq();
  }
  #endif
  
  static void ack_apic_edge(unsigned int irq)
  {
-       irq_complete_move(irq);
+       struct irq_desc *desc = irq_to_desc(irq);
+       irq_complete_move(&desc);
        move_native_irq(irq);
        ack_APIC_irq();
  }
@@@ -2296,18 -2390,21 +2393,21 @@@ atomic_t irq_mis_count
  
  static void ack_apic_level(unsigned int irq)
  {
+       struct irq_desc *desc = irq_to_desc(irq);
  #ifdef CONFIG_X86_32
        unsigned long v;
        int i;
  #endif
+       struct irq_cfg *cfg;
        int do_unmask_irq = 0;
  
-       irq_complete_move(irq);
+       irq_complete_move(&desc);
  #ifdef CONFIG_GENERIC_PENDING_IRQ
        /* If we are moving the irq we need to mask it */
-       if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
+       if (unlikely(desc->status & IRQ_MOVE_PENDING)) {
                do_unmask_irq = 1;
-               mask_IO_APIC_irq(irq);
+               mask_IO_APIC_irq_desc(desc);
        }
  #endif
  
        * operation to prevent an edge-triggered interrupt escaping meanwhile.
        * The idea is from Manfred Spraul.  --macro
        */
-       i = irq_cfg(irq)->vector;
+       cfg = desc->chip_data;
+       i = cfg->vector;
  
        v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
  #endif
                 * accurate and is causing problems then it is a hardware bug
                 * and you can go talk to the chipset vendor about it.
                 */
-               if (!io_apic_level_ack_pending(irq))
+               cfg = desc->chip_data;
+               if (!io_apic_level_ack_pending(cfg))
                        move_masked_irq(irq);
-               unmask_IO_APIC_irq(irq);
+               unmask_IO_APIC_irq_desc(desc);
        }
  
  #ifdef CONFIG_X86_32
        if (!(v & (1 << (i & 0x1f)))) {
                atomic_inc(&irq_mis_count);
                spin_lock(&ioapic_lock);
-               __mask_and_edge_IO_APIC_irq(irq);
-               __unmask_and_level_IO_APIC_irq(irq);
+               __mask_and_edge_IO_APIC_irq(cfg);
+               __unmask_and_level_IO_APIC_irq(cfg);
                spin_unlock(&ioapic_lock);
        }
  #endif
@@@ -2431,20 -2530,22 +2533,22 @@@ static inline void init_IO_APIC_traps(v
         * Also, we've got to be careful not to trash gate
         * 0x80, because int 0x80 is hm, kind of importantish. ;)
         */
-       for_each_irq_cfg(irq, cfg) {
-               if (IO_APIC_IRQ(irq) && !cfg->vector) {
+       for_each_irq_desc(irq, desc) {
+               if (!desc)
+                       continue;
+               cfg = desc->chip_data;
+               if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
                        /*
                         * Hmm.. We don't have an entry for this,
                         * so default to an old-fashioned 8259
                         * interrupt if we can..
                         */
-                       if (irq < 16)
+                       if (irq < NR_IRQS_LEGACY)
                                make_8259A_irq(irq);
-                       else {
-                               desc = irq_to_desc(irq);
+                       else
                                /* Strange. Oh, well.. */
                                desc->chip = &no_irq_chip;
-                       }
                }
        }
  }
@@@ -2469,7 -2570,7 +2573,7 @@@ static void unmask_lapic_irq(unsigned i
        apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
  }
  
- static void ack_lapic_irq (unsigned int irq)
+ static void ack_lapic_irq(unsigned int irq)
  {
        ack_APIC_irq();
  }
@@@ -2481,11 -2582,8 +2585,8 @@@ static struct irq_chip lapic_chip __rea
        .ack            = ack_lapic_irq,
  };
  
- static void lapic_register_intr(int irq)
+ static void lapic_register_intr(int irq, struct irq_desc *desc)
  {
-       struct irq_desc *desc;
-       desc = irq_to_desc(irq);
        desc->status &= ~IRQ_LEVEL;
        set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
                                      "edge");
@@@ -2589,7 -2687,9 +2690,9 @@@ int timer_through_8259 __initdata
   */
  static inline void __init check_timer(void)
  {
-       struct irq_cfg *cfg = irq_cfg(0);
+       struct irq_desc *desc = irq_to_desc(0);
+       struct irq_cfg *cfg = desc->chip_data;
+       int cpu = boot_cpu_id;
        int apic1, pin1, apic2, pin2;
        unsigned long flags;
        unsigned int ver;
         * get/set the timer IRQ vector:
         */
        disable_8259A_irq(0);
-       assign_irq_vector(0, TARGET_CPUS);
+       assign_irq_vector(0, cfg, TARGET_CPUS);
  
        /*
         * As IRQ0 is to be enabled in the 8259A, the virtual
                 * Ok, does IRQ0 through the IOAPIC work?
                 */
                if (no_pin1) {
-                       add_pin_to_irq(0, apic1, pin1);
+                       add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
                        setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
                }
-               unmask_IO_APIC_irq(0);
+               unmask_IO_APIC_irq_desc(desc);
                if (timer_irq_works()) {
                        if (nmi_watchdog == NMI_IO_APIC) {
                                setup_nmi();
                /*
                 * legacy devices should be connected to IO APIC #0
                 */
-               replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+               replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
                setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
-               unmask_IO_APIC_irq(0);
+               unmask_IO_APIC_irq_desc(desc);
                enable_8259A_irq(0);
                if (timer_irq_works()) {
                        apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
        apic_printk(APIC_QUIET, KERN_INFO
                    "...trying to set up timer as Virtual Wire IRQ...\n");
  
-       lapic_register_intr(0);
+       lapic_register_intr(0, desc);
        apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector);     /* Fixed mode */
        enable_8259A_irq(0);
  
@@@ -2903,22 -3003,26 +3006,26 @@@ unsigned int create_irq_nr(unsigned in
        unsigned int irq;
        unsigned int new;
        unsigned long flags;
-       struct irq_cfg *cfg_new;
-       irq_want = nr_irqs - 1;
+       struct irq_cfg *cfg_new = NULL;
+       int cpu = boot_cpu_id;
+       struct irq_desc *desc_new = NULL;
  
        irq = 0;
        spin_lock_irqsave(&vector_lock, flags);
-       for (new = irq_want; new > 0; new--) {
+       for (new = irq_want; new < NR_IRQS; new++) {
                if (platform_legacy_irq(new))
                        continue;
-               cfg_new = irq_cfg(new);
-               if (cfg_new && cfg_new->vector != 0)
+               desc_new = irq_to_desc_alloc_cpu(new, cpu);
+               if (!desc_new) {
+                       printk(KERN_INFO "can not get irq_desc for %d\n", new);
+                       continue;
+               }
+               cfg_new = desc_new->chip_data;
+               if (cfg_new->vector != 0)
                        continue;
-               /* check if need to create one */
-               if (!cfg_new)
-                       cfg_new = irq_cfg_alloc(new);
-               if (__assign_irq_vector(new, TARGET_CPUS) == 0)
+               if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0)
                        irq = new;
                break;
        }
  
        if (irq > 0) {
                dynamic_irq_init(irq);
+               /* restore it, in case dynamic_irq_init clear it */
+               if (desc_new)
+                       desc_new->chip_data = cfg_new;
        }
        return irq;
  }
  
+ static int nr_irqs_gsi = NR_IRQS_LEGACY;
  int create_irq(void)
  {
+       unsigned int irq_want;
        int irq;
  
-       irq = create_irq_nr(nr_irqs - 1);
+       irq_want = nr_irqs_gsi;
+       irq = create_irq_nr(irq_want);
  
        if (irq == 0)
                irq = -1;
  void destroy_irq(unsigned int irq)
  {
        unsigned long flags;
+       struct irq_cfg *cfg;
+       struct irq_desc *desc;
  
+       /* store it, in case dynamic_irq_cleanup clear it */
+       desc = irq_to_desc(irq);
+       cfg = desc->chip_data;
        dynamic_irq_cleanup(irq);
+       /* connect back irq_cfg */
+       if (desc)
+               desc->chip_data = cfg;
  
  #ifdef CONFIG_INTR_REMAP
        free_irte(irq);
  #endif
        spin_lock_irqsave(&vector_lock, flags);
-       __clear_irq_vector(irq);
+       __clear_irq_vector(irq, cfg);
        spin_unlock_irqrestore(&vector_lock, flags);
  }
  
@@@ -2967,12 -3085,12 +3088,12 @@@ static int msi_compose_msg(struct pci_d
        unsigned dest;
        cpumask_t tmp;
  
+       cfg = irq_cfg(irq);
        tmp = TARGET_CPUS;
-       err = assign_irq_vector(irq, tmp);
+       err = assign_irq_vector(irq, cfg, tmp);
        if (err)
                return err;
  
-       cfg = irq_cfg(irq);
        cpus_and(tmp, cfg->domain, tmp);
        dest = cpu_mask_to_apicid(tmp);
  
  }
  
  #ifdef CONFIG_SMP
 -static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 +static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
  {
+       struct irq_desc *desc = irq_to_desc(irq);
        struct irq_cfg *cfg;
        struct msi_msg msg;
        unsigned int dest;
        cpumask_t tmp;
-       struct irq_desc *desc;
  
 -      cpus_and(tmp, mask, cpu_online_map);
 -      if (cpus_empty(tmp))
 +      if (!cpumask_intersects(mask, cpu_online_mask))
                return;
  
-       if (assign_irq_vector(irq, *mask))
+       cfg = desc->chip_data;
 -      if (assign_irq_vector(irq, cfg, mask))
++      if (assign_irq_vector(irq, cfg, *mask))
                return;
  
-       cfg = irq_cfg(irq);
 -      set_extra_move_desc(desc, mask);
++      set_extra_move_desc(desc, *mask);
 -      cpus_and(tmp, cfg->domain, mask);
 +      cpumask_and(&tmp, &cfg->domain, mask);
        dest = cpu_mask_to_apicid(tmp);
  
-       read_msi_msg(irq, &msg);
+       read_msi_msg_desc(desc, &msg);
  
        msg.data &= ~MSI_DATA_VECTOR_MASK;
        msg.data |= MSI_DATA_VECTOR(cfg->vector);
        msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
        msg.address_lo |= MSI_ADDR_DEST_ID(dest);
  
-       write_msi_msg(irq, &msg);
-       desc = irq_to_desc(irq);
+       write_msi_msg_desc(desc, &msg);
 -      desc->affinity = mask;
 +      cpumask_copy(&desc->affinity, mask);
  }
  #ifdef CONFIG_INTR_REMAP
  /*
   * Migrate the MSI irq to another cpumask. This migration is
   * done in the process context using interrupt-remapping hardware.
   */
 -static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 +static void ir_set_msi_irq_affinity(unsigned int irq,
 +                                  const struct cpumask *mask)
  {
+       struct irq_desc *desc = irq_to_desc(irq);
        struct irq_cfg *cfg;
        unsigned int dest;
        cpumask_t tmp, cleanup_mask;
        struct irte irte;
-       struct irq_desc *desc;
  
 -      cpus_and(tmp, mask, cpu_online_map);
 -      if (cpus_empty(tmp))
 +      if (!cpumask_intersects(mask, cpu_online_mask))
                return;
  
        if (get_irte(irq, &irte))
                return;
  
-       if (assign_irq_vector(irq, *mask))
+       cfg = desc->chip_data;
 -      if (assign_irq_vector(irq, cfg, mask))
++      if (assign_irq_vector(irq, cfg, *mask))
                return;
  
-       cfg = irq_cfg(irq);
+       set_extra_move_desc(desc, mask);
 -      cpus_and(tmp, cfg->domain, mask);
 +      cpumask_and(&tmp, &cfg->domain, mask);
        dest = cpu_mask_to_apicid(tmp);
  
        irte.vector = cfg->vector;
                cfg->move_in_progress = 0;
        }
  
-       desc = irq_to_desc(irq);
 -      desc->affinity = mask;
 +      cpumask_copy(&desc->affinity, mask);
  }
  #endif
  #endif /* CONFIG_SMP */
  
@@@ -3166,7 -3287,7 +3289,7 @@@ static int msi_alloc_irte(struct pci_de
  }
  #endif
  
- static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
+ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
  {
        int ret;
        struct msi_msg msg;
        if (ret < 0)
                return ret;
  
-       set_irq_msi(irq, desc);
+       set_irq_msi(irq, msidesc);
        write_msi_msg(irq, &msg);
  
  #ifdef CONFIG_INTR_REMAP
        return 0;
  }
  
- static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
- {
-       unsigned int irq;
-       irq = dev->bus->number;
-       irq <<= 8;
-       irq |= dev->devfn;
-       irq <<= 12;
-       return irq;
- }
- int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc)
  {
        unsigned int irq;
        int ret;
        unsigned int irq_want;
  
-       irq_want = build_irq_for_pci_dev(dev) + 0x100;
+       irq_want = nr_irqs_gsi;
        irq = create_irq_nr(irq_want);
        if (irq == 0)
                return -1;
                goto error;
  no_ir:
  #endif
-       ret = setup_msi_irq(dev, desc, irq);
+       ret = setup_msi_irq(dev, msidesc, irq);
        if (ret < 0) {
                destroy_irq(irq);
                return ret;
@@@ -3246,7 -3354,7 +3356,7 @@@ int arch_setup_msi_irqs(struct pci_dev 
  {
        unsigned int irq;
        int ret, sub_handle;
-       struct msi_desc *desc;
+       struct msi_desc *msidesc;
        unsigned int irq_want;
  
  #ifdef CONFIG_INTR_REMAP
        int index = 0;
  #endif
  
-       irq_want = build_irq_for_pci_dev(dev) + 0x100;
+       irq_want = nr_irqs_gsi;
        sub_handle = 0;
-       list_for_each_entry(desc, &dev->msi_list, list) {
-               irq = create_irq_nr(irq_want--);
+       list_for_each_entry(msidesc, &dev->msi_list, list) {
+               irq = create_irq_nr(irq_want);
+               irq_want++;
                if (irq == 0)
                        return -1;
  #ifdef CONFIG_INTR_REMAP
                }
  no_ir:
  #endif
-               ret = setup_msi_irq(dev, desc, irq);
+               ret = setup_msi_irq(dev, msidesc, irq);
                if (ret < 0)
                        goto error;
                sub_handle++;
@@@ -3308,22 -3417,25 +3419,24 @@@ void arch_teardown_msi_irq(unsigned in
  
  #ifdef CONFIG_DMAR
  #ifdef CONFIG_SMP
 -static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
 +static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
  {
+       struct irq_desc *desc = irq_to_desc(irq);
        struct irq_cfg *cfg;
        struct msi_msg msg;
        unsigned int dest;
        cpumask_t tmp;
-       struct irq_desc *desc;
  
 -      cpus_and(tmp, mask, cpu_online_map);
 -      if (cpus_empty(tmp))
 +      if (!cpumask_intersects(mask, cpu_online_mask))
                return;
  
-       if (assign_irq_vector(irq, *mask))
+       cfg = desc->chip_data;
 -      if (assign_irq_vector(irq, cfg, mask))
++      if (assign_irq_vector(irq, cfg, *mask))
                return;
  
-       cfg = irq_cfg(irq);
 -      set_extra_move_desc(desc, mask);
++      set_extra_move_desc(desc, *mask);
 -      cpus_and(tmp, cfg->domain, mask);
 +      cpumask_and(&tmp, &cfg->domain, mask);
        dest = cpu_mask_to_apicid(tmp);
  
        dmar_msi_read(irq, &msg);
        msg.address_lo |= MSI_ADDR_DEST_ID(dest);
  
        dmar_msi_write(irq, &msg);
-       desc = irq_to_desc(irq);
 -      desc->affinity = mask;
 +      cpumask_copy(&desc->affinity, mask);
  }
  #endif /* CONFIG_SMP */
  
  struct irq_chip dmar_msi_type = {
@@@ -3368,22 -3480,25 +3481,24 @@@ int arch_setup_dmar_msi(unsigned int ir
  #ifdef CONFIG_HPET_TIMER
  
  #ifdef CONFIG_SMP
 -static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
 +static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
  {
+       struct irq_desc *desc = irq_to_desc(irq);
        struct irq_cfg *cfg;
-       struct irq_desc *desc;
        struct msi_msg msg;
        unsigned int dest;
        cpumask_t tmp;
  
 -      cpus_and(tmp, mask, cpu_online_map);
 -      if (cpus_empty(tmp))
 +      if (!cpumask_intersects(mask, cpu_online_mask))
                return;
  
-       if (assign_irq_vector(irq, *mask))
+       cfg = desc->chip_data;
 -      if (assign_irq_vector(irq, cfg, mask))
++      if (assign_irq_vector(irq, cfg, *mask))
                return;
  
-       cfg = irq_cfg(irq);
 -      set_extra_move_desc(desc, mask);
++      set_extra_move_desc(desc, *mask);
 -      cpus_and(tmp, cfg->domain, mask);
 +      cpumask_and(&tmp, &cfg->domain, mask);
        dest = cpu_mask_to_apicid(tmp);
  
        hpet_msi_read(irq, &msg);
        msg.address_lo |= MSI_ADDR_DEST_ID(dest);
  
        hpet_msi_write(irq, &msg);
-       desc = irq_to_desc(irq);
 -      desc->affinity = mask;
 +      cpumask_copy(&desc->affinity, mask);
  }
  #endif /* CONFIG_SMP */
  
  struct irq_chip hpet_msi_type = {
@@@ -3449,27 -3564,30 +3564,29 @@@ static void target_ht_irq(unsigned int 
        write_ht_irq_msg(irq, &msg);
  }
  
 -static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
 +static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
  {
+       struct irq_desc *desc = irq_to_desc(irq);
        struct irq_cfg *cfg;
        unsigned int dest;
        cpumask_t tmp;
-       struct irq_desc *desc;
  
 -      cpus_and(tmp, mask, cpu_online_map);
 -      if (cpus_empty(tmp))
 +      if (!cpumask_intersects(mask, cpu_online_mask))
                return;
  
-       if (assign_irq_vector(irq, *mask))
+       cfg = desc->chip_data;
 -      if (assign_irq_vector(irq, cfg, mask))
++      if (assign_irq_vector(irq, cfg, *mask))
                return;
  
-       cfg = irq_cfg(irq);
 -      set_extra_move_desc(desc, mask);
++      set_extra_move_desc(desc, *mask);
 -      cpus_and(tmp, cfg->domain, mask);
 +      cpumask_and(&tmp, &cfg->domain, mask);
        dest = cpu_mask_to_apicid(tmp);
  
        target_ht_irq(irq, dest, cfg->vector);
-       desc = irq_to_desc(irq);
 -      desc->affinity = mask;
 +      cpumask_copy(&desc->affinity, mask);
  }
  #endif
  
  static struct irq_chip ht_irq_chip = {
@@@ -3489,13 -3607,13 +3606,13 @@@ int arch_setup_ht_irq(unsigned int irq
        int err;
        cpumask_t tmp;
  
+       cfg = irq_cfg(irq);
        tmp = TARGET_CPUS;
-       err = assign_irq_vector(irq, tmp);
+       err = assign_irq_vector(irq, cfg, tmp);
        if (!err) {
                struct ht_irq_msg msg;
                unsigned dest;
  
-               cfg = irq_cfg(irq);
                cpus_and(tmp, cfg->domain, tmp);
                dest = cpu_mask_to_apicid(tmp);
  
@@@ -3541,7 -3659,9 +3658,9 @@@ int arch_enable_uv_irq(char *irq_name, 
        unsigned long flags;
        int err;
  
-       err = assign_irq_vector(irq, *eligible_cpu);
+       cfg = irq_cfg(irq);
+       err = assign_irq_vector(irq, cfg, *eligible_cpu);
        if (err != 0)
                return err;
  
                                      irq_name);
        spin_unlock_irqrestore(&vector_lock, flags);
  
-       cfg = irq_cfg(irq);
        mmr_value = 0;
        entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
        BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
@@@ -3603,9 -3721,16 +3720,16 @@@ int __init io_apic_get_redir_entries (i
        return reg_01.bits.entries;
  }
  
- int __init probe_nr_irqs(void)
+ void __init probe_nr_irqs_gsi(void)
  {
-       return NR_IRQS;
+       int idx;
+       int nr = 0;
+       for (idx = 0; idx < nr_ioapics; idx++)
+               nr += io_apic_get_redir_entries(idx) + 1;
+       if (nr > nr_irqs_gsi)
+               nr_irqs_gsi = nr;
  }
  
  /* --------------------------------------------------------------------------
@@@ -3704,19 -3829,31 +3828,31 @@@ int __init io_apic_get_version(int ioap
  
  int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
  {
+       struct irq_desc *desc;
+       struct irq_cfg *cfg;
+       int cpu = boot_cpu_id;
        if (!IO_APIC_IRQ(irq)) {
                apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
                        ioapic);
                return -EINVAL;
        }
  
+       desc = irq_to_desc_alloc_cpu(irq, cpu);
+       if (!desc) {
+               printk(KERN_INFO "can not get irq_desc %d\n", irq);
+               return 0;
+       }
        /*
         * IRQs < 16 are already in the irq_2_pin[] map
         */
-       if (irq >= 16)
-               add_pin_to_irq(irq, ioapic, pin);
+       if (irq >= NR_IRQS_LEGACY) {
+               cfg = desc->chip_data;
+               add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
+       }
  
-       setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity);
+       setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
  
        return 0;
  }
@@@ -3770,9 -3907,10 +3906,10 @@@ void __init setup_ioapic_dest(void
                         * when you have too many devices, because at that time only boot
                         * cpu is online.
                         */
-                       cfg = irq_cfg(irq);
+                       desc = irq_to_desc(irq);
+                       cfg = desc->chip_data;
                        if (!cfg->vector) {
-                               setup_IO_APIC_irq(ioapic, pin, irq,
+                               setup_IO_APIC_irq(ioapic, pin, irq, desc,
                                                  irq_trigger(irq_entry),
                                                  irq_polarity(irq_entry));
                                continue;
                        /*
                         * Honour affinities which have been set in early boot
                         */
-                       desc = irq_to_desc(irq);
                        if (desc->status &
                            (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
                                mask = desc->affinity;
  
  #ifdef CONFIG_INTR_REMAP
                        if (intr_remapping_enabled)
-                               set_ir_ioapic_affinity_irq(irq, &mask);
 -                              set_ir_ioapic_affinity_irq_desc(desc, mask);
++                              set_ir_ioapic_affinity_irq_desc(desc, &mask);
                        else
  #endif
-                               set_ioapic_affinity_irq(irq, &mask);
 -                              set_ioapic_affinity_irq_desc(desc, mask);
++                              set_ioapic_affinity_irq_desc(desc, &mask);
                }
  
        }
@@@ -3843,7 -3980,6 +3979,6 @@@ void __init ioapic_init_mappings(void
        struct resource *ioapic_res;
        int i;
  
-       irq_2_pin_init();
        ioapic_res = ioapic_setup_resources();
        for (i = 0; i < nr_ioapics; i++) {
                if (smp_found_config) {
diff --combined arch/x86/kernel/irq_32.c
@@@ -242,6 -242,8 +242,8 @@@ void fixup_irqs(cpumask_t map
        for_each_irq_desc(irq, desc) {
                cpumask_t mask;
  
+               if (!desc)
+                       continue;
                if (irq == 2)
                        continue;
  
                        mask = map;
                }
                if (desc->chip->set_affinity)
 -                      desc->chip->set_affinity(irq, mask);
 +                      desc->chip->set_affinity(irq, &mask);
                else if (desc->action && !(warned++))
                        printk("Cannot set affinity for irq %i\n", irq);
        }
diff --combined arch/x86/kernel/irq_64.c
@@@ -94,6 -94,8 +94,8 @@@ void fixup_irqs(cpumask_t map
                int break_affinity = 0;
                int set_affinity = 1;
  
+               if (!desc)
+                       continue;
                if (irq == 2)
                        continue;
  
                        desc->chip->mask(irq);
  
                if (desc->chip->set_affinity)
 -                      desc->chip->set_affinity(irq, mask);
 +                      desc->chip->set_affinity(irq, &mask);
                else if (!(warned++))
                        set_affinity = 0;
  
@@@ -62,6 -62,7 +62,7 @@@
  #include <asm/mtrr.h>
  #include <asm/vmi.h>
  #include <asm/genapic.h>
+ #include <asm/setup.h>
  #include <linux/mc146818rtc.h>
  
  #include <mach_apic.h>
@@@ -101,8 -102,14 +102,8 @@@ EXPORT_SYMBOL(smp_num_siblings)
  /* Last level cache ID of each logical CPU */
  DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
  
 -/* bitmap of online cpus */
 -cpumask_t cpu_online_map __read_mostly;
 -EXPORT_SYMBOL(cpu_online_map);
 -
  cpumask_t cpu_callin_map;
  cpumask_t cpu_callout_map;
 -cpumask_t cpu_possible_map;
 -EXPORT_SYMBOL(cpu_possible_map);
  
  /* representing HT siblings of each logical CPU */
  DEFINE_PER_CPU(cpumask_t, cpu_sibling_map);
@@@ -530,7 -537,7 +531,7 @@@ static void impress_friends(void
        pr_debug("Before bogocount - setting activated=1.\n");
  }
  
- static inline void __inquire_remote_apic(int apicid)
+ void __inquire_remote_apic(int apicid)
  {
        unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
        char *names[] = { "ID", "VERSION", "SPIV" };
        }
  }
  
- #ifdef WAKE_SECONDARY_VIA_NMI
  /*
   * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
   * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
   * won't ... remember to clear down the APIC, etc later.
   */
- static int __devinit
- wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
+ int __devinit
+ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
  {
        unsigned long send_status, accept_status = 0;
        int maxlvt;
         * Give the other CPU some time to accept the IPI.
         */
        udelay(200);
-       if (APIC_INTEGRATED(apic_version[phys_apicid])) {
+       if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
                maxlvt = lapic_get_maxlvt();
                if (maxlvt > 3)                 /* Due to the Pentium erratum 3AP.  */
                        apic_write(APIC_ESR, 0);
  
        return (send_status | accept_status);
  }
- #endif        /* WAKE_SECONDARY_VIA_NMI */
  
- #ifdef WAKE_SECONDARY_VIA_INIT
- static int __devinit
- wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
+ int __devinit
+ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
  {
        unsigned long send_status, accept_status = 0;
        int maxlvt, num_starts, j;
  
        return (send_status | accept_status);
  }
- #endif        /* WAKE_SECONDARY_VIA_INIT */
  
  struct create_idle {
        struct work_struct work;
diff --combined drivers/xen/events.c
@@@ -141,8 -141,12 +141,12 @@@ static void init_evtchn_cpu_bindings(vo
        int i;
  
        /* By default all event channels notify CPU#0. */
-       for_each_irq_desc(i, desc)
+       for_each_irq_desc(i, desc) {
+               if (!desc)
+                       continue;
                desc->affinity = cpumask_of_cpu(0);
+       }
  #endif
  
        memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
@@@ -231,7 -235,7 +235,7 @@@ static int find_unbound_irq(void
        int irq;
  
        /* Only allocate from dynirq range */
-       for_each_irq_nr(irq)
+       for (irq = 0; irq < nr_irqs; irq++)
                if (irq_bindcount[irq] == 0)
                        break;
  
@@@ -579,7 -583,7 +583,7 @@@ void rebind_evtchn_irq(int evtchn, int 
        spin_unlock(&irq_mapping_update_lock);
  
        /* new event channels are always bound to cpu 0 */
 -      irq_set_affinity(irq, cpumask_of_cpu(0));
 +      irq_set_affinity(irq, cpumask_of(0));
  
        /* Unmask the event channel. */
        enable_irq(irq);
@@@ -608,9 -612,9 +612,9 @@@ static void rebind_irq_to_cpu(unsigned 
  }
  
  
 -static void set_affinity_irq(unsigned irq, cpumask_t dest)
 +static void set_affinity_irq(unsigned irq, const struct cpumask *dest)
  {
 -      unsigned tcpu = first_cpu(dest);
 +      unsigned tcpu = cpumask_first(dest);
        rebind_irq_to_cpu(irq, tcpu);
  }
  
@@@ -792,7 -796,7 +796,7 @@@ void xen_irq_resume(void
                mask_evtchn(evtchn);
  
        /* No IRQ <-> event-channel mappings. */
-       for_each_irq_nr(irq)
+       for (irq = 0; irq < nr_irqs; irq++)
                irq_info[irq].evtchn = 0; /* zap event-channel binding */
  
        for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
@@@ -824,7 -828,7 +828,7 @@@ void __init xen_init_IRQ(void
                mask_evtchn(i);
  
        /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
-       for_each_irq_nr(i)
+       for (i = 0; i < nr_irqs; i++)
                irq_bindcount[i] = 0;
  
        irq_ctx_init(smp_processor_id());
@@@ -14,6 -14,8 +14,8 @@@
  #include <linux/irqflags.h>
  #include <linux/smp.h>
  #include <linux/percpu.h>
+ #include <linux/irqnr.h>
  #include <asm/atomic.h>
  #include <asm/ptrace.h>
  #include <asm/system.h>
@@@ -109,13 -111,13 +111,13 @@@ extern void enable_irq(unsigned int irq
  
  extern cpumask_t irq_default_affinity;
  
 -extern int irq_set_affinity(unsigned int irq, cpumask_t cpumask);
 +extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask);
  extern int irq_can_set_affinity(unsigned int irq);
  extern int irq_select_affinity(unsigned int irq);
  
  #else /* CONFIG_SMP */
  
 -static inline int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
 +static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m)
  {
        return -EINVAL;
  }
diff --combined include/linux/irq.h
@@@ -113,8 -113,7 +113,8 @@@ struct irq_chip 
        void            (*eoi)(unsigned int irq);
  
        void            (*end)(unsigned int irq);
 -      void            (*set_affinity)(unsigned int irq, cpumask_t dest);
 +      void            (*set_affinity)(unsigned int irq,
 +                                      const struct cpumask *dest);
        int             (*retrigger)(unsigned int irq);
        int             (*set_type)(unsigned int irq, unsigned int flow_type);
        int             (*set_wake)(unsigned int irq, unsigned int on);
        const char      *typename;
  };
  
+ struct timer_rand_state;
+ struct irq_2_iommu;
  /**
   * struct irq_desc - interrupt descriptor
   * @irq:              interrupt number for this descriptor
   */
  struct irq_desc {
        unsigned int            irq;
+ #ifdef CONFIG_SPARSE_IRQ
+       struct timer_rand_state *timer_rand_state;
+       unsigned int            *kstat_irqs;
+ # ifdef CONFIG_INTR_REMAP
+       struct irq_2_iommu      *irq_2_iommu;
+ # endif
+ #endif
        irq_flow_handler_t      handle_irq;
        struct irq_chip         *chip;
        struct msi_desc         *msi_desc;
        const char              *name;
  } ____cacheline_internodealigned_in_smp;
  
+ extern void early_irq_init(void);
+ extern void arch_early_irq_init(void);
+ extern void arch_init_chip_data(struct irq_desc *desc, int cpu);
+ extern void arch_init_copy_chip_data(struct irq_desc *old_desc,
+                                       struct irq_desc *desc, int cpu);
+ extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc);
  
+ #ifndef CONFIG_SPARSE_IRQ
  extern struct irq_desc irq_desc[NR_IRQS];
  
  static inline struct irq_desc *irq_to_desc(unsigned int irq)
  {
-       return (irq < nr_irqs) ? irq_desc + irq : NULL;
+       return (irq < NR_IRQS) ? irq_desc + irq : NULL;
+ }
+ static inline struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+ {
+       return irq_to_desc(irq);
  }
  
+ #else
+ extern struct irq_desc *irq_to_desc(unsigned int irq);
+ extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu);
+ extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu);
+ # define for_each_irq_desc(irq, desc)         \
+       for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs; irq++, desc = irq_to_desc(irq))
+ # define for_each_irq_desc_reverse(irq, desc)                          \
+       for (irq = nr_irqs - 1, desc = irq_to_desc(irq); irq >= 0; irq--, desc = irq_to_desc(irq))
+ #define kstat_irqs_this_cpu(DESC) \
+       ((DESC)->kstat_irqs[smp_processor_id()])
+ #define kstat_incr_irqs_this_cpu(irqno, DESC) \
+       ((DESC)->kstat_irqs[smp_processor_id()]++)
+ #endif
  /*
   * Migration helpers for obsolete names, they will go away:
   */
@@@ -381,6 -418,11 +419,11 @@@ extern int set_irq_msi(unsigned int irq
  #define get_irq_data(irq)     (irq_to_desc(irq)->handler_data)
  #define get_irq_msi(irq)      (irq_to_desc(irq)->msi_desc)
  
+ #define get_irq_desc_chip(desc)               ((desc)->chip)
+ #define get_irq_desc_chip_data(desc)  ((desc)->chip_data)
+ #define get_irq_desc_data(desc)               ((desc)->handler_data)
+ #define get_irq_desc_msi(desc)                ((desc)->msi_desc)
  #endif /* CONFIG_GENERIC_HARDIRQS */
  
  #endif /* !CONFIG_S390 */
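
With CONFIG_SPARSE_IRQ, irq_to_desc() can return NULL for a slot that has no descriptor allocated, which is why the for_each_irq_desc() callers touched elsewhere in this merge (drivers/xen/events.c, kernel/irq/proc.c) gain an explicit NULL check. A minimal sketch of the resulting iteration idiom; the function name is illustrative and not part of this merge:

	#include <linux/irq.h>

	static void walk_irq_descs(void)
	{
		struct irq_desc *desc;
		int irq;

		for_each_irq_desc(irq, desc) {
			/* sparse IRQ: unallocated slots show up as NULL */
			if (!desc)
				continue;
			/* ... inspect desc, e.g. desc->name ... */
		}
	}
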
diff --combined init/Kconfig
@@@ -808,6 -808,7 +808,7 @@@ config TRACEPOINT
  
  config MARKERS
        bool "Activate markers"
+       depends on TRACEPOINTS
        help
          Place an empty function call at each marker site. Can be
          dynamically changed for a probe function.
@@@ -916,15 -917,6 +917,15 @@@ config KMO
  
  endif # MODULES
  
 +config INIT_ALL_POSSIBLE
 +      bool
 +      help
 +        Back when each arch used to define their own cpu_online_map and
 +        cpu_possible_map, some of them chose to initialize cpu_possible_map
 +        with all 1s, and others with all 0s.  When they were centralised,
 +        it was better to provide this option than to break all the archs
 +        and have several arch maintainers pursuing me down dark alleys.
 +
  config STOP_MACHINE
        bool
        default y
diff --combined kernel/irq/chip.c
   */
  void dynamic_irq_init(unsigned int irq)
  {
-       struct irq_desc *desc = irq_to_desc(irq);
+       struct irq_desc *desc;
        unsigned long flags;
  
+       desc = irq_to_desc(irq);
        if (!desc) {
                WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
                return;
@@@ -45,7 -46,7 +46,7 @@@
        desc->irq_count = 0;
        desc->irqs_unhandled = 0;
  #ifdef CONFIG_SMP
 -      cpus_setall(desc->affinity);
 +      cpumask_setall(&desc->affinity);
  #endif
        spin_unlock_irqrestore(&desc->lock, flags);
  }
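
For reference, the pointer-based cpumask calls that this merge substitutes for the old by-value API, collected from the conversions in the surrounding hunks (cpus_setall() above, the scheduler and Xen hunks below):

	/*
	 * old (cpumask_t by value)        new (struct cpumask pointer)
	 * cpus_setall(mask)               cpumask_setall(&mask)
	 * cpus_clear(mask)                cpumask_clear(&mask)
	 * cpu_set(cpu, mask)              cpumask_set_cpu(cpu, &mask)
	 * cpu_clear(cpu, mask)            cpumask_clear_cpu(cpu, &mask)
	 * cpu_isset(cpu, mask)            cpumask_test_cpu(cpu, &mask)
	 * cpus_and(dst, a, b)             cpumask_and(&dst, &a, &b)
	 * cpus_intersects(a, b)           cpumask_intersects(&a, &b)
	 * cpus_weight(mask)               cpumask_weight(&mask)
	 * first_cpu(mask)                 cpumask_first(&mask)
	 * for_each_cpu_mask_nr(i, mask)   for_each_cpu(i, &mask)
	 */
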
diff --combined kernel/irq/proc.c
@@@ -40,42 -40,33 +40,42 @@@ static ssize_t irq_affinity_proc_write(
                const char __user *buffer, size_t count, loff_t *pos)
  {
        unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
 -      cpumask_t new_value;
 +      cpumask_var_t new_value;
        int err;
  
        if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity ||
            irq_balancing_disabled(irq))
                return -EIO;
  
 +      if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
 +              return -ENOMEM;
 +
        err = cpumask_parse_user(buffer, count, new_value);
        if (err)
 -              return err;
 +              goto free_cpumask;
  
 -      if (!is_affinity_mask_valid(new_value))
 -              return -EINVAL;
 +      if (!is_affinity_mask_valid(*new_value)) {
 +              err = -EINVAL;
 +              goto free_cpumask;
 +      }
  
        /*
         * Do not allow disabling IRQs completely - it's a too easy
         * way to make the system unusable accidentally :-) At least
         * one online CPU still has to be targeted.
         */
 -      if (!cpus_intersects(new_value, cpu_online_map))
 +      if (!cpumask_intersects(new_value, cpu_online_mask)) {
                /* Special case for empty set - allow the architecture
                   code to set default SMP affinity. */
 -              return irq_select_affinity_usr(irq) ? -EINVAL : count;
 -
 -      irq_set_affinity(irq, new_value);
 -
 -      return count;
 +              err = irq_select_affinity_usr(irq) ? -EINVAL : count;
 +      } else {
 +              irq_set_affinity(irq, new_value);
 +              err = count;
 +      }
 +
 +free_cpumask:
 +      free_cpumask_var(new_value);
 +      return err;
  }
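
The hunk above is the conversion this merge repeats throughout: an on-stack cpumask_t becomes a cpumask_var_t that must be allocated before use and freed on every exit path. A minimal sketch of the pattern in isolation, using only the APIs already present above; the function name and GFP flag are illustrative:

	#include <linux/cpumask.h>
	#include <linux/interrupt.h>

	static int apply_user_affinity(unsigned int irq,
				       const char __user *buf, size_t count)
	{
		cpumask_var_t mask;
		int err;

		/* with CONFIG_CPUMASK_OFFSTACK=y this really allocates and can fail */
		if (!alloc_cpumask_var(&mask, GFP_KERNEL))
			return -ENOMEM;

		err = cpumask_parse_user(buf, count, mask);
		if (!err)
			err = irq_set_affinity(irq, mask);

		free_cpumask_var(mask);	/* a no-op when the mask is embedded */
		return err;
	}
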
  
  static int irq_affinity_proc_open(struct inode *inode, struct file *file)
@@@ -104,7 -95,7 +104,7 @@@ static ssize_t default_affinity_write(s
        cpumask_t new_value;
        int err;
  
 -      err = cpumask_parse_user(buffer, count, new_value);
 +      err = cpumask_parse_user(buffer, count, &new_value);
        if (err)
                return err;
  
@@@ -252,7 -243,11 +252,11 @@@ void init_irq_proc(void
        /*
         * Create entries for all existing IRQs.
         */
-       for_each_irq_desc(irq, desc)
+       for_each_irq_desc(irq, desc) {
+               if (!desc)
+                       continue;
                register_irq_proc(irq, desc);
+       }
  }
  
diff --combined kernel/profile.c
@@@ -442,7 -442,7 +442,7 @@@ void profile_tick(int type
  static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
                        int count, int *eof, void *data)
  {
 -      int len = cpumask_scnprintf(page, count, *(cpumask_t *)data);
 +      int len = cpumask_scnprintf(page, count, (cpumask_t *)data);
        if (count - len < 2)
                return -EINVAL;
        len += sprintf(page + len, "\n");
@@@ -456,7 -456,7 +456,7 @@@ static int prof_cpu_mask_write_proc(str
        unsigned long full_count = count, err;
        cpumask_t new_value;
  
 -      err = cpumask_parse_user(buffer, count, new_value);
 +      err = cpumask_parse_user(buffer, count, &new_value);
        if (err)
                return err;
  
@@@ -544,7 -544,7 +544,7 @@@ static const struct file_operations pro
  };
  
  #ifdef CONFIG_SMP
- static inline void profile_nop(void *unused)
+ static void profile_nop(void *unused)
  {
  }
  
diff --combined kernel/sched.c
   */
  #define RUNTIME_INF   ((u64)~0ULL)
  
+ DEFINE_TRACE(sched_wait_task);
+ DEFINE_TRACE(sched_wakeup);
+ DEFINE_TRACE(sched_wakeup_new);
+ DEFINE_TRACE(sched_switch);
+ DEFINE_TRACE(sched_migrate_task);
  #ifdef CONFIG_SMP
  /*
   * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
@@@ -261,6 -267,10 +267,10 @@@ struct task_group 
        struct cgroup_subsys_state css;
  #endif
  
+ #ifdef CONFIG_USER_SCHED
+       uid_t uid;
+ #endif
  #ifdef CONFIG_FAIR_GROUP_SCHED
        /* schedulable entities of this group on each cpu */
        struct sched_entity **se;
  
  #ifdef CONFIG_USER_SCHED
  
+ /* Helper function to pass uid information to create_sched_user() */
+ void set_tg_uid(struct user_struct *user)
+ {
+       user->tg->uid = user->uid;
+ }
  /*
   * Root task group.
   *    Every UID task group (including init_task_group aka UID-0) will
@@@ -481,14 -497,14 +497,14 @@@ struct rt_rq 
   */
  struct root_domain {
        atomic_t refcount;
-       cpumask_t span;
-       cpumask_t online;
+       cpumask_var_t span;
+       cpumask_var_t online;
  
        /*
         * The "RT overload" flag: it gets set if a CPU has more than
         * one runnable RT task.
         */
-       cpumask_t rto_mask;
+       cpumask_var_t rto_mask;
        atomic_t rto_count;
  #ifdef CONFIG_SMP
        struct cpupri cpupri;
@@@ -703,45 -719,18 +719,18 @@@ static __read_mostly char *sched_feat_n
  
  #undef SCHED_FEAT
  
- static int sched_feat_open(struct inode *inode, struct file *filp)
+ static int sched_feat_show(struct seq_file *m, void *v)
  {
        int i;
  
        for (i = 0; sched_feat_names[i]; i++) {
-               len += strlen(sched_feat_names[i]);
-               len += 4;
-       }
-       buf = kmalloc(len + 2, GFP_KERNEL);
-       if (!buf)
-               return -ENOMEM;
-       for (i = 0; sched_feat_names[i]; i++) {
-               if (sysctl_sched_features & (1UL << i))
-                       r += sprintf(buf + r, "%s ", sched_feat_names[i]);
-               else
-                       r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
+               if (!(sysctl_sched_features & (1UL << i)))
+                       seq_puts(m, "NO_");
+               seq_printf(m, "%s ", sched_feat_names[i]);
        }
+       seq_puts(m, "\n");
  
-       r += sprintf(buf + r, "\n");
-       WARN_ON(r >= len + 2);
-       r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-       kfree(buf);
-       return r;
+       return 0;
  }
  
  static ssize_t
@@@ -786,10 -775,17 +775,17 @@@ sched_feat_write(struct file *filp, con
        return cnt;
  }
  
+ static int sched_feat_open(struct inode *inode, struct file *filp)
+ {
+       return single_open(filp, sched_feat_show, NULL);
+ }
  static struct file_operations sched_feat_fops = {
-       .open   = sched_feat_open,
-       .read   = sched_feat_read,
-       .write  = sched_feat_write,
+       .open           = sched_feat_open,
+       .write          = sched_feat_write,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
  };
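
The conversion above drops the hand-rolled read buffer in favour of the single_open() seq_file idiom: the open method binds a show callback, and seq_read/seq_lseek/single_release handle buffering, offsets and teardown. A minimal sketch of the same idiom for a hypothetical read-only file (all names illustrative):

	#include <linux/fs.h>
	#include <linux/seq_file.h>

	static int example_show(struct seq_file *m, void *v)
	{
		seq_printf(m, "state %d\n", 42);	/* emit the whole file in one pass */
		return 0;
	}

	static int example_open(struct inode *inode, struct file *filp)
	{
		return single_open(filp, example_show, NULL);
	}

	static const struct file_operations example_fops = {
		.open		= example_open,
		.read		= seq_read,
		.llseek		= seq_lseek,
		.release	= single_release,
	};
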
  
  static __init int sched_init_debug(void)
@@@ -1474,27 -1470,13 +1470,13 @@@ static voi
  update_group_shares_cpu(struct task_group *tg, int cpu,
                        unsigned long sd_shares, unsigned long sd_rq_weight)
  {
-       int boost = 0;
        unsigned long shares;
        unsigned long rq_weight;
  
        if (!tg->se[cpu])
                return;
  
-       rq_weight = tg->cfs_rq[cpu]->load.weight;
-       /*
-        * If there are currently no tasks on the cpu pretend there is one of
-        * average load so that when a new task gets to run here it will not
-        * get delayed by group starvation.
-        */
-       if (!rq_weight) {
-               boost = 1;
-               rq_weight = NICE_0_LOAD;
-       }
-       if (unlikely(rq_weight > sd_rq_weight))
-               rq_weight = sd_rq_weight;
+       rq_weight = tg->cfs_rq[cpu]->rq_weight;
  
        /*
         *           \Sum shares * rq_weight
         *               \Sum rq_weight
         *
         */
-       shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+       shares = (sd_shares * rq_weight) / sd_rq_weight;
        shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
  
        if (abs(shares - tg->se[cpu]->load.weight) >
                unsigned long flags;
  
                spin_lock_irqsave(&rq->lock, flags);
-               /*
-                * record the actual number of shares, not the boosted amount.
-                */
-               tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-               tg->cfs_rq[cpu]->rq_weight = rq_weight;
+               tg->cfs_rq[cpu]->shares = shares;
  
                __set_se_shares(tg->se[cpu], shares);
                spin_unlock_irqrestore(&rq->lock, flags);
   */
  static int tg_shares_up(struct task_group *tg, void *data)
  {
-       unsigned long rq_weight = 0;
+       unsigned long weight, rq_weight = 0;
        unsigned long shares = 0;
        struct sched_domain *sd = data;
        int i;
  
-       for_each_cpu_mask(i, sd->span) {
-               rq_weight += tg->cfs_rq[i]->load.weight;
+       for_each_cpu(i, sched_domain_span(sd)) {
+               /*
+                * If there are currently no tasks on the cpu pretend there
+                * is one of average load so that when a new task gets to
+                * run here it will not get delayed by group starvation.
+                */
+               weight = tg->cfs_rq[i]->load.weight;
+               if (!weight)
+                       weight = NICE_0_LOAD;
+               tg->cfs_rq[i]->rq_weight = weight;
+               rq_weight += weight;
                shares += tg->cfs_rq[i]->shares;
        }
  
        if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
                shares = tg->shares;
  
-       if (!rq_weight)
-               rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
-       for_each_cpu_mask(i, sd->span)
+       for_each_cpu(i, sched_domain_span(sd))
                update_group_shares_cpu(tg, i, shares, rq_weight);
  
        return 0;
@@@ -1612,6 -1597,39 +1597,39 @@@ static inline void update_shares_locked
  
  #endif
  
+ /*
+  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+  */
+ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+       __releases(this_rq->lock)
+       __acquires(busiest->lock)
+       __acquires(this_rq->lock)
+ {
+       int ret = 0;
+       if (unlikely(!irqs_disabled())) {
+               /* printk() doesn't work well under rq->lock */
+               spin_unlock(&this_rq->lock);
+               BUG_ON(1);
+       }
+       if (unlikely(!spin_trylock(&busiest->lock))) {
+               if (busiest < this_rq) {
+                       spin_unlock(&this_rq->lock);
+                       spin_lock(&busiest->lock);
+                       spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
+                       ret = 1;
+               } else
+                       spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
+       }
+       return ret;
+ }
+ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+       __releases(busiest->lock)
+ {
+       spin_unlock(&busiest->lock);
+       lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+ }
  #endif
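
The comment above states the contract tersely: double_lock_balance() may drop this_rq->lock in order to take the two locks in ascending address order, and a nonzero return tells the caller that this_rq was unlocked and re-locked in the process. A minimal caller sketch (names illustrative), assuming this_rq->lock is already held:

	static void example_pull(struct rq *this_rq, struct rq *busiest)
	{
		if (double_lock_balance(this_rq, busiest)) {
			/*
			 * this_rq->lock was released to honour the
			 * address-order rule; anything read from this_rq
			 * before this point may be stale and should be
			 * revalidated here.
			 */
		}

		/* ... pull tasks from busiest onto this_rq ... */

		double_unlock_balance(this_rq, busiest);
	}
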
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
@@@ -2079,15 -2097,17 +2097,17 @@@ find_idlest_group(struct sched_domain *
                int i;
  
                /* Skip over this group if it has no CPUs allowed */
-               if (!cpus_intersects(group->cpumask, p->cpus_allowed))
+               if (!cpumask_intersects(sched_group_cpus(group),
+                                       &p->cpus_allowed))
                        continue;
  
-               local_group = cpu_isset(this_cpu, group->cpumask);
+               local_group = cpumask_test_cpu(this_cpu,
+                                              sched_group_cpus(group));
  
                /* Tally up the load of all CPUs in the group */
                avg_load = 0;
  
-               for_each_cpu_mask_nr(i, group->cpumask) {
+               for_each_cpu(i, sched_group_cpus(group)) {
                        /* Bias balancing toward cpus of our domain */
                        if (local_group)
                                load = source_load(i, load_idx);
   * find_idlest_cpu - find the idlest cpu among the cpus in group.
   */
  static int
- find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
-               cpumask_t *tmp)
+ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  {
        unsigned long load, min_load = ULONG_MAX;
        int idlest = -1;
        int i;
  
        /* Traverse only the allowed CPUs */
-       cpus_and(*tmp, group->cpumask, p->cpus_allowed);
-       for_each_cpu_mask_nr(i, *tmp) {
+       for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
                load = weighted_cpuload(i);
  
                if (load < min_load || (load == min_load && i == this_cpu)) {
@@@ -2171,7 -2188,6 +2188,6 @@@ static int sched_balance_self(int cpu, 
                update_shares(sd);
  
        while (sd) {
-               cpumask_t span, tmpmask;
                struct sched_group *group;
                int new_cpu, weight;
  
                        continue;
                }
  
-               span = sd->span;
                group = find_idlest_group(sd, t, cpu);
                if (!group) {
                        sd = sd->child;
                        continue;
                }
  
-               new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask);
+               new_cpu = find_idlest_cpu(group, t, cpu);
                if (new_cpu == -1 || new_cpu == cpu) {
                        /* Now try balancing at a lower domain level of cpu */
                        sd = sd->child;
  
                /* Now try balancing at a lower domain level of new_cpu */
                cpu = new_cpu;
+               weight = cpumask_weight(sched_domain_span(sd));
                sd = NULL;
-               weight = cpus_weight(span);
                for_each_domain(cpu, tmp) {
-                       if (weight <= cpus_weight(tmp->span))
+                       if (weight <= cpumask_weight(sched_domain_span(tmp)))
                                break;
                        if (tmp->flags & flag)
                                sd = tmp;
@@@ -2244,7 -2259,7 +2259,7 @@@ static int try_to_wake_up(struct task_s
                cpu = task_cpu(p);
  
                for_each_domain(this_cpu, sd) {
-                       if (cpu_isset(cpu, sd->span)) {
+                       if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
                                update_shares(sd);
                                break;
                        }
        else {
                struct sched_domain *sd;
                for_each_domain(this_cpu, sd) {
-                       if (cpu_isset(cpu, sd->span)) {
+                       if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
                                schedstat_inc(sd, ttwu_wake_remote);
                                break;
                        }
@@@ -2812,40 -2827,6 +2827,6 @@@ static void double_rq_unlock(struct rq 
  }
  
  /*
-  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
-  */
- static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
-       __releases(this_rq->lock)
-       __acquires(busiest->lock)
-       __acquires(this_rq->lock)
- {
-       int ret = 0;
-       if (unlikely(!irqs_disabled())) {
-               /* printk() doesn't work good under rq->lock */
-               spin_unlock(&this_rq->lock);
-               BUG_ON(1);
-       }
-       if (unlikely(!spin_trylock(&busiest->lock))) {
-               if (busiest < this_rq) {
-                       spin_unlock(&this_rq->lock);
-                       spin_lock(&busiest->lock);
-                       spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
-                       ret = 1;
-               } else
-                       spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
-       }
-       return ret;
- }
- static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
-       __releases(busiest->lock)
- {
-       spin_unlock(&busiest->lock);
-       lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
- }
- /*
   * If dest_cpu is allowed for this process, migrate the task to it.
   * This is accomplished by forcing the cpu_allowed mask to only
   * allow dest_cpu, which will force the cpu onto dest_cpu. Then
@@@ -2858,7 -2839,7 +2839,7 @@@ static void sched_migrate_task(struct t
        struct rq *rq;
  
        rq = task_rq_lock(p, &flags);
-       if (!cpu_isset(dest_cpu, p->cpus_allowed)
+       if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
            || unlikely(!cpu_active(dest_cpu)))
                goto out;
  
@@@ -2924,7 -2905,7 +2905,7 @@@ int can_migrate_task(struct task_struc
         * 2) cannot be migrated to this CPU due to cpus_allowed, or
         * 3) are cache-hot on their current CPU.
         */
-       if (!cpu_isset(this_cpu, p->cpus_allowed)) {
+       if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
                schedstat_inc(p, se.nr_failed_migrations_affine);
                return 0;
        }
@@@ -3099,7 -3080,7 +3080,7 @@@ static int move_one_task(struct rq *thi
  static struct sched_group *
  find_busiest_group(struct sched_domain *sd, int this_cpu,
                   unsigned long *imbalance, enum cpu_idle_type idle,
-                  int *sd_idle, const cpumask_t *cpus, int *balance)
+                  int *sd_idle, const struct cpumask *cpus, int *balance)
  {
        struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
        unsigned long max_load, avg_load, total_load, this_load, total_pwr;
                unsigned long sum_avg_load_per_task;
                unsigned long avg_load_per_task;
  
-               local_group = cpu_isset(this_cpu, group->cpumask);
+               local_group = cpumask_test_cpu(this_cpu,
+                                              sched_group_cpus(group));
  
                if (local_group)
-                       balance_cpu = first_cpu(group->cpumask);
+                       balance_cpu = cpumask_first(sched_group_cpus(group));
  
                /* Tally up the load of all CPUs in the group */
                sum_weighted_load = sum_nr_running = avg_load = 0;
                max_cpu_load = 0;
                min_cpu_load = ~0UL;
  
-               for_each_cpu_mask_nr(i, group->cpumask) {
-                       struct rq *rq;
-                       if (!cpu_isset(i, *cpus))
-                               continue;
-                       rq = cpu_rq(i);
+               for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+                       struct rq *rq = cpu_rq(i);
  
                        if (*sd_idle && rq->nr_running)
                                *sd_idle = 0;
                 */
                if ((sum_nr_running < min_nr_running) ||
                    (sum_nr_running == min_nr_running &&
-                    first_cpu(group->cpumask) <
-                    first_cpu(group_min->cpumask))) {
+                    cpumask_first(sched_group_cpus(group)) <
+                    cpumask_first(sched_group_cpus(group_min)))) {
                        group_min = group;
                        min_nr_running = sum_nr_running;
                        min_load_per_task = sum_weighted_load /
                if (sum_nr_running <= group_capacity - 1) {
                        if (sum_nr_running > leader_nr_running ||
                            (sum_nr_running == leader_nr_running &&
-                            first_cpu(group->cpumask) >
-                             first_cpu(group_leader->cpumask))) {
+                            cpumask_first(sched_group_cpus(group)) >
+                            cpumask_first(sched_group_cpus(group_leader)))) {
                                group_leader = group;
                                leader_nr_running = sum_nr_running;
                        }
   */
  static struct rq *
  find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
-                  unsigned long imbalance, const cpumask_t *cpus)
+                  unsigned long imbalance, const struct cpumask *cpus)
  {
        struct rq *busiest = NULL, *rq;
        unsigned long max_load = 0;
        int i;
  
-       for_each_cpu_mask_nr(i, group->cpumask) {
+       for_each_cpu(i, sched_group_cpus(group)) {
                unsigned long wl;
  
-               if (!cpu_isset(i, *cpus))
+               if (!cpumask_test_cpu(i, cpus))
                        continue;
  
                rq = cpu_rq(i);
   */
  static int load_balance(int this_cpu, struct rq *this_rq,
                        struct sched_domain *sd, enum cpu_idle_type idle,
-                       int *balance, cpumask_t *cpus)
+                       int *balance, struct cpumask *cpus)
  {
        int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
        struct sched_group *group;
        struct rq *busiest;
        unsigned long flags;
  
-       cpus_setall(*cpus);
+       cpumask_setall(cpus);
  
        /*
         * When power savings policy is enabled for the parent domain, idle
@@@ -3527,8 -3504,8 +3504,8 @@@ redo
  
                /* All tasks on this runqueue were pinned by CPU affinity */
                if (unlikely(all_pinned)) {
-                       cpu_clear(cpu_of(busiest), *cpus);
-                       if (!cpus_empty(*cpus))
+                       cpumask_clear_cpu(cpu_of(busiest), cpus);
+                       if (!cpumask_empty(cpus))
                                goto redo;
                        goto out_balanced;
                }
                        /* don't kick the migration_thread, if the curr
                         * task on busiest cpu can't be moved to this_cpu
                         */
-                       if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+                       if (!cpumask_test_cpu(this_cpu,
+                                             &busiest->curr->cpus_allowed)) {
                                spin_unlock_irqrestore(&busiest->lock, flags);
                                all_pinned = 1;
                                goto out_one_pinned;
@@@ -3620,7 -3598,7 +3598,7 @@@ out
   */
  static int
  load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
-                       cpumask_t *cpus)
+                       struct cpumask *cpus)
  {
        struct sched_group *group;
        struct rq *busiest = NULL;
        int sd_idle = 0;
        int all_pinned = 0;
  
-       cpus_setall(*cpus);
+       cpumask_setall(cpus);
  
        /*
         * When power savings policy is enabled for the parent domain, idle
@@@ -3673,8 -3651,8 +3651,8 @@@ redo
                double_unlock_balance(this_rq, busiest);
  
                if (unlikely(all_pinned)) {
-                       cpu_clear(cpu_of(busiest), *cpus);
-                       if (!cpus_empty(*cpus))
+                       cpumask_clear_cpu(cpu_of(busiest), cpus);
+                       if (!cpumask_empty(cpus))
                                goto redo;
                }
        }
@@@ -3707,9 -3685,12 +3685,12 @@@ out_balanced
  static void idle_balance(int this_cpu, struct rq *this_rq)
  {
        struct sched_domain *sd;
-       int pulled_task = -1;
+       int pulled_task = 0;
        unsigned long next_balance = jiffies + HZ;
-       cpumask_t tmpmask;
+       cpumask_var_t tmpmask;
+       if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
+               return;
  
        for_each_domain(this_cpu, sd) {
                unsigned long interval;
                if (sd->flags & SD_BALANCE_NEWIDLE)
                        /* If we've pulled tasks over stop searching: */
                        pulled_task = load_balance_newidle(this_cpu, this_rq,
-                                                          sd, &tmpmask);
+                                                          sd, tmpmask);
  
                interval = msecs_to_jiffies(sd->balance_interval);
                if (time_after(next_balance, sd->last_balance + interval))
                 */
                this_rq->next_balance = next_balance;
        }
+       free_cpumask_var(tmpmask);
  }
  
  /*
@@@ -3772,7 -3754,7 +3754,7 @@@ static void active_load_balance(struct 
        /* Search for an sd spanning us and the target CPU. */
        for_each_domain(target_cpu, sd) {
                if ((sd->flags & SD_LOAD_BALANCE) &&
-                   cpu_isset(busiest_cpu, sd->span))
+                   cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
                                break;
        }
  
  #ifdef CONFIG_NO_HZ
  static struct {
        atomic_t load_balancer;
-       cpumask_t cpu_mask;
+       cpumask_var_t cpu_mask;
  } nohz ____cacheline_aligned = {
        .load_balancer = ATOMIC_INIT(-1),
-       .cpu_mask = CPU_MASK_NONE,
  };
  
  /*
@@@ -3822,7 -3803,7 +3803,7 @@@ int select_nohz_load_balancer(int stop_
        int cpu = smp_processor_id();
  
        if (stop_tick) {
-               cpu_set(cpu, nohz.cpu_mask);
+               cpumask_set_cpu(cpu, nohz.cpu_mask);
                cpu_rq(cpu)->in_nohz_recently = 1;
  
                /*
                }
  
                /* time for ilb owner also to sleep */
-               if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+               if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
                        if (atomic_read(&nohz.load_balancer) == cpu)
                                atomic_set(&nohz.load_balancer, -1);
                        return 0;
                } else if (atomic_read(&nohz.load_balancer) == cpu)
                        return 1;
        } else {
-               if (!cpu_isset(cpu, nohz.cpu_mask))
+               if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
                        return 0;
  
-               cpu_clear(cpu, nohz.cpu_mask);
+               cpumask_clear_cpu(cpu, nohz.cpu_mask);
  
                if (atomic_read(&nohz.load_balancer) == cpu)
                        if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
@@@ -3880,7 -3861,11 +3861,11 @@@ static void rebalance_domains(int cpu, 
        unsigned long next_balance = jiffies + 60*HZ;
        int update_next_balance = 0;
        int need_serialize;
-       cpumask_t tmp;
+       cpumask_var_t tmp;
+       /* Fails alloc?  Rebalancing probably not a priority right now. */
+       if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
+               return;
  
        for_each_domain(cpu, sd) {
                if (!(sd->flags & SD_LOAD_BALANCE))
                }
  
                if (time_after_eq(jiffies, sd->last_balance + interval)) {
-                       if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) {
+                       if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
                                /*
                                 * We've pulled tasks over so either we're no
                                 * longer idle, or one of our SMT siblings is
@@@ -3939,6 -3924,8 +3924,8 @@@ out
         */
        if (likely(update_next_balance))
                rq->next_balance = next_balance;
+       free_cpumask_var(tmp);
  }
  
  /*
@@@ -3963,12 -3950,13 +3950,13 @@@ static void run_rebalance_domains(struc
         */
        if (this_rq->idle_at_tick &&
            atomic_read(&nohz.load_balancer) == this_cpu) {
                struct rq *rq;
                int balance_cpu;
  
-               cpu_clear(this_cpu, cpus);
-               for_each_cpu_mask_nr(balance_cpu, cpus) {
+               for_each_cpu(balance_cpu, nohz.cpu_mask) {
+                       if (balance_cpu == this_cpu)
+                               continue;
                        /*
                         * If this cpu gets work to do, stop the load balancing
                         * work being done for other cpus. Next load
@@@ -4006,7 -3994,7 +3994,7 @@@ static inline void trigger_load_balance
                rq->in_nohz_recently = 0;
  
                if (atomic_read(&nohz.load_balancer) == cpu) {
-                       cpu_clear(cpu, nohz.cpu_mask);
+                       cpumask_clear_cpu(cpu, nohz.cpu_mask);
                        atomic_set(&nohz.load_balancer, -1);
                }
  
                         * TBD: Traverse the sched domains and nominate
                         * the nearest cpu in the nohz.cpu_mask.
                         */
-                       int ilb = first_cpu(nohz.cpu_mask);
+                       int ilb = cpumask_first(nohz.cpu_mask);
  
                        if (ilb < nr_cpu_ids)
                                resched_cpu(ilb);
         * cpus with ticks stopped, is it time for that to stop?
         */
        if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
-           cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+           cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
                resched_cpu(cpu);
                return;
        }
         * someone else, then no need raise the SCHED_SOFTIRQ
         */
        if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
-           cpu_isset(cpu, nohz.cpu_mask))
+           cpumask_test_cpu(cpu, nohz.cpu_mask))
                return;
  #endif
        if (time_after_eq(jiffies, rq->next_balance))
@@@ -4203,7 -4191,6 +4191,6 @@@ void account_steal_time(struct task_str
  
        if (p == rq->idle) {
                p->stime = cputime_add(p->stime, steal);
-               account_group_system_time(p, steal);
                if (atomic_read(&rq->nr_iowait) > 0)
                        cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
                else
@@@ -4339,7 -4326,7 +4326,7 @@@ void __kprobes sub_preempt_count(int va
        /*
         * Underflow?
         */
-       if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+        if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
                return;
        /*
         * Is the spinlock portion underflowing?
@@@ -5400,10 -5387,9 +5387,9 @@@ out_unlock
        return retval;
  }
  
- long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
+ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
  {
-       cpumask_t cpus_allowed;
-       cpumask_t new_mask = *in_mask;
+       cpumask_var_t cpus_allowed, new_mask;
        struct task_struct *p;
        int retval;
  
        get_task_struct(p);
        read_unlock(&tasklist_lock);
  
+       if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
+               retval = -ENOMEM;
+               goto out_put_task;
+       }
+       if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+               retval = -ENOMEM;
+               goto out_free_cpus_allowed;
+       }
        retval = -EPERM;
        if ((current->euid != p->euid) && (current->euid != p->uid) &&
                        !capable(CAP_SYS_NICE))
        if (retval)
                goto out_unlock;
  
-       cpuset_cpus_allowed(p, &cpus_allowed);
-       cpus_and(new_mask, new_mask, cpus_allowed);
+       cpuset_cpus_allowed(p, cpus_allowed);
+       cpumask_and(new_mask, in_mask, cpus_allowed);
   again:
-       retval = set_cpus_allowed_ptr(p, &new_mask);
+       retval = set_cpus_allowed_ptr(p, new_mask);
  
        if (!retval) {
-               cpuset_cpus_allowed(p, &cpus_allowed);
-               if (!cpus_subset(new_mask, cpus_allowed)) {
+               cpuset_cpus_allowed(p, cpus_allowed);
+               if (!cpumask_subset(new_mask, cpus_allowed)) {
                        /*
                         * We must have raced with a concurrent cpuset
                         * update. Just reset the cpus_allowed to the
                         * cpuset's cpus_allowed
                         */
-                       new_mask = cpus_allowed;
+                       cpumask_copy(new_mask, cpus_allowed);
                        goto again;
                }
        }
  out_unlock:
+       free_cpumask_var(new_mask);
+ out_free_cpus_allowed:
+       free_cpumask_var(cpus_allowed);
+ out_put_task:
        put_task_struct(p);
        put_online_cpus();
        return retval;
  }
  
  static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
-                            cpumask_t *new_mask)
+                            struct cpumask *new_mask)
  {
-       if (len < sizeof(cpumask_t)) {
-               memset(new_mask, 0, sizeof(cpumask_t));
-       } else if (len > sizeof(cpumask_t)) {
-               len = sizeof(cpumask_t);
-       }
+       if (len < cpumask_size())
+               cpumask_clear(new_mask);
+       else if (len > cpumask_size())
+               len = cpumask_size();
        return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
  }
  
  asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
                                      unsigned long __user *user_mask_ptr)
  {
-       cpumask_t new_mask;
+       cpumask_var_t new_mask;
        int retval;
  
-       retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
-       if (retval)
-               return retval;
+       if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+               return -ENOMEM;
  
-       return sched_setaffinity(pid, &new_mask);
+       retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
+       if (retval == 0)
+               retval = sched_setaffinity(pid, new_mask);
+       free_cpumask_var(new_mask);
+       return retval;
  }
  
- long sched_getaffinity(pid_t pid, cpumask_t *mask)
+ long sched_getaffinity(pid_t pid, struct cpumask *mask)
  {
        struct task_struct *p;
        int retval;
        if (retval)
                goto out_unlock;
  
-       cpus_and(*mask, p->cpus_allowed, cpu_online_map);
+       cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
  
  out_unlock:
        read_unlock(&tasklist_lock);
@@@ -5523,19 -5524,24 +5524,24 @@@ asmlinkage long sys_sched_getaffinity(p
                                      unsigned long __user *user_mask_ptr)
  {
        int ret;
-       cpumask_t mask;
+       cpumask_var_t mask;
  
-       if (len < sizeof(cpumask_t))
+       if (len < cpumask_size())
                return -EINVAL;
  
-       ret = sched_getaffinity(pid, &mask);
-       if (ret < 0)
-               return ret;
+       if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+               return -ENOMEM;
  
-       if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
-               return -EFAULT;
+       ret = sched_getaffinity(pid, mask);
+       if (ret == 0) {
+               if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
+                       ret = -EFAULT;
+               else
+                       ret = cpumask_size();
+       }
+       free_cpumask_var(mask);
  
-       return sizeof(cpumask_t);
+       return ret;
  }
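
From userspace the contract is unchanged by the cpumask_var_t conversion: buffers shorter than cpumask_size() are still rejected with -EINVAL and the syscall still returns the number of bytes written on success (glibc hides that and returns 0). A minimal sketch of a caller using the glibc wrappers, not part of this patch:

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		cpu_set_t set;

		CPU_ZERO(&set);
		if (sched_getaffinity(0, sizeof(set), &set) == 0)
			printf("CPU0 allowed: %d\n", CPU_ISSET(0, &set));

		CPU_SET(0, &set);		/* pin ourselves to CPU 0 */
		return sched_setaffinity(0, sizeof(set), &set);
	}
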
  
  /**
@@@ -5877,7 -5883,7 +5883,7 @@@ void __cpuinit init_idle(struct task_st
        idle->se.exec_start = sched_clock();
  
        idle->prio = idle->normal_prio = MAX_PRIO;
-       idle->cpus_allowed = cpumask_of_cpu(cpu);
+       cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
        __set_task_cpu(idle, cpu);
  
        rq->curr = rq->idle = idle;
         * The idle tasks have their own, simple scheduling class:
         */
        idle->sched_class = &idle_sched_class;
+       ftrace_graph_init_task(idle);
  }
  
  /*
   * indicates which cpus entered this state. This is used
   * in the rcu update to wait only for active cpus. For system
   * which do not switch off the HZ timer nohz_cpu_mask should
-  * always be CPU_MASK_NONE.
+  * always be CPU_BITS_NONE.
   */
- cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
+ cpumask_var_t nohz_cpu_mask;
  
  /*
   * Increase the granularity value when there are more CPUs,
@@@ -5960,7 -5967,7 +5967,7 @@@ static inline void sched_init_granulari
   * task must not exit() & deallocate itself prematurely. The
   * call is not atomic; no spinlocks may be held.
   */
- int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
+ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
  {
        struct migration_req req;
        unsigned long flags;
        int ret = 0;
  
        rq = task_rq_lock(p, &flags);
-       if (!cpus_intersects(*new_mask, cpu_online_map)) {
+       if (!cpumask_intersects(new_mask, cpu_online_mask)) {
                ret = -EINVAL;
                goto out;
        }
  
        if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
-                    !cpus_equal(p->cpus_allowed, *new_mask))) {
+                    !cpumask_equal(&p->cpus_allowed, new_mask))) {
                ret = -EINVAL;
                goto out;
        }
        if (p->sched_class->set_cpus_allowed)
                p->sched_class->set_cpus_allowed(p, new_mask);
        else {
-               p->cpus_allowed = *new_mask;
-               p->rt.nr_cpus_allowed = cpus_weight(*new_mask);
+               cpumask_copy(&p->cpus_allowed, new_mask);
+               p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
        }
  
        /* Can the task run on the task's current CPU? If so, we're done */
-       if (cpu_isset(task_cpu(p), *new_mask))
+       if (cpumask_test_cpu(task_cpu(p), new_mask))
                goto out;
  
-       if (migrate_task(p, any_online_cpu(*new_mask), &req)) {
+       if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
                /* Need help from migration thread: drop lock and wait. */
                task_rq_unlock(rq, &flags);
                wake_up_process(rq->migration_thread);
@@@ -6032,7 -6039,7 +6039,7 @@@ static int __migrate_task(struct task_s
        if (task_cpu(p) != src_cpu)
                goto done;
        /* Affinity changed (again). */
-       if (!cpu_isset(dest_cpu, p->cpus_allowed))
+       if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
                goto fail;
  
        on_rq = p->se.on_rq;
@@@ -6126,54 -6133,46 +6133,46 @@@ static int __migrate_task_irq(struct ta
  
  /*
   * Figure out where task on dead CPU should go, use force if necessary.
   */
  static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
  {
-       unsigned long flags;
-       cpumask_t mask;
-       struct rq *rq;
        int dest_cpu;
+       /* FIXME: Use cpumask_of_node here. */
+       cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu));
+       const struct cpumask *nodemask = &_nodemask;
+ again:
+       /* Look for allowed, online CPU in same node. */
+       for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
+               if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+                       goto move;
+       /* Any allowed, online CPU? */
+       dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
+       if (dest_cpu < nr_cpu_ids)
+               goto move;
+       /* No more Mr. Nice Guy. */
+       if (dest_cpu >= nr_cpu_ids) {
+               cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
+               dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
  
-       do {
-               /* On same node? */
-               mask = node_to_cpumask(cpu_to_node(dead_cpu));
-               cpus_and(mask, mask, p->cpus_allowed);
-               dest_cpu = any_online_cpu(mask);
-               /* On any allowed CPU? */
-               if (dest_cpu >= nr_cpu_ids)
-                       dest_cpu = any_online_cpu(p->cpus_allowed);
-               /* No more Mr. Nice Guy. */
-               if (dest_cpu >= nr_cpu_ids) {
-                       cpumask_t cpus_allowed;
-                       cpuset_cpus_allowed_locked(p, &cpus_allowed);
-                       /*
-                        * Try to stay on the same cpuset, where the
-                        * current cpuset may be a subset of all cpus.
-                        * The cpuset_cpus_allowed_locked() variant of
-                        * cpuset_cpus_allowed() will not block. It must be
-                        * called within calls to cpuset_lock/cpuset_unlock.
-                        */
-                       rq = task_rq_lock(p, &flags);
-                       p->cpus_allowed = cpus_allowed;
-                       dest_cpu = any_online_cpu(p->cpus_allowed);
-                       task_rq_unlock(rq, &flags);
-                       /*
-                        * Don't tell them about moving exiting tasks or
-                        * kernel threads (both mm NULL), since they never
-                        * leave kernel.
-                        */
-                       if (p->mm && printk_ratelimit()) {
-                               printk(KERN_INFO "process %d (%s) no "
-                                      "longer affine to cpu%d\n",
-                                       task_pid_nr(p), p->comm, dead_cpu);
-                       }
+               /*
+                * Don't tell them about moving exiting tasks or
+                * kernel threads (both mm NULL), since they never
+                * leave kernel.
+                */
+               if (p->mm && printk_ratelimit()) {
+                       printk(KERN_INFO "process %d (%s) no "
+                              "longer affine to cpu%d\n",
+                              task_pid_nr(p), p->comm, dead_cpu);
                }
-       } while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
+       }
+ move:
+       /* It can have affinity changed while we were choosing. */
+       if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
+               goto again;
  }
  
  /*
   */
  static void migrate_nr_uninterruptible(struct rq *rq_src)
  {
-       struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR));
+       struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
        unsigned long flags;
  
        local_irq_save(flags);
@@@ -6475,7 -6474,7 +6474,7 @@@ static void set_rq_online(struct rq *rq
        if (!rq->online) {
                const struct sched_class *class;
  
-               cpu_set(rq->cpu, rq->rd->online);
+               cpumask_set_cpu(rq->cpu, rq->rd->online);
                rq->online = 1;
  
                for_each_class(class) {
@@@ -6495,7 -6494,7 +6494,7 @@@ static void set_rq_offline(struct rq *r
                                class->rq_offline(rq);
                }
  
-               cpu_clear(rq->cpu, rq->rd->online);
+               cpumask_clear_cpu(rq->cpu, rq->rd->online);
                rq->online = 0;
        }
  }
@@@ -6536,7 -6535,7 +6535,7 @@@ migration_call(struct notifier_block *n
                rq = cpu_rq(cpu);
                spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
-                       BUG_ON(!cpu_isset(cpu, rq->rd->span));
+                       BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
  
                        set_rq_online(rq);
                }
                        break;
                /* Unbind it from offline cpu so it can run. Fall thru. */
                kthread_bind(cpu_rq(cpu)->migration_thread,
-                            any_online_cpu(cpu_online_map));
+                            cpumask_any(cpu_online_mask));
                kthread_stop(cpu_rq(cpu)->migration_thread);
                cpu_rq(cpu)->migration_thread = NULL;
                break;
                rq = cpu_rq(cpu);
                spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
-                       BUG_ON(!cpu_isset(cpu, rq->rd->span));
+                       BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                        set_rq_offline(rq);
                }
                spin_unlock_irqrestore(&rq->lock, flags);
@@@ -6638,36 -6637,14 +6637,14 @@@ early_initcall(migration_init)
  
  #ifdef CONFIG_SCHED_DEBUG
  
- static inline const char *sd_level_to_string(enum sched_domain_level lvl)
- {
-       switch (lvl) {
-       case SD_LV_NONE:
-                       return "NONE";
-       case SD_LV_SIBLING:
-                       return "SIBLING";
-       case SD_LV_MC:
-                       return "MC";
-       case SD_LV_CPU:
-                       return "CPU";
-       case SD_LV_NODE:
-                       return "NODE";
-       case SD_LV_ALLNODES:
-                       return "ALLNODES";
-       case SD_LV_MAX:
-                       return "MAX";
-       }
-       return "MAX";
- }
  static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
-                                 cpumask_t *groupmask)
+                                 struct cpumask *groupmask)
  {
        struct sched_group *group = sd->groups;
        char str[256];
  
-       cpulist_scnprintf(str, sizeof(str), &sd->span);
-       cpus_clear(*groupmask);
 -      cpulist_scnprintf(str, sizeof(str), *sched_domain_span(sd));
++      cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
+       cpumask_clear(groupmask);
  
        printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
  
                return -1;
        }
  
-       printk(KERN_CONT "span %s level %s\n",
-               str, sd_level_to_string(sd->level));
+       printk(KERN_CONT "span %s level %s\n", str, sd->name);
  
-       if (!cpu_isset(cpu, sd->span)) {
+       if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
                printk(KERN_ERR "ERROR: domain->span does not contain "
                                "CPU%d\n", cpu);
        }
-       if (!cpu_isset(cpu, group->cpumask)) {
+       if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
                printk(KERN_ERR "ERROR: domain->groups does not contain"
                                " CPU%d\n", cpu);
        }
                        break;
                }
  
-               if (!cpus_weight(group->cpumask)) {
+               if (!cpumask_weight(sched_group_cpus(group))) {
                        printk(KERN_CONT "\n");
                        printk(KERN_ERR "ERROR: empty group\n");
                        break;
                }
  
-               if (cpus_intersects(*groupmask, group->cpumask)) {
+               if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
                        printk(KERN_CONT "\n");
                        printk(KERN_ERR "ERROR: repeated CPUs\n");
                        break;
                }
  
-               cpus_or(*groupmask, *groupmask, group->cpumask);
+               cpumask_or(groupmask, groupmask, sched_group_cpus(group));
  
-               cpulist_scnprintf(str, sizeof(str), &group->cpumask);
 -              cpulist_scnprintf(str, sizeof(str), *sched_group_cpus(group));
++              cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
                printk(KERN_CONT " %s", str);
  
                group = group->next;
        } while (group != sd->groups);
        printk(KERN_CONT "\n");
  
-       if (!cpus_equal(sd->span, *groupmask))
+       if (!cpumask_equal(sched_domain_span(sd), groupmask))
                printk(KERN_ERR "ERROR: groups don't span domain->span\n");
  
-       if (sd->parent && !cpus_subset(*groupmask, sd->parent->span))
+       if (sd->parent &&
+           !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
                printk(KERN_ERR "ERROR: parent span is not a superset "
                        "of domain->span\n");
        return 0;
  
  static void sched_domain_debug(struct sched_domain *sd, int cpu)
  {
-       cpumask_t *groupmask;
+       cpumask_var_t groupmask;
        int level = 0;
  
        if (!sd) {
  
        printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
  
-       groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
-       if (!groupmask) {
+       if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
                printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
                return;
        }
                if (!sd)
                        break;
        }
-       kfree(groupmask);
+       free_cpumask_var(groupmask);
  }
  #else /* !CONFIG_SCHED_DEBUG */
  # define sched_domain_debug(sd, cpu) do { } while (0)
  
  static int sd_degenerate(struct sched_domain *sd)
  {
-       if (cpus_weight(sd->span) == 1)
+       if (cpumask_weight(sched_domain_span(sd)) == 1)
                return 1;
  
        /* Following flags need at least 2 groups */
@@@ -6801,7 -6777,7 +6777,7 @@@ sd_parent_degenerate(struct sched_domai
        if (sd_degenerate(parent))
                return 1;
  
-       if (!cpus_equal(sd->span, parent->span))
+       if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
                return 0;
  
        /* Does parent contain flags not in child? */
                                SD_BALANCE_EXEC |
                                SD_SHARE_CPUPOWER |
                                SD_SHARE_PKG_RESOURCES);
+               if (nr_node_ids == 1)
+                       pflags &= ~SD_SERIALIZE;
        }
        if (~cflags & pflags)
                return 0;
        return 1;
  }
  
+ static void free_rootdomain(struct root_domain *rd)
+ {
+       cpupri_cleanup(&rd->cpupri);
+       free_cpumask_var(rd->rto_mask);
+       free_cpumask_var(rd->online);
+       free_cpumask_var(rd->span);
+       kfree(rd);
+ }
  static void rq_attach_root(struct rq *rq, struct root_domain *rd)
  {
        unsigned long flags;
        if (rq->rd) {
                struct root_domain *old_rd = rq->rd;
  
-               if (cpu_isset(rq->cpu, old_rd->online))
+               if (cpumask_test_cpu(rq->cpu, old_rd->online))
                        set_rq_offline(rq);
  
-               cpu_clear(rq->cpu, old_rd->span);
+               cpumask_clear_cpu(rq->cpu, old_rd->span);
  
                if (atomic_dec_and_test(&old_rd->refcount))
-                       kfree(old_rd);
+                       free_rootdomain(old_rd);
        }
  
        atomic_inc(&rd->refcount);
        rq->rd = rd;
  
-       cpu_set(rq->cpu, rd->span);
-       if (cpu_isset(rq->cpu, cpu_online_map))
+       cpumask_set_cpu(rq->cpu, rd->span);
+       if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
                set_rq_online(rq);
  
        spin_unlock_irqrestore(&rq->lock, flags);
  }
  
- static void init_rootdomain(struct root_domain *rd)
+ static int init_rootdomain(struct root_domain *rd, bool bootmem)
  {
        memset(rd, 0, sizeof(*rd));
  
-       cpus_clear(rd->span);
-       cpus_clear(rd->online);
+       if (bootmem) {
+               alloc_bootmem_cpumask_var(&def_root_domain.span);
+               alloc_bootmem_cpumask_var(&def_root_domain.online);
+               alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
+               cpupri_init(&rd->cpupri, true);
+               return 0;
+       }
+       if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+               goto free_rd;
+       if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+               goto free_span;
+       if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+               goto free_online;
+       if (cpupri_init(&rd->cpupri, false) != 0)
+               goto free_rto_mask;
+       return 0;
  
-       cpupri_init(&rd->cpupri);
+ free_rto_mask:
+       free_cpumask_var(rd->rto_mask);
+ free_online:
+       free_cpumask_var(rd->online);
+ free_span:
+       free_cpumask_var(rd->span);
+ free_rd:
+       kfree(rd);
+       return -ENOMEM;
  }
  
  static void init_defrootdomain(void)
  {
-       init_rootdomain(&def_root_domain);
+       init_rootdomain(&def_root_domain, true);
        atomic_set(&def_root_domain.refcount, 1);
  }
  
@@@ -6875,7 -6888,10 +6888,10 @@@ static struct root_domain *alloc_rootdo
        if (!rd)
                return NULL;
  
-       init_rootdomain(rd);
+       if (init_rootdomain(rd, false) != 0) {
+               kfree(rd);
+               return NULL;
+       }
  
        return rd;
  }
@@@ -6917,19 -6933,12 +6933,12 @@@ cpu_attach_domain(struct sched_domain *
  }
  
  /* cpus with isolated domains */
- static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
+ static cpumask_var_t cpu_isolated_map;
  
  /* Setup the mask of cpus configured for isolated domains */
  static int __init isolated_cpu_setup(char *str)
  {
-       static int __initdata ints[NR_CPUS];
-       int i;
-       str = get_options(str, ARRAY_SIZE(ints), ints);
-       cpus_clear(cpu_isolated_map);
-       for (i = 1; i <= ints[0]; i++)
-               if (ints[i] < NR_CPUS)
-                       cpu_set(ints[i], cpu_isolated_map);
 -      cpulist_parse(str, *cpu_isolated_map);
++      cpulist_parse(str, cpu_isolated_map);
        return 1;
  }
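
/*
 * Illustrative sketch, not part of the patch: what a cpulist_parse()-style
 * parser does with a string such as "1,3-5", reduced to a user-space
 * function that sets bits in a 64-bit mask (the kernel version fills an
 * arbitrarily sized cpumask instead).  parse_cpulist is an invented name.
 */
#include <stdio.h>
#include <stdlib.h>

static int parse_cpulist(const char *s, unsigned long long *mask)
{
	*mask = 0;
	while (*s) {
		char *end;
		long first = strtol(s, &end, 10);
		long last = first;

		if (end == s || first < 0 || first > 63)
			return -1;
		if (*end == '-') {
			s = end + 1;
			last = strtol(s, &end, 10);
			if (end == s || last < first || last > 63)
				return -1;
		}
		while (first <= last)
			*mask |= 1ULL << first++;
		s = (*end == ',') ? end + 1 : end;
	}
	return 0;
}

int main(void)
{
	unsigned long long mask;

	if (parse_cpulist("1,3-5", &mask) == 0)
		printf("mask = 0x%llx\n", mask);	/* prints 0x3a */
	return 0;
}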
  
@@@ -6938,42 -6947,43 +6947,43 @@@ __setup("isolcpus=", isolated_cpu_setup
  /*
   * init_sched_build_groups takes the cpumask we wish to span, and a pointer
   * to a function which identifies what group (along with sched group) a CPU
-  * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS
-  * (due to the fact that we keep track of groups covered with a cpumask_t).
+  * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids
+  * (due to the fact that we keep track of groups covered with a struct cpumask).
   *
   * init_sched_build_groups will build a circular linked list of the groups
   * covered by the given span, and will set each group's ->cpumask correctly,
   * and ->cpu_power to 0.
   */
  static void
- init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
-                       int (*group_fn)(int cpu, const cpumask_t *cpu_map,
+ init_sched_build_groups(const struct cpumask *span,
+                       const struct cpumask *cpu_map,
+                       int (*group_fn)(int cpu, const struct cpumask *cpu_map,
                                        struct sched_group **sg,
-                                       cpumask_t *tmpmask),
-                       cpumask_t *covered, cpumask_t *tmpmask)
+                                       struct cpumask *tmpmask),
+                       struct cpumask *covered, struct cpumask *tmpmask)
  {
        struct sched_group *first = NULL, *last = NULL;
        int i;
  
-       cpus_clear(*covered);
+       cpumask_clear(covered);
  
-       for_each_cpu_mask_nr(i, *span) {
+       for_each_cpu(i, span) {
                struct sched_group *sg;
                int group = group_fn(i, cpu_map, &sg, tmpmask);
                int j;
  
-               if (cpu_isset(i, *covered))
+               if (cpumask_test_cpu(i, covered))
                        continue;
  
-               cpus_clear(sg->cpumask);
+               cpumask_clear(sched_group_cpus(sg));
                sg->__cpu_power = 0;
  
-               for_each_cpu_mask_nr(j, *span) {
+               for_each_cpu(j, span) {
                        if (group_fn(j, cpu_map, NULL, tmpmask) != group)
                                continue;
  
-                       cpu_set(j, *covered);
-                       cpu_set(j, sg->cpumask);
+                       cpumask_set_cpu(j, covered);
+                       cpumask_set_cpu(j, sched_group_cpus(sg));
                }
                if (!first)
                        first = sg;
@@@ -7037,9 -7047,10 +7047,10 @@@ static int find_next_best_node(int node
   * should be one that prevents unnecessary balancing, but also spreads tasks
   * out optimally.
   */
- static void sched_domain_node_span(int node, cpumask_t *span)
+ static void sched_domain_node_span(int node, struct cpumask *span)
  {
        nodemask_t used_nodes;
+       /* FIXME: use cpumask_of_node() */
        node_to_cpumask_ptr(nodemask, node);
        int i;
  
  int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
  
  /*
+  * The cpus mask in sched_group and sched_domain hangs off the end.
+  * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
+  * for nr_cpu_ids < CONFIG_NR_CPUS.
+  */
+ struct static_sched_group {
+       struct sched_group sg;
+       DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
+ };
+ struct static_sched_domain {
+       struct sched_domain sd;
+       DECLARE_BITMAP(span, CONFIG_NR_CPUS);
+ };
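+ 
/*
 * Illustrative sketch, not part of the patch: the "mask hangs off the end"
 * layout the comment above refers to, in user-space C.  One allocation
 * carries the structure plus a bitmap sized for the CPUs actually present,
 * which is what kmalloc(sizeof(struct sched_group) + cpumask_size()) does
 * later in this patch.  struct group and group_alloc are invented names.
 */
#include <limits.h>
#include <stdlib.h>
#include <string.h>

struct group {
	struct group *next;
	unsigned int power;
	unsigned long cpus[];		/* flexible array member: bitmap */
};

static struct group *group_alloc(unsigned int nr_cpus)
{
	size_t bits_per_word = CHAR_BIT * sizeof(unsigned long);
	size_t words = (nr_cpus + bits_per_word - 1) / bits_per_word;
	struct group *g = malloc(sizeof(*g) + words * sizeof(unsigned long));

	if (g)
		memset(g->cpus, 0, words * sizeof(unsigned long));
	return g;
}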
+ /*
   * SMT sched-domains:
   */
  #ifdef CONFIG_SCHED_SMT
- static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
- static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
+ static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
+ static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
  
  static int
- cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-                cpumask_t *unused)
+ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
+                struct sched_group **sg, struct cpumask *unused)
  {
        if (sg)
-               *sg = &per_cpu(sched_group_cpus, cpu);
+               *sg = &per_cpu(sched_group_cpus, cpu).sg;
        return cpu;
  }
  #endif /* CONFIG_SCHED_SMT */
   * multi-core sched-domains:
   */
  #ifdef CONFIG_SCHED_MC
- static DEFINE_PER_CPU(struct sched_domain, core_domains);
- static DEFINE_PER_CPU(struct sched_group, sched_group_core);
+ static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
+ static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
  #endif /* CONFIG_SCHED_MC */
  
  #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
  static int
- cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-                 cpumask_t *mask)
+ cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
+                 struct sched_group **sg, struct cpumask *mask)
  {
        int group;
  
-       *mask = per_cpu(cpu_sibling_map, cpu);
-       cpus_and(*mask, *mask, *cpu_map);
-       group = first_cpu(*mask);
+       cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+       group = cpumask_first(mask);
        if (sg)
-               *sg = &per_cpu(sched_group_core, group);
+               *sg = &per_cpu(sched_group_core, group).sg;
        return group;
  }
  #elif defined(CONFIG_SCHED_MC)
  static int
- cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-                 cpumask_t *unused)
+ cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
+                 struct sched_group **sg, struct cpumask *unused)
  {
        if (sg)
-               *sg = &per_cpu(sched_group_core, cpu);
+               *sg = &per_cpu(sched_group_core, cpu).sg;
        return cpu;
  }
  #endif
  
- static DEFINE_PER_CPU(struct sched_domain, phys_domains);
- static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
+ static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
+ static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
  
  static int
- cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-                 cpumask_t *mask)
+ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
+                 struct sched_group **sg, struct cpumask *mask)
  {
        int group;
  #ifdef CONFIG_SCHED_MC
+       /* FIXME: Use cpu_coregroup_mask. */
        *mask = cpu_coregroup_map(cpu);
        cpus_and(*mask, *mask, *cpu_map);
-       group = first_cpu(*mask);
+       group = cpumask_first(mask);
  #elif defined(CONFIG_SCHED_SMT)
-       *mask = per_cpu(cpu_sibling_map, cpu);
-       cpus_and(*mask, *mask, *cpu_map);
-       group = first_cpu(*mask);
+       cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+       group = cpumask_first(mask);
  #else
        group = cpu;
  #endif
        if (sg)
-               *sg = &per_cpu(sched_group_phys, group);
+               *sg = &per_cpu(sched_group_phys, group).sg;
        return group;
  }
  
@@@ -7144,19 -7169,21 +7169,21 @@@ static DEFINE_PER_CPU(struct sched_doma
  static struct sched_group ***sched_group_nodes_bycpu;
  
  static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
- static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
+ static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
  
- static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
-                                struct sched_group **sg, cpumask_t *nodemask)
+ static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
+                                struct sched_group **sg,
+                                struct cpumask *nodemask)
  {
        int group;
+       /* FIXME: use cpumask_of_node */
+       node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu));
  
-       *nodemask = node_to_cpumask(cpu_to_node(cpu));
-       cpus_and(*nodemask, *nodemask, *cpu_map);
-       group = first_cpu(*nodemask);
+       cpumask_and(nodemask, pnodemask, cpu_map);
+       group = cpumask_first(nodemask);
  
        if (sg)
-               *sg = &per_cpu(sched_group_allnodes, group);
+               *sg = &per_cpu(sched_group_allnodes, group).sg;
        return group;
  }
  
@@@ -7168,11 -7195,11 +7195,11 @@@ static void init_numa_sched_groups_powe
        if (!sg)
                return;
        do {
-               for_each_cpu_mask_nr(j, sg->cpumask) {
+               for_each_cpu(j, sched_group_cpus(sg)) {
                        struct sched_domain *sd;
  
-                       sd = &per_cpu(phys_domains, j);
-                       if (j != first_cpu(sd->groups->cpumask)) {
+                       sd = &per_cpu(phys_domains, j).sd;
+                       if (j != cpumask_first(sched_group_cpus(sd->groups))) {
                                /*
                                 * Only add "power" once for each
                                 * physical package.
  
  #ifdef CONFIG_NUMA
  /* Free memory allocated for various sched_group structures */
- static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
+ static void free_sched_groups(const struct cpumask *cpu_map,
+                             struct cpumask *nodemask)
  {
        int cpu, i;
  
-       for_each_cpu_mask_nr(cpu, *cpu_map) {
+       for_each_cpu(cpu, cpu_map) {
                struct sched_group **sched_group_nodes
                        = sched_group_nodes_bycpu[cpu];
  
  
                for (i = 0; i < nr_node_ids; i++) {
                        struct sched_group *oldsg, *sg = sched_group_nodes[i];
+                       /* FIXME: Use cpumask_of_node */
+                       node_to_cpumask_ptr(pnodemask, i);
  
-                       *nodemask = node_to_cpumask(i);
-                       cpus_and(*nodemask, *nodemask, *cpu_map);
-                       if (cpus_empty(*nodemask))
+                       cpus_and(*nodemask, *pnodemask, *cpu_map);
+                       if (cpumask_empty(nodemask))
                                continue;
  
                        if (sg == NULL)
@@@ -7223,7 -7252,8 +7252,8 @@@ next_sg
        }
  }
  #else /* !CONFIG_NUMA */
- static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
+ static void free_sched_groups(const struct cpumask *cpu_map,
+                             struct cpumask *nodemask)
  {
  }
  #endif /* CONFIG_NUMA */
@@@ -7249,7 -7279,7 +7279,7 @@@ static void init_sched_groups_power(in
  
        WARN_ON(!sd || !sd->groups);
  
-       if (cpu != first_cpu(sd->groups->cpumask))
+       if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
                return;
  
        child = sd->child;
@@@ -7314,40 -7344,6 +7344,6 @@@ SD_INIT_FUNC(CPU
   SD_INIT_FUNC(MC)
  #endif
  
- /*
-  * To minimize stack usage kmalloc room for cpumasks and share the
-  * space as the usage in build_sched_domains() dictates.  Used only
-  * if the amount of space is significant.
-  */
- struct allmasks {
-       cpumask_t tmpmask;                      /* make this one first */
-       union {
-               cpumask_t nodemask;
-               cpumask_t this_sibling_map;
-               cpumask_t this_core_map;
-       };
-       cpumask_t send_covered;
- #ifdef CONFIG_NUMA
-       cpumask_t domainspan;
-       cpumask_t covered;
-       cpumask_t notcovered;
- #endif
- };
- #if   NR_CPUS > 128
- #define       SCHED_CPUMASK_ALLOC             1
- #define       SCHED_CPUMASK_FREE(v)           kfree(v)
- #define       SCHED_CPUMASK_DECLARE(v)        struct allmasks *v
- #else
- #define       SCHED_CPUMASK_ALLOC             0
- #define       SCHED_CPUMASK_FREE(v)
- #define       SCHED_CPUMASK_DECLARE(v)        struct allmasks _v, *v = &_v
- #endif
- #define       SCHED_CPUMASK_VAR(v, a)         cpumask_t *v = (cpumask_t *) \
-                       ((unsigned long)(a) + offsetof(struct allmasks, v))
  static int default_relax_domain_level = -1;
  
  static int __init setup_relax_domain_level(char *str)
@@@ -7387,17 -7383,38 +7383,38 @@@ static void set_domain_attribute(struc
   * Build sched domains for a given set of cpus and attach the sched domains
   * to the individual cpus
   */
- static int __build_sched_domains(const cpumask_t *cpu_map,
+ static int __build_sched_domains(const struct cpumask *cpu_map,
                                 struct sched_domain_attr *attr)
  {
-       int i;
+       int i, err = -ENOMEM;
        struct root_domain *rd;
-       SCHED_CPUMASK_DECLARE(allmasks);
-       cpumask_t *tmpmask;
+       cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
+               tmpmask;
  #ifdef CONFIG_NUMA
+       cpumask_var_t domainspan, covered, notcovered;
        struct sched_group **sched_group_nodes = NULL;
        int sd_allnodes = 0;
  
+       if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
+               goto out;
+       if (!alloc_cpumask_var(&covered, GFP_KERNEL))
+               goto free_domainspan;
+       if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
+               goto free_covered;
+ #endif
+       if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
+               goto free_notcovered;
+       if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
+               goto free_nodemask;
+       if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
+               goto free_this_sibling_map;
+       if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
+               goto free_this_core_map;
+       if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
+               goto free_send_covered;
+ #ifdef CONFIG_NUMA
        /*
         * Allocate the per-node list of sched groups
         */
                                    GFP_KERNEL);
        if (!sched_group_nodes) {
                printk(KERN_WARNING "Can not alloc sched group node list\n");
-               return -ENOMEM;
+               goto free_tmpmask;
        }
  #endif
  
        rd = alloc_rootdomain();
        if (!rd) {
                printk(KERN_WARNING "Cannot alloc root domain\n");
- #ifdef CONFIG_NUMA
-               kfree(sched_group_nodes);
- #endif
-               return -ENOMEM;
+               goto free_sched_groups;
        }
  
- #if SCHED_CPUMASK_ALLOC
-       /* get space for all scratch cpumask variables */
-       allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
-       if (!allmasks) {
-               printk(KERN_WARNING "Cannot alloc cpumask array\n");
-               kfree(rd);
  #ifdef CONFIG_NUMA
-               kfree(sched_group_nodes);
- #endif
-               return -ENOMEM;
-       }
- #endif
-       tmpmask = (cpumask_t *)allmasks;
- #ifdef CONFIG_NUMA
-       sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
+       sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
  #endif
  
        /*
         * Set up domains for cpus specified by the cpu_map.
         */
-       for_each_cpu_mask_nr(i, *cpu_map) {
+       for_each_cpu(i, cpu_map) {
                struct sched_domain *sd = NULL, *p;
-               SCHED_CPUMASK_VAR(nodemask, allmasks);
  
+               /* FIXME: use cpumask_of_node */
                *nodemask = node_to_cpumask(cpu_to_node(i));
                cpus_and(*nodemask, *nodemask, *cpu_map);
  
  #ifdef CONFIG_NUMA
-               if (cpus_weight(*cpu_map) >
-                               SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) {
+               if (cpumask_weight(cpu_map) >
+                               SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
                        sd = &per_cpu(allnodes_domains, i);
                        SD_INIT(sd, ALLNODES);
                        set_domain_attribute(sd, attr);
-                       sd->span = *cpu_map;
+                       cpumask_copy(sched_domain_span(sd), cpu_map);
                        cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
                        p = sd;
                        sd_allnodes = 1;
                sd = &per_cpu(node_domains, i);
                SD_INIT(sd, NODE);
                set_domain_attribute(sd, attr);
-               sched_domain_node_span(cpu_to_node(i), &sd->span);
+               sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
                sd->parent = p;
                if (p)
                        p->child = sd;
-               cpus_and(sd->span, sd->span, *cpu_map);
+               cpumask_and(sched_domain_span(sd),
+                           sched_domain_span(sd), cpu_map);
  #endif
  
                p = sd;
-               sd = &per_cpu(phys_domains, i);
+               sd = &per_cpu(phys_domains, i).sd;
                SD_INIT(sd, CPU);
                set_domain_attribute(sd, attr);
-               sd->span = *nodemask;
+               cpumask_copy(sched_domain_span(sd), nodemask);
                sd->parent = p;
                if (p)
                        p->child = sd;
  
  #ifdef CONFIG_SCHED_MC
                p = sd;
-               sd = &per_cpu(core_domains, i);
+               sd = &per_cpu(core_domains, i).sd;
                SD_INIT(sd, MC);
                set_domain_attribute(sd, attr);
-               sd->span = cpu_coregroup_map(i);
-               cpus_and(sd->span, sd->span, *cpu_map);
+               *sched_domain_span(sd) = cpu_coregroup_map(i);
+               cpumask_and(sched_domain_span(sd),
+                           sched_domain_span(sd), cpu_map);
                sd->parent = p;
                p->child = sd;
                cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
  
  #ifdef CONFIG_SCHED_SMT
                p = sd;
-               sd = &per_cpu(cpu_domains, i);
+               sd = &per_cpu(cpu_domains, i).sd;
                SD_INIT(sd, SIBLING);
                set_domain_attribute(sd, attr);
-               sd->span = per_cpu(cpu_sibling_map, i);
-               cpus_and(sd->span, sd->span, *cpu_map);
+               cpumask_and(sched_domain_span(sd),
+                           &per_cpu(cpu_sibling_map, i), cpu_map);
                sd->parent = p;
                p->child = sd;
                cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
  
  #ifdef CONFIG_SCHED_SMT
        /* Set up CPU (sibling) groups */
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
-               SCHED_CPUMASK_VAR(send_covered, allmasks);
-               *this_sibling_map = per_cpu(cpu_sibling_map, i);
-               cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
-               if (i != first_cpu(*this_sibling_map))
+       for_each_cpu(i, cpu_map) {
+               cpumask_and(this_sibling_map,
+                           &per_cpu(cpu_sibling_map, i), cpu_map);
+               if (i != cpumask_first(this_sibling_map))
                        continue;
  
                init_sched_build_groups(this_sibling_map, cpu_map,
  
  #ifdef CONFIG_SCHED_MC
        /* Set up multi-core groups */
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               SCHED_CPUMASK_VAR(this_core_map, allmasks);
-               SCHED_CPUMASK_VAR(send_covered, allmasks);
+       for_each_cpu(i, cpu_map) {
+               /* FIXME: Use cpu_coregroup_mask */
                *this_core_map = cpu_coregroup_map(i);
                cpus_and(*this_core_map, *this_core_map, *cpu_map);
-               if (i != first_cpu(*this_core_map))
+               if (i != cpumask_first(this_core_map))
                        continue;
  
                init_sched_build_groups(this_core_map, cpu_map,
  
        /* Set up physical groups */
        for (i = 0; i < nr_node_ids; i++) {
-               SCHED_CPUMASK_VAR(nodemask, allmasks);
-               SCHED_CPUMASK_VAR(send_covered, allmasks);
+               /* FIXME: Use cpumask_of_node */
                *nodemask = node_to_cpumask(i);
                cpus_and(*nodemask, *nodemask, *cpu_map);
-               if (cpus_empty(*nodemask))
+               if (cpumask_empty(nodemask))
                        continue;
  
                init_sched_build_groups(nodemask, cpu_map,
  #ifdef CONFIG_NUMA
        /* Set up node groups */
        if (sd_allnodes) {
-               SCHED_CPUMASK_VAR(send_covered, allmasks);
                init_sched_build_groups(cpu_map, cpu_map,
                                        &cpu_to_allnodes_group,
                                        send_covered, tmpmask);
        for (i = 0; i < nr_node_ids; i++) {
                /* Set up node groups */
                struct sched_group *sg, *prev;
-               SCHED_CPUMASK_VAR(nodemask, allmasks);
-               SCHED_CPUMASK_VAR(domainspan, allmasks);
-               SCHED_CPUMASK_VAR(covered, allmasks);
                int j;
  
+               /* FIXME: Use cpumask_of_node */
                *nodemask = node_to_cpumask(i);
-               cpus_clear(*covered);
+               cpumask_clear(covered);
  
                cpus_and(*nodemask, *nodemask, *cpu_map);
-               if (cpus_empty(*nodemask)) {
+               if (cpumask_empty(nodemask)) {
                        sched_group_nodes[i] = NULL;
                        continue;
                }
  
                sched_domain_node_span(i, domainspan);
-               cpus_and(*domainspan, *domainspan, *cpu_map);
+               cpumask_and(domainspan, domainspan, cpu_map);
  
-               sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
+               sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
+                                 GFP_KERNEL, i);
                if (!sg) {
                        printk(KERN_WARNING "Can not alloc domain group for "
                                "node %d\n", i);
                        goto error;
                }
                sched_group_nodes[i] = sg;
-               for_each_cpu_mask_nr(j, *nodemask) {
+               for_each_cpu(j, nodemask) {
                        struct sched_domain *sd;
  
                        sd = &per_cpu(node_domains, j);
                        sd->groups = sg;
                }
                sg->__cpu_power = 0;
-               sg->cpumask = *nodemask;
+               cpumask_copy(sched_group_cpus(sg), nodemask);
                sg->next = sg;
-               cpus_or(*covered, *covered, *nodemask);
+               cpumask_or(covered, covered, nodemask);
                prev = sg;
  
                for (j = 0; j < nr_node_ids; j++) {
-                       SCHED_CPUMASK_VAR(notcovered, allmasks);
                        int n = (i + j) % nr_node_ids;
+                       /* FIXME: Use cpumask_of_node */
                        node_to_cpumask_ptr(pnodemask, n);
  
-                       cpus_complement(*notcovered, *covered);
-                       cpus_and(*tmpmask, *notcovered, *cpu_map);
-                       cpus_and(*tmpmask, *tmpmask, *domainspan);
-                       if (cpus_empty(*tmpmask))
+                       cpumask_complement(notcovered, covered);
+                       cpumask_and(tmpmask, notcovered, cpu_map);
+                       cpumask_and(tmpmask, tmpmask, domainspan);
+                       if (cpumask_empty(tmpmask))
                                break;
  
-                       cpus_and(*tmpmask, *tmpmask, *pnodemask);
-                       if (cpus_empty(*tmpmask))
+                       cpumask_and(tmpmask, tmpmask, pnodemask);
+                       if (cpumask_empty(tmpmask))
                                continue;
  
-                       sg = kmalloc_node(sizeof(struct sched_group),
+                       sg = kmalloc_node(sizeof(struct sched_group) +
+                                         cpumask_size(),
                                          GFP_KERNEL, i);
                        if (!sg) {
                                printk(KERN_WARNING
                                goto error;
                        }
                        sg->__cpu_power = 0;
-                       sg->cpumask = *tmpmask;
+                       cpumask_copy(sched_group_cpus(sg), tmpmask);
                        sg->next = prev->next;
-                       cpus_or(*covered, *covered, *tmpmask);
+                       cpumask_or(covered, covered, tmpmask);
                        prev->next = sg;
                        prev = sg;
                }
  
        /* Calculate CPU power for physical packages and nodes */
  #ifdef CONFIG_SCHED_SMT
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               struct sched_domain *sd = &per_cpu(cpu_domains, i);
+       for_each_cpu(i, cpu_map) {
+               struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
  
                init_sched_groups_power(i, sd);
        }
  #endif
  #ifdef CONFIG_SCHED_MC
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               struct sched_domain *sd = &per_cpu(core_domains, i);
+       for_each_cpu(i, cpu_map) {
+               struct sched_domain *sd = &per_cpu(core_domains, i).sd;
  
                init_sched_groups_power(i, sd);
        }
  #endif
  
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               struct sched_domain *sd = &per_cpu(phys_domains, i);
+       for_each_cpu(i, cpu_map) {
+               struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
  
                init_sched_groups_power(i, sd);
        }
        if (sd_allnodes) {
                struct sched_group *sg;
  
-               cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg,
+               cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
                                                                tmpmask);
                init_numa_sched_groups_power(sg);
        }
  #endif
  
        /* Attach the domains */
-       for_each_cpu_mask_nr(i, *cpu_map) {
+       for_each_cpu(i, cpu_map) {
                struct sched_domain *sd;
  #ifdef CONFIG_SCHED_SMT
-               sd = &per_cpu(cpu_domains, i);
+               sd = &per_cpu(cpu_domains, i).sd;
  #elif defined(CONFIG_SCHED_MC)
-               sd = &per_cpu(core_domains, i);
+               sd = &per_cpu(core_domains, i).sd;
  #else
-               sd = &per_cpu(phys_domains, i);
+               sd = &per_cpu(phys_domains, i).sd;
  #endif
                cpu_attach_domain(sd, rd, i);
        }
  
-       SCHED_CPUMASK_FREE((void *)allmasks);
-       return 0;
+       err = 0;
+ free_tmpmask:
+       free_cpumask_var(tmpmask);
+ free_send_covered:
+       free_cpumask_var(send_covered);
+ free_this_core_map:
+       free_cpumask_var(this_core_map);
+ free_this_sibling_map:
+       free_cpumask_var(this_sibling_map);
+ free_nodemask:
+       free_cpumask_var(nodemask);
+ free_notcovered:
+ #ifdef CONFIG_NUMA
+       free_cpumask_var(notcovered);
+ free_covered:
+       free_cpumask_var(covered);
+ free_domainspan:
+       free_cpumask_var(domainspan);
+ out:
+ #endif
+       return err;
+ free_sched_groups:
+ #ifdef CONFIG_NUMA
+       kfree(sched_group_nodes);
+ #endif
+       goto free_tmpmask;
  
  #ifdef CONFIG_NUMA
  error:
        free_sched_groups(cpu_map, tmpmask);
-       SCHED_CPUMASK_FREE((void *)allmasks);
-       kfree(rd);
-       return -ENOMEM;
+       free_rootdomain(rd);
+       goto free_tmpmask;
  #endif
  }
  
- static int build_sched_domains(const cpumask_t *cpu_map)
+ static int build_sched_domains(const struct cpumask *cpu_map)
  {
        return __build_sched_domains(cpu_map, NULL);
  }
  
- static cpumask_t *doms_cur;   /* current sched domains */
+ static struct cpumask *doms_cur;      /* current sched domains */
  static int ndoms_cur;         /* number of sched domains in 'doms_cur' */
  static struct sched_domain_attr *dattr_cur;
                                /* attribues of custom domains in 'doms_cur' */
  
  /*
   * Special case: If a kmalloc of a doms_cur partition (array of
-  * cpumask_t) fails, then fallback to a single sched domain,
-  * as determined by the single cpumask_t fallback_doms.
+  * cpumask) fails, then fall back to a single sched domain,
+  * as determined by the single cpumask fallback_doms.
   */
- static cpumask_t fallback_doms;
+ static cpumask_var_t fallback_doms;
  
- void __attribute__((weak)) arch_update_cpu_topology(void)
+ /*
+  * arch_update_cpu_topology lets virtualized architectures update the
+  * cpu core maps. It is supposed to return 1 if the topology changed
+  * or 0 if it stayed the same.
+  */
+ int __attribute__((weak)) arch_update_cpu_topology(void)
  {
+       return 0;
  }
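
/*
 * Illustrative sketch, not part of the patch: how the __attribute__((weak))
 * default above behaves.  The weak definition is used unless some other
 * object file supplies a strong definition of the same symbol, in which
 * case the linker silently prefers the strong one (GCC/ELF behaviour).
 * update_topology is an invented name standing in for the arch hook.
 */

/* generic code: weak fallback that reports "nothing changed" */
int __attribute__((weak)) update_topology(void)
{
	return 0;
}

/*
 * An architecture that wants different behaviour simply defines
 * int update_topology(void) again, with no registration step needed,
 * and its version wins at link time.
 */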
  
  /*
   * For now this just excludes isolated cpus, but could be used to
   * exclude other special cases in the future.
   */
- static int arch_init_sched_domains(const cpumask_t *cpu_map)
+ static int arch_init_sched_domains(const struct cpumask *cpu_map)
  {
        int err;
  
        arch_update_cpu_topology();
        ndoms_cur = 1;
-       doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+       doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
        if (!doms_cur)
-               doms_cur = &fallback_doms;
-       cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
+               doms_cur = fallback_doms;
+       cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
        dattr_cur = NULL;
        err = build_sched_domains(doms_cur);
        register_sched_domain_sysctl();
        return err;
  }
  
- static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
-                                      cpumask_t *tmpmask)
+ static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
+                                      struct cpumask *tmpmask)
  {
        free_sched_groups(cpu_map, tmpmask);
  }
   * Detach sched domains from a group of cpus specified in cpu_map
   * These cpus will now be attached to the NULL domain
   */
- static void detach_destroy_domains(const cpumask_t *cpu_map)
+ static void detach_destroy_domains(const struct cpumask *cpu_map)
  {
-       cpumask_t tmpmask;
+       /* Save because hotplug lock held. */
+       static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
        int i;
  
-       unregister_sched_domain_sysctl();
-       for_each_cpu_mask_nr(i, *cpu_map)
+       for_each_cpu(i, cpu_map)
                cpu_attach_domain(NULL, &def_root_domain, i);
        synchronize_sched();
-       arch_destroy_sched_domains(cpu_map, &tmpmask);
+       arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
  }
  
  /* handle null as "default" */
@@@ -7783,7 -7805,7 +7805,7 @@@ static int dattrs_equal(struct sched_do
   * doms_new[] to the current sched domain partitioning, doms_cur[].
   * It destroys each deleted domain and builds each new domain.
   *
-  * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
+  * 'doms_new' is an array of cpumasks of length 'ndoms_new'.
   * The masks don't intersect (don't overlap). We should set up one
   * sched domain for each mask. CPUs not in any of the cpumasks will
   * not be load balanced. If the same cpumask appears both in the
   * the single partition 'fallback_doms', it also forces the domains
   * to be rebuilt.
   *
-  * If doms_new == NULL it will be replaced with cpu_online_map.
+  * If doms_new == NULL it will be replaced with cpu_online_mask.
   * ndoms_new == 0 is a special case for destroying existing domains,
   * and it will not create the default domain.
   *
   * Call with hotplug lock held
   */
- void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+ /* FIXME: Change to struct cpumask *doms_new[] */
+ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
                             struct sched_domain_attr *dattr_new)
  {
        int i, j, n;
+       int new_topology;
  
        mutex_lock(&sched_domains_mutex);
  
        /* always unregister in case we don't destroy any domains */
        unregister_sched_domain_sysctl();
  
+       /* Let architecture update cpu core mappings. */
+       new_topology = arch_update_cpu_topology();
        n = doms_new ? ndoms_new : 0;
  
        /* Destroy deleted domains */
        for (i = 0; i < ndoms_cur; i++) {
-               for (j = 0; j < n; j++) {
-                       if (cpus_equal(doms_cur[i], doms_new[j])
+               for (j = 0; j < n && !new_topology; j++) {
+                       if (cpumask_equal(&doms_cur[i], &doms_new[j])
                            && dattrs_equal(dattr_cur, i, dattr_new, j))
                                goto match1;
                }
@@@ -7830,15 -7857,15 +7857,15 @@@ match1
  
        if (doms_new == NULL) {
                ndoms_cur = 0;
-               doms_new = &fallback_doms;
-               cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
-               dattr_new = NULL;
+               doms_new = fallback_doms;
+               cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
+               WARN_ON_ONCE(dattr_new);
        }
  
        /* Build new domains */
        for (i = 0; i < ndoms_new; i++) {
-               for (j = 0; j < ndoms_cur; j++) {
-                       if (cpus_equal(doms_new[i], doms_cur[j])
+               for (j = 0; j < ndoms_cur && !new_topology; j++) {
+                       if (cpumask_equal(&doms_new[i], &doms_cur[j])
                            && dattrs_equal(dattr_new, i, dattr_cur, j))
                                goto match2;
                }
@@@ -7850,7 -7877,7 +7877,7 @@@ match2
        }
  
        /* Remember the new sched domains */
-       if (doms_cur != &fallback_doms)
+       if (doms_cur != fallback_doms)
                kfree(doms_cur);
        kfree(dattr_cur);       /* kfree(NULL) is safe */
        doms_cur = doms_new;
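
/*
 * Illustrative sketch, not part of the patch: the two-pass update that
 * partition_sched_domains() performs, with integers standing in for
 * cpumask partitions.  Pass 1 destroys every old entry that has no
 * identical new entry, pass 2 builds every new entry that has no identical
 * old entry, and entries present in both sets are left untouched.  When
 * new_topology is set, the matching loops are skipped and everything is
 * torn down and rebuilt.
 */
#include <stdio.h>

static void update_partition(const int *cur, int ncur,
			     const int *next, int nnext, int new_topology)
{
	int i, j;

	for (i = 0; i < ncur; i++) {
		for (j = 0; j < nnext && !new_topology; j++)
			if (cur[i] == next[j])
				goto match1;
		printf("destroy domain %d\n", cur[i]);
match1:
		;
	}

	for (i = 0; i < nnext; i++) {
		for (j = 0; j < ncur && !new_topology; j++)
			if (next[i] == cur[j])
				goto match2;
		printf("build domain %d\n", next[i]);
match2:
		;
	}
}

int main(void)
{
	int cur[] = { 1, 2, 3 }, next[] = { 2, 4 };

	/* prints: destroy domain 1, destroy domain 3, build domain 4 */
	update_partition(cur, 3, next, 2, 0);
	return 0;
}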
@@@ -7990,7 -8017,9 +8017,9 @@@ static int update_runtime(struct notifi
  
  void __init sched_init_smp(void)
  {
-       cpumask_t non_isolated_cpus;
+       cpumask_var_t non_isolated_cpus;
+       alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
  
  #if defined(CONFIG_NUMA)
        sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
  #endif
        get_online_cpus();
        mutex_lock(&sched_domains_mutex);
-       arch_init_sched_domains(&cpu_online_map);
-       cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
-       if (cpus_empty(non_isolated_cpus))
-               cpu_set(smp_processor_id(), non_isolated_cpus);
+       arch_init_sched_domains(cpu_online_mask);
+       cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
+       if (cpumask_empty(non_isolated_cpus))
+               cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
        mutex_unlock(&sched_domains_mutex);
        put_online_cpus();
  
        init_hrtick();
  
        /* Move init over to a non-isolated CPU */
-       if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
+       if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
                BUG();
        sched_init_granularity();
+       free_cpumask_var(non_isolated_cpus);
+       alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+       init_sched_rt_class();
  }
  #else
  void __init sched_init_smp(void)
@@@ -8334,6 -8367,15 +8367,15 @@@ void __init sched_init(void
         */
        current->sched_class = &fair_sched_class;
  
+       /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
+       alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+ #ifdef CONFIG_SMP
+ #ifdef CONFIG_NO_HZ
+       alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+ #endif
+       alloc_bootmem_cpumask_var(&cpu_isolated_map);
+ #endif /* SMP */
        scheduler_running = 1;
  }
  
@@@ -8492,7 -8534,7 +8534,7 @@@ stati
  int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
  {
        struct cfs_rq *cfs_rq;
-       struct sched_entity *se, *parent_se;
+       struct sched_entity *se;
        struct rq *rq;
        int i;
  
        for_each_possible_cpu(i) {
                rq = cpu_rq(i);
  
-               cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
-                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+               cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
+                                     GFP_KERNEL, cpu_to_node(i));
                if (!cfs_rq)
                        goto err;
  
-               se = kmalloc_node(sizeof(struct sched_entity),
-                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+               se = kzalloc_node(sizeof(struct sched_entity),
+                                 GFP_KERNEL, cpu_to_node(i));
                if (!se)
                        goto err;
  
-               parent_se = parent ? parent->se[i] : NULL;
-               init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
+               init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
        }
  
        return 1;
@@@ -8580,7 -8621,7 +8621,7 @@@ stati
  int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
  {
        struct rt_rq *rt_rq;
-       struct sched_rt_entity *rt_se, *parent_se;
+       struct sched_rt_entity *rt_se;
        struct rq *rq;
        int i;
  
        for_each_possible_cpu(i) {
                rq = cpu_rq(i);
  
-               rt_rq = kmalloc_node(sizeof(struct rt_rq),
-                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+               rt_rq = kzalloc_node(sizeof(struct rt_rq),
+                                    GFP_KERNEL, cpu_to_node(i));
                if (!rt_rq)
                        goto err;
  
-               rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
-                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+               rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
+                                    GFP_KERNEL, cpu_to_node(i));
                if (!rt_se)
                        goto err;
  
-               parent_se = parent ? parent->rt_se[i] : NULL;
-               init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
+               init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
        }
  
        return 1;
@@@ -9251,11 -9291,12 +9291,12 @@@ struct cgroup_subsys cpu_cgroup_subsys 
   * (balbir@in.ibm.com).
   */
  
- /* track cpu usage of a group of tasks */
+ /* track cpu usage of a group of tasks and its child groups */
  struct cpuacct {
        struct cgroup_subsys_state css;
        /* cpuusage holds pointer to a u64-type object on every cpu */
        u64 *cpuusage;
+       struct cpuacct *parent;
  };
  
  struct cgroup_subsys cpuacct_subsys;
@@@ -9289,6 -9330,9 +9330,9 @@@ static struct cgroup_subsys_state *cpua
                return ERR_PTR(-ENOMEM);
        }
  
+       if (cgrp->parent)
+               ca->parent = cgroup_ca(cgrp->parent);
        return &ca->css;
  }
  
@@@ -9368,14 -9412,16 +9412,16 @@@ static int cpuacct_populate(struct cgro
  static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
  {
        struct cpuacct *ca;
+       int cpu;
  
        if (!cpuacct_subsys.active)
                return;
  
+       cpu = task_cpu(tsk);
        ca = task_ca(tsk);
-       if (ca) {
-               u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
  
+       for (; ca; ca = ca->parent) {
+               u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
                *cpuusage += cputime;
        }
  }
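
/*
 * Illustrative sketch, not part of the patch: the hierarchical charging
 * walk that cpuacct_charge() now performs.  Time charged to a group is
 * also added to every ancestor via the new ->parent pointer, so a parent's
 * total covers its own tasks plus all of its children.  The per-CPU u64
 * counters of the real code are collapsed into a single counter here, and
 * the names are invented for the example.
 */
#include <stdint.h>

struct acct_group {
	uint64_t usage;
	struct acct_group *parent;	/* NULL for the root group */
};

static void charge(struct acct_group *grp, uint64_t cputime)
{
	for (; grp; grp = grp->parent)
		grp->usage += cputime;
}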
diff --combined kernel/sched_stats.h
@@@ -42,7 -42,8 +42,8 @@@ static int show_schedstat(struct seq_fi
                for_each_domain(cpu, sd) {
                        enum cpu_idle_type itype;
  
-                       cpumask_scnprintf(mask_str, mask_len, &sd->span);
+                       cpumask_scnprintf(mask_str, mask_len,
 -                                        *sched_domain_span(sd));
++                                        sched_domain_span(sd));
                        seq_printf(seq, "domain%d %s", dcount++, mask_str);
                        for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
                                        itype++) {
diff --combined kernel/trace/trace.c
@@@ -30,6 -30,7 +30,7 @@@
  #include <linux/gfp.h>
  #include <linux/fs.h>
  #include <linux/kprobes.h>
+ #include <linux/seq_file.h>
  #include <linux/writeback.h>
  
  #include <linux/stacktrace.h>
  unsigned long __read_mostly   tracing_max_latency = (cycle_t)ULONG_MAX;
  unsigned long __read_mostly   tracing_thresh;
  
+ /*
+  * We need to change this state when a selftest is running.
+  * A selftest will look into the ring-buffer to count the
+  * entries inserted during the selftest, although concurrent
+  * insertions into the ring-buffer, such as ftrace_printk, could occur
+  * at the same time, giving false positive or negative results.
+  */
+ static bool __read_mostly tracing_selftest_running;
+ /* For tracers that don't implement custom flags */
+ static struct tracer_opt dummy_tracer_opt[] = {
+       { }
+ };
+ static struct tracer_flags dummy_tracer_flags = {
+       .val = 0,
+       .opts = dummy_tracer_opt
+ };
+ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
+ {
+       return 0;
+ }
+ /*
+  * Kill all tracing for good (never come back).
+  * It is initialized to 1 but will turn to zero if the initialization
+  * of the tracer is successful. But that is the only place that sets
+  * this back to zero.
+  */
+ int tracing_disabled = 1;
  static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
  
  static inline void ftrace_disable_cpu(void)
@@@ -62,7 -95,36 +95,36 @@@ static cpumask_t __read_mostly              tracing
  #define for_each_tracing_cpu(cpu)     \
        for_each_cpu_mask(cpu, tracing_buffer_mask)
  
- static int tracing_disabled = 1;
+ /*
+  * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
+  *
+  * If there is an oops (or kernel panic) and the ftrace_dump_on_oops
+  * is set, then ftrace_dump is called. This will output the contents
+  * of the ftrace buffers to the console.  This is very useful for
+  * capturing traces that lead to crashes and outputting them to a
+  * serial console.
+  *
+  * It is off by default, but you can enable it either by specifying
+  * "ftrace_dump_on_oops" on the kernel command line, or by setting
+  * /proc/sys/kernel/ftrace_dump_on_oops to true.
+  */
+ int ftrace_dump_on_oops;
+ static int tracing_set_tracer(char *buf);
+ static int __init set_ftrace(char *str)
+ {
+       tracing_set_tracer(str);
+       return 1;
+ }
+ __setup("ftrace", set_ftrace);
+ static int __init set_ftrace_dump_on_oops(char *str)
+ {
+       ftrace_dump_on_oops = 1;
+       return 1;
+ }
+ __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
  
  long
  ns2usecs(cycle_t nsec)
@@@ -112,6 -174,19 +174,19 @@@ static DEFINE_PER_CPU(struct trace_arra
  /* tracer_enabled is used to toggle activation of a tracer */
  static int                    tracer_enabled = 1;
  
+ /**
+  * tracing_is_enabled - return tracer_enabled status
+  *
+  * This function is used by other tracers to know the status
+  * of the tracer_enabled flag.  Tracers may use this function
+  * to know if it should enable their features when starting
+  * up. See irqsoff tracer for an example (start_irqsoff_tracer).
+  */
+ int tracing_is_enabled(void)
+ {
+       return tracer_enabled;
+ }
  /* function tracing enabled */
  int                           ftrace_function_enabled;
  
@@@ -153,8 -228,9 +228,9 @@@ static DEFINE_MUTEX(trace_types_lock)
  /* trace_wait is a waitqueue for tasks blocked on trace_poll */
  static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
  
- /* trace_flags holds iter_ctrl options */
- unsigned long trace_flags = TRACE_ITER_PRINT_PARENT;
+ /* trace_flags holds trace_options default values */
+ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
+       TRACE_ITER_ANNOTATE;
  
  /**
   * trace_wake_up - wake up tasks waiting for trace input
@@@ -193,13 -269,6 +269,6 @@@ unsigned long nsecs_to_usecs(unsigned l
        return nsecs / 1000;
  }
  
- /*
-  * TRACE_ITER_SYM_MASK masks the options in trace_flags that
-  * control the output of kernel symbols.
-  */
- #define TRACE_ITER_SYM_MASK \
-       (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
  /* These must match the bit positions in trace_iterator_flags */
  static const char *trace_options[] = {
        "print-parent",
        "stacktrace",
        "sched-tree",
        "ftrace_printk",
+       "ftrace_preempt",
+       "branch",
+       "annotate",
+       "userstacktrace",
+       "sym-userobj",
        NULL
  };
  
@@@ -359,6 -433,28 +433,28 @@@ trace_seq_putmem_hex(struct trace_seq *
        return trace_seq_putmem(s, hex, j);
  }
  
+ static int
+ trace_seq_path(struct trace_seq *s, struct path *path)
+ {
+       unsigned char *p;
+       if (s->len >= (PAGE_SIZE - 1))
+               return 0;
+       p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
+       if (!IS_ERR(p)) {
+               p = mangle_path(s->buffer + s->len, p, "\n");
+               if (p) {
+                       s->len = p - s->buffer;
+                       return 1;
+               }
+       } else {
+               s->buffer[s->len++] = '?';
+               return 1;
+       }
+       return 0;
+ }
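
/*
 * Illustrative sketch, not part of the patch: the bounded "seq buffer"
 * append pattern that trace_seq_path() and the other trace_seq helpers
 * follow -- write at offset ->len into a fixed buffer, report failure when
 * the data will not fit, and advance ->len only on success.  seq_buf,
 * SEQ_SIZE and seq_buf_puts are invented names for the example.
 */
#include <string.h>

#define SEQ_SIZE 4096

struct seq_buf {
	char buffer[SEQ_SIZE];
	size_t len;
};

/* returns 1 if the whole string was appended, 0 if it would not fit */
static int seq_buf_puts(struct seq_buf *s, const char *str)
{
	size_t n = strlen(str);

	if (s->len + n >= SEQ_SIZE)	/* leave room; never overflow */
		return 0;
	memcpy(s->buffer + s->len, str, n);
	s->len += n;
	return 1;
}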
  static void
  trace_seq_reset(struct trace_seq *s)
  {
@@@ -470,7 -566,17 +566,17 @@@ int register_tracer(struct tracer *type
                return -1;
        }
  
+       /*
+        * When this gets called we hold the BKL which means that
+        * preemption is disabled. Various trace selftests however
+        * need to disable and enable preemption for successful tests.
+        * So we drop the BKL here and grab it after the tests again.
+        */
+       unlock_kernel();
        mutex_lock(&trace_types_lock);
+       tracing_selftest_running = true;
        for (t = trace_types; t; t = t->next) {
                if (strcmp(type->name, t->name) == 0) {
                        /* already found */
                }
        }
  
+       if (!type->set_flag)
+               type->set_flag = &dummy_set_flag;
+       if (!type->flags)
+               type->flags = &dummy_tracer_flags;
+       else
+               if (!type->flags->opts)
+                       type->flags->opts = dummy_tracer_opt;
  #ifdef CONFIG_FTRACE_STARTUP_TEST
        if (type->selftest) {
                struct tracer *saved_tracer = current_trace;
                struct trace_array *tr = &global_trace;
-               int saved_ctrl = tr->ctrl;
                int i;
                /*
                 * Run a selftest on this tracer.
                 * Here we reset the trace buffer, and set the current
                 * internal tracing to verify that everything is in order.
                 * If we fail, we do not register this tracer.
                 */
-               for_each_tracing_cpu(i) {
+               for_each_tracing_cpu(i)
                        tracing_reset(tr, i);
-               }
                current_trace = type;
-               tr->ctrl = 0;
                /* the test is responsible for initializing and enabling */
                pr_info("Testing tracer %s: ", type->name);
                ret = type->selftest(type, tr);
                /* the test is responsible for resetting too */
                current_trace = saved_tracer;
-               tr->ctrl = saved_ctrl;
                if (ret) {
                        printk(KERN_CONT "FAILED!\n");
                        goto out;
                }
                /* Only reset on passing, to avoid touching corrupted buffers */
-               for_each_tracing_cpu(i) {
+               for_each_tracing_cpu(i)
                        tracing_reset(tr, i);
-               }
                printk(KERN_CONT "PASSED\n");
        }
  #endif
                max_tracer_type_len = len;
  
   out:
+       tracing_selftest_running = false;
        mutex_unlock(&trace_types_lock);
+       lock_kernel();
  
        return ret;
  }
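
/*
 * Illustrative sketch, not part of the patch: the "fill in harmless
 * defaults" idiom register_tracer() uses above for tracers that supply no
 * ->set_flag or ->flags, so later code never has to test for NULL before
 * calling them.  struct plugin and the function names are invented.
 */
struct plugin {
	const char *name;
	int (*set_flag)(unsigned int bit, int set);
};

static int dummy_set_flag(unsigned int bit, int set)
{
	(void)bit;
	(void)set;
	return 0;		/* accept and ignore unknown flags */
}

static void plugin_register(struct plugin *p)
{
	if (!p->set_flag)
		p->set_flag = dummy_set_flag;
	/* ...link p into the plugin list; p->set_flag() is always callable */
}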
@@@ -581,6 -695,91 +695,91 @@@ static void trace_init_cmdlines(void
        cmdline_idx = 0;
  }
  
+ static int trace_stop_count;
+ static DEFINE_SPINLOCK(tracing_start_lock);
+ /**
+  * ftrace_off_permanent - disable all ftrace code permanently
+  *
+  * This should only be called when a serious anomaly has
+  * been detected.  This will turn off the function tracing,
+  * ring buffers, and other tracing utilities. It takes no
+  * locks and can be called from any context.
+  */
+ void ftrace_off_permanent(void)
+ {
+       tracing_disabled = 1;
+       ftrace_stop();
+       tracing_off_permanent();
+ }
+ /**
+  * tracing_start - quick start of the tracer
+  *
+  * If tracing is enabled but was stopped by tracing_stop,
+  * this will start the tracer back up.
+  */
+ void tracing_start(void)
+ {
+       struct ring_buffer *buffer;
+       unsigned long flags;
+       if (tracing_disabled)
+               return;
+       spin_lock_irqsave(&tracing_start_lock, flags);
+       if (--trace_stop_count)
+               goto out;
+       if (trace_stop_count < 0) {
+               /* Someone screwed up their debugging */
+               WARN_ON_ONCE(1);
+               trace_stop_count = 0;
+               goto out;
+       }
+       buffer = global_trace.buffer;
+       if (buffer)
+               ring_buffer_record_enable(buffer);
+       buffer = max_tr.buffer;
+       if (buffer)
+               ring_buffer_record_enable(buffer);
+       ftrace_start();
+  out:
+       spin_unlock_irqrestore(&tracing_start_lock, flags);
+ }
+ /**
+  * tracing_stop - quick stop of the tracer
+  *
+  * Light weight way to stop tracing. Use in conjunction with
+  * tracing_start.
+  */
+ void tracing_stop(void)
+ {
+       struct ring_buffer *buffer;
+       unsigned long flags;
+       ftrace_stop();
+       spin_lock_irqsave(&tracing_start_lock, flags);
+       if (trace_stop_count++)
+               goto out;
+       buffer = global_trace.buffer;
+       if (buffer)
+               ring_buffer_record_disable(buffer);
+       buffer = max_tr.buffer;
+       if (buffer)
+               ring_buffer_record_disable(buffer);
+  out:
+       spin_unlock_irqrestore(&tracing_start_lock, flags);
+ }
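
/*
 * Illustrative sketch, not part of the patch: the nesting counter that
 * pairs tracing_stop()/tracing_start() above.  Only the first stop really
 * disables the resource and only the matching last start re-enables it, so
 * nested callers compose safely.  The spinlock that serializes the counter
 * in the real code is omitted here for brevity; the names are invented.
 */
#include <stdio.h>

static int stop_count;

static void resource_stop(void)
{
	if (stop_count++ == 0)
		printf("disabling\n");	/* outermost stopper does the work */
}

static void resource_start(void)
{
	/* ignore unbalanced starts, like the trace_stop_count < 0 check */
	if (stop_count > 0 && --stop_count == 0)
		printf("enabling\n");	/* outermost starter re-enables */
}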
  void trace_stop_cmdline_recording(void);
  
  static void trace_save_cmdline(struct task_struct *tsk)
        spin_unlock(&trace_cmdline_lock);
  }
  
- static char *trace_find_cmdline(int pid)
+ char *trace_find_cmdline(int pid)
  {
        char *cmdline = "<...>";
        unsigned map;
@@@ -655,6 -854,7 +854,7 @@@ tracing_generic_entry_update(struct tra
  
        entry->preempt_count            = pc & 0xff;
        entry->pid                      = (tsk) ? tsk->pid : 0;
+       entry->tgid                     = (tsk) ? tsk->tgid : 0;
        entry->flags =
  #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
                (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@@ -691,6 -891,56 +891,56 @@@ trace_function(struct trace_array *tr, 
        ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
  }
  
+ #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ static void __trace_graph_entry(struct trace_array *tr,
+                               struct trace_array_cpu *data,
+                               struct ftrace_graph_ent *trace,
+                               unsigned long flags,
+                               int pc)
+ {
+       struct ring_buffer_event *event;
+       struct ftrace_graph_ent_entry *entry;
+       unsigned long irq_flags;
+       if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+               return;
+       event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
+                                        &irq_flags);
+       if (!event)
+               return;
+       entry   = ring_buffer_event_data(event);
+       tracing_generic_entry_update(&entry->ent, flags, pc);
+       entry->ent.type                 = TRACE_GRAPH_ENT;
+       entry->graph_ent                        = *trace;
+       ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
+ }
+ static void __trace_graph_return(struct trace_array *tr,
+                               struct trace_array_cpu *data,
+                               struct ftrace_graph_ret *trace,
+                               unsigned long flags,
+                               int pc)
+ {
+       struct ring_buffer_event *event;
+       struct ftrace_graph_ret_entry *entry;
+       unsigned long irq_flags;
+       if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+               return;
+       event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
+                                        &irq_flags);
+       if (!event)
+               return;
+       entry   = ring_buffer_event_data(event);
+       tracing_generic_entry_update(&entry->ent, flags, pc);
+       entry->ent.type                 = TRACE_GRAPH_RET;
+       entry->ret                              = *trace;
+       ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
+ }
+ #endif
  void
  ftrace(struct trace_array *tr, struct trace_array_cpu *data,
         unsigned long ip, unsigned long parent_ip, unsigned long flags,
@@@ -742,6 -992,46 +992,46 @@@ void __trace_stack(struct trace_array *
        ftrace_trace_stack(tr, data, flags, skip, preempt_count());
  }
  
+ static void ftrace_trace_userstack(struct trace_array *tr,
+                  struct trace_array_cpu *data,
+                  unsigned long flags, int pc)
+ {
+ #ifdef CONFIG_STACKTRACE
+       struct ring_buffer_event *event;
+       struct userstack_entry *entry;
+       struct stack_trace trace;
+       unsigned long irq_flags;
+       if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
+               return;
+       event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+                                        &irq_flags);
+       if (!event)
+               return;
+       entry   = ring_buffer_event_data(event);
+       tracing_generic_entry_update(&entry->ent, flags, pc);
+       entry->ent.type         = TRACE_USER_STACK;
+       memset(&entry->caller, 0, sizeof(entry->caller));
+       trace.nr_entries        = 0;
+       trace.max_entries       = FTRACE_STACK_ENTRIES;
+       trace.skip              = 0;
+       trace.entries           = entry->caller;
+       save_stack_trace_user(&trace);
+       ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+ #endif
+ }
+ void __trace_userstack(struct trace_array *tr,
+                  struct trace_array_cpu *data,
+                  unsigned long flags)
+ {
+       ftrace_trace_userstack(tr, data, flags, preempt_count());
+ }
  static void
  ftrace_trace_special(void *__tr, void *__data,
                     unsigned long arg1, unsigned long arg2, unsigned long arg3,
        entry->arg3                     = arg3;
        ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
        ftrace_trace_stack(tr, data, irq_flags, 4, pc);
+       ftrace_trace_userstack(tr, data, irq_flags, pc);
  
        trace_wake_up();
  }
@@@ -803,6 -1094,7 +1094,7 @@@ tracing_sched_switch_trace(struct trace
        entry->next_cpu = task_cpu(next);
        ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
        ftrace_trace_stack(tr, data, flags, 5, pc);
+       ftrace_trace_userstack(tr, data, flags, pc);
  }
  
  void
@@@ -832,6 -1124,7 +1124,7 @@@ tracing_sched_wakeup_trace(struct trace
        entry->next_cpu                 = task_cpu(wakee);
        ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
        ftrace_trace_stack(tr, data, flags, 6, pc);
+       ftrace_trace_userstack(tr, data, flags, pc);
  
        trace_wake_up();
  }
@@@ -841,26 -1134,28 +1134,28 @@@ ftrace_special(unsigned long arg1, unsi
  {
        struct trace_array *tr = &global_trace;
        struct trace_array_cpu *data;
+       unsigned long flags;
        int cpu;
        int pc;
  
-       if (tracing_disabled || !tr->ctrl)
+       if (tracing_disabled)
                return;
  
        pc = preempt_count();
-       preempt_disable_notrace();
+       local_irq_save(flags);
        cpu = raw_smp_processor_id();
        data = tr->data[cpu];
  
-       if (likely(!atomic_read(&data->disabled)))
+       if (likely(atomic_inc_return(&data->disabled) == 1))
                ftrace_trace_special(tr, data, arg1, arg2, arg3, pc);
  
-       preempt_enable_notrace();
+       atomic_dec(&data->disabled);
+       local_irq_restore(flags);
  }
  
  #ifdef CONFIG_FUNCTION_TRACER
  static void
- function_trace_call(unsigned long ip, unsigned long parent_ip)
+ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
  {
        struct trace_array *tr = &global_trace;
        struct trace_array_cpu *data;
                return;
  
        pc = preempt_count();
-       resched = need_resched();
-       preempt_disable_notrace();
+       resched = ftrace_preempt_disable();
        local_save_flags(flags);
        cpu = raw_smp_processor_id();
        data = tr->data[cpu];
                trace_function(tr, data, ip, parent_ip, flags, pc);
  
        atomic_dec(&data->disabled);
-       if (resched)
-               preempt_enable_no_resched_notrace();
-       else
-               preempt_enable_notrace();
+       ftrace_preempt_enable(resched);
+ }
+
+ static void
+ function_trace_call(unsigned long ip, unsigned long parent_ip)
+ {
+       struct trace_array *tr = &global_trace;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       long disabled;
+       int cpu;
+       int pc;
+       if (unlikely(!ftrace_function_enabled))
+               return;
+       /*
+        * Need to use raw, since this must be called before the
+        * recursive protection is performed.
+        */
+       local_irq_save(flags);
+       cpu = raw_smp_processor_id();
+       data = tr->data[cpu];
+       disabled = atomic_inc_return(&data->disabled);
+       if (likely(disabled == 1)) {
+               pc = preempt_count();
+               trace_function(tr, data, ip, parent_ip, flags, pc);
+       }
+       atomic_dec(&data->disabled);
+       local_irq_restore(flags);
+ }
+
+ #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ int trace_graph_entry(struct ftrace_graph_ent *trace)
+ {
+       struct trace_array *tr = &global_trace;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       long disabled;
+       int cpu;
+       int pc;
+       if (!ftrace_trace_task(current))
+               return 0;
+       if (!ftrace_graph_addr(trace->func))
+               return 0;
+       local_irq_save(flags);
+       cpu = raw_smp_processor_id();
+       data = tr->data[cpu];
+       disabled = atomic_inc_return(&data->disabled);
+       if (likely(disabled == 1)) {
+               pc = preempt_count();
+               __trace_graph_entry(tr, data, trace, flags, pc);
+       }
+       /* Only do the atomic if it is not already set */
+       if (!test_tsk_trace_graph(current))
+               set_tsk_trace_graph(current);
+       atomic_dec(&data->disabled);
+       local_irq_restore(flags);
+       return 1;
+ }
+
+ void trace_graph_return(struct ftrace_graph_ret *trace)
+ {
+       struct trace_array *tr = &global_trace;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       long disabled;
+       int cpu;
+       int pc;
+       local_irq_save(flags);
+       cpu = raw_smp_processor_id();
+       data = tr->data[cpu];
+       disabled = atomic_inc_return(&data->disabled);
+       if (likely(disabled == 1)) {
+               pc = preempt_count();
+               __trace_graph_return(tr, data, trace, flags, pc);
+       }
+       if (!trace->depth)
+               clear_tsk_trace_graph(current);
+       atomic_dec(&data->disabled);
+       local_irq_restore(flags);
  }
+ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
  
  static struct ftrace_ops trace_ops __read_mostly =
  {
  void tracing_start_function_trace(void)
  {
        ftrace_function_enabled = 0;
+       if (trace_flags & TRACE_ITER_PREEMPTONLY)
+               trace_ops.func = function_trace_call_preempt_only;
+       else
+               trace_ops.func = function_trace_call;
        register_ftrace_function(&trace_ops);
-       if (tracer_enabled)
-               ftrace_function_enabled = 1;
+       ftrace_function_enabled = 1;
  }
  
  void tracing_stop_function_trace(void)
  
  enum trace_file_type {
        TRACE_FILE_LAT_FMT      = 1,
+       TRACE_FILE_ANNOTATE     = 2,
  };
  
  static void trace_iterator_increment(struct trace_iterator *iter, int cpu)
@@@ -1047,10 -1432,6 +1432,6 @@@ static void *s_start(struct seq_file *m
  
        atomic_inc(&trace_record_cmdline_disabled);
  
-       /* let the tracer grab locks here if needed */
-       if (current_trace->start)
-               current_trace->start(iter);
        if (*pos != iter->pos) {
                iter->ent = NULL;
                iter->cpu = 0;
  
  static void s_stop(struct seq_file *m, void *p)
  {
-       struct trace_iterator *iter = m->private;
        atomic_dec(&trace_record_cmdline_disabled);
-       /* let the tracer release locks here if needed */
-       if (current_trace && current_trace == iter->trace && iter->trace->stop)
-               iter->trace->stop(iter);
        mutex_unlock(&trace_types_lock);
  }
  
@@@ -1143,7 -1517,7 +1517,7 @@@ seq_print_sym_offset(struct trace_seq *
  # define IP_FMT "%016lx"
  #endif
  
- static int
+ int
  seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
  {
        int ret;
        return ret;
  }
  
+ static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
+                                   unsigned long ip, unsigned long sym_flags)
+ {
+       struct file *file = NULL;
+       unsigned long vmstart = 0;
+       int ret = 1;
+       if (mm) {
+               const struct vm_area_struct *vma;
+               down_read(&mm->mmap_sem);
+               vma = find_vma(mm, ip);
+               if (vma) {
+                       file = vma->vm_file;
+                       vmstart = vma->vm_start;
+               }
+               if (file) {
+                       ret = trace_seq_path(s, &file->f_path);
+                       if (ret)
+                               ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart);
+               }
+               up_read(&mm->mmap_sem);
+       }
+       if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
+               ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
+       return ret;
+ }
+
+ static int
+ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
+                     unsigned long sym_flags)
+ {
+       struct mm_struct *mm = NULL;
+       int ret = 1;
+       unsigned int i;
+       if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
+               struct task_struct *task;
+               /*
+                * we do the lookup on the thread group leader,
+                * since individual threads might have already quit!
+                */
+               rcu_read_lock();
+               task = find_task_by_vpid(entry->ent.tgid);
+               if (task)
+                       mm = get_task_mm(task);
+               rcu_read_unlock();
+       }
+       for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+               unsigned long ip = entry->caller[i];
+               if (ip == ULONG_MAX || !ret)
+                       break;
+               if (i && ret)
+                       ret = trace_seq_puts(s, " <- ");
+               if (!ip) {
+                       if (ret)
+                               ret = trace_seq_puts(s, "??");
+                       continue;
+               }
+               if (!ret)
+                       break;
+               if (ret)
+                       ret = seq_print_user_ip(s, mm, ip, sym_flags);
+       }
+       if (mm)
+               mmput(mm);
+       return ret;
+ }
+
  static void print_lat_help_header(struct seq_file *m)
  {
        seq_puts(m, "#                  _------=> CPU#            \n");
@@@ -1338,6 -1784,23 +1784,23 @@@ void trace_seq_print_cont(struct trace_
                trace_seq_putc(s, '\n');
  }
  
+ static void test_cpu_buff_start(struct trace_iterator *iter)
+ {
+       struct trace_seq *s = &iter->seq;
+       if (!(trace_flags & TRACE_ITER_ANNOTATE))
+               return;
+       if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
+               return;
+       if (cpu_isset(iter->cpu, iter->started))
+               return;
+       cpu_set(iter->cpu, iter->started);
+       trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu);
+ }
+
  static enum print_line_t
  print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
  {
        if (entry->type == TRACE_CONT)
                return TRACE_TYPE_HANDLED;
  
+       test_cpu_buff_start(iter);
        next_entry = find_next_entry(iter, NULL, &next_ts);
        if (!next_entry)
                next_ts = iter->ts;
                        trace_seq_print_cont(s, iter);
                break;
        }
+       case TRACE_BRANCH: {
+               struct trace_branch *field;
+               trace_assign_type(field, entry);
+               trace_seq_printf(s, "[%s] %s:%s:%d\n",
+                                field->correct ? "  ok  " : " MISS ",
+                                field->func,
+                                field->file,
+                                field->line);
+               break;
+       }
+       case TRACE_USER_STACK: {
+               struct userstack_entry *field;
+               trace_assign_type(field, entry);
+               seq_print_userip_objs(field, s, sym_flags);
+               trace_seq_putc(s, '\n');
+               break;
+       }
        default:
                trace_seq_printf(s, "Unknown type %d\n", entry->type);
        }
@@@ -1472,6 -1958,8 +1958,8 @@@ static enum print_line_t print_trace_fm
        if (entry->type == TRACE_CONT)
                return TRACE_TYPE_HANDLED;
  
+       test_cpu_buff_start(iter);
        comm = trace_find_cmdline(iter->ent->pid);
  
        t = ns2usecs(iter->ts);
                        trace_seq_print_cont(s, iter);
                break;
        }
+       case TRACE_GRAPH_RET: {
+               return print_graph_function(iter);
+       }
+       case TRACE_GRAPH_ENT: {
+               return print_graph_function(iter);
+       }
+       case TRACE_BRANCH: {
+               struct trace_branch *field;
+               trace_assign_type(field, entry);
+               trace_seq_printf(s, "[%s] %s:%s:%d\n",
+                                field->correct ? "  ok  " : " MISS ",
+                                field->func,
+                                field->file,
+                                field->line);
+               break;
+       }
+       case TRACE_USER_STACK: {
+               struct userstack_entry *field;
+               trace_assign_type(field, entry);
+               ret = seq_print_userip_objs(field, s, sym_flags);
+               if (!ret)
+                       return TRACE_TYPE_PARTIAL_LINE;
+               ret = trace_seq_putc(s, '\n');
+               if (!ret)
+                       return TRACE_TYPE_PARTIAL_LINE;
+               break;
+       }
        }
        return TRACE_TYPE_HANDLED;
  }
@@@ -1640,6 -2159,7 +2159,7 @@@ static enum print_line_t print_raw_fmt(
                break;
        }
        case TRACE_SPECIAL:
+       case TRACE_USER_STACK:
        case TRACE_STACK: {
                struct special_entry *field;
  
@@@ -1728,6 -2248,7 +2248,7 @@@ static enum print_line_t print_hex_fmt(
                break;
        }
        case TRACE_SPECIAL:
+       case TRACE_USER_STACK:
        case TRACE_STACK: {
                struct special_entry *field;
  
@@@ -1782,6 -2303,7 +2303,7 @@@ static enum print_line_t print_bin_fmt(
                break;
        }
        case TRACE_SPECIAL:
+       case TRACE_USER_STACK:
        case TRACE_STACK: {
                struct special_entry *field;
  
@@@ -1847,7 -2369,9 +2369,9 @@@ static int s_show(struct seq_file *m, v
                        seq_printf(m, "# tracer: %s\n", iter->trace->name);
                        seq_puts(m, "#\n");
                }
-               if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
+               if (iter->trace && iter->trace->print_header)
+                       iter->trace->print_header(m);
+               else if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
                        /* print nothing if the buffers are empty */
                        if (trace_empty(iter))
                                return 0;
@@@ -1899,6 -2423,15 +2423,15 @@@ __tracing_open(struct inode *inode, str
        iter->trace = current_trace;
        iter->pos = -1;
  
+       /* Notify the tracer early; before we stop tracing. */
+       if (iter->trace && iter->trace->open)
+                       iter->trace->open(iter);
+       /* Annotate start of buffers if we had overruns */
+       if (ring_buffer_overruns(iter->tr->buffer))
+               iter->iter_flags |= TRACE_FILE_ANNOTATE;
        for_each_tracing_cpu(cpu) {
  
                iter->buffer_iter[cpu] =
        m->private = iter;
  
        /* stop the trace while dumping */
-       if (iter->tr->ctrl) {
-               tracer_enabled = 0;
-               ftrace_function_enabled = 0;
-       }
-       if (iter->trace && iter->trace->open)
-                       iter->trace->open(iter);
+       tracing_stop();
  
        mutex_unlock(&trace_types_lock);
  
@@@ -1966,14 -2493,7 +2493,7 @@@ int tracing_release(struct inode *inode
                iter->trace->close(iter);
  
        /* reenable tracing if it was previously enabled */
-       if (iter->tr->ctrl) {
-               tracer_enabled = 1;
-               /*
-                * It is safe to enable function tracing even if it
-                * isn't used
-                */
-               ftrace_function_enabled = 1;
-       }
+       tracing_start();
        mutex_unlock(&trace_types_lock);
  
        seq_release(inode, file);
@@@ -2126,7 -2646,7 +2646,7 @@@ tracing_cpumask_read(struct file *filp
  
        mutex_lock(&tracing_cpumask_update_lock);
  
 -      len = cpumask_scnprintf(mask_str, count, tracing_cpumask);
 +      len = cpumask_scnprintf(mask_str, count, &tracing_cpumask);
        if (count - len < 2) {
                count = -EINVAL;
                goto out_err;
@@@ -2147,11 -2667,11 +2667,11 @@@ tracing_cpumask_write(struct file *filp
        int err, cpu;
  
        mutex_lock(&tracing_cpumask_update_lock);
 -      err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
 +      err = cpumask_parse_user(ubuf, count, &tracing_cpumask_new);
        if (err)
                goto err_unlock;
  
-       raw_local_irq_disable();
+       local_irq_disable();
        __raw_spin_lock(&ftrace_max_lock);
        for_each_tracing_cpu(cpu) {
                /*
                }
        }
        __raw_spin_unlock(&ftrace_max_lock);
-       raw_local_irq_enable();
+       local_irq_enable();
  
        tracing_cpumask = tracing_cpumask_new;
  
@@@ -2189,13 -2709,16 +2709,16 @@@ static struct file_operations tracing_c
  };
  
  static ssize_t
- tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
+ tracing_trace_options_read(struct file *filp, char __user *ubuf,
                       size_t cnt, loff_t *ppos)
  {
+       int i;
        char *buf;
        int r = 0;
        int len = 0;
-       int i;
+       u32 tracer_flags = current_trace->flags->val;
+       struct tracer_opt *trace_opts = current_trace->flags->opts;
  
        /* calculate max size */
        for (i = 0; trace_options[i]; i++) {
                len += 3; /* "no" and space */
        }
  
+       /*
+        * Increase the size with the names of options specific
+        * to the current tracer.
+        */
+       for (i = 0; trace_opts[i].name; i++) {
+               len += strlen(trace_opts[i].name);
+               len += 3; /* "no" and space */
+       }
        /* +2 for \n and \0 */
        buf = kmalloc(len + 2, GFP_KERNEL);
        if (!buf)
                        r += sprintf(buf + r, "no%s ", trace_options[i]);
        }
  
+       for (i = 0; trace_opts[i].name; i++) {
+               if (tracer_flags & trace_opts[i].bit)
+                       r += sprintf(buf + r, "%s ",
+                               trace_opts[i].name);
+               else
+                       r += sprintf(buf + r, "no%s ",
+                               trace_opts[i].name);
+       }
        r += sprintf(buf + r, "\n");
        WARN_ON(r >= len + 2);
  
        return r;
  }
  
+ /* Try to assign a tracer specific option */
+ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
+ {
+       struct tracer_flags *trace_flags = trace->flags;
+       struct tracer_opt *opts = NULL;
+       int ret = 0, i = 0;
+       int len;
+       for (i = 0; trace_flags->opts[i].name; i++) {
+               opts = &trace_flags->opts[i];
+               len = strlen(opts->name);
+               if (strncmp(cmp, opts->name, len) == 0) {
+                       ret = trace->set_flag(trace_flags->val,
+                               opts->bit, !neg);
+                       break;
+               }
+       }
+       /* Not found */
+       if (!trace_flags->opts[i].name)
+               return -EINVAL;
+       /* Refused to handle */
+       if (ret)
+               return ret;
+       if (neg)
+               trace_flags->val &= ~opts->bit;
+       else
+               trace_flags->val |= opts->bit;
+       return 0;
+ }
+
  static ssize_t
- tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
+ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
                        size_t cnt, loff_t *ppos)
  {
        char buf[64];
        char *cmp = buf;
        int neg = 0;
+       int ret;
        int i;
  
        if (cnt >= sizeof(buf))
                        break;
                }
        }
-       /*
-        * If no option could be set, return an error:
-        */
-       if (!trace_options[i])
-               return -EINVAL;
+       /* If no option could be set, test the specific tracer options */
+       if (!trace_options[i]) {
+               ret = set_tracer_option(current_trace, cmp, neg);
+               if (ret)
+                       return ret;
+       }
  
        filp->f_pos += cnt;
  
  
  static struct file_operations tracing_iter_fops = {
        .open           = tracing_open_generic,
-       .read           = tracing_iter_ctrl_read,
-       .write          = tracing_iter_ctrl_write,
+       .read           = tracing_trace_options_read,
+       .write          = tracing_trace_options_write,
  };
  
  static const char readme_msg[] =
        "# echo sched_switch > /debug/tracing/current_tracer\n"
        "# cat /debug/tracing/current_tracer\n"
        "sched_switch\n"
-       "# cat /debug/tracing/iter_ctrl\n"
+       "# cat /debug/tracing/trace_options\n"
        "noprint-parent nosym-offset nosym-addr noverbose\n"
-       "# echo print-parent > /debug/tracing/iter_ctrl\n"
+       "# echo print-parent > /debug/tracing/trace_options\n"
        "# echo 1 > /debug/tracing/tracing_enabled\n"
        "# cat /debug/tracing/trace > /tmp/trace.txt\n"
        "echo 0 > /debug/tracing/tracing_enabled\n"
@@@ -2311,11 -2889,10 +2889,10 @@@ static ssize_
  tracing_ctrl_read(struct file *filp, char __user *ubuf,
                  size_t cnt, loff_t *ppos)
  {
        char buf[64];
        int r;
  
-       r = sprintf(buf, "%ld\n", tr->ctrl);
+       r = sprintf(buf, "%u\n", tracer_enabled);
        return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
  }
  
@@@ -2343,16 -2920,18 +2920,18 @@@ tracing_ctrl_write(struct file *filp, c
        val = !!val;
  
        mutex_lock(&trace_types_lock);
-       if (tr->ctrl ^ val) {
-               if (val)
+       if (tracer_enabled ^ val) {
+               if (val) {
                        tracer_enabled = 1;
-               else
+                       if (current_trace->start)
+                               current_trace->start(tr);
+                       tracing_start();
+               } else {
                        tracer_enabled = 0;
-               tr->ctrl = val;
-               if (current_trace && current_trace->ctrl_update)
-                       current_trace->ctrl_update(tr);
+                       tracing_stop();
+                       if (current_trace->stop)
+                               current_trace->stop(tr);
+               }
        }
        mutex_unlock(&trace_types_lock);
  
@@@ -2378,29 -2957,11 +2957,11 @@@ tracing_set_trace_read(struct file *fil
        return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
  }
  
- static ssize_t
- tracing_set_trace_write(struct file *filp, const char __user *ubuf,
-                       size_t cnt, loff_t *ppos)
+ static int tracing_set_tracer(char *buf)
  {
        struct trace_array *tr = &global_trace;
        struct tracer *t;
-       char buf[max_tracer_type_len+1];
-       int i;
-       size_t ret;
-       ret = cnt;
-       if (cnt > max_tracer_type_len)
-               cnt = max_tracer_type_len;
-       if (copy_from_user(&buf, ubuf, cnt))
-               return -EFAULT;
-       buf[cnt] = 0;
-       /* strip ending whitespace. */
-       for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
-               buf[i] = 0;
+       int ret = 0;
  
        mutex_lock(&trace_types_lock);
        for (t = trace_types; t; t = t->next) {
        if (t == current_trace)
                goto out;
  
+       trace_branch_disable();
        if (current_trace && current_trace->reset)
                current_trace->reset(tr);
  
        current_trace = t;
-       if (t->init)
-               t->init(tr);
+       if (t->init) {
+               ret = t->init(tr);
+               if (ret)
+                       goto out;
+       }
  
+       trace_branch_enable(tr);
   out:
        mutex_unlock(&trace_types_lock);
  
-       if (ret > 0)
-               filp->f_pos += ret;
+       return ret;
+ }
+
+ static ssize_t
+ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
+                       size_t cnt, loff_t *ppos)
+ {
+       char buf[max_tracer_type_len+1];
+       int i;
+       size_t ret;
+       int err;
+       ret = cnt;
+       if (cnt > max_tracer_type_len)
+               cnt = max_tracer_type_len;
+       if (copy_from_user(&buf, ubuf, cnt))
+               return -EFAULT;
+       buf[cnt] = 0;
+       /* strip ending whitespace. */
+       for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
+               buf[i] = 0;
+       err = tracing_set_tracer(buf);
+       if (err)
+               return err;
+       filp->f_pos += ret;
  
        return ret;
  }
@@@ -2492,6 -3087,10 +3087,10 @@@ static int tracing_open_pipe(struct ino
                return -ENOMEM;
  
        mutex_lock(&trace_types_lock);
+       /* trace pipe does not show start of buffer */
+       cpus_setall(iter->started);
        iter->tr = &global_trace;
        iter->trace = current_trace;
        filp->private_data = iter;
@@@ -2667,7 -3266,7 +3266,7 @@@ tracing_entries_read(struct file *filp
        char buf[64];
        int r;
  
-       r = sprintf(buf, "%lu\n", tr->entries);
+       r = sprintf(buf, "%lu\n", tr->entries >> 10);
        return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
  }
  
@@@ -2678,7 -3277,6 +3277,6 @@@ tracing_entries_write(struct file *filp
        unsigned long val;
        char buf[64];
        int ret, cpu;
-       struct trace_array *tr = filp->private_data;
  
        if (cnt >= sizeof(buf))
                return -EINVAL;
  
        mutex_lock(&trace_types_lock);
  
-       if (tr->ctrl) {
-               cnt = -EBUSY;
-               pr_info("ftrace: please disable tracing"
-                       " before modifying buffer size\n");
-               goto out;
-       }
+       tracing_stop();
  
        /* disable all cpu buffers */
        for_each_tracing_cpu(cpu) {
                        atomic_inc(&max_tr.data[cpu]->disabled);
        }
  
+       /* value is in KB */
+       val <<= 10;
        if (val != global_trace.entries) {
                ret = ring_buffer_resize(global_trace.buffer, val);
                if (ret < 0) {
                        atomic_dec(&max_tr.data[cpu]->disabled);
        }
  
+       tracing_start();
        max_tr.entries = global_trace.entries;
        mutex_unlock(&trace_types_lock);
  
@@@ -2762,7 -3359,7 +3359,7 @@@ static int mark_printk(const char *fmt
        int ret;
        va_list args;
        va_start(args, fmt);
-       ret = trace_vprintk(0, fmt, args);
+       ret = trace_vprintk(0, -1, fmt, args);
        va_end(args);
        return ret;
  }
@@@ -2773,9 -3370,8 +3370,8 @@@ tracing_mark_write(struct file *filp, c
  {
        char *buf;
        char *end;
-       struct trace_array *tr = &global_trace;
  
-       if (!tr->ctrl || tracing_disabled)
+       if (tracing_disabled)
                return -EINVAL;
  
        if (cnt > TRACE_BUF_SIZE)
@@@ -2841,22 -3437,38 +3437,38 @@@ static struct file_operations tracing_m
  
  #ifdef CONFIG_DYNAMIC_FTRACE
  
+ int __weak ftrace_arch_read_dyn_info(char *buf, int size)
+ {
+       return 0;
+ }
+
  static ssize_t
- tracing_read_long(struct file *filp, char __user *ubuf,
+ tracing_read_dyn_info(struct file *filp, char __user *ubuf,
                  size_t cnt, loff_t *ppos)
  {
+       static char ftrace_dyn_info_buffer[1024];
+       static DEFINE_MUTEX(dyn_info_mutex);
        unsigned long *p = filp->private_data;
-       char buf[64];
+       char *buf = ftrace_dyn_info_buffer;
+       int size = ARRAY_SIZE(ftrace_dyn_info_buffer);
        int r;
  
-       r = sprintf(buf, "%ld\n", *p);
+       mutex_lock(&dyn_info_mutex);
+       r = sprintf(buf, "%ld ", *p);
  
-       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+       r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r);
+       buf[r++] = '\n';
+       r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+       mutex_unlock(&dyn_info_mutex);
+       return r;
  }
  
- static struct file_operations tracing_read_long_fops = {
+ static struct file_operations tracing_dyn_info_fops = {
        .open           = tracing_open_generic,
-       .read           = tracing_read_long,
+       .read           = tracing_read_dyn_info,
  };
  #endif
  
@@@ -2897,10 -3509,10 +3509,10 @@@ static __init int tracer_init_debugfs(v
        if (!entry)
                pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
  
-       entry = debugfs_create_file("iter_ctrl", 0644, d_tracer,
+       entry = debugfs_create_file("trace_options", 0644, d_tracer,
                                    NULL, &tracing_iter_fops);
        if (!entry)
-               pr_warning("Could not create debugfs 'iter_ctrl' entry\n");
+               pr_warning("Could not create debugfs 'trace_options' entry\n");
  
        entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
                                    NULL, &tracing_cpumask_fops);
                pr_warning("Could not create debugfs "
                           "'trace_pipe' entry\n");
  
-       entry = debugfs_create_file("trace_entries", 0644, d_tracer,
+       entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer,
                                    &global_trace, &tracing_entries_fops);
        if (!entry)
                pr_warning("Could not create debugfs "
-                          "'trace_entries' entry\n");
+                          "'buffer_size_kb' entry\n");
  
        entry = debugfs_create_file("trace_marker", 0220, d_tracer,
                                    NULL, &tracing_mark_fops);
  #ifdef CONFIG_DYNAMIC_FTRACE
        entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
                                    &ftrace_update_tot_cnt,
-                                   &tracing_read_long_fops);
+                                   &tracing_dyn_info_fops);
        if (!entry)
                pr_warning("Could not create debugfs "
                           "'dyn_ftrace_total_info' entry\n");
        return 0;
  }
  
- int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
+ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
  {
        static DEFINE_SPINLOCK(trace_buf_lock);
        static char trace_buf[TRACE_BUF_SIZE];
        struct ring_buffer_event *event;
        struct trace_array *tr = &global_trace;
        struct trace_array_cpu *data;
-       struct print_entry *entry;
-       unsigned long flags, irq_flags;
        int cpu, len = 0, size, pc;
+       struct print_entry *entry;
+       unsigned long irq_flags;
  
-       if (!tr->ctrl || tracing_disabled)
+       if (tracing_disabled || tracing_selftest_running)
                return 0;
  
        pc = preempt_count();
        if (unlikely(atomic_read(&data->disabled)))
                goto out;
  
-       spin_lock_irqsave(&trace_buf_lock, flags);
+       pause_graph_tracing();
+       spin_lock_irqsave(&trace_buf_lock, irq_flags);
        len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
  
        len = min(len, TRACE_BUF_SIZE-1);
        if (!event)
                goto out_unlock;
        entry = ring_buffer_event_data(event);
-       tracing_generic_entry_update(&entry->ent, flags, pc);
+       tracing_generic_entry_update(&entry->ent, irq_flags, pc);
        entry->ent.type                 = TRACE_PRINT;
        entry->ip                       = ip;
+       entry->depth                    = depth;
  
        memcpy(&entry->buf, trace_buf, len);
        entry->buf[len] = 0;
        ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
  
   out_unlock:
-       spin_unlock_irqrestore(&trace_buf_lock, flags);
+       spin_unlock_irqrestore(&trace_buf_lock, irq_flags);
+       unpause_graph_tracing();
   out:
        preempt_enable_notrace();
  
@@@ -3037,7 -3651,7 +3651,7 @@@ int __ftrace_printk(unsigned long ip, c
                return 0;
  
        va_start(ap, fmt);
-       ret = trace_vprintk(ip, fmt, ap);
+       ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
        va_end(ap);
        return ret;
  }
@@@ -3046,7 -3660,8 +3660,8 @@@ EXPORT_SYMBOL_GPL(__ftrace_printk)
  static int trace_panic_handler(struct notifier_block *this,
                               unsigned long event, void *unused)
  {
-       ftrace_dump();
+       if (ftrace_dump_on_oops)
+               ftrace_dump();
        return NOTIFY_OK;
  }
  
@@@ -3062,7 -3677,8 +3677,8 @@@ static int trace_die_handler(struct not
  {
        switch (val) {
        case DIE_OOPS:
-               ftrace_dump();
+               if (ftrace_dump_on_oops)
+                       ftrace_dump();
                break;
        default:
                break;
@@@ -3103,7 -3719,6 +3719,6 @@@ trace_printk_seq(struct trace_seq *s
        trace_seq_reset(s);
  }
  
  void ftrace_dump(void)
  {
        static DEFINE_SPINLOCK(ftrace_dump_lock);
                atomic_inc(&global_trace.data[cpu]->disabled);
        }
  
+       /* don't look at user memory in panic mode */
+       trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
        printk(KERN_TRACE "Dumping ftrace buffer:\n");
  
        iter.tr = &global_trace;
@@@ -3221,7 -3839,6 +3839,6 @@@ __init static int tracer_alloc_buffers(
  #endif
  
        /* All seems OK, enable tracing */
-       global_trace.ctrl = tracer_enabled;
        tracing_disabled = 0;
  
        atomic_notifier_chain_register(&panic_notifier_list,