[CRYPTO] templates: Pass type/mask when creating instances
[safe/jmp/linux-2.6] / kernel / sched.c
index 15ce772..960d7c5 100644 (file)
 #include <asm/unistd.h>
 
 /*
+ * Scheduler clock - returns current time in nanosec units.
+ * This is default implementation.
+ * Architectures and sub-architectures can override this.
+ */
+unsigned long long __attribute__((weak)) sched_clock(void)
+{
+       return (unsigned long long)jiffies * (1000000000 / HZ);
+}
+
+/*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
  * and back.
@@ -428,7 +438,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
  * bump this up when changing the output format or the meaning of an existing
  * format, so that tools can adapt (or abort)
  */
-#define SCHEDSTAT_VERSION 12
+#define SCHEDSTAT_VERSION 14
 
 static int show_schedstat(struct seq_file *seq, void *v)
 {
@@ -466,7 +476,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
                        seq_printf(seq, "domain%d %s", dcnt++, mask_str);
                        for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
                                        itype++) {
-                               seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu",
+                               seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
+                                               "%lu",
                                    sd->lb_cnt[itype],
                                    sd->lb_balanced[itype],
                                    sd->lb_failed[itype],
@@ -476,11 +487,13 @@ static int show_schedstat(struct seq_file *seq, void *v)
                                    sd->lb_nobusyq[itype],
                                    sd->lb_nobusyg[itype]);
                        }
-                       seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
+                       seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
+                           " %lu %lu %lu\n",
                            sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
                            sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
                            sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
-                           sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
+                           sd->ttwu_wake_remote, sd->ttwu_move_affine,
+                           sd->ttwu_move_balance);
                }
                preempt_enable();
 #endif
@@ -940,6 +953,9 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
 {
        unsigned long long now;
 
+       if (rt_task(p))
+               goto out;
+
        now = sched_clock();
 #ifdef CONFIG_SMP
        if (!local) {
@@ -961,8 +977,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
                                     (now - p->timestamp) >> 20);
        }
 
-       if (!rt_task(p))
-               p->prio = recalc_task_prio(p, now);
+       p->prio = recalc_task_prio(p, now);
 
        /*
         * This checks to make sure it's not an uninterruptible task
@@ -987,7 +1002,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
                }
        }
        p->timestamp = now;
-
+out:
        __activate_task(p, rq);
 }
 
@@ -1452,7 +1467,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 
                if (this_sd->flags & SD_WAKE_AFFINE) {
                        unsigned long tl = this_load;
-                       unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu);
+                       unsigned long tl_per_task;
+
+                       tl_per_task = cpu_avg_load_per_task(this_cpu);
 
                        /*
                         * If sync wakeup then subtract the (maximum possible)
@@ -1560,6 +1577,7 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state)
        return try_to_wake_up(p, state, 0);
 }
 
+static void task_running_tick(struct rq *rq, struct task_struct *p);
 /*
  * Perform scheduler related setup for a newly forked process p.
  * p is forked by current.
@@ -1620,7 +1638,7 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
                 * runqueue lock is not a problem.
                 */
                current->time_slice = 1;
-               scheduler_tick();
+               task_running_tick(cpu_rq(cpu), current);
        }
        local_irq_enable();
        put_cpu();
@@ -1835,6 +1853,13 @@ context_switch(struct rq *rq, struct task_struct *prev,
        struct mm_struct *mm = next->mm;
        struct mm_struct *oldmm = prev->active_mm;
 
+       /*
+        * For paravirt, this is coupled with an exit in switch_to to
+        * combine the page table reload and the switch backend into
+        * one hypercall.
+        */
+       arch_enter_lazy_cpu_mode();
+
        if (!mm) {
                next->active_mm = oldmm;
                atomic_inc(&oldmm->mm_count);
@@ -2249,7 +2274,7 @@ out:
 static struct sched_group *
 find_busiest_group(struct sched_domain *sd, int this_cpu,
                   unsigned long *imbalance, enum idle_type idle, int *sd_idle,
-                  cpumask_t *cpus)
+                  cpumask_t *cpus, int *balance)
 {
        struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
        unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2278,10 +2303,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                unsigned long load, group_capacity;
                int local_group;
                int i;
+               unsigned int balance_cpu = -1, first_idle_cpu = 0;
                unsigned long sum_nr_running, sum_weighted_load;
 
                local_group = cpu_isset(this_cpu, group->cpumask);
 
+               if (local_group)
+                       balance_cpu = first_cpu(group->cpumask);
+
                /* Tally up the load of all CPUs in the group */
                sum_weighted_load = sum_nr_running = avg_load = 0;
 
@@ -2297,9 +2326,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                                *sd_idle = 0;
 
                        /* Bias balancing toward cpus of our domain */
-                       if (local_group)
+                       if (local_group) {
+                               if (idle_cpu(i) && !first_idle_cpu) {
+                                       first_idle_cpu = 1;
+                                       balance_cpu = i;
+                               }
+
                                load = target_load(i, load_idx);
-                       else
+                       else
                                load = source_load(i, load_idx);
 
                        avg_load += load;
@@ -2307,6 +2341,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                        sum_weighted_load += rq->raw_weighted_load;
                }
 
+               /*
+                * First idle cpu or the first cpu(busiest) in this sched group
+                * is eligible for doing load balancing at this and above
+                * domains.
+                */
+               if (local_group && balance_cpu != this_cpu && balance) {
+                       *balance = 0;
+                       goto ret;
+               }
+
                total_load += avg_load;
                total_pwr += group->cpu_power;
 
@@ -2466,18 +2510,21 @@ small_imbalance:
                pwr_now /= SCHED_LOAD_SCALE;
 
                /* Amount of load we'd subtract */
-               tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power;
+               tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
+                       busiest->cpu_power;
                if (max_load > tmp)
                        pwr_move += busiest->cpu_power *
                                min(busiest_load_per_task, max_load - tmp);
 
                /* Amount of load we'd add */
-               if (max_load*busiest->cpu_power <
-                               busiest_load_per_task*SCHED_LOAD_SCALE)
-                       tmp = max_load*busiest->cpu_power/this->cpu_power;
+               if (max_load * busiest->cpu_power <
+                               busiest_load_per_task * SCHED_LOAD_SCALE)
+                       tmp = max_load * busiest->cpu_power / this->cpu_power;
                else
-                       tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power;
-               pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp);
+                       tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
+                               this->cpu_power;
+               pwr_move += this->cpu_power *
+                       min(this_load_per_task, this_load + tmp);
                pwr_move /= SCHED_LOAD_SCALE;
 
                /* Move if we gain throughput */
@@ -2498,8 +2545,8 @@ out_balanced:
                *imbalance = min_load_per_task;
                return group_min;
        }
-ret:
 #endif
+ret:
        *imbalance = 0;
        return NULL;
 }
@@ -2550,7 +2597,8 @@ static inline unsigned long minus_1_or_zero(unsigned long n)
  * tasks if there is an imbalance.
  */
 static int load_balance(int this_cpu, struct rq *this_rq,
-                       struct sched_domain *sd, enum idle_type idle)
+                       struct sched_domain *sd, enum idle_type idle,
+                       int *balance)
 {
        int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
        struct sched_group *group;
@@ -2573,7 +2621,11 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 
 redo:
        group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
-                                                       &cpus);
+                                  &cpus, balance);
+
+       if (*balance == 0)
+               goto out_balanced;
+
        if (!group) {
                schedstat_inc(sd, lb_nobusyg[idle]);
                goto out_balanced;
@@ -2715,7 +2767,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
        schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
 redo:
        group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
-                               &sd_idle, &cpus);
+                                  &sd_idle, &cpus, NULL);
        if (!group) {
                schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
                goto out_balanced;
@@ -2852,14 +2904,16 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 static void update_load(struct rq *this_rq)
 {
        unsigned long this_load;
-       int i, scale;
+       unsigned int i, scale;
 
        this_load = this_rq->raw_weighted_load;
 
        /* Update our load: */
-       for (i = 0, scale = 1; i < 3; i++, scale <<= 1) {
+       for (i = 0, scale = 1; i < 3; i++, scale += scale) {
                unsigned long old_load, new_load;
 
+               /* scale is effectively 1 << i now, and >> i divides by scale */
+
                old_load = this_rq->cpu_load[i];
                new_load = this_load;
                /*
@@ -2869,7 +2923,7 @@ static void update_load(struct rq *this_rq)
                 */
                if (new_load > old_load)
                        new_load += scale-1;
-               this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
+               this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
        }
 }
 
@@ -2885,7 +2939,7 @@ static DEFINE_SPINLOCK(balancing);
 
 static void run_rebalance_domains(struct softirq_action *h)
 {
-       int this_cpu = smp_processor_id();
+       int this_cpu = smp_processor_id(), balance = 1;
        struct rq *this_rq = cpu_rq(this_cpu);
        unsigned long interval;
        struct sched_domain *sd;
@@ -2917,7 +2971,7 @@ static void run_rebalance_domains(struct softirq_action *h)
                }
 
                if (time_after_eq(jiffies, sd->last_balance + interval)) {
-                       if (load_balance(this_cpu, this_rq, sd, idle)) {
+                       if (load_balance(this_cpu, this_rq, sd, idle, &balance)) {
                                /*
                                 * We've pulled tasks over so either we're no
                                 * longer idle, or one of our SMT siblings is
@@ -2932,6 +2986,14 @@ static void run_rebalance_domains(struct softirq_action *h)
 out:
                if (time_after(next_balance, sd->last_balance + interval))
                        next_balance = sd->last_balance + interval;
+
+               /*
+                * Stop the load balance at this level. There is another
+                * CPU in our sched group which is doing load balancing more
+                * actively.
+                */
+               if (!balance)
+                       break;
        }
        this_rq->next_balance = next_balance;
 }
@@ -2944,23 +3006,6 @@ static inline void idle_balance(int cpu, struct rq *rq)
 }
 #endif
 
-static inline void wake_priority_sleeper(struct rq *rq)
-{
-#ifdef CONFIG_SCHED_SMT
-       if (!rq->nr_running)
-               return;
-
-       spin_lock(&rq->lock);
-       /*
-        * If an SMT sibling task has been put to sleep for priority
-        * reasons reschedule the idle task to see if it can now run.
-        */
-       if (rq->nr_running)
-               resched_task(rq->idle);
-       spin_unlock(&rq->lock);
-#endif
-}
-
 DEFINE_PER_CPU(struct kernel_stat, kstat);
 
 EXPORT_PER_CPU_SYMBOL(kstat);
@@ -3177,10 +3222,7 @@ void scheduler_tick(void)
 
        update_cpu_clock(p, rq, now);
 
-       if (p == rq->idle)
-               /* Task on the idle queue */
-               wake_priority_sleeper(rq);
-       else
+       if (p != rq->idle)
                task_running_tick(rq, p);
 #ifdef CONFIG_SMP
        update_load(rq);
@@ -3189,136 +3231,6 @@ void scheduler_tick(void)
 #endif
 }
 
-#ifdef CONFIG_SCHED_SMT
-static inline void wakeup_busy_runqueue(struct rq *rq)
-{
-       /* If an SMT runqueue is sleeping due to priority reasons wake it up */
-       if (rq->curr == rq->idle && rq->nr_running)
-               resched_task(rq->idle);
-}
-
-/*
- * Called with interrupt disabled and this_rq's runqueue locked.
- */
-static void wake_sleeping_dependent(int this_cpu)
-{
-       struct sched_domain *tmp, *sd = NULL;
-       int i;
-
-       for_each_domain(this_cpu, tmp) {
-               if (tmp->flags & SD_SHARE_CPUPOWER) {
-                       sd = tmp;
-                       break;
-               }
-       }
-
-       if (!sd)
-               return;
-
-       for_each_cpu_mask(i, sd->span) {
-               struct rq *smt_rq = cpu_rq(i);
-
-               if (i == this_cpu)
-                       continue;
-               if (unlikely(!spin_trylock(&smt_rq->lock)))
-                       continue;
-
-               wakeup_busy_runqueue(smt_rq);
-               spin_unlock(&smt_rq->lock);
-       }
-}
-
-/*
- * number of 'lost' timeslices this task wont be able to fully
- * utilize, if another task runs on a sibling. This models the
- * slowdown effect of other tasks running on siblings:
- */
-static inline unsigned long
-smt_slice(struct task_struct *p, struct sched_domain *sd)
-{
-       return p->time_slice * (100 - sd->per_cpu_gain) / 100;
-}
-
-/*
- * To minimise lock contention and not have to drop this_rq's runlock we only
- * trylock the sibling runqueues and bypass those runqueues if we fail to
- * acquire their lock. As we only trylock the normal locking order does not
- * need to be obeyed.
- */
-static int
-dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p)
-{
-       struct sched_domain *tmp, *sd = NULL;
-       int ret = 0, i;
-
-       /* kernel/rt threads do not participate in dependent sleeping */
-       if (!p->mm || rt_task(p))
-               return 0;
-
-       for_each_domain(this_cpu, tmp) {
-               if (tmp->flags & SD_SHARE_CPUPOWER) {
-                       sd = tmp;
-                       break;
-               }
-       }
-
-       if (!sd)
-               return 0;
-
-       for_each_cpu_mask(i, sd->span) {
-               struct task_struct *smt_curr;
-               struct rq *smt_rq;
-
-               if (i == this_cpu)
-                       continue;
-
-               smt_rq = cpu_rq(i);
-               if (unlikely(!spin_trylock(&smt_rq->lock)))
-                       continue;
-
-               smt_curr = smt_rq->curr;
-
-               if (!smt_curr->mm)
-                       goto unlock;
-
-               /*
-                * If a user task with lower static priority than the
-                * running task on the SMT sibling is trying to schedule,
-                * delay it till there is proportionately less timeslice
-                * left of the sibling task to prevent a lower priority
-                * task from using an unfair proportion of the
-                * physical cpu's resources. -ck
-                */
-               if (rt_task(smt_curr)) {
-                       /*
-                        * With real time tasks we run non-rt tasks only
-                        * per_cpu_gain% of the time.
-                        */
-                       if ((jiffies % DEF_TIMESLICE) >
-                               (sd->per_cpu_gain * DEF_TIMESLICE / 100))
-                                       ret = 1;
-               } else {
-                       if (smt_curr->static_prio < p->static_prio &&
-                               !TASK_PREEMPTS_CURR(p, smt_rq) &&
-                               smt_slice(smt_curr, sd) > task_timeslice(p))
-                                       ret = 1;
-               }
-unlock:
-               spin_unlock(&smt_rq->lock);
-       }
-       return ret;
-}
-#else
-static inline void wake_sleeping_dependent(int this_cpu)
-{
-}
-static inline int
-dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p)
-{
-       return 0;
-}
-#endif
-
 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
 
 void fastcall add_preempt_count(int val)
@@ -3332,7 +3244,8 @@ void fastcall add_preempt_count(int val)
        /*
         * Spinlock count overflowing soon?
         */
-       DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
+       DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
+                               PREEMPT_MASK - 10);
 }
 EXPORT_SYMBOL(add_preempt_count);
 
@@ -3386,6 +3299,8 @@ asmlinkage void __sched schedule(void)
                        "%s/0x%08x/%d\n",
                        current->comm, preempt_count(), current->pid);
                debug_show_held_locks(current);
+               if (irqs_disabled())
+                       print_irqtrace_events(current);
                dump_stack();
        }
        profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -3442,7 +3357,6 @@ need_resched_nonpreemptible:
                if (!rq->nr_running) {
                        next = rq->idle;
                        rq->expired_timestamp = 0;
-                       wake_sleeping_dependent(cpu);
                        goto switch_tasks;
                }
        }
@@ -3482,8 +3396,6 @@ need_resched_nonpreemptible:
                }
        }
        next->sleep_type = SLEEP_NORMAL;
-       if (dependent_sleeper(cpu, rq, next))
-               next = rq->idle;
 switch_tasks:
        if (next == rq->idle)
                schedstat_inc(rq, sched_goidle);
@@ -3501,7 +3413,7 @@ switch_tasks:
 
        sched_info_switch(prev, next);
        if (likely(prev != next)) {
-               next->timestamp = now;
+               next->timestamp = next->last_ran = now;
                rq->nr_switches++;
                rq->curr = next;
                ++*switch_count;
@@ -4147,13 +4059,12 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
 }
 
 /**
- * sched_setscheduler - change the scheduling policy and/or RT priority of
- * a thread.
+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
  * @p: the task in question.
  * @policy: new policy.
  * @param: structure containing the new RT priority.
  *
- * NOTE: the task may be already dead
+ * NOTE that the task may be already dead.
  */
 int sched_setscheduler(struct task_struct *p, int policy,
                       struct sched_param *param)
@@ -4521,7 +4432,7 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
 /**
  * sys_sched_yield - yield the current processor to other threads.
  *
- * this function yields the current CPU by moving the calling thread
+ * This function yields the current CPU by moving the calling thread
  * to the expired array. If there are no other threads running on this
  * CPU then this function will return.
  */
@@ -4571,15 +4482,6 @@ asmlinkage long sys_sched_yield(void)
        return 0;
 }
 
-static inline int __resched_legal(int expected_preempt_count)
-{
-       if (unlikely(preempt_count() != expected_preempt_count))
-               return 0;
-       if (unlikely(system_state != SYSTEM_RUNNING))
-               return 0;
-       return 1;
-}
-
 static void __cond_resched(void)
 {
 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -4599,7 +4501,8 @@ static void __cond_resched(void)
 
 int __sched cond_resched(void)
 {
-       if (need_resched() && __resched_legal(0)) {
+       if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
+                                       system_state == SYSTEM_RUNNING) {
                __cond_resched();
                return 1;
        }
@@ -4625,7 +4528,7 @@ int cond_resched_lock(spinlock_t *lock)
                ret = 1;
                spin_lock(lock);
        }
-       if (need_resched() && __resched_legal(1)) {
+       if (need_resched() && system_state == SYSTEM_RUNNING) {
                spin_release(&lock->dep_map, 1, _THIS_IP_);
                _raw_spin_unlock(lock);
                preempt_enable_no_resched();
@@ -4641,7 +4544,7 @@ int __sched cond_resched_softirq(void)
 {
        BUG_ON(!in_softirq());
 
-       if (need_resched() && __resched_legal(0)) {
+       if (need_resched() && system_state == SYSTEM_RUNNING) {
                raw_local_irq_disable();
                _local_bh_enable();
                raw_local_irq_enable();
@@ -4656,7 +4559,7 @@ EXPORT_SYMBOL(cond_resched_softirq);
 /**
  * yield - yield the current processor to other threads.
  *
- * this is a shortcut for kernel-space yielding - it marks the
+ * This is a shortcut for kernel-space yielding - it marks the
  * thread runnable and calls sys_sched_yield().
  */
 void __sched yield(void)
@@ -4784,32 +4687,10 @@ out_unlock:
        return retval;
 }
 
-static inline struct task_struct *eldest_child(struct task_struct *p)
-{
-       if (list_empty(&p->children))
-               return NULL;
-       return list_entry(p->children.next,struct task_struct,sibling);
-}
-
-static inline struct task_struct *older_sibling(struct task_struct *p)
-{
-       if (p->sibling.prev==&p->parent->children)
-               return NULL;
-       return list_entry(p->sibling.prev,struct task_struct,sibling);
-}
-
-static inline struct task_struct *younger_sibling(struct task_struct *p)
-{
-       if (p->sibling.next==&p->parent->children)
-               return NULL;
-       return list_entry(p->sibling.next,struct task_struct,sibling);
-}
-
 static const char stat_nam[] = "RSDTtZX";
 
 static void show_task(struct task_struct *p)
 {
-       struct task_struct *relative;
        unsigned long free = 0;
        unsigned state;
 
@@ -4835,19 +4716,7 @@ static void show_task(struct task_struct *p)
                free = (unsigned long)n - (unsigned long)end_of_stack(p);
        }
 #endif
-       printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
-       if ((relative = eldest_child(p)))
-               printk("%5d ", relative->pid);
-       else
-               printk("      ");
-       if ((relative = younger_sibling(p)))
-               printk("%7d", relative->pid);
-       else
-               printk("       ");
-       if ((relative = older_sibling(p)))
-               printk(" %5d", relative->pid);
-       else
-               printk("      ");
+       printk("%5lu %5d %6d", free, p->pid, p->parent->pid);
        if (!p->mm)
                printk(" (L-TLB)\n");
        else
@@ -4877,7 +4746,7 @@ void show_state_filter(unsigned long state_filter)
                 * console might take alot of time:
                 */
                touch_nmi_watchdog();
-               if (p->state & state_filter)
+               if (!state_filter || (p->state & state_filter))
                        show_task(p);
        } while_each_thread(g, p);
 
@@ -5405,16 +5274,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
                if (!(sd->flags & SD_LOAD_BALANCE)) {
                        printk("does not load-balance\n");
                        if (sd->parent)
-                               printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
+                               printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
+                                               " has parent");
                        break;
                }
 
                printk("span %s\n", str);
 
                if (!cpu_isset(cpu, sd->span))
-                       printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
+                       printk(KERN_ERR "ERROR: domain->span does not contain "
+                                       "CPU%d\n", cpu);
                if (!cpu_isset(cpu, group->cpumask))
-                       printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
+                       printk(KERN_ERR "ERROR: domain->groups does not contain"
+                                       " CPU%d\n", cpu);
 
                printk(KERN_DEBUG);
                for (i = 0; i < level + 2; i++)
@@ -5429,7 +5301,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 
                        if (!group->cpu_power) {
                                printk("\n");
-                               printk(KERN_ERR "ERROR: domain->cpu_power not set\n");
+                               printk(KERN_ERR "ERROR: domain->cpu_power not "
+                                               "set\n");
                        }
 
                        if (!cpus_weight(group->cpumask)) {
@@ -5452,15 +5325,17 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
                printk("\n");
 
                if (!cpus_equal(sd->span, groupmask))
-                       printk(KERN_ERR "ERROR: groups don't span domain->span\n");
+                       printk(KERN_ERR "ERROR: groups don't span "
+                                       "domain->span\n");
 
                level++;
                sd = sd->parent;
+               if (!sd)
+                       continue;
 
-               if (sd) {
-                       if (!cpus_subset(groupmask, sd->span))
-                               printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
-               }
+               if (!cpus_subset(groupmask, sd->span))
+                       printk(KERN_ERR "ERROR: parent span is not a superset "
+                               "of domain->span\n");
 
        } while (sd);
 }
@@ -5556,7 +5431,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu)
 }
 
 /* cpus with isolated domains */
-static cpumask_t __cpuinitdata cpu_isolated_map = CPU_MASK_NONE;
+static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
 
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
@@ -5778,8 +5653,9 @@ __setup("max_cache_size=", setup_max_cache_size);
  */
 static void touch_cache(void *__cache, unsigned long __size)
 {
-       unsigned long size = __size/sizeof(long), chunk1 = size/3,
-                       chunk2 = 2*size/3;
+       unsigned long size = __size / sizeof(long);
+       unsigned long chunk1 = size / 3;
+       unsigned long chunk2 = 2 * size / 3;
        unsigned long *cache = __cache;
        int i;
 
@@ -5888,11 +5764,11 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
         */
        measure_one(cache, size, cpu1, cpu2);
        for (i = 0; i < ITERATIONS; i++)
-               cost1 += measure_one(cache, size - i*1024, cpu1, cpu2);
+               cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2);
 
        measure_one(cache, size, cpu2, cpu1);
        for (i = 0; i < ITERATIONS; i++)
-               cost1 += measure_one(cache, size - i*1024, cpu2, cpu1);
+               cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1);
 
        /*
         * (We measure the non-migrating [cached] cost on both
@@ -5902,17 +5778,17 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
 
        measure_one(cache, size, cpu1, cpu1);
        for (i = 0; i < ITERATIONS; i++)
-               cost2 += measure_one(cache, size - i*1024, cpu1, cpu1);
+               cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1);
 
        measure_one(cache, size, cpu2, cpu2);
        for (i = 0; i < ITERATIONS; i++)
-               cost2 += measure_one(cache, size - i*1024, cpu2, cpu2);
+               cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2);
 
        /*
         * Get the per-iteration migration cost:
         */
-       do_div(cost1, 2*ITERATIONS);
-       do_div(cost2, 2*ITERATIONS);
+       do_div(cost1, 2 * ITERATIONS);
+       do_div(cost2, 2 * ITERATIONS);
 
        return cost1 - cost2;
 }
@@ -5950,7 +5826,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
         */
        cache = vmalloc(max_size);
        if (!cache) {
-               printk("could not vmalloc %d bytes for cache!\n", 2*max_size);
+               printk("could not vmalloc %d bytes for cache!\n", 2 * max_size);
                return 1000000; /* return 1 msec on very small boxen */
        }
 
@@ -5975,7 +5851,8 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
                avg_fluct = (avg_fluct + fluct)/2;
 
                if (migration_debug)
-                       printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n",
+                       printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): "
+                               "(%8Ld %8Ld)\n",
                                cpu1, cpu2, size,
                                (long)cost / 1000000,
                                ((long)cost / 100000) % 10,
@@ -6070,20 +5947,18 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map)
                        -1
 #endif
                );
-       if (system_state == SYSTEM_BOOTING) {
-               if (num_online_cpus() > 1) {
-                       printk("migration_cost=");
-                       for (distance = 0; distance <= max_distance; distance++) {
-                               if (distance)
-                                       printk(",");
-                               printk("%ld", (long)migration_cost[distance] / 1000);
-                       }
-                       printk("\n");
+       if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) {
+               printk("migration_cost=");
+               for (distance = 0; distance <= max_distance; distance++) {
+                       if (distance)
+                               printk(",");
+                       printk("%ld", (long)migration_cost[distance] / 1000);
                }
+               printk("\n");
        }
        j1 = jiffies;
        if (migration_debug)
-               printk("migration: %ld seconds\n", (j1-j0)/HZ);
+               printk("migration: %ld seconds\n", (j1-j0) / HZ);
 
        /*
         * Move back to the original CPU. NUMA-Q gets confused
@@ -6821,7 +6696,7 @@ void __init sched_init_smp(void)
 
        lock_cpu_hotplug();
        arch_init_sched_domains(&cpu_online_map);
-       cpus_andnot(non_isolated_cpus, cpu_online_map, cpu_isolated_map);
+       cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
        if (cpus_empty(non_isolated_cpus))
                cpu_set(smp_processor_id(), non_isolated_cpus);
        unlock_cpu_hotplug();
@@ -6928,6 +6803,8 @@ void __might_sleep(char *file, int line)
                printk("in_atomic():%d, irqs_disabled():%d\n",
                        in_atomic(), irqs_disabled());
                debug_show_held_locks(current);
+               if (irqs_disabled())
+                       print_irqtrace_events(current);
                dump_stack();
        }
 #endif