diff --git a/kernel/sched.c b/kernel/sched.c
index 3ee2ae4..f06d059 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -49,6 +49,7 @@
 #include <linux/syscalls.h>
 #include <linux/times.h>
 #include <linux/acct.h>
+#include <linux/kprobes.h>
 #include <asm/tlb.h>
 
 #include <asm/unistd.h>
        (v1) * (v2_max) / (v1_max)
 
 #define DELTA(p) \
-       (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA)
+       (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
+               INTERACTIVE_DELTA)
 
 #define TASK_INTERACTIVE(p) \
        ((p)->prio <= (p)->static_prio - DELTA(p))
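The reworked DELTA() keeps SCALE()'s integer division away from negative
operands: TASK_NICE() spans -20..19, and C division truncates toward zero,
so the old form rounded negative-nice results toward zero while rounding
positive-nice results down. Offsetting into 0..39 first and subtracting the
scaled offset (20 * MAX_BONUS / 40) gives a consistent floor across the
whole range. A stand-alone worked example, assuming the stock MAX_BONUS of
10 and INTERACTIVE_DELTA of 2:

/*
 * Worked example of the DELTA() change; MAX_BONUS and
 * INTERACTIVE_DELTA are the stock values, assumed here.
 */
#include <stdio.h>

#define MAX_BONUS         10
#define INTERACTIVE_DELTA  2
#define SCALE(v1, v1_max, v2_max) ((v1) * (v2_max) / (v1_max))

static int delta_old(int nice)
{
        /* truncates toward zero for negative nice */
        return SCALE(nice, 40, MAX_BONUS) + INTERACTIVE_DELTA;
}

static int delta_new(int nice)
{
        /* shift into 0..39, scale, then subtract the scaled offset */
        return SCALE(nice + 20, 40, MAX_BONUS)
                - 20 * MAX_BONUS / 40 + INTERACTIVE_DELTA;
}

int main(void)
{
        /* nice -1: old = -10/40 + 2 = 2 (indistinct from nice 0),
         *          new = 19*10/40 - 5 + 2 = 1 (one step below). */
        printf("%d %d\n", delta_old(-1), delta_new(-1));
        return 0;
}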
@@ -178,13 +180,6 @@ static unsigned int task_timeslice(task_t *p)
 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran)      \
                                < (long long) (sd)->cache_hot_time)
 
-void __put_task_struct_cb(struct rcu_head *rhp)
-{
-       __put_task_struct(container_of(rhp, struct task_struct, rcu));
-}
-
-EXPORT_SYMBOL_GPL(__put_task_struct_cb);
-
 /*
  * These are the runqueue data structures:
  */
@@ -215,7 +210,6 @@ struct runqueue {
         */
        unsigned long nr_running;
 #ifdef CONFIG_SMP
-       unsigned long prio_bias;
        unsigned long cpu_load[3];
 #endif
        unsigned long long nr_switches;
@@ -245,6 +239,7 @@ struct runqueue {
 
        task_t *migration_thread;
        struct list_head migration_queue;
+       int cpu;
 #endif
 
 #ifdef CONFIG_SCHEDSTATS
@@ -669,68 +664,17 @@ static int effective_prio(task_t *p)
        return prio;
 }
 
-#ifdef CONFIG_SMP
-static inline void inc_prio_bias(runqueue_t *rq, int prio)
-{
-       rq->prio_bias += MAX_PRIO - prio;
-}
-
-static inline void dec_prio_bias(runqueue_t *rq, int prio)
-{
-       rq->prio_bias -= MAX_PRIO - prio;
-}
-
-static inline void inc_nr_running(task_t *p, runqueue_t *rq)
-{
-       rq->nr_running++;
-       if (rt_task(p)) {
-               if (p != rq->migration_thread)
-                       /*
-                        * The migration thread does the actual balancing. Do
-                        * not bias by its priority as the ultra high priority
-                        * will skew balancing adversely.
-                        */
-                       inc_prio_bias(rq, p->prio);
-       } else
-               inc_prio_bias(rq, p->static_prio);
-}
-
-static inline void dec_nr_running(task_t *p, runqueue_t *rq)
-{
-       rq->nr_running--;
-       if (rt_task(p)) {
-               if (p != rq->migration_thread)
-                       dec_prio_bias(rq, p->prio);
-       } else
-               dec_prio_bias(rq, p->static_prio);
-}
-#else
-static inline void inc_prio_bias(runqueue_t *rq, int prio)
-{
-}
-
-static inline void dec_prio_bias(runqueue_t *rq, int prio)
-{
-}
-
-static inline void inc_nr_running(task_t *p, runqueue_t *rq)
-{
-       rq->nr_running++;
-}
-
-static inline void dec_nr_running(task_t *p, runqueue_t *rq)
-{
-       rq->nr_running--;
-}
-#endif
-
 /*
  * __activate_task - move a task to the runqueue.
  */
-static inline void __activate_task(task_t *p, runqueue_t *rq)
+static void __activate_task(task_t *p, runqueue_t *rq)
 {
-       enqueue_task(p, rq->active);
-       inc_nr_running(p, rq);
+       prio_array_t *target = rq->active;
+
+       if (batch_task(p))
+               target = rq->expired;
+       enqueue_task(p, target);
+       rq->nr_running++;
 }
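The rewritten __activate_task() is also where SCHED_BATCH gets its
semantics: batch tasks are queued straight onto the expired array, so they
only run after every task on the active array has drained its timeslice. A
toy model of that decision; the struct layouts are simplified assumptions,
and batch_task(p) is taken to be a helper for the old explicit
p->policy == SCHED_BATCH test, as the recalc_task_prio() hunk below
suggests:

/* Toy model only; these are not the kernel's definitions. */
#include <stdbool.h>

enum { SCHED_NORMAL, SCHED_BATCH };

struct task  { int policy; };
struct array { unsigned int nr_active; };
struct rq    { struct array active, expired; unsigned long nr_running; };

static bool batch_task(const struct task *p)
{
        return p->policy == SCHED_BATCH;
}

static void activate(struct task *p, struct rq *rq)
{
        /* batch work waits for the active array to drain */
        struct array *target = batch_task(p) ? &rq->expired : &rq->active;

        target->nr_active++;
        rq->nr_running++;
}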
 
 /*
@@ -739,7 +683,7 @@ static inline void __activate_task(task_t *p, runqueue_t *rq)
 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 {
        enqueue_task_head(p, rq->active);
-       inc_nr_running(p, rq);
+       rq->nr_running++;
 }
 
 static int recalc_task_prio(task_t *p, unsigned long long now)
@@ -748,7 +692,7 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
        unsigned long long __sleep_time = now - p->timestamp;
        unsigned long sleep_time;
 
-       if (unlikely(p->policy == SCHED_BATCH))
+       if (batch_task(p))
                sleep_time = 0;
        else {
                if (__sleep_time > NS_MAX_SLEEP_AVG)
@@ -760,27 +704,25 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
        if (likely(sleep_time > 0)) {
                /*
                 * User tasks that sleep a long time are categorised as
-                * idle and will get just interactive status to stay active &
-                * prevent them suddenly becoming cpu hogs and starving
-                * other processes.
+                * idle. They will only have their sleep_avg increased to a
+                * level that gives them just-interactive priority, keeping
+                * them active yet preventing them from suddenly becoming
+                * cpu hogs and starving other processes.
                 */
-               if (p->mm && p->activated != -1 &&
-                       sleep_time > INTERACTIVE_SLEEP(p)) {
-                               p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
-                                               DEF_TIMESLICE);
-               } else {
-                       /*
-                        * The lower the sleep avg a task has the more
-                        * rapidly it will rise with sleep time.
-                        */
-                       sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
+               if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) {
+                               unsigned long ceiling;
 
+                               ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG -
+                                       DEF_TIMESLICE);
+                               if (p->sleep_avg < ceiling)
+                                       p->sleep_avg = ceiling;
+               } else {
                        /*
                         * Tasks waking from uninterruptible sleep are
                         * limited in their sleep_avg rise as they
                         * are likely to be waiting on I/O
                         */
-                       if (p->activated == -1 && p->mm) {
+                       if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
                                if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
                                        sleep_time = 0;
                                else if (p->sleep_avg + sleep_time >=
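Two behavioural changes hide in this hunk: a long-sleeping user task now
has its sleep_avg raised to the just-interactive ceiling only when it was
below it (the old unconditional assignment could pull an already-higher
sleep_avg down), and the old amplification step
"sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1" is gone entirely.
The clamp in isolation, with the ceiling treated as a plain parameter:

/*
 * Raise-only clamp; ceiling corresponds to
 * JIFFIES_TO_NS(MAX_SLEEP_AVG - DEF_TIMESLICE) in the hunk above.
 */
static unsigned long apply_sleep_ceiling(unsigned long sleep_avg,
                                         unsigned long ceiling)
{
        return sleep_avg < ceiling ? ceiling : sleep_avg;
}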
@@ -835,7 +777,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
         * This checks to make sure it's not an uninterruptible task
         * that is now waking up.
         */
-       if (!p->activated) {
+       if (p->sleep_type == SLEEP_NORMAL) {
                /*
                 * Tasks which were woken up by interrupts (ie. hw events)
                 * are most likely of interactive nature. So we give them
@@ -844,13 +786,13 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
                 * on a CPU, first time around:
                 */
                if (in_interrupt())
-                       p->activated = 2;
+                       p->sleep_type = SLEEP_INTERRUPTED;
                else {
                        /*
                         * Normal first-time wakeups get a credit too for
                         * on-runqueue time, but it will be weighted down:
                         */
-                       p->activated = 1;
+                       p->sleep_type = SLEEP_INTERACTIVE;
                }
        }
        p->timestamp = now;
@@ -863,7 +805,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
  */
 static void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
-       dec_nr_running(p, rq);
+       rq->nr_running--;
        dequeue_task(p, p->array);
        p->array = NULL;
 }
@@ -1007,61 +949,27 @@ void kick_process(task_t *p)
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
  */
-static unsigned long __source_load(int cpu, int type, enum idle_type idle)
+static inline unsigned long source_load(int cpu, int type)
 {
        runqueue_t *rq = cpu_rq(cpu);
-       unsigned long running = rq->nr_running;
-       unsigned long source_load, cpu_load = rq->cpu_load[type-1],
-               load_now = running * SCHED_LOAD_SCALE;
-
+       unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
        if (type == 0)
-               source_load = load_now;
-       else
-               source_load = min(cpu_load, load_now);
-
-       if (running > 1 || (idle == NOT_IDLE && running))
-               /*
-                * If we are busy rebalancing the load is biased by
-                * priority to create 'nice' support across cpus. When
-                * idle rebalancing we should only bias the source_load if
-                * there is more than one task running on that queue to
-                * prevent idle rebalance from trying to pull tasks from a
-                * queue with only one running task.
-                */
-               source_load = source_load * rq->prio_bias / running;
+               return load_now;
 
-       return source_load;
-}
-
-static inline unsigned long source_load(int cpu, int type)
-{
-       return __source_load(cpu, type, NOT_IDLE);
+       return min(rq->cpu_load[type-1], load_now);
 }
 
 /*
  * Return a high guess at the load of a migration-target cpu
  */
-static inline unsigned long __target_load(int cpu, int type, enum idle_type idle)
+static inline unsigned long target_load(int cpu, int type)
 {
        runqueue_t *rq = cpu_rq(cpu);
-       unsigned long running = rq->nr_running;
-       unsigned long target_load, cpu_load = rq->cpu_load[type-1],
-               load_now = running * SCHED_LOAD_SCALE;
-
+       unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
        if (type == 0)
-               target_load = load_now;
-       else
-               target_load = max(cpu_load, load_now);
-
-       if (running > 1 || (idle == NOT_IDLE && running))
-               target_load = target_load * rq->prio_bias / running;
+               return load_now;
 
-       return target_load;
-}
-
-static inline unsigned long target_load(int cpu, int type)
-{
-       return __target_load(cpu, type, NOT_IDLE);
+       return max(rq->cpu_load[type-1], load_now);
 }
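With prio_bias gone, both estimators collapse to a comparison between the
instantaneous load and the decayed cpu_load[] history: source_load() takes
the min so migration sources are under-estimated (pull conservatively),
target_load() takes the max so targets are over-estimated (push
conservatively). The pair in isolation, with the runqueue fields passed in
as plain parameters (the SCHED_LOAD_SCALE value here is an assumption):

#define SCHED_LOAD_SCALE 128UL

/* Low guess at a migration source's load. */
static unsigned long src_load(unsigned long nr_running,
                              const unsigned long *cpu_load, int type)
{
        unsigned long load_now = nr_running * SCHED_LOAD_SCALE;

        if (type == 0)
                return load_now;
        return cpu_load[type - 1] < load_now ? cpu_load[type - 1] : load_now;
}

/* High guess at a migration target's load: the mirror image. */
static unsigned long tgt_load(unsigned long nr_running,
                              const unsigned long *cpu_load, int type)
{
        unsigned long load_now = nr_running * SCHED_LOAD_SCALE;

        if (type == 0)
                return load_now;
        return cpu_load[type - 1] > load_now ? cpu_load[type - 1] : load_now;
}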
 
 /*
@@ -1294,9 +1202,6 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync)
                }
        }
 
-       if (p->last_waker_cpu != this_cpu)
-               goto out_set_cpu;
-
        if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
                goto out_set_cpu;
 
@@ -1367,8 +1272,6 @@ out_set_cpu:
                cpu = task_cpu(p);
        }
 
-       p->last_waker_cpu = this_cpu;
-
 out_activate:
 #endif /* CONFIG_SMP */
        if (old_state == TASK_UNINTERRUPTIBLE) {
@@ -1377,19 +1280,19 @@ out_activate:
                 * Tasks on involuntary sleep don't earn
                 * sleep_avg beyond just interactive state.
                 */
-               p->activated = -1;
-       }
+               p->sleep_type = SLEEP_NONINTERACTIVE;
+       } else
 
        /*
         * Tasks that have marked their sleep as noninteractive get
-        * woken up without updating their sleep average. (i.e. their
-        * sleep is handled in a priority-neutral manner, no priority
-        * boost and no penalty.)
+        * woken up with their sleep average not weighted in an
+        * interactive way.
         */
-       if (old_state & TASK_NONINTERACTIVE)
-               __activate_task(p, rq);
-       else
-               activate_task(p, rq, cpu == this_cpu);
+               if (old_state & TASK_NONINTERACTIVE)
+                       p->sleep_type = SLEEP_NONINTERACTIVE;
+
+
+       activate_task(p, rq, cpu == this_cpu);
        /*
         * Sync wakeups (i.e. those types of wakeups where the waker
         * has indicated that it will leave the CPU in short order)
@@ -1450,12 +1353,9 @@ void fastcall sched_fork(task_t *p, int clone_flags)
 #ifdef CONFIG_SCHEDSTATS
        memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
-#if defined(CONFIG_SMP)
-       p->last_waker_cpu = cpu;
-#if defined(__ARCH_WANT_UNLOCKED_CTXSW)
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
        p->oncpu = 0;
 #endif
-#endif
 #ifdef CONFIG_PREEMPT
        /* Want to start with kernel preemption disabled. */
        task_thread_info(p)->preempt_count = 1;
@@ -1530,7 +1430,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
                                list_add_tail(&p->run_list, &current->run_list);
                                p->array = current->array;
                                p->array->nr_active++;
-                               inc_nr_running(p, rq);
+                               rq->nr_running++;
                        }
                        set_need_resched();
                } else
@@ -1656,8 +1556,14 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
        finish_lock_switch(rq, prev);
        if (mm)
                mmdrop(mm);
-       if (unlikely(prev_task_flags & PF_DEAD))
+       if (unlikely(prev_task_flags & PF_DEAD)) {
+               /*
+                * Remove function-return probe instances associated with this
+                * task and put them back on the free list.
+                */
+               kprobe_flush_task(prev);
                put_task_struct(prev);
+       }
 }
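kprobe_flush_task() is needed because function-return probes hand out
per-hit instances from a fixed pool (maxactive); a task that dies between
probed-function entry and return would otherwise strand its instances. For
context, a kretprobe module of the usual shape; this is a sketch using
current-kernel field names (in the 2.6.1x era the probed address was set
through .kp.addr rather than .kp.symbol_name):

#include <linux/kprobes.h>
#include <linux/module.h>

static int my_ret_handler(struct kretprobe_instance *ri,
                          struct pt_regs *regs)
{
        return 0;       /* the function's return state is in regs */
}

static struct kretprobe my_kretprobe = {
        .handler        = my_ret_handler,
        .maxactive      = 20,           /* size of the instance pool */
        .kp.symbol_name = "do_fork",    /* example target */
};

static int __init rp_init(void)
{
        return register_kretprobe(&my_kretprobe);
}

static void __exit rp_exit(void)
{
        unregister_kretprobe(&my_kretprobe);
}

module_init(rp_init);
module_exit(rp_exit);
MODULE_LICENSE("GPL");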
 
 /**
@@ -1727,7 +1633,7 @@ unsigned long nr_uninterruptible(void)
 {
        unsigned long i, sum = 0;
 
-       for_each_cpu(i)
+       for_each_possible_cpu(i)
                sum += cpu_rq(i)->nr_uninterruptible;
 
        /*
@@ -1744,7 +1650,7 @@ unsigned long long nr_context_switches(void)
 {
        unsigned long long i, sum = 0;
 
-       for_each_cpu(i)
+       for_each_possible_cpu(i)
                sum += cpu_rq(i)->nr_switches;
 
        return sum;
@@ -1754,17 +1660,35 @@ unsigned long nr_iowait(void)
 {
        unsigned long i, sum = 0;
 
-       for_each_cpu(i)
+       for_each_possible_cpu(i)
                sum += atomic_read(&cpu_rq(i)->nr_iowait);
 
        return sum;
 }
 
+unsigned long nr_active(void)
+{
+       unsigned long i, running = 0, uninterruptible = 0;
+
+       for_each_online_cpu(i) {
+               running += cpu_rq(i)->nr_running;
+               uninterruptible += cpu_rq(i)->nr_uninterruptible;
+       }
+
+       if (unlikely((long)uninterruptible < 0))
+               uninterruptible = 0;
+
+       return running + uninterruptible;
+}
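The signed check is not paranoia: nr_uninterruptible is incremented on the
runqueue a task goes to sleep on but decremented by whichever CPU wakes it,
so a single runqueue's unsigned counter can transiently wrap below zero,
and a lockless sum over online CPUs can momentarily come out "negative"
even though the system-wide total is exact. A minimal userspace
illustration of that accounting (assumptions only):

#include <stdio.h>

int main(void)
{
        unsigned long cpu0 = 0, cpu1 = 0;

        cpu0 += 1;      /* task blocks uninterruptibly on CPU0 */
        cpu1 -= 1;      /* CPU1 wakes it: cpu1 wraps to ULONG_MAX */

        /* The modular sum is exact (wraps back to 0), but cpu1 viewed
         * alone looks huge, hence nr_active()'s signed clamp. */
        printf("sum=%lu cpu1=%ld\n", cpu0 + cpu1, (long)cpu1);
        return 0;
}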
+
 #ifdef CONFIG_SMP
 
 /*
  * double_rq_lock - safely lock two runqueues
  *
+ * We must take them in cpu order to match code in
+ * dependent_sleeper and wake_dependent_sleeper.
+ *
  * Note this does not disable interrupts like task_rq_lock,
  * you need to do so manually before calling.
  */
@@ -1776,7 +1700,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
                spin_lock(&rq1->lock);
                __acquire(rq2->lock);   /* Fake it out ;) */
        } else {
-               if (rq1 < rq2) {
+               if (rq1->cpu < rq2->cpu) {
                        spin_lock(&rq1->lock);
                        spin_lock(&rq2->lock);
                } else {
@@ -1812,7 +1736,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
        __acquires(this_rq->lock)
 {
        if (unlikely(!spin_trylock(&busiest->lock))) {
-               if (busiest < this_rq) {
+               if (busiest->cpu < this_rq->cpu) {
                        spin_unlock(&this_rq->lock);
                        spin_lock(&busiest->lock);
                        spin_lock(&this_rq->lock);
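Both lock-taking paths now order by rq->cpu rather than by runqueue
address, matching the cpu-ordered locking in dependent_sleeper() and
wake_dependent_sleeper(); two paths sorting the same pair of locks by
different keys is a classic ABBA deadlock. The rule in a generic,
stand-alone form (pthread mutexes standing in for runqueue spinlocks):

#include <pthread.h>

struct rq_like {
        pthread_mutex_t lock;
        int cpu;                /* stands in for runqueue_t.cpu */
};

static void double_lock(struct rq_like *a, struct rq_like *b)
{
        if (a == b) {
                pthread_mutex_lock(&a->lock);
                return;
        }
        if (a->cpu < b->cpu) {          /* lower cpu id always first */
                pthread_mutex_lock(&a->lock);
                pthread_mutex_lock(&b->lock);
        } else {
                pthread_mutex_lock(&b->lock);
                pthread_mutex_lock(&a->lock);
        }
}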
@@ -1875,9 +1799,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
               runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
 {
        dequeue_task(p, src_array);
-       dec_nr_running(p, src_rq);
+       src_rq->nr_running--;
        set_task_cpu(p, this_cpu);
-       inc_nr_running(p, this_rq);
+       this_rq->nr_running++;
        enqueue_task(p, this_array);
        p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
                                + this_rq->timestamp_last_tick;
@@ -2056,9 +1980,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 
                        /* Bias balancing toward cpus of our domain */
                        if (local_group)
-                               load = __target_load(i, load_idx, idle);
+                               load = target_load(i, load_idx);
                        else
-                               load = __source_load(i, load_idx, idle);
+                               load = source_load(i, load_idx);
 
                        avg_load += load;
                }
@@ -2171,7 +2095,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group,
        int i;
 
        for_each_cpu_mask(i, group->cpumask) {
-               load = __source_load(i, 0, idle);
+               load = source_load(i, 0);
 
                if (load > max_load) {
                        max_load = load;
@@ -2959,6 +2883,12 @@ EXPORT_SYMBOL(sub_preempt_count);
 
 #endif
 
+static inline int interactive_sleep(enum sleep_type sleep_type)
+{
+       return (sleep_type == SLEEP_INTERACTIVE ||
+               sleep_type == SLEEP_INTERRUPTED);
+}
+
 /*
  * schedule() is the main scheduler function.
  */
@@ -2978,13 +2908,11 @@ asmlinkage void __sched schedule(void)
         * schedule() atomically, we ignore that path for now.
         * Otherwise, whine if we are scheduling when we should not be.
         */
-       if (likely(!current->exit_state)) {
-               if (unlikely(in_atomic())) {
-                       printk(KERN_ERR "scheduling while atomic: "
-                               "%s/0x%08x/%d\n",
-                               current->comm, preempt_count(), current->pid);
-                       dump_stack();
-               }
+       if (unlikely(in_atomic() && !current->exit_state)) {
+               printk(KERN_ERR "BUG: scheduling while atomic: "
+                       "%s/0x%08x/%d\n",
+                       current->comm, preempt_count(), current->pid);
+               dump_stack();
        }
        profile_hit(SCHED_PROFILING, __builtin_return_address(0));
 
@@ -3084,12 +3012,12 @@ go_idle:
        queue = array->queue + idx;
        next = list_entry(queue->next, task_t, run_list);
 
-       if (!rt_task(next) && next->activated > 0) {
+       if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
                unsigned long long delta = now - next->timestamp;
                if (unlikely((long long)(now - next->timestamp) < 0))
                        delta = 0;
 
-               if (next->activated == 1)
+               if (next->sleep_type == SLEEP_INTERACTIVE)
                        delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
 
                array = next->array;
@@ -3099,10 +3027,9 @@ go_idle:
                        dequeue_task(next, array);
                        next->prio = new_prio;
                        enqueue_task(next, array);
-               } else
-                       requeue_task(next, array);
+               }
        }
-       next->activated = 0;
+       next->sleep_type = SLEEP_NORMAL;
 switch_tasks:
        if (next == rq->idle)
                schedstat_inc(rq, sched_goidle);
@@ -3571,10 +3498,8 @@ void set_user_nice(task_t *p, long nice)
                goto out_unlock;
        }
        array = p->array;
-       if (array) {
+       if (array)
                dequeue_task(p, array);
-               dec_prio_bias(rq, p->static_prio);
-       }
 
        old_prio = p->prio;
        new_prio = NICE_TO_PRIO(nice);
@@ -3584,7 +3509,6 @@ void set_user_nice(task_t *p, long nice)
 
        if (array) {
                enqueue_task(p, array);
-               inc_prio_bias(rq, p->static_prio);
                /*
                 * If the task increased its priority or is running and
                 * lowered its priority, then reschedule its CPU:
@@ -3962,6 +3886,10 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
                        !capable(CAP_SYS_NICE))
                goto out_unlock;
 
+       retval = security_task_setscheduler(p, 0, NULL);
+       if (retval)
+               goto out_unlock;
+
        cpus_allowed = cpuset_cpus_allowed(p);
        cpus_and(new_mask, new_mask, cpus_allowed);
        retval = set_cpus_allowed(p, new_mask);
@@ -4030,8 +3958,11 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
        if (!p)
                goto out_unlock;
 
-       retval = 0;
-       cpus_and(*mask, p->cpus_allowed, cpu_possible_map);
+       retval = security_task_getscheduler(p);
+       if (retval)
+               goto out_unlock;
+
+       cpus_and(*mask, p->cpus_allowed, cpu_online_map);
 
 out_unlock:
        read_unlock(&tasklist_lock);
@@ -4122,6 +4053,9 @@ asmlinkage long sys_sched_yield(void)
 
 static inline void __cond_resched(void)
 {
+#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+       __might_sleep(__FILE__, __LINE__);
+#endif
        /*
         * The BKS might be reacquired before we have dropped
         * PREEMPT_ACTIVE, which could trigger a second
@@ -4129,6 +4063,8 @@ static inline void __cond_resched(void)
         */
        if (unlikely(preempt_count()))
                return;
+       if (unlikely(system_state != SYSTEM_RUNNING))
+               return;
        do {
                add_preempt_count(PREEMPT_ACTIVE);
                schedule();
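Two guards are added here: the __might_sleep() call makes
CONFIG_DEBUG_SPINLOCK_SLEEP kernels complain about cond_resched() callers
in atomic context even when no reschedule would actually happen, and the
SYSTEM_RUNNING test keeps early-boot callers from scheduling before the
scheduler is fully up. Typical caller shape, as a sketch (struct item and
process_one() are hypothetical; only cond_resched() is the real API):

#include <linux/sched.h>        /* cond_resched() */

struct item { int payload; };                   /* hypothetical */
static void process_one(struct item *it) { }    /* hypothetical stub */

static void process_all(struct item *items, int n)
{
        int i;

        for (i = 0; i < n; i++) {
                process_one(&items[i]);
                cond_resched(); /* may schedule; also catches atomic callers */
        }
}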
@@ -4216,7 +4152,7 @@ EXPORT_SYMBOL(yield);
  */
 void __sched io_schedule(void)
 {
-       struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
+       struct runqueue *rq = &__raw_get_cpu_var(runqueues);
 
        atomic_inc(&rq->nr_iowait);
        schedule();
@@ -4227,7 +4163,7 @@ EXPORT_SYMBOL(io_schedule);
 
 long __sched io_schedule_timeout(long timeout)
 {
-       struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
+       struct runqueue *rq = &__raw_get_cpu_var(runqueues);
        long ret;
 
        atomic_inc(&rq->nr_iowait);
@@ -4434,6 +4370,7 @@ void __devinit init_idle(task_t *idle, int cpu)
        runqueue_t *rq = cpu_rq(cpu);
        unsigned long flags;
 
+       idle->timestamp = sched_clock();
        idle->sleep_avg = 0;
        idle->array = NULL;
        idle->prio = MAX_PRIO;
@@ -4819,6 +4756,8 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
                break;
 #ifdef CONFIG_HOTPLUG_CPU
        case CPU_UP_CANCELED:
+               if (!cpu_rq(cpu)->migration_thread)
+                       break;
                /* Unbind it from offline cpu so it can run.  Fall thru. */
                kthread_bind(cpu_rq(cpu)->migration_thread,
                             any_online_cpu(cpu_online_map));
@@ -4861,7 +4800,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
 /* Register at highest priority so that task migration (migrate_all_tasks)
  * happens before everything else.
  */
-static struct notifier_block __devinitdata migration_notifier = {
+static struct notifier_block migration_notifier = {
        .notifier_call = migration_call,
        .priority = 10
 };
@@ -5141,7 +5080,7 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
 #define SEARCH_SCOPE           2
 #define MIN_CACHE_SIZE         (64*1024U)
 #define DEFAULT_CACHE_SIZE     (5*1024*1024U)
-#define ITERATIONS             2
+#define ITERATIONS             1
 #define SIZE_THRESH            130
 #define COST_THRESH            130
 
@@ -5159,7 +5098,18 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
 #define MAX_DOMAIN_DISTANCE 32
 
 static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
-               { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -1LL };
+               { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] =
+/*
+ * Architectures may override the migration cost and thus avoid
+ * boot-time calibration. Unit is nanoseconds. Mostly useful for
+ * virtualized hardware:
+ */
+#ifdef CONFIG_DEFAULT_MIGRATION_COST
+                       CONFIG_DEFAULT_MIGRATION_COST
+#else
+                       -1LL
+#endif
+};
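The "[ 0 ... MAX_DOMAIN_DISTANCE-1 ] =" form above is GCC's ranged
designated initializer: every element of the array gets the same fill
value at compile time. Stand-alone illustration of the pattern, with
invented names (ASSUMED_DEFAULT_COST stands in for
CONFIG_DEFAULT_MIGRATION_COST; -1 means "calibrate at boot"):

#define N_SLOTS 8

#ifdef ASSUMED_DEFAULT_COST
static long long cost[N_SLOTS] = { [0 ... N_SLOTS - 1] = ASSUMED_DEFAULT_COST };
#else
static long long cost[N_SLOTS] = { [0 ... N_SLOTS - 1] = -1LL };
#endif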
 
 /*
  * Allow override of migration cost - in units of microseconds.
@@ -5480,9 +5430,9 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
                                break;
                        }
                /*
-                * Increase the cachesize in 5% steps:
+                * Increase the cachesize in 10% steps:
                 */
-               size = size * 20 / 19;
+               size = size * 10 / 9;
        }
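For scale: size * 10 / 9 grows the probe size by 1/9, about 11.1% per step
(the comment's "10%" is approximate, just as the old 20/19 step was about
5.3% rather than 5%). Walking from MIN_CACHE_SIZE (64 KiB) up to
DEFAULT_CACHE_SIZE (5 MiB) is a factor of 80, i.e. roughly
ln(80)/ln(10/9) = 42 steps instead of about 85; combined with ITERATIONS
dropping from 2 to 1 above, boot-time calibration cost falls to roughly a
quarter.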
 
        if (migration_debug)
@@ -5551,13 +5501,15 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map)
                        -1
 #endif
                );
-       printk("migration_cost=");
-       for (distance = 0; distance <= max_distance; distance++) {
-               if (distance)
-                       printk(",");
-               printk("%ld", (long)migration_cost[distance] / 1000);
+       if (system_state == SYSTEM_BOOTING) {
+               printk("migration_cost=");
+               for (distance = 0; distance <= max_distance; distance++) {
+                       if (distance)
+                               printk(",");
+                       printk("%ld", (long)migration_cost[distance] / 1000);
+               }
+               printk("\n");
        }
-       printk("\n");
        j1 = jiffies;
        if (migration_debug)
                printk("migration: %ld seconds\n", (j1-j0)/HZ);
@@ -5662,11 +5614,31 @@ static int cpu_to_cpu_group(int cpu)
 }
 #endif
 
+#ifdef CONFIG_SCHED_MC
+static DEFINE_PER_CPU(struct sched_domain, core_domains);
+static struct sched_group sched_group_core[NR_CPUS];
+#endif
+
+#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
+static int cpu_to_core_group(int cpu)
+{
+       return first_cpu(cpu_sibling_map[cpu]);
+}
+#elif defined(CONFIG_SCHED_MC)
+static int cpu_to_core_group(int cpu)
+{
+       return cpu;
+}
+#endif
+
 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
 static struct sched_group sched_group_phys[NR_CPUS];
 static int cpu_to_phys_group(int cpu)
 {
-#ifdef CONFIG_SCHED_SMT
+#if defined(CONFIG_SCHED_MC)
+       cpumask_t mask = cpu_coregroup_map(cpu);
+       return first_cpu(mask);
+#elif defined(CONFIG_SCHED_SMT)
        return first_cpu(cpu_sibling_map[cpu]);
 #else
        return cpu;
@@ -5689,6 +5661,32 @@ static int cpu_to_allnodes_group(int cpu)
 {
        return cpu_to_node(cpu);
 }
+static void init_numa_sched_groups_power(struct sched_group *group_head)
+{
+       struct sched_group *sg = group_head;
+       int j;
+
+       if (!sg)
+               return;
+next_sg:
+       for_each_cpu_mask(j, sg->cpumask) {
+               struct sched_domain *sd;
+
+               sd = &per_cpu(phys_domains, j);
+               if (j != first_cpu(sd->groups->cpumask)) {
+                       /*
+                        * Only add "power" once for each
+                        * physical package.
+                        */
+                       continue;
+               }
+
+               sg->cpu_power += sd->groups->cpu_power;
+       }
+       sg = sg->next;
+       if (sg != group_head)
+               goto next_sg;
+}
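The goto in init_numa_sched_groups_power() walks a circular, singly linked
ring of groups: traversal stops when the cursor comes back around to the
head. The same traversal written as a do/while, with a simplified stand-in
struct:

struct group {
        struct group *next;     /* circular: last entry points at head */
        unsigned int cpu_power;
};

static unsigned int sum_ring_power(struct group *head)
{
        struct group *g = head;
        unsigned int sum = 0;

        if (!head)
                return 0;
        do {
                sum += g->cpu_power;    /* visit each group exactly once */
                g = g->next;
        } while (g != head);
        return sum;
}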
 #endif
 
 /*
@@ -5764,6 +5762,17 @@ void build_sched_domains(const cpumask_t *cpu_map)
                sd->parent = p;
                sd->groups = &sched_group_phys[group];
 
+#ifdef CONFIG_SCHED_MC
+               p = sd;
+               sd = &per_cpu(core_domains, i);
+               group = cpu_to_core_group(i);
+               *sd = SD_MC_INIT;
+               sd->span = cpu_coregroup_map(i);
+               cpus_and(sd->span, sd->span, *cpu_map);
+               sd->parent = p;
+               sd->groups = &sched_group_core[group];
+#endif
+
 #ifdef CONFIG_SCHED_SMT
                p = sd;
                sd = &per_cpu(cpu_domains, i);
@@ -5789,6 +5798,19 @@ void build_sched_domains(const cpumask_t *cpu_map)
        }
 #endif
 
+#ifdef CONFIG_SCHED_MC
+       /* Set up multi-core groups */
+       for_each_cpu_mask(i, *cpu_map) {
+               cpumask_t this_core_map = cpu_coregroup_map(i);
+               cpus_and(this_core_map, this_core_map, *cpu_map);
+               if (i != first_cpu(this_core_map))
+                       continue;
+               init_sched_build_groups(sched_group_core, this_core_map,
+                                       &cpu_to_core_group);
+       }
+#endif
+
+
        /* Set up physical groups */
        for (i = 0; i < MAX_NUMNODES; i++) {
                cpumask_t nodemask = node_to_cpumask(i);
@@ -5885,51 +5907,38 @@ void build_sched_domains(const cpumask_t *cpu_map)
                power = SCHED_LOAD_SCALE;
                sd->groups->cpu_power = power;
 #endif
+#ifdef CONFIG_SCHED_MC
+               sd = &per_cpu(core_domains, i);
+               power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
+                                           * SCHED_LOAD_SCALE / 10;
+               sd->groups->cpu_power = power;
+
+               sd = &per_cpu(phys_domains, i);
 
+               /*
+                * This has to be < 2 * SCHED_LOAD_SCALE.
+                * Let's keep it SCHED_LOAD_SCALE, so that
+                * while calculating NUMA group's cpu_power
+                * we can simply do
+                *  numa_group->cpu_power += phys_group->cpu_power;
+                *
+                * See the "only add power once for each physical
+                * pkg" comment in init_numa_sched_groups_power().
+                */
+               sd->groups->cpu_power = SCHED_LOAD_SCALE;
+#else
                sd = &per_cpu(phys_domains, i);
                power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
                                (cpus_weight(sd->groups->cpumask)-1) / 10;
                sd->groups->cpu_power = power;
-
-#ifdef CONFIG_NUMA
-               sd = &per_cpu(allnodes_domains, i);
-               if (sd->groups) {
-                       power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-                               (cpus_weight(sd->groups->cpumask)-1) / 10;
-                       sd->groups->cpu_power = power;
-               }
 #endif
        }
 
 #ifdef CONFIG_NUMA
-       for (i = 0; i < MAX_NUMNODES; i++) {
-               struct sched_group *sg = sched_group_nodes[i];
-               int j;
-
-               if (sg == NULL)
-                       continue;
-next_sg:
-               for_each_cpu_mask(j, sg->cpumask) {
-                       struct sched_domain *sd;
-                       int power;
+       for (i = 0; i < MAX_NUMNODES; i++)
+               init_numa_sched_groups_power(sched_group_nodes[i]);
 
-                       sd = &per_cpu(phys_domains, j);
-                       if (j != first_cpu(sd->groups->cpumask)) {
-                               /*
-                                * Only add "power" once for each
-                                * physical package.
-                                */
-                               continue;
-                       }
-                       power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-                               (cpus_weight(sd->groups->cpumask)-1) / 10;
-
-                       sg->cpu_power += power;
-               }
-               sg = sg->next;
-               if (sg != sched_group_nodes[i])
-                       goto next_sg;
-       }
+       init_numa_sched_groups_power(sched_group_allnodes);
 #endif
 
        /* Attach the domains */
@@ -5937,6 +5946,8 @@ next_sg:
                struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
                sd = &per_cpu(cpu_domains, i);
+#elif defined(CONFIG_SCHED_MC)
+               sd = &per_cpu(core_domains, i);
 #else
                sd = &per_cpu(phys_domains, i);
 #endif
@@ -6109,7 +6120,7 @@ void __init sched_init(void)
        runqueue_t *rq;
        int i, j, k;
 
-       for (i = 0; i < NR_CPUS; i++) {
+       for_each_possible_cpu(i) {
                prio_array_t *array;
 
                rq = cpu_rq(i);
@@ -6127,6 +6138,7 @@ void __init sched_init(void)
                rq->push_cpu = 0;
                rq->migration_thread = NULL;
                INIT_LIST_HEAD(&rq->migration_queue);
+               rq->cpu = i;
 #endif
                atomic_set(&rq->nr_iowait, 0);
 
@@ -6167,7 +6179,7 @@ void __might_sleep(char *file, int line)
                if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
                        return;
                prev_jiffy = jiffies;
-               printk(KERN_ERR "Debug: sleeping function called from invalid"
+               printk(KERN_ERR "BUG: sleeping function called from invalid"
                                " context at %s:%d\n", file, line);
                printk("in_atomic():%d, irqs_disabled():%d\n",
                        in_atomic(), irqs_disabled());