[PATCH] sched: no aggressive idle balancing
1 /*
2  *  kernel/sched.c
3  *
4  *  Kernel scheduler and related syscalls
5  *
6  *  Copyright (C) 1991-2002  Linus Torvalds
7  *
8  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
9  *              make semaphores SMP safe
10  *  1998-11-19  Implemented schedule_timeout() and related stuff
11  *              by Andrea Arcangeli
12  *  2002-01-04  New ultra-scalable O(1) scheduler by Ingo Molnar:
13  *              hybrid priority-list and round-robin design with
14  *              an array-switch method of distributing timeslices
15  *              and per-CPU runqueues.  Cleanups and useful suggestions
16  *              by Davide Libenzi, preemptible kernel bits by Robert Love.
17  *  2003-09-03  Interactivity tuning by Con Kolivas.
18  *  2004-04-02  Scheduler domains code by Nick Piggin
19  */
20
21 #include <linux/mm.h>
22 #include <linux/module.h>
23 #include <linux/nmi.h>
24 #include <linux/init.h>
25 #include <asm/uaccess.h>
26 #include <linux/highmem.h>
27 #include <linux/smp_lock.h>
28 #include <asm/mmu_context.h>
29 #include <linux/interrupt.h>
30 #include <linux/completion.h>
31 #include <linux/kernel_stat.h>
32 #include <linux/security.h>
33 #include <linux/notifier.h>
34 #include <linux/profile.h>
35 #include <linux/suspend.h>
36 #include <linux/blkdev.h>
37 #include <linux/delay.h>
38 #include <linux/smp.h>
39 #include <linux/threads.h>
40 #include <linux/timer.h>
41 #include <linux/rcupdate.h>
42 #include <linux/cpu.h>
43 #include <linux/cpuset.h>
44 #include <linux/percpu.h>
45 #include <linux/kthread.h>
46 #include <linux/seq_file.h>
47 #include <linux/syscalls.h>
48 #include <linux/times.h>
49 #include <linux/acct.h>
50 #include <asm/tlb.h>
51
52 #include <asm/unistd.h>
53
54 /*
55  * Convert user-nice values [ -20 ... 0 ... 19 ]
56  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
57  * and back.
58  */
59 #define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
60 #define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)
61 #define TASK_NICE(p)            PRIO_TO_NICE((p)->static_prio)
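
/*
 * A quick worked example of this mapping, assuming the usual 2.6
 * values MAX_RT_PRIO == 100 and MAX_PRIO == 140:
 *
 *   NICE_TO_PRIO(-20) == 100 + (-20) + 20 == 100
 *   NICE_TO_PRIO(  0) == 100 +    0  + 20 == 120
 *   NICE_TO_PRIO( 19) == 100 +   19  + 20 == 139
 *   PRIO_TO_NICE(120) == 120 -  100  - 20 ==   0
 */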
62
63 /*
64  * 'User priority' is the nice value converted to something we
65  * can work with better when scaling various scheduler parameters;
66  * it's a [ 0 ... 39 ] range.
67  */
68 #define USER_PRIO(p)            ((p)-MAX_RT_PRIO)
69 #define TASK_USER_PRIO(p)       USER_PRIO((p)->static_prio)
70 #define MAX_USER_PRIO           (USER_PRIO(MAX_PRIO))
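
/*
 * Illustration, under the same MAX_RT_PRIO == 100 / MAX_PRIO == 140
 * assumption: a nice 0 task has static_prio 120, so TASK_USER_PRIO()
 * is 120 - 100 == 20, and MAX_USER_PRIO is 140 - 100 == 40.
 */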
71
72 /*
73  * Some helpers for converting nanosecond timing to jiffy resolution
74  */
75 #define NS_TO_JIFFIES(TIME)     ((TIME) / (1000000000 / HZ))
76 #define JIFFIES_TO_NS(TIME)     ((TIME) * (1000000000 / HZ))
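
/*
 * For example, with HZ == 1000 (one jiffy per millisecond, an assumption
 * here - HZ is a config choice) the divisor is 1000000 ns, so
 * NS_TO_JIFFIES(5000000) == 5 and JIFFIES_TO_NS(5) == 5000000.
 */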
77
78 /*
79  * These are the 'tuning knobs' of the scheduler:
80  *
81  * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
82  * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
83  * Timeslices get refilled after they expire.
84  */
85 #define MIN_TIMESLICE           max(5 * HZ / 1000, 1)
86 #define DEF_TIMESLICE           (100 * HZ / 1000)
87 #define ON_RUNQUEUE_WEIGHT       30
88 #define CHILD_PENALTY            95
89 #define PARENT_PENALTY          100
90 #define EXIT_WEIGHT               3
91 #define PRIO_BONUS_RATIO         25
92 #define MAX_BONUS               (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
93 #define INTERACTIVE_DELTA         2
94 #define MAX_SLEEP_AVG           (DEF_TIMESLICE * MAX_BONUS)
95 #define STARVATION_LIMIT        (MAX_SLEEP_AVG)
96 #define NS_MAX_SLEEP_AVG        (JIFFIES_TO_NS(MAX_SLEEP_AVG))
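
/*
 * Plugging in the numbers (again assuming HZ == 1000, and the 0..39 user
 * priority range): MIN_TIMESLICE is max(5, 1) == 5 jiffies (5 msecs),
 * DEF_TIMESLICE is 100 jiffies, MAX_BONUS is 40 * 25 / 100 == 10, and
 * MAX_SLEEP_AVG is 100 * 10 == 1000 jiffies - so NS_MAX_SLEEP_AVG is
 * one second worth of nanoseconds.
 */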
97
98 /*
99  * If a task is 'interactive' then we reinsert it in the active
100  * array after it has expired its current timeslice. (it will not
101  * continue to run immediately; it will still round-robin with
102  * other interactive tasks.)
103  *
104  * This part scales the interactivity limit depending on niceness.
105  *
106  * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
107  * Here are a few examples of different nice levels:
108  *
109  *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
110  *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
111  *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
112  *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
113  *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
114  *
115  * (the X axis represents the possible -5 ... 0 ... +5 dynamic
116  *  priority range a task can explore; a value of '1' means the
117  *  task is rated interactive.)
118  *
119  * I.e. nice +19 tasks can never get 'interactive' enough to be
120  * reinserted into the active array, and only heavily CPU-hogging
121  * nice -20 tasks will be expired. Default nice 0 tasks are somewhere
122  * in between: it takes some effort for them to get interactive, but
123  * it's not too hard.
124  */
125
126 #define CURRENT_BONUS(p) \
127         (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
128                 MAX_SLEEP_AVG)
129
130 #define GRANULARITY     (10 * HZ / 1000 ? : 1)
131
132 #ifdef CONFIG_SMP
133 #define TIMESLICE_GRANULARITY(p)        (GRANULARITY * \
134                 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
135                         num_online_cpus())
136 #else
137 #define TIMESLICE_GRANULARITY(p)        (GRANULARITY * \
138                 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
139 #endif
140
141 #define SCALE(v1,v1_max,v2_max) \
142         (v1) * (v2_max) / (v1_max)
143
144 #define DELTA(p) \
145         (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA)
146
147 #define TASK_INTERACTIVE(p) \
148         ((p)->prio <= (p)->static_prio - DELTA(p))
149
150 #define INTERACTIVE_SLEEP(p) \
151         (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
152                 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
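
/*
 * Worked example (with MAX_BONUS == 10 as computed above, for a nice 0
 * task, i.e. TASK_NICE(p) == 0): DELTA(p) is 0 * 10 / 40 + 2 == 2, so the
 * task is TASK_INTERACTIVE once its dynamic prio is at least 2 below its
 * static prio - which requires CURRENT_BONUS(p) >= 7, i.e. a sleep_avg of
 * at least 70% of MAX_SLEEP_AVG.  For nice +19, DELTA(p) is
 * 19 * 10 / 40 + 2 == 6, which exceeds the maximum bonus of +5, so such a
 * task can never qualify - matching the table above.
 */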
153
154 #define TASK_PREEMPTS_CURR(p, rq) \
155         ((p)->prio < (rq)->curr->prio)
156
157 /*
158  * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
159  * to time slice values: [800ms ... 100ms ... 5ms]
160  *
161  * The higher a thread's priority, the bigger the timeslice
162  * it gets during one round of execution. But even the lowest
163  * priority thread gets MIN_TIMESLICE worth of execution time.
164  */
165
166 #define SCALE_PRIO(x, prio) \
167         max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
168
169 static inline unsigned int task_timeslice(task_t *p)
170 {
171         if (p->static_prio < NICE_TO_PRIO(0))
172                 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
173         else
174                 return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
175 }
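
/*
 * Worked example, again assuming HZ == 1000, MAX_RT_PRIO == 100 and
 * MAX_PRIO == 140 (so MAX_USER_PRIO/2 == 20):
 *
 *   nice -20 (static_prio 100): 400 * (140 - 100) / 20 == 800 jiffies (800ms)
 *   nice   0 (static_prio 120): 100 * (140 - 120) / 20 == 100 jiffies (100ms)
 *   nice +19 (static_prio 139): 100 * (140 - 139) / 20 ==   5 jiffies   (5ms)
 *
 * which is exactly the [800ms ... 100ms ... 5ms] range described above.
 */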
176 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran)       \
177                                 < (long long) (sd)->cache_hot_time)
178
179 /*
180  * These are the runqueue data structures:
181  */
182
183 #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
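
/*
 * E.g. with MAX_PRIO == 140 this works out to (141 + 7)/8 == 18 bytes of
 * bitmap, rounded up to whole longs: 5 longs on a 32-bit box, 3 longs on a
 * 64-bit one - one bit per priority level plus one extra delimiter bit
 * (set once at init) so the priority bitmap search always terminates.
 */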
184
185 typedef struct runqueue runqueue_t;
186
187 struct prio_array {
188         unsigned int nr_active;
189         unsigned long bitmap[BITMAP_SIZE];
190         struct list_head queue[MAX_PRIO];
191 };
192
193 /*
194  * This is the main, per-CPU runqueue data structure.
195  *
196  * Locking rule: those places that want to lock multiple runqueues
197  * (such as the load balancing or the thread migration code) must
198  * acquire the locks in ascending &runqueue order.
199  */
200 struct runqueue {
201         spinlock_t lock;
202
203         /*
204          * nr_running and cpu_load should be in the same cacheline because
205          * remote CPUs use both these fields when doing load calculation.
206          */
207         unsigned long nr_running;
208 #ifdef CONFIG_SMP
209         unsigned long cpu_load[3];
210 #endif
211         unsigned long long nr_switches;
212
213         /*
214          * This is part of a global counter where only the total sum
215          * over all CPUs matters. A task can increase this counter on
216          * one CPU and if it got migrated afterwards it may decrease
217          * it on another CPU. Always updated under the runqueue lock:
218          */
219         unsigned long nr_uninterruptible;
220
221         unsigned long expired_timestamp;
222         unsigned long long timestamp_last_tick;
223         task_t *curr, *idle;
224         struct mm_struct *prev_mm;
225         prio_array_t *active, *expired, arrays[2];
226         int best_expired_prio;
227         atomic_t nr_iowait;
228
229 #ifdef CONFIG_SMP
230         struct sched_domain *sd;
231
232         /* For active balancing */
233         int active_balance;
234         int push_cpu;
235
236         task_t *migration_thread;
237         struct list_head migration_queue;
238 #endif
239
240 #ifdef CONFIG_SCHEDSTATS
241         /* latency stats */
242         struct sched_info rq_sched_info;
243
244         /* sys_sched_yield() stats */
245         unsigned long yld_exp_empty;
246         unsigned long yld_act_empty;
247         unsigned long yld_both_empty;
248         unsigned long yld_cnt;
249
250         /* schedule() stats */
251         unsigned long sched_switch;
252         unsigned long sched_cnt;
253         unsigned long sched_goidle;
254
255         /* try_to_wake_up() stats */
256         unsigned long ttwu_cnt;
257         unsigned long ttwu_local;
258 #endif
259 };
260
261 static DEFINE_PER_CPU(struct runqueue, runqueues);
262
263 #define for_each_domain(cpu, domain) \
264         for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
265
266 #define cpu_rq(cpu)             (&per_cpu(runqueues, (cpu)))
267 #define this_rq()               (&__get_cpu_var(runqueues))
268 #define task_rq(p)              cpu_rq(task_cpu(p))
269 #define cpu_curr(cpu)           (cpu_rq(cpu)->curr)
270
271 /*
272  * Default context-switch locking:
273  */
274 #ifndef prepare_arch_switch
275 # define prepare_arch_switch(rq, next)  do { } while (0)
276 # define finish_arch_switch(rq, next)   spin_unlock_irq(&(rq)->lock)
277 # define task_running(rq, p)            ((rq)->curr == (p))
278 #endif
279
280 /*
281  * task_rq_lock - lock the runqueue a given task resides on and disable
282  * interrupts.  Note the ordering: we can safely look up the task_rq without
283  * explicitly disabling preemption.
284  */
285 static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
286         __acquires(rq->lock)
287 {
288         struct runqueue *rq;
289
290 repeat_lock_task:
291         local_irq_save(*flags);
292         rq = task_rq(p);
293         spin_lock(&rq->lock);
294         if (unlikely(rq != task_rq(p))) {
295                 spin_unlock_irqrestore(&rq->lock, *flags);
296                 goto repeat_lock_task;
297         }
298         return rq;
299 }
300
301 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
302         __releases(rq->lock)
303 {
304         spin_unlock_irqrestore(&rq->lock, *flags);
305 }
306
307 #ifdef CONFIG_SCHEDSTATS
308 /*
309  * bump this up when changing the output format or the meaning of an existing
310  * format, so that tools can adapt (or abort)
311  */
312 #define SCHEDSTAT_VERSION 11
313
314 static int show_schedstat(struct seq_file *seq, void *v)
315 {
316         int cpu;
317
318         seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
319         seq_printf(seq, "timestamp %lu\n", jiffies);
320         for_each_online_cpu(cpu) {
321                 runqueue_t *rq = cpu_rq(cpu);
322 #ifdef CONFIG_SMP
323                 struct sched_domain *sd;
324                 int dcnt = 0;
325 #endif
326
327                 /* runqueue-specific stats */
328                 seq_printf(seq,
329                     "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
330                     cpu, rq->yld_both_empty,
331                     rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
332                     rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
333                     rq->ttwu_cnt, rq->ttwu_local,
334                     rq->rq_sched_info.cpu_time,
335                     rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
336
337                 seq_printf(seq, "\n");
338
339 #ifdef CONFIG_SMP
340                 /* domain-specific stats */
341                 for_each_domain(cpu, sd) {
342                         enum idle_type itype;
343                         char mask_str[NR_CPUS];
344
345                         cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
346                         seq_printf(seq, "domain%d %s", dcnt++, mask_str);
347                         for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
348                                         itype++) {
349                                 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu",
350                                     sd->lb_cnt[itype],
351                                     sd->lb_balanced[itype],
352                                     sd->lb_failed[itype],
353                                     sd->lb_imbalance[itype],
354                                     sd->lb_gained[itype],
355                                     sd->lb_hot_gained[itype],
356                                     sd->lb_nobusyq[itype],
357                                     sd->lb_nobusyg[itype]);
358                         }
359                         seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n",
360                             sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
361                             sd->sbe_pushed, sd->sbe_attempts,
362                             sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
363                 }
364 #endif
365         }
366         return 0;
367 }
368
369 static int schedstat_open(struct inode *inode, struct file *file)
370 {
371         unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
372         char *buf = kmalloc(size, GFP_KERNEL);
373         struct seq_file *m;
374         int res;
375
376         if (!buf)
377                 return -ENOMEM;
378         res = single_open(file, show_schedstat, NULL);
379         if (!res) {
380                 m = file->private_data;
381                 m->buf = buf;
382                 m->size = size;
383         } else
384                 kfree(buf);
385         return res;
386 }
387
388 struct file_operations proc_schedstat_operations = {
389         .open    = schedstat_open,
390         .read    = seq_read,
391         .llseek  = seq_lseek,
392         .release = single_release,
393 };
394
395 # define schedstat_inc(rq, field)       do { (rq)->field++; } while (0)
396 # define schedstat_add(rq, field, amt)  do { (rq)->field += (amt); } while (0)
397 #else /* !CONFIG_SCHEDSTATS */
398 # define schedstat_inc(rq, field)       do { } while (0)
399 # define schedstat_add(rq, field, amt)  do { } while (0)
400 #endif
401
402 /*
403  * rq_lock - lock a given runqueue and disable interrupts.
404  */
405 static inline runqueue_t *this_rq_lock(void)
406         __acquires(rq->lock)
407 {
408         runqueue_t *rq;
409
410         local_irq_disable();
411         rq = this_rq();
412         spin_lock(&rq->lock);
413
414         return rq;
415 }
416
417 #ifdef CONFIG_SCHEDSTATS
418 /*
419  * Called when a process is dequeued from the active array and given
420  * the cpu.  We should note that with the exception of interactive
421  * tasks, the expired queue will become the active queue after the active
422  * queue is empty, without explicitly dequeuing and requeuing tasks in the
423  * expired queue.  (Interactive tasks may be requeued directly to the
424  * active queue, thus delaying tasks in the expired queue from running;
425  * see scheduler_tick()).
426  *
427  * This function is only called from sched_info_arrive(), rather than
428  * dequeue_task(). Even though a task may be queued and dequeued multiple
429  * times as it is shuffled about, we're really interested in knowing how
430  * long it was from the *first* time it was queued to the time that it
431  * finally hit a cpu.
432  */
433 static inline void sched_info_dequeued(task_t *t)
434 {
435         t->sched_info.last_queued = 0;
436 }
437
438 /*
439  * Called when a task finally hits the cpu.  We can now calculate how
440  * long it was waiting to run.  We also note when it began so that we
441  * can keep stats on how long its timeslice is.
442  */
443 static inline void sched_info_arrive(task_t *t)
444 {
445         unsigned long now = jiffies, diff = 0;
446         struct runqueue *rq = task_rq(t);
447
448         if (t->sched_info.last_queued)
449                 diff = now - t->sched_info.last_queued;
450         sched_info_dequeued(t);
451         t->sched_info.run_delay += diff;
452         t->sched_info.last_arrival = now;
453         t->sched_info.pcnt++;
454
455         if (!rq)
456                 return;
457
458         rq->rq_sched_info.run_delay += diff;
459         rq->rq_sched_info.pcnt++;
460 }
461
462 /*
463  * Called when a process is queued into either the active or expired
464  * array.  The time is noted and later used to determine how long the
465  * task had to wait to reach the cpu.  Since the expired queue will
466  * become the active queue after the active queue is empty, without
467  * dequeuing and requeuing any tasks, we are interested in queuing to either. It
468  * is unusual but not impossible for tasks to be dequeued and immediately
469  * requeued in the same or another array: this can happen in sched_yield(),
470  * set_user_nice(), and even load_balance() as it moves tasks from runqueue
471  * to runqueue.
472  *
473  * This function is only called from enqueue_task(), but also only updates
474  * the timestamp if it is not already set.  It's assumed that
475  * sched_info_dequeued() will clear that stamp when appropriate.
476  */
477 static inline void sched_info_queued(task_t *t)
478 {
479         if (!t->sched_info.last_queued)
480                 t->sched_info.last_queued = jiffies;
481 }
482
483 /*
484  * Called when a process ceases being the active-running process, either
485  * voluntarily or involuntarily.  Now we can calculate how long we ran.
486  */
487 static inline void sched_info_depart(task_t *t)
488 {
489         struct runqueue *rq = task_rq(t);
490         unsigned long diff = jiffies - t->sched_info.last_arrival;
491
492         t->sched_info.cpu_time += diff;
493
494         if (rq)
495                 rq->rq_sched_info.cpu_time += diff;
496 }
497
498 /*
499  * Called when tasks are switched involuntarily due, typically, to expiring
500  * their time slice.  (This may also be called when switching to or from
501  * the idle task.)  We are only called when prev != next.
502  */
503 static inline void sched_info_switch(task_t *prev, task_t *next)
504 {
505         struct runqueue *rq = task_rq(prev);
506
507         /*
508          * prev now departs the cpu.  It's not interesting to record
509          * stats about how efficient we were at scheduling the idle
510          * process, however.
511          */
512         if (prev != rq->idle)
513                 sched_info_depart(prev);
514
515         if (next != rq->idle)
516                 sched_info_arrive(next);
517 }
518 #else
519 #define sched_info_queued(t)            do { } while (0)
520 #define sched_info_switch(t, next)      do { } while (0)
521 #endif /* CONFIG_SCHEDSTATS */
522
523 /*
524  * Adding/removing a task to/from a priority array:
525  */
526 static void dequeue_task(struct task_struct *p, prio_array_t *array)
527 {
528         array->nr_active--;
529         list_del(&p->run_list);
530         if (list_empty(array->queue + p->prio))
531                 __clear_bit(p->prio, array->bitmap);
532 }
533
534 static void enqueue_task(struct task_struct *p, prio_array_t *array)
535 {
536         sched_info_queued(p);
537         list_add_tail(&p->run_list, array->queue + p->prio);
538         __set_bit(p->prio, array->bitmap);
539         array->nr_active++;
540         p->array = array;
541 }
542
543 /*
544  * Put a task at the end of the run list without the overhead of dequeue
545  * followed by enqueue.
546  */
547 static void requeue_task(struct task_struct *p, prio_array_t *array)
548 {
549         list_move_tail(&p->run_list, array->queue + p->prio);
550 }
551
552 static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
553 {
554         list_add(&p->run_list, array->queue + p->prio);
555         __set_bit(p->prio, array->bitmap);
556         array->nr_active++;
557         p->array = array;
558 }
559
560 /*
561  * effective_prio - return the priority that is based on the static
562  * priority but is modified by bonuses/penalties.
563  *
564  * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
565  * into the -5 ... 0 ... +5 bonus/penalty range.
566  *
567  * We use 25% of the full 0...39 priority range so that:
568  *
569  * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
570  * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
571  *
572  * Both properties are important to certain workloads.
573  */
574 static int effective_prio(task_t *p)
575 {
576         int bonus, prio;
577
578         if (rt_task(p))
579                 return p->prio;
580
581         bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
582
583         prio = p->static_prio - bonus;
584         if (prio < MAX_RT_PRIO)
585                 prio = MAX_RT_PRIO;
586         if (prio > MAX_PRIO-1)
587                 prio = MAX_PRIO-1;
588         return prio;
589 }
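
/*
 * Worked example for a nice 0 task (static_prio 120, MAX_BONUS 10):
 * a fully "slept-up" task has CURRENT_BONUS 10, so bonus == +5 and
 * effective_prio() returns 115; a task with sleep_avg 0 has bonus == -5
 * and gets 125.  The total swing of 10 levels (+/-5) is the 25% of the
 * full 0..39 range referred to in the comment above.
 */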
590
591 /*
592  * __activate_task - move a task to the runqueue.
593  */
594 static inline void __activate_task(task_t *p, runqueue_t *rq)
595 {
596         enqueue_task(p, rq->active);
597         rq->nr_running++;
598 }
599
600 /*
601  * __activate_idle_task - move idle task to the _front_ of runqueue.
602  */
603 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
604 {
605         enqueue_task_head(p, rq->active);
606         rq->nr_running++;
607 }
608
609 static void recalc_task_prio(task_t *p, unsigned long long now)
610 {
611         /* Caller must always ensure 'now >= p->timestamp' */
612         unsigned long long __sleep_time = now - p->timestamp;
613         unsigned long sleep_time;
614
615         if (__sleep_time > NS_MAX_SLEEP_AVG)
616                 sleep_time = NS_MAX_SLEEP_AVG;
617         else
618                 sleep_time = (unsigned long)__sleep_time;
619
620         if (likely(sleep_time > 0)) {
621                 /*
622                  * User tasks that sleep a long time are categorised as
623                  * idle and will get just interactive status to stay active &
624                  * prevent them suddenly becoming cpu hogs and starving
625                  * other processes.
626                  */
627                 if (p->mm && p->activated != -1 &&
628                         sleep_time > INTERACTIVE_SLEEP(p)) {
629                                 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
630                                                 DEF_TIMESLICE);
631                 } else {
632                         /*
633                          * The lower the sleep avg a task has the more
634                          * rapidly it will rise with sleep time.
635                          */
636                         sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
637
638                         /*
639                          * Tasks waking from uninterruptible sleep are
640                          * limited in their sleep_avg rise as they
641                          * are likely to be waiting on I/O
642                          */
643                         if (p->activated == -1 && p->mm) {
644                                 if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
645                                         sleep_time = 0;
646                                 else if (p->sleep_avg + sleep_time >=
647                                                 INTERACTIVE_SLEEP(p)) {
648                                         p->sleep_avg = INTERACTIVE_SLEEP(p);
649                                         sleep_time = 0;
650                                 }
651                         }
652
653                         /*
654                          * This code gives a bonus to interactive tasks.
655                          *
656                          * The boost works by updating the 'average sleep time'
657                          * value here, based on ->timestamp. The more time a
658                          * task spends sleeping, the higher the average gets -
659                          * and the higher the priority boost gets as well.
660                          */
661                         p->sleep_avg += sleep_time;
662
663                         if (p->sleep_avg > NS_MAX_SLEEP_AVG)
664                                 p->sleep_avg = NS_MAX_SLEEP_AVG;
665                 }
666         }
667
668         p->prio = effective_prio(p);
669 }
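
/*
 * To illustrate the scaling above (HZ == 1000 and MAX_BONUS == 10 assumed):
 * a task with sleep_avg 0 (CURRENT_BONUS 0) that sleeps for 50 msecs has the
 * sleep credited 10x, i.e. its sleep_avg jumps by 500 msecs worth, while a
 * task already at CURRENT_BONUS 9 gains only the plain 50 msecs; both are
 * capped at NS_MAX_SLEEP_AVG.
 */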
670
671 /*
672  * activate_task - move a task to the runqueue and do priority recalculation
673  *
674  * Update all the scheduling statistics stuff. (sleep average
675  * calculation, priority modifiers, etc.)
676  */
677 static void activate_task(task_t *p, runqueue_t *rq, int local)
678 {
679         unsigned long long now;
680
681         now = sched_clock();
682 #ifdef CONFIG_SMP
683         if (!local) {
684                 /* Compensate for drifting sched_clock */
685                 runqueue_t *this_rq = this_rq();
686                 now = (now - this_rq->timestamp_last_tick)
687                         + rq->timestamp_last_tick;
688         }
689 #endif
690
691         recalc_task_prio(p, now);
692
693         /*
694          * This checks to make sure it's not an uninterruptible task
695          * that is now waking up.
696          */
697         if (!p->activated) {
698                 /*
699                  * Tasks which were woken up by interrupts (ie. hw events)
700                  * are most likely of interactive nature. So we give them
701                  * the credit of extending their sleep time to the period
702                  * of time they spend on the runqueue, waiting for execution
703                  * on a CPU, first time around:
704                  */
705                 if (in_interrupt())
706                         p->activated = 2;
707                 else {
708                         /*
709                          * Normal first-time wakeups get a credit too for
710                          * on-runqueue time, but it will be weighted down:
711                          */
712                         p->activated = 1;
713                 }
714         }
715         p->timestamp = now;
716
717         __activate_task(p, rq);
718 }
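
/*
 * Summary of the p->activated encoding used above and in try_to_wake_up():
 *  -1: woken from uninterruptible sleep (sleep_avg credit is limited)
 *   0: default - not a freshly woken task
 *   1: woken by a normal, process-context wakeup
 *   2: woken from interrupt context, i.e. by a hardware event
 */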
719
720 /*
721  * deactivate_task - remove a task from the runqueue.
722  */
723 static void deactivate_task(struct task_struct *p, runqueue_t *rq)
724 {
725         rq->nr_running--;
726         dequeue_task(p, p->array);
727         p->array = NULL;
728 }
729
730 /*
731  * resched_task - mark a task 'to be rescheduled now'.
732  *
733  * On UP this means the setting of the need_resched flag, on SMP it
734  * might also involve a cross-CPU call to trigger the scheduler on
735  * the target CPU.
736  */
737 #ifdef CONFIG_SMP
738 static void resched_task(task_t *p)
739 {
740         int need_resched, nrpolling;
741
742         assert_spin_locked(&task_rq(p)->lock);
743
744         /* minimise the chance of sending an interrupt to poll_idle() */
745         nrpolling = test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
746         need_resched = test_and_set_tsk_thread_flag(p, TIF_NEED_RESCHED);
747         nrpolling |= test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
748
749         if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
750                 smp_send_reschedule(task_cpu(p));
751 }
752 #else
753 static inline void resched_task(task_t *p)
754 {
755         set_tsk_need_resched(p);
756 }
757 #endif
758
759 /**
760  * task_curr - is this task currently executing on a CPU?
761  * @p: the task in question.
762  */
763 inline int task_curr(const task_t *p)
764 {
765         return cpu_curr(task_cpu(p)) == p;
766 }
767
768 #ifdef CONFIG_SMP
769 enum request_type {
770         REQ_MOVE_TASK,
771         REQ_SET_DOMAIN,
772 };
773
774 typedef struct {
775         struct list_head list;
776         enum request_type type;
777
778         /* For REQ_MOVE_TASK */
779         task_t *task;
780         int dest_cpu;
781
782         /* For REQ_SET_DOMAIN */
783         struct sched_domain *sd;
784
785         struct completion done;
786 } migration_req_t;
787
788 /*
789  * The task's runqueue lock must be held.
790  * Returns true if you have to wait for migration thread.
791  */
792 static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
793 {
794         runqueue_t *rq = task_rq(p);
795
796         /*
797          * If the task is not on a runqueue (and not running), then
798          * it is sufficient to simply update the task's cpu field.
799          */
800         if (!p->array && !task_running(rq, p)) {
801                 set_task_cpu(p, dest_cpu);
802                 return 0;
803         }
804
805         init_completion(&req->done);
806         req->type = REQ_MOVE_TASK;
807         req->task = p;
808         req->dest_cpu = dest_cpu;
809         list_add(&req->list, &rq->migration_queue);
810         return 1;
811 }
812
813 /*
814  * wait_task_inactive - wait for a thread to unschedule.
815  *
816  * The caller must ensure that the task *will* unschedule sometime soon,
817  * else this function might spin for a *long* time. This function can't
818  * be called with interrupts off, or it may introduce deadlock with
819  * smp_call_function() if an IPI is sent by the same process we are
820  * waiting to become inactive.
821  */
822 void wait_task_inactive(task_t * p)
823 {
824         unsigned long flags;
825         runqueue_t *rq;
826         int preempted;
827
828 repeat:
829         rq = task_rq_lock(p, &flags);
830         /* Must be off runqueue entirely, not preempted. */
831         if (unlikely(p->array || task_running(rq, p))) {
832                 /* If it's preempted, we yield.  It could be a while. */
833                 preempted = !task_running(rq, p);
834                 task_rq_unlock(rq, &flags);
835                 cpu_relax();
836                 if (preempted)
837                         yield();
838                 goto repeat;
839         }
840         task_rq_unlock(rq, &flags);
841 }
842
843 /***
844  * kick_process - kick a running thread to enter/exit the kernel
845  * @p: the to-be-kicked thread
846  *
847  * Cause a process which is running on another CPU to enter
848  * kernel-mode, without any delay. (to get signals handled.)
849  *
850  * NOTE: this function doesn't have to take the runqueue lock,
851  * because all it wants to ensure is that the remote task enters
852  * the kernel. If the IPI races and the task has been migrated
853  * to another CPU then no harm is done and the purpose has been
854  * achieved as well.
855  */
856 void kick_process(task_t *p)
857 {
858         int cpu;
859
860         preempt_disable();
861         cpu = task_cpu(p);
862         if ((cpu != smp_processor_id()) && task_curr(p))
863                 smp_send_reschedule(cpu);
864         preempt_enable();
865 }
866
867 /*
868  * Return a low guess at the load of a migration-source cpu.
869  *
870  * We want to under-estimate the load of migration sources, to
871  * balance conservatively.
872  */
873 static inline unsigned long source_load(int cpu, int type)
874 {
875         runqueue_t *rq = cpu_rq(cpu);
876         unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
877         if (type == 0)
878                 return load_now;
879
880         return min(rq->cpu_load[type-1], load_now);
881 }
882
883 /*
884  * Return a high guess at the load of a migration-target cpu
885  */
886 static inline unsigned long target_load(int cpu, int type)
887 {
888         runqueue_t *rq = cpu_rq(cpu);
889         unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
890         if (type == 0)
891                 return load_now;
892
893         return max(rq->cpu_load[type-1], load_now);
894 }
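
/*
 * Worked example (assuming SCHED_LOAD_SCALE == 128): a runqueue with
 * 3 runnable tasks has load_now == 384.  If its decaying cpu_load[type-1]
 * is 256, source_load() reports the lower 256 while target_load() reports
 * the higher 384 - a busy source looks a bit lighter and a candidate
 * target looks a bit heavier, which biases balancing towards doing nothing.
 */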
895
896 #endif
897
898 /*
899  * wake_idle() will wake a task on an idle cpu if task->cpu is
900  * not idle and an idle cpu is available.  The span of cpus to
901  * search starts with the closest cpus, then further out as needed,
902  * so we always favor a closer, idle cpu.
903  *
904  * Returns the CPU we should wake onto.
905  */
906 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
907 static int wake_idle(int cpu, task_t *p)
908 {
909         cpumask_t tmp;
910         struct sched_domain *sd;
911         int i;
912
913         if (idle_cpu(cpu))
914                 return cpu;
915
916         for_each_domain(cpu, sd) {
917                 if (sd->flags & SD_WAKE_IDLE) {
918                         cpus_and(tmp, sd->span, p->cpus_allowed);
919                         for_each_cpu_mask(i, tmp) {
920                                 if (idle_cpu(i))
921                                         return i;
922                         }
923                 }
924                 else
925                         break;
926         }
927         return cpu;
928 }
929 #else
930 static inline int wake_idle(int cpu, task_t *p)
931 {
932         return cpu;
933 }
934 #endif
935
936 /***
937  * try_to_wake_up - wake up a thread
938  * @p: the to-be-woken-up thread
939  * @state: the mask of task states that can be woken
940  * @sync: do a synchronous wakeup?
941  *
942  * Put it on the run-queue if it's not already there. The "current"
943  * thread is always on the run-queue (except when the actual
944  * re-schedule is in progress), and as such you're allowed to do
945  * the simpler "current->state = TASK_RUNNING" to mark yourself
946  * runnable without the overhead of this.
947  *
948  * returns failure only if the task is already active.
949  */
950 static int try_to_wake_up(task_t * p, unsigned int state, int sync)
951 {
952         int cpu, this_cpu, success = 0;
953         unsigned long flags;
954         long old_state;
955         runqueue_t *rq;
956 #ifdef CONFIG_SMP
957         unsigned long load, this_load;
958         struct sched_domain *sd, *this_sd = NULL;
959         int new_cpu;
960 #endif
961
962         rq = task_rq_lock(p, &flags);
963         old_state = p->state;
964         if (!(old_state & state))
965                 goto out;
966
967         if (p->array)
968                 goto out_running;
969
970         cpu = task_cpu(p);
971         this_cpu = smp_processor_id();
972
973 #ifdef CONFIG_SMP
974         if (unlikely(task_running(rq, p)))
975                 goto out_activate;
976
977         new_cpu = cpu;
978
979         schedstat_inc(rq, ttwu_cnt);
980         if (cpu == this_cpu) {
981                 schedstat_inc(rq, ttwu_local);
982                 goto out_set_cpu;
983         }
984
985         for_each_domain(this_cpu, sd) {
986                 if (cpu_isset(cpu, sd->span)) {
987                         schedstat_inc(sd, ttwu_wake_remote);
988                         this_sd = sd;
989                         break;
990                 }
991         }
992
993         if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
994                 goto out_set_cpu;
995
996         /*
997          * Check for affine wakeup and passive balancing possibilities.
998          */
999         if (this_sd) {
1000                 int idx = this_sd->wake_idx;
1001                 unsigned int imbalance;
1002
1003                 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
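                /*
                 * With an imbalance_pct of 125 (assumed here purely for
                 * illustration) this works out to imbalance == 112: the
                 * SD_WAKE_BALANCE test further down then moves the wakeup
                 * to this CPU once this CPU's load is roughly 11% below
                 * the load of the task's previous CPU.
                 */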
1004
1005                 load = source_load(cpu, idx);
1006                 this_load = target_load(this_cpu, idx);
1007
1008                 new_cpu = this_cpu; /* Wake to this CPU if we can */
1009
1010                 if (this_sd->flags & SD_WAKE_AFFINE) {
1011                         unsigned long tl = this_load;
1012                         /*
1013                          * If sync wakeup then subtract the (maximum possible)
1014                          * effect of the currently running task from the load
1015                          * of the current CPU:
1016                          */
1017                         if (sync)
1018                                 tl -= SCHED_LOAD_SCALE;
1019
1020                         if ((tl <= load &&
1021                                 tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
1022                                 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) {
1023                                 /*
1024                                  * This domain has SD_WAKE_AFFINE and
1025                                  * p is cache cold in this domain, and
1026                                  * there is no bad imbalance.
1027                                  */
1028                                 schedstat_inc(this_sd, ttwu_move_affine);
1029                                 goto out_set_cpu;
1030                         }
1031                 }
1032
1033                 /*
1034                  * Start passive balancing when half the imbalance_pct
1035                  * limit is reached.
1036                  */
1037                 if (this_sd->flags & SD_WAKE_BALANCE) {
1038                         if (imbalance*this_load <= 100*load) {
1039                                 schedstat_inc(this_sd, ttwu_move_balance);
1040                                 goto out_set_cpu;
1041                         }
1042                 }
1043         }
1044
1045         new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1046 out_set_cpu:
1047         new_cpu = wake_idle(new_cpu, p);
1048         if (new_cpu != cpu) {
1049                 set_task_cpu(p, new_cpu);
1050                 task_rq_unlock(rq, &flags);
1051                 /* might preempt at this point */
1052                 rq = task_rq_lock(p, &flags);
1053                 old_state = p->state;
1054                 if (!(old_state & state))
1055                         goto out;
1056                 if (p->array)
1057                         goto out_running;
1058
1059                 this_cpu = smp_processor_id();
1060                 cpu = task_cpu(p);
1061         }
1062
1063 out_activate:
1064 #endif /* CONFIG_SMP */
1065         if (old_state == TASK_UNINTERRUPTIBLE) {
1066                 rq->nr_uninterruptible--;
1067                 /*
1068                  * Tasks on involuntary sleep don't earn
1069                  * sleep_avg beyond just interactive state.
1070                  */
1071                 p->activated = -1;
1072         }
1073
1074         /*
1075          * Sync wakeups (i.e. those types of wakeups where the waker
1076          * has indicated that it will leave the CPU in short order)
1077          * don't trigger a preemption if the woken-up task will run on
1078          * this cpu. (in this case the 'I will reschedule' promise of
1079          * the waker guarantees that the freshly woken up task is going
1080          * to be considered on this CPU.)
1081          */
1082         activate_task(p, rq, cpu == this_cpu);
1083         if (!sync || cpu != this_cpu) {
1084                 if (TASK_PREEMPTS_CURR(p, rq))
1085                         resched_task(rq->curr);
1086         }
1087         success = 1;
1088
1089 out_running:
1090         p->state = TASK_RUNNING;
1091 out:
1092         task_rq_unlock(rq, &flags);
1093
1094         return success;
1095 }
1096
1097 int fastcall wake_up_process(task_t * p)
1098 {
1099         return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1100                                  TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1101 }
1102
1103 EXPORT_SYMBOL(wake_up_process);
1104
1105 int fastcall wake_up_state(task_t *p, unsigned int state)
1106 {
1107         return try_to_wake_up(p, state, 0);
1108 }
1109
1110 #ifdef CONFIG_SMP
1111 static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1112                            struct sched_domain *sd);
1113 #endif
1114
1115 /*
1116  * Perform scheduler related setup for a newly forked process p.
1117  * p is forked by current.
1118  */
1119 void fastcall sched_fork(task_t *p)
1120 {
1121         /*
1122          * We mark the process as running here, but have not actually
1123          * inserted it onto the runqueue yet. This guarantees that
1124          * nobody will actually run it, and a signal or other external
1125          * event cannot wake it up and insert it on the runqueue either.
1126          */
1127         p->state = TASK_RUNNING;
1128         INIT_LIST_HEAD(&p->run_list);
1129         p->array = NULL;
1130         spin_lock_init(&p->switch_lock);
1131 #ifdef CONFIG_SCHEDSTATS
1132         memset(&p->sched_info, 0, sizeof(p->sched_info));
1133 #endif
1134 #ifdef CONFIG_PREEMPT
1135         /*
1136          * During context-switch we hold precisely one spinlock, which
1137          * schedule_tail drops. (in the common case it's this_rq()->lock,
1138          * but it also can be p->switch_lock.) So we compensate with a count
1139          * of 1. Also, we want to start with kernel preemption disabled.
1140          */
1141         p->thread_info->preempt_count = 1;
1142 #endif
1143         /*
1144          * Share the timeslice between parent and child, so that the
1145          * total amount of pending timeslices in the system doesn't change,
1146          * resulting in more scheduling fairness.
1147          */
1148         local_irq_disable();
1149         p->time_slice = (current->time_slice + 1) >> 1;
1150         /*
1151          * The remainder of the first timeslice might be recovered by
1152          * the parent if the child exits early enough.
1153          */
1154         p->first_time_slice = 1;
1155         current->time_slice >>= 1;
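        /*
         * E.g. a parent with 100 ticks left keeps 50 and the child starts
         * with 50; with 101 ticks the child gets 51 and the parent keeps
         * 50, so no timeslice is created out of thin air (at most one tick
         * of rounding moves to the child).
         */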
1156         p->timestamp = sched_clock();
1157         if (unlikely(!current->time_slice)) {
1158                 /*
1159                  * This case is rare, it happens when the parent has only
1160                  * a single jiffy left from its timeslice. Taking the
1161                  * runqueue lock is not a problem.
1162                  */
1163                 current->time_slice = 1;
1164                 preempt_disable();
1165                 scheduler_tick();
1166                 local_irq_enable();
1167                 preempt_enable();
1168         } else
1169                 local_irq_enable();
1170 }
1171
1172 /*
1173  * wake_up_new_task - wake up a newly created task for the first time.
1174  *
1175  * This function will do some initial scheduler statistics housekeeping
1176  * that must be done for every newly created context, then puts the task
1177  * on the runqueue and wakes it.
1178  */
1179 void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
1180 {
1181         unsigned long flags;
1182         int this_cpu, cpu;
1183         runqueue_t *rq, *this_rq;
1184
1185         rq = task_rq_lock(p, &flags);
1186         cpu = task_cpu(p);
1187         this_cpu = smp_processor_id();
1188
1189         BUG_ON(p->state != TASK_RUNNING);
1190
1191         /*
1192          * We decrease the sleep average of forking parents
1193          * and children as well, to keep max-interactive tasks
1194          * from forking tasks that are max-interactive. The parent
1195          * (current) is done further down, under its lock.
1196          */
1197         p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
1198                 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1199
1200         p->prio = effective_prio(p);
1201
1202         if (likely(cpu == this_cpu)) {
1203                 if (!(clone_flags & CLONE_VM)) {
1204                         /*
1205                          * The VM isn't cloned, so we're in a good position to
1206                          * do child-runs-first in anticipation of an exec. This
1207                          * usually avoids a lot of COW overhead.
1208                          */
1209                         if (unlikely(!current->array))
1210                                 __activate_task(p, rq);
1211                         else {
1212                                 p->prio = current->prio;
1213                                 list_add_tail(&p->run_list, &current->run_list);
1214                                 p->array = current->array;
1215                                 p->array->nr_active++;
1216                                 rq->nr_running++;
1217                         }
1218                         set_need_resched();
1219                 } else
1220                         /* Run child last */
1221                         __activate_task(p, rq);
1222                 /*
1223                  * We skip the following code due to cpu == this_cpu
1224                  *
1225                  *   task_rq_unlock(rq, &flags);
1226                  *   this_rq = task_rq_lock(current, &flags);
1227                  */
1228                 this_rq = rq;
1229         } else {
1230                 this_rq = cpu_rq(this_cpu);
1231
1232                 /*
1233                  * Not the local CPU - must adjust timestamp. This should
1234                  * get optimised away in the !CONFIG_SMP case.
1235                  */
1236                 p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
1237                                         + rq->timestamp_last_tick;
1238                 __activate_task(p, rq);
1239                 if (TASK_PREEMPTS_CURR(p, rq))
1240                         resched_task(rq->curr);
1241
1242                 /*
1243                  * Parent and child are on different CPUs, now get the
1244                  * parent runqueue to update the parent's ->sleep_avg:
1245                  */
1246                 task_rq_unlock(rq, &flags);
1247                 this_rq = task_rq_lock(current, &flags);
1248         }
1249         current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1250                 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1251         task_rq_unlock(this_rq, &flags);
1252 }
1253
1254 /*
1255  * Potentially available exiting-child timeslices are
1256  * retrieved here - this way the parent does not get
1257  * penalized for creating too many threads.
1258  *
1259  * (this cannot be used to 'generate' timeslices
1260  * artificially, because any timeslice recovered here
1261  * was given away by the parent in the first place.)
1262  */
1263 void fastcall sched_exit(task_t * p)
1264 {
1265         unsigned long flags;
1266         runqueue_t *rq;
1267
1268         /*
1269          * If the child was a (relative-) CPU hog then decrease
1270          * the sleep_avg of the parent as well.
1271          */
1272         rq = task_rq_lock(p->parent, &flags);
1273         if (p->first_time_slice) {
1274                 p->parent->time_slice += p->time_slice;
1275                 if (unlikely(p->parent->time_slice > task_timeslice(p)))
1276                         p->parent->time_slice = task_timeslice(p);
1277         }
1278         if (p->sleep_avg < p->parent->sleep_avg)
1279                 p->parent->sleep_avg = p->parent->sleep_avg /
1280                 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
1281                 (EXIT_WEIGHT + 1);
1282         task_rq_unlock(rq, &flags);
1283 }
1284
1285 /**
1286  * finish_task_switch - clean up after a task-switch
1287  * @prev: the thread we just switched away from.
1288  *
1289  * We enter this with the runqueue still locked, and finish_arch_switch()
1290  * will unlock it along with doing any other architecture-specific cleanup
1291  * actions.
1292  *
1293  * Note that we may have delayed dropping an mm in context_switch(). If
1294  * so, we finish that here outside of the runqueue lock.  (Doing it
1295  * with the lock held can cause deadlocks; see schedule() for
1296  * details.)
1297  */
1298 static inline void finish_task_switch(task_t *prev)
1299         __releases(rq->lock)
1300 {
1301         runqueue_t *rq = this_rq();
1302         struct mm_struct *mm = rq->prev_mm;
1303         unsigned long prev_task_flags;
1304
1305         rq->prev_mm = NULL;
1306
1307         /*
1308          * A task struct has one reference for the use as "current".
1309          * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and
1310          * calls schedule one last time. The schedule call will never return,
1311          * and the scheduled task must drop that reference.
1312          * The test for PF_DEAD must occur while the runqueue locks are
1313          * still held, otherwise prev could be scheduled on another cpu, die
1314          * there before we look at prev->flags, and then the reference would
1315          * be dropped twice.
1316          *              Manfred Spraul <manfred@colorfullife.com>
1317          */
1318         prev_task_flags = prev->flags;
1319         finish_arch_switch(rq, prev);
1320         if (mm)
1321                 mmdrop(mm);
1322         if (unlikely(prev_task_flags & PF_DEAD))
1323                 put_task_struct(prev);
1324 }
1325
1326 /**
1327  * schedule_tail - first thing a freshly forked thread must call.
1328  * @prev: the thread we just switched away from.
1329  */
1330 asmlinkage void schedule_tail(task_t *prev)
1331         __releases(rq->lock)
1332 {
1333         finish_task_switch(prev);
1334
1335         if (current->set_child_tid)
1336                 put_user(current->pid, current->set_child_tid);
1337 }
1338
1339 /*
1340  * context_switch - switch to the new MM and the new
1341  * thread's register state.
1342  */
1343 static inline
1344 task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
1345 {
1346         struct mm_struct *mm = next->mm;
1347         struct mm_struct *oldmm = prev->active_mm;
1348
1349         if (unlikely(!mm)) {
1350                 next->active_mm = oldmm;
1351                 atomic_inc(&oldmm->mm_count);
1352                 enter_lazy_tlb(oldmm, next);
1353         } else
1354                 switch_mm(oldmm, mm, next);
1355
1356         if (unlikely(!prev->mm)) {
1357                 prev->active_mm = NULL;
1358                 WARN_ON(rq->prev_mm);
1359                 rq->prev_mm = oldmm;
1360         }
1361
1362         /* Here we just switch the register state and the stack. */
1363         switch_to(prev, next, prev);
1364
1365         return prev;
1366 }
1367
1368 /*
1369  * nr_running, nr_uninterruptible and nr_context_switches:
1370  *
1371  * externally visible scheduler statistics: current number of runnable
1372  * threads, current number of uninterruptible-sleeping threads, total
1373  * number of context switches performed since bootup.
1374  */
1375 unsigned long nr_running(void)
1376 {
1377         unsigned long i, sum = 0;
1378
1379         for_each_online_cpu(i)
1380                 sum += cpu_rq(i)->nr_running;
1381
1382         return sum;
1383 }
1384
1385 unsigned long nr_uninterruptible(void)
1386 {
1387         unsigned long i, sum = 0;
1388
1389         for_each_cpu(i)
1390                 sum += cpu_rq(i)->nr_uninterruptible;
1391
1392         /*
1393          * Since we read the counters lockless, it might be slightly
1394          * inaccurate. Do not allow it to go below zero though:
1395          */
1396         if (unlikely((long)sum < 0))
1397                 sum = 0;
1398
1399         return sum;
1400 }
1401
1402 unsigned long long nr_context_switches(void)
1403 {
1404         unsigned long long i, sum = 0;
1405
1406         for_each_cpu(i)
1407                 sum += cpu_rq(i)->nr_switches;
1408
1409         return sum;
1410 }
1411
1412 unsigned long nr_iowait(void)
1413 {
1414         unsigned long i, sum = 0;
1415
1416         for_each_cpu(i)
1417                 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1418
1419         return sum;
1420 }
1421
1422 #ifdef CONFIG_SMP
1423
1424 /*
1425  * double_rq_lock - safely lock two runqueues
1426  *
1427  * Note this does not disable interrupts like task_rq_lock,
1428  * you need to do so manually before calling.
1429  */
1430 static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1431         __acquires(rq1->lock)
1432         __acquires(rq2->lock)
1433 {
1434         if (rq1 == rq2) {
1435                 spin_lock(&rq1->lock);
1436                 __acquire(rq2->lock);   /* Fake it out ;) */
1437         } else {
1438                 if (rq1 < rq2) {
1439                         spin_lock(&rq1->lock);
1440                         spin_lock(&rq2->lock);
1441                 } else {
1442                         spin_lock(&rq2->lock);
1443                         spin_lock(&rq1->lock);
1444                 }
1445         }
1446 }
1447
1448 /*
1449  * double_rq_unlock - safely unlock two runqueues
1450  *
1451  * Note this does not restore interrupts like task_rq_unlock,
1452  * you need to do so manually after calling.
1453  */
1454 static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
1455         __releases(rq1->lock)
1456         __releases(rq2->lock)
1457 {
1458         spin_unlock(&rq1->lock);
1459         if (rq1 != rq2)
1460                 spin_unlock(&rq2->lock);
1461         else
1462                 __release(rq2->lock);
1463 }
1464
1465 /*
1466  * double_lock_balance - lock the busiest runqueue; this_rq is locked already.
1467  */
1468 static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1469         __releases(this_rq->lock)
1470         __acquires(busiest->lock)
1471         __acquires(this_rq->lock)
1472 {
1473         if (unlikely(!spin_trylock(&busiest->lock))) {
1474                 if (busiest < this_rq) {
1475                         spin_unlock(&this_rq->lock);
1476                         spin_lock(&busiest->lock);
1477                         spin_lock(&this_rq->lock);
1478                 } else
1479                         spin_lock(&busiest->lock);
1480         }
1481 }
1482
1483 /*
1484  * find_idlest_cpu - find the least busy runqueue.
1485  */
1486 static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1487                            struct sched_domain *sd)
1488 {
1489         unsigned long load, min_load, this_load;
1490         int i, min_cpu;
1491         cpumask_t mask;
1492
1493         min_cpu = UINT_MAX;
1494         min_load = ULONG_MAX;
1495
1496         cpus_and(mask, sd->span, p->cpus_allowed);
1497
1498         for_each_cpu_mask(i, mask) {
1499                 load = target_load(i, sd->wake_idx);
1500
1501                 if (load < min_load) {
1502                         min_cpu = i;
1503                         min_load = load;
1504
1505                         /* break out early on an idle CPU: */
1506                         if (!min_load)
1507                                 break;
1508                 }
1509         }
1510
1511         /* add +1 to account for the new task */
1512         this_load = source_load(this_cpu, sd->wake_idx) + SCHED_LOAD_SCALE;
1513
1514         /*
1515          * With the addition of the new task to the current
1516          * CPU, would there be an imbalance between this CPU
1517          * and the idlest CPU?
1518          *
1519          * Use half of the balancing threshold - new-context is
1520          * a good opportunity to balance.
1521          */
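        /*
         * Numerically (assuming an imbalance_pct of 125): the factor is
         * 100 + 25/2 == 112, so with this_load == 256 - this CPU already
         * carrying one task's worth of load (128) plus the new task's
         * SCHED_LOAD_SCALE of 128 - an idlest CPU with min_load == 128
         * gives 128 * 112 < 256 * 100 and the task is placed on min_cpu.
         */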
1522         if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
1523                 return min_cpu;
1524
1525         return this_cpu;
1526 }
1527
1528 /*
1529  * If dest_cpu is allowed for this process, migrate the task to it.
1530  * This is accomplished by queueing a migration request and waking
1531  * the per-runqueue migration thread, then waiting for the request
1532  * to complete.
1533  */
1534 static void sched_migrate_task(task_t *p, int dest_cpu)
1535 {
1536         migration_req_t req;
1537         runqueue_t *rq;
1538         unsigned long flags;
1539
1540         rq = task_rq_lock(p, &flags);
1541         if (!cpu_isset(dest_cpu, p->cpus_allowed)
1542             || unlikely(cpu_is_offline(dest_cpu)))
1543                 goto out;
1544
1545         /* force the process onto the specified CPU */
1546         if (migrate_task(p, dest_cpu, &req)) {
1547                 /* Need to wait for migration thread (might exit: take ref). */
1548                 struct task_struct *mt = rq->migration_thread;
1549                 get_task_struct(mt);
1550                 task_rq_unlock(rq, &flags);
1551                 wake_up_process(mt);
1552                 put_task_struct(mt);
1553                 wait_for_completion(&req.done);
1554                 return;
1555         }
1556 out:
1557         task_rq_unlock(rq, &flags);
1558 }
1559
1560 /*
1561  * sched_exec(): find the highest-level, exec-balance-capable
1562  * domain and try to migrate the task to the least loaded CPU.
1563  *
1564  * execve() is a valuable balancing opportunity, because at this point
1565  * the task has the smallest effective memory and cache footprint.
1566  */
1567 void sched_exec(void)
1568 {
1569         struct sched_domain *tmp, *sd = NULL;
1570         int new_cpu, this_cpu = get_cpu();
1571
1572         /* Prefer the current CPU if there's only this task running */
1573         if (this_rq()->nr_running <= 1)
1574                 goto out;
1575
1576         for_each_domain(this_cpu, tmp)
1577                 if (tmp->flags & SD_BALANCE_EXEC)
1578                         sd = tmp;
1579
1580         if (sd) {
1581                 schedstat_inc(sd, sbe_attempts);
1582                 new_cpu = find_idlest_cpu(current, this_cpu, sd);
1583                 if (new_cpu != this_cpu) {
1584                         schedstat_inc(sd, sbe_pushed);
1585                         put_cpu();
1586                         sched_migrate_task(current, new_cpu);
1587                         return;
1588                 }
1589         }
1590 out:
1591         put_cpu();
1592 }
1593
1594 /*
1595  * pull_task - move a task from a remote runqueue to the local runqueue.
1596  * Both runqueues must be locked.
1597  */
1598 static inline
1599 void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1600                runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1601 {
1602         dequeue_task(p, src_array);
1603         src_rq->nr_running--;
1604         set_task_cpu(p, this_cpu);
1605         this_rq->nr_running++;
1606         enqueue_task(p, this_array);
1607         p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1608                                 + this_rq->timestamp_last_tick;
1609         /*
1610          * Note that idle threads have a prio of MAX_PRIO, so this test
1611          * is always true for them.
1612          */
1613         if (TASK_PREEMPTS_CURR(p, this_rq))
1614                 resched_task(this_rq->curr);
1615 }
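/*
 * The timestamp adjustment above rebases p->timestamp from the source
 * CPU's clock to the destination CPU's clock: sched_clock() readings
 * may not be comparable across CPUs, so only the offset relative to the
 * runqueue's last tick is preserved.  Illustratively, a task that last
 * ran 3ms before src_rq's last tick ends up stamped 3ms before
 * this_rq's last tick, and sleep-time accounting on the new CPU sees
 * the same 3ms gap.
 */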
1616
1617 /*
1618  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1619  */
1620 static inline
1621 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1622              struct sched_domain *sd, enum idle_type idle, int *all_pinned)
1623 {
1624         /*
1625          * We do not migrate tasks that:
1626          * 1) are running (obviously), or
1627          * 2) cannot be migrated to this CPU due to cpus_allowed, or
1628          * 3) are cache-hot on their current CPU.
1629          */
1630         if (!cpu_isset(this_cpu, p->cpus_allowed))
1631                 return 0;
1632         *all_pinned = 0;
1633
1634         if (task_running(rq, p))
1635                 return 0;
1636
1637         /*
1638          * Aggressive migration if:
1639          * 1) task is cache cold, or
1640          * 2) too many balance attempts have failed.
1641          */
1642
1643         if (sd->nr_balance_failed > sd->cache_nice_tries)
1644                 return 1;
1645
1646         if (task_hot(p, rq->timestamp_last_tick, sd))
1647                 return 0;
1648         return 1;
1649 }
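/*
 * Escalation behaviour of the checks above, with cache_nice_tries == 1
 * taken as an example: the first two consecutive failed balance
 * attempts still skip cache-hot tasks; once nr_balance_failed exceeds
 * cache_nice_tries (i.e. from the third attempt on) the hotness test is
 * bypassed and any non-running, non-pinned task may be pulled, until
 * load_balance() resets the failure counter.
 */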
1650
1651 /*
1652  * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
1653  * as part of a balancing operation within "domain". Returns the number of
1654  * tasks moved.
1655  *
1656  * Called with both runqueues locked.
1657  */
1658 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
1659                       unsigned long max_nr_move, struct sched_domain *sd,
1660                       enum idle_type idle, int *all_pinned)
1661 {
1662         prio_array_t *array, *dst_array;
1663         struct list_head *head, *curr;
1664         int idx, pulled = 0, pinned = 0;
1665         task_t *tmp;
1666
1667         if (max_nr_move == 0)
1668                 goto out;
1669
1670         pinned = 1;
1671
1672         /*
1673          * We first consider expired tasks. Those will likely not be
1674          * executed in the near future, and they are most likely to
1675          * be cache-cold, thus switching CPUs has the least effect
1676          * on them.
1677          */
1678         if (busiest->expired->nr_active) {
1679                 array = busiest->expired;
1680                 dst_array = this_rq->expired;
1681         } else {
1682                 array = busiest->active;
1683                 dst_array = this_rq->active;
1684         }
1685
1686 new_array:
1687         /* Start searching at priority 0: */
1688         idx = 0;
1689 skip_bitmap:
1690         if (!idx)
1691                 idx = sched_find_first_bit(array->bitmap);
1692         else
1693                 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1694         if (idx >= MAX_PRIO) {
1695                 if (array == busiest->expired && busiest->active->nr_active) {
1696                         array = busiest->active;
1697                         dst_array = this_rq->active;
1698                         goto new_array;
1699                 }
1700                 goto out;
1701         }
1702
1703         head = array->queue + idx;
1704         curr = head->prev;
1705 skip_queue:
1706         tmp = list_entry(curr, task_t, run_list);
1707
1708         curr = curr->prev;
1709
1710         if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
1711                 if (curr != head)
1712                         goto skip_queue;
1713                 idx++;
1714                 goto skip_bitmap;
1715         }
1716
1717 #ifdef CONFIG_SCHEDSTATS
1718         if (task_hot(tmp, busiest->timestamp_last_tick, sd))
1719                 schedstat_inc(sd, lb_hot_gained[idle]);
1720 #endif
1721
1722         pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
1723         pulled++;
1724
1725         /* We only want to steal up to the prescribed number of tasks. */
1726         if (pulled < max_nr_move) {
1727                 if (curr != head)
1728                         goto skip_queue;
1729                 idx++;
1730                 goto skip_bitmap;
1731         }
1732 out:
1733         /*
1734          * Right now, this is the only place pull_task() is called,
1735          * so we can safely collect pull_task() stats here rather than
1736          * inside pull_task().
1737          */
1738         schedstat_add(sd, lb_gained[idle], pulled);
1739
1740         if (all_pinned)
1741                 *all_pinned = pinned;
1742         return pulled;
1743 }
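/*
 * Scan order used above, summarised: the expired array is preferred
 * over the active one (its tasks are least likely to be cache-hot),
 * priorities are walked from 0 (highest) upwards, and within each
 * priority list candidates are taken from the tail, i.e. the tasks
 * that would be scheduled last at that priority on the busiest CPU.
 * Tasks rejected by can_migrate_task() are simply skipped, and the
 * walk stops once max_nr_move tasks have been pulled.
 */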
1744
1745 /*
1746  * find_busiest_group finds and returns the busiest CPU group within the
1747  * domain. It calculates and returns the number of tasks which should be
1748  * moved to restore balance via the imbalance parameter.
1749  */
1750 static struct sched_group *
1751 find_busiest_group(struct sched_domain *sd, int this_cpu,
1752                    unsigned long *imbalance, enum idle_type idle)
1753 {
1754         struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1755         unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1756         int load_idx;
1757
1758         max_load = this_load = total_load = total_pwr = 0;
1759         if (idle == NOT_IDLE)
1760                 load_idx = sd->busy_idx;
1761         else if (idle == NEWLY_IDLE)
1762                 load_idx = sd->newidle_idx;
1763         else
1764                 load_idx = sd->idle_idx;
1765
1766         do {
1767                 unsigned long load;
1768                 int local_group;
1769                 int i;
1770
1771                 local_group = cpu_isset(this_cpu, group->cpumask);
1772
1773                 /* Tally up the load of all CPUs in the group */
1774                 avg_load = 0;
1775
1776                 for_each_cpu_mask(i, group->cpumask) {
1777                         /* Bias balancing toward cpus of our domain */
1778                         if (local_group)
1779                                 load = target_load(i, load_idx);
1780                         else
1781                                 load = source_load(i, load_idx);
1782
1783                         avg_load += load;
1784                 }
1785
1786                 total_load += avg_load;
1787                 total_pwr += group->cpu_power;
1788
1789                 /* Adjust by relative CPU power of the group */
1790                 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1791
1792                 if (local_group) {
1793                         this_load = avg_load;
1794                         this = group;
1795                         goto nextgroup;
1796                 } else if (avg_load > max_load) {
1797                         max_load = avg_load;
1798                         busiest = group;
1799                 }
1800 nextgroup:
1801                 group = group->next;
1802         } while (group != sd->groups);
1803
1804         if (!busiest || this_load >= max_load)
1805                 goto out_balanced;
1806
1807         avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
1808
1809         if (this_load >= avg_load ||
1810                         100*max_load <= sd->imbalance_pct*this_load)
1811                 goto out_balanced;
1812
1813         /*
1814          * We're trying to get all the cpus to the average_load, so we don't
1815          * want to push ourselves above the average load, nor do we wish to
1816          * reduce the max loaded cpu below the average load, as either of these
1817          * actions would just result in more rebalancing later, and ping-pong
1818          * tasks around. Thus we look for the minimum possible imbalance.
1819          * Negative imbalances (*we* are more loaded than anyone else) will
1820          * be counted as no imbalance for these purposes -- we can't fix that
1821          * by pulling tasks to us.  Be careful of negative numbers as they'll
1822          * appear as very large values with unsigned longs.
1823          */
1824         /* How much load to actually move to equalise the imbalance */
1825         *imbalance = min((max_load - avg_load) * busiest->cpu_power,
1826                                 (avg_load - this_load) * this->cpu_power)
1827                         / SCHED_LOAD_SCALE;
1828
1829         if (*imbalance < SCHED_LOAD_SCALE) {
1830                 unsigned long pwr_now = 0, pwr_move = 0;
1831                 unsigned long tmp;
1832
1833                 if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
1834                         *imbalance = 1;
1835                         return busiest;
1836                 }
1837
1838                 /*
1839                  * OK, we don't have enough imbalance to justify moving tasks;
1840                  * however, we may be able to increase total CPU power used by
1841                  * moving them.
1842                  */
1843
1844                 pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
1845                 pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
1846                 pwr_now /= SCHED_LOAD_SCALE;
1847
1848                 /* Amount of load we'd subtract */
1849                 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
1850                 if (max_load > tmp)
1851                         pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
1852                                                         max_load - tmp);
1853
1854                 /* Amount of load we'd add */
1855                 if (max_load*busiest->cpu_power <
1856                                 SCHED_LOAD_SCALE*SCHED_LOAD_SCALE)
1857                         tmp = max_load*busiest->cpu_power/this->cpu_power;
1858                 else
1859                         tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
1860                 pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
1861                 pwr_move /= SCHED_LOAD_SCALE;
1862
1863                 /* Move if we gain throughput */
1864                 if (pwr_move <= pwr_now)
1865                         goto out_balanced;
1866
1867                 *imbalance = 1;
1868                 return busiest;
1869         }
1870
1871         /* Get rid of the scaling factor, rounding down as we divide */
1872         *imbalance = *imbalance / SCHED_LOAD_SCALE;
1873         return busiest;
1874
1875 out_balanced:
1876
1877         *imbalance = 0;
1878         return NULL;
1879 }
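/*
 * A worked example of the imbalance computation (illustrative numbers,
 * assuming SCHED_LOAD_SCALE == 128 and one CPU of cpu_power 128 per
 * group): the busiest group runs 4 tasks (load 512), the local group
 * runs 1 (load 128), so total_pwr == 256 and avg_load == 320.  Then
 *
 *	*imbalance = min((512 - 320) * 128, (320 - 128) * 128) / 128
 *	           = 192
 *
 * which is >= SCHED_LOAD_SCALE, so the small-imbalance special case is
 * skipped and, after the scale factor is divided out, the caller is
 * asked to move 192 / 128 = 1 task.  Moving one task leaves the groups
 * at 384 and 256; moving a second would push the local group above the
 * 320 average, which the min() above deliberately avoids.
 */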
1880
1881 /*
1882  * find_busiest_queue - find the busiest runqueue among the cpus in group.
1883  */
1884 static runqueue_t *find_busiest_queue(struct sched_group *group)
1885 {
1886         unsigned long load, max_load = 0;
1887         runqueue_t *busiest = NULL;
1888         int i;
1889
1890         for_each_cpu_mask(i, group->cpumask) {
1891                 load = source_load(i, 0);
1892
1893                 if (load > max_load) {
1894                         max_load = load;
1895                         busiest = cpu_rq(i);
1896                 }
1897         }
1898
1899         return busiest;
1900 }
1901
1902 /*
1903  * Check this_cpu to ensure it is balanced within domain. Attempt to move
1904  * tasks if there is an imbalance.
1905  *
1906  * Called with this_rq unlocked.
1907  */
1908 static int load_balance(int this_cpu, runqueue_t *this_rq,
1909                         struct sched_domain *sd, enum idle_type idle)
1910 {
1911         struct sched_group *group;
1912         runqueue_t *busiest;
1913         unsigned long imbalance;
1914         int nr_moved, all_pinned;
1915         int active_balance = 0;
1916
1917         spin_lock(&this_rq->lock);
1918         schedstat_inc(sd, lb_cnt[idle]);
1919
1920         group = find_busiest_group(sd, this_cpu, &imbalance, idle);
1921         if (!group) {
1922                 schedstat_inc(sd, lb_nobusyg[idle]);
1923                 goto out_balanced;
1924         }
1925
1926         busiest = find_busiest_queue(group);
1927         if (!busiest) {
1928                 schedstat_inc(sd, lb_nobusyq[idle]);
1929                 goto out_balanced;
1930         }
1931
1932         BUG_ON(busiest == this_rq);
1933
1934         schedstat_add(sd, lb_imbalance[idle], imbalance);
1935
1936         nr_moved = 0;
1937         if (busiest->nr_running > 1) {
1938                 /*
1939                  * Attempt to move tasks. If find_busiest_group has found
1940                  * an imbalance but busiest->nr_running <= 1, the group is
1941                  * still unbalanced. nr_moved simply stays zero, so it is
1942                  * correctly treated as an imbalance.
1943                  */
1944                 double_lock_balance(this_rq, busiest);
1945                 nr_moved = move_tasks(this_rq, this_cpu, busiest,
1946                                                 imbalance, sd, idle,
1947                                                 &all_pinned);
1948                 spin_unlock(&busiest->lock);
1949
1950                 /* All tasks on this runqueue were pinned by CPU affinity */
1951                 if (unlikely(all_pinned))
1952                         goto out_balanced;
1953         }
1954
1955         spin_unlock(&this_rq->lock);
1956
1957         if (!nr_moved) {
1958                 schedstat_inc(sd, lb_failed[idle]);
1959                 sd->nr_balance_failed++;
1960
1961                 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
1962
1963                         spin_lock(&busiest->lock);
1964                         if (!busiest->active_balance) {
1965                                 busiest->active_balance = 1;
1966                                 busiest->push_cpu = this_cpu;
1967                                 active_balance = 1;
1968                         }
1969                         spin_unlock(&busiest->lock);
1970                         if (active_balance)
1971                                 wake_up_process(busiest->migration_thread);
1972
1973                         /*
1974                          * We've kicked active balancing, reset the failure
1975                          * counter.
1976                          */
1977                         sd->nr_balance_failed = sd->cache_nice_tries+1;
1978                 }
1979         } else
1980                 sd->nr_balance_failed = 0;
1981
1982         if (likely(!active_balance)) {
1983                 /* We were unbalanced, so reset the balancing interval */
1984                 sd->balance_interval = sd->min_interval;
1985         } else {
1986                 /*
1987                  * If we've begun active balancing, start to back off. This
1988                  * case may not be covered by the all_pinned logic if there
1989                  * is only 1 task on the busy runqueue (because we don't call
1990                  * move_tasks).
1991                  */
1992                 if (sd->balance_interval < sd->max_interval)
1993                         sd->balance_interval *= 2;
1994         }
1995
1996         return nr_moved;
1997
1998 out_balanced:
1999         spin_unlock(&this_rq->lock);
2000
2001         schedstat_inc(sd, lb_balanced[idle]);
2002
2003         sd->nr_balance_failed = 0;
2004         /* tune up the balancing interval */
2005         if (sd->balance_interval < sd->max_interval)
2006                 sd->balance_interval *= 2;
2007
2008         return 0;
2009 }
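/*
 * Failure handling above, in numbers (taking cache_nice_tries == 1 as
 * an example): the first three consecutive attempts that move nothing
 * only increment nr_balance_failed; on the fourth (counter > 3) the
 * busiest runqueue's migration thread is woken to push a running task
 * away (see active_load_balance() below) and the counter is reset to 2,
 * so the very next failure does not immediately re-trigger it.  A
 * successful move resets the counter to 0, and the balance interval is
 * shortened back to min_interval unless active balancing was kicked.
 */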
2010
2011 /*
2012  * Check this_cpu to ensure it is balanced within domain. Attempt to move
2013  * tasks if there is an imbalance.
2014  *
2015  * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
2016  * this_rq is locked.
2017  */
2018 static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2019                                 struct sched_domain *sd)
2020 {
2021         struct sched_group *group;
2022         runqueue_t *busiest = NULL;
2023         unsigned long imbalance;
2024         int nr_moved = 0;
2025
2026         schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2027         group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
2028         if (!group) {
2029                 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2030                 goto out_balanced;
2031         }
2032
2033         busiest = find_busiest_queue(group);
2034         if (!busiest) {
2035                 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2036                 goto out_balanced;
2037         }
2038
2039         BUG_ON(busiest == this_rq);
2040
2041         /* Attempt to move tasks */
2042         double_lock_balance(this_rq, busiest);
2043
2044         schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
2045         nr_moved = move_tasks(this_rq, this_cpu, busiest,
2046                                         imbalance, sd, NEWLY_IDLE, NULL);
2047         if (!nr_moved)
2048                 schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
2049         else
2050                 sd->nr_balance_failed = 0;
2051
2052         spin_unlock(&busiest->lock);
2053         return nr_moved;
2054
2055 out_balanced:
2056         schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2057         sd->nr_balance_failed = 0;
2058         return 0;
2059 }
2060
2061 /*
2062  * idle_balance is called by schedule() if this_cpu is about to become
2063  * idle. Attempts to pull tasks from other CPUs.
2064  */
2065 static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
2066 {
2067         struct sched_domain *sd;
2068
2069         for_each_domain(this_cpu, sd) {
2070                 if (sd->flags & SD_BALANCE_NEWIDLE) {
2071                         if (load_balance_newidle(this_cpu, this_rq, sd)) {
2072                                 /* We've pulled tasks over so stop searching */
2073                                 break;
2074                         }
2075                 }
2076         }
2077 }
2078
2079 /*
2080  * active_load_balance is run by migration threads. It pushes running tasks
2081  * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2082  * running on each physical CPU where possible, and avoids physical /
2083  * logical imbalances.
2084  *
2085  * Called with busiest_rq locked.
2086  */
2087 static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
2088 {
2089         struct sched_domain *sd;
2090         runqueue_t *target_rq;
2091         int target_cpu = busiest_rq->push_cpu;
2092
2093         if (busiest_rq->nr_running <= 1)
2094                 /* no task to move */
2095                 return;
2096
2097         target_rq = cpu_rq(target_cpu);
2098
2099         /*
2100          * This condition is "impossible"; if it occurs
2101          * we need to fix it.  Originally reported by
2102          * Bjorn Helgaas on a 128-cpu setup.
2103          */
2104         BUG_ON(busiest_rq == target_rq);
2105
2106         /* move a task from busiest_rq to target_rq */
2107         double_lock_balance(busiest_rq, target_rq);
2108
2109         /* Search for an sd spanning us and the target CPU. */
2110         for_each_domain(target_cpu, sd)
2111                 if ((sd->flags & SD_LOAD_BALANCE) &&
2112                         cpu_isset(busiest_cpu, sd->span))
2113                                 break;
2114
2115         if (unlikely(sd == NULL))
2116                 goto out;
2117
2118         schedstat_inc(sd, alb_cnt);
2119
2120         if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
2121                 schedstat_inc(sd, alb_pushed);
2122         else
2123                 schedstat_inc(sd, alb_failed);
2124 out:
2125         spin_unlock(&target_rq->lock);
2126 }
2127
2128 /*
2129  * rebalance_tick will get called every timer tick, on every CPU.
2130  *
2131  * It checks each scheduling domain to see if it is due to be balanced,
2132  * and initiates a balancing operation if so.
2133  *
2134  * Balancing parameters are set up in arch_init_sched_domains.
2135  */
2136
2137 /* Don't have all balancing operations going off at once */
2138 #define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
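/*
 * Illustrative example of the staggering, assuming HZ == 1000 and
 * NR_CPUS == 4: the per-CPU offsets are 0, 250, 500 and 750 jiffies,
 * so the "j - sd->last_balance >= interval" checks below tend to fire
 * on different ticks for different CPUs rather than rebalancing every
 * runqueue in the same timer interrupt.
 */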
2139
2140 static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2141                            enum idle_type idle)
2142 {
2143         unsigned long old_load, this_load;
2144         unsigned long j = jiffies + CPU_OFFSET(this_cpu);
2145         struct sched_domain *sd;
2146         int i;
2147
2148         this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
2149         /* Update our load */
2150         for (i = 0; i < 3; i++) {
2151                 unsigned long new_load = this_load;
2152                 int scale = 1 << i;
2153                 old_load = this_rq->cpu_load[i];
2154                 /*
2155                  * Round up the averaging division if load is increasing. This
2156                  * prevents us from getting stuck on 9 if the load is 10, for
2157                  * example.
2158                  */
2159                 if (new_load > old_load)
2160                         new_load += scale-1;
2161                 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
2162         }
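        /*
         * Behaviour of the averaging above, by index: cpu_load[0]
         * (scale 1) tracks the instantaneous load, cpu_load[1]
         * (scale 2) closes half of the remaining gap each tick and
         * cpu_load[2] (scale 4) a quarter.  Worked example starting
         * from old_load == 0 with a new load of 128: after one tick
         * the three entries are 128, (0*1 + 129)/2 == 64 and
         * (0*3 + 131)/4 == 32 (the +1/+3 being the round-up for
         * rising load described above).
         */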
2163
2164         for_each_domain(this_cpu, sd) {
2165                 unsigned long interval;
2166
2167                 if (!(sd->flags & SD_LOAD_BALANCE))
2168                         continue;
2169
2170                 interval = sd->balance_interval;
2171                 if (idle != SCHED_IDLE)
2172                         interval *= sd->busy_factor;
2173
2174                 /* scale ms to jiffies */
2175                 interval = msecs_to_jiffies(interval);
2176                 if (unlikely(!interval))
2177                         interval = 1;
2178
2179                 if (j - sd->last_balance >= interval) {
2180                         if (load_balance(this_cpu, this_rq, sd, idle)) {
2181                                 /* We've pulled tasks over so no longer idle */
2182                                 idle = NOT_IDLE;
2183                         }
2184                         sd->last_balance += interval;
2185                 }
2186         }
2187 }
2188 #else
2189 /*
2190  * on UP we do not need to balance between CPUs:
2191  */
2192 static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
2193 {
2194 }
2195 static inline void idle_balance(int cpu, runqueue_t *rq)
2196 {
2197 }
2198 #endif
2199
2200 static inline int wake_priority_sleeper(runqueue_t *rq)
2201 {
2202         int ret = 0;
2203 #ifdef CONFIG_SCHED_SMT
2204         spin_lock(&rq->lock);
2205         /*
2206          * If an SMT sibling task has been put to sleep for priority
2207          * reasons reschedule the idle task to see if it can now run.
2208          */
2209         if (rq->nr_running) {
2210                 resched_task(rq->idle);
2211                 ret = 1;
2212         }
2213         spin_unlock(&rq->lock);
2214 #endif
2215         return ret;
2216 }
2217
2218 DEFINE_PER_CPU(struct kernel_stat, kstat);
2219
2220 EXPORT_PER_CPU_SYMBOL(kstat);
2221
2222 /*
2223  * This is called on clock ticks and on context switches.
2224  * Bank in p->sched_time the ns elapsed since the last tick or switch.
2225  */
2226 static inline void update_cpu_clock(task_t *p, runqueue_t *rq,
2227                                     unsigned long long now)
2228 {
2229         unsigned long long last = max(p->timestamp, rq->timestamp_last_tick);
2230         p->sched_time += now - last;
2231 }
2232
2233 /*
2234  * Return current->sched_time plus any more ns on the sched_clock
2235  * that have not yet been banked.
2236  */
2237 unsigned long long current_sched_time(const task_t *tsk)
2238 {
2239         unsigned long long ns;
2240         unsigned long flags;
2241         local_irq_save(flags);
2242         ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick);
2243         ns = tsk->sched_time + (sched_clock() - ns);
2244         local_irq_restore(flags);
2245         return ns;
2246 }
2247
2248 /*
2249  * We place interactive tasks back into the active array, if possible.
2250  *
2251  * To guarantee that this does not starve expired tasks we ignore the
2252  * interactivity of a task if the first expired task had to wait more
2253  * than a 'reasonable' amount of time. This deadline timeout is
2254  * load-dependent, as the frequency of array switched decreases with
2255  * load-dependent, as the frequency of array switches decreases with
2256  * an increasing number of running tasks. We also ignore the interactivity
2257  */
2258 #define EXPIRED_STARVING(rq) \
2259         ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
2260                 (jiffies - (rq)->expired_timestamp >= \
2261                         STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
2262                         ((rq)->curr->static_prio > (rq)->best_expired_prio))
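/*
 * Reading the macro above: the first clause trips once the oldest
 * expired task has waited more than roughly STARVATION_LIMIT jiffies
 * per runnable task (e.g. four runnable tasks allow a wait of about
 * 4 * STARVATION_LIMIT before interactive reinsertion is suppressed);
 * the second clause trips as soon as the running task has a worse
 * static priority than the best task already sitting in the expired
 * array.
 */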
2263
2264 /*
2265  * Account user cpu time to a process.
2266  * @p: the process that the cpu time gets accounted to
2268  * @cputime: the cpu time spent in user space since the last update
2269  */
2270 void account_user_time(struct task_struct *p, cputime_t cputime)
2271 {
2272         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2273         cputime64_t tmp;
2274
2275         p->utime = cputime_add(p->utime, cputime);
2276
2277         /* Add user time to cpustat. */
2278         tmp = cputime_to_cputime64(cputime);
2279         if (TASK_NICE(p) > 0)
2280                 cpustat->nice = cputime64_add(cpustat->nice, tmp);
2281         else
2282                 cpustat->user = cputime64_add(cpustat->user, tmp);
2283 }
2284
2285 /*
2286  * Account system cpu time to a process.
2287  * @p: the process that the cpu time gets accounted to
2288  * @hardirq_offset: the offset to subtract from hardirq_count()
2289  * @cputime: the cpu time spent in kernel space since the last update
2290  */
2291 void account_system_time(struct task_struct *p, int hardirq_offset,
2292                          cputime_t cputime)
2293 {
2294         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2295         runqueue_t *rq = this_rq();
2296         cputime64_t tmp;
2297
2298         p->stime = cputime_add(p->stime, cputime);
2299
2300         /* Add system time to cpustat. */
2301         tmp = cputime_to_cputime64(cputime);
2302         if (hardirq_count() - hardirq_offset)
2303                 cpustat->irq = cputime64_add(cpustat->irq, tmp);
2304         else if (softirq_count())
2305                 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
2306         else if (p != rq->idle)
2307                 cpustat->system = cputime64_add(cpustat->system, tmp);
2308         else if (atomic_read(&rq->nr_iowait) > 0)
2309                 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
2310         else
2311                 cpustat->idle = cputime64_add(cpustat->idle, tmp);
2312         /* Account for system time used */
2313         acct_update_integrals(p);
2314         /* Update rss highwater mark */
2315         update_mem_hiwater(p);
2316 }
2317
2318 /*
2319  * Account for involuntary wait time.
2320  * @p: the process from which the cpu time has been stolen
2321  * @steal: the cpu time spent in involuntary wait
2322  */
2323 void account_steal_time(struct task_struct *p, cputime_t steal)
2324 {
2325         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2326         cputime64_t tmp = cputime_to_cputime64(steal);
2327         runqueue_t *rq = this_rq();
2328
2329         if (p == rq->idle) {
2330                 p->stime = cputime_add(p->stime, steal);
2331                 if (atomic_read(&rq->nr_iowait) > 0)
2332                         cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
2333                 else
2334                         cpustat->idle = cputime64_add(cpustat->idle, tmp);
2335         } else
2336                 cpustat->steal = cputime64_add(cpustat->steal, tmp);
2337 }
2338
2339 /*
2340  * This function gets called by the timer code, with HZ frequency.
2341  * We call it with interrupts disabled.
2342  *
2343  * It also gets called by the fork code, when changing the parent's
2344  * timeslices.
2345  */
2346 void scheduler_tick(void)
2347 {
2348         int cpu = smp_processor_id();
2349         runqueue_t *rq = this_rq();
2350         task_t *p = current;
2351         unsigned long long now = sched_clock();
2352
2353         update_cpu_clock(p, rq, now);
2354
2355         rq->timestamp_last_tick = now;
2356
2357         if (p == rq->idle) {
2358                 if (wake_priority_sleeper(rq))
2359                         goto out;
2360                 rebalance_tick(cpu, rq, SCHED_IDLE);
2361                 return;
2362         }
2363
2364         /* Task might have expired already, but not scheduled off yet */
2365         if (p->array != rq->active) {
2366                 set_tsk_need_resched(p);
2367                 goto out;
2368         }
2369         spin_lock(&rq->lock);
2370         /*
2371          * The task was running during this tick - update the
2372          * time slice counter. Note: we do not update a thread's
2373          * priority until it either goes to sleep or uses up its
2374          * timeslice. This makes it possible for interactive tasks
2375          * to use up their timeslices at their highest priority levels.
2376          */
2377         if (rt_task(p)) {
2378                 /*
2379                  * RR tasks need a special form of timeslice management.
2380                  * FIFO tasks have no timeslices.
2381                  */
2382                 if ((p->policy == SCHED_RR) && !--p->time_slice) {
2383                         p->time_slice = task_timeslice(p);
2384                         p->first_time_slice = 0;
2385                         set_tsk_need_resched(p);
2386
2387                         /* put it at the end of the queue: */
2388                         requeue_task(p, rq->active);
2389                 }
2390                 goto out_unlock;
2391         }
2392         if (!--p->time_slice) {
2393                 dequeue_task(p, rq->active);
2394                 set_tsk_need_resched(p);
2395                 p->prio = effective_prio(p);
2396                 p->time_slice = task_timeslice(p);
2397                 p->first_time_slice = 0;
2398
2399                 if (!rq->expired_timestamp)
2400                         rq->expired_timestamp = jiffies;
2401                 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
2402                         enqueue_task(p, rq->expired);
2403                         if (p->static_prio < rq->best_expired_prio)
2404                                 rq->best_expired_prio = p->static_prio;
2405                 } else
2406                         enqueue_task(p, rq->active);
2407         } else {
2408                 /*
2409                  * Prevent a too long timeslice allowing a task to monopolize
2410                  * the CPU. We do this by splitting up the timeslice into
2411                  * smaller pieces.
2412                  *
2413                  * Note: this does not mean the task's timeslices expire or
2414                  * get lost in any way, they just might be preempted by
2415                  * another task of equal priority. (one with higher
2416                  * priority would have preempted this task already.) We
2417                  * requeue this task to the end of the list on this priority
2418                  * level, which is in essence a round-robin of tasks with
2419                  * equal priority.
2420                  *
2421                  * This only applies to tasks in the interactive
2422                  * delta range with at least TIMESLICE_GRANULARITY to requeue.
2423                  */
2424                 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
2425                         p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
2426                         (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
2427                         (p->array == rq->active)) {
2428
2429                         requeue_task(p, rq->active);
2430                         set_tsk_need_resched(p);
2431                 }
2432         }
2433 out_unlock:
2434         spin_unlock(&rq->lock);
2435 out:
2436         rebalance_tick(cpu, rq, NOT_IDLE);
2437 }
2438
2439 #ifdef CONFIG_SCHED_SMT
2440 static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2441 {
2442         struct sched_domain *sd = this_rq->sd;
2443         cpumask_t sibling_map;
2444         int i;
2445
2446         if (!(sd->flags & SD_SHARE_CPUPOWER))
2447                 return;
2448
2449         /*
2450          * Unlock the current runqueue because we have to lock in
2451          * CPU order to avoid deadlocks. Caller knows that we might
2452          * unlock. We keep IRQs disabled.
2453          */
2454         spin_unlock(&this_rq->lock);
2455
2456         sibling_map = sd->span;
2457
2458         for_each_cpu_mask(i, sibling_map)
2459                 spin_lock(&cpu_rq(i)->lock);
2460         /*
2461          * We clear this CPU from the mask. This both simplifies the
2462          * inner loop and keeps this_rq locked when we exit:
2463          */
2464         cpu_clear(this_cpu, sibling_map);
2465
2466         for_each_cpu_mask(i, sibling_map) {
2467                 runqueue_t *smt_rq = cpu_rq(i);
2468
2469                 /*
2470                  * If an SMT sibling task is sleeping due to priority
2471                  * reasons wake it up now.
2472                  */
2473                 if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
2474                         resched_task(smt_rq->idle);
2475         }
2476
2477         for_each_cpu_mask(i, sibling_map)
2478                 spin_unlock(&cpu_rq(i)->lock);
2479         /*
2480          * We exit with this_cpu's rq still held and IRQs
2481          * still disabled:
2482          */
2483 }
2484
2485 static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2486 {
2487         struct sched_domain *sd = this_rq->sd;
2488         cpumask_t sibling_map;
2489         prio_array_t *array;
2490         int ret = 0, i;
2491         task_t *p;
2492
2493         if (!(sd->flags & SD_SHARE_CPUPOWER))
2494                 return 0;
2495
2496         /*
2497          * The same locking rules and details apply as for
2498          * wake_sleeping_dependent():
2499          */
2500         spin_unlock(&this_rq->lock);
2501         sibling_map = sd->span;
2502         for_each_cpu_mask(i, sibling_map)
2503                 spin_lock(&cpu_rq(i)->lock);
2504         cpu_clear(this_cpu, sibling_map);
2505
2506         /*
2507          * Establish next task to be run - it might have gone away because
2508          * we released the runqueue lock above:
2509          */
2510         if (!this_rq->nr_running)
2511                 goto out_unlock;
2512         array = this_rq->active;
2513         if (!array->nr_active)
2514                 array = this_rq->expired;
2515         BUG_ON(!array->nr_active);
2516
2517         p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
2518                 task_t, run_list);
2519
2520         for_each_cpu_mask(i, sibling_map) {
2521                 runqueue_t *smt_rq = cpu_rq(i);
2522                 task_t *smt_curr = smt_rq->curr;
2523
2524                 /*
2525                  * If a user task with lower static priority than the
2526                  * running task on the SMT sibling is trying to schedule,
2527                  * delay it till there is proportionately less timeslice
2528                  * left of the sibling task to prevent a lower priority
2529                  * task from using an unfair proportion of the
2530                  * physical cpu's resources. -ck
2531                  */
2532                 if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
2533                         task_timeslice(p) || rt_task(smt_curr)) &&
2534                         p->mm && smt_curr->mm && !rt_task(p))
2535                                 ret = 1;
2536
2537                 /*
2538                  * Reschedule a lower priority task on the SMT sibling,
2539                  * or wake it up if it has been put to sleep for priority
2540                  * reasons.
2541                  */
2542                 if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) >
2543                         task_timeslice(smt_curr) || rt_task(p)) &&
2544                         smt_curr->mm && p->mm && !rt_task(smt_curr)) ||
2545                         (smt_curr == smt_rq->idle && smt_rq->nr_running))
2546                                 resched_task(smt_curr);
2547         }
2548 out_unlock:
2549         for_each_cpu_mask(i, sibling_map)
2550                 spin_unlock(&cpu_rq(i)->lock);
2551         return ret;
2552 }
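/*
 * A worked example of the throttling above, taking per_cpu_gain == 25
 * as an illustrative value: a non-RT task p is held back (ret = 1) when
 * 75% of the sibling task's remaining timeslice still exceeds p's full
 * timeslice, or when the sibling is running an RT task.  The symmetric
 * test then reschedules the sibling when the situation is reversed, and
 * the final clause reschedules the sibling's idle thread if runnable
 * tasks are queued behind it.
 */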
2553 #else
2554 static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2555 {
2556 }
2557
2558 static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2559 {
2560         return 0;
2561 }
2562 #endif
2563
2564 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
2565
2566 void fastcall add_preempt_count(int val)
2567 {
2568         /*
2569          * Underflow?
2570          */
2571         BUG_ON((preempt_count() < 0));
2572         preempt_count() += val;
2573         /*
2574          * Spinlock count overflowing soon?
2575          */
2576         BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
2577 }
2578 EXPORT_SYMBOL(add_preempt_count);
2579
2580 void fastcall sub_preempt_count(int val)
2581 {
2582         /*
2583          * Underflow?
2584          */
2585         BUG_ON(val > preempt_count());
2586         /*
2587          * Is the spinlock portion underflowing?
2588          */
2589         BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK));
2590         preempt_count() -= val;
2591 }
2592 EXPORT_SYMBOL(sub_preempt_count);
2593
2594 #endif
2595
2596 /*
2597  * schedule() is the main scheduler function.
2598  */
2599 asmlinkage void __sched schedule(void)
2600 {
2601         long *switch_count;
2602         task_t *prev, *next;
2603         runqueue_t *rq;
2604         prio_array_t *array;
2605         struct list_head *queue;
2606         unsigned long long now;
2607         unsigned long run_time;
2608         int cpu, idx;
2609
2610         /*
2611          * Test if we are atomic.  Since do_exit() needs to call into
2612          * schedule() atomically, we ignore that path for now.
2613          * Otherwise, whine if we are scheduling when we should not be.
2614          */
2615         if (likely(!current->exit_state)) {
2616                 if (unlikely(in_atomic())) {
2617                         printk(KERN_ERR "scheduling while atomic: "
2618                                 "%s/0x%08x/%d\n",
2619                                 current->comm, preempt_count(), current->pid);
2620                         dump_stack();
2621                 }
2622         }
2623         profile_hit(SCHED_PROFILING, __builtin_return_address(0));
2624
2625 need_resched:
2626         preempt_disable();
2627         prev = current;
2628         release_kernel_lock(prev);
2629 need_resched_nonpreemptible:
2630         rq = this_rq();
2631
2632         /*
2633          * The idle thread is not allowed to schedule!
2634          * Remove this check after it has been exercised a bit.
2635          */
2636         if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
2637                 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
2638                 dump_stack();
2639         }
2640
2641         schedstat_inc(rq, sched_cnt);
2642         now = sched_clock();
2643         if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
2644                 run_time = now - prev->timestamp;
2645                 if (unlikely((long long)(now - prev->timestamp) < 0))
2646                         run_time = 0;
2647         } else
2648                 run_time = NS_MAX_SLEEP_AVG;
2649
2650         /*
2651          * Tasks are charged proportionately less run_time at high sleep_avg
2652          * to delay them losing their interactive status.
2653          */
2654         run_time /= (CURRENT_BONUS(prev) ? : 1);
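        /*
         * Illustrative effect of the division above: a highly
         * interactive task (large CURRENT_BONUS) only has a small
         * fraction of the time it just ran charged against its
         * sleep_avg further below: a bonus of 10, for example, charges
         * one tenth of run_time, while a bonus of 0 (the "?:" fallback
         * of 1) charges it in full.
         */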
2655
2656         spin_lock_irq(&rq->lock);
2657
2658         if (unlikely(prev->flags & PF_DEAD))
2659                 prev->state = EXIT_DEAD;
2660
2661         switch_count = &prev->nivcsw;
2662         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2663                 switch_count = &prev->nvcsw;
2664                 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
2665                                 unlikely(signal_pending(prev))))
2666                         prev->state = TASK_RUNNING;
2667                 else {
2668                         if (prev->state == TASK_UNINTERRUPTIBLE)
2669                                 rq->nr_uninterruptible++;
2670                         deactivate_task(prev, rq);
2671                 }
2672         }
2673
2674         cpu = smp_processor_id();
2675         if (unlikely(!rq->nr_running)) {
2676 go_idle:
2677                 idle_balance(cpu, rq);
2678                 if (!rq->nr_running) {
2679                         next = rq->idle;
2680                         rq->expired_timestamp = 0;
2681                         wake_sleeping_dependent(cpu, rq);
2682                         /*
2683                          * wake_sleeping_dependent() might have released
2684                          * the runqueue lock, so only switch to the idle
2685                          * task if no new tasks showed up meanwhile:
2686                          */
2687                         if (!rq->nr_running)
2688                                 goto switch_tasks;
2689                 }
2690         } else {
2691                 if (dependent_sleeper(cpu, rq)) {
2692                         next = rq->idle;
2693                         goto switch_tasks;
2694                 }
2695                 /*
2696                  * dependent_sleeper() releases and reacquires the runqueue
2697                  * lock, hence go into the idle loop if the rq went
2698                  * empty meanwhile:
2699                  */
2700                 if (unlikely(!rq->nr_running))
2701                         goto go_idle;
2702         }
2703
2704         array = rq->active;
2705         if (unlikely(!array->nr_active)) {
2706                 /*
2707                  * Switch the active and expired arrays.
2708                  */
2709                 schedstat_inc(rq, sched_switch);
2710                 rq->active = rq->expired;
2711                 rq->expired = array;
2712                 array = rq->active;
2713                 rq->expired_timestamp = 0;
2714                 rq->best_expired_prio = MAX_PRIO;
2715         }
2716
2717         idx = sched_find_first_bit(array->bitmap);
2718         queue = array->queue + idx;
2719         next = list_entry(queue->next, task_t, run_list);
2720
2721         if (!rt_task(next) && next->activated > 0) {
2722                 unsigned long long delta = now - next->timestamp;
2723                 if (unlikely((long long)(now - next->timestamp) < 0))
2724                         delta = 0;
2725
2726                 if (next->activated == 1)
2727                         delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
2728
2729                 array = next->array;
2730                 dequeue_task(next, array);
2731                 recalc_task_prio(next, next->timestamp + delta);
2732                 enqueue_task(next, array);
2733         }
2734         next->activated = 0;
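        /*
         * The block above converts the time 'next' spent waiting on the
         * runqueue into sleep credit before it starts running: for
         * wakeups flagged activated == 1 the waiting period is first
         * scaled down to roughly ON_RUNQUEUE_WEIGHT percent (about 30%
         * for a weight of 30, used here as an example), other positive
         * activated values keep the full period, and recalc_task_prio()
         * then treats the resulting delta as slept time.
         */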
2735 switch_tasks:
2736         if (next == rq->idle)
2737                 schedstat_inc(rq, sched_goidle);
2738         prefetch(next);
2739         clear_tsk_need_resched(prev);
2740         rcu_qsctr_inc(task_cpu(prev));
2741
2742         update_cpu_clock(prev, rq, now);
2743
2744         prev->sleep_avg -= run_time;
2745         if ((long)prev->sleep_avg <= 0)
2746                 prev->sleep_avg = 0;
2747         prev->timestamp = prev->last_ran = now;
2748
2749         sched_info_switch(prev, next);
2750         if (likely(prev != next)) {
2751                 next->timestamp = now;
2752                 rq->nr_switches++;
2753                 rq->curr = next;
2754                 ++*switch_count;
2755
2756                 prepare_arch_switch(rq, next);
2757                 prev = context_switch(rq, prev, next);
2758                 barrier();
2759
2760                 finish_task_switch(prev);
2761         } else
2762                 spin_unlock_irq(&rq->lock);
2763
2764         prev = current;
2765         if (unlikely(reacquire_kernel_lock(prev) < 0))
2766                 goto need_resched_nonpreemptible;
2767         preempt_enable_no_resched();
2768         if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2769                 goto need_resched;
2770 }
2771
2772 EXPORT_SYMBOL(schedule);
2773
2774 #ifdef CONFIG_PREEMPT
2775 /*
2776  * this is the entry point to schedule() from in-kernel preemption
2777  * off of preempt_enable.  Kernel preemptions off return from interrupt
2778  * occur there and call schedule directly.
2779  */
2780 asmlinkage void __sched preempt_schedule(void)
2781 {
2782         struct thread_info *ti = current_thread_info();
2783 #ifdef CONFIG_PREEMPT_BKL
2784         struct task_struct *task = current;
2785         int saved_lock_depth;
2786 #endif
2787         /*
2788          * If there is a non-zero preempt_count or interrupts are disabled,
2789          * we do not want to preempt the current task.  Just return..
2790          * we do not want to preempt the current task.  Just return.
2791         if (unlikely(ti->preempt_count || irqs_disabled()))
2792                 return;
2793
2794 need_resched:
2795         add_preempt_count(PREEMPT_ACTIVE);
2796         /*
2797          * We keep the big kernel semaphore locked, but we
2798          * clear ->lock_depth so that schedule() doesn't
2799          * auto-release the semaphore:
2800          */
2801 #ifdef CONFIG_PREEMPT_BKL
2802         saved_lock_depth = task->lock_depth;
2803         task->lock_depth = -1;
2804 #endif
2805         schedule();
2806 #ifdef CONFIG_PREEMPT_BKL
2807         task->lock_depth = saved_lock_depth;
2808 #endif
2809         sub_preempt_count(PREEMPT_ACTIVE);
2810
2811         /* we could miss a preemption opportunity between schedule and now */
2812         barrier();
2813         if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2814                 goto need_resched;
2815 }
2816
2817 EXPORT_SYMBOL(preempt_schedule);
2818
2819 /*
2820  * this is the entry point to schedule() from kernel preemption
2821  * off of irq context.
2822  * Note that this is called and returns with irqs disabled. This will
2823  * protect us against recursive calling from irq.
2824  */
2825 asmlinkage void __sched preempt_schedule_irq(void)
2826 {
2827         struct thread_info *ti = current_thread_info();
2828 #ifdef CONFIG_PREEMPT_BKL
2829         struct task_struct *task = current;
2830         int saved_lock_depth;
2831 #endif
2832         /* Catch callers which need to be fixed */
2833         BUG_ON(ti->preempt_count || !irqs_disabled());
2834
2835 need_resched:
2836         add_preempt_count(PREEMPT_ACTIVE);
2837         /*
2838          * We keep the big kernel semaphore locked, but we
2839          * clear ->lock_depth so that schedule() doesn't
2840          * auto-release the semaphore:
2841          */
2842 #ifdef CONFIG_PREEMPT_BKL
2843         saved_lock_depth = task->lock_depth;
2844         task->lock_depth = -1;
2845 #endif
2846         local_irq_enable();
2847         schedule();
2848         local_irq_disable();
2849 #ifdef CONFIG_PREEMPT_BKL
2850         task->lock_depth = saved_lock_depth;
2851 #endif
2852         sub_preempt_count(PREEMPT_ACTIVE);
2853
2854         /* we could miss a preemption opportunity between schedule and now */
2855         barrier();
2856         if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2857                 goto need_resched;
2858 }
2859
2860 #endif /* CONFIG_PREEMPT */
2861
2862 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
2863 {
2864         task_t *p = curr->private;
2865         return try_to_wake_up(p, mode, sync);
2866 }
2867
2868 EXPORT_SYMBOL(default_wake_function);
2869
2870 /*
2871  * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
2872  * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
2873  * number) then we wake all the non-exclusive tasks and one exclusive task.
2874  *
2875  * There are circumstances in which we can try to wake a task which has already
2876  * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
2877  * zero in this (rare) case, and we handle it by continuing to scan the queue.
2878  */
2879 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
2880                              int nr_exclusive, int sync, void *key)
2881 {
2882         struct list_head *tmp, *next;
2883
2884         list_for_each_safe(tmp, next, &q->task_list) {
2885                 wait_queue_t *curr;
2886                 unsigned flags;
2887                 curr = list_entry(tmp, wait_queue_t, task_list);
2888                 flags = curr->flags;
2889                 if (curr->func(curr, mode, sync, key) &&
2890                     (flags & WQ_FLAG_EXCLUSIVE) &&
2891                     !--nr_exclusive)
2892                         break;
2893         }
2894 }
2895
2896 /**
2897  * __wake_up - wake up threads blocked on a waitqueue.
2898  * @q: the waitqueue
2899  * @mode: which threads
2900  * @nr_exclusive: how many wake-one or wake-many threads to wake up
2901  * @key: is directly passed to the wakeup function
2902  */
2903 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
2904                                 int nr_exclusive, void *key)
2905 {
2906         unsigned long flags;
2907
2908         spin_lock_irqsave(&q->lock, flags);
2909         __wake_up_common(q, mode, nr_exclusive, 0, key);
2910         spin_unlock_irqrestore(&q->lock, flags);
2911 }
2912
2913 EXPORT_SYMBOL(__wake_up);
2914
2915 /*
2916  * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
2917  */
2918 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
2919 {
2920         __wake_up_common(q, mode, 1, 0, NULL);
2921 }
2922
2923 /**
2924  * __wake_up_sync - wake up threads blocked on a waitqueue.
2925  * @q: the waitqueue
2926  * @mode: which threads
2927  * @nr_exclusive: how many wake-one or wake-many threads to wake up
2928  *
2929  * The sync wakeup differs that the waker knows that it will schedule
2930  * away soon, so while the target thread will be woken up, it will not
2931  * be migrated to another CPU - ie. the two threads are 'synchronized'
2932  * with each other. This can prevent needless bouncing between CPUs.
2933  *
2934  * On UP it can prevent extra preemption.
2935  */
2936 void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
2937 {
2938         unsigned long flags;
2939         int sync = 1;
2940
2941         if (unlikely(!q))
2942                 return;
2943
2944         if (unlikely(!nr_exclusive))
2945                 sync = 0;
2946
2947         spin_lock_irqsave(&q->lock, flags);
2948         __wake_up_common(q, mode, nr_exclusive, sync, NULL);
2949         spin_unlock_irqrestore(&q->lock, flags);
2950 }
2951 EXPORT_SYMBOL_GPL(__wake_up_sync);      /* For internal use only */
2952
2953 void fastcall complete(struct completion *x)
2954 {
2955         unsigned long flags;
2956
2957         spin_lock_irqsave(&x->wait.lock, flags);
2958         x->done++;
2959         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
2960                          1, 0, NULL);
2961         spin_unlock_irqrestore(&x->wait.lock, flags);
2962 }
2963 EXPORT_SYMBOL(complete);
2964
2965 void fastcall complete_all(struct completion *x)
2966 {
2967         unsigned long flags;
2968
2969         spin_lock_irqsave(&x->wait.lock, flags);
2970         x->done += UINT_MAX/2;
2971         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
2972                          0, 0, NULL);
2973         spin_unlock_irqrestore(&x->wait.lock, flags);
2974 }
2975 EXPORT_SYMBOL(complete_all);
2976
2977 void fastcall __sched wait_for_completion(struct completion *x)
2978 {
2979         might_sleep();
2980         spin_lock_irq(&x->wait.lock);
2981         if (!x->done) {
2982                 DECLARE_WAITQUEUE(wait, current);
2983
2984                 wait.flags |= WQ_FLAG_EXCLUSIVE;
2985                 __add_wait_queue_tail(&x->wait, &wait);
2986                 do {
2987                         __set_current_state(TASK_UNINTERRUPTIBLE);
2988                         spin_unlock_irq(&x->wait.lock);
2989                         schedule();
2990                         spin_lock_irq(&x->wait.lock);
2991                 } while (!x->done);
2992                 __remove_wait_queue(&x->wait, &wait);
2993         }
2994         x->done--;
2995         spin_unlock_irq(&x->wait.lock);
2996 }
2997 EXPORT_SYMBOL(wait_for_completion);
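/*
 * Typical usage of the completion API exercised above, as a minimal
 * illustrative sketch (the names are made up):
 *
 *	static DECLARE_COMPLETION(setup_done);
 *
 *	waiter:		wait_for_completion(&setup_done);
 *	event source:	complete(&setup_done);
 *
 * complete() wakes at most one exclusive waiter per call, while
 * complete_all() above releases every current and future waiter.
 */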
2998
2999 unsigned long fastcall __sched
3000 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3001 {
3002         might_sleep();
3003
3004         spin_lock_irq(&x->wait.lock);
3005         if (!x->done) {
3006                 DECLARE_WAITQUEUE(wait, current);
3007
3008                 wait.flags |= WQ_FLAG_EXCLUSIVE;
3009                 __add_wait_queue_tail(&x->wait, &wait);
3010                 do {
3011                         __set_current_state(TASK_UNINTERRUPTIBLE);
3012                         spin_unlock_irq(&x->wait.lock);
3013                         timeout = schedule_timeout(timeout);
3014                         spin_lock_irq(&x->wait.lock);
3015                         if (!timeout) {
3016                                 __remove_wait_queue(&x->wait, &wait);
3017                                 goto out;
3018                         }
3019                 } while (!x->done);
3020                 __remove_wait_queue(&x->wait, &wait);
3021         }
3022         x->done--;
3023 out:
3024         spin_unlock_irq(&x->wait.lock);
3025         return timeout;
3026 }
3027 EXPORT_SYMBOL(wait_for_completion_timeout);
3028
3029 int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3030 {
3031         int ret = 0;
3032
3033         might_sleep();
3034
3035         spin_lock_irq(&x->wait.lock);
3036         if (!x->done) {
3037                 DECLARE_WAITQUEUE(wait, current);
3038
3039                 wait.flags |= WQ_FLAG_EXCLUSIVE;
3040                 __add_wait_queue_tail(&x->wait, &wait);
3041                 do {
3042                         if (signal_pending(current)) {
3043                                 ret = -ERESTARTSYS;
3044                                 __remove_wait_queue(&x->wait, &wait);
3045                                 goto out;
3046                         }
3047                         __set_current_state(TASK_INTERRUPTIBLE);
3048                         spin_unlock_irq(&x->wait.lock);
3049                         schedule();
3050                         spin_lock_irq(&x->wait.lock);
3051                 } while (!x->done);
3052                 __remove_wait_queue(&x->wait, &wait);
3053         }
3054         x->done--;
3055 out:
3056         spin_unlock_irq(&x->wait.lock);
3057
3058         return ret;
3059 }
3060 EXPORT_SYMBOL(wait_for_completion_interruptible);
3061
3062 unsigned long fastcall __sched
3063 wait_for_completion_interruptible_timeout(struct completion *x,
3064                                           unsigned long timeout)
3065 {
3066         might_sleep();
3067
3068         spin_lock_irq(&x->wait.lock);
3069         if (!x->done) {
3070                 DECLARE_WAITQUEUE(wait, current);
3071
3072                 wait.flags |= WQ_FLAG_EXCLUSIVE;
3073                 __add_wait_queue_tail(&x->wait, &wait);
3074                 do {
3075                         if (signal_pending(current)) {
3076                                 timeout = -ERESTARTSYS;
3077                                 __remove_wait_queue(&x->wait, &wait);
3078                                 goto out;
3079                         }
3080                         __set_current_state(TASK_INTERRUPTIBLE);
3081                         spin_unlock_irq(&x->wait.lock);
3082                         timeout = schedule_timeout(timeout);
3083                         spin_lock_irq(&x->wait.lock);
3084                         if (!timeout) {
3085                                 __remove_wait_queue(&x->wait, &wait);
3086                                 goto out;
3087                         }
3088                 } while (!x->done);
3089                 __remove_wait_queue(&x->wait, &wait);
3090         }
3091         x->done--;
3092 out:
3093         spin_unlock_irq(&x->wait.lock);
3094         return timeout;
3095 }
3096 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3097
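/*
 * Illustrative only, not part of this file: a minimal, hedged sketch of
 * how the completion API above is typically used - one context signals
 * that work is finished, another blocks until then.  The my_* names are
 * hypothetical; the block is deliberately kept under #if 0.
 */
#if 0
static DECLARE_COMPLETION(my_work_done);

static int my_worker(void *unused)
{
        /* ... do the actual work ... */
        complete(&my_work_done);                /* wakes exactly one waiter */
        return 0;
}

static void my_wait_for_worker(void)
{
        /* sleeps in TASK_UNINTERRUPTIBLE until complete() is called */
        wait_for_completion(&my_work_done);
}
#endif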
3098
3099 #define SLEEP_ON_VAR                                    \
3100         unsigned long flags;                            \
3101         wait_queue_t wait;                              \
3102         init_waitqueue_entry(&wait, current);
3103
3104 #define SLEEP_ON_HEAD                                   \
3105         spin_lock_irqsave(&q->lock,flags);              \
3106         __add_wait_queue(q, &wait);                     \
3107         spin_unlock(&q->lock);
3108
3109 #define SLEEP_ON_TAIL                                   \
3110         spin_lock_irq(&q->lock);                        \
3111         __remove_wait_queue(q, &wait);                  \
3112         spin_unlock_irqrestore(&q->lock, flags);
3113
3114 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3115 {
3116         SLEEP_ON_VAR
3117
3118         current->state = TASK_INTERRUPTIBLE;
3119
3120         SLEEP_ON_HEAD
3121         schedule();
3122         SLEEP_ON_TAIL
3123 }
3124
3125 EXPORT_SYMBOL(interruptible_sleep_on);
3126
3127 long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3128 {
3129         SLEEP_ON_VAR
3130
3131         current->state = TASK_INTERRUPTIBLE;
3132
3133         SLEEP_ON_HEAD
3134         timeout = schedule_timeout(timeout);
3135         SLEEP_ON_TAIL
3136
3137         return timeout;
3138 }
3139
3140 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3141
3142 void fastcall __sched sleep_on(wait_queue_head_t *q)
3143 {
3144         SLEEP_ON_VAR
3145
3146         current->state = TASK_UNINTERRUPTIBLE;
3147
3148         SLEEP_ON_HEAD
3149         schedule();
3150         SLEEP_ON_TAIL
3151 }
3152
3153 EXPORT_SYMBOL(sleep_on);
3154
3155 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3156 {
3157         SLEEP_ON_VAR
3158
3159         current->state = TASK_UNINTERRUPTIBLE;
3160
3161         SLEEP_ON_HEAD
3162         timeout = schedule_timeout(timeout);
3163         SLEEP_ON_TAIL
3164
3165         return timeout;
3166 }
3167
3168 EXPORT_SYMBOL(sleep_on_timeout);
3169
3170 void set_user_nice(task_t *p, long nice)
3171 {
3172         unsigned long flags;
3173         prio_array_t *array;
3174         runqueue_t *rq;
3175         int old_prio, new_prio, delta;
3176
3177         if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3178                 return;
3179         /*
3180          * We have to be careful: if called from sys_setpriority(),
3181          * the task might be in the middle of scheduling on another CPU.
3182          */
3183         rq = task_rq_lock(p, &flags);
3184         /*
3185          * The RT priorities are set via sched_setscheduler(), but we still
3186          * allow the 'normal' nice value to be set - but as expected
3187          * it won't have any effect on scheduling as long as the task
3188          * is not SCHED_NORMAL:
3189          */
3190         if (rt_task(p)) {
3191                 p->static_prio = NICE_TO_PRIO(nice);
3192                 goto out_unlock;
3193         }
3194         array = p->array;
3195         if (array)
3196                 dequeue_task(p, array);
3197
3198         old_prio = p->prio;
3199         new_prio = NICE_TO_PRIO(nice);
3200         delta = new_prio - old_prio;
3201         p->static_prio = NICE_TO_PRIO(nice);
3202         p->prio += delta;
3203
3204         if (array) {
3205                 enqueue_task(p, array);
3206                 /*
3207                  * If the task increased its priority or is running and
3208                  * lowered its priority, then reschedule its CPU:
3209                  */
3210                 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3211                         resched_task(rq->curr);
3212         }
3213 out_unlock:
3214         task_rq_unlock(rq, &flags);
3215 }
3216
3217 EXPORT_SYMBOL(set_user_nice);
3218
3219 /*
3220  * can_nice - check if a task can reduce its nice value
3221  * @p: task
3222  * @nice: nice value
3223  */
3224 int can_nice(const task_t *p, const int nice)
3225 {
3226         /* convert nice value [19,-20] to rlimit style value [0,39] */
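        /* e.g. nice == 19 -> nice_rlim == 0, nice == -20 -> nice_rlim == 39 */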
3227         int nice_rlim = 19 - nice;
3228         return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3229                 capable(CAP_SYS_NICE));
3230 }
3231
3232 #ifdef __ARCH_WANT_SYS_NICE
3233
3234 /*
3235  * sys_nice - change the priority of the current process.
3236  * @increment: priority increment
3237  *
3238  * sys_setpriority is a more generic, but much slower function that
3239  * does similar things.
3240  */
3241 asmlinkage long sys_nice(int increment)
3242 {
3243         int retval;
3244         long nice;
3245
3246         /*
3247          * Setpriority might change our priority at the same moment.
3248          * We don't have to worry. Conceptually one call occurs first
3249          * and we have a single winner.
3250          */
3251         if (increment < -40)
3252                 increment = -40;
3253         if (increment > 40)
3254                 increment = 40;
3255
3256         nice = PRIO_TO_NICE(current->static_prio) + increment;
3257         if (nice < -20)
3258                 nice = -20;
3259         if (nice > 19)
3260                 nice = 19;
3261
3262         if (increment < 0 && !can_nice(current, nice))
3263                 return -EPERM;
3264
3265         retval = security_task_setnice(current, nice);
3266         if (retval)
3267                 return retval;
3268
3269         set_user_nice(current, nice);
3270         return 0;
3271 }
3272
3273 #endif
3274
3275 /**
3276  * task_prio - return the priority value of a given task.
3277  * @p: the task in question.
3278  *
3279  * This is the priority value as seen by users in /proc.
3280  * RT tasks are offset by -200. Normal tasks are centered
3281  * around 0, value goes from -16 to +15.
3282  */
3283 int task_prio(const task_t *p)
3284 {
3285         return p->prio - MAX_RT_PRIO;
3286 }
3287
3288 /**
3289  * task_nice - return the nice value of a given task.
3290  * @p: the task in question.
3291  */
3292 int task_nice(const task_t *p)
3293 {
3294         return TASK_NICE(p);
3295 }
3296
3297 /*
3298  * The only users of task_nice are binfmt_elf and binfmt_elf32.
3299  * binfmt_elf is no longer modular, but binfmt_elf32 still is.
3300  * Therefore, task_nice is needed if there is a compat_mode.
3301  */
3302 #ifdef CONFIG_COMPAT
3303 EXPORT_SYMBOL_GPL(task_nice);
3304 #endif
3305
3306 /**
3307  * idle_cpu - is a given cpu idle currently?
3308  * @cpu: the processor in question.
3309  */
3310 int idle_cpu(int cpu)
3311 {
3312         return cpu_curr(cpu) == cpu_rq(cpu)->idle;
3313 }
3314
3315 EXPORT_SYMBOL_GPL(idle_cpu);
3316
3317 /**
3318  * idle_task - return the idle task for a given cpu.
3319  * @cpu: the processor in question.
3320  */
3321 task_t *idle_task(int cpu)
3322 {
3323         return cpu_rq(cpu)->idle;
3324 }
3325
3326 /**
3327  * find_process_by_pid - find a process with a matching PID value.
3328  * @pid: the pid in question.
3329  */
3330 static inline task_t *find_process_by_pid(pid_t pid)
3331 {
3332         return pid ? find_task_by_pid(pid) : current;
3333 }
3334
3335 /* Actually do priority change: must hold rq lock. */
3336 static void __setscheduler(struct task_struct *p, int policy, int prio)
3337 {
3338         BUG_ON(p->array);
3339         p->policy = policy;
3340         p->rt_priority = prio;
3341         if (policy != SCHED_NORMAL)
3342                 p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
3343         else
3344                 p->prio = p->static_prio;
3345 }
3346
3347 /**
3348  * sched_setscheduler - change the scheduling policy and/or RT priority of
3349  * a thread.
3350  * @p: the task in question.
3351  * @policy: new policy.
3352  * @param: structure containing the new RT priority.
3353  */
3354 int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param)
3355 {
3356         int retval;
3357         int oldprio, oldpolicy = -1;
3358         prio_array_t *array;
3359         unsigned long flags;
3360         runqueue_t *rq;
3361
3362 recheck:
3363         /* double check policy once rq lock held */
3364         if (policy < 0)
3365                 policy = oldpolicy = p->policy;
3366         else if (policy != SCHED_FIFO && policy != SCHED_RR &&
3367                                 policy != SCHED_NORMAL)
3368                         return -EINVAL;
3369         /*
3370          * Valid priorities for SCHED_FIFO and SCHED_RR are
3371          * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
3372          */
3373         if (param->sched_priority < 0 ||
3374             param->sched_priority > MAX_USER_RT_PRIO-1)
3375                 return -EINVAL;
3376         if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
3377                 return -EINVAL;
3378
3379         if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
3380             param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur &&
3381             !capable(CAP_SYS_NICE))
3382                 return -EPERM;
3383         if ((current->euid != p->euid) && (current->euid != p->uid) &&
3384             !capable(CAP_SYS_NICE))
3385                 return -EPERM;
3386
3387         retval = security_task_setscheduler(p, policy, param);
3388         if (retval)
3389                 return retval;
3390         /*
3391          * To be able to change p->policy safely, the appropriate
3392          * runqueue lock must be held.
3393          */
3394         rq = task_rq_lock(p, &flags);
3395         /* recheck policy now with rq lock held */
3396         if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3397                 policy = oldpolicy = -1;
3398                 task_rq_unlock(rq, &flags);
3399                 goto recheck;
3400         }
3401         array = p->array;
3402         if (array)
3403                 deactivate_task(p, rq);
3404         oldprio = p->prio;
3405         __setscheduler(p, policy, param->sched_priority);
3406         if (array) {
3407                 __activate_task(p, rq);
3408                 /*
3409                  * Reschedule if we are currently running on this runqueue and
3410                  * our priority decreased, or if we are not currently running on
3411                  * this runqueue and our priority is higher than the current's
3412                  */
3413                 if (task_running(rq, p)) {
3414                         if (p->prio > oldprio)
3415                                 resched_task(rq->curr);
3416                 } else if (TASK_PREEMPTS_CURR(p, rq))
3417                         resched_task(rq->curr);
3418         }
3419         task_rq_unlock(rq, &flags);
3420         return 0;
3421 }
3422 EXPORT_SYMBOL_GPL(sched_setscheduler);
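/*
 * Illustrative only, not part of this file: a hedged sketch of an
 * in-kernel user of the exported API above, switching one of its own
 * kernel threads to a real-time policy.  "my_kthread" and the function
 * name are hypothetical.
 */
#if 0
static void my_make_kthread_rt(struct task_struct *my_kthread)
{
        struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO - 1 };

        if (sched_setscheduler(my_kthread, SCHED_FIFO, &param))
                printk(KERN_WARNING "my_make_kthread_rt: SCHED_FIFO failed\n");
}
#endif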
3423
3424 static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3425 {
3426         int retval;
3427         struct sched_param lparam;
3428         struct task_struct *p;
3429
3430         if (!param || pid < 0)
3431                 return -EINVAL;
3432         if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
3433                 return -EFAULT;
3434         read_lock_irq(&tasklist_lock);
3435         p = find_process_by_pid(pid);
3436         if (!p) {
3437                 read_unlock_irq(&tasklist_lock);
3438                 return -ESRCH;
3439         }
3440         retval = sched_setscheduler(p, policy, &lparam);
3441         read_unlock_irq(&tasklist_lock);
3442         return retval;
3443 }
3444
3445 /**
3446  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
3447  * @pid: the pid in question.
3448  * @policy: new policy.
3449  * @param: structure containing the new RT priority.
3450  */
3451 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
3452                                        struct sched_param __user *param)
3453 {
3454         return do_sched_setscheduler(pid, policy, param);
3455 }
3456
3457 /**
3458  * sys_sched_setparam - set/change the RT priority of a thread
3459  * @pid: the pid in question.
3460  * @param: structure containing the new RT priority.
3461  */
3462 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
3463 {
3464         return do_sched_setscheduler(pid, -1, param);
3465 }
3466
3467 /**
3468  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
3469  * @pid: the pid in question.
3470  */
3471 asmlinkage long sys_sched_getscheduler(pid_t pid)
3472 {
3473         int retval = -EINVAL;
3474         task_t *p;
3475
3476         if (pid < 0)
3477                 goto out_nounlock;
3478
3479         retval = -ESRCH;
3480         read_lock(&tasklist_lock);
3481         p = find_process_by_pid(pid);
3482         if (p) {
3483                 retval = security_task_getscheduler(p);
3484                 if (!retval)
3485                         retval = p->policy;
3486         }
3487         read_unlock(&tasklist_lock);
3488
3489 out_nounlock:
3490         return retval;
3491 }
3492
3493 /**
3494  * sys_sched_getparam - get the RT priority of a thread
3495  * @pid: the pid in question.
3496  * @param: structure containing the RT priority.
3497  */
3498 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
3499 {
3500         struct sched_param lp;
3501         int retval = -EINVAL;
3502         task_t *p;
3503
3504         if (!param || pid < 0)
3505                 goto out_nounlock;
3506
3507         read_lock(&tasklist_lock);
3508         p = find_process_by_pid(pid);
3509         retval = -ESRCH;
3510         if (!p)
3511                 goto out_unlock;
3512
3513         retval = security_task_getscheduler(p);
3514         if (retval)
3515                 goto out_unlock;
3516
3517         lp.sched_priority = p->rt_priority;
3518         read_unlock(&tasklist_lock);
3519
3520         /*
3521          * This one might sleep, so we cannot do it with a spinlock held ...
3522          */
3523         retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
3524
3525 out_nounlock:
3526         return retval;
3527
3528 out_unlock:
3529         read_unlock(&tasklist_lock);
3530         return retval;
3531 }
3532
3533 long sched_setaffinity(pid_t pid, cpumask_t new_mask)
3534 {
3535         task_t *p;
3536         int retval;
3537         cpumask_t cpus_allowed;
3538
3539         lock_cpu_hotplug();
3540         read_lock(&tasklist_lock);
3541
3542         p = find_process_by_pid(pid);
3543         if (!p) {
3544                 read_unlock(&tasklist_lock);
3545                 unlock_cpu_hotplug();
3546                 return -ESRCH;
3547         }
3548
3549         /*
3550          * It is not safe to call set_cpus_allowed with the
3551          * tasklist_lock held.  We will bump the task_struct's
3552          * usage count and then drop tasklist_lock.
3553          */
3554         get_task_struct(p);
3555         read_unlock(&tasklist_lock);
3556
3557         retval = -EPERM;
3558         if ((current->euid != p->euid) && (current->euid != p->uid) &&
3559                         !capable(CAP_SYS_NICE))
3560                 goto out_unlock;
3561
3562         cpus_allowed = cpuset_cpus_allowed(p);
3563         cpus_and(new_mask, new_mask, cpus_allowed);
3564         retval = set_cpus_allowed(p, new_mask);
3565
3566 out_unlock:
3567         put_task_struct(p);
3568         unlock_cpu_hotplug();
3569         return retval;
3570 }
3571
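/*
 * Copy a user-supplied cpu mask of 'len' bytes into a kernel cpumask_t.
 * A shorter mask is zero-extended and a longer one is truncated, so e.g.
 * a 4-byte mask from a caller with a smaller cpu set still yields a full
 * cpumask_t with the remaining bits cleared.
 */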
3572 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
3573                              cpumask_t *new_mask)
3574 {
3575         if (len < sizeof(cpumask_t)) {
3576                 memset(new_mask, 0, sizeof(cpumask_t));
3577         } else if (len > sizeof(cpumask_t)) {
3578                 len = sizeof(cpumask_t);
3579         }
3580         return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
3581 }
3582
3583 /**
3584  * sys_sched_setaffinity - set the cpu affinity of a process
3585  * @pid: pid of the process
3586  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3587  * @user_mask_ptr: user-space pointer to the new cpu mask
3588  */
3589 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
3590                                       unsigned long __user *user_mask_ptr)
3591 {
3592         cpumask_t new_mask;
3593         int retval;
3594
3595         retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
3596         if (retval)
3597                 return retval;
3598
3599         return sched_setaffinity(pid, new_mask);
3600 }
3601
3602 /*
3603  * Represents all CPUs present in the system.
3604  * In systems capable of hotplug, this map could dynamically grow
3605  * as new CPUs are detected in the system via any platform-specific
3606  * method, such as ACPI.
3607  */
3608
3609 cpumask_t cpu_present_map;
3610 EXPORT_SYMBOL(cpu_present_map);
3611
3612 #ifndef CONFIG_SMP
3613 cpumask_t cpu_online_map = CPU_MASK_ALL;
3614 cpumask_t cpu_possible_map = CPU_MASK_ALL;
3615 #endif
3616
3617 long sched_getaffinity(pid_t pid, cpumask_t *mask)
3618 {
3619         int retval;
3620         task_t *p;
3621
3622         lock_cpu_hotplug();
3623         read_lock(&tasklist_lock);
3624
3625         retval = -ESRCH;
3626         p = find_process_by_pid(pid);
3627         if (!p)
3628                 goto out_unlock;
3629
3630         retval = 0;
3631         cpus_and(*mask, p->cpus_allowed, cpu_possible_map);
3632
3633 out_unlock:
3634         read_unlock(&tasklist_lock);
3635         unlock_cpu_hotplug();
3636         if (retval)
3637                 return retval;
3638
3639         return 0;
3640 }
3641
3642 /**
3643  * sys_sched_getaffinity - get the cpu affinity of a process
3644  * @pid: pid of the process
3645  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3646  * @user_mask_ptr: user-space pointer to hold the current cpu mask
3647  */
3648 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
3649                                       unsigned long __user *user_mask_ptr)
3650 {
3651         int ret;
3652         cpumask_t mask;
3653
3654         if (len < sizeof(cpumask_t))
3655                 return -EINVAL;
3656
3657         ret = sched_getaffinity(pid, &mask);
3658         if (ret < 0)
3659                 return ret;
3660
3661         if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
3662                 return -EFAULT;
3663
3664         return sizeof(cpumask_t);
3665 }
3666
3667 /**
3668  * sys_sched_yield - yield the current processor to other threads.
3669  *
3670  * this function yields the current CPU by moving the calling thread
3671  * to the expired array. If there are no other threads running on this
3672  * CPU then this function will return.
3673  */
3674 asmlinkage long sys_sched_yield(void)
3675 {
3676         runqueue_t *rq = this_rq_lock();
3677         prio_array_t *array = current->array;
3678         prio_array_t *target = rq->expired;
3679
3680         schedstat_inc(rq, yld_cnt);
3681         /*
3682          * We implement yielding by moving the task into the expired
3683          * queue.
3684          *
3685          * (special rule: RT tasks will just roundrobin in the active
3686          *  array.)
3687          */
3688         if (rt_task(current))
3689                 target = rq->active;
3690
3691         if (current->array->nr_active == 1) {
3692                 schedstat_inc(rq, yld_act_empty);
3693                 if (!rq->expired->nr_active)
3694                         schedstat_inc(rq, yld_both_empty);
3695         } else if (!rq->expired->nr_active)
3696                 schedstat_inc(rq, yld_exp_empty);
3697
3698         if (array != target) {
3699                 dequeue_task(current, array);
3700                 enqueue_task(current, target);
3701         } else
3702                 /*
3703                  * requeue_task is cheaper so perform that if possible.
3704                  */
3705                 requeue_task(current, array);
3706
3707         /*
3708          * Since we are going to call schedule() anyway, there's
3709          * no need to preempt or enable interrupts:
3710          */
3711         __release(rq->lock);
3712         _raw_spin_unlock(&rq->lock);
3713         preempt_enable_no_resched();
3714
3715         schedule();
3716
3717         return 0;
3718 }
3719
3720 static inline void __cond_resched(void)
3721 {
3722         do {
3723                 add_preempt_count(PREEMPT_ACTIVE);
3724                 schedule();
3725                 sub_preempt_count(PREEMPT_ACTIVE);
3726         } while (need_resched());
3727 }
3728
3729 int __sched cond_resched(void)
3730 {
3731         if (need_resched()) {
3732                 __cond_resched();
3733                 return 1;
3734         }
3735         return 0;
3736 }
3737
3738 EXPORT_SYMBOL(cond_resched);
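/*
 * Illustrative only, not part of this file: a hedged sketch of the usual
 * cond_resched() pattern - a long-running loop in process context that
 * offers to reschedule on each iteration.  The my_* names are hypothetical.
 */
#if 0
static void my_process_list(struct list_head *head)
{
        struct list_head *pos;

        list_for_each(pos, head) {
                my_process_item(pos);
                cond_resched();         /* voluntarily give up the CPU if needed */
        }
}
#endif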
3739
3740 /*
3741  * cond_resched_lock() - if a reschedule is pending, drop the given lock,
3742  * call schedule, and on return reacquire the lock.
3743  *
3744  * This works OK both with and without CONFIG_PREEMPT.  We do strange low-level
3745  * operations here to prevent schedule() from being called twice (once via
3746  * spin_unlock(), once by hand).
3747  */
3748 int cond_resched_lock(spinlock_t * lock)
3749 {
3750         int ret = 0;
3751
3752         if (need_lockbreak(lock)) {
3753                 spin_unlock(lock);
3754                 cpu_relax();
3755                 ret = 1;
3756                 spin_lock(lock);
3757         }
3758         if (need_resched()) {
3759                 _raw_spin_unlock(lock);
3760                 preempt_enable_no_resched();
3761                 __cond_resched();
3762                 ret = 1;
3763                 spin_lock(lock);
3764         }
3765         return ret;
3766 }
3767
3768 EXPORT_SYMBOL(cond_resched_lock);
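/*
 * Illustrative only, not part of this file: a hedged sketch of walking a
 * long spinlock-protected list with bounded latency.  Since
 * cond_resched_lock() may drop and retake the lock, the walk restarts
 * whenever it reports that it did.  The my_* names are hypothetical.
 */
#if 0
static void my_walk_list(spinlock_t *my_lock, struct list_head *my_list)
{
        struct list_head *pos;

        spin_lock(my_lock);
restart:
        list_for_each(pos, my_list) {
                my_handle_item(pos);
                if (cond_resched_lock(my_lock))
                        goto restart;   /* lock was dropped, list may have changed */
        }
        spin_unlock(my_lock);
}
#endif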
3769
3770 int __sched cond_resched_softirq(void)
3771 {
3772         BUG_ON(!in_softirq());
3773
3774         if (need_resched()) {
3775                 __local_bh_enable();
3776                 __cond_resched();
3777                 local_bh_disable();
3778                 return 1;
3779         }
3780         return 0;
3781 }
3782
3783 EXPORT_SYMBOL(cond_resched_softirq);
3784
3785
3786 /**
3787  * yield - yield the current processor to other threads.
3788  *
3789  * this is a shortcut for kernel-space yielding - it marks the
3790  * thread runnable and calls sys_sched_yield().
3791  */
3792 void __sched yield(void)
3793 {
3794         set_current_state(TASK_RUNNING);
3795         sys_sched_yield();
3796 }
3797
3798 EXPORT_SYMBOL(yield);
3799
3800 /*
3801  * This task is about to go to sleep on IO.  Increment rq->nr_iowait so
3802  * that process accounting knows that this is a task in IO wait state.
3803  *
3804  * But don't do that if it is a deliberate, throttling IO wait (this task
3805  * has set its backing_dev_info: the queue against which it should throttle)
3806  */
3807 void __sched io_schedule(void)
3808 {
3809         struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
3810
3811         atomic_inc(&rq->nr_iowait);
3812         schedule();
3813         atomic_dec(&rq->nr_iowait);
3814 }
3815
3816 EXPORT_SYMBOL(io_schedule);
3817
3818 long __sched io_schedule_timeout(long timeout)
3819 {
3820         struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
3821         long ret;
3822
3823         atomic_inc(&rq->nr_iowait);
3824         ret = schedule_timeout(timeout);
3825         atomic_dec(&rq->nr_iowait);
3826         return ret;
3827 }
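/*
 * Illustrative only, not part of this file: a hedged sketch of waiting for
 * an I/O event with io_schedule() so the sleep is accounted as iowait.
 * my_io_done() and the wait queue passed in are hypothetical.
 */
#if 0
static void my_wait_for_io(wait_queue_head_t *wq)
{
        DEFINE_WAIT(wait);

        for (;;) {
                prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
                if (my_io_done())
                        break;
                io_schedule();
        }
        finish_wait(wq, &wait);
}
#endif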
3828
3829 /**
3830  * sys_sched_get_priority_max - return maximum RT priority.
3831  * @policy: scheduling class.
3832  *
3833  * this syscall returns the maximum rt_priority that can be used
3834  * by a given scheduling class.
3835  */
3836 asmlinkage long sys_sched_get_priority_max(int policy)
3837 {
3838         int ret = -EINVAL;
3839
3840         switch (policy) {
3841         case SCHED_FIFO:
3842         case SCHED_RR:
3843                 ret = MAX_USER_RT_PRIO-1;
3844                 break;
3845         case SCHED_NORMAL:
3846                 ret = 0;
3847                 break;
3848         }
3849         return ret;
3850 }
3851
3852 /**
3853  * sys_sched_get_priority_min - return minimum RT priority.
3854  * @policy: scheduling class.
3855  *
3856  * this syscall returns the minimum rt_priority that can be used
3857  * by a given scheduling class.
3858  */
3859 asmlinkage long sys_sched_get_priority_min(int policy)
3860 {
3861         int ret = -EINVAL;
3862
3863         switch (policy) {
3864         case SCHED_FIFO:
3865         case SCHED_RR:
3866                 ret = 1;
3867                 break;
3868         case SCHED_NORMAL:
3869                 ret = 0;
3870         }
3871         return ret;
3872 }
3873
3874 /**
3875  * sys_sched_rr_get_interval - return the default timeslice of a process.
3876  * @pid: pid of the process.
3877  * @interval: userspace pointer to the timeslice value.
3878  *
3879  * this syscall writes the default timeslice value of a given process
3880  * into the user-space timespec buffer. A value of '0' means infinity.
3881  */
3882 asmlinkage
3883 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
3884 {
3885         int retval = -EINVAL;
3886         struct timespec t;
3887         task_t *p;
3888
3889         if (pid < 0)
3890                 goto out_nounlock;
3891
3892         retval = -ESRCH;
3893         read_lock(&tasklist_lock);
3894         p = find_process_by_pid(pid);
3895         if (!p)
3896                 goto out_unlock;
3897
3898         retval = security_task_getscheduler(p);
3899         if (retval)
3900                 goto out_unlock;
3901
3902         jiffies_to_timespec(p->policy == SCHED_FIFO ?
3903                                 0 : task_timeslice(p), &t);
3904         read_unlock(&tasklist_lock);
3905         retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
3906 out_nounlock:
3907         return retval;
3908 out_unlock:
3909         read_unlock(&tasklist_lock);
3910         return retval;
3911 }
3912
3913 static inline struct task_struct *eldest_child(struct task_struct *p)
3914 {
3915         if (list_empty(&p->children)) return NULL;
3916         return list_entry(p->children.next,struct task_struct,sibling);
3917 }
3918
3919 static inline struct task_struct *older_sibling(struct task_struct *p)
3920 {
3921         if (p->sibling.prev==&p->parent->children) return NULL;
3922         return list_entry(p->sibling.prev,struct task_struct,sibling);
3923 }
3924
3925 static inline struct task_struct *younger_sibling(struct task_struct *p)
3926 {
3927         if (p->sibling.next==&p->parent->children) return NULL;
3928         return list_entry(p->sibling.next,struct task_struct,sibling);
3929 }
3930
3931 static void show_task(task_t * p)
3932 {
3933         task_t *relative;
3934         unsigned state;
3935         unsigned long free = 0;
3936         static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" };
3937
3938         printk("%-13.13s ", p->comm);
3939         state = p->state ? __ffs(p->state) + 1 : 0;
3940         if (state < ARRAY_SIZE(stat_nam))
3941                 printk(stat_nam[state]);
3942         else
3943                 printk("?");
3944 #if (BITS_PER_LONG == 32)
3945         if (state == TASK_RUNNING)
3946                 printk(" running ");
3947         else
3948                 printk(" %08lX ", thread_saved_pc(p));
3949 #else
3950         if (state == TASK_RUNNING)
3951                 printk("  running task   ");
3952         else
3953                 printk(" %016lx ", thread_saved_pc(p));
3954 #endif
3955 #ifdef CONFIG_DEBUG_STACK_USAGE
3956         {
3957                 unsigned long * n = (unsigned long *) (p->thread_info+1);
3958                 while (!*n)
3959                         n++;
3960                 free = (unsigned long) n - (unsigned long)(p->thread_info+1);
3961         }
3962 #endif
3963         printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
3964         if ((relative = eldest_child(p)))
3965                 printk("%5d ", relative->pid);
3966         else
3967                 printk("      ");
3968         if ((relative = younger_sibling(p)))
3969                 printk("%7d", relative->pid);
3970         else
3971                 printk("       ");
3972         if ((relative = older_sibling(p)))
3973                 printk(" %5d", relative->pid);
3974         else
3975                 printk("      ");
3976         if (!p->mm)
3977                 printk(" (L-TLB)\n");
3978         else
3979                 printk(" (NOTLB)\n");
3980
3981         if (state != TASK_RUNNING)
3982                 show_stack(p, NULL);
3983 }
3984
3985 void show_state(void)
3986 {
3987         task_t *g, *p;
3988
3989 #if (BITS_PER_LONG == 32)
3990         printk("\n"
3991                "                                               sibling\n");
3992         printk("  task             PC      pid father child younger older\n");
3993 #else
3994         printk("\n"
3995                "                                                       sibling\n");
3996         printk("  task                 PC          pid father child younger older\n");
3997 #endif
3998         read_lock(&tasklist_lock);
3999         do_each_thread(g, p) {
4000                 /*
4001                  * reset the NMI-timeout, listing all tasks on a slow
4002                  * console might take a lot of time:
4003                  */
4004                 touch_nmi_watchdog();
4005                 show_task(p);
4006         } while_each_thread(g, p);
4007
4008         read_unlock(&tasklist_lock);
4009 }
4010
4011 void __devinit init_idle(task_t *idle, int cpu)
4012 {
4013         runqueue_t *rq = cpu_rq(cpu);
4014         unsigned long flags;
4015
4016         idle->sleep_avg = 0;
4017         idle->array = NULL;
4018         idle->prio = MAX_PRIO;
4019         idle->state = TASK_RUNNING;
4020         idle->cpus_allowed = cpumask_of_cpu(cpu);
4021         set_task_cpu(idle, cpu);
4022
4023         spin_lock_irqsave(&rq->lock, flags);
4024         rq->curr = rq->idle = idle;
4025         set_tsk_need_resched(idle);
4026         spin_unlock_irqrestore(&rq->lock, flags);
4027
4028         /* Set the preempt count _outside_ the spinlocks! */
4029 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4030         idle->thread_info->preempt_count = (idle->lock_depth >= 0);
4031 #else
4032         idle->thread_info->preempt_count = 0;
4033 #endif
4034 }
4035
4036 /*
4037  * In a system that switches off the HZ timer nohz_cpu_mask
4038  * indicates which cpus entered this state. This is used
4039  * in the rcu update to wait only for active cpus. For systems
4040  * which do not switch off the HZ timer nohz_cpu_mask should
4041  * always be CPU_MASK_NONE.
4042  */
4043 cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4044
4045 #ifdef CONFIG_SMP
4046 /*
4047  * This is how migration works:
4048  *
4049  * 1) we queue a migration_req_t structure in the source CPU's
4050  *    runqueue and wake up that CPU's migration thread.
4051  * 2) we wait_for_completion() on the request => thread blocks.
4052  * 3) migration thread wakes up (implicitly it forces the migrated
4053  *    thread off the CPU)
4054  * 4) it gets the migration request and checks whether the migrated
4055  *    task is still in the wrong runqueue.
4056  * 5) if it's in the wrong runqueue then the migration thread removes
4057  *    it and puts it into the right queue.
4058  * 6) migration thread completes the request (complete(&req->done)).
4059  * 7) we wake up and the migration is done.
4060  */
4061
4062 /*
4063  * Change a given task's CPU affinity. Migrate the thread to a
4064  * proper CPU and schedule it away if the CPU it's executing on
4065  * is removed from the allowed bitmask.
4066  *
4067  * NOTE: the caller must have a valid reference to the task, the
4068  * task must not exit() & deallocate itself prematurely.  The
4069  * call is not atomic; no spinlocks may be held.
4070  */
4071 int set_cpus_allowed(task_t *p, cpumask_t new_mask)
4072 {
4073         unsigned long flags;
4074         int ret = 0;
4075         migration_req_t req;
4076         runqueue_t *rq;
4077
4078         rq = task_rq_lock(p, &flags);
4079         if (!cpus_intersects(new_mask, cpu_online_map)) {
4080                 ret = -EINVAL;
4081                 goto out;
4082         }
4083
4084         p->cpus_allowed = new_mask;
4085         /* Can the task run on the task's current CPU? If so, we're done */
4086         if (cpu_isset(task_cpu(p), new_mask))
4087                 goto out;
4088
4089         if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4090                 /* Need help from migration thread: drop lock and wait. */
4091                 task_rq_unlock(rq, &flags);
4092                 wake_up_process(rq->migration_thread);
4093                 wait_for_completion(&req.done);
4094                 tlb_migrate_finish(p->mm);
4095                 return 0;
4096         }
4097 out:
4098         task_rq_unlock(rq, &flags);
4099         return ret;
4100 }
4101
4102 EXPORT_SYMBOL_GPL(set_cpus_allowed);
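/*
 * Illustrative only, not part of this file: a hedged sketch of pinning a
 * kernel thread to one CPU via the exported API above, which exercises the
 * migration protocol described earlier.  "my_kthread" is hypothetical.
 */
#if 0
static int my_pin_kthread(task_t *my_kthread, int cpu)
{
        /* may sleep waiting for the migration thread; don't hold locks */
        return set_cpus_allowed(my_kthread, cpumask_of_cpu(cpu));
}
#endif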
4103
4104 /*
4105  * Move (not current) task off this cpu, onto dest cpu.  We're doing
4106  * this because either it can't run here any more (set_cpus_allowed()
4107  * away from this CPU, or CPU going down), or because we're
4108  * attempting to rebalance this task on exec (sched_exec).
4109  *
4110  * So we race with normal scheduler movements, but that's OK, as long
4111  * as the task is no longer on this CPU.
4112  */
4113 static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4114 {
4115         runqueue_t *rq_dest, *rq_src;
4116
4117         if (unlikely(cpu_is_offline(dest_cpu)))
4118                 return;
4119
4120         rq_src = cpu_rq(src_cpu);
4121         rq_dest = cpu_rq(dest_cpu);
4122
4123         double_rq_lock(rq_src, rq_dest);
4124         /* Already moved. */
4125         if (task_cpu(p) != src_cpu)
4126                 goto out;
4127         /* Affinity changed (again). */
4128         if (!cpu_isset(dest_cpu, p->cpus_allowed))
4129                 goto out;
4130
4131         set_task_cpu(p, dest_cpu);
4132         if (p->array) {
4133                 /*
4134                  * Sync timestamp with rq_dest's before activating.
4135                  * The same thing could be achieved by doing this step
4136                  * afterwards, and pretending it was a local activate.
4137                  * This way is cleaner and logically correct.
4138                  */
4139                 p->timestamp = p->timestamp - rq_src->timestamp_last_tick
4140                                 + rq_dest->timestamp_last_tick;
4141                 deactivate_task(p, rq_src);
4142                 activate_task(p, rq_dest, 0);
4143                 if (TASK_PREEMPTS_CURR(p, rq_dest))
4144                         resched_task(rq_dest->curr);
4145         }
4146
4147 out:
4148         double_rq_unlock(rq_src, rq_dest);
4149 }
4150
4151 /*
4152  * migration_thread - this is a highprio system thread that performs
4153  * thread migration by bumping thread off CPU then 'pushing' onto
4154  * another runqueue.
4155  */
4156 static int migration_thread(void * data)
4157 {
4158         runqueue_t *rq;
4159         int cpu = (long)data;
4160
4161         rq = cpu_rq(cpu);
4162         BUG_ON(rq->migration_thread != current);
4163
4164         set_current_state(TASK_INTERRUPTIBLE);
4165         while (!kthread_should_stop()) {
4166                 struct list_head *head;
4167                 migration_req_t *req;
4168
4169                 if (current->flags & PF_FREEZE)
4170                         refrigerator(PF_FREEZE);
4171
4172                 spin_lock_irq(&rq->lock);
4173
4174                 if (cpu_is_offline(cpu)) {
4175                         spin_unlock_irq(&rq->lock);
4176                         goto wait_to_die;
4177                 }
4178
4179                 if (rq->active_balance) {
4180                         active_load_balance(rq, cpu);
4181                         rq->active_balance = 0;
4182                 }
4183
4184                 head = &rq->migration_queue;
4185
4186                 if (list_empty(head)) {
4187                         spin_unlock_irq(&rq->lock);
4188                         schedule();
4189                         set_current_state(TASK_INTERRUPTIBLE);
4190                         continue;
4191                 }
4192                 req = list_entry(head->next, migration_req_t, list);
4193                 list_del_init(head->next);
4194
4195                 if (req->type == REQ_MOVE_TASK) {
4196                         spin_unlock(&rq->lock);
4197                         __migrate_task(req->task, cpu, req->dest_cpu);
4198                         local_irq_enable();
4199                 } else if (req->type == REQ_SET_DOMAIN) {
4200                         rq->sd = req->sd;
4201                         spin_unlock_irq(&rq->lock);
4202                 } else {
4203                         spin_unlock_irq(&rq->lock);
4204                         WARN_ON(1);
4205                 }
4206
4207                 complete(&req->done);
4208         }
4209         __set_current_state(TASK_RUNNING);
4210         return 0;
4211
4212 wait_to_die:
4213         /* Wait for kthread_stop */
4214         set_current_state(TASK_INTERRUPTIBLE);
4215         while (!kthread_should_stop()) {
4216                 schedule();
4217                 set_current_state(TASK_INTERRUPTIBLE);
4218         }
4219         __set_current_state(TASK_RUNNING);
4220         return 0;
4221 }
4222
4223 #ifdef CONFIG_HOTPLUG_CPU
4224 /* Figure out where task on dead CPU should go, use force if necessary. */
4225 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
4226 {
4227         int dest_cpu;
4228         cpumask_t mask;
4229
4230         /* On same node? */
4231         mask = node_to_cpumask(cpu_to_node(dead_cpu));
4232         cpus_and(mask, mask, tsk->cpus_allowed);
4233         dest_cpu = any_online_cpu(mask);
4234
4235         /* On any allowed CPU? */
4236         if (dest_cpu == NR_CPUS)
4237                 dest_cpu = any_online_cpu(tsk->cpus_allowed);
4238
4239         /* No more Mr. Nice Guy. */
4240         if (dest_cpu == NR_CPUS) {
4241                 cpus_setall(tsk->cpus_allowed);
4242                 dest_cpu = any_online_cpu(tsk->cpus_allowed);
4243
4244                 /*
4245                  * Don't tell them about moving exiting tasks or
4246                  * kernel threads (both mm NULL), since they never
4247                  * leave kernel.
4248                  * leave the kernel.
4249                 if (tsk->mm && printk_ratelimit())
4250                         printk(KERN_INFO "process %d (%s) no "
4251                                "longer affine to cpu%d\n",
4252                                tsk->pid, tsk->comm, dead_cpu);
4253         }
4254         __migrate_task(tsk, dead_cpu, dest_cpu);
4255 }
4256
4257 /*
4258  * While a dead CPU has no uninterruptible tasks queued at this point,
4259  * it might still have a nonzero ->nr_uninterruptible counter, because
4260  * for performance reasons the counter is not strictly tracking tasks to
4261  * their home CPUs. So we just add the counter to another CPU's counter,
4262  * to keep the global sum constant after CPU-down:
4263  */
4264 static void migrate_nr_uninterruptible(runqueue_t *rq_src)
4265 {
4266         runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
4267         unsigned long flags;
4268
4269         local_irq_save(flags);
4270         double_rq_lock(rq_src, rq_dest);
4271         rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
4272         rq_src->nr_uninterruptible = 0;
4273         double_rq_unlock(rq_src, rq_dest);
4274         local_irq_restore(flags);
4275 }
4276
4277 /* Run through task list and migrate tasks from the dead cpu. */
4278 static void migrate_live_tasks(int src_cpu)
4279 {
4280         struct task_struct *tsk, *t;
4281
4282         write_lock_irq(&tasklist_lock);
4283
4284         do_each_thread(t, tsk) {
4285                 if (tsk == current)
4286                         continue;
4287
4288                 if (task_cpu(tsk) == src_cpu)
4289                         move_task_off_dead_cpu(src_cpu, tsk);
4290         } while_each_thread(t, tsk);
4291
4292         write_unlock_irq(&tasklist_lock);
4293 }
4294
4295 /* Schedules the idle task to be the next runnable task on the current CPU.
4296  * It does so by boosting its priority to the highest possible and adding it
4297  * to the _front_ of the runqueue. Used by CPU offline code.
4298  */
4299 void sched_idle_next(void)
4300 {
4301         int cpu = smp_processor_id();
4302         runqueue_t *rq = this_rq();
4303         struct task_struct *p = rq->idle;
4304         unsigned long flags;
4305
4306         /* cpu has to be offline */
4307         BUG_ON(cpu_online(cpu));
4308
4309         /* Strictly not necessary since the rest of the CPUs are stopped by now
4310          * and interrupts are disabled on the current cpu.
4311          */
4312         spin_lock_irqsave(&rq->lock, flags);
4313
4314         __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
4315         /* Add idle task to _front_ of its priority queue */
4316         __activate_idle_task(p, rq);
4317
4318         spin_unlock_irqrestore(&rq->lock, flags);
4319 }
4320
4321 /* Ensures that the idle task is using init_mm right before its cpu goes
4322  * offline.
4323  */
4324 void idle_task_exit(void)
4325 {
4326         struct mm_struct *mm = current->active_mm;
4327
4328         BUG_ON(cpu_online(smp_processor_id()));
4329
4330         if (mm != &init_mm)
4331                 switch_mm(mm, &init_mm, current);
4332         mmdrop(mm);
4333 }
4334
4335 static void migrate_dead(unsigned int dead_cpu, task_t *tsk)
4336 {
4337         struct runqueue *rq = cpu_rq(dead_cpu);
4338
4339         /* Must be exiting, otherwise would be on tasklist. */
4340         BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD);
4341
4342         /* Cannot have done final schedule yet: would have vanished. */
4343         BUG_ON(tsk->flags & PF_DEAD);
4344
4345         get_task_struct(tsk);
4346
4347         /*
4348          * Drop lock around migration; if someone else moves it,
4349          * that's OK.  No task can be added to this CPU, so iteration is
4350          * fine.
4351          */
4352         spin_unlock_irq(&rq->lock);
4353         move_task_off_dead_cpu(dead_cpu, tsk);
4354         spin_lock_irq(&rq->lock);
4355
4356         put_task_struct(tsk);
4357 }
4358
4359 /* release_task() removes task from tasklist, so we won't find dead tasks. */
4360 static void migrate_dead_tasks(unsigned int dead_cpu)
4361 {
4362         unsigned arr, i;
4363         struct runqueue *rq = cpu_rq(dead_cpu);
4364
4365         for (arr = 0; arr < 2; arr++) {
4366                 for (i = 0; i < MAX_PRIO; i++) {
4367                         struct list_head *list = &rq->arrays[arr].queue[i];
4368                         while (!list_empty(list))
4369                                 migrate_dead(dead_cpu,
4370                                              list_entry(list->next, task_t,
4371                                                         run_list));
4372                 }
4373         }
4374 }
4375 #endif /* CONFIG_HOTPLUG_CPU */
4376
4377 /*
4378  * migration_call - callback that gets triggered when a CPU is added.
4379  * Here we can start up the necessary migration thread for the new CPU.
4380  */
4381 static int migration_call(struct notifier_block *nfb, unsigned long action,
4382                           void *hcpu)
4383 {
4384         int cpu = (long)hcpu;
4385         struct task_struct *p;
4386         struct runqueue *rq;
4387         unsigned long flags;
4388
4389         switch (action) {
4390         case CPU_UP_PREPARE:
4391                 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
4392                 if (IS_ERR(p))
4393                         return NOTIFY_BAD;
4394                 p->flags |= PF_NOFREEZE;
4395                 kthread_bind(p, cpu);
4396                 /* Must be high prio: stop_machine expects to yield to it. */
4397                 rq = task_rq_lock(p, &flags);
4398                 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
4399                 task_rq_unlock(rq, &flags);
4400                 cpu_rq(cpu)->migration_thread = p;
4401                 break;
4402         case CPU_ONLINE:
4403                 /* Strictly unnecessary, as first user will wake it. */
4404                 wake_up_process(cpu_rq(cpu)->migration_thread);
4405                 break;
4406 #ifdef CONFIG_HOTPLUG_CPU
4407         case CPU_UP_CANCELED:
4408                 /* Unbind it from offline cpu so it can run.  Fall thru. */
4409                 kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
4410                 kthread_stop(cpu_rq(cpu)->migration_thread);
4411                 cpu_rq(cpu)->migration_thread = NULL;
4412                 break;
4413         case CPU_DEAD:
4414                 migrate_live_tasks(cpu);
4415                 rq = cpu_rq(cpu);
4416                 kthread_stop(rq->migration_thread);
4417                 rq->migration_thread = NULL;
4418                 /* Idle task back to normal (off runqueue, low prio) */
4419                 rq = task_rq_lock(rq->idle, &flags);
4420                 deactivate_task(rq->idle, rq);
4421                 rq->idle->static_prio = MAX_PRIO;
4422                 __setscheduler(rq->idle, SCHED_NORMAL, 0);
4423                 migrate_dead_tasks(cpu);
4424                 task_rq_unlock(rq, &flags);
4425                 migrate_nr_uninterruptible(rq);
4426                 BUG_ON(rq->nr_running != 0);
4427
4428                 /* No need to migrate the tasks: it was best-effort if
4429                  * they didn't do lock_cpu_hotplug().  Just wake up
4430                  * the requestors. */
4431                 spin_lock_irq(&rq->lock);
4432                 while (!list_empty(&rq->migration_queue)) {
4433                         migration_req_t *req;
4434                         req = list_entry(rq->migration_queue.next,
4435                                          migration_req_t, list);
4436                         BUG_ON(req->type != REQ_MOVE_TASK);
4437                         list_del_init(&req->list);
4438                         complete(&req->done);
4439                 }
4440                 spin_unlock_irq(&rq->lock);
4441                 break;
4442 #endif
4443         }
4444         return NOTIFY_OK;
4445 }
4446
4447 /* Register at highest priority so that task migration (migrate_live_tasks)
4448  * happens before everything else.
4449  */
4450 static struct notifier_block __devinitdata migration_notifier = {
4451         .notifier_call = migration_call,
4452         .priority = 10
4453 };
4454
4455 int __init migration_init(void)
4456 {
4457         void *cpu = (void *)(long)smp_processor_id();
4458         /* Start one for boot CPU. */
4459         migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
4460         migration_call(&migration_notifier, CPU_ONLINE, cpu);
4461         register_cpu_notifier(&migration_notifier);
4462         return 0;
4463 }
4464 #endif
4465
4466 #ifdef CONFIG_SMP
4467 #define SCHED_DOMAIN_DEBUG
4468 #ifdef SCHED_DOMAIN_DEBUG
4469 static void sched_domain_debug(struct sched_domain *sd, int cpu)
4470 {
4471         int level = 0;
4472
4473         printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
4474
4475         do {
4476                 int i;
4477                 char str[NR_CPUS];
4478                 struct sched_group *group = sd->groups;
4479                 cpumask_t groupmask;
4480
4481                 cpumask_scnprintf(str, NR_CPUS, sd->span);
4482                 cpus_clear(groupmask);
4483
4484                 printk(KERN_DEBUG);
4485                 for (i = 0; i < level + 1; i++)
4486                         printk(" ");
4487                 printk("domain %d: ", level);
4488
4489                 if (!(sd->flags & SD_LOAD_BALANCE)) {
4490                         printk("does not load-balance\n");
4491                         if (sd->parent)
4492                                 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent\n");
4493                         break;
4494                 }
4495
4496                 printk("span %s\n", str);
4497
4498                 if (!cpu_isset(cpu, sd->span))
4499                         printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
4500                 if (!cpu_isset(cpu, group->cpumask))
4501                         printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
4502
4503                 printk(KERN_DEBUG);
4504                 for (i = 0; i < level + 2; i++)
4505                         printk(" ");
4506                 printk("groups:");
4507                 do {
4508                         if (!group) {
4509                                 printk("\n");
4510                                 printk(KERN_ERR "ERROR: group is NULL\n");
4511                                 break;
4512                         }
4513
4514                         if (!group->cpu_power) {
4515                                 printk("\n");
4516                                 printk(KERN_ERR "ERROR: domain->cpu_power not set\n");
4517                         }
4518
4519                         if (!cpus_weight(group->cpumask)) {
4520                                 printk("\n");
4521                                 printk(KERN_ERR "ERROR: empty group\n");
4522                         }
4523
4524                         if (cpus_intersects(groupmask, group->cpumask)) {
4525                                 printk("\n");
4526                                 printk(KERN_ERR "ERROR: repeated CPUs\n");
4527                         }
4528
4529                         cpus_or(groupmask, groupmask, group->cpumask);
4530
4531                         cpumask_scnprintf(str, NR_CPUS, group->cpumask);
4532                         printk(" %s", str);
4533
4534                         group = group->next;
4535                 } while (group != sd->groups);
4536                 printk("\n");
4537
4538                 if (!cpus_equal(sd->span, groupmask))
4539                         printk(KERN_ERR "ERROR: groups don't span domain->span\n");
4540
4541                 level++;
4542                 sd = sd->parent;
4543
4544                 if (sd) {
4545                         if (!cpus_subset(groupmask, sd->span))
4546                                 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
4547                 }
4548
4549         } while (sd);
4550 }
4551 #else
4552 #define sched_domain_debug(sd, cpu) {}
4553 #endif
4554
4555 /*
4556  * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
4557  * hold the hotplug lock.
4558  */
4559 void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu)
4560 {
4561         migration_req_t req;
4562         unsigned long flags;
4563         runqueue_t *rq = cpu_rq(cpu);
4564         int local = 1;
4565
4566         sched_domain_debug(sd, cpu);
4567
4568         spin_lock_irqsave(&rq->lock, flags);
4569
4570         if (cpu == smp_processor_id() || !cpu_online(cpu)) {
4571                 rq->sd = sd;
4572         } else {
4573                 init_completion(&req.done);
4574                 req.type = REQ_SET_DOMAIN;
4575                 req.sd = sd;
4576                 list_add(&req.list, &rq->migration_queue);
4577                 local = 0;
4578         }
4579
4580         spin_unlock_irqrestore(&rq->lock, flags);
4581
4582         if (!local) {
4583                 wake_up_process(rq->migration_thread);
4584                 wait_for_completion(&req.done);
4585         }
4586 }
4587
4588 /* cpus with isolated domains */
4589 cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
4590
4591 /* Setup the mask of cpus configured for isolated domains */
4592 static int __init isolated_cpu_setup(char *str)
4593 {
4594         int ints[NR_CPUS], i;
4595
4596         str = get_options(str, ARRAY_SIZE(ints), ints);
4597         cpus_clear(cpu_isolated_map);
4598         for (i = 1; i <= ints[0]; i++)
4599                 if (ints[i] < NR_CPUS)
4600                         cpu_set(ints[i], cpu_isolated_map);
4601         return 1;
4602 }
4603
4604 __setup ("isolcpus=", isolated_cpu_setup);
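/*
 * Illustrative only: booting with e.g. "isolcpus=2,3" (a hypothetical
 * example list) keeps CPUs 2 and 3 on the dummy domain, so work only lands
 * on them through an explicit affinity call such as
 * sched_setaffinity()/set_cpus_allowed().
 */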
4605
4606 /*
4607  * init_sched_build_groups takes an array of groups, the cpumask we wish
4608  * to span, and a pointer to a function which identifies what group a CPU
4609  * belongs to. The return value of group_fn must be a valid index into the
4610  * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we
4611  * keep track of groups covered with a cpumask_t).
4612  *
4613  * init_sched_build_groups will build a circular linked list of the groups
4614  * covered by the given span, and will set each group's ->cpumask correctly,
4615  * and ->cpu_power to 0.
4616  */
4617 void __devinit init_sched_build_groups(struct sched_group groups[],
4618                         cpumask_t span, int (*group_fn)(int cpu))
4619 {
4620         struct sched_group *first = NULL, *last = NULL;
4621         cpumask_t covered = CPU_MASK_NONE;
4622         int i;
4623
4624         for_each_cpu_mask(i, span) {
4625                 int group = group_fn(i);
4626                 struct sched_group *sg = &groups[group];
4627                 int j;
4628
4629                 if (cpu_isset(i, covered))
4630                         continue;
4631
4632                 sg->cpumask = CPU_MASK_NONE;
4633                 sg->cpu_power = 0;
4634
4635                 for_each_cpu_mask(j, span) {
4636                         if (group_fn(j) != group)
4637                                 continue;
4638
4639                         cpu_set(j, covered);
4640                         cpu_set(j, sg->cpumask);
4641                 }
4642                 if (!first)
4643                         first = sg;
4644                 if (last)
4645                         last->next = sg;
4646                 last = sg;
4647         }
4648         last->next = first;
4649 }
4650
4651
4652 #ifdef ARCH_HAS_SCHED_DOMAIN
4653 extern void __devinit arch_init_sched_domains(void);
4654 extern void __devinit arch_destroy_sched_domains(void);
4655 #else
4656 #ifdef CONFIG_SCHED_SMT
4657 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
4658 static struct sched_group sched_group_cpus[NR_CPUS];
4659 static int __devinit cpu_to_cpu_group(int cpu)
4660 {
4661         return cpu;
4662 }
4663 #endif
4664
4665 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
4666 static struct sched_group sched_group_phys[NR_CPUS];
4667 static int __devinit cpu_to_phys_group(int cpu)
4668 {
4669 #ifdef CONFIG_SCHED_SMT
4670         return first_cpu(cpu_sibling_map[cpu]);
4671 #else
4672         return cpu;
4673 #endif
4674 }
4675
4676 #ifdef CONFIG_NUMA
4677
4678 static DEFINE_PER_CPU(struct sched_domain, node_domains);
4679 static struct sched_group sched_group_nodes[MAX_NUMNODES];
4680 static int __devinit cpu_to_node_group(int cpu)
4681 {
4682         return cpu_to_node(cpu);
4683 }
4684 #endif
4685
4686 #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
4687 /*
4688  * The domains setup code relies on siblings not spanning
4689  * multiple nodes. Make sure the architecture has a proper
4690  * siblings map:
4691  */
4692 static void check_sibling_maps(void)
4693 {
4694         int i, j;
4695
4696         for_each_online_cpu(i) {
4697                 for_each_cpu_mask(j, cpu_sibling_map[i]) {
4698                         if (cpu_to_node(i) != cpu_to_node(j)) {
4699                                 printk(KERN_INFO "warning: CPU %d siblings map "
4700                                         "to a different node - isolating "
4701                                         "them.\n", i);
4702                                 cpu_sibling_map[i] = cpumask_of_cpu(i);
4703                                 break;
4704                         }
4705                 }
4706         }
4707 }
4708 #endif
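/*
 * Worked example of the check above, for illustration only: if
 * cpu_sibling_map[4] = { 4, 5 } but cpu_to_node(4) == 0 while
 * cpu_to_node(5) == 1, the warning fires and cpu_sibling_map[4] is
 * collapsed to { 4 }, so the SMT domains built below never span two nodes.
 */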
4709
4710 /*
4711  * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
4712  */
4713 static void __devinit arch_init_sched_domains(void)
4714 {
4715         int i;
4716         cpumask_t cpu_default_map;
4717
4718 #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
4719         check_sibling_maps();
4720 #endif
4721         /*
4722          * Set up the mask of cpus without special-case scheduling requirements.
4723          * For now this just excludes isolated cpus, but could be used to
4724          * exclude other special cases in the future.
4725          */
4726         cpus_complement(cpu_default_map, cpu_isolated_map);
4727         cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
4728
4729         /*
4730          * Set up domains. Isolated domains just stay on the dummy domain.
4731          */
4732         for_each_cpu_mask(i, cpu_default_map) {
4733                 int group;
4734                 struct sched_domain *sd = NULL, *p;
4735                 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
4736
4737                 cpus_and(nodemask, nodemask, cpu_default_map);
4738
4739 #ifdef CONFIG_NUMA
4740                 sd = &per_cpu(node_domains, i);
4741                 group = cpu_to_node_group(i);
4742                 *sd = SD_NODE_INIT;
4743                 sd->span = cpu_default_map;
4744                 sd->groups = &sched_group_nodes[group];
4745 #endif
4746
4747                 p = sd;
4748                 sd = &per_cpu(phys_domains, i);
4749                 group = cpu_to_phys_group(i);
4750                 *sd = SD_CPU_INIT;
4751                 sd->span = nodemask;
4752                 sd->parent = p;
4753                 sd->groups = &sched_group_phys[group];
4754
4755 #ifdef CONFIG_SCHED_SMT
4756                 p = sd;
4757                 sd = &per_cpu(cpu_domains, i);
4758                 group = cpu_to_cpu_group(i);
4759                 *sd = SD_SIBLING_INIT;
4760                 sd->span = cpu_sibling_map[i];
4761                 cpus_and(sd->span, sd->span, cpu_default_map);
4762                 sd->parent = p;
4763                 sd->groups = &sched_group_cpus[group];
4764 #endif
4765         }
4766
4767 #ifdef CONFIG_SCHED_SMT
4768         /* Set up CPU (sibling) groups */
4769         for_each_online_cpu(i) {
4770                 cpumask_t this_sibling_map = cpu_sibling_map[i];
4771                 cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
4772                 if (i != first_cpu(this_sibling_map))
4773                         continue;
4774
4775                 init_sched_build_groups(sched_group_cpus, this_sibling_map,
4776                                                 &cpu_to_cpu_group);
4777         }
4778 #endif
4779
4780         /* Set up physical groups */
4781         for (i = 0; i < MAX_NUMNODES; i++) {
4782                 cpumask_t nodemask = node_to_cpumask(i);
4783
4784                 cpus_and(nodemask, nodemask, cpu_default_map);
4785                 if (cpus_empty(nodemask))
4786                         continue;
4787
4788                 init_sched_build_groups(sched_group_phys, nodemask,
4789                                                 &cpu_to_phys_group);
4790         }
4791
4792 #ifdef CONFIG_NUMA
4793         /* Set up node groups */
4794         init_sched_build_groups(sched_group_nodes, cpu_default_map,
4795                                         &cpu_to_node_group);
4796 #endif
4797
4798         /* Calculate CPU power for physical packages and nodes */
4799         for_each_cpu_mask(i, cpu_default_map) {
4800                 int power;
4801                 struct sched_domain *sd;
4802 #ifdef CONFIG_SCHED_SMT
4803                 sd = &per_cpu(cpu_domains, i);
4804                 power = SCHED_LOAD_SCALE;
4805                 sd->groups->cpu_power = power;
4806 #endif
4807
4808                 sd = &per_cpu(phys_domains, i);
4809                 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
4810                                 (cpus_weight(sd->groups->cpumask)-1) / 10;
4811                 sd->groups->cpu_power = power;
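		/*
		 * Worked example, for illustration only: assuming
		 * SCHED_LOAD_SCALE is 128, a one-CPU package gets power 128,
		 * while a two-sibling package gets 128 + 128 * (2 - 1) / 10
		 * = 140, i.e. two SMT siblings are rated only slightly above
		 * a single full CPU.
		 */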
4812
4813 #ifdef CONFIG_NUMA
4814                 if (i == first_cpu(sd->groups->cpumask)) {
4815                         /* Only add "power" once for each physical package. */
4816                         sd = &per_cpu(node_domains, i);
4817                         sd->groups->cpu_power += power;
4818                 }
4819 #endif
4820         }
4821
4822         /* Attach the domains */
4823         for_each_online_cpu(i) {
4824                 struct sched_domain *sd;
4825 #ifdef CONFIG_SCHED_SMT
4826                 sd = &per_cpu(cpu_domains, i);
4827 #else
4828                 sd = &per_cpu(phys_domains, i);
4829 #endif
4830                 cpu_attach_domain(sd, i);
4831         }
4832 }
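/*
 * Illustrative summary, for exposition only: on a hypothetical two-node box
 * with hyperthreaded packages, the loops above leave CPU 0 with
 *
 *	per_cpu(cpu_domains, 0)     (span: CPU 0 and its sibling)
 *	  parent: per_cpu(phys_domains, 0)   (span: node 0's cpus)
 *	    parent: per_cpu(node_domains, 0) (span: all non-isolated cpus)
 *
 * and cpu_attach_domain() is called with the lowest level, so the balancing
 * code reaches the wider levels by walking sd->parent.
 */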
4833
4834 #ifdef CONFIG_HOTPLUG_CPU
4835 static void __devinit arch_destroy_sched_domains(void)
4836 {
4837         /* Do nothing: everything is statically allocated. */
4838 }
4839 #endif
4840
4841 #endif /* ARCH_HAS_SCHED_DOMAIN */
4842
4843 /*
4844  * Initial dummy domain for early boot and for hotplug cpu. Being static,
4845  * it is initialized to zero, so all balancing flags are cleared, which is
4846  * what we want.
4847  */
4848 static struct sched_domain sched_domain_dummy;
4849
4850 #ifdef CONFIG_HOTPLUG_CPU
4851 /*
4852  * Force a reinitialization of the sched domains hierarchy.  The domains
4853  * and groups cannot be updated in place without racing with the balancing
4854  * code, so we temporarily attach all running cpus to a "dummy" domain
4855  * which will prevent rebalancing while the sched domains are recalculated.
4856  */
4857 static int update_sched_domains(struct notifier_block *nfb,
4858                                 unsigned long action, void *hcpu)
4859 {
4860         int i;
4861
4862         switch (action) {
4863         case CPU_UP_PREPARE:
4864         case CPU_DOWN_PREPARE:
4865                 for_each_online_cpu(i)
4866                         cpu_attach_domain(&sched_domain_dummy, i);
4867                 arch_destroy_sched_domains();
4868                 return NOTIFY_OK;
4869
4870         case CPU_UP_CANCELED:
4871         case CPU_DOWN_FAILED:
4872         case CPU_ONLINE:
4873         case CPU_DEAD:
4874                 /*
4875                  * Fall through and re-initialise the domains.
4876                  */
4877                 break;
4878         default:
4879                 return NOTIFY_DONE;
4880         }
4881
4882         /* The hotplug lock is already held by cpu_up/cpu_down */
4883         arch_init_sched_domains();
4884
4885         return NOTIFY_OK;
4886 }
4887 #endif
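/*
 * Sequence sketch, for exposition only: on CPU removal the notifier above
 * runs twice.  CPU_DOWN_PREPARE parks every online runqueue on
 * sched_domain_dummy (no balancing flags, so rebalancing stops) and calls
 * arch_destroy_sched_domains(); once the CPU is gone, CPU_DEAD falls
 * through to arch_init_sched_domains(), which rebuilds the domains and
 * groups for the remaining CPUs and reattaches them via cpu_attach_domain().
 */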
4888
4889 void __init sched_init_smp(void)
4890 {
4891         lock_cpu_hotplug();
4892         arch_init_sched_domains();
4893         unlock_cpu_hotplug();
4894         /* XXX: Theoretical race here - CPU may be hotplugged now */
4895         hotcpu_notifier(update_sched_domains, 0);
4896 }
4897 #else
4898 void __init sched_init_smp(void)
4899 {
4900 }
4901 #endif /* CONFIG_SMP */
4902
4903 int in_sched_functions(unsigned long addr)
4904 {
4905         /* Linker adds these: start and end of __sched functions */
4906         extern char __sched_text_start[], __sched_text_end[];
4907         return in_lock_functions(addr) ||
4908                 (addr >= (unsigned long)__sched_text_start
4909                 && addr < (unsigned long)__sched_text_end);
4910 }
4911
4912 void __init sched_init(void)
4913 {
4914         runqueue_t *rq;
4915         int i, j, k;
4916
4917         for (i = 0; i < NR_CPUS; i++) {
4918                 prio_array_t *array;
4919
4920                 rq = cpu_rq(i);
4921                 spin_lock_init(&rq->lock);
4922                 rq->nr_running = 0;
4923                 rq->active = rq->arrays;
4924                 rq->expired = rq->arrays + 1;
4925                 rq->best_expired_prio = MAX_PRIO;
4926
4927 #ifdef CONFIG_SMP
4928                 rq->sd = &sched_domain_dummy;
4929                 for (j = 1; j < 3; j++)
4930                         rq->cpu_load[j] = 0;
4931                 rq->active_balance = 0;
4932                 rq->push_cpu = 0;
4933                 rq->migration_thread = NULL;
4934                 INIT_LIST_HEAD(&rq->migration_queue);
4935 #endif
4936                 atomic_set(&rq->nr_iowait, 0);
4937
4938                 for (j = 0; j < 2; j++) {
4939                         array = rq->arrays + j;
4940                         for (k = 0; k < MAX_PRIO; k++) {
4941                                 INIT_LIST_HEAD(array->queue + k);
4942                                 __clear_bit(k, array->bitmap);
4943                         }
4944                         /* delimiter for bitsearch: sentinel bit bounding the scan */
4945                         __set_bit(MAX_PRIO, array->bitmap);
4946                 }
4947         }
4948
4949         /*
4950          * The boot idle thread does lazy MMU switching as well:
4951          */
4952         atomic_inc(&init_mm.mm_count);
4953         enter_lazy_tlb(&init_mm, current);
4954
4955         /*
4956          * Make us the idle thread. Technically, schedule() should not be
4957          * called from this thread; however, somewhere below it might be.
4958          * Because we are the idle thread, we simply pick up running again
4959          * when this runqueue becomes "idle".
4960          */
4961         init_idle(current, smp_processor_id());
4962 }
4963
4964 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4965 void __might_sleep(char *file, int line)
4966 {
4967 #if defined(in_atomic)
4968         static unsigned long prev_jiffy;        /* ratelimiting */
4969
4970         if ((in_atomic() || irqs_disabled()) &&
4971             system_state == SYSTEM_RUNNING && !oops_in_progress) {
4972                 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
4973                         return;
4974                 prev_jiffy = jiffies;
4975                 printk(KERN_ERR "Debug: sleeping function called from invalid"
4976                                 " context at %s:%d\n", file, line);
4977                 printk("in_atomic():%d, irqs_disabled():%d\n",
4978                         in_atomic(), irqs_disabled());
4979                 dump_stack();
4980         }
4981 #endif
4982 }
4983 EXPORT_SYMBOL(__might_sleep);
4984 #endif
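/*
 * Usage sketch, for illustration only (my_alloc_helper is hypothetical):
 * __might_sleep() is normally reached via the might_sleep() macro in
 * <linux/kernel.h>, which passes __FILE__ and __LINE__:
 *
 *	void my_alloc_helper(void)
 *	{
 *		might_sleep();	// complains if a spinlock is held
 *				// or local IRQs are disabled
 *		...
 *	}
 *
 * The prev_jiffy check above rate-limits the report to at most one message
 * per second.
 */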
4985
4986 #ifdef CONFIG_MAGIC_SYSRQ
4987 void normalize_rt_tasks(void)
4988 {
4989         struct task_struct *p;
4990         prio_array_t *array;
4991         unsigned long flags;
4992         runqueue_t *rq;
4993
4994         read_lock_irq(&tasklist_lock);
4995         for_each_process(p) {
4996                 if (!rt_task(p))
4997                         continue;
4998
4999                 rq = task_rq_lock(p, &flags);
5000
5001                 array = p->array;
5002                 if (array)
5003                         deactivate_task(p, task_rq(p));
5004                 __setscheduler(p, SCHED_NORMAL, 0);
5005                 if (array) {
5006                         __activate_task(p, task_rq(p));
5007                         resched_task(rq->curr);
5008                 }
5009
5010                 task_rq_unlock(rq, &flags);
5011         }
5012         read_unlock_irq(&tasklist_lock);
5013 }
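/*
 * This is hooked up to a magic SysRq key handler in drivers/char/sysrq.c,
 * letting an administrator demote runaway SCHED_FIFO/SCHED_RR tasks back to
 * SCHED_NORMAL from the console.
 */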
5014
5015 #endif /* CONFIG_MAGIC_SYSRQ */