[PATCH] sched: uninline task_timeslice
1 /*
2  *  kernel/sched.c
3  *
4  *  Kernel scheduler and related syscalls
5  *
6  *  Copyright (C) 1991-2002  Linus Torvalds
7  *
8  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
9  *              make semaphores SMP safe
10  *  1998-11-19  Implemented schedule_timeout() and related stuff
11  *              by Andrea Arcangeli
12  *  2002-01-04  New ultra-scalable O(1) scheduler by Ingo Molnar:
13  *              hybrid priority-list and round-robin design with
14  *              an array-switch method of distributing timeslices
15  *              and per-CPU runqueues.  Cleanups and useful suggestions
16  *              by Davide Libenzi, preemptible kernel bits by Robert Love.
17  *  2003-09-03  Interactivity tuning by Con Kolivas.
18  *  2004-04-02  Scheduler domains code by Nick Piggin
19  */
20
21 #include <linux/mm.h>
22 #include <linux/module.h>
23 #include <linux/nmi.h>
24 #include <linux/init.h>
25 #include <asm/uaccess.h>
26 #include <linux/highmem.h>
27 #include <linux/smp_lock.h>
28 #include <asm/mmu_context.h>
29 #include <linux/interrupt.h>
30 #include <linux/completion.h>
31 #include <linux/kernel_stat.h>
32 #include <linux/security.h>
33 #include <linux/notifier.h>
34 #include <linux/profile.h>
35 #include <linux/suspend.h>
36 #include <linux/blkdev.h>
37 #include <linux/delay.h>
38 #include <linux/smp.h>
39 #include <linux/threads.h>
40 #include <linux/timer.h>
41 #include <linux/rcupdate.h>
42 #include <linux/cpu.h>
43 #include <linux/cpuset.h>
44 #include <linux/percpu.h>
45 #include <linux/kthread.h>
46 #include <linux/seq_file.h>
47 #include <linux/syscalls.h>
48 #include <linux/times.h>
49 #include <linux/acct.h>
50 #include <asm/tlb.h>
51
52 #include <asm/unistd.h>
53
54 /*
55  * Convert user-nice values [ -20 ... 0 ... 19 ]
56  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
57  * and back.
58  */
59 #define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
60 #define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)
61 #define TASK_NICE(p)            PRIO_TO_NICE((p)->static_prio)
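/*
 * Illustrative example (assuming the usual MAX_RT_PRIO == 100 and
 * MAX_PRIO == 140 from <linux/sched.h>):
 *
 *   NICE_TO_PRIO(-20) == 100, NICE_TO_PRIO(0) == 120, NICE_TO_PRIO(19) == 139
 *   PRIO_TO_NICE(120) == 0
 *
 * i.e. the 40 nice levels map onto the static-priority range directly
 * above the real-time priorities.
 */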
62
63 /*
64  * 'User priority' is the nice value converted to something we
65  * can work with better when scaling various scheduler parameters,
66  * it's a [ 0 ... 39 ] range.
67  */
68 #define USER_PRIO(p)            ((p)-MAX_RT_PRIO)
69 #define TASK_USER_PRIO(p)       USER_PRIO((p)->static_prio)
70 #define MAX_USER_PRIO           (USER_PRIO(MAX_PRIO))
71
72 /*
73  * Some helpers for converting nanosecond timing to jiffy resolution
74  */
75 #define NS_TO_JIFFIES(TIME)     ((TIME) / (1000000000 / HZ))
76 #define JIFFIES_TO_NS(TIME)     ((TIME) * (1000000000 / HZ))
77
78 /*
79  * These are the 'tuning knobs' of the scheduler:
80  *
81  * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
82  * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
83  * Timeslices get refilled after they expire.
84  */
85 #define MIN_TIMESLICE           max(5 * HZ / 1000, 1)
86 #define DEF_TIMESLICE           (100 * HZ / 1000)
87 #define ON_RUNQUEUE_WEIGHT       30
88 #define CHILD_PENALTY            95
89 #define PARENT_PENALTY          100
90 #define EXIT_WEIGHT               3
91 #define PRIO_BONUS_RATIO         25
92 #define MAX_BONUS               (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
93 #define INTERACTIVE_DELTA         2
94 #define MAX_SLEEP_AVG           (DEF_TIMESLICE * MAX_BONUS)
95 #define STARVATION_LIMIT        (MAX_SLEEP_AVG)
96 #define NS_MAX_SLEEP_AVG        (JIFFIES_TO_NS(MAX_SLEEP_AVG))
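/*
 * Illustrative values, assuming HZ == 1000 (one jiffy == 1ms):
 *
 *   MIN_TIMESLICE    = 5 jiffies   (5ms)
 *   DEF_TIMESLICE    = 100 jiffies (100ms)
 *   MAX_BONUS        = 40 * 25 / 100 = 10
 *   MAX_SLEEP_AVG    = 100 * 10 = 1000 jiffies (1s)
 *   NS_MAX_SLEEP_AVG = 1,000,000,000 ns
 */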
97
98 /*
99  * If a task is 'interactive' then we reinsert it in the active
100  * array after it has expired its current timeslice. (it will not
101  * continue to run immediately, it will still round-robin with
102  * other interactive tasks.)
103  *
104  * This part scales the interactivity limit depending on niceness.
105  *
106  * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
107  * Here are a few examples of different nice levels:
108  *
109  *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
110  *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
111  *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
112  *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
113  *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
114  *
115  * (the X axis represents the possible -5 ... 0 ... +5 dynamic
116  *  priority range a task can explore, a value of '1' means the
117  *  task is rated interactive.)
118  *
119  * I.e. nice +19 tasks can never get 'interactive' enough to be
120  * reinserted into the active array. And only heavily CPU-hog nice -20
121  * tasks will be expired. Default nice 0 tasks are somewhere between,
122  * it takes some effort for them to get interactive, but it's not
123  * too hard.
124  */
125
126 #define CURRENT_BONUS(p) \
127         (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
128                 MAX_SLEEP_AVG)
129
130 #define GRANULARITY     (10 * HZ / 1000 ? : 1)
131
132 #ifdef CONFIG_SMP
133 #define TIMESLICE_GRANULARITY(p)        (GRANULARITY * \
134                 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
135                         num_online_cpus())
136 #else
137 #define TIMESLICE_GRANULARITY(p)        (GRANULARITY * \
138                 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
139 #endif
140
141 #define SCALE(v1,v1_max,v2_max) \
142         (v1) * (v2_max) / (v1_max)
143
144 #define DELTA(p) \
145         (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA)
146
147 #define TASK_INTERACTIVE(p) \
148         ((p)->prio <= (p)->static_prio - DELTA(p))
149
150 #define INTERACTIVE_SLEEP(p) \
151         (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
152                 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
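/*
 * Worked example (using MAX_BONUS == 10 from the sketch above): a nice-0
 * task has TASK_NICE(p) == 0, so
 *
 *   DELTA(p) = SCALE(0, 40, 10) + INTERACTIVE_DELTA = 0 + 2 = 2
 *
 * and TASK_INTERACTIVE(p) holds once p->prio <= p->static_prio - 2, i.e.
 * once the task has earned a dynamic-priority bonus of at least 2 levels.
 * For nice +19, DELTA(p) == 6, which the -5 ... +5 bonus range can never
 * reach - matching the table above.
 */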
153
154 #define TASK_PREEMPTS_CURR(p, rq) \
155         ((p)->prio < (rq)->curr->prio)
156
157 /*
158  * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
159  * to time slice values: [800ms ... 100ms ... 5ms]
160  *
161  * The higher a thread's priority, the bigger timeslices
162  * it gets during one round of execution. But even the lowest
163  * priority thread gets MIN_TIMESLICE worth of execution time.
164  */
165
166 #define SCALE_PRIO(x, prio) \
167         max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
168
169 static unsigned int task_timeslice(task_t *p)
170 {
171         if (p->static_prio < NICE_TO_PRIO(0))
172                 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
173         else
174                 return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
175 }
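/*
 * Illustrative check of the range quoted above (assuming HZ == 1000 and
 * the MAX_PRIO == 140 / MAX_USER_PRIO == 40 values sketched earlier, so
 * MAX_USER_PRIO/2 == 20 and one jiffy == 1ms):
 *
 *   nice -20: static_prio 100 -> SCALE_PRIO(400, 100) = 400*40/20 = 800ms
 *   nice   0: static_prio 120 -> SCALE_PRIO(100, 120) = 100*20/20 = 100ms
 *   nice +19: static_prio 139 -> SCALE_PRIO(100, 139) = 100* 1/20 =   5ms
 */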
176 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran)       \
177                                 < (long long) (sd)->cache_hot_time)
178
179 /*
180  * These are the runqueue data structures:
181  */
182
183 #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
184
185 typedef struct runqueue runqueue_t;
186
187 struct prio_array {
188         unsigned int nr_active;
189         unsigned long bitmap[BITMAP_SIZE];
190         struct list_head queue[MAX_PRIO];
191 };
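/*
 * Each prio_array keeps one list per priority level plus a bitmap with a
 * bit set for every non-empty list, so selecting the next task is a
 * find-first-set-bit over BITMAP_SIZE words followed by taking the head
 * of that list - constant-time work regardless of how many tasks are
 * runnable.
 */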
192
193 /*
194  * This is the main, per-CPU runqueue data structure.
195  *
196  * Locking rule: those places that want to lock multiple runqueues
197  * (such as the load balancing or the thread migration code) must
198  * acquire the locks in ascending &runqueue order.
199  */
200 struct runqueue {
201         spinlock_t lock;
202
203         /*
204          * nr_running and cpu_load should be in the same cacheline because
205          * remote CPUs use both these fields when doing load calculation.
206          */
207         unsigned long nr_running;
208 #ifdef CONFIG_SMP
209         unsigned long cpu_load[3];
210 #endif
211         unsigned long long nr_switches;
212
213         /*
214          * This is part of a global counter where only the total sum
215          * over all CPUs matters. A task can increase this counter on
216          * one CPU and if it got migrated afterwards it may decrease
217          * it on another CPU. Always updated under the runqueue lock:
218          */
219         unsigned long nr_uninterruptible;
220
221         unsigned long expired_timestamp;
222         unsigned long long timestamp_last_tick;
223         task_t *curr, *idle;
224         struct mm_struct *prev_mm;
225         prio_array_t *active, *expired, arrays[2];
226         int best_expired_prio;
227         atomic_t nr_iowait;
228
229 #ifdef CONFIG_SMP
230         struct sched_domain *sd;
231
232         /* For active balancing */
233         int active_balance;
234         int push_cpu;
235
236         task_t *migration_thread;
237         struct list_head migration_queue;
238 #endif
239
240 #ifdef CONFIG_SCHEDSTATS
241         /* latency stats */
242         struct sched_info rq_sched_info;
243
244         /* sys_sched_yield() stats */
245         unsigned long yld_exp_empty;
246         unsigned long yld_act_empty;
247         unsigned long yld_both_empty;
248         unsigned long yld_cnt;
249
250         /* schedule() stats */
251         unsigned long sched_switch;
252         unsigned long sched_cnt;
253         unsigned long sched_goidle;
254
255         /* try_to_wake_up() stats */
256         unsigned long ttwu_cnt;
257         unsigned long ttwu_local;
258 #endif
259 };
260
261 static DEFINE_PER_CPU(struct runqueue, runqueues);
262
263 #define for_each_domain(cpu, domain) \
264         for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
265
266 #define cpu_rq(cpu)             (&per_cpu(runqueues, (cpu)))
267 #define this_rq()               (&__get_cpu_var(runqueues))
268 #define task_rq(p)              cpu_rq(task_cpu(p))
269 #define cpu_curr(cpu)           (cpu_rq(cpu)->curr)
270
271 /*
272  * Default context-switch locking:
273  */
274 #ifndef prepare_arch_switch
275 # define prepare_arch_switch(rq, next)  do { } while (0)
276 # define finish_arch_switch(rq, next)   spin_unlock_irq(&(rq)->lock)
277 # define task_running(rq, p)            ((rq)->curr == (p))
278 #endif
279
280 /*
281  * task_rq_lock - lock the runqueue a given task resides on and disable
282  * interrupts.  Note the ordering: we can safely lookup the task_rq without
283  * explicitly disabling preemption.
284  */
285 static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
286         __acquires(rq->lock)
287 {
288         struct runqueue *rq;
289
290 repeat_lock_task:
291         local_irq_save(*flags);
292         rq = task_rq(p);
293         spin_lock(&rq->lock);
294         if (unlikely(rq != task_rq(p))) {
295                 spin_unlock_irqrestore(&rq->lock, *flags);
296                 goto repeat_lock_task;
297         }
298         return rq;
299 }
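/*
 * The lock-then-recheck loop above is needed because the task may be
 * migrated to another CPU between the task_rq(p) lookup and acquiring the
 * lock; if the runqueue we locked is no longer the task's runqueue, drop
 * it and retry.
 */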
300
301 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
302         __releases(rq->lock)
303 {
304         spin_unlock_irqrestore(&rq->lock, *flags);
305 }
306
307 #ifdef CONFIG_SCHEDSTATS
308 /*
309  * bump this up when changing the output format or the meaning of an existing
310  * format, so that tools can adapt (or abort)
311  */
312 #define SCHEDSTAT_VERSION 12
313
314 static int show_schedstat(struct seq_file *seq, void *v)
315 {
316         int cpu;
317
318         seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
319         seq_printf(seq, "timestamp %lu\n", jiffies);
320         for_each_online_cpu(cpu) {
321                 runqueue_t *rq = cpu_rq(cpu);
322 #ifdef CONFIG_SMP
323                 struct sched_domain *sd;
324                 int dcnt = 0;
325 #endif
326
327                 /* runqueue-specific stats */
328                 seq_printf(seq,
329                     "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
330                     cpu, rq->yld_both_empty,
331                     rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
332                     rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
333                     rq->ttwu_cnt, rq->ttwu_local,
334                     rq->rq_sched_info.cpu_time,
335                     rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
336
337                 seq_printf(seq, "\n");
338
339 #ifdef CONFIG_SMP
340                 /* domain-specific stats */
341                 for_each_domain(cpu, sd) {
342                         enum idle_type itype;
343                         char mask_str[NR_CPUS];
344
345                         cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
346                         seq_printf(seq, "domain%d %s", dcnt++, mask_str);
347                         for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
348                                         itype++) {
349                                 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu",
350                                     sd->lb_cnt[itype],
351                                     sd->lb_balanced[itype],
352                                     sd->lb_failed[itype],
353                                     sd->lb_imbalance[itype],
354                                     sd->lb_gained[itype],
355                                     sd->lb_hot_gained[itype],
356                                     sd->lb_nobusyq[itype],
357                                     sd->lb_nobusyg[itype]);
358                         }
359                         seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
360                             sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
361                             sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
362                             sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
363                             sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
364                 }
365 #endif
366         }
367         return 0;
368 }
369
370 static int schedstat_open(struct inode *inode, struct file *file)
371 {
372         unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
373         char *buf = kmalloc(size, GFP_KERNEL);
374         struct seq_file *m;
375         int res;
376
377         if (!buf)
378                 return -ENOMEM;
379         res = single_open(file, show_schedstat, NULL);
380         if (!res) {
381                 m = file->private_data;
382                 m->buf = buf;
383                 m->size = size;
384         } else
385                 kfree(buf);
386         return res;
387 }
388
389 struct file_operations proc_schedstat_operations = {
390         .open    = schedstat_open,
391         .read    = seq_read,
392         .llseek  = seq_lseek,
393         .release = single_release,
394 };
395
396 # define schedstat_inc(rq, field)       do { (rq)->field++; } while (0)
397 # define schedstat_add(rq, field, amt)  do { (rq)->field += (amt); } while (0)
398 #else /* !CONFIG_SCHEDSTATS */
399 # define schedstat_inc(rq, field)       do { } while (0)
400 # define schedstat_add(rq, field, amt)  do { } while (0)
401 #endif
402
403 /*
404  * rq_lock - lock a given runqueue and disable interrupts.
405  */
406 static inline runqueue_t *this_rq_lock(void)
407         __acquires(rq->lock)
408 {
409         runqueue_t *rq;
410
411         local_irq_disable();
412         rq = this_rq();
413         spin_lock(&rq->lock);
414
415         return rq;
416 }
417
418 #ifdef CONFIG_SCHEDSTATS
419 /*
420  * Called when a process is dequeued from the active array and given
421  * the cpu.  We should note that with the exception of interactive
422  * tasks, the expired queue will become the active queue after the active
423  * queue is empty, without explicitly dequeuing and requeuing tasks in the
424  * expired queue.  (Interactive tasks may be requeued directly to the
425  * active queue, thus delaying tasks in the expired queue from running;
426  * see scheduler_tick()).
427  *
428  * This function is only called from sched_info_arrive(), rather than
429  * dequeue_task(). Even though a task may be queued and dequeued multiple
430  * times as it is shuffled about, we're really interested in knowing how
431  * long it was from the *first* time it was queued to the time that it
432  * finally hit a cpu.
433  */
434 static inline void sched_info_dequeued(task_t *t)
435 {
436         t->sched_info.last_queued = 0;
437 }
438
439 /*
440  * Called when a task finally hits the cpu.  We can now calculate how
441  * long it was waiting to run.  We also note when it began so that we
442  * can keep stats on how long its timeslice is.
443  */
444 static inline void sched_info_arrive(task_t *t)
445 {
446         unsigned long now = jiffies, diff = 0;
447         struct runqueue *rq = task_rq(t);
448
449         if (t->sched_info.last_queued)
450                 diff = now - t->sched_info.last_queued;
451         sched_info_dequeued(t);
452         t->sched_info.run_delay += diff;
453         t->sched_info.last_arrival = now;
454         t->sched_info.pcnt++;
455
456         if (!rq)
457                 return;
458
459         rq->rq_sched_info.run_delay += diff;
460         rq->rq_sched_info.pcnt++;
461 }
462
463 /*
464  * Called when a process is queued into either the active or expired
465  * array.  The time is noted and later used to determine how long the
466  * task had to wait to reach the cpu.  Since the expired queue will
467  * become the active queue after the active queue is empty, without dequeuing
468  * and requeuing any tasks, we are interested in queuing to either. It
469  * is unusual but not impossible for tasks to be dequeued and immediately
470  * requeued in the same or another array: this can happen in sched_yield(),
471  * set_user_nice(), and even load_balance() as it moves tasks from runqueue
472  * to runqueue.
473  *
474  * This function is only called from enqueue_task(), but also only updates
475  * the timestamp if it is not already set.  It's assumed that
476  * sched_info_dequeued() will clear that stamp when appropriate.
477  */
478 static inline void sched_info_queued(task_t *t)
479 {
480         if (!t->sched_info.last_queued)
481                 t->sched_info.last_queued = jiffies;
482 }
483
484 /*
485  * Called when a process ceases being the active-running process, either
486  * voluntarily or involuntarily.  Now we can calculate how long we ran.
487  */
488 static inline void sched_info_depart(task_t *t)
489 {
490         struct runqueue *rq = task_rq(t);
491         unsigned long diff = jiffies - t->sched_info.last_arrival;
492
493         t->sched_info.cpu_time += diff;
494
495         if (rq)
496                 rq->rq_sched_info.cpu_time += diff;
497 }
498
499 /*
500  * Called when tasks are switched involuntarily due, typically, to expiring
501  * their time slice.  (This may also be called when switching to or from
502  * the idle task.)  We are only called when prev != next.
503  */
504 static inline void sched_info_switch(task_t *prev, task_t *next)
505 {
506         struct runqueue *rq = task_rq(prev);
507
508         /*
509          * prev now departs the cpu.  It's not interesting to record
510          * stats about how efficient we were at scheduling the idle
511          * process, however.
512          */
513         if (prev != rq->idle)
514                 sched_info_depart(prev);
515
516         if (next != rq->idle)
517                 sched_info_arrive(next);
518 }
519 #else
520 #define sched_info_queued(t)            do { } while (0)
521 #define sched_info_switch(t, next)      do { } while (0)
522 #endif /* CONFIG_SCHEDSTATS */
523
524 /*
525  * Adding/removing a task to/from a priority array:
526  */
527 static void dequeue_task(struct task_struct *p, prio_array_t *array)
528 {
529         array->nr_active--;
530         list_del(&p->run_list);
531         if (list_empty(array->queue + p->prio))
532                 __clear_bit(p->prio, array->bitmap);
533 }
534
535 static void enqueue_task(struct task_struct *p, prio_array_t *array)
536 {
537         sched_info_queued(p);
538         list_add_tail(&p->run_list, array->queue + p->prio);
539         __set_bit(p->prio, array->bitmap);
540         array->nr_active++;
541         p->array = array;
542 }
543
544 /*
545  * Put task to the end of the run list without the overhead of dequeue
546  * followed by enqueue.
547  */
548 static void requeue_task(struct task_struct *p, prio_array_t *array)
549 {
550         list_move_tail(&p->run_list, array->queue + p->prio);
551 }
552
553 static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
554 {
555         list_add(&p->run_list, array->queue + p->prio);
556         __set_bit(p->prio, array->bitmap);
557         array->nr_active++;
558         p->array = array;
559 }
560
561 /*
562  * effective_prio - return the priority that is based on the static
563  * priority but is modified by bonuses/penalties.
564  *
565  * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
566  * into the -5 ... 0 ... +5 bonus/penalty range.
567  *
568  * We use 25% of the full 0...39 priority range so that:
569  *
570  * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
571  * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
572  *
573  * Both properties are important to certain workloads.
574  */
575 static int effective_prio(task_t *p)
576 {
577         int bonus, prio;
578
579         if (rt_task(p))
580                 return p->prio;
581
582         bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
583
584         prio = p->static_prio - bonus;
585         if (prio < MAX_RT_PRIO)
586                 prio = MAX_RT_PRIO;
587         if (prio > MAX_PRIO-1)
588                 prio = MAX_PRIO-1;
589         return prio;
590 }
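/*
 * Worked example (with MAX_BONUS == 10 the bonus spans -5 ... +5): a
 * nice-0 task has static_prio 120; with a full sleep_avg it runs at
 * dynamic priority 115, while a pure CPU hog with sleep_avg == 0 runs at
 * 125.  The clamping keeps the result within [MAX_RT_PRIO, MAX_PRIO-1].
 */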
591
592 /*
593  * __activate_task - move a task to the runqueue.
594  */
595 static inline void __activate_task(task_t *p, runqueue_t *rq)
596 {
597         enqueue_task(p, rq->active);
598         rq->nr_running++;
599 }
600
601 /*
602  * __activate_idle_task - move idle task to the _front_ of runqueue.
603  */
604 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
605 {
606         enqueue_task_head(p, rq->active);
607         rq->nr_running++;
608 }
609
610 static void recalc_task_prio(task_t *p, unsigned long long now)
611 {
612         /* Caller must always ensure 'now >= p->timestamp' */
613         unsigned long long __sleep_time = now - p->timestamp;
614         unsigned long sleep_time;
615
616         if (__sleep_time > NS_MAX_SLEEP_AVG)
617                 sleep_time = NS_MAX_SLEEP_AVG;
618         else
619                 sleep_time = (unsigned long)__sleep_time;
620
621         if (likely(sleep_time > 0)) {
622                 /*
623                  * User tasks that sleep a long time are categorised as
624                  * idle and will get just interactive status to stay active &
625                  * prevent them suddenly becoming cpu hogs and starving
626                  * other processes.
627                  */
628                 if (p->mm && p->activated != -1 &&
629                         sleep_time > INTERACTIVE_SLEEP(p)) {
630                                 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
631                                                 DEF_TIMESLICE);
632                 } else {
633                         /*
634                          * The lower the sleep avg a task has the more
635                          * rapidly it will rise with sleep time.
636                          */
637                         sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
638
639                         /*
640                          * Tasks waking from uninterruptible sleep are
641                          * limited in their sleep_avg rise as they
642                          * are likely to be waiting on I/O
643                          */
644                         if (p->activated == -1 && p->mm) {
645                                 if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
646                                         sleep_time = 0;
647                                 else if (p->sleep_avg + sleep_time >=
648                                                 INTERACTIVE_SLEEP(p)) {
649                                         p->sleep_avg = INTERACTIVE_SLEEP(p);
650                                         sleep_time = 0;
651                                 }
652                         }
653
654                         /*
655                          * This code gives a bonus to interactive tasks.
656                          *
657                          * The boost works by updating the 'average sleep time'
658                          * value here, based on ->timestamp. The more time a
659                          * task spends sleeping, the higher the average gets -
660                          * and the higher the priority boost gets as well.
661                          */
662                         p->sleep_avg += sleep_time;
663
664                         if (p->sleep_avg > NS_MAX_SLEEP_AVG)
665                                 p->sleep_avg = NS_MAX_SLEEP_AVG;
666                 }
667         }
668
669         p->prio = effective_prio(p);
670 }
671
672 /*
673  * activate_task - move a task to the runqueue and do priority recalculation
674  *
675  * Update all the scheduling statistics stuff. (sleep average
676  * calculation, priority modifiers, etc.)
677  */
678 static void activate_task(task_t *p, runqueue_t *rq, int local)
679 {
680         unsigned long long now;
681
682         now = sched_clock();
683 #ifdef CONFIG_SMP
684         if (!local) {
685                 /* Compensate for drifting sched_clock */
686                 runqueue_t *this_rq = this_rq();
687                 now = (now - this_rq->timestamp_last_tick)
688                         + rq->timestamp_last_tick;
689         }
690 #endif
691
692         recalc_task_prio(p, now);
693
694         /*
695          * This checks to make sure it's not an uninterruptible task
696          * that is now waking up.
697          */
698         if (!p->activated) {
699                 /*
700                  * Tasks which were woken up by interrupts (ie. hw events)
701                  * are most likely of interactive nature. So we give them
702                  * the credit of extending their sleep time to the period
703                  * of time they spend on the runqueue, waiting for execution
704                  * on a CPU, first time around:
705                  */
706                 if (in_interrupt())
707                         p->activated = 2;
708                 else {
709                         /*
710                          * Normal first-time wakeups get a credit too for
711                          * on-runqueue time, but it will be weighted down:
712                          */
713                         p->activated = 1;
714                 }
715         }
716         p->timestamp = now;
717
718         __activate_task(p, rq);
719 }
720
721 /*
722  * deactivate_task - remove a task from the runqueue.
723  */
724 static void deactivate_task(struct task_struct *p, runqueue_t *rq)
725 {
726         rq->nr_running--;
727         dequeue_task(p, p->array);
728         p->array = NULL;
729 }
730
731 /*
732  * resched_task - mark a task 'to be rescheduled now'.
733  *
734  * On UP this means the setting of the need_resched flag, on SMP it
735  * might also involve a cross-CPU call to trigger the scheduler on
736  * the target CPU.
737  */
738 #ifdef CONFIG_SMP
739 static void resched_task(task_t *p)
740 {
741         int need_resched, nrpolling;
742
743         assert_spin_locked(&task_rq(p)->lock);
744
745         /* minimise the chance of sending an interrupt to poll_idle() */
746         nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
747         need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
748         nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
749
750         if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
751                 smp_send_reschedule(task_cpu(p));
752 }
753 #else
754 static inline void resched_task(task_t *p)
755 {
756         set_tsk_need_resched(p);
757 }
758 #endif
759
760 /**
761  * task_curr - is this task currently executing on a CPU?
762  * @p: the task in question.
763  */
764 inline int task_curr(const task_t *p)
765 {
766         return cpu_curr(task_cpu(p)) == p;
767 }
768
769 #ifdef CONFIG_SMP
770 enum request_type {
771         REQ_MOVE_TASK,
772         REQ_SET_DOMAIN,
773 };
774
775 typedef struct {
776         struct list_head list;
777         enum request_type type;
778
779         /* For REQ_MOVE_TASK */
780         task_t *task;
781         int dest_cpu;
782
783         /* For REQ_SET_DOMAIN */
784         struct sched_domain *sd;
785
786         struct completion done;
787 } migration_req_t;
788
789 /*
790  * The task's runqueue lock must be held.
791  * Returns true if you have to wait for migration thread.
792  */
793 static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
794 {
795         runqueue_t *rq = task_rq(p);
796
797         /*
798          * If the task is not on a runqueue (and not running), then
799          * it is sufficient to simply update the task's cpu field.
800          */
801         if (!p->array && !task_running(rq, p)) {
802                 set_task_cpu(p, dest_cpu);
803                 return 0;
804         }
805
806         init_completion(&req->done);
807         req->type = REQ_MOVE_TASK;
808         req->task = p;
809         req->dest_cpu = dest_cpu;
810         list_add(&req->list, &rq->migration_queue);
811         return 1;
812 }
813
814 /*
815  * wait_task_inactive - wait for a thread to unschedule.
816  *
817  * The caller must ensure that the task *will* unschedule sometime soon,
818  * else this function might spin for a *long* time. This function can't
819  * be called with interrupts off, or it may introduce deadlock with
820  * smp_call_function() if an IPI is sent by the same process we are
821  * waiting to become inactive.
822  */
823 void wait_task_inactive(task_t * p)
824 {
825         unsigned long flags;
826         runqueue_t *rq;
827         int preempted;
828
829 repeat:
830         rq = task_rq_lock(p, &flags);
831         /* Must be off runqueue entirely, not preempted. */
832         if (unlikely(p->array || task_running(rq, p))) {
833                 /* If it's preempted, we yield.  It could be a while. */
834                 preempted = !task_running(rq, p);
835                 task_rq_unlock(rq, &flags);
836                 cpu_relax();
837                 if (preempted)
838                         yield();
839                 goto repeat;
840         }
841         task_rq_unlock(rq, &flags);
842 }
843
844 /***
845  * kick_process - kick a running thread to enter/exit the kernel
846  * @p: the to-be-kicked thread
847  *
848  * Cause a process which is running on another CPU to enter
849  * kernel-mode, without any delay. (to get signals handled.)
850  *
851  * NOTE: this function doesn't have to take the runqueue lock,
852  * because all it wants to ensure is that the remote task enters
853  * the kernel. If the IPI races and the task has been migrated
854  * to another CPU then no harm is done and the purpose has been
855  * achieved as well.
856  */
857 void kick_process(task_t *p)
858 {
859         int cpu;
860
861         preempt_disable();
862         cpu = task_cpu(p);
863         if ((cpu != smp_processor_id()) && task_curr(p))
864                 smp_send_reschedule(cpu);
865         preempt_enable();
866 }
867
868 /*
869  * Return a low guess at the load of a migration-source cpu.
870  *
871  * We want to under-estimate the load of migration sources, to
872  * balance conservatively.
873  */
874 static inline unsigned long source_load(int cpu, int type)
875 {
876         runqueue_t *rq = cpu_rq(cpu);
877         unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
878         if (type == 0)
879                 return load_now;
880
881         return min(rq->cpu_load[type-1], load_now);
882 }
883
884 /*
885  * Return a high guess at the load of a migration-target cpu
886  */
887 static inline unsigned long target_load(int cpu, int type)
888 {
889         runqueue_t *rq = cpu_rq(cpu);
890         unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
891         if (type == 0)
892                 return load_now;
893
894         return max(rq->cpu_load[type-1], load_now);
895 }
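/*
 * The 'type' index selects one of the smoothed cpu_load[] figures;
 * deliberately taking min() for a migration source and max() for a
 * migration target under- resp. over-estimates the load, which keeps
 * balancing decisions conservative and damps ping-pong migrations.
 */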
896
897 /*
898  * find_idlest_group finds and returns the least busy CPU group within the
899  * domain.
900  */
901 static struct sched_group *
902 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
903 {
904         struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
905         unsigned long min_load = ULONG_MAX, this_load = 0;
906         int load_idx = sd->forkexec_idx;
907         int imbalance = 100 + (sd->imbalance_pct-100)/2;
908
909         do {
910                 unsigned long load, avg_load;
911                 int local_group;
912                 int i;
913
914                 local_group = cpu_isset(this_cpu, group->cpumask);
915                 /* XXX: put a cpus allowed check */
916
917                 /* Tally up the load of all CPUs in the group */
918                 avg_load = 0;
919
920                 for_each_cpu_mask(i, group->cpumask) {
921                         /* Bias balancing toward cpus of our domain */
922                         if (local_group)
923                                 load = source_load(i, load_idx);
924                         else
925                                 load = target_load(i, load_idx);
926
927                         avg_load += load;
928                 }
929
930                 /* Adjust by relative CPU power of the group */
931                 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
932
933                 if (local_group) {
934                         this_load = avg_load;
935                         this = group;
936                 } else if (avg_load < min_load) {
937                         min_load = avg_load;
938                         idlest = group;
939                 }
940                 group = group->next;
941         } while (group != sd->groups);
942
943         if (!idlest || 100*this_load < imbalance*min_load)
944                 return NULL;
945         return idlest;
946 }
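/*
 * Worked example: with a typical imbalance_pct of 125 (the actual value is
 * per-domain), imbalance == 100 + 25/2 == 112, so the final check returns
 * NULL ("stay local") unless the local group carries at least ~12% more
 * load than the idlest remote group.
 */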
947
948 /*
949  * find_idlest_cpu - find the idlest cpu among the cpus in group.
950  */
951 static int find_idlest_cpu(struct sched_group *group, int this_cpu)
952 {
953         unsigned long load, min_load = ULONG_MAX;
954         int idlest = -1;
955         int i;
956
957         for_each_cpu_mask(i, group->cpumask) {
958                 load = source_load(i, 0);
959
960                 if (load < min_load || (load == min_load && i == this_cpu)) {
961                         min_load = load;
962                         idlest = i;
963                 }
964         }
965
966         return idlest;
967 }
968
969
970 #endif
971
972 /*
973  * wake_idle() will wake a task on an idle cpu if task->cpu is
974  * not idle and an idle cpu is available.  The span of cpus to
975  * search starts with cpus closest then further out as needed,
976  * so we always favor a closer, idle cpu.
977  *
978  * Returns the CPU we should wake onto.
979  */
980 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
981 static int wake_idle(int cpu, task_t *p)
982 {
983         cpumask_t tmp;
984         struct sched_domain *sd;
985         int i;
986
987         if (idle_cpu(cpu))
988                 return cpu;
989
990         for_each_domain(cpu, sd) {
991                 if (sd->flags & SD_WAKE_IDLE) {
992                         cpus_and(tmp, sd->span, p->cpus_allowed);
993                         for_each_cpu_mask(i, tmp) {
994                                 if (idle_cpu(i))
995                                         return i;
996                         }
997                 }
998                 else
999                         break;
1000         }
1001         return cpu;
1002 }
1003 #else
1004 static inline int wake_idle(int cpu, task_t *p)
1005 {
1006         return cpu;
1007 }
1008 #endif
1009
1010 /***
1011  * try_to_wake_up - wake up a thread
1012  * @p: the to-be-woken-up thread
1013  * @state: the mask of task states that can be woken
1014  * @sync: do a synchronous wakeup?
1015  *
1016  * Put it on the run-queue if it's not already there. The "current"
1017  * thread is always on the run-queue (except when the actual
1018  * re-schedule is in progress), and as such you're allowed to do
1019  * the simpler "current->state = TASK_RUNNING" to mark yourself
1020  * runnable without the overhead of this.
1021  *
1022  * returns failure only if the task is already active.
1023  */
1024 static int try_to_wake_up(task_t * p, unsigned int state, int sync)
1025 {
1026         int cpu, this_cpu, success = 0;
1027         unsigned long flags;
1028         long old_state;
1029         runqueue_t *rq;
1030 #ifdef CONFIG_SMP
1031         unsigned long load, this_load;
1032         struct sched_domain *sd, *this_sd = NULL;
1033         int new_cpu;
1034 #endif
1035
1036         rq = task_rq_lock(p, &flags);
1037         old_state = p->state;
1038         if (!(old_state & state))
1039                 goto out;
1040
1041         if (p->array)
1042                 goto out_running;
1043
1044         cpu = task_cpu(p);
1045         this_cpu = smp_processor_id();
1046
1047 #ifdef CONFIG_SMP
1048         if (unlikely(task_running(rq, p)))
1049                 goto out_activate;
1050
1051         new_cpu = cpu;
1052
1053         schedstat_inc(rq, ttwu_cnt);
1054         if (cpu == this_cpu) {
1055                 schedstat_inc(rq, ttwu_local);
1056                 goto out_set_cpu;
1057         }
1058
1059         for_each_domain(this_cpu, sd) {
1060                 if (cpu_isset(cpu, sd->span)) {
1061                         schedstat_inc(sd, ttwu_wake_remote);
1062                         this_sd = sd;
1063                         break;
1064                 }
1065         }
1066
1067         if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1068                 goto out_set_cpu;
1069
1070         /*
1071          * Check for affine wakeup and passive balancing possibilities.
1072          */
1073         if (this_sd) {
1074                 int idx = this_sd->wake_idx;
1075                 unsigned int imbalance;
1076
1077                 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1078
1079                 load = source_load(cpu, idx);
1080                 this_load = target_load(this_cpu, idx);
1081
1082                 new_cpu = this_cpu; /* Wake to this CPU if we can */
1083
1084                 if (this_sd->flags & SD_WAKE_AFFINE) {
1085                         unsigned long tl = this_load;
1086                         /*
1087                          * If sync wakeup then subtract the (maximum possible)
1088                          * effect of the currently running task from the load
1089                          * of the current CPU:
1090                          */
1091                         if (sync)
1092                                 tl -= SCHED_LOAD_SCALE;
1093
1094                         if ((tl <= load &&
1095                                 tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
1096                                 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) {
1097                                 /*
1098                                  * This domain has SD_WAKE_AFFINE and
1099                                  * p is cache cold in this domain, and
1100                                  * there is no bad imbalance.
1101                                  */
1102                                 schedstat_inc(this_sd, ttwu_move_affine);
1103                                 goto out_set_cpu;
1104                         }
1105                 }
1106
1107                 /*
1108                  * Start passive balancing when half the imbalance_pct
1109                  * limit is reached.
1110                  */
1111                 if (this_sd->flags & SD_WAKE_BALANCE) {
1112                         if (imbalance*this_load <= 100*load) {
1113                                 schedstat_inc(this_sd, ttwu_move_balance);
1114                                 goto out_set_cpu;
1115                         }
1116                 }
1117         }
1118
1119         new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1120 out_set_cpu:
1121         new_cpu = wake_idle(new_cpu, p);
1122         if (new_cpu != cpu) {
1123                 set_task_cpu(p, new_cpu);
1124                 task_rq_unlock(rq, &flags);
1125                 /* might preempt at this point */
1126                 rq = task_rq_lock(p, &flags);
1127                 old_state = p->state;
1128                 if (!(old_state & state))
1129                         goto out;
1130                 if (p->array)
1131                         goto out_running;
1132
1133                 this_cpu = smp_processor_id();
1134                 cpu = task_cpu(p);
1135         }
1136
1137 out_activate:
1138 #endif /* CONFIG_SMP */
1139         if (old_state == TASK_UNINTERRUPTIBLE) {
1140                 rq->nr_uninterruptible--;
1141                 /*
1142                  * Tasks on involuntary sleep don't earn
1143                  * sleep_avg beyond just interactive state.
1144                  */
1145                 p->activated = -1;
1146         }
1147
1148         /*
1149          * Sync wakeups (i.e. those types of wakeups where the waker
1150          * has indicated that it will leave the CPU in short order)
1151          * don't trigger a preemption, if the woken up task will run on
1152          * this cpu. (in this case the 'I will reschedule' promise of
1153          * the waker guarantees that the freshly woken up task is going
1154          * to be considered on this CPU.)
1155          */
1156         activate_task(p, rq, cpu == this_cpu);
1157         if (!sync || cpu != this_cpu) {
1158                 if (TASK_PREEMPTS_CURR(p, rq))
1159                         resched_task(rq->curr);
1160         }
1161         success = 1;
1162
1163 out_running:
1164         p->state = TASK_RUNNING;
1165 out:
1166         task_rq_unlock(rq, &flags);
1167
1168         return success;
1169 }
1170
1171 int fastcall wake_up_process(task_t * p)
1172 {
1173         return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1174                                  TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1175 }
1176
1177 EXPORT_SYMBOL(wake_up_process);
1178
1179 int fastcall wake_up_state(task_t *p, unsigned int state)
1180 {
1181         return try_to_wake_up(p, state, 0);
1182 }
1183
1184 /*
1185  * Perform scheduler related setup for a newly forked process p.
1186  * p is forked by current.
1187  */
1188 void fastcall sched_fork(task_t *p)
1189 {
1190         /*
1191          * We mark the process as running here, but have not actually
1192          * inserted it onto the runqueue yet. This guarantees that
1193          * nobody will actually run it, and a signal or other external
1194          * event cannot wake it up and insert it on the runqueue either.
1195          */
1196         p->state = TASK_RUNNING;
1197         INIT_LIST_HEAD(&p->run_list);
1198         p->array = NULL;
1199         spin_lock_init(&p->switch_lock);
1200 #ifdef CONFIG_SCHEDSTATS
1201         memset(&p->sched_info, 0, sizeof(p->sched_info));
1202 #endif
1203 #ifdef CONFIG_PREEMPT
1204         /*
1205          * During context-switch we hold precisely one spinlock, which
1206          * schedule_tail drops. (in the common case it's this_rq()->lock,
1207          * but it also can be p->switch_lock.) So we compensate with a count
1208          * of 1. Also, we want to start with kernel preemption disabled.
1209          */
1210         p->thread_info->preempt_count = 1;
1211 #endif
1212         /*
1213  * Share the timeslice between parent and child, so that the
1214  * total amount of pending timeslices in the system doesn't change,
1215          * resulting in more scheduling fairness.
1216          */
1217         local_irq_disable();
1218         p->time_slice = (current->time_slice + 1) >> 1;
1219         /*
1220          * The remainder of the first timeslice might be recovered by
1221          * the parent if the child exits early enough.
1222          */
1223         p->first_time_slice = 1;
1224         current->time_slice >>= 1;
1225         p->timestamp = sched_clock();
1226         if (unlikely(!current->time_slice)) {
1227                 /*
1228                  * This case is rare, it happens when the parent has only
1229                  * a single jiffy left from its timeslice. Taking the
1230                  * runqueue lock is not a problem.
1231                  */
1232                 current->time_slice = 1;
1233                 preempt_disable();
1234                 scheduler_tick();
1235                 local_irq_enable();
1236                 preempt_enable();
1237         } else
1238                 local_irq_enable();
1239 }
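/*
 * Note that the timeslice split above is conserving: a parent holding 100
 * ticks keeps 50 while the child starts with (100+1)>>1 == 50, and 101
 * ticks split as 50/51 - so fork() cannot be used to manufacture extra
 * CPU time.
 */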
1240
1241 /*
1242  * wake_up_new_task - wake up a newly created task for the first time.
1243  *
1244  * This function will do some initial scheduler statistics housekeeping
1245  * that must be done for every newly created context, then puts the task
1246  * on the runqueue and wakes it.
1247  */
1248 void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
1249 {
1250         unsigned long flags;
1251         int this_cpu, cpu;
1252         runqueue_t *rq, *this_rq;
1253 #ifdef CONFIG_SMP
1254         struct sched_domain *tmp, *sd = NULL;
1255 #endif
1256
1257         rq = task_rq_lock(p, &flags);
1258         BUG_ON(p->state != TASK_RUNNING);
1259         this_cpu = smp_processor_id();
1260         cpu = task_cpu(p);
1261
1262 #ifdef CONFIG_SMP
1263         for_each_domain(cpu, tmp)
1264                 if (tmp->flags & SD_BALANCE_FORK)
1265                         sd = tmp;
1266
1267         if (sd) {
1268                 int new_cpu;
1269                 struct sched_group *group;
1270
1271                 schedstat_inc(sd, sbf_cnt);
1272                 cpu = task_cpu(p);
1273                 group = find_idlest_group(sd, p, cpu);
1274                 if (!group) {
1275                         schedstat_inc(sd, sbf_balanced);
1276                         goto no_forkbalance;
1277                 }
1278
1279                 new_cpu = find_idlest_cpu(group, cpu);
1280                 if (new_cpu == -1 || new_cpu == cpu) {
1281                         schedstat_inc(sd, sbf_balanced);
1282                         goto no_forkbalance;
1283                 }
1284
1285                 if (cpu_isset(new_cpu, p->cpus_allowed)) {
1286                         schedstat_inc(sd, sbf_pushed);
1287                         set_task_cpu(p, new_cpu);
1288                         task_rq_unlock(rq, &flags);
1289                         rq = task_rq_lock(p, &flags);
1290                         cpu = task_cpu(p);
1291                 }
1292         }
1293
1294 no_forkbalance:
1295 #endif
1296         /*
1297          * We decrease the sleep average of forking parents
1298          * and children as well, to keep max-interactive tasks
1299          * from forking tasks that are max-interactive. The parent
1300          * (current) is done further down, under its lock.
1301          */
1302         p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
1303                 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1304
1305         p->prio = effective_prio(p);
1306
1307         if (likely(cpu == this_cpu)) {
1308                 if (!(clone_flags & CLONE_VM)) {
1309                         /*
1310                          * The VM isn't cloned, so we're in a good position to
1311                          * do child-runs-first in anticipation of an exec. This
1312                          * usually avoids a lot of COW overhead.
1313                          */
1314                         if (unlikely(!current->array))
1315                                 __activate_task(p, rq);
1316                         else {
1317                                 p->prio = current->prio;
1318                                 list_add_tail(&p->run_list, &current->run_list);
1319                                 p->array = current->array;
1320                                 p->array->nr_active++;
1321                                 rq->nr_running++;
1322                         }
1323                         set_need_resched();
1324                 } else
1325                         /* Run child last */
1326                         __activate_task(p, rq);
1327                 /*
1328                  * We skip the following code due to cpu == this_cpu
1329                  *
1330                  *   task_rq_unlock(rq, &flags);
1331                  *   this_rq = task_rq_lock(current, &flags);
1332                  */
1333                 this_rq = rq;
1334         } else {
1335                 this_rq = cpu_rq(this_cpu);
1336
1337                 /*
1338                  * Not the local CPU - must adjust timestamp. This should
1339                  * get optimised away in the !CONFIG_SMP case.
1340                  */
1341                 p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
1342                                         + rq->timestamp_last_tick;
1343                 __activate_task(p, rq);
1344                 if (TASK_PREEMPTS_CURR(p, rq))
1345                         resched_task(rq->curr);
1346
1347                 /*
1348                  * Parent and child are on different CPUs, now get the
1349                  * parent runqueue to update the parent's ->sleep_avg:
1350                  */
1351                 task_rq_unlock(rq, &flags);
1352                 this_rq = task_rq_lock(current, &flags);
1353         }
1354         current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1355                 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1356         task_rq_unlock(this_rq, &flags);
1357 }
1358
1359 /*
1360  * Potentially available exiting-child timeslices are
1361  * retrieved here - this way the parent does not get
1362  * penalized for creating too many threads.
1363  *
1364  * (this cannot be used to 'generate' timeslices
1365  * artificially, because any timeslice recovered here
1366  * was given away by the parent in the first place.)
1367  */
1368 void fastcall sched_exit(task_t * p)
1369 {
1370         unsigned long flags;
1371         runqueue_t *rq;
1372
1373         /*
1374          * If the child was a (relative-) CPU hog then decrease
1375          * the sleep_avg of the parent as well.
1376          */
1377         rq = task_rq_lock(p->parent, &flags);
1378         if (p->first_time_slice) {
1379                 p->parent->time_slice += p->time_slice;
1380                 if (unlikely(p->parent->time_slice > task_timeslice(p)))
1381                         p->parent->time_slice = task_timeslice(p);
1382         }
1383         if (p->sleep_avg < p->parent->sleep_avg)
1384                 p->parent->sleep_avg = p->parent->sleep_avg /
1385                 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
1386                 (EXIT_WEIGHT + 1);
1387         task_rq_unlock(rq, &flags);
1388 }
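/*
 * Worked example: with EXIT_WEIGHT == 3 the update above amounts to
 *
 *   parent->sleep_avg = 3/4 * parent->sleep_avg + 1/4 * p->sleep_avg
 *
 * so a CPU-hog child (low sleep_avg) drags the parent's interactivity
 * estimate down, but only by a quarter of the difference per exit.
 */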
1389
1390 /**
1391  * finish_task_switch - clean up after a task-switch
1392  * @prev: the thread we just switched away from.
1393  *
1394  * We enter this with the runqueue still locked, and finish_arch_switch()
1395  * will unlock it along with doing any other architecture-specific cleanup
1396  * actions.
1397  *
1398  * Note that we may have delayed dropping an mm in context_switch(). If
1399  * so, we finish that here outside of the runqueue lock.  (Doing it
1400  * with the lock held can cause deadlocks; see schedule() for
1401  * details.)
1402  */
1403 static inline void finish_task_switch(task_t *prev)
1404         __releases(rq->lock)
1405 {
1406         runqueue_t *rq = this_rq();
1407         struct mm_struct *mm = rq->prev_mm;
1408         unsigned long prev_task_flags;
1409
1410         rq->prev_mm = NULL;
1411
1412         /*
1413          * A task struct has one reference for the use as "current".
1414          * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and
1415          * calls schedule one last time. The schedule call will never return,
1416          * and the scheduled task must drop that reference.
1417          * The test for EXIT_ZOMBIE must occur while the runqueue locks are
1418          * still held, otherwise prev could be scheduled on another cpu, die
1419          * there before we look at prev->state, and then the reference would
1420          * be dropped twice.
1421          *              Manfred Spraul <manfred@colorfullife.com>
1422          */
1423         prev_task_flags = prev->flags;
1424         finish_arch_switch(rq, prev);
1425         if (mm)
1426                 mmdrop(mm);
1427         if (unlikely(prev_task_flags & PF_DEAD))
1428                 put_task_struct(prev);
1429 }
1430
1431 /**
1432  * schedule_tail - first thing a freshly forked thread must call.
1433  * @prev: the thread we just switched away from.
1434  */
1435 asmlinkage void schedule_tail(task_t *prev)
1436         __releases(rq->lock)
1437 {
1438         finish_task_switch(prev);
1439
1440         if (current->set_child_tid)
1441                 put_user(current->pid, current->set_child_tid);
1442 }
1443
1444 /*
1445  * context_switch - switch to the new MM and the new
1446  * thread's register state.
1447  */
1448 static inline
1449 task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
1450 {
1451         struct mm_struct *mm = next->mm;
1452         struct mm_struct *oldmm = prev->active_mm;
1453
1454         if (unlikely(!mm)) {
1455                 next->active_mm = oldmm;
1456                 atomic_inc(&oldmm->mm_count);
1457                 enter_lazy_tlb(oldmm, next);
1458         } else
1459                 switch_mm(oldmm, mm, next);
1460
1461         if (unlikely(!prev->mm)) {
1462                 prev->active_mm = NULL;
1463                 WARN_ON(rq->prev_mm);
1464                 rq->prev_mm = oldmm;
1465         }
1466
1467         /* Here we just switch the register state and the stack. */
1468         switch_to(prev, next, prev);
1469
1470         return prev;
1471 }
1472
1473 /*
1474  * nr_running, nr_uninterruptible and nr_context_switches:
1475  *
1476  * externally visible scheduler statistics: current number of runnable
1477  * threads, current number of uninterruptible-sleeping threads, total
1478  * number of context switches performed since bootup.
1479  */
1480 unsigned long nr_running(void)
1481 {
1482         unsigned long i, sum = 0;
1483
1484         for_each_online_cpu(i)
1485                 sum += cpu_rq(i)->nr_running;
1486
1487         return sum;
1488 }
1489
1490 unsigned long nr_uninterruptible(void)
1491 {
1492         unsigned long i, sum = 0;
1493
1494         for_each_cpu(i)
1495                 sum += cpu_rq(i)->nr_uninterruptible;
1496
1497         /*
1498          * Since we read the counters lockless, it might be slightly
1499          * inaccurate. Do not allow it to go below zero though:
1500          */
1501         if (unlikely((long)sum < 0))
1502                 sum = 0;
1503
1504         return sum;
1505 }
1506
1507 unsigned long long nr_context_switches(void)
1508 {
1509         unsigned long long i, sum = 0;
1510
1511         for_each_cpu(i)
1512                 sum += cpu_rq(i)->nr_switches;
1513
1514         return sum;
1515 }
1516
1517 unsigned long nr_iowait(void)
1518 {
1519         unsigned long i, sum = 0;
1520
1521         for_each_cpu(i)
1522                 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1523
1524         return sum;
1525 }
1526
1527 #ifdef CONFIG_SMP
1528
1529 /*
1530  * double_rq_lock - safely lock two runqueues
1531  *
1532  * Note this does not disable interrupts like task_rq_lock,
1533  * you need to do so manually before calling.
1534  */
1535 static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1536         __acquires(rq1->lock)
1537         __acquires(rq2->lock)
1538 {
1539         if (rq1 == rq2) {
1540                 spin_lock(&rq1->lock);
1541                 __acquire(rq2->lock);   /* Fake it out ;) */
1542         } else {
1543                 if (rq1 < rq2) {
1544                         spin_lock(&rq1->lock);
1545                         spin_lock(&rq2->lock);
1546                 } else {
1547                         spin_lock(&rq2->lock);
1548                         spin_lock(&rq1->lock);
1549                 }
1550         }
1551 }
1552
1553 /*
1554  * double_rq_unlock - safely unlock two runqueues
1555  *
1556  * Note this does not restore interrupts like task_rq_unlock;
1557  * you need to do so manually after calling.
1558  */
1559 static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
1560         __releases(rq1->lock)
1561         __releases(rq2->lock)
1562 {
1563         spin_unlock(&rq1->lock);
1564         if (rq1 != rq2)
1565                 spin_unlock(&rq2->lock);
1566         else
1567                 __release(rq2->lock);
1568 }
1569
1570 /*
1571  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1572  */
1573 static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1574         __releases(this_rq->lock)
1575         __acquires(busiest->lock)
1576         __acquires(this_rq->lock)
1577 {
1578         if (unlikely(!spin_trylock(&busiest->lock))) {
1579                 if (busiest < this_rq) {
1580                         spin_unlock(&this_rq->lock);
1581                         spin_lock(&busiest->lock);
1582                         spin_lock(&this_rq->lock);
1583                 } else
1584                         spin_lock(&busiest->lock);
1585         }
1586 }
1587
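/*
 * Editorial sketch (not part of sched.c): the address-ordered locking rule
 * that double_rq_lock() and double_lock_balance() above rely on to avoid
 * ABBA deadlock -- whichever runqueue has the lower address is always locked
 * first.  Shown here with POSIX mutexes and hypothetical demo_* names; the
 * pointer comparison mirrors the kernel idiom.
 */
#include <pthread.h>

static void demo_double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a == b) {
                pthread_mutex_lock(a);          /* same queue: only one lock */
        } else if (a < b) {                     /* lower address first ... */
                pthread_mutex_lock(a);
                pthread_mutex_lock(b);
        } else {                                /* ... regardless of caller order */
                pthread_mutex_lock(b);
                pthread_mutex_lock(a);
        }
}

static void demo_double_unlock(pthread_mutex_t *a, pthread_mutex_t *b)
{
        pthread_mutex_unlock(a);
        if (a != b)
                pthread_mutex_unlock(b);
}
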
1588 /*
1589  * If dest_cpu is allowed for this process, migrate the task to it.
1590  * This is done by queueing a migration request on the task's current
1591  * runqueue, waking that runqueue's migration thread, and waiting for
1592  * the request to complete.
1593  */
1594 static void sched_migrate_task(task_t *p, int dest_cpu)
1595 {
1596         migration_req_t req;
1597         runqueue_t *rq;
1598         unsigned long flags;
1599
1600         rq = task_rq_lock(p, &flags);
1601         if (!cpu_isset(dest_cpu, p->cpus_allowed)
1602             || unlikely(cpu_is_offline(dest_cpu)))
1603                 goto out;
1604
1605         /* force the process onto the specified CPU */
1606         if (migrate_task(p, dest_cpu, &req)) {
1607                 /* Need to wait for migration thread (might exit: take ref). */
1608                 struct task_struct *mt = rq->migration_thread;
1609                 get_task_struct(mt);
1610                 task_rq_unlock(rq, &flags);
1611                 wake_up_process(mt);
1612                 put_task_struct(mt);
1613                 wait_for_completion(&req.done);
1614                 return;
1615         }
1616 out:
1617         task_rq_unlock(rq, &flags);
1618 }
1619
1620 /*
1621  * sched_exec(): find the highest-level, exec-balance-capable
1622  * domain and try to migrate the task to the least loaded CPU.
1623  *
1624  * execve() is a valuable balancing opportunity, because at this point
1625  * the task has the smallest effective memory and cache footprint.
1626  */
1627 void sched_exec(void)
1628 {
1629         struct sched_domain *tmp, *sd = NULL;
1630         int new_cpu, this_cpu = get_cpu();
1631
1632         for_each_domain(this_cpu, tmp)
1633                 if (tmp->flags & SD_BALANCE_EXEC)
1634                         sd = tmp;
1635
1636         if (sd) {
1637                 struct sched_group *group;
1638                 schedstat_inc(sd, sbe_cnt);
1639                 group = find_idlest_group(sd, current, this_cpu);
1640                 if (!group) {
1641                         schedstat_inc(sd, sbe_balanced);
1642                         goto out;
1643                 }
1644                 new_cpu = find_idlest_cpu(group, this_cpu);
1645                 if (new_cpu == -1 || new_cpu == this_cpu) {
1646                         schedstat_inc(sd, sbe_balanced);
1647                         goto out;
1648                 }
1649
1650                 schedstat_inc(sd, sbe_pushed);
1651                 put_cpu();
1652                 sched_migrate_task(current, new_cpu);
1653                 return;
1654         }
1655 out:
1656         put_cpu();
1657 }
1658
1659 /*
1660  * pull_task - move a task from a remote runqueue to the local runqueue.
1661  * Both runqueues must be locked.
1662  */
1663 static inline
1664 void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1665                runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1666 {
1667         dequeue_task(p, src_array);
1668         src_rq->nr_running--;
1669         set_task_cpu(p, this_cpu);
1670         this_rq->nr_running++;
1671         enqueue_task(p, this_array);
1672         p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1673                                 + this_rq->timestamp_last_tick;
1674         /*
1675          * Note that idle threads have a prio of MAX_PRIO, so this test
1676          * is always true for them.
1677          */
1678         if (TASK_PREEMPTS_CURR(p, this_rq))
1679                 resched_task(this_rq->curr);
1680 }
1681
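/*
 * Editorial sketch (not part of sched.c): the timestamp rebasing done by
 * pull_task() above.  sched_clock() values from different CPUs are not
 * directly comparable, so only the task's offset from the source CPU's last
 * tick is kept and re-applied on top of the destination CPU's last tick.
 * demo_rebase_timestamp() is a hypothetical helper.
 */
static unsigned long long
demo_rebase_timestamp(unsigned long long ts,
                      unsigned long long src_last_tick,
                      unsigned long long dst_last_tick)
{
        return (ts - src_last_tick) + dst_last_tick;
}
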
1682 /*
1683  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1684  */
1685 static inline
1686 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1687              struct sched_domain *sd, enum idle_type idle, int *all_pinned)
1688 {
1689         /*
1690          * We do not migrate tasks that are:
1691          * 1) running (obviously), or
1692          * 2) not allowed to run on this CPU due to cpus_allowed, or
1693          * 3) cache-hot on their current CPU.
1694          */
1695         if (!cpu_isset(this_cpu, p->cpus_allowed))
1696                 return 0;
1697         *all_pinned = 0;
1698
1699         if (task_running(rq, p))
1700                 return 0;
1701
1702         /*
1703          * Aggressive migration if:
1704          * 1) task is cache cold, or
1705          * 2) too many balance attempts have failed.
1706          */
1707
1708         if (sd->nr_balance_failed > sd->cache_nice_tries)
1709                 return 1;
1710
1711         if (task_hot(p, rq->timestamp_last_tick, sd))
1712                 return 0;
1713         return 1;
1714 }
1715
1716 /*
1717  * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
1718  * as part of a balancing operation within "domain". Returns the number of
1719  * tasks moved.
1720  *
1721  * Called with both runqueues locked.
1722  */
1723 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
1724                       unsigned long max_nr_move, struct sched_domain *sd,
1725                       enum idle_type idle, int *all_pinned)
1726 {
1727         prio_array_t *array, *dst_array;
1728         struct list_head *head, *curr;
1729         int idx, pulled = 0, pinned = 0;
1730         task_t *tmp;
1731
1732         if (max_nr_move == 0)
1733                 goto out;
1734
1735         pinned = 1;
1736
1737         /*
1738          * We first consider expired tasks. Those will likely not be
1739          * executed in the near future, and they are most likely to
1740          * be cache-cold, thus switching CPUs has the least effect
1741          * on them.
1742          */
1743         if (busiest->expired->nr_active) {
1744                 array = busiest->expired;
1745                 dst_array = this_rq->expired;
1746         } else {
1747                 array = busiest->active;
1748                 dst_array = this_rq->active;
1749         }
1750
1751 new_array:
1752         /* Start searching at priority 0: */
1753         idx = 0;
1754 skip_bitmap:
1755         if (!idx)
1756                 idx = sched_find_first_bit(array->bitmap);
1757         else
1758                 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1759         if (idx >= MAX_PRIO) {
1760                 if (array == busiest->expired && busiest->active->nr_active) {
1761                         array = busiest->active;
1762                         dst_array = this_rq->active;
1763                         goto new_array;
1764                 }
1765                 goto out;
1766         }
1767
1768         head = array->queue + idx;
1769         curr = head->prev;
1770 skip_queue:
1771         tmp = list_entry(curr, task_t, run_list);
1772
1773         curr = curr->prev;
1774
1775         if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
1776                 if (curr != head)
1777                         goto skip_queue;
1778                 idx++;
1779                 goto skip_bitmap;
1780         }
1781
1782 #ifdef CONFIG_SCHEDSTATS
1783         if (task_hot(tmp, busiest->timestamp_last_tick, sd))
1784                 schedstat_inc(sd, lb_hot_gained[idle]);
1785 #endif
1786
1787         pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
1788         pulled++;
1789
1790         /* We only want to steal up to the prescribed number of tasks. */
1791         if (pulled < max_nr_move) {
1792                 if (curr != head)
1793                         goto skip_queue;
1794                 idx++;
1795                 goto skip_bitmap;
1796         }
1797 out:
1798         /*
1799          * Right now, this is the only place pull_task() is called,
1800          * so we can safely collect pull_task() stats here rather than
1801          * inside pull_task().
1802          */
1803         schedstat_add(sd, lb_gained[idle], pulled);
1804
1805         if (all_pinned)
1806                 *all_pinned = pinned;
1807         return pulled;
1808 }
1809
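/*
 * Editorial sketch (not part of sched.c): the priority-bitmap scan that the
 * skip_bitmap loop above performs with sched_find_first_bit() and
 * find_next_bit().  This stand-alone version walks an array of 64-bit words
 * with GCC's __builtin_ctzll(); DEMO_MAX_PRIO is a hypothetical stand-in and
 * bits at or above it are assumed to be clear.
 */
#include <stdint.h>

#define DEMO_MAX_PRIO 140

static int demo_find_next_bit(const uint64_t *bitmap, int from)
{
        int idx = from;

        while (idx < DEMO_MAX_PRIO) {
                uint64_t word = bitmap[idx / 64] >> (idx % 64);

                if (word)                       /* a set bit at or after idx */
                        return idx + __builtin_ctzll(word);
                idx = (idx / 64 + 1) * 64;      /* nothing here: next word */
        }
        return DEMO_MAX_PRIO;                   /* no runnable priority found */
}
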
1810 /*
1811  * find_busiest_group finds and returns the busiest CPU group within the
1812  * domain. It calculates and returns the number of tasks which should be
1813  * moved to restore balance via the imbalance parameter.
1814  */
1815 static struct sched_group *
1816 find_busiest_group(struct sched_domain *sd, int this_cpu,
1817                    unsigned long *imbalance, enum idle_type idle)
1818 {
1819         struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1820         unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1821         int load_idx;
1822
1823         max_load = this_load = total_load = total_pwr = 0;
1824         if (idle == NOT_IDLE)
1825                 load_idx = sd->busy_idx;
1826         else if (idle == NEWLY_IDLE)
1827                 load_idx = sd->newidle_idx;
1828         else
1829                 load_idx = sd->idle_idx;
1830
1831         do {
1832                 unsigned long load;
1833                 int local_group;
1834                 int i;
1835
1836                 local_group = cpu_isset(this_cpu, group->cpumask);
1837
1838                 /* Tally up the load of all CPUs in the group */
1839                 avg_load = 0;
1840
1841                 for_each_cpu_mask(i, group->cpumask) {
1842                         /* Bias balancing toward cpus of our domain */
1843                         if (local_group)
1844                                 load = target_load(i, load_idx);
1845                         else
1846                                 load = source_load(i, load_idx);
1847
1848                         avg_load += load;
1849                 }
1850
1851                 total_load += avg_load;
1852                 total_pwr += group->cpu_power;
1853
1854                 /* Adjust by relative CPU power of the group */
1855                 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1856
1857                 if (local_group) {
1858                         this_load = avg_load;
1859                         this = group;
1860                 } else if (avg_load > max_load) {
1861                         max_load = avg_load;
1862                         busiest = group;
1863                 }
1864                 group = group->next;
1865         } while (group != sd->groups);
1866
1867         if (!busiest || this_load >= max_load)
1868                 goto out_balanced;
1869
1870         avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
1871
1872         if (this_load >= avg_load ||
1873                         100*max_load <= sd->imbalance_pct*this_load)
1874                 goto out_balanced;
1875
1876         /*
1877          * We're trying to get all the cpus to the average_load, so we don't
1878          * want to push ourselves above the average load, nor do we wish to
1879          * reduce the max loaded cpu below the average load, as either of these
1880          * actions would just result in more rebalancing later, and ping-pong
1881          * tasks around. Thus we look for the minimum possible imbalance.
1882          * Negative imbalances (*we* are more loaded than anyone else) will
1883          * be counted as no imbalance for these purposes -- we can't fix that
1884          * by pulling tasks to us.  Be careful of negative numbers as they'll
1885          * appear as very large values with unsigned longs.
1886          */
1887         /* How much load to actually move to equalise the imbalance */
1888         *imbalance = min((max_load - avg_load) * busiest->cpu_power,
1889                                 (avg_load - this_load) * this->cpu_power)
1890                         / SCHED_LOAD_SCALE;
1891
1892         if (*imbalance < SCHED_LOAD_SCALE) {
1893                 unsigned long pwr_now = 0, pwr_move = 0;
1894                 unsigned long tmp;
1895
1896                 if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
1897                         *imbalance = 1;
1898                         return busiest;
1899                 }
1900
1901                 /*
1902                  * OK, we don't have enough imbalance to justify moving tasks,
1903                  * however we may be able to increase total CPU power used by
1904                  * moving them.
1905                  */
1906
1907                 pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
1908                 pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
1909                 pwr_now /= SCHED_LOAD_SCALE;
1910
1911                 /* Amount of load we'd subtract */
1912                 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
1913                 if (max_load > tmp)
1914                         pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
1915                                                         max_load - tmp);
1916
1917                 /* Amount of load we'd add */
1918                 if (max_load*busiest->cpu_power <
1919                                 SCHED_LOAD_SCALE*SCHED_LOAD_SCALE)
1920                         tmp = max_load*busiest->cpu_power/this->cpu_power;
1921                 else
1922                         tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
1923                 pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
1924                 pwr_move /= SCHED_LOAD_SCALE;
1925
1926                 /* Move if we gain throughput */
1927                 if (pwr_move <= pwr_now)
1928                         goto out_balanced;
1929
1930                 *imbalance = 1;
1931                 return busiest;
1932         }
1933
1934         /* Get rid of the scaling factor, rounding down as we divide */
1935         *imbalance = *imbalance / SCHED_LOAD_SCALE;
1936         return busiest;
1937
1938 out_balanced:
1939
1940         *imbalance = 0;
1941         return NULL;
1942 }
1943
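/*
 * Editorial sketch (not part of sched.c): the core imbalance formula used
 * above -- the smaller of what the busiest group can shed and what this
 * group can absorb before crossing the average, each weighted by the group's
 * cpu_power and then descaled.  DEMO_LOAD_SCALE stands in for
 * SCHED_LOAD_SCALE, and the caller is assumed (as above) to have checked
 * this_load <= avg_load <= max_load.
 */
#define DEMO_LOAD_SCALE 128UL

static unsigned long demo_imbalance(unsigned long max_load,
                                    unsigned long this_load,
                                    unsigned long avg_load,
                                    unsigned long busiest_power,
                                    unsigned long this_power)
{
        unsigned long shed = (max_load - avg_load) * busiest_power;
        unsigned long take = (avg_load - this_load) * this_power;

        return (shed < take ? shed : take) / DEMO_LOAD_SCALE;
}

/*
 * Example: max_load = 3*SCALE, this_load = 1*SCALE, avg_load = 2*SCALE with
 * both powers equal to SCALE yields 1*SCALE here; the final descale in the
 * caller then reads that as "move about one task's worth of load".
 */
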
1944 /*
1945  * find_busiest_queue - find the busiest runqueue among the cpus in group.
1946  */
1947 static runqueue_t *find_busiest_queue(struct sched_group *group)
1948 {
1949         unsigned long load, max_load = 0;
1950         runqueue_t *busiest = NULL;
1951         int i;
1952
1953         for_each_cpu_mask(i, group->cpumask) {
1954                 load = source_load(i, 0);
1955
1956                 if (load > max_load) {
1957                         max_load = load;
1958                         busiest = cpu_rq(i);
1959                 }
1960         }
1961
1962         return busiest;
1963 }
1964
1965 /*
1966  * Check this_cpu to ensure it is balanced within domain. Attempt to move
1967  * tasks if there is an imbalance.
1968  *
1969  * Called with this_rq unlocked.
1970  */
1971 static int load_balance(int this_cpu, runqueue_t *this_rq,
1972                         struct sched_domain *sd, enum idle_type idle)
1973 {
1974         struct sched_group *group;
1975         runqueue_t *busiest;
1976         unsigned long imbalance;
1977         int nr_moved, all_pinned;
1978         int active_balance = 0;
1979
1980         spin_lock(&this_rq->lock);
1981         schedstat_inc(sd, lb_cnt[idle]);
1982
1983         group = find_busiest_group(sd, this_cpu, &imbalance, idle);
1984         if (!group) {
1985                 schedstat_inc(sd, lb_nobusyg[idle]);
1986                 goto out_balanced;
1987         }
1988
1989         busiest = find_busiest_queue(group);
1990         if (!busiest) {
1991                 schedstat_inc(sd, lb_nobusyq[idle]);
1992                 goto out_balanced;
1993         }
1994
1995         BUG_ON(busiest == this_rq);
1996
1997         schedstat_add(sd, lb_imbalance[idle], imbalance);
1998
1999         nr_moved = 0;
2000         if (busiest->nr_running > 1) {
2001                 /*
2002                  * Attempt to move tasks. If find_busiest_group has found
2003                  * an imbalance but busiest->nr_running <= 1, the group is
2004                  * still unbalanced. nr_moved simply stays zero, so it is
2005                  * correctly treated as an imbalance.
2006                  */
2007                 double_lock_balance(this_rq, busiest);
2008                 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2009                                                 imbalance, sd, idle,
2010                                                 &all_pinned);
2011                 spin_unlock(&busiest->lock);
2012
2013                 /* All tasks on this runqueue were pinned by CPU affinity */
2014                 if (unlikely(all_pinned))
2015                         goto out_balanced;
2016         }
2017
2018         spin_unlock(&this_rq->lock);
2019
2020         if (!nr_moved) {
2021                 schedstat_inc(sd, lb_failed[idle]);
2022                 sd->nr_balance_failed++;
2023
2024                 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2025
2026                         spin_lock(&busiest->lock);
2027                         if (!busiest->active_balance) {
2028                                 busiest->active_balance = 1;
2029                                 busiest->push_cpu = this_cpu;
2030                                 active_balance = 1;
2031                         }
2032                         spin_unlock(&busiest->lock);
2033                         if (active_balance)
2034                                 wake_up_process(busiest->migration_thread);
2035
2036                         /*
2037                          * We've kicked active balancing, reset the failure
2038                          * counter.
2039                          */
2040                         sd->nr_balance_failed = sd->cache_nice_tries+1;
2041                 }
2042         } else
2043                 sd->nr_balance_failed = 0;
2044
2045         if (likely(!active_balance)) {
2046                 /* We were unbalanced, so reset the balancing interval */
2047                 sd->balance_interval = sd->min_interval;
2048         } else {
2049                 /*
2050                  * If we've begun active balancing, start to back off. This
2051                  * case may not be covered by the all_pinned logic if there
2052                  * is only 1 task on the busy runqueue (because we don't call
2053                  * move_tasks).
2054                  */
2055                 if (sd->balance_interval < sd->max_interval)
2056                         sd->balance_interval *= 2;
2057         }
2058
2059         return nr_moved;
2060
2061 out_balanced:
2062         spin_unlock(&this_rq->lock);
2063
2064         schedstat_inc(sd, lb_balanced[idle]);
2065
2066         sd->nr_balance_failed = 0;
2067         /* tune up the balancing interval */
2068         if (sd->balance_interval < sd->max_interval)
2069                 sd->balance_interval *= 2;
2070
2071         return 0;
2072 }
2073
2074 /*
2075  * Check this_cpu to ensure it is balanced within domain. Attempt to move
2076  * tasks if there is an imbalance.
2077  *
2078  * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
2079  * this_rq is locked.
2080  */
2081 static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2082                                 struct sched_domain *sd)
2083 {
2084         struct sched_group *group;
2085         runqueue_t *busiest = NULL;
2086         unsigned long imbalance;
2087         int nr_moved = 0;
2088
2089         schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2090         group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
2091         if (!group) {
2092                 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2093                 goto out_balanced;
2094         }
2095
2096         busiest = find_busiest_queue(group);
2097         if (!busiest) {
2098                 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2099                 goto out_balanced;
2100         }
2101
2102         BUG_ON(busiest == this_rq);
2103
2104         /* Attempt to move tasks */
2105         double_lock_balance(this_rq, busiest);
2106
2107         schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
2108         nr_moved = move_tasks(this_rq, this_cpu, busiest,
2109                                         imbalance, sd, NEWLY_IDLE, NULL);
2110         if (!nr_moved)
2111                 schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
2112         else
2113                 sd->nr_balance_failed = 0;
2114
2115         spin_unlock(&busiest->lock);
2116         return nr_moved;
2117
2118 out_balanced:
2119         schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2120         sd->nr_balance_failed = 0;
2121         return 0;
2122 }
2123
2124 /*
2125  * idle_balance is called by schedule() if this_cpu is about to become
2126  * idle. Attempts to pull tasks from other CPUs.
2127  */
2128 static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
2129 {
2130         struct sched_domain *sd;
2131
2132         for_each_domain(this_cpu, sd) {
2133                 if (sd->flags & SD_BALANCE_NEWIDLE) {
2134                         if (load_balance_newidle(this_cpu, this_rq, sd)) {
2135                                 /* We've pulled tasks over so stop searching */
2136                                 break;
2137                         }
2138                 }
2139         }
2140 }
2141
2142 /*
2143  * active_load_balance is run by migration threads. It pushes running tasks
2144  * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2145  * running on each physical CPU where possible, and avoids physical /
2146  * logical imbalances.
2147  *
2148  * Called with busiest_rq locked.
2149  */
2150 static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
2151 {
2152         struct sched_domain *sd;
2153         runqueue_t *target_rq;
2154         int target_cpu = busiest_rq->push_cpu;
2155
2156         if (busiest_rq->nr_running <= 1)
2157                 /* no task to move */
2158                 return;
2159
2160         target_rq = cpu_rq(target_cpu);
2161
2162         /*
2163          * This condition is "impossible"; if it occurs
2164          * we need to fix it.  Originally reported by
2165          * Bjorn Helgaas on a 128-cpu setup.
2166          */
2167         BUG_ON(busiest_rq == target_rq);
2168
2169         /* move a task from busiest_rq to target_rq */
2170         double_lock_balance(busiest_rq, target_rq);
2171
2172         /* Search for an sd spanning us and the target CPU. */
2173         for_each_domain(target_cpu, sd)
2174                 if ((sd->flags & SD_LOAD_BALANCE) &&
2175                         cpu_isset(busiest_cpu, sd->span))
2176                                 break;
2177
2178         if (unlikely(sd == NULL))
2179                 goto out;
2180
2181         schedstat_inc(sd, alb_cnt);
2182
2183         if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
2184                 schedstat_inc(sd, alb_pushed);
2185         else
2186                 schedstat_inc(sd, alb_failed);
2187 out:
2188         spin_unlock(&target_rq->lock);
2189 }
2190
2191 /*
2192  * rebalance_tick will get called every timer tick, on every CPU.
2193  *
2194  * It checks each scheduling domain to see if it is due to be balanced,
2195  * and initiates a balancing operation if so.
2196  *
2197  * Balancing parameters are set up in arch_init_sched_domains.
2198  */
2199
2200 /* Don't have all balancing operations going off at once */
2201 #define CPU_OFFSET(cpu) (HZ * (cpu) / NR_CPUS)
2202
2203 static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2204                            enum idle_type idle)
2205 {
2206         unsigned long old_load, this_load;
2207         unsigned long j = jiffies + CPU_OFFSET(this_cpu);
2208         struct sched_domain *sd;
2209         int i;
2210
2211         this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
2212         /* Update our load */
2213         for (i = 0; i < 3; i++) {
2214                 unsigned long new_load = this_load;
2215                 int scale = 1 << i;
2216                 old_load = this_rq->cpu_load[i];
2217                 /*
2218                  * Round up the averaging division if load is increasing. This
2219                  * prevents us from getting stuck on 9 if the load is 10, for
2220                  * example.
2221                  */
2222                 if (new_load > old_load)
2223                         new_load += scale-1;
2224                 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
2225         }
2226
2227         for_each_domain(this_cpu, sd) {
2228                 unsigned long interval;
2229
2230                 if (!(sd->flags & SD_LOAD_BALANCE))
2231                         continue;
2232
2233                 interval = sd->balance_interval;
2234                 if (idle != SCHED_IDLE)
2235                         interval *= sd->busy_factor;
2236
2237                 /* scale ms to jiffies */
2238                 interval = msecs_to_jiffies(interval);
2239                 if (unlikely(!interval))
2240                         interval = 1;
2241
2242                 if (j - sd->last_balance >= interval) {
2243                         if (load_balance(this_cpu, this_rq, sd, idle)) {
2244                                 /* We've pulled tasks over so no longer idle */
2245                                 idle = NOT_IDLE;
2246                         }
2247                         sd->last_balance += interval;
2248                 }
2249         }
2250 }
2251 #else
2252 /*
2253  * on UP we do not need to balance between CPUs:
2254  */
2255 static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
2256 {
2257 }
2258 static inline void idle_balance(int cpu, runqueue_t *rq)
2259 {
2260 }
2261 #endif
2262
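/*
 * Editorial sketch (not part of sched.c): the decaying cpu_load[] update
 * performed inside rebalance_tick() above.  Index 0 tracks the instantaneous
 * load, higher indices decay more slowly (scale 2, 4, ...), and the division
 * is rounded up while load is rising so the average can actually reach the
 * new value instead of sticking just below it.  demo_* names are
 * hypothetical.
 */
#define DEMO_NR_LOAD_IDX 3

static void demo_update_cpu_load(unsigned long cpu_load[DEMO_NR_LOAD_IDX],
                                 unsigned long this_load)
{
        int i;

        for (i = 0; i < DEMO_NR_LOAD_IDX; i++) {
                unsigned long scale = 1UL << i;
                unsigned long old_load = cpu_load[i];
                unsigned long new_load = this_load;

                if (new_load > old_load)
                        new_load += scale - 1;  /* round up while increasing */
                cpu_load[i] = (old_load * (scale - 1) + new_load) / scale;
        }
}
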
2263 static inline int wake_priority_sleeper(runqueue_t *rq)
2264 {
2265         int ret = 0;
2266 #ifdef CONFIG_SCHED_SMT
2267         spin_lock(&rq->lock);
2268         /*
2269          * If an SMT sibling task has been put to sleep for priority
2270          * reasons, reschedule the idle task to see if it can now run.
2271          */
2272         if (rq->nr_running) {
2273                 resched_task(rq->idle);
2274                 ret = 1;
2275         }
2276         spin_unlock(&rq->lock);
2277 #endif
2278         return ret;
2279 }
2280
2281 DEFINE_PER_CPU(struct kernel_stat, kstat);
2282
2283 EXPORT_PER_CPU_SYMBOL(kstat);
2284
2285 /*
2286  * This is called on clock ticks and on context switches.
2287  * Bank in p->sched_time the ns elapsed since the last tick or switch.
2288  */
2289 static inline void update_cpu_clock(task_t *p, runqueue_t *rq,
2290                                     unsigned long long now)
2291 {
2292         unsigned long long last = max(p->timestamp, rq->timestamp_last_tick);
2293         p->sched_time += now - last;
2294 }
2295
2296 /*
2297  * Return current->sched_time plus any more ns on the sched_clock
2298  * that have not yet been banked.
2299  */
2300 unsigned long long current_sched_time(const task_t *tsk)
2301 {
2302         unsigned long long ns;
2303         unsigned long flags;
2304         local_irq_save(flags);
2305         ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick);
2306         ns = tsk->sched_time + (sched_clock() - ns);
2307         local_irq_restore(flags);
2308         return ns;
2309 }
2310
2311 /*
2312  * We place interactive tasks back into the active array, if possible.
2313  *
2314  * To guarantee that this does not starve expired tasks we ignore the
2315  * interactivity of a task if the first expired task had to wait more
2316  * than a 'reasonable' amount of time. This deadline timeout is
2317  * load-dependent, as the frequency of array switches decreases with
2318  * increasing number of running tasks. We also ignore the interactivity
2319  * if a better static_prio task has expired:
2320  */
2321 #define EXPIRED_STARVING(rq) \
2322         ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
2323                 (jiffies - (rq)->expired_timestamp >= \
2324                         STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
2325                         ((rq)->curr->static_prio > (rq)->best_expired_prio))
2326
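/*
 * Editorial sketch (not part of sched.c): EXPIRED_STARVING() above written
 * out as a plain function.  The expired array counts as starving once its
 * oldest task has waited longer than a load-scaled deadline, or once the
 * running task's static priority is worse than the best priority already
 * sitting in the expired array.  All parameters are hypothetical stand-ins
 * for the runqueue fields.
 */
static int demo_expired_starving(unsigned long now,
                                 unsigned long expired_timestamp,
                                 unsigned long nr_running,
                                 unsigned long starvation_limit,
                                 int curr_static_prio,
                                 int best_expired_prio)
{
        if (starvation_limit && expired_timestamp &&
            now - expired_timestamp >= starvation_limit * nr_running + 1)
                return 1;
        return curr_static_prio > best_expired_prio;
}
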
2327 /*
2328  * Account user cpu time to a process.
2329  * @p: the process that the cpu time gets accounted to
2331  * @cputime: the cpu time spent in user space since the last update
2332  */
2333 void account_user_time(struct task_struct *p, cputime_t cputime)
2334 {
2335         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2336         cputime64_t tmp;
2337
2338         p->utime = cputime_add(p->utime, cputime);
2339
2340         /* Add user time to cpustat. */
2341         tmp = cputime_to_cputime64(cputime);
2342         if (TASK_NICE(p) > 0)
2343                 cpustat->nice = cputime64_add(cpustat->nice, tmp);
2344         else
2345                 cpustat->user = cputime64_add(cpustat->user, tmp);
2346 }
2347
2348 /*
2349  * Account system cpu time to a process.
2350  * @p: the process that the cpu time gets accounted to
2351  * @hardirq_offset: the offset to subtract from hardirq_count()
2352  * @cputime: the cpu time spent in kernel space since the last update
2353  */
2354 void account_system_time(struct task_struct *p, int hardirq_offset,
2355                          cputime_t cputime)
2356 {
2357         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2358         runqueue_t *rq = this_rq();
2359         cputime64_t tmp;
2360
2361         p->stime = cputime_add(p->stime, cputime);
2362
2363         /* Add system time to cpustat. */
2364         tmp = cputime_to_cputime64(cputime);
2365         if (hardirq_count() - hardirq_offset)
2366                 cpustat->irq = cputime64_add(cpustat->irq, tmp);
2367         else if (softirq_count())
2368                 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
2369         else if (p != rq->idle)
2370                 cpustat->system = cputime64_add(cpustat->system, tmp);
2371         else if (atomic_read(&rq->nr_iowait) > 0)
2372                 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
2373         else
2374                 cpustat->idle = cputime64_add(cpustat->idle, tmp);
2375         /* Account for system time used */
2376         acct_update_integrals(p);
2377         /* Update rss highwater mark */
2378         update_mem_hiwater(p);
2379 }
2380
2381 /*
2382  * Account for involuntary wait time.
2383  * @p: the process from which the cpu time has been stolen
2384  * @steal: the cpu time spent in involuntary wait
2385  */
2386 void account_steal_time(struct task_struct *p, cputime_t steal)
2387 {
2388         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2389         cputime64_t tmp = cputime_to_cputime64(steal);
2390         runqueue_t *rq = this_rq();
2391
2392         if (p == rq->idle) {
2393                 p->stime = cputime_add(p->stime, steal);
2394                 if (atomic_read(&rq->nr_iowait) > 0)
2395                         cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
2396                 else
2397                         cpustat->idle = cputime64_add(cpustat->idle, tmp);
2398         } else
2399                 cpustat->steal = cputime64_add(cpustat->steal, tmp);
2400 }
2401
2402 /*
2403  * This function gets called by the timer code, with HZ frequency.
2404  * We call it with interrupts disabled.
2405  *
2406  * It also gets called by the fork code, when changing the parent's
2407  * timeslices.
2408  */
2409 void scheduler_tick(void)
2410 {
2411         int cpu = smp_processor_id();
2412         runqueue_t *rq = this_rq();
2413         task_t *p = current;
2414         unsigned long long now = sched_clock();
2415
2416         update_cpu_clock(p, rq, now);
2417
2418         rq->timestamp_last_tick = now;
2419
2420         if (p == rq->idle) {
2421                 if (wake_priority_sleeper(rq))
2422                         goto out;
2423                 rebalance_tick(cpu, rq, SCHED_IDLE);
2424                 return;
2425         }
2426
2427         /* Task might have expired already, but not scheduled off yet */
2428         if (p->array != rq->active) {
2429                 set_tsk_need_resched(p);
2430                 goto out;
2431         }
2432         spin_lock(&rq->lock);
2433         /*
2434          * The task was running during this tick - update the
2435          * time slice counter. Note: we do not update a thread's
2436          * priority until it either goes to sleep or uses up its
2437          * timeslice. This makes it possible for interactive tasks
2438          * to use up their timeslices at their highest priority levels.
2439          */
2440         if (rt_task(p)) {
2441                 /*
2442                  * RR tasks need a special form of timeslice management.
2443                  * FIFO tasks have no timeslices.
2444                  */
2445                 if ((p->policy == SCHED_RR) && !--p->time_slice) {
2446                         p->time_slice = task_timeslice(p);
2447                         p->first_time_slice = 0;
2448                         set_tsk_need_resched(p);
2449
2450                         /* put it at the end of the queue: */
2451                         requeue_task(p, rq->active);
2452                 }
2453                 goto out_unlock;
2454         }
2455         if (!--p->time_slice) {
2456                 dequeue_task(p, rq->active);
2457                 set_tsk_need_resched(p);
2458                 p->prio = effective_prio(p);
2459                 p->time_slice = task_timeslice(p);
2460                 p->first_time_slice = 0;
2461
2462                 if (!rq->expired_timestamp)
2463                         rq->expired_timestamp = jiffies;
2464                 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
2465                         enqueue_task(p, rq->expired);
2466                         if (p->static_prio < rq->best_expired_prio)
2467                                 rq->best_expired_prio = p->static_prio;
2468                 } else
2469                         enqueue_task(p, rq->active);
2470         } else {
2471                 /*
2472          * Prevent too long a timeslice from allowing a task to monopolize
2473                  * the CPU. We do this by splitting up the timeslice into
2474                  * smaller pieces.
2475                  *
2476                  * Note: this does not mean the task's timeslices expire or
2477                  * get lost in any way, they just might be preempted by
2478                  * another task of equal priority. (one with higher
2479                  * priority would have preempted this task already.) We
2480                  * requeue this task to the end of the list on this priority
2481                  * level, which is in essence a round-robin of tasks with
2482                  * equal priority.
2483                  *
2484                  * This only applies to tasks in the interactive
2485                  * delta range with at least TIMESLICE_GRANULARITY to requeue.
2486                  */
2487                 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
2488                         p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
2489                         (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
2490                         (p->array == rq->active)) {
2491
2492                         requeue_task(p, rq->active);
2493                         set_tsk_need_resched(p);
2494                 }
2495         }
2496 out_unlock:
2497         spin_unlock(&rq->lock);
2498 out:
2499         rebalance_tick(cpu, rq, NOT_IDLE);
2500 }
2501
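/*
 * Editorial sketch (not part of sched.c): the timeslice-granularity
 * round-robin test from scheduler_tick() above, with the interactivity and
 * active-array checks left out.  A task is rotated behind its equal-priority
 * peers whenever the slice it has consumed is an exact multiple of the
 * granularity and at least one full granule still remains.  Parameters are
 * hypothetical; granularity is assumed to be non-zero.
 */
static int demo_should_requeue(unsigned int full_timeslice,
                               unsigned int time_slice_left,
                               unsigned int granularity)
{
        unsigned int used = full_timeslice - time_slice_left;

        return !(used % granularity) && time_slice_left >= granularity;
}
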
2502 #ifdef CONFIG_SCHED_SMT
2503 static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2504 {
2505         struct sched_domain *sd = this_rq->sd;
2506         cpumask_t sibling_map;
2507         int i;
2508
2509         if (!(sd->flags & SD_SHARE_CPUPOWER))
2510                 return;
2511
2512         /*
2513          * Unlock the current runqueue because we have to lock in
2514          * CPU order to avoid deadlocks. Caller knows that we might
2515          * unlock. We keep IRQs disabled.
2516          */
2517         spin_unlock(&this_rq->lock);
2518
2519         sibling_map = sd->span;
2520
2521         for_each_cpu_mask(i, sibling_map)
2522                 spin_lock(&cpu_rq(i)->lock);
2523         /*
2524          * We clear this CPU from the mask. This both simplifies the
2525          * inner loop and keeps this_rq locked when we exit:
2526          */
2527         cpu_clear(this_cpu, sibling_map);
2528
2529         for_each_cpu_mask(i, sibling_map) {
2530                 runqueue_t *smt_rq = cpu_rq(i);
2531
2532                 /*
2533                  * If an SMT sibling task is sleeping due to priority
2534                  * reasons wake it up now.
2535                  * reasons, wake it up now.
2536                 if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
2537                         resched_task(smt_rq->idle);
2538         }
2539
2540         for_each_cpu_mask(i, sibling_map)
2541                 spin_unlock(&cpu_rq(i)->lock);
2542         /*
2543          * We exit with this_cpu's rq still held and IRQs
2544          * still disabled:
2545          */
2546 }
2547
2548 static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2549 {
2550         struct sched_domain *sd = this_rq->sd;
2551         cpumask_t sibling_map;
2552         prio_array_t *array;
2553         int ret = 0, i;
2554         task_t *p;
2555
2556         if (!(sd->flags & SD_SHARE_CPUPOWER))
2557                 return 0;
2558
2559         /*
2560          * The same locking rules and details apply as for
2561          * wake_sleeping_dependent():
2562          */
2563         spin_unlock(&this_rq->lock);
2564         sibling_map = sd->span;
2565         for_each_cpu_mask(i, sibling_map)
2566                 spin_lock(&cpu_rq(i)->lock);
2567         cpu_clear(this_cpu, sibling_map);
2568
2569         /*
2570          * Establish next task to be run - it might have gone away because
2571          * we released the runqueue lock above:
2572          */
2573         if (!this_rq->nr_running)
2574                 goto out_unlock;
2575         array = this_rq->active;
2576         if (!array->nr_active)
2577                 array = this_rq->expired;
2578         BUG_ON(!array->nr_active);
2579
2580         p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
2581                 task_t, run_list);
2582
2583         for_each_cpu_mask(i, sibling_map) {
2584                 runqueue_t *smt_rq = cpu_rq(i);
2585                 task_t *smt_curr = smt_rq->curr;
2586
2587                 /*
2588                  * If a user task with lower static priority than the
2589                  * running task on the SMT sibling is trying to schedule,
2590                  * delay it until proportionately less of the sibling task's
2591                  * timeslice remains, so that a lower priority task cannot
2592                  * use an unfair share of the physical cpu's resources. -ck
2594                  */
2595                 if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
2596                         task_timeslice(p) || rt_task(smt_curr)) &&
2597                         p->mm && smt_curr->mm && !rt_task(p))
2598                                 ret = 1;
2599
2600                 /*
2601                  * Reschedule a lower priority task on the SMT sibling,
2602                  * or wake it up if it has been put to sleep for priority
2603                  * reasons.
2604                  */
2605                 if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) >
2606                         task_timeslice(smt_curr) || rt_task(p)) &&
2607                         smt_curr->mm && p->mm && !rt_task(smt_curr)) ||
2608                         (smt_curr == smt_rq->idle && smt_rq->nr_running))
2609                                 resched_task(smt_curr);
2610         }
2611 out_unlock:
2612         for_each_cpu_mask(i, sibling_map)
2613                 spin_unlock(&cpu_rq(i)->lock);
2614         return ret;
2615 }
2616 #else
2617 static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2618 {
2619 }
2620
2621 static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2622 {
2623         return 0;
2624 }
2625 #endif
2626
2627 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
2628
2629 void fastcall add_preempt_count(int val)
2630 {
2631         /*
2632          * Underflow?
2633          */
2634         BUG_ON((preempt_count() < 0));
2635         preempt_count() += val;
2636         /*
2637          * Spinlock count overflowing soon?
2638          */
2639         BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
2640 }
2641 EXPORT_SYMBOL(add_preempt_count);
2642
2643 void fastcall sub_preempt_count(int val)
2644 {
2645         /*
2646          * Underflow?
2647          */
2648         BUG_ON(val > preempt_count());
2649         /*
2650          * Is the spinlock portion underflowing?
2651          */
2652         BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK));
2653         preempt_count() -= val;
2654 }
2655 EXPORT_SYMBOL(sub_preempt_count);
2656
2657 #endif
2658
2659 /*
2660  * schedule() is the main scheduler function.
2661  */
2662 asmlinkage void __sched schedule(void)
2663 {
2664         long *switch_count;
2665         task_t *prev, *next;
2666         runqueue_t *rq;
2667         prio_array_t *array;
2668         struct list_head *queue;
2669         unsigned long long now;
2670         unsigned long run_time;
2671         int cpu, idx;
2672
2673         /*
2674          * Test if we are atomic.  Since do_exit() needs to call into
2675          * schedule() atomically, we ignore that path for now.
2676          * Otherwise, whine if we are scheduling when we should not be.
2677          */
2678         if (likely(!current->exit_state)) {
2679                 if (unlikely(in_atomic())) {
2680                         printk(KERN_ERR "scheduling while atomic: "
2681                                 "%s/0x%08x/%d\n",
2682                                 current->comm, preempt_count(), current->pid);
2683                         dump_stack();
2684                 }
2685         }
2686         profile_hit(SCHED_PROFILING, __builtin_return_address(0));
2687
2688 need_resched:
2689         preempt_disable();
2690         prev = current;
2691         release_kernel_lock(prev);
2692 need_resched_nonpreemptible:
2693         rq = this_rq();
2694
2695         /*
2696          * The idle thread is not allowed to schedule!
2697          * Remove this check after it has been exercised a bit.
2698          */
2699         if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
2700                 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
2701                 dump_stack();
2702         }
2703
2704         schedstat_inc(rq, sched_cnt);
2705         now = sched_clock();
2706         if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
2707                 run_time = now - prev->timestamp;
2708                 if (unlikely((long long)(now - prev->timestamp) < 0))
2709                         run_time = 0;
2710         } else
2711                 run_time = NS_MAX_SLEEP_AVG;
2712
2713         /*
2714          * Tasks are charged proportionately less run_time at high sleep_avg
2715          * to delay them losing their interactive status.
2716          */
2717         run_time /= (CURRENT_BONUS(prev) ? : 1);
2718
2719         spin_lock_irq(&rq->lock);
2720
2721         if (unlikely(prev->flags & PF_DEAD))
2722                 prev->state = EXIT_DEAD;
2723
2724         switch_count = &prev->nivcsw;
2725         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2726                 switch_count = &prev->nvcsw;
2727                 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
2728                                 unlikely(signal_pending(prev))))
2729                         prev->state = TASK_RUNNING;
2730                 else {
2731                         if (prev->state == TASK_UNINTERRUPTIBLE)
2732                                 rq->nr_uninterruptible++;
2733                         deactivate_task(prev, rq);
2734                 }
2735         }
2736
2737         cpu = smp_processor_id();
2738         if (unlikely(!rq->nr_running)) {
2739 go_idle:
2740                 idle_balance(cpu, rq);
2741                 if (!rq->nr_running) {
2742                         next = rq->idle;
2743                         rq->expired_timestamp = 0;
2744                         wake_sleeping_dependent(cpu, rq);
2745                         /*
2746                          * wake_sleeping_dependent() might have released
2747                          * the runqueue, so break out if we got new
2748                          * tasks meanwhile:
2749                          */
2750                         if (!rq->nr_running)
2751                                 goto switch_tasks;
2752                 }
2753         } else {
2754                 if (dependent_sleeper(cpu, rq)) {
2755                         next = rq->idle;
2756                         goto switch_tasks;
2757                 }
2758                 /*
2759                  * dependent_sleeper() releases and reacquires the runqueue
2760                  * lock, hence go into the idle loop if the rq went
2761                  * empty meanwhile:
2762                  */
2763                 if (unlikely(!rq->nr_running))
2764                         goto go_idle;
2765         }
2766
2767         array = rq->active;
2768         if (unlikely(!array->nr_active)) {
2769                 /*
2770                  * Switch the active and expired arrays.
2771                  */
2772                 schedstat_inc(rq, sched_switch);
2773                 rq->active = rq->expired;
2774                 rq->expired = array;
2775                 array = rq->active;
2776                 rq->expired_timestamp = 0;
2777                 rq->best_expired_prio = MAX_PRIO;
2778         }
2779
2780         idx = sched_find_first_bit(array->bitmap);
2781         queue = array->queue + idx;
2782         next = list_entry(queue->next, task_t, run_list);
2783
2784         if (!rt_task(next) && next->activated > 0) {
2785                 unsigned long long delta = now - next->timestamp;
2786                 if (unlikely((long long)(now - next->timestamp) < 0))
2787                         delta = 0;
2788
2789                 if (next->activated == 1)
2790                         delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
2791
2792                 array = next->array;
2793                 dequeue_task(next, array);
2794                 recalc_task_prio(next, next->timestamp + delta);
2795                 enqueue_task(next, array);
2796         }
2797         next->activated = 0;
2798 switch_tasks:
2799         if (next == rq->idle)
2800                 schedstat_inc(rq, sched_goidle);
2801         prefetch(next);
2802         clear_tsk_need_resched(prev);
2803         rcu_qsctr_inc(task_cpu(prev));
2804
2805         update_cpu_clock(prev, rq, now);
2806
2807         prev->sleep_avg -= run_time;
2808         if ((long)prev->sleep_avg <= 0)
2809                 prev->sleep_avg = 0;
2810         prev->timestamp = prev->last_ran = now;
2811
2812         sched_info_switch(prev, next);
2813         if (likely(prev != next)) {
2814                 next->timestamp = now;
2815                 rq->nr_switches++;
2816                 rq->curr = next;
2817                 ++*switch_count;
2818
2819                 prepare_arch_switch(rq, next);
2820                 prev = context_switch(rq, prev, next);
2821                 barrier();
2822
2823                 finish_task_switch(prev);
2824         } else
2825                 spin_unlock_irq(&rq->lock);
2826
2827         prev = current;
2828         if (unlikely(reacquire_kernel_lock(prev) < 0))
2829                 goto need_resched_nonpreemptible;
2830         preempt_enable_no_resched();
2831         if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2832                 goto need_resched;
2833 }
2834
2835 EXPORT_SYMBOL(schedule);
2836
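/*
 * Editorial sketch (not part of sched.c): the O(1) selection at the heart of
 * schedule() above -- if the active array has drained, swap active and
 * expired in constant time, then take the head of the list at the first set
 * bit of the active bitmap.  The demo_* types are hypothetical stand-ins for
 * prio_array_t and the runqueue, __builtin_ctzll() is a GCC builtin, and
 * bits at or above DEMO_PRIO_LEVELS are assumed to be clear.
 */
#include <stddef.h>
#include <stdint.h>

#define DEMO_PRIO_LEVELS 140

struct demo_array {
        unsigned int nr_active;
        uint64_t bitmap[(DEMO_PRIO_LEVELS + 63) / 64];
        void *queue_head[DEMO_PRIO_LEVELS];     /* first task at each priority */
};

struct demo_rq {
        struct demo_array *active, *expired;
};

static void *demo_pick_next(struct demo_rq *rq)
{
        struct demo_array *array = rq->active;
        int w;

        if (!array->nr_active) {                /* active drained: O(1) swap */
                rq->active = rq->expired;
                rq->expired = array;
                array = rq->active;
        }

        for (w = 0; w * 64 < DEMO_PRIO_LEVELS; w++) {
                if (array->bitmap[w]) {
                        int idx = w * 64 + __builtin_ctzll(array->bitmap[w]);
                        return array->queue_head[idx];
                }
        }
        return NULL;                            /* nothing runnable: go idle */
}
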
2837 #ifdef CONFIG_PREEMPT
2838 /*
2839  * this is the entry point to schedule() from in-kernel preemption
2840  * off of preempt_enable.  Kernel preemption off the return-from-interrupt
2841  * path goes through preempt_schedule_irq() below instead.
2842  */
2843 asmlinkage void __sched preempt_schedule(void)
2844 {
2845         struct thread_info *ti = current_thread_info();
2846 #ifdef CONFIG_PREEMPT_BKL
2847         struct task_struct *task = current;
2848         int saved_lock_depth;
2849 #endif
2850         /*
2851          * If there is a non-zero preempt_count or interrupts are disabled,
2852          * we do not want to preempt the current task.  Just return..
2853          */
2854         if (unlikely(ti->preempt_count || irqs_disabled()))
2855                 return;
2856
2857 need_resched:
2858         add_preempt_count(PREEMPT_ACTIVE);
2859         /*
2860          * We keep the big kernel semaphore locked, but we
2861          * clear ->lock_depth so that schedule() doesn't
2862          * auto-release the semaphore:
2863          */
2864 #ifdef CONFIG_PREEMPT_BKL
2865         saved_lock_depth = task->lock_depth;
2866         task->lock_depth = -1;
2867 #endif
2868         schedule();
2869 #ifdef CONFIG_PREEMPT_BKL
2870         task->lock_depth = saved_lock_depth;
2871 #endif
2872         sub_preempt_count(PREEMPT_ACTIVE);
2873
2874         /* we could miss a preemption opportunity between schedule and now */
2875         barrier();
2876         if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2877                 goto need_resched;
2878 }
2879
2880 EXPORT_SYMBOL(preempt_schedule);
2881
2882 /*
2883  * this is the entry point to schedule() from kernel preemption
2884  * off of irq context.
2885  * Note that this is called and returns with irqs disabled. This
2886  * protects us against recursive calls from irq context.
2887  */
2888 asmlinkage void __sched preempt_schedule_irq(void)
2889 {
2890         struct thread_info *ti = current_thread_info();
2891 #ifdef CONFIG_PREEMPT_BKL
2892         struct task_struct *task = current;
2893         int saved_lock_depth;
2894 #endif
2895         /* Catch callers which need to be fixed */
2896         BUG_ON(ti->preempt_count || !irqs_disabled());
2897
2898 need_resched:
2899         add_preempt_count(PREEMPT_ACTIVE);
2900         /*
2901          * We keep the big kernel semaphore locked, but we
2902          * clear ->lock_depth so that schedule() doesn't
2903          * auto-release the semaphore:
2904          */
2905 #ifdef CONFIG_PREEMPT_BKL
2906         saved_lock_depth = task->lock_depth;
2907         task->lock_depth = -1;
2908 #endif
2909         local_irq_enable();
2910         schedule();
2911         local_irq_disable();
2912 #ifdef CONFIG_PREEMPT_BKL
2913         task->lock_depth = saved_lock_depth;
2914 #endif
2915         sub_preempt_count(PREEMPT_ACTIVE);
2916
2917         /* we could miss a preemption opportunity between schedule and now */
2918         barrier();
2919         if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2920                 goto need_resched;
2921 }
2922
2923 #endif /* CONFIG_PREEMPT */
2924
2925 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
2926 {
2927         task_t *p = curr->private;
2928         return try_to_wake_up(p, mode, sync);
2929 }
2930
2931 EXPORT_SYMBOL(default_wake_function);
2932
2933 /*
2934  * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
2935  * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
2936  * number) then we wake all the non-exclusive tasks and the first nr_exclusive exclusive tasks.
2937  *
2938  * There are circumstances in which we can try to wake a task which has already
2939  * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
2940  * zero in this (rare) case, and we handle it by continuing to scan the queue.
2941  */
2942 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
2943                              int nr_exclusive, int sync, void *key)
2944 {
2945         struct list_head *tmp, *next;
2946
2947         list_for_each_safe(tmp, next, &q->task_list) {
2948                 wait_queue_t *curr;
2949                 unsigned flags;
2950                 curr = list_entry(tmp, wait_queue_t, task_list);
2951                 flags = curr->flags;
2952                 if (curr->func(curr, mode, sync, key) &&
2953                     (flags & WQ_FLAG_EXCLUSIVE) &&
2954                     !--nr_exclusive)
2955                         break;
2956         }
2957 }
2958
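/*
 * Editorial sketch (not part of sched.c): the exclusive-wakeup accounting
 * used by __wake_up_common() above.  Non-exclusive waiters never consume the
 * budget, and an exclusive waiter only counts when its wake callback reports
 * success; passing nr_exclusive == 0 therefore wakes everybody.  The waiter
 * structure and callback are hypothetical.
 */
struct demo_waiter {
        int exclusive;                          /* WQ_FLAG_EXCLUSIVE analogue */
        int (*wake)(struct demo_waiter *);      /* returns non-zero on success */
};

static void demo_wake_up(struct demo_waiter *waiters, int n, int nr_exclusive)
{
        int i;

        for (i = 0; i < n; i++) {
                struct demo_waiter *w = &waiters[i];

                if (w->wake(w) && w->exclusive && !--nr_exclusive)
                        break;                  /* exclusive budget exhausted */
        }
}
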
2959 /**
2960  * __wake_up - wake up threads blocked on a waitqueue.
2961  * @q: the waitqueue
2962  * @mode: which threads
2963  * @nr_exclusive: how many wake-one or wake-many threads to wake up
2964  * @key: is directly passed to the wakeup function
2965  */
2966 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
2967                                 int nr_exclusive, void *key)
2968 {
2969         unsigned long flags;
2970
2971         spin_lock_irqsave(&q->lock, flags);
2972         __wake_up_common(q, mode, nr_exclusive, 0, key);
2973         spin_unlock_irqrestore(&q->lock, flags);
2974 }
2975
2976 EXPORT_SYMBOL(__wake_up);
2977
2978 /*
2979  * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
2980  */
2981 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
2982 {
2983         __wake_up_common(q, mode, 1, 0, NULL);
2984 }
2985
2986 /**
2987  * __wake_up_sync - wake up threads blocked on a waitqueue.
2988  * @q: the waitqueue
2989  * @mode: which threads
2990  * @nr_exclusive: how many wake-one or wake-many threads to wake up
2991  *
2992  * The sync wakeup differs in that the waker knows that it will schedule
2993  * away soon, so while the target thread will be woken up, it will not
2994  * be migrated to another CPU - ie. the two threads are 'synchronized'
2995  * with each other. This can prevent needless bouncing between CPUs.
2996  *
2997  * On UP it can prevent extra preemption.
2998  */
2999 void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3000 {
3001         unsigned long flags;
3002         int sync = 1;
3003
3004         if (unlikely(!q))
3005                 return;
3006
3007         if (unlikely(!nr_exclusive))
3008                 sync = 0;
3009
3010         spin_lock_irqsave(&q->lock, flags);
3011         __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3012         spin_unlock_irqrestore(&q->lock, flags);
3013 }
3014 EXPORT_SYMBOL_GPL(__wake_up_sync);      /* For internal use only */
3015
3016 void fastcall complete(struct completion *x)
3017 {
3018         unsigned long flags;
3019
3020         spin_lock_irqsave(&x->wait.lock, flags);
3021         x->done++;
3022         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3023                          1, 0, NULL);
3024         spin_unlock_irqrestore(&x->wait.lock, flags);
3025 }
3026 EXPORT_SYMBOL(complete);
3027
3028 void fastcall complete_all(struct completion *x)
3029 {
3030         unsigned long flags;
3031
3032         spin_lock_irqsave(&x->wait.lock, flags);
3033         x->done += UINT_MAX/2;
3034         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3035                          0, 0, NULL);
3036         spin_unlock_irqrestore(&x->wait.lock, flags);
3037 }
3038 EXPORT_SYMBOL(complete_all);
3039
3040 void fastcall __sched wait_for_completion(struct completion *x)
3041 {
3042         might_sleep();
3043         spin_lock_irq(&x->wait.lock);
3044         if (!x->done) {
3045                 DECLARE_WAITQUEUE(wait, current);
3046
3047                 wait.flags |= WQ_FLAG_EXCLUSIVE;
3048                 __add_wait_queue_tail(&x->wait, &wait);
3049                 do {
3050                         __set_current_state(TASK_UNINTERRUPTIBLE);
3051                         spin_unlock_irq(&x->wait.lock);
3052                         schedule();
3053                         spin_lock_irq(&x->wait.lock);
3054                 } while (!x->done);
3055                 __remove_wait_queue(&x->wait, &wait);
3056         }
3057         x->done--;
3058         spin_unlock_irq(&x->wait.lock);
3059 }
3060 EXPORT_SYMBOL(wait_for_completion);
3061
3062 unsigned long fastcall __sched
3063 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3064 {
3065         might_sleep();
3066
3067         spin_lock_irq(&x->wait.lock);
3068         if (!x->done) {
3069                 DECLARE_WAITQUEUE(wait, current);
3070
3071                 wait.flags |= WQ_FLAG_EXCLUSIVE;
3072                 __add_wait_queue_tail(&x->wait, &wait);
3073                 do {
3074                         __set_current_state(TASK_UNINTERRUPTIBLE);
3075                         spin_unlock_irq(&x->wait.lock);
3076                         timeout = schedule_timeout(timeout);
3077                         spin_lock_irq(&x->wait.lock);
3078                         if (!timeout) {
3079                                 __remove_wait_queue(&x->wait, &wait);
3080                                 goto out;
3081                         }
3082                 } while (!x->done);
3083                 __remove_wait_queue(&x->wait, &wait);
3084         }
3085         x->done--;
3086 out:
3087         spin_unlock_irq(&x->wait.lock);
3088         return timeout;
3089 }
3090 EXPORT_SYMBOL(wait_for_completion_timeout);
3091
3092 int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3093 {
3094         int ret = 0;
3095
3096         might_sleep();
3097
3098         spin_lock_irq(&x->wait.lock);
3099         if (!x->done) {
3100                 DECLARE_WAITQUEUE(wait, current);
3101
3102                 wait.flags |= WQ_FLAG_EXCLUSIVE;
3103                 __add_wait_queue_tail(&x->wait, &wait);
3104                 do {
3105                         if (signal_pending(current)) {
3106                                 ret = -ERESTARTSYS;
3107                                 __remove_wait_queue(&x->wait, &wait);
3108                                 goto out;
3109                         }
3110                         __set_current_state(TASK_INTERRUPTIBLE);
3111                         spin_unlock_irq(&x->wait.lock);
3112                         schedule();
3113                         spin_lock_irq(&x->wait.lock);
3114                 } while (!x->done);
3115                 __remove_wait_queue(&x->wait, &wait);
3116         }
3117         x->done--;
3118 out:
3119         spin_unlock_irq(&x->wait.lock);
3120
3121         return ret;
3122 }
3123 EXPORT_SYMBOL(wait_for_completion_interruptible);
3124
3125 unsigned long fastcall __sched
3126 wait_for_completion_interruptible_timeout(struct completion *x,
3127                                           unsigned long timeout)
3128 {
3129         might_sleep();
3130
3131         spin_lock_irq(&x->wait.lock);
3132         if (!x->done) {
3133                 DECLARE_WAITQUEUE(wait, current);
3134
3135                 wait.flags |= WQ_FLAG_EXCLUSIVE;
3136                 __add_wait_queue_tail(&x->wait, &wait);
3137                 do {
3138                         if (signal_pending(current)) {
3139                                 timeout = -ERESTARTSYS;
3140                                 __remove_wait_queue(&x->wait, &wait);
3141                                 goto out;
3142                         }
3143                         __set_current_state(TASK_INTERRUPTIBLE);
3144                         spin_unlock_irq(&x->wait.lock);
3145                         timeout = schedule_timeout(timeout);
3146                         spin_lock_irq(&x->wait.lock);
3147                         if (!timeout) {
3148                                 __remove_wait_queue(&x->wait, &wait);
3149                                 goto out;
3150                         }
3151                 } while (!x->done);
3152                 __remove_wait_queue(&x->wait, &wait);
3153         }
3154         x->done--;
3155 out:
3156         spin_unlock_irq(&x->wait.lock);
3157         return timeout;
3158 }
3159 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
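
/*
 * Illustrative sketch of the usual completion pattern served by the code
 * above: one context announces an event with complete(), another blocks in
 * wait_for_completion() (or one of the timeout/interruptible variants).
 * The completion, thread function and names below are made up for the
 * example; kthread_run() is the standard helper for starting the worker.
 */
#if 0
#include <linux/completion.h>
#include <linux/kthread.h>

static DECLARE_COMPLETION(example_done);

static int example_worker(void *unused)
{
        /* ... perform the work the caller is waiting for ... */
        complete(&example_done);        /* allows one waiter to proceed */
        return 0;
}

static void example_start_and_wait(void)
{
        kthread_run(example_worker, NULL, "example_worker");

        /* blocks (uninterruptibly) until example_worker() calls complete() */
        wait_for_completion(&example_done);
}
#endif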
3160
3161
3162 #define SLEEP_ON_VAR                                    \
3163         unsigned long flags;                            \
3164         wait_queue_t wait;                              \
3165         init_waitqueue_entry(&wait, current);
3166
3167 #define SLEEP_ON_HEAD                                   \
3168         spin_lock_irqsave(&q->lock,flags);              \
3169         __add_wait_queue(q, &wait);                     \
3170         spin_unlock(&q->lock);
3171
3172 #define SLEEP_ON_TAIL                                   \
3173         spin_lock_irq(&q->lock);                        \
3174         __remove_wait_queue(q, &wait);                  \
3175         spin_unlock_irqrestore(&q->lock, flags);
3176
3177 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3178 {
3179         SLEEP_ON_VAR
3180
3181         current->state = TASK_INTERRUPTIBLE;
3182
3183         SLEEP_ON_HEAD
3184         schedule();
3185         SLEEP_ON_TAIL
3186 }
3187
3188 EXPORT_SYMBOL(interruptible_sleep_on);
3189
3190 long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3191 {
3192         SLEEP_ON_VAR
3193
3194         current->state = TASK_INTERRUPTIBLE;
3195
3196         SLEEP_ON_HEAD
3197         timeout = schedule_timeout(timeout);
3198         SLEEP_ON_TAIL
3199
3200         return timeout;
3201 }
3202
3203 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3204
3205 void fastcall __sched sleep_on(wait_queue_head_t *q)
3206 {
3207         SLEEP_ON_VAR
3208
3209         current->state = TASK_UNINTERRUPTIBLE;
3210
3211         SLEEP_ON_HEAD
3212         schedule();
3213         SLEEP_ON_TAIL
3214 }
3215
3216 EXPORT_SYMBOL(sleep_on);
3217
3218 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3219 {
3220         SLEEP_ON_VAR
3221
3222         current->state = TASK_UNINTERRUPTIBLE;
3223
3224         SLEEP_ON_HEAD
3225         timeout = schedule_timeout(timeout);
3226         SLEEP_ON_TAIL
3227
3228         return timeout;
3229 }
3230
3231 EXPORT_SYMBOL(sleep_on_timeout);
3232
3233 void set_user_nice(task_t *p, long nice)
3234 {
3235         unsigned long flags;
3236         prio_array_t *array;
3237         runqueue_t *rq;
3238         int old_prio, new_prio, delta;
3239
3240         if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3241                 return;
3242         /*
3243          * We have to be careful, if called from sys_setpriority(),
3244          * the task might be in the middle of scheduling on another CPU.
3245          */
3246         rq = task_rq_lock(p, &flags);
3247         /*
3248          * The RT priorities are set via sched_setscheduler(), but we still
3249          * allow the 'normal' nice value to be set - but as expected
3250  * it won't have any effect on scheduling until the task
3251  * returns to SCHED_NORMAL:
3252          */
3253         if (rt_task(p)) {
3254                 p->static_prio = NICE_TO_PRIO(nice);
3255                 goto out_unlock;
3256         }
3257         array = p->array;
3258         if (array)
3259                 dequeue_task(p, array);
3260
3261         old_prio = p->prio;
3262         new_prio = NICE_TO_PRIO(nice);
3263         delta = new_prio - old_prio;
3264         p->static_prio = NICE_TO_PRIO(nice);
3265         p->prio += delta;
3266
3267         if (array) {
3268                 enqueue_task(p, array);
3269                 /*
3270                  * If the task increased its priority or is running and
3271                  * lowered its priority, then reschedule its CPU:
3272                  */
3273                 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3274                         resched_task(rq->curr);
3275         }
3276 out_unlock:
3277         task_rq_unlock(rq, &flags);
3278 }
3279
3280 EXPORT_SYMBOL(set_user_nice);
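
/*
 * Illustrative sketch: kernel threads commonly call set_user_nice() on
 * themselves to become less (or more) important.  The thread function name
 * is made up; assuming MAX_RT_PRIO is 100, nice 19 corresponds to
 * NICE_TO_PRIO(19) == 139, the weakest (numerically highest) static
 * priority a SCHED_NORMAL task can have.
 */
#if 0
#include <linux/sched.h>

static int example_background_thread(void *unused)
{
        set_user_nice(current, 19);     /* lowest importance for a normal task */

        /* ... low-importance housekeeping runs here at nice 19 ... */

        return 0;
}
#endif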
3281
3282 /*
3283  * can_nice - check if a task can reduce its nice value
3284  * @p: task
3285  * @nice: nice value
3286  */
3287 int can_nice(const task_t *p, const int nice)
3288 {
3289         /* convert nice value [19,-20] to rlimit style value [0,39] */
3290         int nice_rlim = 19 - nice;
3291         return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3292                 capable(CAP_SYS_NICE));
3293 }
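
/*
 * Worked example for the check above: a request to change to nice -5 gives
 * nice_rlim = 19 - (-5) = 24, so it is allowed only if the task's
 * RLIMIT_NICE soft limit is at least 24 or it has CAP_SYS_NICE.  Note that
 * sys_nice() below only consults this check when the nice value is being
 * lowered (increment < 0).
 */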
3294
3295 #ifdef __ARCH_WANT_SYS_NICE
3296
3297 /*
3298  * sys_nice - change the priority of the current process.
3299  * @increment: priority increment
3300  *
3301  * sys_setpriority is a more generic, but much slower function that
3302  * does similar things.
3303  */
3304 asmlinkage long sys_nice(int increment)
3305 {
3306         int retval;
3307         long nice;
3308
3309         /*
3310          * Setpriority might change our priority at the same moment.
3311          * We don't have to worry. Conceptually one call occurs first
3312          * and we have a single winner.
3313          */
3314         if (increment < -40)
3315                 increment = -40;
3316         if (increment > 40)
3317                 increment = 40;
3318
3319         nice = PRIO_TO_NICE(current->static_prio) + increment;
3320         if (nice < -20)
3321                 nice = -20;
3322         if (nice > 19)
3323                 nice = 19;
3324
3325         if (increment < 0 && !can_nice(current, nice))
3326                 return -EPERM;
3327
3328         retval = security_task_setnice(current, nice);
3329         if (retval)
3330                 return retval;
3331
3332         set_user_nice(current, nice);
3333         return 0;
3334 }
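
/*
 * Worked example of the clamping above: a task at nice 10 calling
 * sys_nice(-40) keeps the increment at -40 (already within [-40, 40]),
 * giving a requested value of 10 - 40 = -30, which is then clamped to -20;
 * the call only succeeds if can_nice(current, -20) passes, i.e. the
 * RLIMIT_NICE soft limit is at least 39 or the task has CAP_SYS_NICE.
 */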
3335
3336 #endif
3337
3338 /**
3339  * task_prio - return the priority value of a given task.
3340  * @p: the task in question.
3341  *
3342  * This is the priority value as seen by users in /proc.
3343  * RT tasks are offset by -200. Normal tasks are centered
3344  * around 0, value goes from -16 to +15.
3345  */
3346 int task_prio(const task_t *p)
3347 {
3348         return p->prio - MAX_RT_PRIO;
3349 }
3350
3351 /**
3352  * task_nice - return the nice value of a given task.
3353  * @p: the task in question.
3354  */
3355 int task_nice(const task_t *p)
3356 {
3357         return TASK_NICE(p);
3358 }
3359
3360 /*
3361  * The only users of task_nice are binfmt_elf and binfmt_elf32.
3362  * binfmt_elf is no longer modular, but binfmt_elf32 still is.
3363  * Therefore, the export of task_nice is only needed when compat mode is configured.
3364  */
3365 #ifdef CONFIG_COMPAT
3366 EXPORT_SYMBOL_GPL(task_nice);
3367 #endif
3368
3369 /**
3370  * idle_cpu - is a given cpu idle currently?
3371  * @cpu: the processor in question.
3372  */
3373 int idle_cpu(int cpu)
3374 {
3375         return cpu_curr(cpu) == cpu_rq(cpu)->idle;
3376 }
3377
3378 EXPORT_SYMBOL_GPL(idle_cpu);
3379
3380 /**
3381  * idle_task - return the idle task for a given cpu.
3382  * @cpu: the processor in question.
3383  */
3384 task_t *idle_task(int cpu)
3385 {
3386         return cpu_rq(cpu)->idle;
3387 }
3388
3389 /**
3390  * find_process_by_pid - find a process with a matching PID value.
3391  * @pid: the pid in question.
3392  */
3393 static inline task_t *find_process_by_pid(pid_t pid)
3394 {
3395         return pid ? find_task_by_pid(pid) : current;
3396 }
3397
3398 /* Actually do priority change: must hold rq lock. */
3399 static void __setscheduler(struct task_struct *p, int policy, int prio)
3400 {
3401         BUG_ON(p->array);
3402         p->policy = policy;
3403         p->rt_priority = prio;
3404         if (policy != SCHED_NORMAL)
3405                 p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
3406         else
3407                 p->prio = p->static_prio;
3408 }
3409
3410 /**
3411  * sched_setscheduler - change the scheduling policy and/or RT priority of
3412  * a thread.
3413  * @p: the task in question.
3414  * @policy: new policy.
3415  * @param: structure containing the new RT priority.
3416  */
3417 int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param)
3418 {
3419         int retval;
3420         int oldprio, oldpolicy = -1;
3421         prio_array_t *array;
3422         unsigned long flags;
3423         runqueue_t *rq;
3424
3425 recheck:
3426         /* double check policy once rq lock held */
3427         if (policy < 0)
3428                 policy = oldpolicy = p->policy;
3429         else if (policy != SCHED_FIFO && policy != SCHED_RR &&
3430                                 policy != SCHED_NORMAL)
3431                         return -EINVAL;
3432         /*
3433          * Valid priorities for SCHED_FIFO and SCHED_RR are
3434          * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
3435          */
3436         if (param->sched_priority < 0 ||
3437             param->sched_priority > MAX_USER_RT_PRIO-1)
3438                 return -EINVAL;
3439         if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
3440                 return -EINVAL;
3441
3442         if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
3443             param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur &&
3444             !capable(CAP_SYS_NICE))
3445                 return -EPERM;
3446         if ((current->euid != p->euid) && (current->euid != p->uid) &&
3447             !capable(CAP_SYS_NICE))
3448                 return -EPERM;
3449
3450         retval = security_task_setscheduler(p, policy, param);
3451         if (retval)
3452                 return retval;
3453         /*
3454  * To be able to change p->policy safely, the appropriate
3455          * runqueue lock must be held.
3456          */
3457         rq = task_rq_lock(p, &flags);
3458         /* recheck policy now with rq lock held */
3459         if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3460                 policy = oldpolicy = -1;
3461                 task_rq_unlock(rq, &flags);
3462                 goto recheck;
3463         }
3464         array = p->array;
3465         if (array)
3466                 deactivate_task(p, rq);
3467         oldprio = p->prio;
3468         __setscheduler(p, policy, param->sched_priority);
3469         if (array) {
3470                 __activate_task(p, rq);
3471                 /*
3472                  * Reschedule if we are currently running on this runqueue and
3473                  * our priority decreased, or if we are not currently running on
3474                  * this runqueue and our priority is higher than the current's
3475                  */
3476                 if (task_running(rq, p)) {
3477                         if (p->prio > oldprio)
3478                                 resched_task(rq->curr);
3479                 } else if (TASK_PREEMPTS_CURR(p, rq))
3480                         resched_task(rq->curr);
3481         }
3482         task_rq_unlock(rq, &flags);
3483         return 0;
3484 }
3485 EXPORT_SYMBOL_GPL(sched_setscheduler);
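
/*
 * Illustrative sketch of in-kernel use of sched_setscheduler(): give a
 * latency-sensitive helper thread a realtime round-robin policy.  The
 * function name and the priority value 50 are made up; valid RT priorities
 * are 1..MAX_USER_RT_PRIO-1, as documented above.
 */
#if 0
#include <linux/sched.h>

static int example_make_rt(struct task_struct *p)
{
        struct sched_param param = { .sched_priority = 50 };

        return sched_setscheduler(p, SCHED_RR, &param);
}
#endif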
3486
3487 static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3488 {
3489         int retval;
3490         struct sched_param lparam;
3491         struct task_struct *p;
3492
3493         if (!param || pid < 0)
3494                 return -EINVAL;
3495         if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
3496                 return -EFAULT;
3497         read_lock_irq(&tasklist_lock);
3498         p = find_process_by_pid(pid);
3499         if (!p) {
3500                 read_unlock_irq(&tasklist_lock);
3501                 return -ESRCH;
3502         }
3503         retval = sched_setscheduler(p, policy, &lparam);
3504         read_unlock_irq(&tasklist_lock);
3505         return retval;
3506 }
3507
3508 /**
3509  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
3510  * @pid: the pid in question.
3511  * @policy: new policy.
3512  * @param: structure containing the new RT priority.
3513  */
3514 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
3515                                        struct sched_param __user *param)
3516 {
3517         return do_sched_setscheduler(pid, policy, param);
3518 }
3519
3520 /**
3521  * sys_sched_setparam - set/change the RT priority of a thread
3522  * @pid: the pid in question.
3523  * @param: structure containing the new RT priority.
3524  */
3525 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
3526 {
3527         return do_sched_setscheduler(pid, -1, param);
3528 }
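
/*
 * Illustrative userspace counterpart of the two syscalls above, using the
 * glibc wrapper: switch the calling process to SCHED_FIFO priority 10.
 * Hypothetical standalone program; the request is subject to the
 * RLIMIT_RTPRIO / CAP_SYS_NICE checks enforced in sched_setscheduler().
 */
#if 0
#include <sched.h>
#include <stdio.h>

int main(void)
{
        struct sched_param sp = { .sched_priority = 10 };

        if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
                perror("sched_setscheduler");
                return 1;
        }
        return 0;
}
#endif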
3529
3530 /**
3531  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
3532  * @pid: the pid in question.
3533  */
3534 asmlinkage long sys_sched_getscheduler(pid_t pid)
3535 {
3536         int retval = -EINVAL;
3537         task_t *p;
3538
3539         if (pid < 0)
3540                 goto out_nounlock;
3541
3542         retval = -ESRCH;
3543         read_lock(&tasklist_lock);
3544         p = find_process_by_pid(pid);
3545         if (p) {
3546                 retval = security_task_getscheduler(p);
3547                 if (!retval)
3548                         retval = p->policy;
3549         }
3550         read_unlock(&tasklist_lock);
3551
3552 out_nounlock:
3553         return retval;
3554 }
3555
3556 /**
3557  * sys_sched_getparam - get the RT priority of a thread
3558  * @pid: the pid in question.
3559  * @param: structure containing the RT priority.
3560  */
3561 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
3562 {
3563         struct sched_param lp;
3564         int retval = -EINVAL;
3565         task_t *p;
3566
3567         if (!param || pid < 0)
3568                 goto out_nounlock;
3569
3570         read_lock(&tasklist_lock);
3571         p = find_process_by_pid(pid);
3572         retval = -ESRCH;
3573         if (!p)
3574                 goto out_unlock;
3575
3576         retval = security_task_getscheduler(p);
3577         if (retval)
3578                 goto out_unlock;
3579
3580         lp.sched_priority = p->rt_priority;
3581         read_unlock(&tasklist_lock);
3582
3583         /*
3584          * This one might sleep, we cannot do it with a spinlock held ...
3585          */
3586         retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
3587
3588 out_nounlock:
3589         return retval;
3590
3591 out_unlock:
3592         read_unlock(&tasklist_lock);
3593         return retval;
3594 }
3595
3596 long sched_setaffinity(pid_t pid, cpumask_t new_mask)
3597 {
3598         task_t *p;
3599         int retval;
3600         cpumask_t cpus_allowed;
3601
3602         lock_cpu_hotplug();
3603         read_lock(&tasklist_lock);
3604
3605         p = find_process_by_pid(pid);
3606         if (!p) {
3607                 read_unlock(&tasklist_lock);
3608                 unlock_cpu_hotplug();
3609                 return -ESRCH;
3610         }
3611
3612         /*
3613          * It is not safe to call set_cpus_allowed with the
3614          * tasklist_lock held.  We will bump the task_struct's
3615          * usage count and then drop tasklist_lock.
3616          */
3617         get_task_struct(p);
3618         read_unlock(&tasklist_lock);
3619
3620         retval = -EPERM;
3621         if ((current->euid != p->euid) && (current->euid != p->uid) &&
3622                         !capable(CAP_SYS_NICE))
3623                 goto out_unlock;
3624
3625         cpus_allowed = cpuset_cpus_allowed(p);
3626         cpus_and(new_mask, new_mask, cpus_allowed);
3627         retval = set_cpus_allowed(p, new_mask);
3628
3629 out_unlock:
3630         put_task_struct(p);
3631         unlock_cpu_hotplug();
3632         return retval;
3633 }
3634
3635 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
3636                              cpumask_t *new_mask)
3637 {
3638         if (len < sizeof(cpumask_t)) {
3639                 memset(new_mask, 0, sizeof(cpumask_t));
3640         } else if (len > sizeof(cpumask_t)) {
3641                 len = sizeof(cpumask_t);
3642         }
3643         return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
3644 }
3645
3646 /**
3647  * sys_sched_setaffinity - set the cpu affinity of a process
3648  * @pid: pid of the process
3649  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3650  * @user_mask_ptr: user-space pointer to the new cpu mask
3651  */
3652 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
3653                                       unsigned long __user *user_mask_ptr)
3654 {
3655         cpumask_t new_mask;
3656         int retval;
3657
3658         retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
3659         if (retval)
3660                 return retval;
3661
3662         return sched_setaffinity(pid, new_mask);
3663 }
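
/*
 * Illustrative userspace use of the affinity syscall above via the glibc
 * wrapper: restrict the calling process to CPU 0.  Hypothetical standalone
 * program; the cpu_set_t size passed here arrives as 'len' in
 * sys_sched_setaffinity().
 */
#if 0
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(0, &set);               /* allow CPU 0 only */

        if (sched_setaffinity(0, sizeof(set), &set) == -1) {
                perror("sched_setaffinity");
                return 1;
        }
        return 0;
}
#endif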
3664
3665 /*
3666  * Represents all CPUs present in the system.
3667  * In systems capable of hotplug, this map could dynamically grow
3668  * as new CPUs are detected in the system via any platform-specific
3669  * method, e.g. ACPI.
3670  */
3671
3672 cpumask_t cpu_present_map;
3673 EXPORT_SYMBOL(cpu_present_map);
3674
3675 #ifndef CONFIG_SMP
3676 cpumask_t cpu_online_map = CPU_MASK_ALL;
3677 cpumask_t cpu_possible_map = CPU_MASK_ALL;
3678 #endif
3679
3680 long sched_getaffinity(pid_t pid, cpumask_t *mask)
3681 {
3682         int retval;
3683         task_t *p;
3684
3685         lock_cpu_hotplug();
3686         read_lock(&tasklist_lock);
3687
3688         retval = -ESRCH;
3689         p = find_process_by_pid(pid);
3690         if (!p)
3691                 goto out_unlock;
3692
3693         retval = 0;
3694         cpus_and(*mask, p->cpus_allowed, cpu_possible_map);
3695
3696 out_unlock:
3697         read_unlock(&tasklist_lock);
3698         unlock_cpu_hotplug();
3699         if (retval)
3700                 return retval;
3701
3702         return 0;
3703 }
3704
3705 /**
3706  * sys_sched_getaffinity - get the cpu affinity of a process
3707  * @pid: pid of the process
3708  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3709  * @user_mask_ptr: user-space pointer to hold the current cpu mask
3710  */
3711 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
3712                                       unsigned long __user *user_mask_ptr)
3713 {
3714         int ret;
3715         cpumask_t mask;
3716
3717         if (len < sizeof(cpumask_t))
3718                 return -EINVAL;
3719
3720         ret = sched_getaffinity(pid, &mask);
3721         if (ret < 0)
3722                 return ret;
3723
3724         if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
3725                 return -EFAULT;
3726
3727         return sizeof(cpumask_t);
3728 }
3729
3730 /**
3731  * sys_sched_yield - yield the current processor to other threads.
3732  *
3733  * this function yields the current CPU by moving the calling thread
3734  * to the expired array. If there are no other threads running on this
3735  * CPU then this function will return immediately.
3736  */
3737 asmlinkage long sys_sched_yield(void)
3738 {
3739         runqueue_t *rq = this_rq_lock();
3740         prio_array_t *array = current->array;
3741         prio_array_t *target = rq->expired;
3742
3743         schedstat_inc(rq, yld_cnt);
3744         /*
3745          * We implement yielding by moving the task into the expired
3746          * queue.
3747          *
3748          * (special rule: RT tasks will just roundrobin in the active
3749          *  array.)
3750          */
3751         if (rt_task(current))
3752                 target = rq->active;
3753
3754         if (current->array->nr_active == 1) {
3755                 schedstat_inc(rq, yld_act_empty);
3756                 if (!rq->expired->nr_active)
3757                         schedstat_inc(rq, yld_both_empty);
3758         } else if (!rq->expired->nr_active)
3759                 schedstat_inc(rq, yld_exp_empty);
3760
3761         if (array != target) {
3762                 dequeue_task(current, array);
3763                 enqueue_task(current, target);
3764         } else
3765                 /*
3766                  * requeue_task is cheaper so perform that if possible.
3767                  */
3768                 requeue_task(current, array);
3769
3770         /*
3771          * Since we are going to call schedule() anyway, there's
3772          * no need to preempt or enable interrupts:
3773          */
3774         __release(rq->lock);
3775         _raw_spin_unlock(&rq->lock);
3776         preempt_enable_no_resched();
3777
3778         schedule();
3779
3780         return 0;
3781 }
3782
3783 static inline void __cond_resched(void)
3784 {
3785         do {
3786                 add_preempt_count(PREEMPT_ACTIVE);
3787                 schedule();
3788                 sub_preempt_count(PREEMPT_ACTIVE);
3789         } while (need_resched());
3790 }
3791
3792 int __sched cond_resched(void)
3793 {
3794         if (need_resched()) {
3795                 __cond_resched();
3796                 return 1;
3797         }
3798         return 0;
3799 }
3800
3801 EXPORT_SYMBOL(cond_resched);
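
/*
 * Illustrative sketch of the usual cond_resched() pattern: a long-running
 * loop in process context voluntarily yields whenever a reschedule is
 * pending, so it does not hog the CPU on non-preemptible kernels.  The item
 * type and helper below are made up for the example.
 */
#if 0
#include <linux/sched.h>

static void example_process_all(struct example_item *items, int count)
{
        int i;

        for (i = 0; i < count; i++) {
                example_process_one(&items[i]);
                cond_resched();         /* schedule() here if need_resched() */
        }
}
#endif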
3802
3803 /*
3804  * cond_resched_lock() - if a reschedule is pending, drop the given lock,
3805  * call schedule, and on return reacquire the lock.
3806  *
3807  * This works OK both with and without CONFIG_PREEMPT.  We do strange low-level
3808  * operations here to prevent schedule() from being called twice (once via
3809  * spin_unlock(), once by hand).
3810  */
3811 int cond_resched_lock(spinlock_t * lock)
3812 {
3813         int ret = 0;
3814
3815         if (need_lockbreak(lock)) {
3816                 spin_unlock(lock);
3817                 cpu_relax();
3818                 ret = 1;
3819                 spin_lock(lock);
3820         }
3821         if (need_resched()) {
3822                 _raw_spin_unlock(lock);
3823                 preempt_enable_no_resched();
3824                 __cond_resched();
3825                 ret = 1;
3826                 spin_lock(lock);
3827         }
3828         return ret;
3829 }
3830
3831 EXPORT_SYMBOL(cond_resched_lock);
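
/*
 * Illustrative sketch for cond_resched_lock(): a long scan done under a
 * spinlock periodically lets the lock breaker and the scheduler in.  The
 * table type and helper are made up; note that 'lock' may be dropped and
 * retaken inside cond_resched_lock(), so each iteration must not depend on
 * state that another lock holder could change in between.
 */
#if 0
#include <linux/spinlock.h>
#include <linux/sched.h>

static void example_clear_table(spinlock_t *lock,
                                struct example_entry *tbl, int nr)
{
        int i;

        spin_lock(lock);
        for (i = 0; i < nr; i++) {
                example_clear_one(&tbl[i]);
                cond_resched_lock(lock);        /* may drop+reacquire 'lock' */
        }
        spin_unlock(lock);
}
#endif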
3832
3833 int __sched cond_resched_softirq(void)
3834 {
3835         BUG_ON(!in_softirq());
3836
3837         if (need_resched()) {
3838                 __local_bh_enable();
3839                 __cond_resched();
3840                 local_bh_disable();
3841                 return 1;
3842         }
3843         return 0;
3844 }
3845
3846 EXPORT_SYMBOL(cond_resched_softirq);
3847
3848
3849 /**
3850  * yield - yield the current processor to other threads.
3851  *
3852  * this is a shortcut for kernel-space yielding - it marks the
3853  * thread runnable and calls sys_sched_yield().
3854  */
3855 void __sched yield(void)
3856 {
3857         set_current_state(TASK_RUNNING);
3858         sys_sched_yield();
3859 }
3860
3861 EXPORT_SYMBOL(yield);
3862
3863 /*
3864  * This task is about to go to sleep on IO.  Increment rq->nr_iowait so
3865  * that process accounting knows that this is a task in IO wait state.
3866  *
3867  * But don't do that if it is a deliberate, throttling IO wait (this task
3868  * has set its backing_dev_info: the queue against which it should throttle)
3869  */
3870 void __sched io_schedule(void)
3871 {
3872         struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
3873
3874         atomic_inc(&rq->nr_iowait);
3875         schedule();
3876         atomic_dec(&rq->nr_iowait);
3877 }
3878
3879 EXPORT_SYMBOL(io_schedule);
3880
3881 long __sched io_schedule_timeout(long timeout)
3882 {
3883         struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
3884         long ret;
3885
3886         atomic_inc(&rq->nr_iowait);
3887         ret = schedule_timeout(timeout);
3888         atomic_dec(&rq->nr_iowait);
3889         return ret;
3890 }
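
/*
 * Illustrative sketch of when io_schedule() is used instead of plain
 * schedule(): a task about to block on disk I/O sleeps via io_schedule() so
 * the time is accounted as iowait.  The device structure, its waitqueue and
 * the 'io_done' flag are made up for the example.
 */
#if 0
#include <linux/wait.h>
#include <linux/sched.h>

static void example_wait_for_io(struct example_dev *dev)
{
        DEFINE_WAIT(wait);

        prepare_to_wait(&dev->wq, &wait, TASK_UNINTERRUPTIBLE);
        if (!dev->io_done)
                io_schedule();          /* like schedule(), but counted as iowait */
        finish_wait(&dev->wq, &wait);
}
#endif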
3891
3892 /**
3893  * sys_sched_get_priority_max - return maximum RT priority.
3894  * @policy: scheduling class.
3895  *
3896  * this syscall returns the maximum rt_priority that can be used
3897  * by a given scheduling class.
3898  */
3899 asmlinkage long sys_sched_get_priority_max(int policy)
3900 {
3901         int ret = -EINVAL;
3902
3903         switch (policy) {
3904         case SCHED_FIFO:
3905         case SCHED_RR:
3906                 ret = MAX_USER_RT_PRIO-1;
3907                 break;
3908         case SCHED_NORMAL:
3909                 ret = 0;
3910                 break;
3911         }
3912         return ret;
3913 }
3914
3915 /**
3916  * sys_sched_get_priority_min - return minimum RT priority.
3917  * @policy: scheduling class.
3918  *
3919  * this syscall returns the minimum rt_priority that can be used
3920  * by a given scheduling class.
3921  */
3922 asmlinkage long sys_sched_get_priority_min(int policy)
3923 {
3924         int ret = -EINVAL;
3925
3926         switch (policy) {
3927         case SCHED_FIFO:
3928         case SCHED_RR:
3929                 ret = 1;
3930                 break;
3931         case SCHED_NORMAL:
3932                 ret = 0;
3933         }
3934         return ret;
3935 }
3936
3937 /**
3938  * sys_sched_rr_get_interval - return the default timeslice of a process.
3939  * @pid: pid of the process.
3940  * @interval: userspace pointer to the timeslice value.
3941  *
3942  * this syscall writes the default timeslice value of a given process
3943  * into the user-space timespec buffer. A value of '0' means infinity.
3944  */
3945 asmlinkage
3946 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
3947 {
3948         int retval = -EINVAL;
3949         struct timespec t;
3950         task_t *p;
3951
3952         if (pid < 0)
3953                 goto out_nounlock;
3954
3955         retval = -ESRCH;
3956         read_lock(&tasklist_lock);
3957         p = find_process_by_pid(pid);
3958         if (!p)
3959                 goto out_unlock;
3960
3961         retval = security_task_getscheduler(p);
3962         if (retval)
3963                 goto out_unlock;
3964
3965         jiffies_to_timespec(p->policy == SCHED_FIFO ?
3966                                 0 : task_timeslice(p), &t);
3967         read_unlock(&tasklist_lock);
3968         retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
3969 out_nounlock:
3970         return retval;
3971 out_unlock:
3972         read_unlock(&tasklist_lock);
3973         return retval;
3974 }
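
/*
 * Illustrative userspace counterpart of the syscall above, using the glibc
 * wrapper: print the calling process's round-robin timeslice.  Hypothetical
 * standalone program; for SCHED_FIFO tasks the kernel reports 0.
 */
#if 0
#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
        struct timespec ts;

        if (sched_rr_get_interval(0, &ts) == -1) {
                perror("sched_rr_get_interval");
                return 1;
        }
        printf("timeslice: %ld.%09ld seconds\n",
               (long)ts.tv_sec, ts.tv_nsec);
        return 0;
}
#endif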
3975
3976 static inline struct task_struct *eldest_child(struct task_struct *p)
3977 {
3978         if (list_empty(&p->children)) return NULL;
3979         return list_entry(p->children.next,struct task_struct,sibling);
3980 }
3981
3982 static inline struct task_struct *older_sibling(struct task_struct *p)
3983 {
3984         if (p->sibling.prev==&p->parent->children) return NULL;
3985         return list_entry(p->sibling.prev,struct task_struct,sibling);
3986 }
3987
3988 static inline struct task_struct *younger_sibling(struct task_struct *p)
3989 {
3990         if (p->sibling.next==&p->parent->children) return NULL;
3991         return list_entry(p->sibling.next,struct task_struct,sibling);
3992 }
3993
3994 static void show_task(task_t * p)
3995 {
3996         task_t *relative;
3997         unsigned state;
3998         unsigned long free = 0;
3999         static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" };
4000
4001         printk("%-13.13s ", p->comm);
4002         state = p->state ? __ffs(p->state) + 1 : 0;
4003         if (state < ARRAY_SIZE(stat_nam))
4004                 printk(stat_nam[state]);
4005         else
4006                 printk("?");
4007 #if (BITS_PER_LONG == 32)
4008         if (state == TASK_RUNNING)
4009                 printk(" running ");
4010         else
4011                 printk(" %08lX ", thread_saved_pc(p));
4012 #else
4013         if (state == TASK_RUNNING)
4014                 printk("  running task   ");
4015         else
4016                 printk(" %016lx ", thread_saved_pc(p));
4017 #endif
4018 #ifdef CONFIG_DEBUG_STACK_USAGE
4019         {
4020                 unsigned long * n = (unsigned long *) (p->thread_info+1);
4021                 while (!*n)
4022                         n++;
4023                 free = (unsigned long) n - (unsigned long)(p->thread_info+1);
4024         }
4025 #endif
4026         printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
4027         if ((relative = eldest_child(p)))
4028                 printk("%5d ", relative->pid);
4029         else
4030                 printk("      ");
4031         if ((relative = younger_sibling(p)))
4032                 printk("%7d", relative->pid);
4033         else
4034                 printk("       ");
4035         if ((relative = older_sibling(p)))
4036                 printk(" %5d", relative->pid);
4037         else
4038                 printk("      ");
4039         if (!p->mm)
4040                 printk(" (L-TLB)\n");
4041         else
4042                 printk(" (NOTLB)\n");
4043
4044         if (state != TASK_RUNNING)
4045                 show_stack(p, NULL);
4046 }
4047
4048 void show_state(void)
4049 {
4050         task_t *g, *p;
4051
4052 #if (BITS_PER_LONG == 32)
4053         printk("\n"
4054                "                                               sibling\n");
4055         printk("  task             PC      pid father child younger older\n");
4056 #else
4057         printk("\n"
4058                "                                                       sibling\n");
4059         printk("  task                 PC          pid father child younger older\n");
4060 #endif
4061         read_lock(&tasklist_lock);
4062         do_each_thread(g, p) {
4063                 /*
4064                  * reset the NMI-timeout, listing all tasks on a slow
4065                  * console might take a lot of time:
4066                  */
4067                 touch_nmi_watchdog();
4068                 show_task(p);
4069         } while_each_thread(g, p);
4070
4071         read_unlock(&tasklist_lock);
4072 }
4073
4074 void __devinit init_idle(task_t *idle, int cpu)
4075 {
4076         runqueue_t *rq = cpu_rq(cpu);
4077         unsigned long flags;
4078
4079         idle->sleep_avg = 0;
4080         idle->array = NULL;
4081         idle->prio = MAX_PRIO;
4082         idle->state = TASK_RUNNING;
4083         idle->cpus_allowed = cpumask_of_cpu(cpu);
4084         set_task_cpu(idle, cpu);
4085
4086         spin_lock_irqsave(&rq->lock, flags);
4087         rq->curr = rq->idle = idle;
4088         set_tsk_need_resched(idle);
4089         spin_unlock_irqrestore(&rq->lock, flags);
4090
4091         /* Set the preempt count _outside_ the spinlocks! */
4092 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4093         idle->thread_info->preempt_count = (idle->lock_depth >= 0);
4094 #else
4095         idle->thread_info->preempt_count = 0;
4096 #endif
4097 }
4098
4099 /*
4100  * In a system that switches off the HZ timer, nohz_cpu_mask
4101  * indicates which CPUs entered this state. This is used
4102  * by the RCU update code to wait only for active CPUs. For systems
4103  * which do not switch off the HZ timer, nohz_cpu_mask should
4104  * always be CPU_MASK_NONE.
4105  */
4106 cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4107
4108 #ifdef CONFIG_SMP
4109 /*
4110  * This is how migration works:
4111  *
4112  * 1) we queue a migration_req_t structure in the source CPU's
4113  *    runqueue and wake up that CPU's migration thread.
4114  * 2) we wait on the request's 'done' completion => thread blocks.
4115  * 3) migration thread wakes up (implicitly it forces the migrated
4116  *    thread off the CPU)
4117  * 4) it gets the migration request and checks whether the migrated
4118  *    task is still in the wrong runqueue.
4119  * 5) if it's in the wrong runqueue then the migration thread removes
4120  *    it and puts it into the right queue.
4121  * 6) migration thread completes the request's 'done' completion.
4122  * 7) we wake up and the migration is done.
4123  */
4124
4125 /*
4126  * Change a given task's CPU affinity. Migrate the thread to a
4127  * proper CPU and schedule it away if the CPU it's executing on
4128  * is removed from the allowed bitmask.
4129  *
4130  * NOTE: the caller must have a valid reference to the task, the
4131  * task must not exit() & deallocate itself prematurely.  The
4132  * call is not atomic; no spinlocks may be held.
4133  */
4134 int set_cpus_allowed(task_t *p, cpumask_t new_mask)
4135 {
4136         unsigned long flags;
4137         int ret = 0;
4138         migration_req_t req;
4139         runqueue_t *rq;
4140
4141         rq = task_rq_lock(p, &flags);
4142         if (!cpus_intersects(new_mask, cpu_online_map)) {
4143                 ret = -EINVAL;
4144                 goto out;
4145         }
4146
4147         p->cpus_allowed = new_mask;
4148         /* Can the task run on the task's current CPU? If so, we're done */
4149         if (cpu_isset(task_cpu(p), new_mask))
4150                 goto out;
4151
4152         if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4153                 /* Need help from migration thread: drop lock and wait. */
4154                 task_rq_unlock(rq, &flags);
4155                 wake_up_process(rq->migration_thread);
4156                 wait_for_completion(&req.done);
4157                 tlb_migrate_finish(p->mm);
4158                 return 0;
4159         }
4160 out:
4161         task_rq_unlock(rq, &flags);
4162         return ret;
4163 }
4164
4165 EXPORT_SYMBOL_GPL(set_cpus_allowed);
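
/*
 * Illustrative sketch of in-kernel use of set_cpus_allowed(): confine an
 * already-running kernel thread to a single CPU.  The helper name is made
 * up; cpumask_of_cpu() builds a mask containing just that CPU, and the call
 * may block in wait_for_completion() while the migration thread moves the
 * task, so it must not be made from atomic context.
 */
#if 0
#include <linux/sched.h>
#include <linux/cpumask.h>

static int example_pin_to_cpu(struct task_struct *tsk, int cpu)
{
        return set_cpus_allowed(tsk, cpumask_of_cpu(cpu));
}
#endif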
4166
4167 /*
4168  * Move (not current) task off this cpu, onto dest cpu.  We're doing
4169  * this because either it can't run here any more (set_cpus_allowed()
4170  * away from this CPU, or CPU going down), or because we're
4171  * attempting to rebalance this task on exec (sched_exec).
4172  *
4173  * So we race with normal scheduler movements, but that's OK, as long
4174  * as the task is no longer on this CPU.
4175  */
4176 static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4177 {
4178         runqueue_t *rq_dest, *rq_src;
4179
4180         if (unlikely(cpu_is_offline(dest_cpu)))
4181                 return;
4182
4183         rq_src = cpu_rq(src_cpu);
4184         rq_dest = cpu_rq(dest_cpu);
4185
4186         double_rq_lock(rq_src, rq_dest);
4187         /* Already moved. */
4188         if (task_cpu(p) != src_cpu)
4189                 goto out;
4190         /* Affinity changed (again). */
4191         if (!cpu_isset(dest_cpu, p->cpus_allowed))
4192                 goto out;
4193
4194         set_task_cpu(p, dest_cpu);
4195         if (p->array) {
4196                 /*
4197                  * Sync timestamp with rq_dest's before activating.
4198                  * The same thing could be achieved by doing this step
4199                  * afterwards, and pretending it was a local activate.
4200                  * This way is cleaner and logically correct.
4201                  */
4202                 p->timestamp = p->timestamp - rq_src->timestamp_last_tick
4203                                 + rq_dest->timestamp_last_tick;
4204                 deactivate_task(p, rq_src);
4205                 activate_task(p, rq_dest, 0);
4206                 if (TASK_PREEMPTS_CURR(p, rq_dest))
4207                         resched_task(rq_dest->curr);
4208         }
4209
4210 out:
4211         double_rq_unlock(rq_src, rq_dest);
4212 }
4213
4214 /*
4215  * migration_thread - this is a highprio system thread that performs
4216  * thread migration by bumping the thread off its CPU and then 'pushing' it
4217  * onto another runqueue.
4218  */
4219 static int migration_thread(void * data)
4220 {
4221         runqueue_t *rq;
4222         int cpu = (long)data;
4223
4224         rq = cpu_rq(cpu);
4225         BUG_ON(rq->migration_thread != current);
4226
4227         set_current_state(TASK_INTERRUPTIBLE);
4228         while (!kthread_should_stop()) {
4229                 struct list_head *head;
4230                 migration_req_t *req;
4231
4232                 if (current->flags & PF_FREEZE)
4233                         refrigerator(PF_FREEZE);
4234
4235                 spin_lock_irq(&rq->lock);
4236
4237                 if (cpu_is_offline(cpu)) {
4238                         spin_unlock_irq(&rq->lock);
4239                         goto wait_to_die;
4240                 }
4241
4242                 if (rq->active_balance) {
4243                         active_load_balance(rq, cpu);
4244                         rq->active_balance = 0;
4245                 }
4246
4247                 head = &rq->migration_queue;
4248
4249                 if (list_empty(head)) {
4250                         spin_unlock_irq(&rq->lock);
4251                         schedule();
4252                         set_current_state(TASK_INTERRUPTIBLE);
4253                         continue;
4254                 }
4255                 req = list_entry(head->next, migration_req_t, list);
4256                 list_del_init(head->next);
4257
4258                 if (req->type == REQ_MOVE_TASK) {
4259                         spin_unlock(&rq->lock);
4260                         __migrate_task(req->task, cpu, req->dest_cpu);
4261                         local_irq_enable();
4262                 } else if (req->type == REQ_SET_DOMAIN) {
4263                         rq->sd = req->sd;
4264                         spin_unlock_irq(&rq->lock);
4265                 } else {
4266                         spin_unlock_irq(&rq->lock);
4267                         WARN_ON(1);
4268                 }
4269
4270                 complete(&req->done);
4271         }
4272         __set_current_state(TASK_RUNNING);
4273         return 0;
4274
4275 wait_to_die:
4276         /* Wait for kthread_stop */
4277         set_current_state(TASK_INTERRUPTIBLE);
4278         while (!kthread_should_stop()) {
4279                 schedule();
4280                 set_current_state(TASK_INTERRUPTIBLE);
4281         }
4282         __set_current_state(TASK_RUNNING);
4283         return 0;
4284 }
4285
4286 #ifdef CONFIG_HOTPLUG_CPU
4287 /* Figure out where a task on the dead CPU should go, use force if necessary. */
4288 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
4289 {
4290         int dest_cpu;
4291         cpumask_t mask;
4292
4293         /* On same node? */
4294         mask = node_to_cpumask(cpu_to_node(dead_cpu));
4295         cpus_and(mask, mask, tsk->cpus_allowed);
4296         dest_cpu = any_online_cpu(mask);
4297
4298         /* On any allowed CPU? */
4299         if (dest_cpu == NR_CPUS)
4300                 dest_cpu = any_online_cpu(tsk->cpus_allowed);
4301
4302         /* No more Mr. Nice Guy. */
4303         if (dest_cpu == NR_CPUS) {
4304                 cpus_setall(tsk->cpus_allowed);
4305                 dest_cpu = any_online_cpu(tsk->cpus_allowed);
4306
4307                 /*
4308                  * Don't tell them about moving exiting tasks or
4309                  * kernel threads (both mm NULL), since they never
4310                  * leave kernel.
4311                  */
4312                 if (tsk->mm && printk_ratelimit())
4313                         printk(KERN_INFO "process %d (%s) no "
4314                                "longer affine to cpu%d\n",
4315                                tsk->pid, tsk->comm, dead_cpu);
4316         }
4317         __migrate_task(tsk, dead_cpu, dest_cpu);
4318 }
4319
4320 /*
4321  * While a dead CPU has no uninterruptible tasks queued at this point,
4322  * it might still have a nonzero ->nr_uninterruptible counter, because
4323  * for performance reasons the counter is not strictly tracking tasks to
4324  * their home CPUs. So we just add the counter to another CPU's counter,
4325  * to keep the global sum constant after CPU-down:
4326  */
4327 static void migrate_nr_uninterruptible(runqueue_t *rq_src)
4328 {
4329         runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
4330         unsigned long flags;
4331
4332         local_irq_save(flags);
4333         double_rq_lock(rq_src, rq_dest);
4334         rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
4335         rq_src->nr_uninterruptible = 0;
4336         double_rq_unlock(rq_src, rq_dest);
4337         local_irq_restore(flags);
4338 }
4339
4340 /* Run through task list and migrate tasks from the dead cpu. */
4341 static void migrate_live_tasks(int src_cpu)
4342 {
4343         struct task_struct *tsk, *t;
4344
4345         write_lock_irq(&tasklist_lock);
4346
4347         do_each_thread(t, tsk) {
4348                 if (tsk == current)
4349                         continue;
4350
4351                 if (task_cpu(tsk) == src_cpu)
4352                         move_task_off_dead_cpu(src_cpu, tsk);
4353         } while_each_thread(t, tsk);
4354
4355         write_unlock_irq(&tasklist_lock);
4356 }
4357
4358 /* Schedules idle task to be the next runnable task on current CPU.
4359  * It does so by boosting its priority to the highest possible and adding it to
4360  * the _front_ of the runqueue. Used by CPU offline code.
4361  */
4362 void sched_idle_next(void)
4363 {
4364         int cpu = smp_processor_id();
4365         runqueue_t *rq = this_rq();
4366         struct task_struct *p = rq->idle;
4367         unsigned long flags;
4368
4369         /* cpu has to be offline */
4370         BUG_ON(cpu_online(cpu));
4371
4372         /* Strictly not necessary since the rest of the CPUs are stopped by now
4373          * and interrupts are disabled on the current cpu.
4374          */
4375         spin_lock_irqsave(&rq->lock, flags);
4376
4377         __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
4378         /* Add idle task to the _front_ of its priority queue */
4379         __activate_idle_task(p, rq);
4380
4381         spin_unlock_irqrestore(&rq->lock, flags);
4382 }
4383
4384 /* Ensures that the idle task is using init_mm right before its cpu goes
4385  * offline.
4386  */
4387 void idle_task_exit(void)
4388 {
4389         struct mm_struct *mm = current->active_mm;
4390
4391         BUG_ON(cpu_online(smp_processor_id()));
4392
4393         if (mm != &init_mm)
4394                 switch_mm(mm, &init_mm, current);
4395         mmdrop(mm);
4396 }
4397
4398 static void migrate_dead(unsigned int dead_cpu, task_t *tsk)
4399 {
4400         struct runqueue *rq = cpu_rq(dead_cpu);
4401
4402         /* Must be exiting, otherwise would be on tasklist. */
4403         BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD);
4404
4405         /* Cannot have done final schedule yet: would have vanished. */
4406         BUG_ON(tsk->flags & PF_DEAD);
4407
4408         get_task_struct(tsk);
4409
4410         /*
4411          * Drop lock around migration; if someone else moves it,
4412          * that's OK.  No task can be added to this CPU, so iteration is
4413          * fine.
4414          */
4415         spin_unlock_irq(&rq->lock);
4416         move_task_off_dead_cpu(dead_cpu, tsk);
4417         spin_lock_irq(&rq->lock);
4418
4419         put_task_struct(tsk);
4420 }
4421
4422 /* release_task() removes task from tasklist, so we won't find dead tasks. */
4423 static void migrate_dead_tasks(unsigned int dead_cpu)
4424 {
4425         unsigned arr, i;
4426         struct runqueue *rq = cpu_rq(dead_cpu);
4427
4428         for (arr = 0; arr < 2; arr++) {
4429                 for (i = 0; i < MAX_PRIO; i++) {
4430                         struct list_head *list = &rq->arrays[arr].queue[i];
4431                         while (!list_empty(list))
4432                                 migrate_dead(dead_cpu,
4433                                              list_entry(list->next, task_t,
4434                                                         run_list));
4435                 }
4436         }
4437 }
4438 #endif /* CONFIG_HOTPLUG_CPU */
4439
4440 /*
4441  * migration_call - callback that gets triggered when a CPU is added.
4442  * Here we can start up the necessary migration thread for the new CPU.
4443  */
4444 static int migration_call(struct notifier_block *nfb, unsigned long action,
4445                           void *hcpu)
4446 {
4447         int cpu = (long)hcpu;
4448         struct task_struct *p;
4449         struct runqueue *rq;
4450         unsigned long flags;
4451
4452         switch (action) {
4453         case CPU_UP_PREPARE:
4454                 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
4455                 if (IS_ERR(p))
4456                         return NOTIFY_BAD;
4457                 p->flags |= PF_NOFREEZE;
4458                 kthread_bind(p, cpu);
4459                 /* Must be high prio: stop_machine expects to yield to it. */
4460                 rq = task_rq_lock(p, &flags);
4461                 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
4462                 task_rq_unlock(rq, &flags);
4463                 cpu_rq(cpu)->migration_thread = p;
4464                 break;
4465         case CPU_ONLINE:
4466                 /* Strictly unnecessary, as the first user will wake it. */
4467                 wake_up_process(cpu_rq(cpu)->migration_thread);
4468                 break;
4469 #ifdef CONFIG_HOTPLUG_CPU
4470         case CPU_UP_CANCELED:
4471                 /* Unbind it from offline cpu so it can run.  Fall thru. */
4472                 kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
4473                 kthread_stop(cpu_rq(cpu)->migration_thread);
4474                 cpu_rq(cpu)->migration_thread = NULL;
4475                 break;
4476         case CPU_DEAD:
4477                 migrate_live_tasks(cpu);
4478                 rq = cpu_rq(cpu);
4479                 kthread_stop(rq->migration_thread);
4480                 rq->migration_thread = NULL;
4481                 /* Idle task back to normal (off runqueue, low prio) */
4482                 rq = task_rq_lock(rq->idle, &flags);
4483                 deactivate_task(rq->idle, rq);
4484                 rq->idle->static_prio = MAX_PRIO;
4485                 __setscheduler(rq->idle, SCHED_NORMAL, 0);
4486                 migrate_dead_tasks(cpu);
4487                 task_rq_unlock(rq, &flags);
4488                 migrate_nr_uninterruptible(rq);
4489                 BUG_ON(rq->nr_running != 0);
4490
4491                 /* No need to migrate the tasks: it was best-effort if
4492                  * they didn't do lock_cpu_hotplug().  Just wake up
4493                  * the requestors. */
4494                 spin_lock_irq(&rq->lock);
4495                 while (!list_empty(&rq->migration_queue)) {
4496                         migration_req_t *req;
4497                         req = list_entry(rq->migration_queue.next,
4498                                          migration_req_t, list);
4499                         BUG_ON(req->type != REQ_MOVE_TASK);
4500                         list_del_init(&req->list);
4501                         complete(&req->done);
4502                 }
4503                 spin_unlock_irq(&rq->lock);
4504                 break;
4505 #endif
4506         }
4507         return NOTIFY_OK;
4508 }
4509
4510 /* Register at highest priority so that task migration (migrate_all_tasks)
4511  * happens before everything else.
4512  */
4513 static struct notifier_block __devinitdata migration_notifier = {
4514         .notifier_call = migration_call,
4515         .priority = 10
4516 };
4517
4518 int __init migration_init(void)
4519 {
4520         void *cpu = (void *)(long)smp_processor_id();
4521         /* Start one for boot CPU. */
4522         migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
4523         migration_call(&migration_notifier, CPU_ONLINE, cpu);
4524         register_cpu_notifier(&migration_notifier);
4525         return 0;
4526 }
4527 #endif
4528
4529 #ifdef CONFIG_SMP
4530 #define SCHED_DOMAIN_DEBUG
4531 #ifdef SCHED_DOMAIN_DEBUG
4532 static void sched_domain_debug(struct sched_domain *sd, int cpu)
4533 {
4534         int level = 0;
4535
4536         printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
4537
4538         do {
4539                 int i;
4540                 char str[NR_CPUS];
4541                 struct sched_group *group = sd->groups;
4542                 cpumask_t groupmask;
4543
4544                 cpumask_scnprintf(str, NR_CPUS, sd->span);
4545                 cpus_clear(groupmask);
4546
4547                 printk(KERN_DEBUG);
4548                 for (i = 0; i < level + 1; i++)
4549                         printk(" ");
4550                 printk("domain %d: ", level);
4551
4552                 if (!(sd->flags & SD_LOAD_BALANCE)) {
4553                         printk("does not load-balance\n");
4554                         if (sd->parent)
4555                                 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
4556                         break;
4557                 }
4558
4559                 printk("span %s\n", str);
4560
4561                 if (!cpu_isset(cpu, sd->span))
4562                         printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
4563                 if (!cpu_isset(cpu, group->cpumask))
4564                         printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
4565
4566                 printk(KERN_DEBUG);
4567                 for (i = 0; i < level + 2; i++)
4568                         printk(" ");
4569                 printk("groups:");
4570                 do {
4571                         if (!group) {
4572                                 printk("\n");
4573                                 printk(KERN_ERR "ERROR: group is NULL\n");
4574                                 break;
4575                         }
4576
4577                         if (!group->cpu_power) {
4578                                 printk("\n");
4579                                 printk(KERN_ERR "ERROR: domain->cpu_power not set\n");
4580                         }
4581
4582                         if (!cpus_weight(group->cpumask)) {
4583                                 printk("\n");
4584                                 printk(KERN_ERR "ERROR: empty group\n");
4585                         }
4586
4587                         if (cpus_intersects(groupmask, group->cpumask)) {
4588                                 printk("\n");
4589                                 printk(KERN_ERR "ERROR: repeated CPUs\n");
4590                         }
4591
4592                         cpus_or(groupmask, groupmask, group->cpumask);
4593
4594                         cpumask_scnprintf(str, NR_CPUS, group->cpumask);
4595                         printk(" %s", str);
4596
4597                         group = group->next;
4598                 } while (group != sd->groups);
4599                 printk("\n");
4600
4601                 if (!cpus_equal(sd->span, groupmask))
4602                         printk(KERN_ERR "ERROR: groups don't span domain->span\n");
4603
4604                 level++;
4605                 sd = sd->parent;
4606
4607                 if (sd) {
4608                         if (!cpus_subset(groupmask, sd->span))
4609                                 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
4610                 }
4611
4612         } while (sd);
4613 }
4614 #else
4615 #define sched_domain_debug(sd, cpu) {}
4616 #endif
4617
4618 /*
4619  * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
4620  * hold the hotplug lock.
4621  */
4622 void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu)
4623 {
4624         migration_req_t req;
4625         unsigned long flags;
4626         runqueue_t *rq = cpu_rq(cpu);
4627         int local = 1;
4628
4629         sched_domain_debug(sd, cpu);
4630
4631         spin_lock_irqsave(&rq->lock, flags);
4632
4633         if (cpu == smp_processor_id() || !cpu_online(cpu)) {
4634                 rq->sd = sd;
4635         } else {
4636                 init_completion(&req.done);
4637                 req.type = REQ_SET_DOMAIN;
4638                 req.sd = sd;
4639                 list_add(&req.list, &rq->migration_queue);
4640                 local = 0;
4641         }
4642
4643         spin_unlock_irqrestore(&rq->lock, flags);
4644
4645         if (!local) {
4646                 wake_up_process(rq->migration_thread);
4647                 wait_for_completion(&req.done);
4648         }
4649 }
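/*
 * Note on the two paths above: when 'cpu' is the calling CPU (or is not yet
 * online), rq->sd is switched directly under rq->lock.  Otherwise a
 * REQ_SET_DOMAIN request is queued on the target runqueue and its migration
 * thread is woken to process it; the wait_for_completion() above blocks
 * until that has happened, so the attach is complete on return.
 */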
4650
4651 /* cpus with isolated domains */
4652 cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
4653
4654 /* Setup the mask of cpus configured for isolated domains */
4655 static int __init isolated_cpu_setup(char *str)
4656 {
4657         int ints[NR_CPUS], i;
4658
4659         str = get_options(str, ARRAY_SIZE(ints), ints);
4660         cpus_clear(cpu_isolated_map);
4661         for (i = 1; i <= ints[0]; i++)
4662                 if (ints[i] < NR_CPUS)
4663                         cpu_set(ints[i], cpu_isolated_map);
4664         return 1;
4665 }
4666
4667 __setup("isolcpus=", isolated_cpu_setup);
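/*
 * Example (illustrative): booting with "isolcpus=2,3" puts CPUs 2 and 3 into
 * cpu_isolated_map, so arch_init_sched_domains() below excludes them from
 * cpu_default_map.  They stay attached to the dummy domain and are therefore
 * never load-balanced onto or off of.
 */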
4668
4669 /*
4670  * init_sched_build_groups takes an array of groups, the cpumask we wish
4671  * to span, and a pointer to a function which identifies what group a CPU
4672  * belongs to. The return value of group_fn must be a valid index into the
4673  * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we
4674  * keep track of groups covered with a cpumask_t).
4675  *
4676  * init_sched_build_groups will build a circular linked list of the groups
4677  * covered by the given span, and will set each group's ->cpumask correctly,
4678  * and ->cpu_power to 0.
4679  */
4680 void __devinit init_sched_build_groups(struct sched_group groups[],
4681                         cpumask_t span, int (*group_fn)(int cpu))
4682 {
4683         struct sched_group *first = NULL, *last = NULL;
4684         cpumask_t covered = CPU_MASK_NONE;
4685         int i;
4686
4687         for_each_cpu_mask(i, span) {
4688                 int group = group_fn(i);
4689                 struct sched_group *sg = &groups[group];
4690                 int j;
4691
4692                 if (cpu_isset(i, covered))
4693                         continue;
4694
4695                 sg->cpumask = CPU_MASK_NONE;
4696                 sg->cpu_power = 0;
4697
4698                 for_each_cpu_mask(j, span) {
4699                         if (group_fn(j) != group)
4700                                 continue;
4701
4702                         cpu_set(j, covered);
4703                         cpu_set(j, sg->cpumask);
4704                 }
4705                 if (!first)
4706                         first = sg;
4707                 if (last)
4708                         last->next = sg;
4709                 last = sg;
4710         }
4711         last->next = first;
4712 }
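/*
 * Illustrative sketch (not kernel code): for span = { 0, 1, 2, 3 } and a
 * hypothetical group_fn that returns cpu / 2, the loop above produces
 *
 *	groups[0].cpumask = { 0, 1 }	groups[1].cpumask = { 2, 3 }
 *	groups[0].next = &groups[1]	groups[1].next = &groups[0]
 *
 * i.e. a circular list of two groups, each with ->cpu_power left at 0 for
 * the caller to fill in.
 */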
4713
4714
4715 #ifdef ARCH_HAS_SCHED_DOMAIN
4716 extern void __devinit arch_init_sched_domains(void);
4717 extern void __devinit arch_destroy_sched_domains(void);
4718 #else
4719 #ifdef CONFIG_SCHED_SMT
4720 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
4721 static struct sched_group sched_group_cpus[NR_CPUS];
4722 static int __devinit cpu_to_cpu_group(int cpu)
4723 {
4724         return cpu;
4725 }
4726 #endif
4727
4728 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
4729 static struct sched_group sched_group_phys[NR_CPUS];
4730 static int __devinit cpu_to_phys_group(int cpu)
4731 {
4732 #ifdef CONFIG_SCHED_SMT
4733         return first_cpu(cpu_sibling_map[cpu]);
4734 #else
4735         return cpu;
4736 #endif
4737 }
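/*
 * With CONFIG_SCHED_SMT all hardware threads of a package share one physical
 * group, identified by the first CPU in their sibling map; without SMT every
 * CPU is its own physical group.
 */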
4738
4739 #ifdef CONFIG_NUMA
4740
4741 static DEFINE_PER_CPU(struct sched_domain, node_domains);
4742 static struct sched_group sched_group_nodes[MAX_NUMNODES];
4743 static int __devinit cpu_to_node_group(int cpu)
4744 {
4745         return cpu_to_node(cpu);
4746 }
4747 #endif
4748
4749 #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
4750 /*
4751  * The domains setup code relies on siblings not spanning
4752  * multiple nodes. Make sure the architecture has a proper
4753  * siblings map:
4754  */
4755 static void check_sibling_maps(void)
4756 {
4757         int i, j;
4758
4759         for_each_online_cpu(i) {
4760                 for_each_cpu_mask(j, cpu_sibling_map[i]) {
4761                         if (cpu_to_node(i) != cpu_to_node(j)) {
4762                                 printk(KERN_INFO "warning: CPU %d siblings map "
4763                                         "to different node - isolating "
4764                                         "them.\n", i);
4765                                 cpu_sibling_map[i] = cpumask_of_cpu(i);
4766                                 break;
4767                         }
4768                 }
4769         }
4770 }
4771 #endif
4772
4773 /*
4774  * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
4775  */
4776 static void __devinit arch_init_sched_domains(void)
4777 {
4778         int i;
4779         cpumask_t cpu_default_map;
4780
4781 #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
4782         check_sibling_maps();
4783 #endif
4784         /*
4785          * Set up the mask of cpus without special-case scheduling requirements.
4786          * For now this just excludes isolated cpus, but could be used to
4787          * exclude other special cases in the future.
4788          */
4789         cpus_complement(cpu_default_map, cpu_isolated_map);
4790         cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
4791
4792         /*
4793          * Set up domains. Isolated domains just stay on the dummy domain.
4794          */
4795         for_each_cpu_mask(i, cpu_default_map) {
4796                 int group;
4797                 struct sched_domain *sd = NULL, *p;
4798                 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
4799
4800                 cpus_and(nodemask, nodemask, cpu_default_map);
4801
4802 #ifdef CONFIG_NUMA
4803                 sd = &per_cpu(node_domains, i);
4804                 group = cpu_to_node_group(i);
4805                 *sd = SD_NODE_INIT;
4806                 sd->span = cpu_default_map;
4807                 sd->groups = &sched_group_nodes[group];
4808 #endif
4809
4810                 p = sd;
4811                 sd = &per_cpu(phys_domains, i);
4812                 group = cpu_to_phys_group(i);
4813                 *sd = SD_CPU_INIT;
4814                 sd->span = nodemask;
4815                 sd->parent = p;
4816                 sd->groups = &sched_group_phys[group];
4817
4818 #ifdef CONFIG_SCHED_SMT
4819                 p = sd;
4820                 sd = &per_cpu(cpu_domains, i);
4821                 group = cpu_to_cpu_group(i);
4822                 *sd = SD_SIBLING_INIT;
4823                 sd->span = cpu_sibling_map[i];
4824                 cpus_and(sd->span, sd->span, cpu_default_map);
4825                 sd->parent = p;
4826                 sd->groups = &sched_group_cpus[group];
4827 #endif
4828         }
4829
4830 #ifdef CONFIG_SCHED_SMT
4831         /* Set up CPU (sibling) groups */
4832         for_each_online_cpu(i) {
4833                 cpumask_t this_sibling_map = cpu_sibling_map[i];
4834                 cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
4835                 if (i != first_cpu(this_sibling_map))
4836                         continue;
4837
4838                 init_sched_build_groups(sched_group_cpus, this_sibling_map,
4839                                                 &cpu_to_cpu_group);
4840         }
4841 #endif
4842
4843         /* Set up physical groups */
4844         for (i = 0; i < MAX_NUMNODES; i++) {
4845                 cpumask_t nodemask = node_to_cpumask(i);
4846
4847                 cpus_and(nodemask, nodemask, cpu_default_map);
4848                 if (cpus_empty(nodemask))
4849                         continue;
4850
4851                 init_sched_build_groups(sched_group_phys, nodemask,
4852                                                 &cpu_to_phys_group);
4853         }
4854
4855 #ifdef CONFIG_NUMA
4856         /* Set up node groups */
4857         init_sched_build_groups(sched_group_nodes, cpu_default_map,
4858                                         &cpu_to_node_group);
4859 #endif
4860
4861         /* Calculate CPU power for physical packages and nodes */
4862         for_each_cpu_mask(i, cpu_default_map) {
4863                 int power;
4864                 struct sched_domain *sd;
4865 #ifdef CONFIG_SCHED_SMT
4866                 sd = &per_cpu(cpu_domains, i);
4867                 power = SCHED_LOAD_SCALE;
4868                 sd->groups->cpu_power = power;
4869 #endif
4870
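                /*
                 * Worked example of the formula below: a physical package
                 * whose group contains two sibling CPUs gets cpu_power =
                 * SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * 1 / 10, i.e. about
                 * 1.1x a single CPU, reflecting that an extra hardware
                 * thread only adds roughly 10% capacity.
                 */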
4871                 sd = &per_cpu(phys_domains, i);
4872                 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
4873                                 (cpus_weight(sd->groups->cpumask)-1) / 10;
4874                 sd->groups->cpu_power = power;
4875
4876 #ifdef CONFIG_NUMA
4877                 if (i == first_cpu(sd->groups->cpumask)) {
4878                         /* Only add "power" once for each physical package. */
4879                         sd = &per_cpu(node_domains, i);
4880                         sd->groups->cpu_power += power;
4881                 }
4882 #endif
4883         }
4884
4885         /* Attach the domains */
4886         for_each_online_cpu(i) {
4887                 struct sched_domain *sd;
4888 #ifdef CONFIG_SCHED_SMT
4889                 sd = &per_cpu(cpu_domains, i);
4890 #else
4891                 sd = &per_cpu(phys_domains, i);
4892 #endif
4893                 cpu_attach_domain(sd, i);
4894         }
4895 }
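/*
 * Illustrative result of the setup above on an SMT + NUMA machine: each
 * CPU i ends up with a three-level hierarchy,
 *
 *	cpu_domains[i]   - spans the SMT siblings of i
 *	  parent: phys_domains[i] - spans i's node
 *	    parent: node_domains[i] - spans all non-isolated online cpus
 *
 * and the attach loop above installs the lowest level as rq->sd.
 */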
4896
4897 #ifdef CONFIG_HOTPLUG_CPU
4898 static void __devinit arch_destroy_sched_domains(void)
4899 {
4900         /* Do nothing: everything is statically allocated. */
4901 }
4902 #endif
4903
4904 #endif /* ARCH_HAS_SCHED_DOMAIN */
4905
4906 /*
4907  * Initial dummy domain for early boot and for hotplug cpu. Being static,
4908  * it is initialized to zero, so all balancing flags are cleared, which is
4909  * what we want.
4910  */
4911 static struct sched_domain sched_domain_dummy;
4912
4913 #ifdef CONFIG_HOTPLUG_CPU
4914 /*
4915  * Force a reinitialization of the sched domains hierarchy.  The domains
4916  * and groups cannot be updated in place without racing with the balancing
4917  * code, so we temporarily attach all running cpus to a "dummy" domain
4918  * which will prevent rebalancing while the sched domains are recalculated.
4919  */
4920 static int update_sched_domains(struct notifier_block *nfb,
4921                                 unsigned long action, void *hcpu)
4922 {
4923         int i;
4924
4925         switch (action) {
4926         case CPU_UP_PREPARE:
4927         case CPU_DOWN_PREPARE:
4928                 for_each_online_cpu(i)
4929                         cpu_attach_domain(&sched_domain_dummy, i);
4930                 arch_destroy_sched_domains();
4931                 return NOTIFY_OK;
4932
4933         case CPU_UP_CANCELED:
4934         case CPU_DOWN_FAILED:
4935         case CPU_ONLINE:
4936         case CPU_DEAD:
4937                 /*
4938                  * Fall through and re-initialise the domains.
4939                  */
4940                 break;
4941         default:
4942                 return NOTIFY_DONE;
4943         }
4944
4945         /* The hotplug lock is already held by cpu_up/cpu_down */
4946         arch_init_sched_domains();
4947
4948         return NOTIFY_OK;
4949 }
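/*
 * So a CPU hotplug event is handled in two phases: at *_PREPARE every online
 * CPU falls back to the dummy domain (no balancing), and once the operation
 * has finished (or failed) the whole domain tree is rebuilt by
 * arch_init_sched_domains() above.
 */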
4950 #endif
4951
4952 void __init sched_init_smp(void)
4953 {
4954         lock_cpu_hotplug();
4955         arch_init_sched_domains();
4956         unlock_cpu_hotplug();
4957         /* XXX: Theoretical race here - CPU may be hotplugged now */
4958         hotcpu_notifier(update_sched_domains, 0);
4959 }
4960 #else
4961 void __init sched_init_smp(void)
4962 {
4963 }
4964 #endif /* CONFIG_SMP */
4965
4966 int in_sched_functions(unsigned long addr)
4967 {
4968         /* Linker adds these: start and end of __sched functions */
4969         extern char __sched_text_start[], __sched_text_end[];
4970         return in_lock_functions(addr) ||
4971                 (addr >= (unsigned long)__sched_text_start
4972                 && addr < (unsigned long)__sched_text_end);
4973 }
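/*
 * Used e.g. by the per-arch get_wchan() implementations to skip over
 * scheduler and locking internals when reporting where a blocked task is
 * waiting.
 */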
4974
4975 void __init sched_init(void)
4976 {
4977         runqueue_t *rq;
4978         int i, j, k;
4979
4980         for (i = 0; i < NR_CPUS; i++) {
4981                 prio_array_t *array;
4982
4983                 rq = cpu_rq(i);
4984                 spin_lock_init(&rq->lock);
4985                 rq->nr_running = 0;
4986                 rq->active = rq->arrays;
4987                 rq->expired = rq->arrays + 1;
4988                 rq->best_expired_prio = MAX_PRIO;
4989
4990 #ifdef CONFIG_SMP
4991                 rq->sd = &sched_domain_dummy;
4992                 for (j = 0; j < 3; j++)
4993                         rq->cpu_load[j] = 0;
4994                 rq->active_balance = 0;
4995                 rq->push_cpu = 0;
4996                 rq->migration_thread = NULL;
4997                 INIT_LIST_HEAD(&rq->migration_queue);
4998 #endif
4999                 atomic_set(&rq->nr_iowait, 0);
5000
5001                 for (j = 0; j < 2; j++) {
5002                         array = rq->arrays + j;
5003                         for (k = 0; k < MAX_PRIO; k++) {
5004                                 INIT_LIST_HEAD(array->queue + k);
5005                                 __clear_bit(k, array->bitmap);
5006                         }
5007                         /* delimiter for bitsearch */
5008                         __set_bit(MAX_PRIO, array->bitmap);
5009                 }
5010         }
5011
5012         /*
5013          * The boot idle thread does lazy MMU switching as well:
5014          */
5015         atomic_inc(&init_mm.mm_count);
5016         enter_lazy_tlb(&init_mm, current);
5017
5018         /*
5019          * Make us the idle thread. Technically, schedule() should not be
5020          * called from this thread; however, somewhere below it might be.
5021          * Because we are the idle thread, we just pick up running again
5022          * when this runqueue becomes "idle".
5023          */
5024         init_idle(current, smp_processor_id());
5025 }
5026
5027 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
5028 void __might_sleep(char *file, int line)
5029 {
5030 #if defined(in_atomic)
5031         static unsigned long prev_jiffy;        /* ratelimiting */
5032
5033         if ((in_atomic() || irqs_disabled()) &&
5034             system_state == SYSTEM_RUNNING && !oops_in_progress) {
5035                 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
5036                         return;
5037                 prev_jiffy = jiffies;
5038                 printk(KERN_ERR "Debug: sleeping function called from invalid"
5039                                 " context at %s:%d\n", file, line);
5040                 printk("in_atomic():%d, irqs_disabled():%d\n",
5041                         in_atomic(), irqs_disabled());
5042                 dump_stack();
5043         }
5044 #endif
5045 }
5046 EXPORT_SYMBOL(__might_sleep);
5047 #endif
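/*
 * Illustrative use: a function that may block can start with
 *
 *	might_sleep();
 *
 * which, with CONFIG_DEBUG_SPINLOCK_SLEEP enabled, is expected to expand to
 * __might_sleep(__FILE__, __LINE__) and so warns if the call is reached
 * while atomic or with interrupts disabled.
 */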
5048
5049 #ifdef CONFIG_MAGIC_SYSRQ
5050 void normalize_rt_tasks(void)
5051 {
5052         struct task_struct *p;
5053         prio_array_t *array;
5054         unsigned long flags;
5055         runqueue_t *rq;
5056
5057         read_lock_irq(&tasklist_lock);
5058         for_each_process(p) {
5059                 if (!rt_task(p))
5060                         continue;
5061
5062                 rq = task_rq_lock(p, &flags);
5063
5064                 array = p->array;
5065                 if (array)
5066                         deactivate_task(p, task_rq(p));
5067                 __setscheduler(p, SCHED_NORMAL, 0);
5068                 if (array) {
5069                         __activate_task(p, task_rq(p));
5070                         resched_task(rq->curr);
5071                 }
5072
5073                 task_rq_unlock(rq, &flags);
5074         }
5075         read_unlock_irq(&tasklist_lock);
5076 }
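/*
 * Intended to be triggered from the magic SysRq handler: every SCHED_FIFO/
 * SCHED_RR task is dequeued, switched back to SCHED_NORMAL via
 * __setscheduler(p, SCHED_NORMAL, 0) and requeued, which can help recover a
 * machine where a runaway real-time task is starving everything else.
 */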
5077
5078 #endif /* CONFIG_MAGIC_SYSRQ */