SAFE public projects git trees. - safe/jmp/linux-2.6/blob - kernel/sched_fair.c

   1 /*
   2  * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
   3  *
   4  *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
   5  *
   6  *  Interactivity improvements by Mike Galbraith
   7  *  (C) 2007 Mike Galbraith <efault@gmx.de>
   8  *
   9  *  Various enhancements by Dmitry Adamushko.
  10  *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
  11  *
  12  *  Group scheduling enhancements by Srivatsa Vaddagiri
  13  *  Copyright IBM Corporation, 2007
  14  *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  15  *
  16  *  Scaled math optimizations by Thomas Gleixner
  17  *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
  18  *
  19  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
  20  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  21  */
  22
/*
 * Targeted preemption latency for CPU-bound tasks:
 * (default: 20ms, units: nanoseconds)
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length.
 * (to see the precise effective timeslice length of your workload,
 *  run vmstat and monitor the context-switches field)
 *
 * On SMP systems the value of this is multiplied by the log2 of the
 * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way
 * systems, 4x on 8-way systems, 5x on 16-way systems, etc.)
 */
  37 const_debug unsigned int sysctl_sched_latency = 20000000ULL;
  38
  39 /*
  40  * After fork, child runs first. (default) If set to 0 then
  41  * parent will (try to) run first.
  42  */
  43 const_debug unsigned int sysctl_sched_child_runs_first = 1;
  44
  45 /*
  46  * Minimal preemption granularity for CPU-bound tasks:
  47  * (default: 2 msec, units: nanoseconds)
  48  */
  49 unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL;
  50
/*
 * sys_sched_yield() compat mode
 *
 * This option switches the aggressive yield implementation of the
 * old scheduler back on.
 */
  57 unsigned int __read_mostly sysctl_sched_compat_yield;
  58
  59 /*
  60  * SCHED_BATCH wake-up granularity.
  61  * (default: 25 msec, units: nanoseconds)
  62  *
  63  * This option delays the preemption effects of decoupled workloads
  64  * and reduces their over-scheduling. Synchronous workloads will still
  65  * have immediate wakeup/sleep latencies.
  66  */
  67 const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 25000000UL;
  68
/*
 * SCHED_OTHER wake-up granularity.
 * (default: 2 msec, units: nanoseconds)
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
  77 const_debug unsigned int sysctl_sched_wakeup_granularity = 2000000UL;
  78
  79 unsigned int sysctl_sched_runtime_limit __read_mostly;
  80
  81 extern struct sched_class fair_sched_class;
  82
  83 /**************************************************************
  84  * CFS operations on generic schedulable entities:
  85  */
  86
  87 #ifdef CONFIG_FAIR_GROUP_SCHED
  88
  89 /* cpu runqueue to which this cfs_rq is attached */
  90 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
  91 {
  92         return cfs_rq->rq;
  93 }
  94
  95 /* An entity is a task if it doesn't "own" a runqueue */
  96 #define entity_is_task(se)      (!se->my_q)
  97
  98 #else   /* CONFIG_FAIR_GROUP_SCHED */
  99
 100 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 101 {
 102         return container_of(cfs_rq, struct rq, cfs);
 103 }
 104
 105 #define entity_is_task(se)      1
 106
 107 #endif  /* CONFIG_FAIR_GROUP_SCHED */
 108
 109 static inline struct task_struct *task_of(struct sched_entity *se)
 110 {
 111         return container_of(se, struct task_struct, se);
 112 }
 113
 114
 115 /**************************************************************
 116  * Scheduling class tree data structure manipulation methods:
 117  */
 118
 119 static inline void
 120 set_leftmost(struct cfs_rq *cfs_rq, struct rb_node *leftmost)
 121 {
 122         struct sched_entity *se;
 123
 124         cfs_rq->rb_leftmost = leftmost;
 125         if (leftmost) {
 126                 se = rb_entry(leftmost, struct sched_entity, run_node);
 127                 cfs_rq->min_vruntime = max(se->vruntime,
 128                                                 cfs_rq->min_vruntime);
 129         }
 130 }
 131
 132 /*
 133  * Enqueue an entity into the rb-tree:
 134  */
 135 static void
 136 __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 137 {
 138         struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
 139         struct rb_node *parent = NULL;
 140         struct sched_entity *entry;
 141         s64 key = se->fair_key;
 142         int leftmost = 1;
 143
 144         /*
 145          * Find the right place in the rbtree:
 146          */
 147         while (*link) {
 148                 parent = *link;
 149                 entry = rb_entry(parent, struct sched_entity, run_node);
 150                 /*
 151                  * We dont care about collisions. Nodes with
 152                  * the same key stay together.
 153                  */
 154                 if (key - entry->fair_key < 0) {
 155                         link = &parent->rb_left;
 156                 } else {
 157                         link = &parent->rb_right;
 158                         leftmost = 0;
 159                 }
 160         }
 161
 162         /*
 163          * Maintain a cache of leftmost tree entries (it is frequently
 164          * used):
 165          */
 166         if (leftmost)
 167                 set_leftmost(cfs_rq, &se->run_node);
 168
 169         rb_link_node(&se->run_node, parent, link);
 170         rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
 171         update_load_add(&cfs_rq->load, se->load.weight);
 172         cfs_rq->nr_running++;
 173         se->on_rq = 1;
 174
 175         schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
 176 }
 177
 178 static void
 179 __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 180 {
 181         if (cfs_rq->rb_leftmost == &se->run_node)
 182                 set_leftmost(cfs_rq, rb_next(&se->run_node));
 183
 184         rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 185         update_load_sub(&cfs_rq->load, se->load.weight);
 186         cfs_rq->nr_running--;
 187         se->on_rq = 0;
 188
 189         schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime);
 190 }
 191
 192 static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
 193 {
 194         return cfs_rq->rb_leftmost;
 195 }
 196
 197 static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
 198 {
 199         return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
 200 }
 201
 202 static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 203 {
 204         struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
 205         struct sched_entity *se = NULL;
 206         struct rb_node *parent;
 207
 208         while (*link) {
 209                 parent = *link;
 210                 se = rb_entry(parent, struct sched_entity, run_node);
 211                 link = &parent->rb_right;
 212         }
 213
 214         return se;
 215 }
 216
 217 /**************************************************************
 218  * Scheduling class statistics methods:
 219  */
 220
 221 static u64 __sched_period(unsigned long nr_running)
 222 {
 223         u64 period = sysctl_sched_latency;
 224         unsigned long nr_latency =
 225                 sysctl_sched_latency / sysctl_sched_min_granularity;
 226
 227         if (unlikely(nr_running > nr_latency)) {
 228                 period *= nr_running;
 229                 do_div(period, nr_latency);
 230         }
 231
 232         return period;
 233 }
 234
 235 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 236 {
 237         u64 period = __sched_period(cfs_rq->nr_running);
 238
 239         period *= se->load.weight;
 240         do_div(period, cfs_rq->load.weight);
 241
 242         return period;
 243 }
 244
 245 static inline void
 246 limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se)
 247 {
 248         long limit = sysctl_sched_runtime_limit;
 249
 250         /*
 251          * Niced tasks have the same history dynamic range as
 252          * non-niced tasks:
 253          */
 254         if (unlikely(se->wait_runtime > limit)) {
 255                 se->wait_runtime = limit;
 256                 schedstat_inc(se, wait_runtime_overruns);
 257                 schedstat_inc(cfs_rq, wait_runtime_overruns);
 258         }
 259         if (unlikely(se->wait_runtime < -limit)) {
 260                 se->wait_runtime = -limit;
 261                 schedstat_inc(se, wait_runtime_underruns);
 262                 schedstat_inc(cfs_rq, wait_runtime_underruns);
 263         }
 264 }
 265
 266 static inline void
 267 __add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
 268 {
 269         se->wait_runtime += delta;
 270         schedstat_add(se, sum_wait_runtime, delta);
 271         limit_wait_runtime(cfs_rq, se);
 272 }
 273
 274 static void
 275 add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
 276 {
 277         schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime);
 278         __add_wait_runtime(cfs_rq, se, delta);
 279         schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
 280 }
 281
 282 /*
 283  * Update the current task's runtime statistics. Skip current tasks that
 284  * are not in our scheduling class.
 285  */
 286 static inline void
 287 __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 288               unsigned long delta_exec)
 289 {
 290         unsigned long delta, delta_fair, delta_mine, delta_exec_weighted;
 291         struct load_weight *lw = &cfs_rq->load;
 292         unsigned long load = lw->weight;
 293
 294         schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
 295
 296         curr->sum_exec_runtime += delta_exec;
 297         cfs_rq->exec_clock += delta_exec;
 298         delta_exec_weighted = delta_exec;
 299         if (unlikely(curr->load.weight != NICE_0_LOAD)) {
 300                 delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
 301                                                         &curr->load);
 302         }
 303         curr->vruntime += delta_exec_weighted;
 304
 305         if (!sched_feat(FAIR_SLEEPERS))
 306                 return;
 307
 308         if (unlikely(!load))
 309                 return;
 310
 311         delta_fair = calc_delta_fair(delta_exec, lw);
 312         delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw);
 313
 314         if (cfs_rq->sleeper_bonus > sysctl_sched_min_granularity) {
 315                 delta = min((u64)delta_mine, cfs_rq->sleeper_bonus);
 316                 delta = min(delta, (unsigned long)(
 317                         (long)sysctl_sched_runtime_limit - curr->wait_runtime));
 318                 cfs_rq->sleeper_bonus -= delta;
 319                 delta_mine -= delta;
 320         }
 321
 322         cfs_rq->fair_clock += delta_fair;
 323         /*
 324          * We executed delta_exec amount of time on the CPU,
 325          * but we were only entitled to delta_mine amount of
 326          * time during that period (if nr_running == 1 then
 327          * the two values are equal)
 328          * [Note: delta_mine - delta_exec is negative]:
 329          */
 330         add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec);
 331 }
 332
 333 static void update_curr(struct cfs_rq *cfs_rq)
 334 {
 335         struct sched_entity *curr = cfs_rq->curr;
 336         u64 now = rq_of(cfs_rq)->clock;
 337         unsigned long delta_exec;
 338
 339         if (unlikely(!curr))
 340                 return;
 341
 342         /*
 343          * Get the amount of time the current task was running
 344          * since the last time we changed load (this cannot
 345          * overflow on 32 bits):
 346          */
 347         delta_exec = (unsigned long)(now - curr->exec_start);
 348
 349         __update_curr(cfs_rq, curr, delta_exec);
 350         curr->exec_start = now;
 351 }
 352
 353 static inline void
 354 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 355 {
 356         se->wait_start_fair = cfs_rq->fair_clock;
 357         schedstat_set(se->wait_start, rq_of(cfs_rq)->clock);
 358 }
 359
 360 static inline unsigned long
 361 calc_weighted(unsigned long delta, struct sched_entity *se)
 362 {
 363         unsigned long weight = se->load.weight;
 364
 365         if (unlikely(weight != NICE_0_LOAD))
 366                 return (u64)delta * se->load.weight >> NICE_0_SHIFT;
 367         else
 368                 return delta;
 369 }
 370
 371 /*
 372  * Task is being enqueued - update stats:
 373  */
 374 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 375 {
 376         /*
 377          * Are we enqueueing a waiting task? (for current tasks
 378          * a dequeue/enqueue event is a NOP)
 379          */
 380         if (se != cfs_rq->curr)
 381                 update_stats_wait_start(cfs_rq, se);
 382         /*
 383          * Update the key:
 384          */
 385         se->fair_key = se->vruntime;
 386 }
 387
 388 /*
 389  * Note: must be called with a freshly updated rq->fair_clock.
 390  */
 391 static inline void
 392 __update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se,
 393                         unsigned long delta_fair)
 394 {
 395         schedstat_set(se->wait_max, max(se->wait_max,
 396                         rq_of(cfs_rq)->clock - se->wait_start));
 397
 398         delta_fair = calc_weighted(delta_fair, se);
 399
 400         add_wait_runtime(cfs_rq, se, delta_fair);
 401 }
 402
 403 static void
 404 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 405 {
 406         unsigned long delta_fair;
 407
 408         if (unlikely(!se->wait_start_fair))
 409                 return;
 410
 411         delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
 412                         (u64)(cfs_rq->fair_clock - se->wait_start_fair));
 413
 414         __update_stats_wait_end(cfs_rq, se, delta_fair);
 415
 416         se->wait_start_fair = 0;
 417         schedstat_set(se->wait_start, 0);
 418 }
 419
 420 static inline void
 421 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 422 {
 423         update_curr(cfs_rq);
 424         /*
 425          * Mark the end of the wait period if dequeueing a
 426          * waiting task:
 427          */
 428         if (se != cfs_rq->curr)
 429                 update_stats_wait_end(cfs_rq, se);
 430 }
 431
 432 /*
 433  * We are picking a new current task - update its stats:
 434  */
 435 static inline void
 436 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 437 {
 438         /*
 439          * We are starting a new run period:
 440          */
 441         se->exec_start = rq_of(cfs_rq)->clock;
 442 }
 443
 444 /*
 445  * We are descheduling a task - update its stats:
 446  */
 447 static inline void
 448 update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 449 {
 450         se->exec_start = 0;
 451 }
 452
 453 /**************************************************
 454  * Scheduling class queueing methods:
 455  */
 456
 457 static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se,
 458                               unsigned long delta_fair)
 459 {
 460         unsigned long load = cfs_rq->load.weight;
 461         long prev_runtime;
 462
 463         /*
 464          * Do not boost sleepers if there's too much bonus 'in flight'
 465          * already:
 466          */
 467         if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit))
 468                 return;
 469
 470         if (sched_feat(SLEEPER_LOAD_AVG))
 471                 load = rq_of(cfs_rq)->cpu_load[2];
 472
 473         /*
 474          * Fix up delta_fair with the effect of us running
 475          * during the whole sleep period:
 476          */
 477         if (sched_feat(SLEEPER_AVG))
 478                 delta_fair = div64_likely32((u64)delta_fair * load,
 479                                                 load + se->load.weight);
 480
 481         delta_fair = calc_weighted(delta_fair, se);
 482
 483         prev_runtime = se->wait_runtime;
 484         __add_wait_runtime(cfs_rq, se, delta_fair);
 485         delta_fair = se->wait_runtime - prev_runtime;
 486
 487         /*
 488          * Track the amount of bonus we've given to sleepers:
 489          */
 490         cfs_rq->sleeper_bonus += delta_fair;
 491 }
 492
 493 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 494 {
 495         struct task_struct *tsk = task_of(se);
 496         unsigned long delta_fair;
 497
 498         if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) ||
 499                          !sched_feat(FAIR_SLEEPERS))
 500                 return;
 501
 502         delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
 503                 (u64)(cfs_rq->fair_clock - se->sleep_start_fair));
 504
 505         __enqueue_sleeper(cfs_rq, se, delta_fair);
 506
 507         se->sleep_start_fair = 0;
 508
 509 #ifdef CONFIG_SCHEDSTATS
 510         if (se->sleep_start) {
 511                 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
 512
 513                 if ((s64)delta < 0)
 514                         delta = 0;
 515
 516                 if (unlikely(delta > se->sleep_max))
 517                         se->sleep_max = delta;
 518
 519                 se->sleep_start = 0;
 520                 se->sum_sleep_runtime += delta;
 521         }
 522         if (se->block_start) {
 523                 u64 delta = rq_of(cfs_rq)->clock - se->block_start;
 524
 525                 if ((s64)delta < 0)
 526                         delta = 0;
 527
 528                 if (unlikely(delta > se->block_max))
 529                         se->block_max = delta;
 530
 531                 se->block_start = 0;
 532                 se->sum_sleep_runtime += delta;
 533
 534                 /*
 535                  * Blocking time is in units of nanosecs, so shift by 20 to
 536                  * get a milliseconds-range estimation of the amount of
 537                  * time that the task spent sleeping:
 538                  */
 539                 if (unlikely(prof_on == SLEEP_PROFILING)) {
 540                         profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
 541                                      delta >> 20);
 542                 }
 543         }
 544 #endif
 545 }
 546
 547 static void
 548 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 549 {
 550         struct sched_entity *last = __pick_last_entity(cfs_rq);
 551         u64 min_runtime, latency;
 552
 553         min_runtime = cfs_rq->min_vruntime;
 554         if (last) {
 555                 min_runtime += last->vruntime;
 556                 min_runtime >>= 1;
 557                 if (initial && sched_feat(START_DEBIT))
 558                         min_runtime += sysctl_sched_latency/2;
 559         }
 560
 561         if (!initial && sched_feat(NEW_FAIR_SLEEPERS)) {
 562                 latency = sysctl_sched_latency;
 563                 if (min_runtime > latency)
 564                         min_runtime -= latency;
 565                 else
 566                         min_runtime = 0;
 567         }
 568
 569         se->vruntime = max(se->vruntime, min_runtime);
 570 }
 571
 572 static void
 573 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
 574 {
 575         /*
 576          * Update the fair clock.
 577          */
 578         update_curr(cfs_rq);
 579
 580         if (wakeup) {
 581                 place_entity(cfs_rq, se, 0);
 582                 enqueue_sleeper(cfs_rq, se);
 583         }
 584
 585         update_stats_enqueue(cfs_rq, se);
 586         __enqueue_entity(cfs_rq, se);
 587 }
 588
 589 static void
 590 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 591 {
 592         update_stats_dequeue(cfs_rq, se);
 593         if (sleep) {
 594                 se->sleep_start_fair = cfs_rq->fair_clock;
 595 #ifdef CONFIG_SCHEDSTATS
 596                 if (entity_is_task(se)) {
 597                         struct task_struct *tsk = task_of(se);
 598
 599                         if (tsk->state & TASK_INTERRUPTIBLE)
 600                                 se->sleep_start = rq_of(cfs_rq)->clock;
 601                         if (tsk->state & TASK_UNINTERRUPTIBLE)
 602                                 se->block_start = rq_of(cfs_rq)->clock;
 603                 }
 604 #endif
 605         }
 606         __dequeue_entity(cfs_rq, se);
 607 }
 608
 609 /*
 610  * Preempt the current task with a newly woken task if needed:
 611  */
 612 static void
 613 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 614 {
 615         unsigned long ideal_runtime, delta_exec;
 616
 617         ideal_runtime = sched_slice(cfs_rq, curr);
 618         delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
 619         if (delta_exec > ideal_runtime)
 620                 resched_task(rq_of(cfs_rq)->curr);
 621 }
 622
 623 static inline void
 624 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 625 {
 626         /*
 627          * Any task has to be enqueued before it get to execute on
 628          * a CPU. So account for the time it spent waiting on the
 629          * runqueue. (note, here we rely on pick_next_task() having
 630          * done a put_prev_task_fair() shortly before this, which
 631          * updated rq->fair_clock - used by update_stats_wait_end())
 632          */
 633         update_stats_wait_end(cfs_rq, se);
 634         update_stats_curr_start(cfs_rq, se);
 635         cfs_rq->curr = se;
 636 #ifdef CONFIG_SCHEDSTATS
 637         /*
 638          * Track our maximum slice length, if the CPU's load is at
 639          * least twice that of our own weight (i.e. dont track it
 640          * when there are only lesser-weight tasks around):
 641          */
 642         if (rq_of(cfs_rq)->ls.load.weight >= 2*se->load.weight) {
 643                 se->slice_max = max(se->slice_max,
 644                         se->sum_exec_runtime - se->prev_sum_exec_runtime);
 645         }
 646 #endif
 647         se->prev_sum_exec_runtime = se->sum_exec_runtime;
 648 }
 649
 650 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 651 {
 652         struct sched_entity *se = __pick_next_entity(cfs_rq);
 653
 654         set_next_entity(cfs_rq, se);
 655
 656         return se;
 657 }
 658
 659 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 660 {
 661         /*
 662          * If still on the runqueue then deactivate_task()
 663          * was not called and update_curr() has to be done:
 664          */
 665         if (prev->on_rq)
 666                 update_curr(cfs_rq);
 667
 668         update_stats_curr_end(cfs_rq, prev);
 669
 670         if (prev->on_rq)
 671                 update_stats_wait_start(cfs_rq, prev);
 672         cfs_rq->curr = NULL;
 673 }
 674
 675 static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 676 {
 677         /*
 678          * Dequeue and enqueue the task to update its
 679          * position within the tree:
 680          */
 681         dequeue_entity(cfs_rq, curr, 0);
 682         enqueue_entity(cfs_rq, curr, 0);
 683
 684         if (cfs_rq->nr_running > 1)
 685                 check_preempt_tick(cfs_rq, curr);
 686 }
 687
 688 /**************************************************
 689  * CFS operations on tasks:
 690  */
 691
 692 #ifdef CONFIG_FAIR_GROUP_SCHED
 693
 694 /* Walk up scheduling entities hierarchy */
 695 #define for_each_sched_entity(se) \
 696                 for (; se; se = se->parent)
 697
 698 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 699 {
 700         return p->se.cfs_rq;
 701 }
 702
 703 /* runqueue on which this entity is (to be) queued */
 704 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 705 {
 706         return se->cfs_rq;
 707 }
 708
 709 /* runqueue "owned" by this group */
 710 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 711 {
 712         return grp->my_q;
 713 }
 714
 715 /* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
 716  * another cpu ('this_cpu')
 717  */
 718 static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
 719 {
 720         /* A later patch will take group into account */
 721         return &cpu_rq(this_cpu)->cfs;
 722 }
 723
 724 /* Iterate thr' all leaf cfs_rq's on a runqueue */
 725 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 726         list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 727
 728 /* Do the two (enqueued) tasks belong to the same group ? */
 729 static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
 730 {
 731         if (curr->se.cfs_rq == p->se.cfs_rq)
 732                 return 1;
 733
 734         return 0;
 735 }
 736
 737 #else   /* CONFIG_FAIR_GROUP_SCHED */
 738
 739 #define for_each_sched_entity(se) \
 740                 for (; se; se = NULL)
 741
 742 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 743 {
 744         return &task_rq(p)->cfs;
 745 }
 746
 747 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 748 {
 749         struct task_struct *p = task_of(se);
 750         struct rq *rq = task_rq(p);
 751
 752         return &rq->cfs;
 753 }
 754
 755 /* runqueue "owned" by this group */
 756 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 757 {
 758         return NULL;
 759 }
 760
 761 static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
 762 {
 763         return &cpu_rq(this_cpu)->cfs;
 764 }
 765
 766 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 767                 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 768
 769 static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
 770 {
 771         return 1;
 772 }
 773
 774 #endif  /* CONFIG_FAIR_GROUP_SCHED */
 775
 776 /*
 777  * The enqueue_task method is called before nr_running is
 778  * increased. Here we update the fair scheduling stats and
 779  * then put the task into the rbtree:
 780  */
 781 static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
 782 {
 783         struct cfs_rq *cfs_rq;
 784         struct sched_entity *se = &p->se;
 785
 786         for_each_sched_entity(se) {
 787                 if (se->on_rq)
 788                         break;
 789                 cfs_rq = cfs_rq_of(se);
 790                 enqueue_entity(cfs_rq, se, wakeup);
 791         }
 792 }
 793
 794 /*
 795  * The dequeue_task method is called before nr_running is
 796  * decreased. We remove the task from the rbtree and
 797  * update the fair scheduling stats:
 798  */
 799 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
 800 {
 801         struct cfs_rq *cfs_rq;
 802         struct sched_entity *se = &p->se;
 803
 804         for_each_sched_entity(se) {
 805                 cfs_rq = cfs_rq_of(se);
 806                 dequeue_entity(cfs_rq, se, sleep);
 807                 /* Don't dequeue parent if it has other entities besides us */
 808                 if (cfs_rq->load.weight)
 809                         break;
 810         }
 811 }
 812
 813 /*
 814  * sched_yield() support is very simple - we dequeue and enqueue.
 815  *
 816  * If compat_yield is turned on then we requeue to the end of the tree.
 817  */
 818 static void yield_task_fair(struct rq *rq, struct task_struct *p)
 819 {
 820         struct cfs_rq *cfs_rq = task_cfs_rq(p);
 821         struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
 822         struct sched_entity *rightmost, *se = &p->se;
 823         struct rb_node *parent;
 824
 825         /*
 826          * Are we the only task in the tree?
 827          */
 828         if (unlikely(cfs_rq->nr_running == 1))
 829                 return;
 830
 831         if (likely(!sysctl_sched_compat_yield)) {
 832                 __update_rq_clock(rq);
 833                 /*
 834                  * Dequeue and enqueue the task to update its
 835                  * position within the tree:
 836                  */
 837                 dequeue_entity(cfs_rq, &p->se, 0);
 838                 enqueue_entity(cfs_rq, &p->se, 0);
 839
 840                 return;
 841         }
 842         /*
 843          * Find the rightmost entry in the rbtree:
 844          */
 845         do {
 846                 parent = *link;
 847                 link = &parent->rb_right;
 848         } while (*link);
 849
 850         rightmost = rb_entry(parent, struct sched_entity, run_node);
 851         /*
 852          * Already in the rightmost position?
 853          */
 854         if (unlikely(rightmost == se))
 855                 return;
 856
 857         /*
 858          * Minimally necessary key value to be last in the tree:
 859          */
 860         se->fair_key = rightmost->fair_key + 1;
 861
 862         if (cfs_rq->rb_leftmost == &se->run_node)
 863                 cfs_rq->rb_leftmost = rb_next(&se->run_node);
 864         /*
 865          * Relink the task to the rightmost position:
 866          */
 867         rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 868         rb_link_node(&se->run_node, parent, link);
 869         rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
 870 }
 871
 872 /*
 873  * Preempt the current task with a newly woken task if needed:
 874  */
 875 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 876 {
 877         struct task_struct *curr = rq->curr;
 878         struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 879
 880         if (unlikely(rt_prio(p->prio))) {
 881                 update_rq_clock(rq);
 882                 update_curr(cfs_rq);
 883                 resched_task(curr);
 884                 return;
 885         }
 886         if (is_same_group(curr, p)) {
 887                 s64 delta = curr->se.vruntime - p->se.vruntime;
 888
 889                 if (delta > (s64)sysctl_sched_wakeup_granularity)
 890                         resched_task(curr);
 891         }
 892 }
 893
 894 static struct task_struct *pick_next_task_fair(struct rq *rq)
 895 {
 896         struct cfs_rq *cfs_rq = &rq->cfs;
 897         struct sched_entity *se;
 898
 899         if (unlikely(!cfs_rq->nr_running))
 900                 return NULL;
 901
 902         do {
 903                 se = pick_next_entity(cfs_rq);
 904                 cfs_rq = group_cfs_rq(se);
 905         } while (cfs_rq);
 906
 907         return task_of(se);
 908 }
 909
 910 /*
 911  * Account for a descheduled task:
 912  */
 913 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
 914 {
 915         struct sched_entity *se = &prev->se;
 916         struct cfs_rq *cfs_rq;
 917
 918         for_each_sched_entity(se) {
 919                 cfs_rq = cfs_rq_of(se);
 920                 put_prev_entity(cfs_rq, se);
 921         }
 922 }
 923
 924 /**************************************************
 925  * Fair scheduling class load-balancing methods:
 926  */
 927
 928 /*
 929  * Load-balancing iterator. Note: while the runqueue stays locked
 930  * during the whole iteration, the current task might be
 931  * dequeued so the iterator has to be dequeue-safe. Here we
 932  * achieve that by always pre-iterating before returning
 933  * the current task:
 934  */
 935 static inline struct task_struct *
 936 __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
 937 {
 938         struct task_struct *p;
 939
 940         if (!curr)
 941                 return NULL;
 942
 943         p = rb_entry(curr, struct task_struct, se.run_node);
 944         cfs_rq->rb_load_balance_curr = rb_next(curr);
 945
 946         return p;
 947 }
 948
 949 static struct task_struct *load_balance_start_fair(void *arg)
 950 {
 951         struct cfs_rq *cfs_rq = arg;
 952
 953         return __load_balance_iterator(cfs_rq, first_fair(cfs_rq));
 954 }
 955
 956 static struct task_struct *load_balance_next_fair(void *arg)
 957 {
 958         struct cfs_rq *cfs_rq = arg;
 959
 960         return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
 961 }
 962
 963 #ifdef CONFIG_FAIR_GROUP_SCHED
 964 static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
 965 {
 966         struct sched_entity *curr;
 967         struct task_struct *p;
 968
 969         if (!cfs_rq->nr_running)
 970                 return MAX_PRIO;
 971
 972         curr = __pick_next_entity(cfs_rq);
 973         p = task_of(curr);
 974
 975         return p->prio;
 976 }
 977 #endif
 978
 979 static unsigned long
 980 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 981                   unsigned long max_nr_move, unsigned long max_load_move,
 982                   struct sched_domain *sd, enum cpu_idle_type idle,
 983                   int *all_pinned, int *this_best_prio)
 984 {
 985         struct cfs_rq *busy_cfs_rq;
 986         unsigned long load_moved, total_nr_moved = 0, nr_moved;
 987         long rem_load_move = max_load_move;
 988         struct rq_iterator cfs_rq_iterator;
 989
 990         cfs_rq_iterator.start = load_balance_start_fair;
 991         cfs_rq_iterator.next = load_balance_next_fair;
 992
 993         for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
 994 #ifdef CONFIG_FAIR_GROUP_SCHED
 995                 struct cfs_rq *this_cfs_rq;
 996                 long imbalance;
 997                 unsigned long maxload;
 998
 999                 this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
1000
1001                 imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
1002                 /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
1003                 if (imbalance <= 0)
1004                         continue;
1005
1006                 /* Don't pull more than imbalance/2 */
1007                 imbalance /= 2;
1008                 maxload = min(rem_load_move, imbalance);
1009
1010                 *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
1011 #else
1012 # define maxload rem_load_move
1013 #endif
1014                 /* pass busy_cfs_rq argument into
1015                  * load_balance_[start|next]_fair iterators
1016                  */
1017                 cfs_rq_iterator.arg = busy_cfs_rq;
1018                 nr_moved = balance_tasks(this_rq, this_cpu, busiest,
1019                                 max_nr_move, maxload, sd, idle, all_pinned,
1020                                 &load_moved, this_best_prio, &cfs_rq_iterator);
1021
1022                 total_nr_moved += nr_moved;
1023                 max_nr_move -= nr_moved;
1024                 rem_load_move -= load_moved;
1025
1026                 if (max_nr_move <= 0 || rem_load_move <= 0)
1027                         break;
1028         }
1029
1030         return max_load_move - rem_load_move;
1031 }
1032
1033 /*
1034  * scheduler tick hitting a task of our scheduling class:
1035  */
1036 static void task_tick_fair(struct rq *rq, struct task_struct *curr)
1037 {
1038         struct cfs_rq *cfs_rq;
1039         struct sched_entity *se = &curr->se;
1040
1041         for_each_sched_entity(se) {
1042                 cfs_rq = cfs_rq_of(se);
1043                 entity_tick(cfs_rq, se);
1044         }
1045 }
1046
/*
 * Exchange the values of two lvalues of the same type.
 *
 * The temporary has a name callers will not use, so that an argument
 * which is itself called 'tmp' is not captured by the expansion.
 */
#define swap(a, b) \
	do { __typeof__(a) __swap_tmp = (a); (a) = (b); (b) = __swap_tmp; } while (0)
1048
1049 /*
1050  * Share the fairness runtime between parent and child, thus the
1051  * total amount of pressure for CPU stays equal - new tasks
1052  * get a chance to run but frequent forkers are not allowed to
1053  * monopolize the CPU. Note: the parent runqueue is locked,
1054  * the child is not running yet.
1055  */
1056 static void task_new_fair(struct rq *rq, struct task_struct *p)
1057 {
1058         struct cfs_rq *cfs_rq = task_cfs_rq(p);
1059         struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
1060
1061         sched_info_queued(p);
1062
1063         update_curr(cfs_rq);
1064         place_entity(cfs_rq, se, 1);
1065
1066         /*
1067          * The statistical average of wait_runtime is about
1068          * -granularity/2, so initialize the task with that:
1069          */
1070         if (sched_feat(START_DEBIT))
1071                 se->wait_runtime = -(__sched_period(cfs_rq->nr_running+1) / 2);
1072
1073         if (sysctl_sched_child_runs_first &&
1074                         curr->vruntime < se->vruntime) {
1075
1076                 dequeue_entity(cfs_rq, curr, 0);
1077                 swap(curr->vruntime, se->vruntime);
1078                 enqueue_entity(cfs_rq, curr, 0);
1079         }
1080
1081         update_stats_enqueue(cfs_rq, se);
1082         __enqueue_entity(cfs_rq, se);
1083         resched_task(rq->curr);
1084 }
1085
/*
 * Account for a task changing its policy or group.
 *
 * This routine is mostly called to set the cfs_rq->curr field when a
 * task migrates between groups/classes. With group scheduling it
 * calls set_next_entity() for the entity of rq->curr and for every
 * entity for_each_sched_entity() visits after it; without
 * CONFIG_FAIR_GROUP_SCHED it does nothing.
 */
static void set_curr_task_fair(struct rq *rq)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
	struct sched_entity *ent = &rq->curr->se;

	for_each_sched_entity(ent)
		set_next_entity(cfs_rq_of(ent), ent);
#endif
}
1104
1105 /*
1106  * All the scheduling class methods:
1107  */
1108 struct sched_class fair_sched_class __read_mostly = {
1109         .enqueue_task           = enqueue_task_fair,
1110         .dequeue_task           = dequeue_task_fair,
1111         .yield_task             = yield_task_fair,
1112
1113         .check_preempt_curr     = check_preempt_wakeup,
1114
1115         .pick_next_task         = pick_next_task_fair,
1116         .put_prev_task          = put_prev_task_fair,
1117
1118         .load_balance           = load_balance_fair,
1119
1120         .set_curr_task          = set_curr_task_fair,
1121         .task_tick              = task_tick_fair,
1122         .task_new               = task_new_fair,
1123 };
1124
#ifdef CONFIG_SCHED_DEBUG
/*
 * Debug output: call print_cfs_rq() on @m for every leaf cfs_rq of
 * the runqueue belonging to @cpu.
 */
static void print_cfs_stats(struct seq_file *m, int cpu)
{
	struct cfs_rq *leaf;

	for_each_leaf_cfs_rq(cpu_rq(cpu), leaf) {
		print_cfs_rq(m, cpu, leaf);
	}
}
#endif