SAFE public projects git trees. - safe/jmp/linux-2.6/blob - kernel/sched_fair.c

   1 /*
   2  * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
   3  *
   4  *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
   5  *
   6  *  Interactivity improvements by Mike Galbraith
   7  *  (C) 2007 Mike Galbraith <efault@gmx.de>
   8  *
   9  *  Various enhancements by Dmitry Adamushko.
  10  *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
  11  *
  12  *  Group scheduling enhancements by Srivatsa Vaddagiri
  13  *  Copyright IBM Corporation, 2007
  14  *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  15  *
  16  *  Scaled math optimizations by Thomas Gleixner
  17  *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
  18  *
  19  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
  20  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  21  */
  22
/*
 * Targeted preemption latency for CPU-bound tasks:
 * (default: 20ms, units: nanoseconds)
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length.
 * (to see the precise effective timeslice length of your workload,
 *  run vmstat and monitor the context-switches field)
 *
 * On SMP systems the value of this is multiplied by the log2 of the
 * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way
 * systems, 4x on 8-way systems, 5x on 16-way systems, etc.)
 */
const_debug unsigned int sysctl_sched_latency = 20000000ULL;

/*
 * After fork, child runs first. (default) If set to 0 then
 * parent will (try to) run first.
 */
const_debug unsigned int sysctl_sched_child_runs_first = 1;

/*
 * Minimal preemption granularity for CPU-bound tasks:
 * (default: 2 msec, units: nanoseconds)
 */
unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL;

/*
 * sys_sched_yield() compat mode
 *
 * This option switches the aggressive yield implementation of the
 * old scheduler back on.
 */
unsigned int __read_mostly sysctl_sched_compat_yield;

/*
 * SCHED_BATCH wake-up granularity.
 * (default: 25 msec, units: nanoseconds)
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 25000000UL;

/*
 * SCHED_OTHER wake-up granularity.
 * (default: 2 msec, units: nanoseconds)
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
const_debug unsigned int sysctl_sched_wakeup_granularity = 2000000UL;

/*
 * Bound applied to an entity's wait_runtime (see limit_wait_runtime());
 * twice this value also caps the fair-clock deltas credited to waiting
 * and sleeping entities.
 *
 * NOTE(review): no initializer is given here, so the value starts out as
 * zero unless it is set somewhere outside this section - confirm.
 */
unsigned int sysctl_sched_runtime_limit __read_mostly;

/* The fair scheduling class object; its definition is not in this section. */
extern struct sched_class fair_sched_class;
  82
  83 /**************************************************************
  84  * CFS operations on generic schedulable entities:
  85  */
  86
  87 #ifdef CONFIG_FAIR_GROUP_SCHED
  88
  89 /* cpu runqueue to which this cfs_rq is attached */
  90 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
  91 {
  92         return cfs_rq->rq;
  93 }
  94
  95 /* An entity is a task if it doesn't "own" a runqueue */
  96 #define entity_is_task(se)      (!se->my_q)
  97
  98 #else   /* CONFIG_FAIR_GROUP_SCHED */
  99
 100 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 101 {
 102         return container_of(cfs_rq, struct rq, cfs);
 103 }
 104
/* Without group scheduling every entity is treated as a task */
#define entity_is_task(se)      1
 106
 107 #endif  /* CONFIG_FAIR_GROUP_SCHED */
 108
 109 static inline struct task_struct *task_of(struct sched_entity *se)
 110 {
 111         return container_of(se, struct task_struct, se);
 112 }
 113
 114
 115 /**************************************************************
 116  * Scheduling class tree data structure manipulation methods:
 117  */
 118
 119 static inline void
 120 set_leftmost(struct cfs_rq *cfs_rq, struct rb_node *leftmost)
 121 {
 122         struct sched_entity *se;
 123
 124         cfs_rq->rb_leftmost = leftmost;
 125         if (leftmost) {
 126                 se = rb_entry(leftmost, struct sched_entity, run_node);
 127                 cfs_rq->min_vruntime = max(se->vruntime,
 128                                                 cfs_rq->min_vruntime);
 129         }
 130 }
 131
 132 /*
 133  * Enqueue an entity into the rb-tree:
 134  */
 135 static void
 136 __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 137 {
 138         struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
 139         struct rb_node *parent = NULL;
 140         struct sched_entity *entry;
 141         s64 key = se->fair_key;
 142         int leftmost = 1;
 143
 144         /*
 145          * Find the right place in the rbtree:
 146          */
 147         while (*link) {
 148                 parent = *link;
 149                 entry = rb_entry(parent, struct sched_entity, run_node);
 150                 /*
 151                  * We dont care about collisions. Nodes with
 152                  * the same key stay together.
 153                  */
 154                 if (key - entry->fair_key < 0) {
 155                         link = &parent->rb_left;
 156                 } else {
 157                         link = &parent->rb_right;
 158                         leftmost = 0;
 159                 }
 160         }
 161
 162         /*
 163          * Maintain a cache of leftmost tree entries (it is frequently
 164          * used):
 165          */
 166         if (leftmost)
 167                 set_leftmost(cfs_rq, &se->run_node);
 168
 169         rb_link_node(&se->run_node, parent, link);
 170         rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
 171         update_load_add(&cfs_rq->load, se->load.weight);
 172         cfs_rq->nr_running++;
 173         se->on_rq = 1;
 174
 175         schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
 176 }
 177
 178 static void
 179 __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 180 {
 181         if (cfs_rq->rb_leftmost == &se->run_node)
 182                 set_leftmost(cfs_rq, rb_next(&se->run_node));
 183
 184         rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 185         update_load_sub(&cfs_rq->load, se->load.weight);
 186         cfs_rq->nr_running--;
 187         se->on_rq = 0;
 188
 189         schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime);
 190 }
 191
 192 static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
 193 {
 194         return cfs_rq->rb_leftmost;
 195 }
 196
 197 static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
 198 {
 199         return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
 200 }
 201
 202 static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 203 {
 204         struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
 205         struct sched_entity *se = NULL;
 206         struct rb_node *parent;
 207
 208         while (*link) {
 209                 parent = *link;
 210                 se = rb_entry(parent, struct sched_entity, run_node);
 211                 link = &parent->rb_right;
 212         }
 213
 214         return se;
 215 }
 216
 217 /**************************************************************
 218  * Scheduling class statistics methods:
 219  */
 220
 221 static u64 __sched_period(unsigned long nr_running)
 222 {
 223         u64 period = sysctl_sched_latency;
 224         unsigned long nr_latency =
 225                 sysctl_sched_latency / sysctl_sched_min_granularity;
 226
 227         if (unlikely(nr_running > nr_latency)) {
 228                 period *= nr_running;
 229                 do_div(period, nr_latency);
 230         }
 231
 232         return period;
 233 }
 234
 235 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 236 {
 237         u64 period = __sched_period(cfs_rq->nr_running);
 238
 239         period *= se->load.weight;
 240         do_div(period, cfs_rq->load.weight);
 241
 242         return period;
 243 }
 244
 245 static inline void
 246 limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se)
 247 {
 248         long limit = sysctl_sched_runtime_limit;
 249
 250         /*
 251          * Niced tasks have the same history dynamic range as
 252          * non-niced tasks:
 253          */
 254         if (unlikely(se->wait_runtime > limit)) {
 255                 se->wait_runtime = limit;
 256                 schedstat_inc(se, wait_runtime_overruns);
 257                 schedstat_inc(cfs_rq, wait_runtime_overruns);
 258         }
 259         if (unlikely(se->wait_runtime < -limit)) {
 260                 se->wait_runtime = -limit;
 261                 schedstat_inc(se, wait_runtime_underruns);
 262                 schedstat_inc(cfs_rq, wait_runtime_underruns);
 263         }
 264 }
 265
/*
 * Add @delta (may be negative) to se->wait_runtime, record it in the
 * entity's sum_wait_runtime schedstat, then clamp the result via
 * limit_wait_runtime(). The queue-wide wait_runtime schedstat is not
 * touched here; see add_wait_runtime() for that.
 */
static inline void
__add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
{
        se->wait_runtime += delta;
        /* The statistic records the requested delta, before clamping. */
        schedstat_add(se, sum_wait_runtime, delta);
        limit_wait_runtime(cfs_rq, se);
}
 273
/*
 * Like __add_wait_runtime(), but also keeps the queue-wide wait_runtime
 * schedstat in sync: the entity's old value is subtracted before the
 * update and its new (possibly clamped) value is added back afterwards.
 */
static void
add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
{
        schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime);
        __add_wait_runtime(cfs_rq, se, delta);
        schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
}
 281
/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 *
 * @cfs_rq:     queue that @curr is accounted on
 * @curr:       the currently running entity
 * @delta_exec: amount of execution time to account (update_curr() passes
 *              the rq->clock delta since curr->exec_start)
 */
static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
              unsigned long delta_exec)
{
        unsigned long delta, delta_fair, delta_mine, delta_exec_weighted;
        struct load_weight *lw = &cfs_rq->load;
        unsigned long load = lw->weight;

        schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));

        curr->sum_exec_runtime += delta_exec;
        cfs_rq->exec_clock += delta_exec;
        /*
         * vruntime advances by the raw delta for an entity whose weight is
         * NICE_0_LOAD; for any other weight the delta is first converted
         * by calc_delta_fair() (defined outside this section).
         */
        delta_exec_weighted = delta_exec;
        if (unlikely(curr->load.weight != NICE_0_LOAD)) {
                delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
                                                        &curr->load);
        }
        curr->vruntime += delta_exec_weighted;

        /* Everything below is wait_runtime / fair_clock bookkeeping. */
        if (!sched_feat(FAIR_SLEEPERS))
                return;

        /* No load on the queue: nothing to scale the deltas against. */
        if (unlikely(!load))
                return;

        delta_fair = calc_delta_fair(delta_exec, lw);
        delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw);

        /*
         * While more than sysctl_sched_min_granularity of sleeper bonus is
         * outstanding, take some of it back out of this entity's
         * entitlement. The amount is capped by the entitlement itself, by
         * the bonus that is left, and by the distance between the runtime
         * limit and the entity's current wait_runtime.
         */
        if (cfs_rq->sleeper_bonus > sysctl_sched_min_granularity) {
                delta = min((u64)delta_mine, cfs_rq->sleeper_bonus);
                delta = min(delta, (unsigned long)(
                        (long)sysctl_sched_runtime_limit - curr->wait_runtime));
                cfs_rq->sleeper_bonus -= delta;
                delta_mine -= delta;
        }

        cfs_rq->fair_clock += delta_fair;
        /*
         * We executed delta_exec amount of time on the CPU,
         * but we were only entitled to delta_mine amount of
         * time during that period (if nr_running == 1 then
         * the two values are equal)
         * [Note: delta_mine - delta_exec is negative]:
         */
        add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec);
}
 332
 333 static void update_curr(struct cfs_rq *cfs_rq)
 334 {
 335         struct sched_entity *curr = cfs_rq->curr;
 336         u64 now = rq_of(cfs_rq)->clock;
 337         unsigned long delta_exec;
 338
 339         if (unlikely(!curr))
 340                 return;
 341
 342         /*
 343          * Get the amount of time the current task was running
 344          * since the last time we changed load (this cannot
 345          * overflow on 32 bits):
 346          */
 347         delta_exec = (unsigned long)(now - curr->exec_start);
 348
 349         __update_curr(cfs_rq, curr, delta_exec);
 350         curr->exec_start = now;
 351 }
 352
/*
 * Mark the start of a wait period for @se: remember the queue's current
 * fair_clock and, for schedstats, the runqueue clock.
 */
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        se->wait_start_fair = cfs_rq->fair_clock;
        schedstat_set(se->wait_start, rq_of(cfs_rq)->clock);
}
 359
 360 static inline unsigned long
 361 calc_weighted(unsigned long delta, struct sched_entity *se)
 362 {
 363         unsigned long weight = se->load.weight;
 364
 365         if (unlikely(weight != NICE_0_LOAD))
 366                 return (u64)delta * se->load.weight >> NICE_0_SHIFT;
 367         else
 368                 return delta;
 369 }
 370
 371 /*
 372  * Task is being enqueued - update stats:
 373  */
 374 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 375 {
 376         /*
 377          * Are we enqueueing a waiting task? (for current tasks
 378          * a dequeue/enqueue event is a NOP)
 379          */
 380         if (se != cfs_rq->curr)
 381                 update_stats_wait_start(cfs_rq, se);
 382         /*
 383          * Update the key:
 384          */
 385         se->fair_key = se->vruntime;
 386 }
 387
 388 /*
 389  * Note: must be called with a freshly updated rq->fair_clock.
 390  */
 391 static inline void
 392 __update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se,
 393                         unsigned long delta_fair)
 394 {
 395         schedstat_set(se->wait_max, max(se->wait_max,
 396                         rq_of(cfs_rq)->clock - se->wait_start));
 397
 398         delta_fair = calc_weighted(delta_fair, se);
 399
 400         add_wait_runtime(cfs_rq, se, delta_fair);
 401 }
 402
 403 static void
 404 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 405 {
 406         unsigned long delta_fair;
 407
 408         if (unlikely(!se->wait_start_fair))
 409                 return;
 410
 411         delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
 412                         (u64)(cfs_rq->fair_clock - se->wait_start_fair));
 413
 414         __update_stats_wait_end(cfs_rq, se, delta_fair);
 415
 416         se->wait_start_fair = 0;
 417         schedstat_set(se->wait_start, 0);
 418 }
 419
 420 static inline void
 421 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 422 {
 423         update_curr(cfs_rq);
 424         /*
 425          * Mark the end of the wait period if dequeueing a
 426          * waiting task:
 427          */
 428         if (se != cfs_rq->curr)
 429                 update_stats_wait_end(cfs_rq, se);
 430 }
 431
 432 /*
 433  * We are picking a new current task - update its stats:
 434  */
 435 static inline void
 436 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 437 {
 438         /*
 439          * We are starting a new run period:
 440          */
 441         se->exec_start = rq_of(cfs_rq)->clock;
 442 }
 443
/*
 * We are descheduling a task - update its stats: the run period is over,
 * so the start stamp is cleared. @cfs_rq is unused here.
 */
static inline void
update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        se->exec_start = 0;
}
 452
 453 /**************************************************
 454  * Scheduling class queueing methods:
 455  */
 456
 457 static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se,
 458                               unsigned long delta_fair)
 459 {
 460         unsigned long load = cfs_rq->load.weight;
 461         long prev_runtime;
 462
 463         /*
 464          * Do not boost sleepers if there's too much bonus 'in flight'
 465          * already:
 466          */
 467         if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit))
 468                 return;
 469
 470         if (sched_feat(SLEEPER_LOAD_AVG))
 471                 load = rq_of(cfs_rq)->cpu_load[2];
 472
 473         /*
 474          * Fix up delta_fair with the effect of us running
 475          * during the whole sleep period:
 476          */
 477         if (sched_feat(SLEEPER_AVG))
 478                 delta_fair = div64_likely32((u64)delta_fair * load,
 479                                                 load + se->load.weight);
 480
 481         delta_fair = calc_weighted(delta_fair, se);
 482
 483         prev_runtime = se->wait_runtime;
 484         __add_wait_runtime(cfs_rq, se, delta_fair);
 485         delta_fair = se->wait_runtime - prev_runtime;
 486
 487         /*
 488          * Track the amount of bonus we've given to sleepers:
 489          */
 490         cfs_rq->sleeper_bonus += delta_fair;
 491 }
 492
/*
 * Wakeup-time accounting for an entity that has been sleeping.
 *
 * Unless the entity is a SCHED_BATCH task or FAIR_SLEEPERS is disabled,
 * the fair_clock time since sleep_start_fair (capped at twice
 * sysctl_sched_runtime_limit) is handed to __enqueue_sleeper() as a
 * bonus. With CONFIG_SCHEDSTATS the sleep/block duration statistics are
 * updated as well.
 */
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        /*
         * NOTE(review): task_of() is applied before entity_is_task() is
         * checked; tsk is only meaningful when se belongs to a task.
         */
        struct task_struct *tsk = task_of(se);
        unsigned long delta_fair;

        if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) ||
                         !sched_feat(FAIR_SLEEPERS))
                return;

        delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
                (u64)(cfs_rq->fair_clock - se->sleep_start_fair));

        __enqueue_sleeper(cfs_rq, se, delta_fair);

        se->sleep_start_fair = 0;

#ifdef CONFIG_SCHEDSTATS
        /* sleep_start is set by dequeue_entity() for TASK_INTERRUPTIBLE. */
        if (se->sleep_start) {
                u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;

                /* A clock that appears to have gone backwards counts as 0. */
                if ((s64)delta < 0)
                        delta = 0;

                if (unlikely(delta > se->sleep_max))
                        se->sleep_max = delta;

                se->sleep_start = 0;
                se->sum_sleep_runtime += delta;
        }
        /* block_start is set by dequeue_entity() for TASK_UNINTERRUPTIBLE. */
        if (se->block_start) {
                u64 delta = rq_of(cfs_rq)->clock - se->block_start;

                if ((s64)delta < 0)
                        delta = 0;

                if (unlikely(delta > se->block_max))
                        se->block_max = delta;

                se->block_start = 0;
                se->sum_sleep_runtime += delta;

                /*
                 * Blocking time is in units of nanosecs, so shift by 20 to
                 * get a milliseconds-range estimation of the amount of
                 * time that the task spent sleeping:
                 */
                if (unlikely(prof_on == SLEEP_PROFILING)) {
                        profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
                                     delta >> 20);
                }
        }
#endif
}
 546
 547 static void
 548 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 549 {
 550         u64 min_runtime, latency;
 551
 552         min_runtime = cfs_rq->min_vruntime;
 553
 554         if (sched_feat(USE_TREE_AVG)) {
 555                 struct sched_entity *last = __pick_last_entity(cfs_rq);
 556                 if (last) {
 557                         min_runtime = __pick_next_entity(cfs_rq)->vruntime;
 558                         min_runtime += last->vruntime;
 559                         min_runtime >>= 1;
 560                 }
 561         } else if (sched_feat(APPROX_AVG))
 562                 min_runtime += sysctl_sched_latency/2;
 563
 564         if (initial && sched_feat(START_DEBIT))
 565                 min_runtime += sched_slice(cfs_rq, se);
 566
 567         if (!initial && sched_feat(NEW_FAIR_SLEEPERS)) {
 568                 latency = sysctl_sched_latency;
 569                 if (min_runtime > latency)
 570                         min_runtime -= latency;
 571                 else
 572                         min_runtime = 0;
 573         }
 574
 575         se->vruntime = max(se->vruntime, min_runtime);
 576 }
 577
 578 static void
 579 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
 580 {
 581         /*
 582          * Update the fair clock.
 583          */
 584         update_curr(cfs_rq);
 585
 586         if (wakeup) {
 587                 place_entity(cfs_rq, se, 0);
 588                 enqueue_sleeper(cfs_rq, se);
 589         }
 590
 591         update_stats_enqueue(cfs_rq, se);
 592         __enqueue_entity(cfs_rq, se);
 593 }
 594
/*
 * Take @se off @cfs_rq. When the entity is going to sleep (@sleep != 0)
 * the fair_clock at which the sleep began is recorded, and with
 * CONFIG_SCHEDSTATS the sleep/block start stamps are set for tasks.
 */
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
{
        update_stats_dequeue(cfs_rq, se);
        if (sleep) {
                /* Consumed by enqueue_sleeper() on the next wakeup. */
                se->sleep_start_fair = cfs_rq->fair_clock;
#ifdef CONFIG_SCHEDSTATS
                if (entity_is_task(se)) {
                        struct task_struct *tsk = task_of(se);

                        /* Interruptible sleep counts as "sleep" ... */
                        if (tsk->state & TASK_INTERRUPTIBLE)
                                se->sleep_start = rq_of(cfs_rq)->clock;
                        /* ... uninterruptible sleep counts as "block". */
                        if (tsk->state & TASK_UNINTERRUPTIBLE)
                                se->block_start = rq_of(cfs_rq)->clock;
                }
#endif
        }
        __dequeue_entity(cfs_rq, se);
}
 614
 615 /*
 616  * Preempt the current task with a newly woken task if needed:
 617  */
 618 static void
 619 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 620 {
 621         unsigned long ideal_runtime, delta_exec;
 622
 623         ideal_runtime = sched_slice(cfs_rq, curr);
 624         delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
 625         if (delta_exec > ideal_runtime)
 626                 resched_task(rq_of(cfs_rq)->curr);
 627 }
 628
/*
 * Make @se the current entity of @cfs_rq: close its wait period, start
 * its run period, and snapshot sum_exec_runtime so that the time run
 * since this pick can be measured later (see check_preempt_tick()).
 */
static inline void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        /*
         * Any task has to be enqueued before it get to execute on
         * a CPU. So account for the time it spent waiting on the
         * runqueue. (note, here we rely on pick_next_task() having
         * done a put_prev_task_fair() shortly before this, which
         * updated rq->fair_clock - used by update_stats_wait_end())
         */
        update_stats_wait_end(cfs_rq, se);
        update_stats_curr_start(cfs_rq, se);
        cfs_rq->curr = se;
#ifdef CONFIG_SCHEDSTATS
        /*
         * Track our maximum slice length, if the CPU's load is at
         * least twice that of our own weight (i.e. dont track it
         * when there are only lesser-weight tasks around):
         */
        if (rq_of(cfs_rq)->ls.load.weight >= 2*se->load.weight) {
                /* Uses the previous snapshot, so must precede the update. */
                se->slice_max = max(se->slice_max,
                        se->sum_exec_runtime - se->prev_sum_exec_runtime);
        }
#endif
        se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
 655
 656 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 657 {
 658         struct sched_entity *se = __pick_next_entity(cfs_rq);
 659
 660         set_next_entity(cfs_rq, se);
 661
 662         return se;
 663 }
 664
 665 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 666 {
 667         /*
 668          * If still on the runqueue then deactivate_task()
 669          * was not called and update_curr() has to be done:
 670          */
 671         if (prev->on_rq)
 672                 update_curr(cfs_rq);
 673
 674         update_stats_curr_end(cfs_rq, prev);
 675
 676         if (prev->on_rq)
 677                 update_stats_wait_start(cfs_rq, prev);
 678         cfs_rq->curr = NULL;
 679 }
 680
 681 static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 682 {
 683         /*
 684          * Dequeue and enqueue the task to update its
 685          * position within the tree:
 686          */
 687         dequeue_entity(cfs_rq, curr, 0);
 688         enqueue_entity(cfs_rq, curr, 0);
 689
 690         if (cfs_rq->nr_running > 1)
 691                 check_preempt_tick(cfs_rq, curr);
 692 }
 693
 694 /**************************************************
 695  * CFS operations on tasks:
 696  */
 697
 698 #ifdef CONFIG_FAIR_GROUP_SCHED
 699
/*
 * Walk up scheduling entities hierarchy: starts at the current value of
 * 'se' and follows ->parent until it becomes NULL. 'se' is modified by
 * the loop, so it must be an assignable pointer variable.
 */
#define for_each_sched_entity(se) \
                for (; se; se = se->parent)
 703
 704 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 705 {
 706         return p->se.cfs_rq;
 707 }
 708
 709 /* runqueue on which this entity is (to be) queued */
 710 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 711 {
 712         return se->cfs_rq;
 713 }
 714
 715 /* runqueue "owned" by this group */
 716 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 717 {
 718         return grp->my_q;
 719 }
 720
 721 /* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
 722  * another cpu ('this_cpu')
 723  */
 724 static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
 725 {
 726         /* A later patch will take group into account */
 727         return &cpu_rq(this_cpu)->cfs;
 728 }
 729
/*
 * Iterate through all leaf cfs_rq's on a runqueue: walks the list headed
 * at rq->leaf_cfs_rq_list, with 'cfs_rq' as the cursor.
 */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
        list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 733
 734 /* Do the two (enqueued) tasks belong to the same group ? */
 735 static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
 736 {
 737         if (curr->se.cfs_rq == p->se.cfs_rq)
 738                 return 1;
 739
 740         return 0;
 741 }
 742
 743 #else   /* CONFIG_FAIR_GROUP_SCHED */
 744
/*
 * Without group scheduling there is no hierarchy: the body runs once for
 * the entity itself (if non-NULL), after which 'se' is set to NULL.
 */
#define for_each_sched_entity(se) \
                for (; se; se = NULL)
 747
 748 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 749 {
 750         return &task_rq(p)->cfs;
 751 }
 752
 753 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 754 {
 755         struct task_struct *p = task_of(se);
 756         struct rq *rq = task_rq(p);
 757
 758         return &rq->cfs;
 759 }
 760
/*
 * runqueue "owned" by this group - without group scheduling no entity
 * owns a runqueue, so this is always NULL.
 */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
        return NULL;
}
 766
 767 static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
 768 {
 769         return &cpu_rq(this_cpu)->cfs;
 770 }
 771
/*
 * Without group scheduling the only cfs_rq on a runqueue is rq->cfs, so
 * the body runs exactly once; 'cfs_rq' is left NULL afterwards.
 */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
                for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 774
/* Without group scheduling all tasks count as being in the same group. */
static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
{
        return 1;
}
 779
 780 #endif  /* CONFIG_FAIR_GROUP_SCHED */
 781
 782 /*
 783  * The enqueue_task method is called before nr_running is
 784  * increased. Here we update the fair scheduling stats and
 785  * then put the task into the rbtree:
 786  */
 787 static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
 788 {
 789         struct cfs_rq *cfs_rq;
 790         struct sched_entity *se = &p->se;
 791
 792         for_each_sched_entity(se) {
 793                 if (se->on_rq)
 794                         break;
 795                 cfs_rq = cfs_rq_of(se);
 796                 enqueue_entity(cfs_rq, se, wakeup);
 797         }
 798 }
 799
 800 /*
 801  * The dequeue_task method is called before nr_running is
 802  * decreased. We remove the task from the rbtree and
 803  * update the fair scheduling stats:
 804  */
 805 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
 806 {
 807         struct cfs_rq *cfs_rq;
 808         struct sched_entity *se = &p->se;
 809
 810         for_each_sched_entity(se) {
 811                 cfs_rq = cfs_rq_of(se);
 812                 dequeue_entity(cfs_rq, se, sleep);
 813                 /* Don't dequeue parent if it has other entities besides us */
 814                 if (cfs_rq->load.weight)
 815                         break;
 816         }
 817 }
 818
/*
 * sched_yield() support is very simple - we dequeue and enqueue.
 *
 * If compat_yield is turned on then we requeue to the end of the tree.
 */
static void yield_task_fair(struct rq *rq, struct task_struct *p)
{
        struct cfs_rq *cfs_rq = task_cfs_rq(p);
        struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
        struct sched_entity *rightmost, *se = &p->se;
        struct rb_node *parent;

        /*
         * Are we the only task in the tree?
         */
        if (unlikely(cfs_rq->nr_running == 1))
                return;

        if (likely(!sysctl_sched_compat_yield)) {
                __update_rq_clock(rq);
                /*
                 * Dequeue and enqueue the task to update its
                 * position within the tree:
                 */
                dequeue_entity(cfs_rq, &p->se, 0);
                enqueue_entity(cfs_rq, &p->se, 0);

                return;
        }
        /*
         * Find the rightmost entry in the rbtree:
         *
         * (The loop dereferences the root unconditionally, so the tree
         * must not be empty here. On exit 'parent' is the rightmost node
         * and 'link' is its empty right-child slot.)
         */
        do {
                parent = *link;
                link = &parent->rb_right;
        } while (*link);

        rightmost = rb_entry(parent, struct sched_entity, run_node);
        /*
         * Already in the rightmost position?
         */
        if (unlikely(rightmost == se))
                return;

        /*
         * Minimally necessary key value to be last in the tree:
         */
        se->fair_key = rightmost->fair_key + 1;

        /*
         * The leftmost cache is updated directly rather than through
         * set_leftmost(), so min_vruntime is not touched on this path.
         */
        if (cfs_rq->rb_leftmost == &se->run_node)
                cfs_rq->rb_leftmost = rb_next(&se->run_node);
        /*
         * Relink the task to the rightmost position:
         *
         * (Only the tree linkage changes; load, nr_running and on_rq are
         * left as they are.)
         */
        rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
        rb_link_node(&se->run_node, parent, link);
        rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}
 877
 878 /*
 879  * Preempt the current task with a newly woken task if needed:
 880  */
 881 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 882 {
 883         struct task_struct *curr = rq->curr;
 884         struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 885
 886         if (unlikely(rt_prio(p->prio))) {
 887                 update_rq_clock(rq);
 888                 update_curr(cfs_rq);
 889                 resched_task(curr);
 890                 return;
 891         }
 892         if (is_same_group(curr, p)) {
 893                 s64 delta = curr->se.vruntime - p->se.vruntime;
 894
 895                 if (delta > (s64)sysctl_sched_wakeup_granularity)
 896                         resched_task(curr);
 897         }
 898 }
 899
 900 static struct task_struct *pick_next_task_fair(struct rq *rq)
 901 {
 902         struct cfs_rq *cfs_rq = &rq->cfs;
 903         struct sched_entity *se;
 904
 905         if (unlikely(!cfs_rq->nr_running))
 906                 return NULL;
 907
 908         do {
 909                 se = pick_next_entity(cfs_rq);
 910                 cfs_rq = group_cfs_rq(se);
 911         } while (cfs_rq);
 912
 913         return task_of(se);
 914 }
 915
 916 /*
 917  * Account for a descheduled task:
 918  */
 919 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
 920 {
 921         struct sched_entity *se = &prev->se;
 922         struct cfs_rq *cfs_rq;
 923
 924         for_each_sched_entity(se) {
 925                 cfs_rq = cfs_rq_of(se);
 926                 put_prev_entity(cfs_rq, se);
 927         }
 928 }
 929
 930 /**************************************************
 931  * Fair scheduling class load-balancing methods:
 932  */
 933
 934 /*
 935  * Load-balancing iterator. Note: while the runqueue stays locked
 936  * during the whole iteration, the current task might be
 937  * dequeued so the iterator has to be dequeue-safe. Here we
 938  * achieve that by always pre-iterating before returning
 939  * the current task:
 940  */
 941 static inline struct task_struct *
 942 __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
 943 {
 944         struct task_struct *p;
 945
 946         if (!curr)
 947                 return NULL;
 948
 949         p = rb_entry(curr, struct task_struct, se.run_node);
 950         cfs_rq->rb_load_balance_curr = rb_next(curr);
 951
 952         return p;
 953 }
 954
 955 static struct task_struct *load_balance_start_fair(void *arg)
 956 {
 957         struct cfs_rq *cfs_rq = arg;
 958
 959         return __load_balance_iterator(cfs_rq, first_fair(cfs_rq));
 960 }
 961
 962 static struct task_struct *load_balance_next_fair(void *arg)
 963 {
 964         struct cfs_rq *cfs_rq = arg;
 965
 966         return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
 967 }
 968
 969 #ifdef CONFIG_FAIR_GROUP_SCHED
 970 static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
 971 {
 972         struct sched_entity *curr;
 973         struct task_struct *p;
 974
 975         if (!cfs_rq->nr_running)
 976                 return MAX_PRIO;
 977
 978         curr = __pick_next_entity(cfs_rq);
 979         p = task_of(curr);
 980
 981         return p->prio;
 982 }
 983 #endif
 984
 985 static unsigned long
 986 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 987                   unsigned long max_nr_move, unsigned long max_load_move,
 988                   struct sched_domain *sd, enum cpu_idle_type idle,
 989                   int *all_pinned, int *this_best_prio)
 990 {
 991         struct cfs_rq *busy_cfs_rq;
 992         unsigned long load_moved, total_nr_moved = 0, nr_moved;
 993         long rem_load_move = max_load_move;
 994         struct rq_iterator cfs_rq_iterator;
 995
 996         cfs_rq_iterator.start = load_balance_start_fair;
 997         cfs_rq_iterator.next = load_balance_next_fair;
 998
 999         for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
1000 #ifdef CONFIG_FAIR_GROUP_SCHED
1001                 struct cfs_rq *this_cfs_rq;
1002                 long imbalance;
1003                 unsigned long maxload;
1004
1005                 this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
1006
1007                 imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
1008                 /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
1009                 if (imbalance <= 0)
1010                         continue;
1011
1012                 /* Don't pull more than imbalance/2 */
1013                 imbalance /= 2;
1014                 maxload = min(rem_load_move, imbalance);
1015
1016                 *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
1017 #else
1018 # define maxload rem_load_move
1019 #endif
1020                 /* pass busy_cfs_rq argument into
1021                  * load_balance_[start|next]_fair iterators
1022                  */
1023                 cfs_rq_iterator.arg = busy_cfs_rq;
1024                 nr_moved = balance_tasks(this_rq, this_cpu, busiest,
1025                                 max_nr_move, maxload, sd, idle, all_pinned,
1026                                 &load_moved, this_best_prio, &cfs_rq_iterator);
1027
1028                 total_nr_moved += nr_moved;
1029                 max_nr_move -= nr_moved;
1030                 rem_load_move -= load_moved;
1031
1032                 if (max_nr_move <= 0 || rem_load_move <= 0)
1033                         break;
1034         }
1035
1036         return max_load_move - rem_load_move;
1037 }
1038
1039 /*
1040  * scheduler tick hitting a task of our scheduling class:
1041  */
1042 static void task_tick_fair(struct rq *rq, struct task_struct *curr)
1043 {
1044         struct cfs_rq *cfs_rq;
1045         struct sched_entity *se = &curr->se;
1046
1047         for_each_sched_entity(se) {
1048                 cfs_rq = cfs_rq_of(se);
1049                 entity_tick(cfs_rq, se);
1050         }
1051 }
1052
/*
 * Exchange the values of two lvalues of the same type.
 *
 * The temporary has a name callers are unlikely to use, so that
 * swap(tmp, x) does not expand to a self-initialising declaration.
 * Both arguments are evaluated twice: do not pass expressions with
 * side effects.
 */
#define swap(a, b)                                      \
        do {                                            \
                __typeof__(a) __swap_tmp = (a);         \
                (a) = (b);                              \
                (b) = __swap_tmp;                       \
        } while (0)
1054
1055 /*
1056  * Share the fairness runtime between parent and child, thus the
1057  * total amount of pressure for CPU stays equal - new tasks
1058  * get a chance to run but frequent forkers are not allowed to
1059  * monopolize the CPU. Note: the parent runqueue is locked,
1060  * the child is not running yet.
1061  */
1062 static void task_new_fair(struct rq *rq, struct task_struct *p)
1063 {
1064         struct cfs_rq *cfs_rq = task_cfs_rq(p);
1065         struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
1066
1067         sched_info_queued(p);
1068
1069         update_curr(cfs_rq);
1070         place_entity(cfs_rq, se, 1);
1071
1072         /*
1073          * The statistical average of wait_runtime is about
1074          * -granularity/2, so initialize the task with that:
1075          */
1076         if (sched_feat(START_DEBIT))
1077                 se->wait_runtime = -(__sched_period(cfs_rq->nr_running+1) / 2);
1078
1079         if (sysctl_sched_child_runs_first &&
1080                         curr->vruntime < se->vruntime) {
1081
1082                 dequeue_entity(cfs_rq, curr, 0);
1083                 swap(curr->vruntime, se->vruntime);
1084                 enqueue_entity(cfs_rq, curr, 0);
1085         }
1086
1087         update_stats_enqueue(cfs_rq, se);
1088         __enqueue_entity(cfs_rq, se);
1089         resched_task(rq->curr);
1090 }
1091
/*
 * Account for a task changing its policy or group.
 *
 * With CONFIG_FAIR_GROUP_SCHED, set_next_entity() is run for the
 * runqueue's current task at every level for_each_sched_entity()
 * visits. This is mostly how the cfs_rq->curr field gets set when a
 * task migrates between groups/classes. Without group scheduling
 * there is nothing to do.
 */
static void set_curr_task_fair(struct rq *rq)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
        struct sched_entity *se = &rq->curr->se;

        for_each_sched_entity(se)
                set_next_entity(cfs_rq_of(se), se);
#endif
}
1110
1111 /*
1112  * All the scheduling class methods:
1113  */
1114 struct sched_class fair_sched_class __read_mostly = {
1115         .enqueue_task           = enqueue_task_fair,
1116         .dequeue_task           = dequeue_task_fair,
1117         .yield_task             = yield_task_fair,
1118
1119         .check_preempt_curr     = check_preempt_wakeup,
1120
1121         .pick_next_task         = pick_next_task_fair,
1122         .put_prev_task          = put_prev_task_fair,
1123
1124         .load_balance           = load_balance_fair,
1125
1126         .set_curr_task          = set_curr_task_fair,
1127         .task_tick              = task_tick_fair,
1128         .task_new               = task_new_fair,
1129 };
1130
#ifdef CONFIG_SCHED_DEBUG
/*
 * Debug output: call print_cfs_rq() on every leaf cfs_rq of 'cpu',
 * writing into the seq_file 'm'.
 */
static void print_cfs_stats(struct seq_file *m, int cpu)
{
        struct cfs_rq *leaf;

        for_each_leaf_cfs_rq(cpu_rq(cpu), leaf) {
                print_cfs_rq(m, cpu, leaf);
        }
}
#endif