/*
 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
 * policies)
 */
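/*
 * Illustrative note (added commentary, not part of the original file):
 * a task enters this class from userspace via sched_setscheduler(), e.g.
 *
 *	struct sched_param param = { .sched_priority = 50 };
 *	sched_setscheduler(0, SCHED_FIFO, &param);
 *
 * SCHED_FIFO tasks run until they block or yield; SCHED_RR additionally
 * round-robins tasks of equal priority on a timeslice.
 */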
static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
	return container_of(rt_se, struct task_struct, rt);
}
#ifdef CONFIG_RT_GROUP_SCHED

static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
	return rt_rq->rq;
}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
	return rt_se->rt_rq;
}
#else /* CONFIG_RT_GROUP_SCHED */

static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
	return container_of(rt_rq, struct rq, rt);
}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
	struct task_struct *p = rt_task_of(rt_se);
	struct rq *rq = task_rq(p);

	return &rq->rt;
}

#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_SMP

static inline int rt_overloaded(struct rq *rq)
{
	return atomic_read(&rq->rd->rto_count);
}
static inline void rt_set_overload(struct rq *rq)
{
	if (!rq->online)
		return;

	cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
	/*
	 * Make sure the mask is visible before we set
	 * the overload count. That is checked to determine
	 * if we should look at the mask. It would be a shame
	 * if we looked at the mask, but the mask was not
	 * updated yet.
	 */
	wmb();
	atomic_inc(&rq->rd->rto_count);
}
static inline void rt_clear_overload(struct rq *rq)
{
	if (!rq->online)
		return;

	/* the order here really doesn't matter */
	atomic_dec(&rq->rd->rto_count);
	cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
}
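/*
 * Note (added commentary): rto_mask and rto_count together publish the
 * set of overloaded runqueues to the rest of the root domain.  The wmb()
 * in rt_set_overload() orders the mask update before the counter
 * increment, the intent being that a CPU that observes rto_count != 0
 * in rt_overloaded() also finds the corresponding bit set in rto_mask.
 */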
static void update_rt_migration(struct rt_rq *rt_rq)
{
	if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) {
		if (!rt_rq->overloaded) {
			rt_set_overload(rq_of_rt_rq(rt_rq));
			rt_rq->overloaded = 1;
		}
	} else if (rt_rq->overloaded) {
		rt_clear_overload(rq_of_rt_rq(rt_rq));
		rt_rq->overloaded = 0;
	}
}
static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (rt_se->nr_cpus_allowed > 1)
		rt_rq->rt_nr_migratory++;

	update_rt_migration(rt_rq);
}

static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (rt_se->nr_cpus_allowed > 1)
		rt_rq->rt_nr_migratory--;

	update_rt_migration(rt_rq);
}
static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
	plist_node_init(&p->pushable_tasks, p->prio);
	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
}

static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
}
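/*
 * Note (added commentary): pushable_tasks is a priority-sorted list
 * (plist), so its first entry is always the highest-priority pushable
 * task.  enqueue_pushable_task() deletes and re-adds the node so that a
 * task whose priority changed is re-sorted rather than queued twice.
 */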
#else

static inline
void enqueue_pushable_task(struct rq *rq, struct task_struct *p) {}

static inline
void dequeue_pushable_task(struct rq *rq, struct task_struct *p) {}

static inline
void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}

static inline
void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}

#endif /* CONFIG_SMP */
static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
	return !list_empty(&rt_se->run_list);
}
#ifdef CONFIG_RT_GROUP_SCHED

static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
	if (!rt_rq->tg)
		return RUNTIME_INF;

	return rt_rq->rt_runtime;
}

static inline u64 sched_rt_period(struct rt_rq *rt_rq)
{
	return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
}
#define for_each_leaf_rt_rq(rt_rq, rq) \
	list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)

#define for_each_sched_rt_entity(rt_se) \
	for (; rt_se; rt_se = rt_se->parent)
static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
	return rt_se->my_q;
}
static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
	struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
	struct sched_rt_entity *rt_se = rt_rq->rt_se;

	if (rt_rq->rt_nr_running) {
		if (rt_se && !on_rt_rq(rt_se))
			enqueue_rt_entity(rt_se);
		if (rt_rq->highest_prio.curr < curr->prio)
			resched_task(curr);
	}
}
static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
	struct sched_rt_entity *rt_se = rt_rq->rt_se;

	if (rt_se && on_rt_rq(rt_se))
		dequeue_rt_entity(rt_se);
}
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
}
static int rt_se_boosted(struct sched_rt_entity *rt_se)
{
	struct rt_rq *rt_rq = group_rt_rq(rt_se);
	struct task_struct *p;

	if (rt_rq)
		return !!rt_rq->rt_nr_boosted;

	p = rt_task_of(rt_se);
	return p->prio != p->normal_prio;
}
#ifdef CONFIG_SMP
static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_rq(smp_processor_id())->rd->span;
}
#else
static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_online_mask;
}
#endif
static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
	return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
}

static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
{
	return &rt_rq->tg->rt_bandwidth;
}
#else /* !CONFIG_RT_GROUP_SCHED */

static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
	return rt_rq->rt_runtime;
}

static inline u64 sched_rt_period(struct rt_rq *rt_rq)
{
	return ktime_to_ns(def_rt_bandwidth.rt_period);
}
#define for_each_leaf_rt_rq(rt_rq, rq) \
	for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)

#define for_each_sched_rt_entity(rt_se) \
	for (; rt_se; rt_se = NULL)

static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
	return NULL;
}
static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
	if (rt_rq->rt_nr_running)
		resched_task(rq_of_rt_rq(rt_rq)->curr);
}

static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
}

static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
	return rt_rq->rt_throttled;
}
static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_online_mask;
}

static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
	return &cpu_rq(cpu)->rt;
}

static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
{
	return &def_rt_bandwidth;
}

#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_SMP
/*
 * We ran out of runtime, see if we can borrow some from our neighbours.
 */
static int do_balance_runtime(struct rt_rq *rt_rq)
{
	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
	int i, weight, more = 0;
	u64 rt_period;

	weight = cpumask_weight(rd->span);

	spin_lock(&rt_b->rt_runtime_lock);
	rt_period = ktime_to_ns(rt_b->rt_period);
	for_each_cpu(i, rd->span) {
		struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
		s64 diff;

		if (iter == rt_rq)
			continue;

		spin_lock(&iter->rt_runtime_lock);
		/*
		 * Either all rqs have inf runtime and there's nothing to steal
		 * or __disable_runtime() below sets a specific rq to inf to
		 * indicate it's been disabled and disallow stealing.
		 */
		if (iter->rt_runtime == RUNTIME_INF)
			goto next;

		/*
		 * From runqueues with spare time, take 1/n part of their
		 * spare time, but no more than our period.
		 */
		diff = iter->rt_runtime - iter->rt_time;
		if (diff > 0) {
			diff = div_u64((u64)diff, weight);
			if (rt_rq->rt_runtime + diff > rt_period)
				diff = rt_period - rt_rq->rt_runtime;
			iter->rt_runtime -= diff;
			rt_rq->rt_runtime += diff;
			more = 1;
			if (rt_rq->rt_runtime == rt_period) {
				spin_unlock(&iter->rt_runtime_lock);
				break;
			}
		}
next:
		spin_unlock(&iter->rt_runtime_lock);
	}
	spin_unlock(&rt_b->rt_runtime_lock);

	return more;
}
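/*
 * Worked example (added commentary, numbers illustrative): with a 4-CPU
 * root domain (weight = 4), rt_period = 1000ms and a neighbour holding
 * 950ms of runtime of which only 50ms is accounted in rt_time, the spare
 * time is diff = 950 - 50 = 900ms, of which we may take 900 / 4 = 225ms,
 * clamped so our own rt_runtime never exceeds rt_period.
 */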
/*
 * Ensure this RQ takes back all the runtime it lent to its neighbours.
 */
static void __disable_runtime(struct rq *rq)
{
	struct root_domain *rd = rq->rd;
	struct rt_rq *rt_rq;

	if (unlikely(!scheduler_running))
		return;

	for_each_leaf_rt_rq(rt_rq, rq) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
		s64 want;
		int i;

		spin_lock(&rt_b->rt_runtime_lock);
		spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * Either we're all inf and nobody needs to borrow, or we're
		 * already disabled and thus have nothing to do, or we have
		 * exactly the right amount of runtime to take out.
		 */
		if (rt_rq->rt_runtime == RUNTIME_INF ||
				rt_rq->rt_runtime == rt_b->rt_runtime)
			goto balanced;
		spin_unlock(&rt_rq->rt_runtime_lock);

		/*
		 * Calculate the difference between what we started out with
		 * and what we currently have; that's the amount of runtime
		 * we lent out and now have to reclaim.
		 */
		want = rt_b->rt_runtime - rt_rq->rt_runtime;

		/*
		 * Greedy reclaim, take back as much as we can.
		 */
		for_each_cpu(i, rd->span) {
			struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
			s64 diff;

			/*
			 * Can't reclaim from ourselves or disabled runqueues.
			 */
			if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
				continue;

			spin_lock(&iter->rt_runtime_lock);
			if (want > 0) {
				diff = min_t(s64, iter->rt_runtime, want);
				iter->rt_runtime -= diff;
				want -= diff;
			} else {
				iter->rt_runtime -= want;
				want -= want;
			}
			spin_unlock(&iter->rt_runtime_lock);

			if (!want)
				break;
		}

		spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * We cannot be left wanting - that would mean some runtime
		 * leaked out of the system.
		 */
		BUG_ON(want);
balanced:
		/*
		 * Disable all the borrow logic by pretending we have inf
		 * runtime - in which case borrowing doesn't make sense.
		 */
		rt_rq->rt_runtime = RUNTIME_INF;
		spin_unlock(&rt_rq->rt_runtime_lock);
		spin_unlock(&rt_b->rt_runtime_lock);
	}
}
static void disable_runtime(struct rq *rq)
{
	unsigned long flags;

	spin_lock_irqsave(&rq->lock, flags);
	__disable_runtime(rq);
	spin_unlock_irqrestore(&rq->lock, flags);
}
static void __enable_runtime(struct rq *rq)
{
	struct rt_rq *rt_rq;

	if (unlikely(!scheduler_running))
		return;

	/*
	 * Reset each runqueue's bandwidth settings
	 */
	for_each_leaf_rt_rq(rt_rq, rq) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

		spin_lock(&rt_b->rt_runtime_lock);
		spin_lock(&rt_rq->rt_runtime_lock);
		rt_rq->rt_runtime = rt_b->rt_runtime;
		rt_rq->rt_time = 0;
		rt_rq->rt_throttled = 0;
		spin_unlock(&rt_rq->rt_runtime_lock);
		spin_unlock(&rt_b->rt_runtime_lock);
	}
}

static void enable_runtime(struct rq *rq)
{
	unsigned long flags;

	spin_lock_irqsave(&rq->lock, flags);
	__enable_runtime(rq);
	spin_unlock_irqrestore(&rq->lock, flags);
}
static int balance_runtime(struct rt_rq *rt_rq)
{
	int more = 0;

	if (rt_rq->rt_time > rt_rq->rt_runtime) {
		spin_unlock(&rt_rq->rt_runtime_lock);
		more = do_balance_runtime(rt_rq);
		spin_lock(&rt_rq->rt_runtime_lock);
	}

	return more;
}
#else /* !CONFIG_SMP */
static inline int balance_runtime(struct rt_rq *rt_rq)
{
	return 0;
}
#endif /* CONFIG_SMP */
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
{
	int i, idle = 1;
	const struct cpumask *span;

	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
		return 1;

	span = sched_rt_period_mask();
	for_each_cpu(i, span) {
		int enqueue = 0;
		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
		struct rq *rq = rq_of_rt_rq(rt_rq);

		spin_lock(&rq->lock);
		if (rt_rq->rt_time) {
			u64 runtime;

			spin_lock(&rt_rq->rt_runtime_lock);
			if (rt_rq->rt_throttled)
				balance_runtime(rt_rq);
			runtime = rt_rq->rt_runtime;
			rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
			if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
				rt_rq->rt_throttled = 0;
				enqueue = 1;
			}
			if (rt_rq->rt_time || rt_rq->rt_nr_running)
				idle = 0;
			spin_unlock(&rt_rq->rt_runtime_lock);
		} else if (rt_rq->rt_nr_running)
			idle = 0;

		if (enqueue)
			sched_rt_rq_enqueue(rt_rq);
		spin_unlock(&rq->lock);
	}

	return idle;
}
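/*
 * Note (added commentary): this runs from the rt_period timer.  Each
 * period, every rt_rq's accumulated rt_time is decremented by up to
 * overrun * runtime, i.e. one runtime budget per elapsed period, and a
 * throttled rt_rq whose rt_time has dropped below its runtime is
 * unthrottled and re-enqueued via sched_rt_rq_enqueue().
 */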
static inline int rt_se_prio(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_RT_GROUP_SCHED
	struct rt_rq *rt_rq = group_rt_rq(rt_se);

	if (rt_rq)
		return rt_rq->highest_prio.curr;
#endif

	return rt_task_of(rt_se)->prio;
}
static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
	u64 runtime = sched_rt_runtime(rt_rq);

	if (rt_rq->rt_throttled)
		return rt_rq_throttled(rt_rq);

	if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq))
		return 0;

	balance_runtime(rt_rq);
	runtime = sched_rt_runtime(rt_rq);
	if (runtime == RUNTIME_INF)
		return 0;

	if (rt_rq->rt_time > runtime) {
		rt_rq->rt_throttled = 1;
		if (rt_rq_throttled(rt_rq)) {
			sched_rt_rq_dequeue(rt_rq);
			return 1;
		}
	}

	return 0;
}
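/*
 * Example (added commentary): with the default limits of
 * sysctl_sched_rt_runtime = 950000us and sysctl_sched_rt_period =
 * 1000000us, an rt_rq that has accumulated more than 950ms of execution
 * inside one 1s period (after trying to borrow more runtime above) is
 * throttled and dequeued until the period timer replenishes it, leaving
 * at least 5% of CPU time to non-RT tasks.
 */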
/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static void update_curr_rt(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	struct sched_rt_entity *rt_se = &curr->rt;
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	u64 delta_exec;

	if (!task_has_rt_policy(curr))
		return;

	delta_exec = rq->clock - curr->se.exec_start;
	if (unlikely((s64)delta_exec < 0))
		delta_exec = 0;

	schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));

	curr->se.sum_exec_runtime += delta_exec;
	account_group_exec_runtime(curr, delta_exec);

	curr->se.exec_start = rq->clock;
	cpuacct_charge(curr, delta_exec);

	if (!rt_bandwidth_enabled())
		return;

	for_each_sched_rt_entity(rt_se) {
		rt_rq = rt_rq_of_se(rt_se);

		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
			spin_lock(&rt_rq->rt_runtime_lock);
			rt_rq->rt_time += delta_exec;
			if (sched_rt_runtime_exceeded(rt_rq))
				resched_task(curr);
			spin_unlock(&rt_rq->rt_runtime_lock);
		}
	}
}
#if defined CONFIG_SMP

static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);

static inline int next_prio(struct rq *rq)
{
	struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);

	if (next && rt_prio(next->prio))
		return next->prio;
	else
		return MAX_RT_PRIO;
}
static void
inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	if (prio < prev_prio) {

		/*
		 * If the new task is higher in priority than anything on the
		 * run-queue, we know that the previous high becomes our
		 * next-highest.
		 */
		rt_rq->highest_prio.next = prev_prio;

		if (rq->online)
			cpupri_set(&rq->rd->cpupri, rq->cpu, prio);

	} else if (prio == rt_rq->highest_prio.curr)
		/*
		 * If the next task is equal in priority to the highest on
		 * the run-queue, then we implicitly know that the next highest
		 * task cannot be any lower than current
		 */
		rt_rq->highest_prio.next = prio;
	else if (prio < rt_rq->highest_prio.next)
		/*
		 * Otherwise, we need to recompute next-highest
		 */
		rt_rq->highest_prio.next = next_prio(rq);
}
static void
dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
		rt_rq->highest_prio.next = next_prio(rq);

	if (rq->online && rt_rq->highest_prio.curr != prev_prio)
		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}

#else /* CONFIG_SMP */

static inline
void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
static inline
void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}

#endif /* CONFIG_SMP */
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
static void
inc_rt_prio(struct rt_rq *rt_rq, int prio)
{
	int prev_prio = rt_rq->highest_prio.curr;

	if (prio < prev_prio)
		rt_rq->highest_prio.curr = prio;

	inc_rt_prio_smp(rt_rq, prio, prev_prio);
}
static void
dec_rt_prio(struct rt_rq *rt_rq, int prio)
{
	int prev_prio = rt_rq->highest_prio.curr;

	if (rt_rq->rt_nr_running) {

		WARN_ON(prio < prev_prio);

		/*
		 * This may have been our highest task, and therefore
		 * we may have some recomputation to do
		 */
		if (prio == prev_prio) {
			struct rt_prio_array *array = &rt_rq->active;

			rt_rq->highest_prio.curr =
				sched_find_first_bit(array->bitmap);
		}

	} else
		rt_rq->highest_prio.curr = MAX_RT_PRIO;

	dec_rt_prio_smp(rt_rq, prio, prev_prio);
}
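/*
 * Note (added commentary): highest_prio can be recomputed in O(1) here
 * because the active array keeps one list per priority level plus a
 * bitmap of non-empty levels; sched_find_first_bit() returns the lowest
 * set bit, which is the numerically smallest - i.e. highest - priority.
 */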
#else

static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}

#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED

static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (rt_se_boosted(rt_se))
		rt_rq->rt_nr_boosted++;

	if (rt_rq->tg)
		start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
}

static void
dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (rt_se_boosted(rt_se))
		rt_rq->rt_nr_boosted--;

	WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
}
#else /* CONFIG_RT_GROUP_SCHED */

static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	start_rt_bandwidth(&def_rt_bandwidth);
}

static inline
void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}

#endif /* CONFIG_RT_GROUP_SCHED */
static inline
void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	int prio = rt_se_prio(rt_se);

	WARN_ON(!rt_prio(prio));
	rt_rq->rt_nr_running++;

	inc_rt_prio(rt_rq, prio);
	inc_rt_migration(rt_se, rt_rq);
	inc_rt_group(rt_se, rt_rq);
}

static inline
void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
	WARN_ON(!rt_rq->rt_nr_running);
	rt_rq->rt_nr_running--;

	dec_rt_prio(rt_rq, rt_se_prio(rt_se));
	dec_rt_migration(rt_se, rt_rq);
	dec_rt_group(rt_se, rt_rq);
}
static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
{
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	struct rt_prio_array *array = &rt_rq->active;
	struct rt_rq *group_rq = group_rt_rq(rt_se);
	struct list_head *queue = array->queue + rt_se_prio(rt_se);

	/*
	 * Don't enqueue the group if it's throttled, or when empty.
	 * The latter is a consequence of the former when a child group
	 * gets throttled and the current group doesn't have any other
	 * active members.
	 */
	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
		return;

	list_add_tail(&rt_se->run_list, queue);
	__set_bit(rt_se_prio(rt_se), array->bitmap);

	inc_rt_tasks(rt_se, rt_rq);
}
static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	struct rt_prio_array *array = &rt_rq->active;

	list_del_init(&rt_se->run_list);
	if (list_empty(array->queue + rt_se_prio(rt_se)))
		__clear_bit(rt_se_prio(rt_se), array->bitmap);

	dec_rt_tasks(rt_se, rt_rq);
}
/*
 * Because the prio of an upper entry depends on the lower
 * entries, we must remove entries top - down.
 */
static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
{
	struct sched_rt_entity *back = NULL;

	for_each_sched_rt_entity(rt_se) {
		rt_se->back = back;
		back = rt_se;
	}

	for (rt_se = back; rt_se; rt_se = rt_se->back) {
		if (on_rt_rq(rt_se))
			__dequeue_rt_entity(rt_se);
	}
}
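/*
 * Example (added commentary): for a task T in group B nested in group A,
 * the entity stack is T -> B -> A (child to parent).  The first loop
 * records back-pointers while walking up; the second walks back down
 * from the top (A, then B, then T), dequeueing each entity, so a
 * parent's priority is never consulted after it has become stale.
 */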
static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
{
	dequeue_rt_stack(rt_se);
	for_each_sched_rt_entity(rt_se)
		__enqueue_rt_entity(rt_se);
}

static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
	dequeue_rt_stack(rt_se);

	for_each_sched_rt_entity(rt_se) {
		struct rt_rq *rt_rq = group_rt_rq(rt_se);

		if (rt_rq && rt_rq->rt_nr_running)
			__enqueue_rt_entity(rt_se);
	}
}
/*
 * Adding/removing a task to/from a priority array:
 */
static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
{
	struct sched_rt_entity *rt_se = &p->rt;

	if (wakeup)
		rt_se->timeout = 0;

	enqueue_rt_entity(rt_se);

	if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
		enqueue_pushable_task(rq, p);

	inc_cpu_load(rq, p->se.load.weight);
}

static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
{
	struct sched_rt_entity *rt_se = &p->rt;

	update_curr_rt(rq);
	dequeue_rt_entity(rt_se);

	dequeue_pushable_task(rq, p);

	dec_cpu_load(rq, p->se.load.weight);
}
/*
 * Put task to the end of the run list without the overhead of dequeue
 * followed by enqueue.
 */
static void
requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
{
	if (on_rt_rq(rt_se)) {
		struct rt_prio_array *array = &rt_rq->active;
		struct list_head *queue = array->queue + rt_se_prio(rt_se);

		if (head)
			list_move(&rt_se->run_list, queue);
		else
			list_move_tail(&rt_se->run_list, queue);
	}
}
static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
{
	struct sched_rt_entity *rt_se = &p->rt;
	struct rt_rq *rt_rq;

	for_each_sched_rt_entity(rt_se) {
		rt_rq = rt_rq_of_se(rt_se);
		requeue_rt_entity(rt_rq, rt_se, head);
	}
}

static void yield_task_rt(struct rq *rq)
{
	requeue_task_rt(rq, rq->curr, 0);
}
#ifdef CONFIG_SMP
static int find_lowest_rq(struct task_struct *task);

static int select_task_rq_rt(struct task_struct *p, int sync)
{
	struct rq *rq = task_rq(p);

	/*
	 * If the current task is an RT task, then
	 * try to see if we can wake this RT task up on another
	 * runqueue. Otherwise simply start this RT task
	 * on its current runqueue.
	 *
	 * We want to avoid overloading runqueues. Even if
	 * the RT task is of higher priority than the current RT task.
	 * RT tasks behave differently than other tasks. If
	 * one gets preempted, we try to push it off to another queue.
	 * So trying to keep a preempting RT task on the same
	 * cache hot CPU will force the running RT task to
	 * a cold CPU. So we waste all the cache for the lower
	 * RT task in hopes of saving some of an RT task
	 * that is just being woken and probably will have
	 * cache hot data.
	 */
	if (unlikely(rt_task(rq->curr)) &&
	    (p->rt.nr_cpus_allowed > 1)) {
		int cpu = find_lowest_rq(p);

		return (cpu == -1) ? task_cpu(p) : cpu;
	}

	/*
	 * Otherwise, just let it ride on the affined RQ and the
	 * post-schedule router will push the preempted task away
	 */
	return task_cpu(p);
}
static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
	cpumask_var_t mask;

	if (rq->curr->rt.nr_cpus_allowed == 1)
		return;

	if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
		return;

	if (p->rt.nr_cpus_allowed != 1
	    && cpupri_find(&rq->rd->cpupri, p, mask))
		goto free;

	if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask))
		goto free;

	/*
	 * There appears to be other cpus that can accept
	 * current and none to run 'p', so let's reschedule
	 * to try and push current away:
	 */
	requeue_task_rt(rq, p, 1);
	resched_task(rq->curr);

free:
	free_cpumask_var(mask);
}

#endif /* CONFIG_SMP */
/*
 * Preempt the current task with a newly woken task if needed:
 */
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync)
{
	if (p->prio < rq->curr->prio) {
		resched_task(rq->curr);
		return;
	}

#ifdef CONFIG_SMP
	/*
	 * If:
	 *
	 * - the newly woken task is of equal priority to the current task
	 * - the newly woken task is non-migratable while current is migratable
	 * - current will be preempted on the next reschedule
	 *
	 * we should check to see if current can readily move to a different
	 * cpu.  If so, we will reschedule to allow the push logic to try
	 * to move current somewhere else, making room for our non-migratable
	 * task.
	 */
	if (p->prio == rq->curr->prio && !need_resched())
		check_preempt_equal_prio(rq, p);
#endif
}
static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
						   struct rt_rq *rt_rq)
{
	struct rt_prio_array *array = &rt_rq->active;
	struct sched_rt_entity *next = NULL;
	struct list_head *queue;
	int idx;

	idx = sched_find_first_bit(array->bitmap);
	BUG_ON(idx >= MAX_RT_PRIO);

	queue = array->queue + idx;
	next = list_entry(queue->next, struct sched_rt_entity, run_list);

	return next;
}
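/*
 * Note (added commentary): picking the next entity is O(1): the bitmap
 * has MAX_RT_PRIO (100) bits, sched_find_first_bit() locates the highest
 * non-empty priority level, and the head of that level's queue is taken.
 * Round-robin within a level falls out of requeueing tasks to the tail.
 */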
static struct task_struct *_pick_next_task_rt(struct rq *rq)
{
	struct sched_rt_entity *rt_se;
	struct task_struct *p;
	struct rt_rq *rt_rq;

	rt_rq = &rq->rt;

	if (unlikely(!rt_rq->rt_nr_running))
		return NULL;

	if (rt_rq_throttled(rt_rq))
		return NULL;

	do {
		rt_se = pick_next_rt_entity(rq, rt_rq);
		BUG_ON(!rt_se);
		rt_rq = group_rt_rq(rt_se);
	} while (rt_rq);

	p = rt_task_of(rt_se);
	p->se.exec_start = rq->clock;

	return p;
}

static struct task_struct *pick_next_task_rt(struct rq *rq)
{
	struct task_struct *p = _pick_next_task_rt(rq);

	/* The running task is never eligible for pushing */
	if (p)
		dequeue_pushable_task(rq, p);

	return p;
}
static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
	update_curr_rt(rq);
	p->se.exec_start = 0;

	/*
	 * The previous task needs to be made eligible for pushing
	 * if it is still active
	 */
	if (p->se.on_rq && p->rt.nr_cpus_allowed > 1)
		enqueue_pushable_task(rq, p);
}
#ifdef CONFIG_SMP

/* Only try algorithms three times */
#define RT_MAX_TRIES 3

static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);

static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
	if (!task_running(rq, p) &&
	    (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
	    (p->rt.nr_cpus_allowed > 1))
		return 1;
	return 0;
}
/* Return the second highest RT task, NULL otherwise */
static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
{
	struct task_struct *next = NULL;
	struct sched_rt_entity *rt_se;
	struct rt_prio_array *array;
	struct rt_rq *rt_rq;
	int idx;

	for_each_leaf_rt_rq(rt_rq, rq) {
		array = &rt_rq->active;
		idx = sched_find_first_bit(array->bitmap);
 next_idx:
		if (idx >= MAX_RT_PRIO)
			continue;
		if (next && next->prio < idx)
			continue;
		list_for_each_entry(rt_se, array->queue + idx, run_list) {
			struct task_struct *p = rt_task_of(rt_se);
			if (pick_rt_task(rq, p, cpu)) {
				next = p;
				break;
			}
		}
		if (!next) {
			idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
			goto next_idx;
		}
	}

	return next;
}
static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);

static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
{
	int first;

	/* "this_cpu" is cheaper to preempt than a remote processor */
	if ((this_cpu != -1) && cpu_isset(this_cpu, *mask))
		return this_cpu;

	first = first_cpu(*mask);
	if (first != NR_CPUS)
		return first;

	return -1;
}
static int find_lowest_rq(struct task_struct *task)
{
	struct sched_domain *sd;
	struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
	int this_cpu = smp_processor_id();
	int cpu      = task_cpu(task);

	if (task->rt.nr_cpus_allowed == 1)
		return -1; /* No other targets possible */

	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
		return -1; /* No targets found */

	/*
	 * Only consider CPUs that are usable for migration.
	 * I guess we might want to change cpupri_find() to ignore those
	 * in the first place.
	 */
	cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);

	/*
	 * At this point we have built a mask of cpus representing the
	 * lowest priority tasks in the system.  Now we want to elect
	 * the best one based on our affinity and topology.
	 *
	 * We prioritize the last cpu that the task executed on since
	 * it is most likely cache-hot in that location.
	 */
	if (cpumask_test_cpu(cpu, lowest_mask))
		return cpu;

	/*
	 * Otherwise, we consult the sched_domains span maps to figure
	 * out which cpu is logically closest to our hot cache data.
	 */
	if (this_cpu == cpu)
		this_cpu = -1; /* Skip this_cpu opt if the same */

	for_each_domain(cpu, sd) {
		if (sd->flags & SD_WAKE_AFFINE) {
			cpumask_t domain_mask;
			int       best_cpu;

			cpumask_and(&domain_mask, sched_domain_span(sd),
				    lowest_mask);

			best_cpu = pick_optimal_cpu(this_cpu,
						    &domain_mask);
			if (best_cpu != -1)
				return best_cpu;
		}
	}

	/*
	 * And finally, if there were no matches within the domains
	 * just give the caller *something* to work with from the compatible
	 * locations.
	 */
	return pick_optimal_cpu(this_cpu, lowest_mask);
}
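/*
 * Note (added commentary): cpupri_find() answers "which CPUs currently
 * run something lower-priority than this task?" in roughly O(1) by
 * bucketing CPUs per priority level.  The fallbacks above then narrow
 * that mask by cache topology: same CPU first, then the same
 * wake-affine domain, then anything compatible.
 */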
/* Will lock the rq it finds */
static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
{
	struct rq *lowest_rq = NULL;
	int tries;
	int cpu;

	for (tries = 0; tries < RT_MAX_TRIES; tries++) {
		cpu = find_lowest_rq(task);

		if ((cpu == -1) || (cpu == rq->cpu))
			break;

		lowest_rq = cpu_rq(cpu);

		/* if the prio of this runqueue changed, try again */
		if (double_lock_balance(rq, lowest_rq)) {
			/*
			 * We had to unlock the run queue. In
			 * the mean time, task could have
			 * migrated already or had its affinity changed.
			 * Also make sure that it wasn't scheduled on its rq.
			 */
			if (unlikely(task_rq(task) != rq ||
				     !cpumask_test_cpu(lowest_rq->cpu,
						       &task->cpus_allowed) ||
				     task_running(rq, task) ||
				     !task->se.on_rq)) {

				spin_unlock(&lowest_rq->lock);
				lowest_rq = NULL;
				break;
			}
		}

		/* If this rq is still suitable use it. */
		if (lowest_rq->rt.highest_prio.curr > task->prio)
			break;

		/* try again */
		double_unlock_balance(rq, lowest_rq);
		lowest_rq = NULL;
	}

	return lowest_rq;
}
static inline int has_pushable_tasks(struct rq *rq)
{
	return !plist_head_empty(&rq->rt.pushable_tasks);
}

static struct task_struct *pick_next_pushable_task(struct rq *rq)
{
	struct task_struct *p;

	if (!has_pushable_tasks(rq))
		return NULL;

	p = plist_first_entry(&rq->rt.pushable_tasks,
			      struct task_struct, pushable_tasks);

	BUG_ON(rq->cpu != task_cpu(p));
	BUG_ON(task_current(rq, p));
	BUG_ON(p->rt.nr_cpus_allowed <= 1);

	BUG_ON(!p->se.on_rq);
	BUG_ON(!rt_task(p));

	return p;
}
/*
 * If the current CPU has more than one RT task, see if the non
 * running task can migrate over to a CPU that is running a task
 * of lesser priority.
 */
static int push_rt_task(struct rq *rq)
{
	struct task_struct *next_task;
	struct rq *lowest_rq;

	if (!rq->rt.overloaded)
		return 0;

	next_task = pick_next_pushable_task(rq);
	if (!next_task)
		return 0;

 retry:
	if (unlikely(next_task == rq->curr)) {
		WARN_ON(1);
		return 0;
	}

	/*
	 * It's possible that the next_task slipped in of
	 * higher priority than current. If that's the case
	 * just reschedule current.
	 */
	if (unlikely(next_task->prio < rq->curr->prio)) {
		resched_task(rq->curr);
		return 0;
	}

	/* We might release rq lock */
	get_task_struct(next_task);

	/* find_lock_lowest_rq locks the rq if found */
	lowest_rq = find_lock_lowest_rq(next_task, rq);
	if (!lowest_rq) {
		struct task_struct *task;
		/*
		 * find_lock_lowest_rq releases rq->lock
		 * so it is possible that next_task has migrated.
		 *
		 * We need to make sure that the task is still on the same
		 * run-queue and is also still the next task eligible for
		 * pushing.
		 */
		task = pick_next_pushable_task(rq);
		if (task_cpu(next_task) == rq->cpu && task == next_task) {
			/*
			 * If we get here, the task hasn't moved at all, but
			 * it has failed to push.  We will not try again,
			 * since the other cpus will pull from us when they
			 * are ready.
			 */
			dequeue_pushable_task(rq, next_task);
			goto out;
		}

		if (!task)
			/* No more tasks, just exit */
			goto out;

		/*
		 * Something has shifted, try again.
		 */
		put_task_struct(next_task);
		next_task = task;
		goto retry;
	}

	deactivate_task(rq, next_task, 0);
	set_task_cpu(next_task, lowest_rq->cpu);
	activate_task(lowest_rq, next_task, 0);

	resched_task(lowest_rq->curr);

	double_unlock_balance(rq, lowest_rq);

out:
	put_task_struct(next_task);

	return 1;
}
static void push_rt_tasks(struct rq *rq)
{
	/* push_rt_task will return true if it moved an RT */
	while (push_rt_task(rq))
		;
}
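/*
 * Note (added commentary): push and pull are the two halves of RT
 * balancing.  A runqueue holding more than one RT task pushes its spare,
 * migratable tasks toward CPUs running lower-priority work; a runqueue
 * about to lower its own priority pulls higher-priority waiters from
 * overloaded neighbours (see pull_rt_task() below).
 */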
static int pull_rt_task(struct rq *this_rq)
{
	int this_cpu = this_rq->cpu, ret = 0, cpu;
	struct task_struct *p;
	struct rq *src_rq;

	if (likely(!rt_overloaded(this_rq)))
		return 0;

	for_each_cpu(cpu, this_rq->rd->rto_mask) {
		if (this_cpu == cpu)
			continue;

		src_rq = cpu_rq(cpu);

		/*
		 * Don't bother taking the src_rq->lock if the next highest
		 * task is known to be lower-priority than our current task.
		 * This may look racy, but if this value is about to go
		 * logically higher, the src_rq will push this task away.
		 * And if it's going logically lower, we do not care
		 */
		if (src_rq->rt.highest_prio.next >=
		    this_rq->rt.highest_prio.curr)
			continue;

		/*
		 * We can potentially drop this_rq's lock in
		 * double_lock_balance, and another CPU could
		 * alter this_rq
		 */
		double_lock_balance(this_rq, src_rq);

		/*
		 * Are there still pullable RT tasks?
		 */
		if (src_rq->rt.rt_nr_running <= 1)
			goto skip;

		p = pick_next_highest_task_rt(src_rq, this_cpu);

		/*
		 * Do we have an RT task that preempts
		 * the to-be-scheduled task?
		 */
		if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
			WARN_ON(p == src_rq->curr);
			WARN_ON(!p->se.on_rq);

			/*
			 * There's a chance that p is higher in priority
			 * than what's currently running on its cpu.
			 * This is just that p is waking up and hasn't
			 * had a chance to schedule. We only pull
			 * p if it is lower in priority than the
			 * current task on the run queue
			 */
			if (p->prio < src_rq->curr->prio)
				goto skip;

			ret = 1;

			deactivate_task(src_rq, p, 0);
			set_task_cpu(p, this_cpu);
			activate_task(this_rq, p, 0);
			/*
			 * We continue with the search, just in
			 * case there's an even higher prio task
			 * in another runqueue. (low likelihood
			 * but possible)
			 */
		}
 skip:
		double_unlock_balance(this_rq, src_rq);
	}

	return ret;
}
static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
{
	/* Try to pull RT tasks here if we lower this rq's prio */
	if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)
		pull_rt_task(rq);
}

/*
 * assumes rq->lock is held
 */
static int needs_post_schedule_rt(struct rq *rq)
{
	return has_pushable_tasks(rq);
}

static void post_schedule_rt(struct rq *rq)
{
	/*
	 * This is only called if needs_post_schedule_rt() indicates that
	 * we need to push tasks away
	 */
	spin_lock_irq(&rq->lock);
	push_rt_tasks(rq);
	spin_unlock_irq(&rq->lock);
}
/*
 * If we are not running and we are not going to reschedule soon, we should
 * try to push tasks away now
 */
static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
{
	if (!task_running(rq, p) &&
	    !test_tsk_need_resched(rq->curr) &&
	    has_pushable_tasks(rq) &&
	    p->rt.nr_cpus_allowed > 1)
		push_rt_tasks(rq);
}
static unsigned long
load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
		unsigned long max_load_move,
		struct sched_domain *sd, enum cpu_idle_type idle,
		int *all_pinned, int *this_best_prio)
{
	/* don't touch RT tasks */
	return 0;
}

static int
move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
		 struct sched_domain *sd, enum cpu_idle_type idle)
{
	/* don't touch RT tasks */
	return 0;
}
static void set_cpus_allowed_rt(struct task_struct *p,
				const struct cpumask *new_mask)
{
	int weight = cpumask_weight(new_mask);

	BUG_ON(!rt_task(p));

	/*
	 * Update the migration status of the RQ if we have an RT task
	 * which is running AND changing its weight value.
	 */
	if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
		struct rq *rq = task_rq(p);

		if (!task_current(rq, p)) {
			/*
			 * Make sure we dequeue this task from the pushable list
			 * before going further.  It will either remain off of
			 * the list because we are no longer pushable, or it
			 * will be requeued.
			 */
			if (p->rt.nr_cpus_allowed > 1)
				dequeue_pushable_task(rq, p);

			/*
			 * Requeue if our weight is changing and still > 1
			 */
			if (weight > 1)
				enqueue_pushable_task(rq, p);

		}

		if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
			rq->rt.rt_nr_migratory++;
		} else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
			BUG_ON(!rq->rt.rt_nr_migratory);
			rq->rt.rt_nr_migratory--;
		}

		update_rt_migration(&rq->rt);
	}

	cpumask_copy(&p->cpus_allowed, new_mask);
	p->rt.nr_cpus_allowed = weight;
}
/* Assumes rq->lock is held */
static void rq_online_rt(struct rq *rq)
{
	if (rq->rt.overloaded)
		rt_set_overload(rq);

	__enable_runtime(rq);

	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
}

/* Assumes rq->lock is held */
static void rq_offline_rt(struct rq *rq)
{
	if (rq->rt.overloaded)
		rt_clear_overload(rq);

	__disable_runtime(rq);

	cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
}
/*
 * When switching from the rt queue, we bring ourselves to a position
 * that we might want to pull RT tasks from other runqueues.
 */
static void switched_from_rt(struct rq *rq, struct task_struct *p,
			     int running)
{
	/*
	 * If there are other RT tasks then we will reschedule
	 * and the scheduling of the other RT tasks will handle
	 * the balancing. But if we are the last RT task
	 * we may need to handle the pulling of RT tasks
	 * now.
	 */
	if (!rq->rt.rt_nr_running)
		pull_rt_task(rq);
}
static inline void init_sched_rt_class(void)
{
	unsigned int i;

	for_each_possible_cpu(i)
		alloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
					GFP_KERNEL, cpu_to_node(i));
}
#endif /* CONFIG_SMP */
/*
 * When switching a task to RT, we may overload the runqueue
 * with RT tasks. In this case we try to push them off to
 * other runqueues.
 */
static void switched_to_rt(struct rq *rq, struct task_struct *p,
			   int running)
{
	int check_resched = 1;

	/*
	 * If we are already running, then there's nothing
	 * that needs to be done. But if we are not running
	 * we may need to preempt the current running task.
	 * If that current running task is also an RT task
	 * then see if we can move to another run queue.
	 */
	if (!running) {
#ifdef CONFIG_SMP
		if (rq->rt.overloaded && push_rt_task(rq) &&
		    /* Don't resched if we changed runqueues */
		    rq != task_rq(p))
			check_resched = 0;
#endif /* CONFIG_SMP */
		if (check_resched && p->prio < rq->curr->prio)
			resched_task(rq->curr);
	}
}
/*
 * Priority of the task has changed. This may cause
 * us to initiate a push or pull.
 */
static void prio_changed_rt(struct rq *rq, struct task_struct *p,
			    int oldprio, int running)
{
	if (running) {
#ifdef CONFIG_SMP
		/*
		 * If our priority decreases while running, we
		 * may need to pull tasks to this runqueue.
		 */
		if (oldprio < p->prio)
			pull_rt_task(rq);
		/*
		 * If there's a higher priority task waiting to run
		 * then reschedule. Note, the above pull_rt_task
		 * can release the rq lock and p could migrate.
		 * Only reschedule if p is still on the same runqueue.
		 */
		if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
			resched_task(p);
#else
		/* For UP simply resched on drop of prio */
		if (oldprio < p->prio)
			resched_task(p);
#endif /* CONFIG_SMP */
	} else {
		/*
		 * This task is not running, but if it is
		 * greater than the current running task
		 * then reschedule.
		 */
		if (p->prio < rq->curr->prio)
			resched_task(rq->curr);
	}
}
static void watchdog(struct rq *rq, struct task_struct *p)
{
	unsigned long soft, hard;

	if (!p->signal)
		return;

	soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur;
	hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max;

	if (soft != RLIM_INFINITY) {
		unsigned long next;

		p->rt.timeout++;
		next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
		if (p->rt.timeout > next)
			p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
	}
}
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
	update_curr_rt(rq);

	watchdog(rq, p);

	/*
	 * RR tasks need a special form of timeslice management.
	 * FIFO tasks have no timeslices.
	 */
	if (p->policy != SCHED_RR)
		return;

	if (--p->rt.time_slice)
		return;

	p->rt.time_slice = DEF_TIMESLICE;

	/*
	 * Requeue to the end of queue if we are not the only element
	 * on the queue:
	 */
	if (p->rt.run_list.prev != p->rt.run_list.next) {
		requeue_task_rt(rq, p, 0);
		set_tsk_need_resched(p);
	}
}
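/*
 * Note (added commentary): only SCHED_RR consumes a timeslice here.
 * DEF_TIMESLICE is 100ms worth of ticks (100 * HZ / 1000), so with
 * HZ=1000 an RR task is requeued behind its equal-priority peers every
 * 100 ticks; SCHED_FIFO tasks return early above and run until they
 * block, yield or are preempted.
 */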
static void set_curr_task_rt(struct rq *rq)
{
	struct task_struct *p = rq->curr;

	p->se.exec_start = rq->clock;

	/* The running task is never eligible for pushing */
	dequeue_pushable_task(rq, p);
}
static const struct sched_class rt_sched_class = {
	.next			= &fair_sched_class,
	.enqueue_task		= enqueue_task_rt,
	.dequeue_task		= dequeue_task_rt,
	.yield_task		= yield_task_rt,

	.check_preempt_curr	= check_preempt_curr_rt,

	.pick_next_task		= pick_next_task_rt,
	.put_prev_task		= put_prev_task_rt,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_rt,

	.load_balance		= load_balance_rt,
	.move_one_task		= move_one_task_rt,
	.set_cpus_allowed	= set_cpus_allowed_rt,
	.rq_online		= rq_online_rt,
	.rq_offline		= rq_offline_rt,
	.pre_schedule		= pre_schedule_rt,
	.needs_post_schedule	= needs_post_schedule_rt,
	.post_schedule		= post_schedule_rt,
	.task_wake_up		= task_wake_up_rt,
	.switched_from		= switched_from_rt,
#endif

	.set_curr_task		= set_curr_task_rt,
	.task_tick		= task_tick_rt,

	.prio_changed		= prio_changed_rt,
	.switched_to		= switched_to_rt,
};
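/*
 * Note (added commentary): sched classes form a strict hierarchy via
 * .next; rt_sched_class sits above fair_sched_class, so the core
 * scheduler always offers the CPU to runnable RT tasks before any
 * CFS task is considered.
 */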
#ifdef CONFIG_SCHED_DEBUG
extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);

static void print_rt_stats(struct seq_file *m, int cpu)
{
	struct rt_rq *rt_rq;

	rcu_read_lock();
	for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu))
		print_rt_rq(m, cpu, rt_rq);
	rcu_read_unlock();
}
#endif /* CONFIG_SCHED_DEBUG */