sched: fix build error in kernel/sched_rt.c when RT_GROUP_SCHED && !SMP
1 /*
2  * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
3  * policies)
4  */
5
6 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
7 {
8         return container_of(rt_se, struct task_struct, rt);
9 }
10
11 #ifdef CONFIG_RT_GROUP_SCHED
12
13 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
14 {
15         return rt_rq->rq;
16 }
17
18 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
19 {
20         return rt_se->rt_rq;
21 }
22
23 #else /* CONFIG_RT_GROUP_SCHED */
24
25 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
26 {
27         return container_of(rt_rq, struct rq, rt);
28 }
29
30 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
31 {
32         struct task_struct *p = rt_task_of(rt_se);
33         struct rq *rq = task_rq(p);
34
35         return &rq->rt;
36 }
37
38 #endif /* CONFIG_RT_GROUP_SCHED */
39
40 #ifdef CONFIG_SMP
41
42 static inline int rt_overloaded(struct rq *rq)
43 {
44         return atomic_read(&rq->rd->rto_count);
45 }
46
47 static inline void rt_set_overload(struct rq *rq)
48 {
49         if (!rq->online)
50                 return;
51
52         cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
53         /*
54          * Make sure the mask is visible before we set
55          * the overload count. That is checked to determine
56          * if we should look at the mask. It would be a shame
57          * if we looked at the mask, but the mask was not
58          * updated yet.
59          */
60         wmb();
61         atomic_inc(&rq->rd->rto_count);
62 }
63
64 static inline void rt_clear_overload(struct rq *rq)
65 {
66         if (!rq->online)
67                 return;
68
69         /* the order here really doesn't matter */
70         atomic_dec(&rq->rd->rto_count);
71         cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
72 }
73
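/*
 * Update the overload state of this runqueue: it is overloaded when it
 * has more than one runnable RT task and at least one of them may
 * migrate to another CPU.
 */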
74 static void update_rt_migration(struct rt_rq *rt_rq)
75 {
76         if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) {
77                 if (!rt_rq->overloaded) {
78                         rt_set_overload(rq_of_rt_rq(rt_rq));
79                         rt_rq->overloaded = 1;
80                 }
81         } else if (rt_rq->overloaded) {
82                 rt_clear_overload(rq_of_rt_rq(rt_rq));
83                 rt_rq->overloaded = 0;
84         }
85 }
86
87 static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
88 {
89         if (rt_se->nr_cpus_allowed > 1)
90                 rt_rq->rt_nr_migratory++;
91
92         update_rt_migration(rt_rq);
93 }
94
95 static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
96 {
97         if (rt_se->nr_cpus_allowed > 1)
98                 rt_rq->rt_nr_migratory--;
99
100         update_rt_migration(rt_rq);
101 }
102
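/*
 * (Re)insert @p into this runqueue's priority-ordered list of pushable
 * tasks; the node is removed and re-initialised first so a changed
 * priority lands in the right position.
 */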
103 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
104 {
105         plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
106         plist_node_init(&p->pushable_tasks, p->prio);
107         plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
108 }
109
110 static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
111 {
112         plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
113 }
114
115 #else
116
117 static inline
118 void enqueue_pushable_task(struct rq *rq, struct task_struct *p) {}
119 static inline
120 void dequeue_pushable_task(struct rq *rq, struct task_struct *p) {}
121 static inline
122 void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
123 static inline
124 void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
125
126 #endif /* CONFIG_SMP */
127
128 static inline int on_rt_rq(struct sched_rt_entity *rt_se)
129 {
130         return !list_empty(&rt_se->run_list);
131 }
132
133 #ifdef CONFIG_RT_GROUP_SCHED
134
135 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
136 {
137         if (!rt_rq->tg)
138                 return RUNTIME_INF;
139
140         return rt_rq->rt_runtime;
141 }
142
143 static inline u64 sched_rt_period(struct rt_rq *rt_rq)
144 {
145         return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
146 }
147
148 #define for_each_leaf_rt_rq(rt_rq, rq) \
149         list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
150
151 #define for_each_sched_rt_entity(rt_se) \
152         for (; rt_se; rt_se = rt_se->parent)
153
154 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
155 {
156         return rt_se->my_q;
157 }
158
159 static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
160 static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
161
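/*
 * Re-enqueue the group entity that represents @rt_rq in its parent and
 * reschedule if the group now holds a higher-priority task than the
 * one currently running on this CPU.
 */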
162 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
163 {
164         struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
165         struct sched_rt_entity *rt_se = rt_rq->rt_se;
166
167         if (rt_rq->rt_nr_running) {
168                 if (rt_se && !on_rt_rq(rt_se))
169                         enqueue_rt_entity(rt_se);
170                 if (rt_rq->highest_prio.curr < curr->prio)
171                         resched_task(curr);
172         }
173 }
174
175 static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
176 {
177         struct sched_rt_entity *rt_se = rt_rq->rt_se;
178
179         if (rt_se && on_rt_rq(rt_se))
180                 dequeue_rt_entity(rt_se);
181 }
182
183 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
184 {
185         return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
186 }
187
188 static int rt_se_boosted(struct sched_rt_entity *rt_se)
189 {
190         struct rt_rq *rt_rq = group_rt_rq(rt_se);
191         struct task_struct *p;
192
193         if (rt_rq)
194                 return !!rt_rq->rt_nr_boosted;
195
196         p = rt_task_of(rt_se);
197         return p->prio != p->normal_prio;
198 }
199
200 #ifdef CONFIG_SMP
201 static inline const struct cpumask *sched_rt_period_mask(void)
202 {
203         return cpu_rq(smp_processor_id())->rd->span;
204 }
205 #else
206 static inline const struct cpumask *sched_rt_period_mask(void)
207 {
208         return cpu_online_mask;
209 }
210 #endif
211
212 static inline
213 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
214 {
215         return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
216 }
217
218 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
219 {
220         return &rt_rq->tg->rt_bandwidth;
221 }
222
223 #else /* !CONFIG_RT_GROUP_SCHED */
224
225 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
226 {
227         return rt_rq->rt_runtime;
228 }
229
230 static inline u64 sched_rt_period(struct rt_rq *rt_rq)
231 {
232         return ktime_to_ns(def_rt_bandwidth.rt_period);
233 }
234
235 #define for_each_leaf_rt_rq(rt_rq, rq) \
236         for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
237
238 #define for_each_sched_rt_entity(rt_se) \
239         for (; rt_se; rt_se = NULL)
240
241 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
242 {
243         return NULL;
244 }
245
246 static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
247 {
248         if (rt_rq->rt_nr_running)
249                 resched_task(rq_of_rt_rq(rt_rq)->curr);
250 }
251
252 static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
253 {
254 }
255
256 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
257 {
258         return rt_rq->rt_throttled;
259 }
260
261 static inline const struct cpumask *sched_rt_period_mask(void)
262 {
263         return cpu_online_mask;
264 }
265
266 static inline
267 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
268 {
269         return &cpu_rq(cpu)->rt;
270 }
271
272 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
273 {
274         return &def_rt_bandwidth;
275 }
276
277 #endif /* CONFIG_RT_GROUP_SCHED */
278
279 #ifdef CONFIG_SMP
280 /*
281  * We ran out of runtime, see if we can borrow some from our neighbours.
282  */
283 static int do_balance_runtime(struct rt_rq *rt_rq)
284 {
285         struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
286         struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
287         int i, weight, more = 0;
288         u64 rt_period;
289
290         weight = cpumask_weight(rd->span);
291
292         spin_lock(&rt_b->rt_runtime_lock);
293         rt_period = ktime_to_ns(rt_b->rt_period);
294         for_each_cpu(i, rd->span) {
295                 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
296                 s64 diff;
297
298                 if (iter == rt_rq)
299                         continue;
300
301                 spin_lock(&iter->rt_runtime_lock);
302                 /*
303                  * Either all rqs have inf runtime and there's nothing to steal
304                  * or __disable_runtime() below sets a specific rq to inf to
305                  * indicate it's been disabled and disallow stealing.
306                  */
307                 if (iter->rt_runtime == RUNTIME_INF)
308                         goto next;
309
310                 /*
311                  * From runqueues with spare time, take 1/n part of their
312                  * spare time, but no more than our period.
313                  */
314                 diff = iter->rt_runtime - iter->rt_time;
315                 if (diff > 0) {
316                         diff = div_u64((u64)diff, weight);
317                         if (rt_rq->rt_runtime + diff > rt_period)
318                                 diff = rt_period - rt_rq->rt_runtime;
319                         iter->rt_runtime -= diff;
320                         rt_rq->rt_runtime += diff;
321                         more = 1;
322                         if (rt_rq->rt_runtime == rt_period) {
323                                 spin_unlock(&iter->rt_runtime_lock);
324                                 break;
325                         }
326                 }
327 next:
328                 spin_unlock(&iter->rt_runtime_lock);
329         }
330         spin_unlock(&rt_b->rt_runtime_lock);
331
332         return more;
333 }
334
335 /*
336  * Ensure this RQ takes back all the runtime it lent to its neighbours.
337  */
338 static void __disable_runtime(struct rq *rq)
339 {
340         struct root_domain *rd = rq->rd;
341         struct rt_rq *rt_rq;
342
343         if (unlikely(!scheduler_running))
344                 return;
345
346         for_each_leaf_rt_rq(rt_rq, rq) {
347                 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
348                 s64 want;
349                 int i;
350
351                 spin_lock(&rt_b->rt_runtime_lock);
352                 spin_lock(&rt_rq->rt_runtime_lock);
353                 /*
354                  * Either we're all inf and nobody needs to borrow, or we're
355                  * already disabled and thus have nothing to do, or we have
356                  * exactly the right amount of runtime to take out.
357                  */
358                 if (rt_rq->rt_runtime == RUNTIME_INF ||
359                                 rt_rq->rt_runtime == rt_b->rt_runtime)
360                         goto balanced;
361                 spin_unlock(&rt_rq->rt_runtime_lock);
362
363                 /*
364                  * Calculate the difference between what we started out with
365                  * and what we currently have; that's the amount of runtime
366                  * we lent and now have to reclaim.
367                  */
368                 want = rt_b->rt_runtime - rt_rq->rt_runtime;
369
370                 /*
371                  * Greedy reclaim, take back as much as we can.
372                  */
373                 for_each_cpu(i, rd->span) {
374                         struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
375                         s64 diff;
376
377                         /*
378                          * Can't reclaim from ourselves or disabled runqueues.
379                          */
380                         if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
381                                 continue;
382
383                         spin_lock(&iter->rt_runtime_lock);
384                         if (want > 0) {
385                                 diff = min_t(s64, iter->rt_runtime, want);
386                                 iter->rt_runtime -= diff;
387                                 want -= diff;
388                         } else {
389                                 iter->rt_runtime -= want;
390                                 want -= want;
391                         }
392                         spin_unlock(&iter->rt_runtime_lock);
393
394                         if (!want)
395                                 break;
396                 }
397
398                 spin_lock(&rt_rq->rt_runtime_lock);
399                 /*
400                  * We cannot be left wanting - that would mean some runtime
401                  * leaked out of the system.
402                  */
403                 BUG_ON(want);
404 balanced:
405                 /*
406                  * Disable all the borrow logic by pretending we have inf
407                  * runtime - in which case borrowing doesn't make sense.
408                  */
409                 rt_rq->rt_runtime = RUNTIME_INF;
410                 spin_unlock(&rt_rq->rt_runtime_lock);
411                 spin_unlock(&rt_b->rt_runtime_lock);
412         }
413 }
414
415 static void disable_runtime(struct rq *rq)
416 {
417         unsigned long flags;
418
419         spin_lock_irqsave(&rq->lock, flags);
420         __disable_runtime(rq);
421         spin_unlock_irqrestore(&rq->lock, flags);
422 }
423
424 static void __enable_runtime(struct rq *rq)
425 {
426         struct rt_rq *rt_rq;
427
428         if (unlikely(!scheduler_running))
429                 return;
430
431         /*
432          * Reset each runqueue's bandwidth settings
433          */
434         for_each_leaf_rt_rq(rt_rq, rq) {
435                 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
436
437                 spin_lock(&rt_b->rt_runtime_lock);
438                 spin_lock(&rt_rq->rt_runtime_lock);
439                 rt_rq->rt_runtime = rt_b->rt_runtime;
440                 rt_rq->rt_time = 0;
441                 rt_rq->rt_throttled = 0;
442                 spin_unlock(&rt_rq->rt_runtime_lock);
443                 spin_unlock(&rt_b->rt_runtime_lock);
444         }
445 }
446
447 static void enable_runtime(struct rq *rq)
448 {
449         unsigned long flags;
450
451         spin_lock_irqsave(&rq->lock, flags);
452         __enable_runtime(rq);
453         spin_unlock_irqrestore(&rq->lock, flags);
454 }
455
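/*
 * Borrow runtime from other CPUs when this rt_rq has exhausted its own.
 * Called with rt_rq->rt_runtime_lock held; the lock is dropped around
 * the actual balancing.
 */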
456 static int balance_runtime(struct rt_rq *rt_rq)
457 {
458         int more = 0;
459
460         if (rt_rq->rt_time > rt_rq->rt_runtime) {
461                 spin_unlock(&rt_rq->rt_runtime_lock);
462                 more = do_balance_runtime(rt_rq);
463                 spin_lock(&rt_rq->rt_runtime_lock);
464         }
465
466         return more;
467 }
468 #else /* !CONFIG_SMP */
469 static inline int balance_runtime(struct rt_rq *rt_rq)
470 {
471         return 0;
472 }
473 #endif /* CONFIG_SMP */
474
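/*
 * Runs from the rt period timer: for every rt_rq in the period mask,
 * refresh the consumed runtime, un-throttle and re-enqueue queues that
 * have runtime available again, and return whether they are all idle.
 */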
475 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
476 {
477         int i, idle = 1;
478         const struct cpumask *span;
479
480         if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
481                 return 1;
482
483         span = sched_rt_period_mask();
484         for_each_cpu(i, span) {
485                 int enqueue = 0;
486                 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
487                 struct rq *rq = rq_of_rt_rq(rt_rq);
488
489                 spin_lock(&rq->lock);
490                 if (rt_rq->rt_time) {
491                         u64 runtime;
492
493                         spin_lock(&rt_rq->rt_runtime_lock);
494                         if (rt_rq->rt_throttled)
495                                 balance_runtime(rt_rq);
496                         runtime = rt_rq->rt_runtime;
497                         rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
498                         if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
499                                 rt_rq->rt_throttled = 0;
500                                 enqueue = 1;
501                         }
502                         if (rt_rq->rt_time || rt_rq->rt_nr_running)
503                                 idle = 0;
504                         spin_unlock(&rt_rq->rt_runtime_lock);
505                 } else if (rt_rq->rt_nr_running)
506                         idle = 0;
507
508                 if (enqueue)
509                         sched_rt_rq_enqueue(rt_rq);
510                 spin_unlock(&rq->lock);
511         }
512
513         return idle;
514 }
515
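/*
 * Effective priority of an RT entity: for a group entity this is the
 * priority of the highest-priority task within the group, otherwise it
 * is the task's own priority.
 */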
516 static inline int rt_se_prio(struct sched_rt_entity *rt_se)
517 {
518 #ifdef CONFIG_RT_GROUP_SCHED
519         struct rt_rq *rt_rq = group_rt_rq(rt_se);
520
521         if (rt_rq)
522                 return rt_rq->highest_prio.curr;
523 #endif
524
525         return rt_task_of(rt_se)->prio;
526 }
527
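/*
 * Check whether @rt_rq has consumed more runtime than it is allowed in
 * the current period; if so, throttle it and, unless it is boosted by
 * priority inheritance, dequeue it and return 1 so the caller can
 * reschedule.
 */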
528 static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
529 {
530         u64 runtime = sched_rt_runtime(rt_rq);
531
532         if (rt_rq->rt_throttled)
533                 return rt_rq_throttled(rt_rq);
534
535         if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq))
536                 return 0;
537
538         balance_runtime(rt_rq);
539         runtime = sched_rt_runtime(rt_rq);
540         if (runtime == RUNTIME_INF)
541                 return 0;
542
543         if (rt_rq->rt_time > runtime) {
544                 rt_rq->rt_throttled = 1;
545                 if (rt_rq_throttled(rt_rq)) {
546                         sched_rt_rq_dequeue(rt_rq);
547                         return 1;
548                 }
549         }
550
551         return 0;
552 }
553
554 /*
555  * Update the current task's runtime statistics. Skip current tasks that
556  * are not in our scheduling class.
557  */
558 static void update_curr_rt(struct rq *rq)
559 {
560         struct task_struct *curr = rq->curr;
561         struct sched_rt_entity *rt_se = &curr->rt;
562         struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
563         u64 delta_exec;
564
565         if (!task_has_rt_policy(curr))
566                 return;
567
568         delta_exec = rq->clock - curr->se.exec_start;
569         if (unlikely((s64)delta_exec < 0))
570                 delta_exec = 0;
571
572         schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
573
574         curr->se.sum_exec_runtime += delta_exec;
575         account_group_exec_runtime(curr, delta_exec);
576
577         curr->se.exec_start = rq->clock;
578         cpuacct_charge(curr, delta_exec);
579
580         if (!rt_bandwidth_enabled())
581                 return;
582
583         for_each_sched_rt_entity(rt_se) {
584                 rt_rq = rt_rq_of_se(rt_se);
585
586                 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
587                         spin_lock(&rt_rq->rt_runtime_lock);
588                         rt_rq->rt_time += delta_exec;
589                         if (sched_rt_runtime_exceeded(rt_rq))
590                                 resched_task(curr);
591                         spin_unlock(&rt_rq->rt_runtime_lock);
592                 }
593         }
594 }
595
596 #if defined CONFIG_SMP
597
598 static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
599
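/*
 * Priority of the second-highest runnable RT task on @rq, or
 * MAX_RT_PRIO if there is no such task.
 */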
600 static inline int next_prio(struct rq *rq)
601 {
602         struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
603
604         if (next && rt_prio(next->prio))
605                 return next->prio;
606         else
607                 return MAX_RT_PRIO;
608 }
609
610 static void
611 inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
612 {
613         struct rq *rq = rq_of_rt_rq(rt_rq);
614
615         if (prio < prev_prio) {
616
617                 /*
618                  * If the new task is higher in priority than anything on the
619                  * run-queue, we know that the previous high becomes our
620                  * next-highest.
621                  */
622                 rt_rq->highest_prio.next = prev_prio;
623
624                 if (rq->online)
625                         cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
626
627         } else if (prio == rt_rq->highest_prio.curr)
628                 /*
629                  * If the next task is equal in priority to the highest on
630                  * the run-queue, then we implicitly know that the next highest
631                  * task cannot be any lower than current
632                  */
633                 rt_rq->highest_prio.next = prio;
634         else if (prio < rt_rq->highest_prio.next)
635                 /*
636                  * Otherwise, we need to recompute next-highest
637                  */
638                 rt_rq->highest_prio.next = next_prio(rq);
639 }
640
641 static void
642 dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
643 {
644         struct rq *rq = rq_of_rt_rq(rt_rq);
645
646         if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
647                 rt_rq->highest_prio.next = next_prio(rq);
648
649         if (rq->online && rt_rq->highest_prio.curr != prev_prio)
650                 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
651 }
652
653 #else /* CONFIG_SMP */
654
655 static inline
656 void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
657 static inline
658 void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
659
660 #endif /* CONFIG_SMP */
661
662 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
663 static void
664 inc_rt_prio(struct rt_rq *rt_rq, int prio)
665 {
666         int prev_prio = rt_rq->highest_prio.curr;
667
668         if (prio < prev_prio)
669                 rt_rq->highest_prio.curr = prio;
670
671         inc_rt_prio_smp(rt_rq, prio, prev_prio);
672 }
673
674 static void
675 dec_rt_prio(struct rt_rq *rt_rq, int prio)
676 {
677         int prev_prio = rt_rq->highest_prio.curr;
678
679         if (rt_rq->rt_nr_running) {
680
681                 WARN_ON(prio < prev_prio);
682
683                 /*
684                  * This may have been our highest task, and therefore
685                  * we may have some recomputation to do
686                  */
687                 if (prio == prev_prio) {
688                         struct rt_prio_array *array = &rt_rq->active;
689
690                         rt_rq->highest_prio.curr =
691                                 sched_find_first_bit(array->bitmap);
692                 }
693
694         } else
695                 rt_rq->highest_prio.curr = MAX_RT_PRIO;
696
697         dec_rt_prio_smp(rt_rq, prio, prev_prio);
698 }
699
700 #else
701
702 static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
703 static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
704
705 #endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
706
707 #ifdef CONFIG_RT_GROUP_SCHED
708
709 static void
710 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
711 {
712         if (rt_se_boosted(rt_se))
713                 rt_rq->rt_nr_boosted++;
714
715         if (rt_rq->tg)
716                 start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
717 }
718
719 static void
720 dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
721 {
722         if (rt_se_boosted(rt_se))
723                 rt_rq->rt_nr_boosted--;
724
725         WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
726 }
727
728 #else /* CONFIG_RT_GROUP_SCHED */
729
730 static void
731 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
732 {
733         start_rt_bandwidth(&def_rt_bandwidth);
734 }
735
736 static inline
737 void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
738
739 #endif /* CONFIG_RT_GROUP_SCHED */
740
741 static inline
742 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
743 {
744         int prio = rt_se_prio(rt_se);
745
746         WARN_ON(!rt_prio(prio));
747         rt_rq->rt_nr_running++;
748
749         inc_rt_prio(rt_rq, prio);
750         inc_rt_migration(rt_se, rt_rq);
751         inc_rt_group(rt_se, rt_rq);
752 }
753
754 static inline
755 void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
756 {
757         WARN_ON(!rt_prio(rt_se_prio(rt_se)));
758         WARN_ON(!rt_rq->rt_nr_running);
759         rt_rq->rt_nr_running--;
760
761         dec_rt_prio(rt_rq, rt_se_prio(rt_se));
762         dec_rt_migration(rt_se, rt_rq);
763         dec_rt_group(rt_se, rt_rq);
764 }
765
766 static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
767 {
768         struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
769         struct rt_prio_array *array = &rt_rq->active;
770         struct rt_rq *group_rq = group_rt_rq(rt_se);
771         struct list_head *queue = array->queue + rt_se_prio(rt_se);
772
773         /*
774          * Don't enqueue the group if it's throttled, or when empty.
775          * The latter is a consequence of the former when a child group
776          * gets throttled and the current group doesn't have any other
777          * active members.
778          */
779         if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
780                 return;
781
782         list_add_tail(&rt_se->run_list, queue);
783         __set_bit(rt_se_prio(rt_se), array->bitmap);
784
785         inc_rt_tasks(rt_se, rt_rq);
786 }
787
788 static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
789 {
790         struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
791         struct rt_prio_array *array = &rt_rq->active;
792
793         list_del_init(&rt_se->run_list);
794         if (list_empty(array->queue + rt_se_prio(rt_se)))
795                 __clear_bit(rt_se_prio(rt_se), array->bitmap);
796
797         dec_rt_tasks(rt_se, rt_rq);
798 }
799
800 /*
801  * Because the prio of an upper entry depends on the lower
802  * entries, we must remove entries top-down.
803  */
804 static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
805 {
806         struct sched_rt_entity *back = NULL;
807
808         for_each_sched_rt_entity(rt_se) {
809                 rt_se->back = back;
810                 back = rt_se;
811         }
812
813         for (rt_se = back; rt_se; rt_se = rt_se->back) {
814                 if (on_rt_rq(rt_se))
815                         __dequeue_rt_entity(rt_se);
816         }
817 }
818
819 static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
820 {
821         dequeue_rt_stack(rt_se);
822         for_each_sched_rt_entity(rt_se)
823                 __enqueue_rt_entity(rt_se);
824 }
825
826 static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
827 {
828         dequeue_rt_stack(rt_se);
829
830         for_each_sched_rt_entity(rt_se) {
831                 struct rt_rq *rt_rq = group_rt_rq(rt_se);
832
833                 if (rt_rq && rt_rq->rt_nr_running)
834                         __enqueue_rt_entity(rt_se);
835         }
836 }
837
838 /*
839  * Adding/removing a task to/from a priority array:
840  */
841 static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
842 {
843         struct sched_rt_entity *rt_se = &p->rt;
844
845         if (wakeup)
846                 rt_se->timeout = 0;
847
848         enqueue_rt_entity(rt_se);
849
850         if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
851                 enqueue_pushable_task(rq, p);
852
853         inc_cpu_load(rq, p->se.load.weight);
854 }
855
856 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
857 {
858         struct sched_rt_entity *rt_se = &p->rt;
859
860         update_curr_rt(rq);
861         dequeue_rt_entity(rt_se);
862
863         dequeue_pushable_task(rq, p);
864
865         dec_cpu_load(rq, p->se.load.weight);
866 }
867
868 /*
869  * Put task to the end of the run list without the overhead of dequeue
870  * followed by enqueue.
871  */
872 static void
873 requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
874 {
875         if (on_rt_rq(rt_se)) {
876                 struct rt_prio_array *array = &rt_rq->active;
877                 struct list_head *queue = array->queue + rt_se_prio(rt_se);
878
879                 if (head)
880                         list_move(&rt_se->run_list, queue);
881                 else
882                         list_move_tail(&rt_se->run_list, queue);
883         }
884 }
885
886 static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
887 {
888         struct sched_rt_entity *rt_se = &p->rt;
889         struct rt_rq *rt_rq;
890
891         for_each_sched_rt_entity(rt_se) {
892                 rt_rq = rt_rq_of_se(rt_se);
893                 requeue_rt_entity(rt_rq, rt_se, head);
894         }
895 }
896
897 static void yield_task_rt(struct rq *rq)
898 {
899         requeue_task_rt(rq, rq->curr, 0);
900 }
901
902 #ifdef CONFIG_SMP
903 static int find_lowest_rq(struct task_struct *task);
904
905 static int select_task_rq_rt(struct task_struct *p, int sync)
906 {
907         struct rq *rq = task_rq(p);
908
909         /*
910          * If the current task is an RT task, then
911          * try to see if we can wake this RT task up on another
912          * runqueue. Otherwise simply start this RT task
913          * on its current runqueue.
914          *
915          * We want to avoid overloading runqueues, even if
916          * the RT task is of higher priority than the current RT task.
917          * RT tasks behave differently than other tasks. If
918          * one gets preempted, we try to push it off to another queue.
919          * So trying to keep a preempting RT task on the same
920          * cache hot CPU will force the running RT task to
921          * a cold CPU. So we waste all the cache for the lower
922          * RT task in hopes of saving some of an RT task
923          * that is just being woken and probably will have
924          * cold cache anyway.
925          */
926         if (unlikely(rt_task(rq->curr)) &&
927             (p->rt.nr_cpus_allowed > 1)) {
928                 int cpu = find_lowest_rq(p);
929
930                 return (cpu == -1) ? task_cpu(p) : cpu;
931         }
932
933         /*
934          * Otherwise, just let it ride on the affined RQ and the
935          * post-schedule router will push the preempted task away
936          */
937         return task_cpu(p);
938 }
939
940 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
941 {
942         cpumask_var_t mask;
943
944         if (rq->curr->rt.nr_cpus_allowed == 1)
945                 return;
946
947         if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
948                 return;
949
950         if (p->rt.nr_cpus_allowed != 1
951             && cpupri_find(&rq->rd->cpupri, p, mask))
952                 goto free;
953
954         if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask))
955                 goto free;
956
957         /*
958          * There appear to be other cpus that can accept
959          * current and none to run 'p', so let's reschedule
960          * to try and push current away:
961          */
962         requeue_task_rt(rq, p, 1);
963         resched_task(rq->curr);
964 free:
965         free_cpumask_var(mask);
966 }
967
968 #endif /* CONFIG_SMP */
969
970 /*
971  * Preempt the current task with a newly woken task if needed:
972  */
973 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync)
974 {
975         if (p->prio < rq->curr->prio) {
976                 resched_task(rq->curr);
977                 return;
978         }
979
980 #ifdef CONFIG_SMP
981         /*
982          * If:
983          *
984          * - the newly woken task is of equal priority to the current task
985          * - the newly woken task is non-migratable while current is migratable
986          * - current will be preempted on the next reschedule
987          *
988          * we should check to see if current can readily move to a different
989          * cpu.  If so, we will reschedule to allow the push logic to try
990          * to move current somewhere else, making room for our non-migratable
991          * task.
992          */
993         if (p->prio == rq->curr->prio && !need_resched())
994                 check_preempt_equal_prio(rq, p);
995 #endif
996 }
997
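/*
 * Return the first entity on the highest-priority non-empty queue of
 * @rt_rq.
 */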
998 static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
999                                                    struct rt_rq *rt_rq)
1000 {
1001         struct rt_prio_array *array = &rt_rq->active;
1002         struct sched_rt_entity *next = NULL;
1003         struct list_head *queue;
1004         int idx;
1005
1006         idx = sched_find_first_bit(array->bitmap);
1007         BUG_ON(idx >= MAX_RT_PRIO);
1008
1009         queue = array->queue + idx;
1010         next = list_entry(queue->next, struct sched_rt_entity, run_list);
1011
1012         return next;
1013 }
1014
1015 static struct task_struct *_pick_next_task_rt(struct rq *rq)
1016 {
1017         struct sched_rt_entity *rt_se;
1018         struct task_struct *p;
1019         struct rt_rq *rt_rq;
1020
1021         rt_rq = &rq->rt;
1022
1023         if (unlikely(!rt_rq->rt_nr_running))
1024                 return NULL;
1025
1026         if (rt_rq_throttled(rt_rq))
1027                 return NULL;
1028
1029         do {
1030                 rt_se = pick_next_rt_entity(rq, rt_rq);
1031                 BUG_ON(!rt_se);
1032                 rt_rq = group_rt_rq(rt_se);
1033         } while (rt_rq);
1034
1035         p = rt_task_of(rt_se);
1036         p->se.exec_start = rq->clock;
1037
1038         return p;
1039 }
1040
1041 static struct task_struct *pick_next_task_rt(struct rq *rq)
1042 {
1043         struct task_struct *p = _pick_next_task_rt(rq);
1044
1045         /* The running task is never eligible for pushing */
1046         if (p)
1047                 dequeue_pushable_task(rq, p);
1048
1049         return p;
1050 }
1051
1052 static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1053 {
1054         update_curr_rt(rq);
1055         p->se.exec_start = 0;
1056
1057         /*
1058          * The previous task needs to be made eligible for pushing
1059          * if it is still active
1060          */
1061         if (p->se.on_rq && p->rt.nr_cpus_allowed > 1)
1062                 enqueue_pushable_task(rq, p);
1063 }
1064
1065 #ifdef CONFIG_SMP
1066
1067 /* Only try algorithms three times */
1068 #define RT_MAX_TRIES 3
1069
1070 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
1071
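/*
 * A task qualifies for being pushed/pulled if it is not currently
 * running, may run on @cpu (when one is specified) and is allowed on
 * more than one CPU.
 */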
1072 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1073 {
1074         if (!task_running(rq, p) &&
1075             (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
1076             (p->rt.nr_cpus_allowed > 1))
1077                 return 1;
1078         return 0;
1079 }
1080
1081 /* Return the second highest RT task, NULL otherwise */
1082 static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1083 {
1084         struct task_struct *next = NULL;
1085         struct sched_rt_entity *rt_se;
1086         struct rt_prio_array *array;
1087         struct rt_rq *rt_rq;
1088         int idx;
1089
1090         for_each_leaf_rt_rq(rt_rq, rq) {
1091                 array = &rt_rq->active;
1092                 idx = sched_find_first_bit(array->bitmap);
1093  next_idx:
1094                 if (idx >= MAX_RT_PRIO)
1095                         continue;
1096                 if (next && next->prio < idx)
1097                         continue;
1098                 list_for_each_entry(rt_se, array->queue + idx, run_list) {
1099                         struct task_struct *p = rt_task_of(rt_se);
1100                         if (pick_rt_task(rq, p, cpu)) {
1101                                 next = p;
1102                                 break;
1103                         }
1104                 }
1105                 if (!next) {
1106                         idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
1107                         goto next_idx;
1108                 }
1109         }
1110
1111         return next;
1112 }
1113
1114 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1115
1116 static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
1117 {
1118         int first;
1119
1120         /* "this_cpu" is cheaper to preempt than a remote processor */
1121         if ((this_cpu != -1) && cpu_isset(this_cpu, *mask))
1122                 return this_cpu;
1123
1124         first = first_cpu(*mask);
1125         if (first != NR_CPUS)
1126                 return first;
1127
1128         return -1;
1129 }
1130
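/*
 * Find a CPU running at a lower RT priority where @task could run,
 * preferring CPUs that are topologically close to its current one.
 * Returns -1 if the task cannot migrate or no suitable CPU was found.
 */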
1131 static int find_lowest_rq(struct task_struct *task)
1132 {
1133         struct sched_domain *sd;
1134         struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
1135         int this_cpu = smp_processor_id();
1136         int cpu      = task_cpu(task);
1137
1138         if (task->rt.nr_cpus_allowed == 1)
1139                 return -1; /* No other targets possible */
1140
1141         if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
1142                 return -1; /* No targets found */
1143
1144         /*
1145          * Only consider CPUs that are usable for migration.
1146          * I guess we might want to change cpupri_find() to ignore those
1147          * in the first place.
1148          */
1149         cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
1150
1151         /*
1152          * At this point we have built a mask of cpus representing the
1153          * lowest priority tasks in the system.  Now we want to elect
1154          * the best one based on our affinity and topology.
1155          *
1156          * We prioritize the last cpu that the task executed on since
1157          * it is most likely cache-hot in that location.
1158          */
1159         if (cpumask_test_cpu(cpu, lowest_mask))
1160                 return cpu;
1161
1162         /*
1163          * Otherwise, we consult the sched_domains span maps to figure
1164          * out which cpu is logically closest to our hot cache data.
1165          */
1166         if (this_cpu == cpu)
1167                 this_cpu = -1; /* Skip this_cpu opt if the same */
1168
1169         for_each_domain(cpu, sd) {
1170                 if (sd->flags & SD_WAKE_AFFINE) {
1171                         cpumask_t domain_mask;
1172                         int       best_cpu;
1173
1174                         cpumask_and(&domain_mask, sched_domain_span(sd),
1175                                     lowest_mask);
1176
1177                         best_cpu = pick_optimal_cpu(this_cpu,
1178                                                     &domain_mask);
1179                         if (best_cpu != -1)
1180                                 return best_cpu;
1181                 }
1182         }
1183
1184         /*
1185          * And finally, if there were no matches within the domains
1186          * just give the caller *something* to work with from the compatible
1187          * locations.
1188          */
1189         return pick_optimal_cpu(this_cpu, lowest_mask);
1190 }
1191
1192 /* Will lock the rq it finds */
1193 static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1194 {
1195         struct rq *lowest_rq = NULL;
1196         int tries;
1197         int cpu;
1198
1199         for (tries = 0; tries < RT_MAX_TRIES; tries++) {
1200                 cpu = find_lowest_rq(task);
1201
1202                 if ((cpu == -1) || (cpu == rq->cpu))
1203                         break;
1204
1205                 lowest_rq = cpu_rq(cpu);
1206
1207                 /* if the prio of this runqueue changed, try again */
1208                 if (double_lock_balance(rq, lowest_rq)) {
1209                         /*
1210                          * We had to unlock the run queue. In
1211                          * the meantime, the task could have
1212                          * migrated already or had its affinity changed.
1213                          * Also make sure that it wasn't scheduled on its rq.
1214                          */
1215                         if (unlikely(task_rq(task) != rq ||
1216                                      !cpumask_test_cpu(lowest_rq->cpu,
1217                                                        &task->cpus_allowed) ||
1218                                      task_running(rq, task) ||
1219                                      !task->se.on_rq)) {
1220
1221                                 spin_unlock(&lowest_rq->lock);
1222                                 lowest_rq = NULL;
1223                                 break;
1224                         }
1225                 }
1226
1227                 /* If this rq is still suitable use it. */
1228                 if (lowest_rq->rt.highest_prio.curr > task->prio)
1229                         break;
1230
1231                 /* try again */
1232                 double_unlock_balance(rq, lowest_rq);
1233                 lowest_rq = NULL;
1234         }
1235
1236         return lowest_rq;
1237 }
1238
1239 static inline int has_pushable_tasks(struct rq *rq)
1240 {
1241         return !plist_head_empty(&rq->rt.pushable_tasks);
1242 }
1243
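/*
 * Return the highest-priority task on the pushable list, or NULL when
 * the list is empty.
 */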
1244 static struct task_struct *pick_next_pushable_task(struct rq *rq)
1245 {
1246         struct task_struct *p;
1247
1248         if (!has_pushable_tasks(rq))
1249                 return NULL;
1250
1251         p = plist_first_entry(&rq->rt.pushable_tasks,
1252                               struct task_struct, pushable_tasks);
1253
1254         BUG_ON(rq->cpu != task_cpu(p));
1255         BUG_ON(task_current(rq, p));
1256         BUG_ON(p->rt.nr_cpus_allowed <= 1);
1257
1258         BUG_ON(!p->se.on_rq);
1259         BUG_ON(!rt_task(p));
1260
1261         return p;
1262 }
1263
1264 /*
1265  * If the current CPU has more than one RT task, see if the non-
1266  * running task can migrate over to a CPU that is running a task
1267  * of lesser priority.
1268  */
1269 static int push_rt_task(struct rq *rq)
1270 {
1271         struct task_struct *next_task;
1272         struct rq *lowest_rq;
1273
1274         if (!rq->rt.overloaded)
1275                 return 0;
1276
1277         next_task = pick_next_pushable_task(rq);
1278         if (!next_task)
1279                 return 0;
1280
1281  retry:
1282         if (unlikely(next_task == rq->curr)) {
1283                 WARN_ON(1);
1284                 return 0;
1285         }
1286
1287         /*
1288          * It's possible that the next_task slipped in with a
1289          * higher priority than current. If that's the case,
1290          * just reschedule current.
1291          */
1292         if (unlikely(next_task->prio < rq->curr->prio)) {
1293                 resched_task(rq->curr);
1294                 return 0;
1295         }
1296
1297         /* We might release rq lock */
1298         get_task_struct(next_task);
1299
1300         /* find_lock_lowest_rq locks the rq if found */
1301         lowest_rq = find_lock_lowest_rq(next_task, rq);
1302         if (!lowest_rq) {
1303                 struct task_struct *task;
1304                 /*
1305                  * find_lock_lowest_rq releases rq->lock
1306                  * so it is possible that next_task has migrated.
1307                  *
1308                  * We need to make sure that the task is still on the same
1309                  * run-queue and is also still the next task eligible for
1310                  * pushing.
1311                  */
1312                 task = pick_next_pushable_task(rq);
1313                 if (task_cpu(next_task) == rq->cpu && task == next_task) {
1314                         /*
1315                          * If we get here, the task hasn't moved at all, but
1316                          * it has failed to push.  We will not try again,
1317                          * since the other cpus will pull from us when they
1318                          * are ready.
1319                          */
1320                         dequeue_pushable_task(rq, next_task);
1321                         goto out;
1322                 }
1323
1324                 if (!task)
1325                         /* No more tasks, just exit */
1326                         goto out;
1327
1328                 /*
1329                  * Something has shifted, try again.
1330                  */
1331                 put_task_struct(next_task);
1332                 next_task = task;
1333                 goto retry;
1334         }
1335
1336         deactivate_task(rq, next_task, 0);
1337         set_task_cpu(next_task, lowest_rq->cpu);
1338         activate_task(lowest_rq, next_task, 0);
1339
1340         resched_task(lowest_rq->curr);
1341
1342         double_unlock_balance(rq, lowest_rq);
1343
1344 out:
1345         put_task_struct(next_task);
1346
1347         return 1;
1348 }
1349
1350 static void push_rt_tasks(struct rq *rq)
1351 {
1352         /* push_rt_task will return true if it moved an RT */
1353         while (push_rt_task(rq))
1354                 ;
1355 }
1356
1357 static int pull_rt_task(struct rq *this_rq)
1358 {
1359         int this_cpu = this_rq->cpu, ret = 0, cpu;
1360         struct task_struct *p;
1361         struct rq *src_rq;
1362
1363         if (likely(!rt_overloaded(this_rq)))
1364                 return 0;
1365
1366         for_each_cpu(cpu, this_rq->rd->rto_mask) {
1367                 if (this_cpu == cpu)
1368                         continue;
1369
1370                 src_rq = cpu_rq(cpu);
1371
1372                 /*
1373                  * Don't bother taking the src_rq->lock if the next highest
1374                  * task is known to be lower-priority than our current task.
1375                  * This may look racy, but if this value is about to go
1376                  * logically higher, the src_rq will push this task away.
1377          * And if it's going logically lower, we do not care.
1378                  */
1379                 if (src_rq->rt.highest_prio.next >=
1380                     this_rq->rt.highest_prio.curr)
1381                         continue;
1382
1383                 /*
1384                  * We can potentially drop this_rq's lock in
1385                  * double_lock_balance, and another CPU could
1386                  * alter this_rq
1387                  */
1388                 double_lock_balance(this_rq, src_rq);
1389
1390                 /*
1391                  * Are there still pullable RT tasks?
1392                  */
1393                 if (src_rq->rt.rt_nr_running <= 1)
1394                         goto skip;
1395
1396                 p = pick_next_highest_task_rt(src_rq, this_cpu);
1397
1398                 /*
1399                  * Do we have an RT task that preempts
1400                  * the to-be-scheduled task?
1401                  */
1402                 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1403                         WARN_ON(p == src_rq->curr);
1404                         WARN_ON(!p->se.on_rq);
1405
1406                         /*
1407                          * There's a chance that p is higher in priority
1408                          * than what's currently running on its cpu.
1409                          * This is just because p is waking up and hasn't
1410                          * had a chance to schedule. We only pull
1411                          * p if it is lower in priority than the
1412                          * current task on the run queue
1413                          */
1414                         if (p->prio < src_rq->curr->prio)
1415                                 goto skip;
1416
1417                         ret = 1;
1418
1419                         deactivate_task(src_rq, p, 0);
1420                         set_task_cpu(p, this_cpu);
1421                         activate_task(this_rq, p, 0);
1422                         /*
1423                          * We continue with the search, just in
1424                          * case there's an even higher prio task
1425                          * in another runqueue. (low likelihood
1426                          * but possible)
1427                          */
1428                 }
1429  skip:
1430                 double_unlock_balance(this_rq, src_rq);
1431         }
1432
1433         return ret;
1434 }
1435
1436 static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1437 {
1438         /* Try to pull RT tasks here if we lower this rq's prio */
1439         if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)
1440                 pull_rt_task(rq);
1441 }
1442
1443 /*
1444  * assumes rq->lock is held
1445  */
1446 static int needs_post_schedule_rt(struct rq *rq)
1447 {
1448         return has_pushable_tasks(rq);
1449 }
1450
1451 static void post_schedule_rt(struct rq *rq)
1452 {
1453         /*
1454          * This is only called if needs_post_schedule_rt() indicates that
1455          * we need to push tasks away
1456          */
1457         spin_lock_irq(&rq->lock);
1458         push_rt_tasks(rq);
1459         spin_unlock_irq(&rq->lock);
1460 }
1461
1462 /*
1463  * If we are not running and we are not going to reschedule soon, we should
1464  * try to push tasks away now
1465  */
1466 static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
1467 {
1468         if (!task_running(rq, p) &&
1469             !test_tsk_need_resched(rq->curr) &&
1470             has_pushable_tasks(rq) &&
1471             p->rt.nr_cpus_allowed > 1)
1472                 push_rt_tasks(rq);
1473 }
1474
1475 static unsigned long
1476 load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1477                 unsigned long max_load_move,
1478                 struct sched_domain *sd, enum cpu_idle_type idle,
1479                 int *all_pinned, int *this_best_prio)
1480 {
1481         /* don't touch RT tasks */
1482         return 0;
1483 }
1484
1485 static int
1486 move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1487                  struct sched_domain *sd, enum cpu_idle_type idle)
1488 {
1489         /* don't touch RT tasks */
1490         return 0;
1491 }
1492
1493 static void set_cpus_allowed_rt(struct task_struct *p,
1494                                 const struct cpumask *new_mask)
1495 {
1496         int weight = cpumask_weight(new_mask);
1497
1498         BUG_ON(!rt_task(p));
1499
1500         /*
1501          * Update the migration status of the RQ if we have an RT task
1502          * which is running AND changing its weight value.
1503          */
1504         if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
1505                 struct rq *rq = task_rq(p);
1506
1507                 if (!task_current(rq, p)) {
1508                         /*
1509                          * Make sure we dequeue this task from the pushable list
1510                          * before going further.  It will either remain off of
1511                          * the list because we are no longer pushable, or it
1512                          * will be requeued.
1513                          */
1514                         if (p->rt.nr_cpus_allowed > 1)
1515                                 dequeue_pushable_task(rq, p);
1516
1517                         /*
1518                          * Requeue if our weight is changing and still > 1
1519                          */
1520                         if (weight > 1)
1521                                 enqueue_pushable_task(rq, p);
1522
1523                 }
1524
1525                 if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
1526                         rq->rt.rt_nr_migratory++;
1527                 } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
1528                         BUG_ON(!rq->rt.rt_nr_migratory);
1529                         rq->rt.rt_nr_migratory--;
1530                 }
1531
1532                 update_rt_migration(&rq->rt);
1533         }
1534
1535         cpumask_copy(&p->cpus_allowed, new_mask);
1536         p->rt.nr_cpus_allowed = weight;
1537 }
1538
1539 /* Assumes rq->lock is held */
1540 static void rq_online_rt(struct rq *rq)
1541 {
1542         if (rq->rt.overloaded)
1543                 rt_set_overload(rq);
1544
1545         __enable_runtime(rq);
1546
1547         cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
1548 }
1549
1550 /* Assumes rq->lock is held */
1551 static void rq_offline_rt(struct rq *rq)
1552 {
1553         if (rq->rt.overloaded)
1554                 rt_clear_overload(rq);
1555
1556         __disable_runtime(rq);
1557
1558         cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
1559 }
1560
1561 /*
1562  * When switching from the rt queue, we bring ourselves to a position
1563  * where we might want to pull RT tasks from other runqueues.
1564  */
1565 static void switched_from_rt(struct rq *rq, struct task_struct *p,
1566                            int running)
1567 {
1568         /*
1569          * If there are other RT tasks then we will reschedule
1570          * and the scheduling of the other RT tasks will handle
1571          * the balancing. But if we are the last RT task
1572          * we may need to handle the pulling of RT tasks
1573          * now.
1574          */
1575         if (!rq->rt.rt_nr_running)
1576                 pull_rt_task(rq);
1577 }
1578
1579 static inline void init_sched_rt_class(void)
1580 {
1581         unsigned int i;
1582
1583         for_each_possible_cpu(i)
1584                 alloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
1585                                         GFP_KERNEL, cpu_to_node(i));
1586 }
1587 #endif /* CONFIG_SMP */
1588
1589 /*
1590  * When switching a task to RT, we may overload the runqueue
1591  * with RT tasks. In this case we try to push them off to
1592  * other runqueues.
1593  */
1594 static void switched_to_rt(struct rq *rq, struct task_struct *p,
1595                            int running)
1596 {
1597         int check_resched = 1;
1598
1599         /*
1600          * If we are already running, then there's nothing
1601          * that needs to be done. But if we are not running
1602          * we may need to preempt the current running task.
1603          * If that current running task is also an RT task
1604          * then see if we can move to another run queue.
1605          */
1606         if (!running) {
1607 #ifdef CONFIG_SMP
1608                 if (rq->rt.overloaded && push_rt_task(rq) &&
1609                     /* Don't resched if we changed runqueues */
1610                     rq != task_rq(p))
1611                         check_resched = 0;
1612 #endif /* CONFIG_SMP */
1613                 if (check_resched && p->prio < rq->curr->prio)
1614                         resched_task(rq->curr);
1615         }
1616 }
1617
1618 /*
1619  * Priority of the task has changed. This may cause
1620  * us to initiate a push or pull.
1621  */
1622 static void prio_changed_rt(struct rq *rq, struct task_struct *p,
1623                             int oldprio, int running)
1624 {
1625         if (running) {
1626 #ifdef CONFIG_SMP
1627                 /*
1628                  * If our priority decreases while running, we
1629                  * may need to pull tasks to this runqueue.
1630                  */
1631                 if (oldprio < p->prio)
1632                         pull_rt_task(rq);
1633                 /*
1634                  * If there's a higher priority task waiting to run
1635                  * then reschedule. Note, the above pull_rt_task
1636                  * can release the rq lock and p could migrate.
1637                  * Only reschedule if p is still on the same runqueue.
1638                  */
1639                 if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
1640                         resched_task(p);
1641 #else
1642                 /* For UP simply resched on drop of prio */
1643                 if (oldprio < p->prio)
1644                         resched_task(p);
1645 #endif /* CONFIG_SMP */
1646         } else {
1647                 /*
1648                  * This task is not running, but if its priority is
1649                  * higher than that of the current running task,
1650                  * then reschedule.
1651                  */
1652                 if (p->prio < rq->curr->prio)
1653                         resched_task(rq->curr);
1654         }
1655 }
1656
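/*
 * RLIMIT_RTTIME watchdog: count the ticks this RT task has been running
 * and, once the soft limit is reached, update cputime_expires so the
 * posix CPU timer code notices and enforces the limit.
 */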
1657 static void watchdog(struct rq *rq, struct task_struct *p)
1658 {
1659         unsigned long soft, hard;
1660
1661         if (!p->signal)
1662                 return;
1663
1664         soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur;
1665         hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max;
1666
1667         if (soft != RLIM_INFINITY) {
1668                 unsigned long next;
1669
1670                 p->rt.timeout++;
1671                 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
1672                 if (p->rt.timeout > next)
1673                         p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
1674         }
1675 }
1676
1677 static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1678 {
1679         update_curr_rt(rq);
1680
1681         watchdog(rq, p);
1682
1683         /*
1684          * RR tasks need a special form of timeslice management.
1685          * FIFO tasks have no timeslices.
1686          */
1687         if (p->policy != SCHED_RR)
1688                 return;
1689
1690         if (--p->rt.time_slice)
1691                 return;
1692
1693         p->rt.time_slice = DEF_TIMESLICE;
1694
1695         /*
1696          * Requeue to the end of the queue if we are not the only element
1697          * on the queue:
1698          */
1699         if (p->rt.run_list.prev != p->rt.run_list.next) {
1700                 requeue_task_rt(rq, p, 0);
1701                 set_tsk_need_resched(p);
1702         }
1703 }
1704
1705 static void set_curr_task_rt(struct rq *rq)
1706 {
1707         struct task_struct *p = rq->curr;
1708
1709         p->se.exec_start = rq->clock;
1710
1711         /* The running task is never eligible for pushing */
1712         dequeue_pushable_task(rq, p);
1713 }
1714
1715 static const struct sched_class rt_sched_class = {
1716         .next                   = &fair_sched_class,
1717         .enqueue_task           = enqueue_task_rt,
1718         .dequeue_task           = dequeue_task_rt,
1719         .yield_task             = yield_task_rt,
1720
1721         .check_preempt_curr     = check_preempt_curr_rt,
1722
1723         .pick_next_task         = pick_next_task_rt,
1724         .put_prev_task          = put_prev_task_rt,
1725
1726 #ifdef CONFIG_SMP
1727         .select_task_rq         = select_task_rq_rt,
1728
1729         .load_balance           = load_balance_rt,
1730         .move_one_task          = move_one_task_rt,
1731         .set_cpus_allowed       = set_cpus_allowed_rt,
1732         .rq_online              = rq_online_rt,
1733         .rq_offline             = rq_offline_rt,
1734         .pre_schedule           = pre_schedule_rt,
1735         .needs_post_schedule    = needs_post_schedule_rt,
1736         .post_schedule          = post_schedule_rt,
1737         .task_wake_up           = task_wake_up_rt,
1738         .switched_from          = switched_from_rt,
1739 #endif
1740
1741         .set_curr_task          = set_curr_task_rt,
1742         .task_tick              = task_tick_rt,
1743
1744         .prio_changed           = prio_changed_rt,
1745         .switched_to            = switched_to_rt,
1746 };
1747
1748 #ifdef CONFIG_SCHED_DEBUG
1749 extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1750
1751 static void print_rt_stats(struct seq_file *m, int cpu)
1752 {
1753         struct rt_rq *rt_rq;
1754
1755         rcu_read_lock();
1756         for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu))
1757                 print_rt_rq(m, cpu, rt_rq);
1758         rcu_read_unlock();
1759 }
1760 #endif /* CONFIG_SCHED_DEBUG */
1761