X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=kernel%2Fsched.c;h=6c10fa796ca040ef8da1f1e65acd58e4d7644faa;hb=e86908614f2c7fec401827e5cefd7a6ea9407f85;hp=26795adab3ad9a97538b5f3c39c2bc9acb6387a8;hpb=f2ac58ee617fd9f6cd9922fbcd291b661d7c9954;p=safe%2Fjmp%2Flinux-2.6 diff --git a/kernel/sched.c b/kernel/sched.c index 26795ad..6c10fa7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -16,13 +16,19 @@ * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. * 2004-04-02 Scheduler domains code by Nick Piggin + * 2007-04-15 Work begun on replacing all interactivity tuning with a + * fair scheduling design by Con Kolivas. + * 2007-05-05 Load balancing (smp-nice) and other improvements + * by Peter Williams + * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith + * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri */ #include #include #include #include -#include +#include #include #include #include @@ -47,15 +53,17 @@ #include #include #include +#include #include #include #include #include #include #include +#include +#include #include -#include /* * Scheduler clock - returns current time in nanosec units. @@ -103,87 +111,6 @@ unsigned long long __attribute__((weak)) sched_clock(void) */ #define MIN_TIMESLICE max(5 * HZ / 1000, 1) #define DEF_TIMESLICE (100 * HZ / 1000) -#define ON_RUNQUEUE_WEIGHT 30 -#define CHILD_PENALTY 95 -#define PARENT_PENALTY 100 -#define EXIT_WEIGHT 3 -#define PRIO_BONUS_RATIO 25 -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) -#define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) -#define STARVATION_LIMIT (MAX_SLEEP_AVG) -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) - -/* - * If a task is 'interactive' then we reinsert it in the active - * array after it has expired its current timeslice. (it will not - * continue to run immediately, it will still roundrobin with - * other interactive tasks.) - * - * This part scales the interactivity limit depending on niceness. - * - * We scale it linearly, offset by the INTERACTIVE_DELTA delta. - * Here are a few examples of different nice levels: - * - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] - * - * (the X axis represents the possible -5 ... 0 ... +5 dynamic - * priority range a task can explore, a value of '1' means the - * task is rated interactive.) - * - * Ie. nice +19 tasks can never get 'interactive' enough to be - * reinserted into the active array. And only heavily CPU-hog nice -20 - * tasks will be expired. Default nice 0 tasks are somewhere between, - * it takes some effort for them to get interactive, but it's not - * too hard. - */ - -#define CURRENT_BONUS(p) \ - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ - MAX_SLEEP_AVG) - -#define GRANULARITY (10 * HZ / 1000 ? : 1) - -#ifdef CONFIG_SMP -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ - num_online_cpus()) -#else -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) -#endif - -#define SCALE(v1,v1_max,v2_max) \ - (v1) * (v2_max) / (v1_max) - -#define DELTA(p) \ - (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ - INTERACTIVE_DELTA) - -#define TASK_INTERACTIVE(p) \ - ((p)->prio <= (p)->static_prio - DELTA(p)) - -#define INTERACTIVE_SLEEP(p) \ - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) - -#define TASK_PREEMPTS_CURR(p, rq) \ - ((p)->prio < (rq)->curr->prio) - -#define SCALE_PRIO(x, prio) \ - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) - -static unsigned int static_prio_timeslice(int static_prio) -{ - if (static_prio < NICE_TO_PRIO(0)) - return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); - else - return SCALE_PRIO(DEF_TIMESLICE, static_prio); -} #ifdef CONFIG_SMP /* @@ -206,18 +133,22 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) } #endif +#define SCALE_PRIO(x, prio) \ + max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) + /* - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] + * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] * to time slice values: [800ms ... 100ms ... 5ms] - * - * The higher a thread's priority, the bigger timeslices - * it gets during one round of execution. But even the lowest - * priority thread gets MIN_TIMESLICE worth of execution time. */ - -static inline unsigned int task_timeslice(struct task_struct *p) +static unsigned int static_prio_timeslice(int static_prio) { - return static_prio_timeslice(p->static_prio); + if (static_prio == NICE_TO_PRIO(19)) + return 1; + + if (static_prio < NICE_TO_PRIO(0)) + return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); + else + return SCALE_PRIO(DEF_TIMESLICE, static_prio); } static inline int rt_policy(int policy) @@ -286,15 +217,6 @@ struct rt_rq { }; /* - * The prio-array type of the old scheduler: - */ -struct prio_array { - unsigned int nr_active; - DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ - struct list_head queue[MAX_PRIO]; -}; - -/* * This is the main, per-CPU runqueue data structure. * * Locking rule: those places that want to lock multiple runqueues @@ -309,7 +231,6 @@ struct rq { * remote CPUs use both these fields when doing load calculation. */ unsigned long nr_running; - unsigned long raw_weighted_load; #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; unsigned char idle_at_tick; @@ -334,23 +255,17 @@ struct rq { */ unsigned long nr_uninterruptible; - unsigned long expired_timestamp; - unsigned long long most_recent_timestamp; - struct task_struct *curr, *idle; unsigned long next_balance; struct mm_struct *prev_mm; - struct prio_array *active, *expired, arrays[2]; - int best_expired_prio; - u64 clock, prev_clock_raw; s64 clock_max_delta; unsigned int clock_warps, clock_overflows; - unsigned int clock_unstable_events; - - struct sched_class *load_balance_class; + u64 idle_clock; + unsigned int clock_deep_idle_events; + u64 tick_timestamp; atomic_t nr_iowait; @@ -388,9 +303,14 @@ struct rq { struct lock_class_key rq_lock_key; }; -static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); static DEFINE_MUTEX(sched_hotcpu_mutex); +static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) +{ + rq->curr->sched_class->check_preempt_curr(rq, p); +} + static inline int cpu_of(struct rq *rq) { #ifdef CONFIG_SMP @@ -401,15 +321,19 @@ static inline int cpu_of(struct rq *rq) } /* - * Per-runqueue clock, as finegrained as the platform can give us: + * Update the per-runqueue clock, as finegrained as the platform can give + * us, but without assuming monotonicity, etc.: */ -static unsigned long long __rq_clock(struct rq *rq) +static void __update_rq_clock(struct rq *rq) { u64 prev_raw = rq->prev_clock_raw; u64 now = sched_clock(); s64 delta = now - prev_raw; u64 clock = rq->clock; +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); +#endif /* * Protect against sched_clock() occasionally going backwards: */ @@ -420,8 +344,11 @@ static unsigned long long __rq_clock(struct rq *rq) /* * Catch too large forward jumps too: */ - if (unlikely(delta > 2*TICK_NSEC)) { - clock++; + if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) { + if (clock < rq->tick_timestamp + TICK_NSEC) + clock = rq->tick_timestamp + TICK_NSEC; + else + clock++; rq->clock_overflows++; } else { if (unlikely(delta > rq->clock_max_delta)) @@ -432,18 +359,12 @@ static unsigned long long __rq_clock(struct rq *rq) rq->prev_clock_raw = now; rq->clock = clock; - - return clock; } -static inline unsigned long long rq_clock(struct rq *rq) +static void update_rq_clock(struct rq *rq) { - int this_cpu = smp_processor_id(); - - if (this_cpu == cpu_of(rq)) - return __rq_clock(rq); - - return rq->clock; + if (likely(smp_processor_id() == cpu_of(rq))) + __update_rq_clock(rq); } /* @@ -461,6 +382,25 @@ static inline unsigned long long rq_clock(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +/* + * For kernel-internal use: high-speed (but slightly incorrect) per-cpu + * clock constructed from sched_clock(): + */ +unsigned long long cpu_clock(int cpu) +{ + unsigned long long now; + unsigned long flags; + struct rq *rq; + + local_irq_save(flags); + rq = cpu_rq(cpu); + update_rq_clock(rq); + now = rq->clock; + local_irq_restore(flags); + + return now; +} + #ifdef CONFIG_FAIR_GROUP_SCHED /* Change a task's ->cfs_rq if it moves across CPUs */ static inline void set_task_cfs_rq(struct task_struct *p) @@ -618,6 +558,42 @@ static inline struct rq *this_rq_lock(void) } /* + * We are going deep-idle (irqs are disabled): + */ +void sched_clock_idle_sleep_event(void) +{ + struct rq *rq = cpu_rq(smp_processor_id()); + + spin_lock(&rq->lock); + __update_rq_clock(rq); + spin_unlock(&rq->lock); + rq->clock_deep_idle_events++; +} +EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); + +/* + * We just idled delta nanoseconds (called with irqs disabled): + */ +void sched_clock_idle_wakeup_event(u64 delta_ns) +{ + struct rq *rq = cpu_rq(smp_processor_id()); + u64 now = sched_clock(); + + rq->idle_clock += delta_ns; + /* + * Override the previous timestamp and ignore all + * sched_clock() deltas that occured while we idled, + * and use the PM-provided delta_ns to advance the + * rq clock: + */ + spin_lock(&rq->lock); + rq->prev_clock_raw = now; + rq->clock += delta_ns; + spin_unlock(&rq->lock); +} +EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); + +/* * resched_task - mark a task 'to be rescheduled now'. * * On UP this means the setting of the need_resched flag, on SMP it @@ -669,8 +645,6 @@ static inline void resched_task(struct task_struct *p) } #endif -#include "sched_stats.h" - static u64 div64_likely32(u64 divident, unsigned long divisor) { #if BITS_PER_LONG == 32 @@ -692,27 +666,31 @@ static u64 div64_likely32(u64 divident, unsigned long divisor) #define WMULT_SHIFT 32 -static inline unsigned long +/* + * Shift right and round: + */ +#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) + +static unsigned long calc_delta_mine(unsigned long delta_exec, unsigned long weight, struct load_weight *lw) { u64 tmp; if (unlikely(!lw->inv_weight)) - lw->inv_weight = WMULT_CONST / lw->weight; + lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1; tmp = (u64)delta_exec * weight; /* * Check whether we'd overflow the 64-bit multiplication: */ - if (unlikely(tmp > WMULT_CONST)) { - tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight) - >> (WMULT_SHIFT/2); - } else { - tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT; - } + if (unlikely(tmp > WMULT_CONST)) + tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, + WMULT_SHIFT/2); + else + tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); - return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit); + return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); } static inline unsigned long @@ -733,6 +711,88 @@ static void update_load_sub(struct load_weight *lw, unsigned long dec) lw->inv_weight = 0; } +/* + * To aid in avoiding the subversion of "niceness" due to uneven distribution + * of tasks with abnormal "nice" values across CPUs the contribution that + * each task makes to its run queue's load is weighted according to its + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a + * scaled version of the new time slice allocation that they receive on time + * slice expiry etc. + */ + +#define WEIGHT_IDLEPRIO 2 +#define WMULT_IDLEPRIO (1 << 31) + +/* + * Nice levels are multiplicative, with a gentle 10% change for every + * nice level changed. I.e. when a CPU-bound task goes from nice 0 to + * nice 1, it will get ~10% less CPU time than another CPU-bound task + * that remained on nice 0. + * + * The "10% effect" is relative and cumulative: from _any_ nice level, + * if you go up 1 level, it's -10% CPU usage, if you go down 1 level + * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. + * If a task goes up by ~10% and another task goes down by ~10% then + * the relative distance between them is ~25%.) + */ +static const int prio_to_weight[40] = { + /* -20 */ 88761, 71755, 56483, 46273, 36291, + /* -15 */ 29154, 23254, 18705, 14949, 11916, + /* -10 */ 9548, 7620, 6100, 4904, 3906, + /* -5 */ 3121, 2501, 1991, 1586, 1277, + /* 0 */ 1024, 820, 655, 526, 423, + /* 5 */ 335, 272, 215, 172, 137, + /* 10 */ 110, 87, 70, 56, 45, + /* 15 */ 36, 29, 23, 18, 15, +}; + +/* + * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. + * + * In cases where the weight does not change often, we can use the + * precalculated inverse to speed up arithmetics by turning divisions + * into multiplications: + */ +static const u32 prio_to_wmult[40] = { + /* -20 */ 48388, 59856, 76040, 92818, 118348, + /* -15 */ 147320, 184698, 229616, 287308, 360437, + /* -10 */ 449829, 563644, 704093, 875809, 1099582, + /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, + /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, + /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, + /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, + /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, +}; + +static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); + +/* + * runqueue iterator, to support SMP load-balancing between different + * scheduling classes, without having to expose their internal data + * structures to the load-balancing proper: + */ +struct rq_iterator { + void *arg; + struct task_struct *(*start)(void *); + struct task_struct *(*next)(void *); +}; + +static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_nr_move, unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, unsigned long *load_moved, + int *this_best_prio, struct rq_iterator *iterator); + +#include "sched_stats.h" +#include "sched_rt.c" +#include "sched_fair.c" +#include "sched_idletask.c" +#ifdef CONFIG_SCHED_DEBUG +# include "sched_debug.c" +#endif + +#define sched_class_highest (&rt_sched_class) + static void __update_curr_load(struct rq *rq, struct load_stat *ls) { if (rq->curr != rq->idle && ls->load.weight) { @@ -757,14 +817,14 @@ static void __update_curr_load(struct rq *rq, struct load_stat *ls) * This function is called /before/ updating rq->ls.load * and when switching tasks. */ -static void update_curr_load(struct rq *rq, u64 now) +static void update_curr_load(struct rq *rq) { struct load_stat *ls = &rq->ls; u64 start; start = ls->load_update_start; - ls->load_update_start = now; - ls->delta_stat += now - start; + ls->load_update_start = rq->clock; + ls->delta_stat += rq->clock - start; /* * Stagger updates to ls->delta_fair. Very frequent updates * can be expensive. @@ -773,135 +833,72 @@ static void update_curr_load(struct rq *rq, u64 now) __update_curr_load(rq, ls); } -/* - * To aid in avoiding the subversion of "niceness" due to uneven distribution - * of tasks with abnormal "nice" values across CPUs the contribution that - * each task makes to its run queue's load is weighted according to its - * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a - * scaled version of the new time slice allocation that they receive on time - * slice expiry etc. - */ - -/* - * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE - * If static_prio_timeslice() is ever changed to break this assumption then - * this code will need modification - */ -#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE -#define LOAD_WEIGHT(lp) \ - (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) -#define PRIO_TO_LOAD_WEIGHT(prio) \ - LOAD_WEIGHT(static_prio_timeslice(prio)) -#define RTPRIO_TO_LOAD_WEIGHT(rp) \ - (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) - -static inline void -inc_raw_weighted_load(struct rq *rq, const struct task_struct *p) +static inline void inc_load(struct rq *rq, const struct task_struct *p) { - rq->raw_weighted_load += p->load_weight; + update_curr_load(rq); + update_load_add(&rq->ls.load, p->se.load.weight); } -static inline void -dec_raw_weighted_load(struct rq *rq, const struct task_struct *p) +static inline void dec_load(struct rq *rq, const struct task_struct *p) { - rq->raw_weighted_load -= p->load_weight; + update_curr_load(rq); + update_load_sub(&rq->ls.load, p->se.load.weight); } -static inline void inc_nr_running(struct task_struct *p, struct rq *rq) +static void inc_nr_running(struct task_struct *p, struct rq *rq) { rq->nr_running++; - inc_raw_weighted_load(rq, p); + inc_load(rq, p); } -static inline void dec_nr_running(struct task_struct *p, struct rq *rq) +static void dec_nr_running(struct task_struct *p, struct rq *rq) { rq->nr_running--; - dec_raw_weighted_load(rq, p); + dec_load(rq, p); } static void set_load_weight(struct task_struct *p) { + p->se.wait_runtime = 0; + if (task_has_rt_policy(p)) { -#ifdef CONFIG_SMP - if (p == task_rq(p)->migration_thread) - /* - * The migration thread does the actual balancing. - * Giving its load any weight will skew balancing - * adversely. - */ - p->load_weight = 0; - else -#endif - p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); - } else - p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); -} + p->se.load.weight = prio_to_weight[0] * 2; + p->se.load.inv_weight = prio_to_wmult[0] >> 1; + return; + } -/* - * Adding/removing a task to/from a priority array: - */ -static void dequeue_task(struct task_struct *p, struct prio_array *array) -{ - array->nr_active--; - list_del(&p->run_list); - if (list_empty(array->queue + p->prio)) - __clear_bit(p->prio, array->bitmap); -} + /* + * SCHED_IDLE tasks get minimal weight: + */ + if (p->policy == SCHED_IDLE) { + p->se.load.weight = WEIGHT_IDLEPRIO; + p->se.load.inv_weight = WMULT_IDLEPRIO; + return; + } -static void enqueue_task(struct task_struct *p, struct prio_array *array) -{ - sched_info_queued(p); - list_add_tail(&p->run_list, array->queue + p->prio); - __set_bit(p->prio, array->bitmap); - array->nr_active++; - p->array = array; + p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; + p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; } -/* - * Put task to the end of the run list without the overhead of dequeue - * followed by enqueue. - */ -static void requeue_task(struct task_struct *p, struct prio_array *array) +static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) { - list_move_tail(&p->run_list, array->queue + p->prio); + sched_info_queued(p); + p->sched_class->enqueue_task(rq, p, wakeup); + p->se.on_rq = 1; } -static inline void -enqueue_task_head(struct task_struct *p, struct prio_array *array) +static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) { - list_add(&p->run_list, array->queue + p->prio); - __set_bit(p->prio, array->bitmap); - array->nr_active++; - p->array = array; + p->sched_class->dequeue_task(rq, p, sleep); + p->se.on_rq = 0; } /* - * __normal_prio - return the priority that is based on the static - * priority but is modified by bonuses/penalties. - * - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] - * into the -5 ... 0 ... +5 bonus/penalty range. - * - * We use 25% of the full 0...39 priority range so that: - * - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. - * - * Both properties are important to certain workloads. + * __normal_prio - return the priority that is based on the static prio */ - static inline int __normal_prio(struct task_struct *p) { - int bonus, prio; - - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; - - prio = p->static_prio - bonus; - if (prio < MAX_RT_PRIO) - prio = MAX_RT_PRIO; - if (prio > MAX_PRIO-1) - prio = MAX_PRIO-1; - return prio; + return p->static_prio; } /* @@ -943,120 +940,41 @@ static int effective_prio(struct task_struct *p) } /* - * __activate_task - move a task to the runqueue. + * activate_task - move a task to the runqueue. */ -static void __activate_task(struct task_struct *p, struct rq *rq) +static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) { - struct prio_array *target = rq->active; - - if (batch_task(p)) - target = rq->expired; - enqueue_task(p, target); - inc_nr_running(p, rq); -} + if (p->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible--; -/* - * __activate_idle_task - move idle task to the _front_ of runqueue. - */ -static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) -{ - enqueue_task_head(p, rq->active); + enqueue_task(rq, p, wakeup); inc_nr_running(p, rq); } /* - * Recalculate p->normal_prio and p->prio after having slept, - * updating the sleep-average too: - */ -static int recalc_task_prio(struct task_struct *p, unsigned long long now) -{ - /* Caller must always ensure 'now >= p->timestamp' */ - unsigned long sleep_time = now - p->timestamp; - - if (batch_task(p)) - sleep_time = 0; - - if (likely(sleep_time > 0)) { - /* - * This ceiling is set to the lowest priority that would allow - * a task to be reinserted into the active array on timeslice - * completion. - */ - unsigned long ceiling = INTERACTIVE_SLEEP(p); - - if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { - /* - * Prevents user tasks from achieving best priority - * with one single large enough sleep. - */ - p->sleep_avg = ceiling; - } else { - /* - * This code gives a bonus to interactive tasks. - * - * The boost works by updating the 'average sleep time' - * value here, based on ->timestamp. The more time a - * task spends sleeping, the higher the average gets - - * and the higher the priority boost gets as well. - */ - p->sleep_avg += sleep_time; - - } - if (p->sleep_avg > NS_MAX_SLEEP_AVG) - p->sleep_avg = NS_MAX_SLEEP_AVG; - } - - return effective_prio(p); -} - -/* - * activate_task - move a task to the runqueue and do priority recalculation - * - * Update all the scheduling statistics stuff. (sleep average - * calculation, priority modifiers, etc.) + * activate_idle_task - move idle task to the _front_ of runqueue. */ -static void activate_task(struct task_struct *p, struct rq *rq, int local) +static inline void activate_idle_task(struct task_struct *p, struct rq *rq) { - unsigned long long now; - - if (rt_task(p)) - goto out; + update_rq_clock(rq); - now = sched_clock(); -#ifdef CONFIG_SMP - if (!local) { - /* Compensate for drifting sched_clock */ - struct rq *this_rq = this_rq(); - now = (now - this_rq->most_recent_timestamp) - + rq->most_recent_timestamp; - } -#endif - - /* - * Sleep time is in units of nanosecs, so shift by 20 to get a - * milliseconds-range estimation of the amount of time that the task - * spent sleeping: - */ - if (unlikely(prof_on == SLEEP_PROFILING)) { - if (p->state == TASK_UNINTERRUPTIBLE) - profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), - (now - p->timestamp) >> 20); - } + if (p->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible--; - p->prio = recalc_task_prio(p, now); - p->timestamp = now; -out: - __activate_task(p, rq); + enqueue_task(rq, p, 0); + inc_nr_running(p, rq); } /* * deactivate_task - remove a task from the runqueue. */ -static void deactivate_task(struct task_struct *p, struct rq *rq) +static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) { + if (p->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible++; + + dequeue_task(rq, p, sleep); dec_nr_running(p, rq); - dequeue_task(p, p->array); - p->array = NULL; } /** @@ -1071,14 +989,43 @@ inline int task_curr(const struct task_struct *p) /* Used instead of source_load when we know the type == 0 */ unsigned long weighted_cpuload(const int cpu) { - return cpu_rq(cpu)->raw_weighted_load; + return cpu_rq(cpu)->ls.load.weight; +} + +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) +{ +#ifdef CONFIG_SMP + task_thread_info(p)->cpu = cpu; + set_task_cfs_rq(p); +#endif } #ifdef CONFIG_SMP -void set_task_cpu(struct task_struct *p, unsigned int cpu) +void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { - task_thread_info(p)->cpu = cpu; + int old_cpu = task_cpu(p); + struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); + u64 clock_offset, fair_clock_offset; + + clock_offset = old_rq->clock - new_rq->clock; + fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock; + + if (p->se.wait_start_fair) + p->se.wait_start_fair -= fair_clock_offset; + if (p->se.sleep_start_fair) + p->se.sleep_start_fair -= fair_clock_offset; + +#ifdef CONFIG_SCHEDSTATS + if (p->se.wait_start) + p->se.wait_start -= clock_offset; + if (p->se.sleep_start) + p->se.sleep_start -= clock_offset; + if (p->se.block_start) + p->se.block_start -= clock_offset; +#endif + + __set_task_cpu(p, new_cpu); } struct migration_req { @@ -1103,7 +1050,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. */ - if (!p->array && !task_running(rq, p)) { + if (!p->se.on_rq && !task_running(rq, p)) { set_task_cpu(p, dest_cpu); return 0; } @@ -1128,9 +1075,8 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) void wait_task_inactive(struct task_struct *p) { unsigned long flags; + int running, on_rq; struct rq *rq; - struct prio_array *array; - int running; repeat: /* @@ -1162,7 +1108,7 @@ repeat: */ rq = task_rq_lock(p, &flags); running = task_running(rq, p); - array = p->array; + on_rq = p->se.on_rq; task_rq_unlock(rq, &flags); /* @@ -1185,7 +1131,7 @@ repeat: * running right now), it's preempted, and we should * yield - it could be a while. */ - if (unlikely(array)) { + if (unlikely(on_rq)) { yield(); goto repeat; } @@ -1231,11 +1177,12 @@ void kick_process(struct task_struct *p) static inline unsigned long source_load(int cpu, int type) { struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); if (type == 0) - return rq->raw_weighted_load; + return total; - return min(rq->cpu_load[type-1], rq->raw_weighted_load); + return min(rq->cpu_load[type-1], total); } /* @@ -1245,11 +1192,12 @@ static inline unsigned long source_load(int cpu, int type) static inline unsigned long target_load(int cpu, int type) { struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); if (type == 0) - return rq->raw_weighted_load; + return total; - return max(rq->cpu_load[type-1], rq->raw_weighted_load); + return max(rq->cpu_load[type-1], total); } /* @@ -1258,9 +1206,10 @@ static inline unsigned long target_load(int cpu, int type) static inline unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); unsigned long n = rq->nr_running; - return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; + return n ? total / n : SCHED_LOAD_SCALE; } /* @@ -1362,9 +1311,9 @@ static int sched_balance_self(int cpu, int flag) struct sched_domain *tmp, *sd = NULL; for_each_domain(cpu, tmp) { - /* - * If power savings logic is enabled for a domain, stop there. - */ + /* + * If power savings logic is enabled for a domain, stop there. + */ if (tmp->flags & SD_POWERSAVINGS_BALANCE) break; if (tmp->flags & flag) @@ -1447,9 +1396,9 @@ static int wake_idle(int cpu, struct task_struct *p) if (idle_cpu(i)) return i; } - } - else + } else { break; + } } return cpu; } @@ -1491,7 +1440,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (!(old_state & state)) goto out; - if (p->array) + if (p->se.on_rq) goto out_running; cpu = task_cpu(p); @@ -1546,11 +1495,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) * of the current CPU: */ if (sync) - tl -= current->load_weight; + tl -= current->se.load.weight; if ((tl <= load && tl + target_load(cpu, idx) <= tl_per_task) || - 100*(tl + p->load_weight) <= imbalance*load) { + 100*(tl + p->se.load.weight) <= imbalance*load) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and @@ -1584,7 +1533,7 @@ out_set_cpu: old_state = p->state; if (!(old_state & state)) goto out; - if (p->array) + if (p->se.on_rq) goto out_running; this_cpu = smp_processor_id(); @@ -1593,10 +1542,8 @@ out_set_cpu: out_activate: #endif /* CONFIG_SMP */ - if (old_state == TASK_UNINTERRUPTIBLE) - rq->nr_uninterruptible--; - - activate_task(p, rq, cpu == this_cpu); + update_rq_clock(rq); + activate_task(rq, p, 1); /* * Sync wakeups (i.e. those types of wakeups where the waker * has indicated that it will leave the CPU in short order) @@ -1605,10 +1552,8 @@ out_activate: * the waker guarantees that the freshly woken up task is going * to be considered on this CPU.) */ - if (!sync || cpu != this_cpu) { - if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); - } + if (!sync || cpu != this_cpu) + check_preempt_curr(rq, p); success = 1; out_running: @@ -1631,19 +1576,44 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state) return try_to_wake_up(p, state, 0); } -static void task_running_tick(struct rq *rq, struct task_struct *p); /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. + * + * __sched_fork() is basic setup used by init_idle() too: */ -void fastcall sched_fork(struct task_struct *p, int clone_flags) +static void __sched_fork(struct task_struct *p) { - int cpu = get_cpu(); + p->se.wait_start_fair = 0; + p->se.exec_start = 0; + p->se.sum_exec_runtime = 0; + p->se.prev_sum_exec_runtime = 0; + p->se.delta_exec = 0; + p->se.delta_fair_run = 0; + p->se.delta_fair_sleep = 0; + p->se.wait_runtime = 0; + p->se.sleep_start_fair = 0; -#ifdef CONFIG_SMP - cpu = sched_balance_self(cpu, SD_BALANCE_FORK); +#ifdef CONFIG_SCHEDSTATS + p->se.wait_start = 0; + p->se.sum_wait_runtime = 0; + p->se.sum_sleep_runtime = 0; + p->se.sleep_start = 0; + p->se.block_start = 0; + p->se.sleep_max = 0; + p->se.block_max = 0; + p->se.exec_max = 0; + p->se.wait_max = 0; + p->se.wait_runtime_overruns = 0; + p->se.wait_runtime_underruns = 0; +#endif + + INIT_LIST_HEAD(&p->run_list); + p->se.on_rq = 0; + +#ifdef CONFIG_PREEMPT_NOTIFIERS + INIT_HLIST_HEAD(&p->preempt_notifiers); #endif - set_task_cpu(p, cpu); /* * We mark the process as running here, but have not actually @@ -1652,16 +1622,29 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags) * event cannot wake it up and insert it on the runqueue either. */ p->state = TASK_RUNNING; +} + +/* + * fork()/clone()-time setup: + */ +void sched_fork(struct task_struct *p, int clone_flags) +{ + int cpu = get_cpu(); + + __sched_fork(p); + +#ifdef CONFIG_SMP + cpu = sched_balance_self(cpu, SD_BALANCE_FORK); +#endif + __set_task_cpu(p, cpu); /* * Make sure we do not leak PI boosting priority to the child: */ p->prio = current->normal_prio; - INIT_LIST_HEAD(&p->run_list); - p->array = NULL; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) - if (unlikely(sched_info_on())) + if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) @@ -1671,118 +1654,119 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags) /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif - /* - * Share the timeslice between parent and child, thus the - * total amount of pending timeslices in the system doesn't change, - * resulting in more scheduling fairness. - */ - local_irq_disable(); - p->time_slice = (current->time_slice + 1) >> 1; - /* - * The remainder of the first timeslice might be recovered by - * the parent if the child exits early enough. - */ - p->first_time_slice = 1; - current->time_slice >>= 1; - p->timestamp = sched_clock(); - if (unlikely(!current->time_slice)) { + put_cpu(); +} + +/* + * After fork, child runs first. (default) If set to 0 then + * parent will (try to) run first. + */ +unsigned int __read_mostly sysctl_sched_child_runs_first = 1; + +/* + * wake_up_new_task - wake up a newly created task for the first time. + * + * This function will do some initial scheduler statistics housekeeping + * that must be done for every newly created context, then puts the task + * on the runqueue and wakes it. + */ +void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) +{ + unsigned long flags; + struct rq *rq; + int this_cpu; + + rq = task_rq_lock(p, &flags); + BUG_ON(p->state != TASK_RUNNING); + this_cpu = smp_processor_id(); /* parent's CPU */ + update_rq_clock(rq); + + p->prio = effective_prio(p); + + if (rt_prio(p->prio)) + p->sched_class = &rt_sched_class; + else + p->sched_class = &fair_sched_class; + + if (!p->sched_class->task_new || !sysctl_sched_child_runs_first || + (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu || + !current->se.on_rq) { + + activate_task(rq, p, 0); + } else { /* - * This case is rare, it happens when the parent has only - * a single jiffy left from its timeslice. Taking the - * runqueue lock is not a problem. + * Let the scheduling class do new task startup + * management (if any): */ - current->time_slice = 1; - task_running_tick(cpu_rq(cpu), current); + p->sched_class->task_new(rq, p); + inc_nr_running(p, rq); } - local_irq_enable(); - put_cpu(); + check_preempt_curr(rq, p); + task_rq_unlock(rq, &flags); +} + +#ifdef CONFIG_PREEMPT_NOTIFIERS + +/** + * preempt_notifier_register - tell me when current is being being preempted & rescheduled + * @notifier: notifier struct to register + */ +void preempt_notifier_register(struct preempt_notifier *notifier) +{ + hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); } +EXPORT_SYMBOL_GPL(preempt_notifier_register); -/* - * wake_up_new_task - wake up a newly created task for the first time. +/** + * preempt_notifier_unregister - no longer interested in preemption notifications + * @notifier: notifier struct to unregister * - * This function will do some initial scheduler statistics housekeeping - * that must be done for every newly created context, then puts the task - * on the runqueue and wakes it. + * This is safe to call from within a preemption notifier. */ -void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) +void preempt_notifier_unregister(struct preempt_notifier *notifier) { - struct rq *rq, *this_rq; - unsigned long flags; - int this_cpu, cpu; + hlist_del(¬ifier->link); +} +EXPORT_SYMBOL_GPL(preempt_notifier_unregister); - rq = task_rq_lock(p, &flags); - BUG_ON(p->state != TASK_RUNNING); - this_cpu = smp_processor_id(); - cpu = task_cpu(p); +static void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ + struct preempt_notifier *notifier; + struct hlist_node *node; - /* - * We decrease the sleep average of forking parents - * and children as well, to keep max-interactive tasks - * from forking tasks that are max-interactive. The parent - * (current) is done further down, under its lock. - */ - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * - CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) + notifier->ops->sched_in(notifier, raw_smp_processor_id()); +} - p->prio = effective_prio(p); +static void +fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) +{ + struct preempt_notifier *notifier; + struct hlist_node *node; - if (likely(cpu == this_cpu)) { - if (!(clone_flags & CLONE_VM)) { - /* - * The VM isn't cloned, so we're in a good position to - * do child-runs-first in anticipation of an exec. This - * usually avoids a lot of COW overhead. - */ - if (unlikely(!current->array)) - __activate_task(p, rq); - else { - p->prio = current->prio; - p->normal_prio = current->normal_prio; - list_add_tail(&p->run_list, ¤t->run_list); - p->array = current->array; - p->array->nr_active++; - inc_nr_running(p, rq); - } - set_need_resched(); - } else - /* Run child last */ - __activate_task(p, rq); - /* - * We skip the following code due to cpu == this_cpu - * - * task_rq_unlock(rq, &flags); - * this_rq = task_rq_lock(current, &flags); - */ - this_rq = rq; - } else { - this_rq = cpu_rq(this_cpu); + hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) + notifier->ops->sched_out(notifier, next); +} - /* - * Not the local CPU - must adjust timestamp. This should - * get optimised away in the !CONFIG_SMP case. - */ - p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) - + rq->most_recent_timestamp; - __activate_task(p, rq); - if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); +#else - /* - * Parent and child are on different CPUs, now get the - * parent runqueue to update the parent's ->sleep_avg: - */ - task_rq_unlock(rq, &flags); - this_rq = task_rq_lock(current, &flags); - } - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - task_rq_unlock(this_rq, &flags); +static void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ +} + +static void +fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) +{ } +#endif + /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch + * @prev: the current task that is being switched out * @next: the task we are going to switch to. * * This is called with the rq lock held and interrupts off. It must @@ -1792,8 +1776,11 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * prepare_task_switch sets up locking and calls architecture specific * hooks. */ -static inline void prepare_task_switch(struct rq *rq, struct task_struct *next) +static inline void +prepare_task_switch(struct rq *rq, struct task_struct *prev, + struct task_struct *next) { + fire_sched_out_preempt_notifiers(prev, next); prepare_lock_switch(rq, next); prepare_arch_switch(next); } @@ -1835,13 +1822,14 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) prev_state = prev->state; finish_arch_switch(prev); finish_lock_switch(rq, prev); + fire_sched_in_preempt_notifiers(current); if (mm) mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { /* * Remove function-return probe instances associated with this * task and put them back on the free list. - */ + */ kprobe_flush_task(prev); put_task_struct(prev); } @@ -1869,13 +1857,15 @@ asmlinkage void schedule_tail(struct task_struct *prev) * context_switch - switch to the new MM and the new * thread's register state. */ -static inline struct task_struct * +static inline void context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - struct mm_struct *mm = next->mm; - struct mm_struct *oldmm = prev->active_mm; + struct mm_struct *mm, *oldmm; + prepare_task_switch(rq, prev, next); + mm = next->mm; + oldmm = prev->active_mm; /* * For paravirt, this is coupled with an exit in switch_to to * combine the page table reload and the switch backend into @@ -1883,16 +1873,15 @@ context_switch(struct rq *rq, struct task_struct *prev, */ arch_enter_lazy_cpu_mode(); - if (!mm) { + if (unlikely(!mm)) { next->active_mm = oldmm; atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else switch_mm(oldmm, mm, next); - if (!prev->mm) { + if (unlikely(!prev->mm)) { prev->active_mm = NULL; - WARN_ON(rq->prev_mm); rq->prev_mm = oldmm; } /* @@ -1908,7 +1897,13 @@ context_switch(struct rq *rq, struct task_struct *prev, /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); - return prev; + barrier(); + /* + * this_rq must be evaluated again because prev may have moved + * CPUs since it called schedule(), thus the 'rq' on its stack + * frame will be invalid. + */ + finish_task_switch(this_rq(), prev); } /* @@ -1981,17 +1976,64 @@ unsigned long nr_active(void) return running + uninterruptible; } -#ifdef CONFIG_SMP - /* - * Is this task likely cache-hot: + * Update rq->cpu_load[] statistics. This function is usually called every + * scheduler tick (TICK_NSEC). */ -static inline int -task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd) +static void update_cpu_load(struct rq *this_rq) { - return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time; + u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64; + unsigned long total_load = this_rq->ls.load.weight; + unsigned long this_load = total_load; + struct load_stat *ls = &this_rq->ls; + int i, scale; + + this_rq->nr_load_updates++; + if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD))) + goto do_avg; + + /* Update delta_fair/delta_exec fields first */ + update_curr_load(this_rq); + + fair_delta64 = ls->delta_fair + 1; + ls->delta_fair = 0; + + exec_delta64 = ls->delta_exec + 1; + ls->delta_exec = 0; + + sample_interval64 = this_rq->clock - ls->load_update_last; + ls->load_update_last = this_rq->clock; + + if ((s64)sample_interval64 < (s64)TICK_NSEC) + sample_interval64 = TICK_NSEC; + + if (exec_delta64 > sample_interval64) + exec_delta64 = sample_interval64; + + idle_delta64 = sample_interval64 - exec_delta64; + + tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64); + tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64); + + this_load = (unsigned long)tmp64; + +do_avg: + + /* Update our load: */ + for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { + unsigned long old_load, new_load; + + /* scale is effectively 1 << i now, and >> i divides by scale */ + + old_load = this_rq->cpu_load[i]; + new_load = this_load; + + this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; + } } +#ifdef CONFIG_SMP + /* * double_rq_lock - safely lock two runqueues * @@ -2015,6 +2057,8 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2) spin_lock(&rq1->lock); } } + update_rq_clock(rq1); + update_rq_clock(rq2); } /* @@ -2108,23 +2152,17 @@ void sched_exec(void) * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. */ -static void pull_task(struct rq *src_rq, struct prio_array *src_array, - struct task_struct *p, struct rq *this_rq, - struct prio_array *this_array, int this_cpu) +static void pull_task(struct rq *src_rq, struct task_struct *p, + struct rq *this_rq, int this_cpu) { - dequeue_task(p, src_array); - dec_nr_running(p, src_rq); + deactivate_task(src_rq, p, 0); set_task_cpu(p, this_cpu); - inc_nr_running(p, this_rq); - enqueue_task(p, this_array); - p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) - + this_rq->most_recent_timestamp; + activate_task(this_rq, p, 0); /* * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. */ - if (TASK_PREEMPTS_CURR(p, this_rq)) - resched_task(this_rq->curr); + check_preempt_curr(this_rq, p); } /* @@ -2148,133 +2186,57 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, if (task_running(rq, p)) return 0; - /* - * Aggressive migration if: - * 1) task is cache cold, or - * 2) too many balance attempts have failed. - */ - - if (sd->nr_balance_failed > sd->cache_nice_tries) { -#ifdef CONFIG_SCHEDSTATS - if (task_hot(p, rq->most_recent_timestamp, sd)) - schedstat_inc(sd, lb_hot_gained[idle]); -#endif - return 1; - } - - if (task_hot(p, rq->most_recent_timestamp, sd)) - return 0; return 1; } -#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) - -/* - * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted - * load from busiest to this_rq, as part of a balancing operation within - * "domain". Returns the number of tasks moved. - * - * Called with both runqueues locked. - */ -static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, +static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_nr_move, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned) + int *all_pinned, unsigned long *load_moved, + int *this_best_prio, struct rq_iterator *iterator) { - int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, - best_prio_seen, skip_for_load; - struct prio_array *array, *dst_array; - struct list_head *head, *curr; - struct task_struct *tmp; - long rem_load_move; + int pulled = 0, pinned = 0, skip_for_load; + struct task_struct *p; + long rem_load_move = max_load_move; if (max_nr_move == 0 || max_load_move == 0) goto out; - rem_load_move = max_load_move; pinned = 1; - this_best_prio = rq_best_prio(this_rq); - best_prio = rq_best_prio(busiest); - /* - * Enable handling of the case where there is more than one task - * with the best priority. If the current running task is one - * of those with prio==best_prio we know it won't be moved - * and therefore it's safe to override the skip (based on load) of - * any task we find with that prio. - */ - best_prio_seen = best_prio == busiest->curr->prio; /* - * We first consider expired tasks. Those will likely not be - * executed in the near future, and they are most likely to - * be cache-cold, thus switching CPUs has the least effect - * on them. + * Start the load-balancing iterator: */ - if (busiest->expired->nr_active) { - array = busiest->expired; - dst_array = this_rq->expired; - } else { - array = busiest->active; - dst_array = this_rq->active; - } - -new_array: - /* Start searching at priority 0: */ - idx = 0; -skip_bitmap: - if (!idx) - idx = sched_find_first_bit(array->bitmap); - else - idx = find_next_bit(array->bitmap, MAX_PRIO, idx); - if (idx >= MAX_PRIO) { - if (array == busiest->expired && busiest->active->nr_active) { - array = busiest->active; - dst_array = this_rq->active; - goto new_array; - } + p = iterator->start(iterator->arg); +next: + if (!p) goto out; - } - - head = array->queue + idx; - curr = head->prev; -skip_queue: - tmp = list_entry(curr, struct task_struct, run_list); - - curr = curr->prev; - /* * To help distribute high priority tasks accross CPUs we don't * skip a task if it will be the highest priority task (i.e. smallest * prio value) on its new queue regardless of its load weight */ - skip_for_load = tmp->load_weight > rem_load_move; - if (skip_for_load && idx < this_best_prio) - skip_for_load = !best_prio_seen && idx == best_prio; - if (skip_for_load || - !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { - - best_prio_seen |= idx == best_prio; - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; + skip_for_load = (p->se.load.weight >> 1) > rem_load_move + + SCHED_LOAD_SCALE_FUZZ; + if ((skip_for_load && p->prio >= *this_best_prio) || + !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { + p = iterator->next(iterator->arg); + goto next; } - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); + pull_task(busiest, p, this_rq, this_cpu); pulled++; - rem_load_move -= tmp->load_weight; + rem_load_move -= p->se.load.weight; /* * We only want to steal up to the prescribed number of tasks * and the prescribed amount of weighted load. */ if (pulled < max_nr_move && rem_load_move > 0) { - if (idx < this_best_prio) - this_best_prio = idx; - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; + if (p->prio < *this_best_prio) + *this_best_prio = p->prio; + p = iterator->next(iterator->arg); + goto next; } out: /* @@ -2286,18 +2248,68 @@ out: if (all_pinned) *all_pinned = pinned; + *load_moved = max_load_move - rem_load_move; return pulled; } /* + * move_tasks tries to move up to max_load_move weighted load from busiest to + * this_rq, as part of a balancing operation within domain "sd". + * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. + */ +static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned) +{ + struct sched_class *class = sched_class_highest; + unsigned long total_load_moved = 0; + int this_best_prio = this_rq->curr->prio; + + do { + total_load_moved += + class->load_balance(this_rq, this_cpu, busiest, + ULONG_MAX, max_load_move - total_load_moved, + sd, idle, all_pinned, &this_best_prio); + class = class->next; + } while (class && max_load_move > total_load_moved); + + return total_load_moved > 0; +} + +/* + * move_one_task tries to move exactly one task from busiest to this_rq, as + * part of active balancing operations within "domain". + * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. + */ +static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle) +{ + struct sched_class *class; + int this_best_prio = MAX_PRIO; + + for (class = sched_class_highest; class; class = class->next) + if (class->load_balance(this_rq, this_cpu, busiest, + 1, ULONG_MAX, sd, idle, NULL, + &this_best_prio)) + return 1; + + return 0; +} + +/* * find_busiest_group finds and returns the busiest CPU group within the * domain. It calculates and returns the amount of weighted load which * should be moved to restore balance via the imbalance parameter. */ static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, - unsigned long *imbalance, enum cpu_idle_type idle, int *sd_idle, - cpumask_t *cpus, int *balance) + unsigned long *imbalance, enum cpu_idle_type idle, + int *sd_idle, cpumask_t *cpus, int *balance) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; @@ -2345,7 +2357,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, rq = cpu_rq(i); - if (*sd_idle && !idle_cpu(i)) + if (*sd_idle && rq->nr_running) *sd_idle = 0; /* Bias balancing toward cpus of our domain */ @@ -2361,15 +2373,17 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, avg_load += load; sum_nr_running += rq->nr_running; - sum_weighted_load += rq->raw_weighted_load; + sum_weighted_load += weighted_cpuload(i); } /* * First idle cpu or the first cpu(busiest) in this sched group * is eligible for doing load balancing at this and above - * domains. + * domains. In the newly idle case, we will allow all the cpu's + * to do the newly idle load balance. */ - if (local_group && balance_cpu != this_cpu && balance) { + if (idle != CPU_NEWLY_IDLE && local_group && + balance_cpu != this_cpu && balance) { *balance = 0; goto ret; } @@ -2401,8 +2415,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * Busy processors will not participate in power savings * balance. */ - if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) - goto group_next; + if (idle == CPU_NOT_IDLE || + !(sd->flags & SD_POWERSAVINGS_BALANCE)) + goto group_next; /* * If the local group is idle or completely loaded @@ -2412,42 +2427,42 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, !this_nr_running)) power_savings_balance = 0; - /* + /* * If a group is already running at full capacity or idle, * don't include that group in power savings calculations - */ - if (!power_savings_balance || sum_nr_running >= group_capacity + */ + if (!power_savings_balance || sum_nr_running >= group_capacity || !sum_nr_running) - goto group_next; + goto group_next; - /* + /* * Calculate the group which has the least non-idle load. - * This is the group from where we need to pick up the load - * for saving power - */ - if ((sum_nr_running < min_nr_running) || - (sum_nr_running == min_nr_running && + * This is the group from where we need to pick up the load + * for saving power + */ + if ((sum_nr_running < min_nr_running) || + (sum_nr_running == min_nr_running && first_cpu(group->cpumask) < first_cpu(group_min->cpumask))) { - group_min = group; - min_nr_running = sum_nr_running; + group_min = group; + min_nr_running = sum_nr_running; min_load_per_task = sum_weighted_load / sum_nr_running; - } + } - /* + /* * Calculate the group which is almost near its - * capacity but still has some space to pick up some load - * from other group and save more power - */ - if (sum_nr_running <= group_capacity - 1) { - if (sum_nr_running > leader_nr_running || - (sum_nr_running == leader_nr_running && - first_cpu(group->cpumask) > - first_cpu(group_leader->cpumask))) { - group_leader = group; - leader_nr_running = sum_nr_running; - } + * capacity but still has some space to pick up some load + * from other group and save more power + */ + if (sum_nr_running <= group_capacity - 1) { + if (sum_nr_running > leader_nr_running || + (sum_nr_running == leader_nr_running && + first_cpu(group->cpumask) > + first_cpu(group_leader->cpumask))) { + group_leader = group; + leader_nr_running = sum_nr_running; + } } group_next: #endif @@ -2516,7 +2531,8 @@ small_imbalance: } else this_load_per_task = SCHED_LOAD_SCALE; - if (max_load - this_load >= busiest_load_per_task * imbn) { + if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= + busiest_load_per_task * imbn) { *imbalance = busiest_load_per_task; return busiest; } @@ -2553,10 +2569,8 @@ small_imbalance: pwr_move /= SCHED_LOAD_SCALE; /* Move if we gain throughput */ - if (pwr_move <= pwr_now) - goto out_balanced; - - *imbalance = busiest_load_per_task; + if (pwr_move > pwr_now) + *imbalance = busiest_load_per_task; } return busiest; @@ -2588,17 +2602,19 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, int i; for_each_cpu_mask(i, group->cpumask) { + unsigned long wl; if (!cpu_isset(i, *cpus)) continue; rq = cpu_rq(i); + wl = weighted_cpuload(i); - if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) + if (rq->nr_running == 1 && wl > imbalance) continue; - if (rq->raw_weighted_load > max_load) { - max_load = rq->raw_weighted_load; + if (wl > max_load) { + max_load = wl; busiest = rq; } } @@ -2612,11 +2628,6 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, */ #define MAX_PINNED_INTERVAL 512 -static inline unsigned long minus_1_or_zero(unsigned long n) -{ - return n > 0 ? n - 1 : 0; -} - /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. @@ -2625,7 +2636,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { - int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; + int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; @@ -2635,7 +2646,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, /* * When power savings policy is enabled for the parent domain, idle * sibling can pick up load irrespective of busy siblings. In this case, - * let the state of idle sibling percolate up as IDLE, instead of + * let the state of idle sibling percolate up as CPU_IDLE, instead of * portraying it as CPU_NOT_IDLE. */ if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && @@ -2666,18 +2677,17 @@ redo: schedstat_add(sd, lb_imbalance[idle], imbalance); - nr_moved = 0; + ld_moved = 0; if (busiest->nr_running > 1) { /* * Attempt to move tasks. If find_busiest_group has found * an imbalance but busiest->nr_running <= 1, the group is - * still unbalanced. nr_moved simply stays zero, so it is + * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ local_irq_save(flags); double_rq_lock(this_rq, busiest); - nr_moved = move_tasks(this_rq, this_cpu, busiest, - minus_1_or_zero(busiest->nr_running), + ld_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, idle, &all_pinned); double_rq_unlock(this_rq, busiest); local_irq_restore(flags); @@ -2685,7 +2695,7 @@ redo: /* * some other cpu did the load balance for us. */ - if (nr_moved && this_cpu != smp_processor_id()) + if (ld_moved && this_cpu != smp_processor_id()) resched_cpu(this_cpu); /* All tasks on this runqueue were pinned by CPU affinity */ @@ -2697,7 +2707,7 @@ redo: } } - if (!nr_moved) { + if (!ld_moved) { schedstat_inc(sd, lb_failed[idle]); sd->nr_balance_failed++; @@ -2746,10 +2756,10 @@ redo: sd->balance_interval *= 2; } - if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && + if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; - return nr_moved; + return ld_moved; out_balanced: schedstat_inc(sd, lb_balanced[idle]); @@ -2781,8 +2791,9 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) struct sched_group *group; struct rq *busiest = NULL; unsigned long imbalance; - int nr_moved = 0; + int ld_moved = 0; int sd_idle = 0; + int all_pinned = 0; cpumask_t cpus = CPU_MASK_ALL; /* @@ -2815,23 +2826,25 @@ redo: schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); - nr_moved = 0; + ld_moved = 0; if (busiest->nr_running > 1) { /* Attempt to move tasks */ double_lock_balance(this_rq, busiest); - nr_moved = move_tasks(this_rq, this_cpu, busiest, - minus_1_or_zero(busiest->nr_running), - imbalance, sd, CPU_NEWLY_IDLE, NULL); + /* this_rq->clock is already updated */ + update_rq_clock(busiest); + ld_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, sd, CPU_NEWLY_IDLE, + &all_pinned); spin_unlock(&busiest->lock); - if (!nr_moved) { + if (unlikely(all_pinned)) { cpu_clear(cpu_of(busiest), cpus); if (!cpus_empty(cpus)) goto redo; } } - if (!nr_moved) { + if (!ld_moved) { schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) @@ -2839,7 +2852,7 @@ redo: } else sd->nr_balance_failed = 0; - return nr_moved; + return ld_moved; out_balanced: schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); @@ -2858,8 +2871,8 @@ out_balanced: static void idle_balance(int this_cpu, struct rq *this_rq) { struct sched_domain *sd; - int pulled_task = 0; - unsigned long next_balance = jiffies + 60 * HZ; + int pulled_task = -1; + unsigned long next_balance = jiffies + HZ; for_each_domain(this_cpu, sd) { unsigned long interval; @@ -2878,12 +2891,13 @@ static void idle_balance(int this_cpu, struct rq *this_rq) if (pulled_task) break; } - if (!pulled_task) + if (pulled_task || time_after(jiffies, this_rq->next_balance)) { /* * We are going idle. next_balance may be set based on * a busy processor. So reset next_balance. */ this_rq->next_balance = next_balance; + } } /* @@ -2915,6 +2929,8 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) /* move a task from busiest_rq to target_rq */ double_lock_balance(busiest_rq, target_rq); + update_rq_clock(busiest_rq); + update_rq_clock(target_rq); /* Search for an sd spanning us and the target CPU. */ for_each_domain(target_cpu, sd) { @@ -2926,9 +2942,8 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) if (likely(sd)) { schedstat_inc(sd, alb_cnt); - if (move_tasks(target_rq, target_cpu, busiest_rq, 1, - RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE, - NULL)) + if (move_one_task(target_rq, target_cpu, busiest_rq, + sd, CPU_IDLE)) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); @@ -2936,32 +2951,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) spin_unlock(&target_rq->lock); } -static void update_load(struct rq *this_rq) -{ - unsigned long this_load; - unsigned int i, scale; - - this_load = this_rq->raw_weighted_load; - - /* Update our load: */ - for (i = 0, scale = 1; i < 3; i++, scale += scale) { - unsigned long old_load, new_load; - - /* scale is effectively 1 << i now, and >> i divides by scale */ - - old_load = this_rq->cpu_load[i]; - new_load = this_load; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. - */ - if (new_load > old_load) - new_load += scale-1; - this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; - } -} - #ifdef CONFIG_NO_HZ static struct { atomic_t load_balancer; @@ -3052,6 +3041,7 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) struct sched_domain *sd; /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; + int update_next_balance = 0; for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -3065,6 +3055,9 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) interval = msecs_to_jiffies(interval); if (unlikely(!interval)) interval = 1; + if (interval > HZ*NR_CPUS/10) + interval = HZ*NR_CPUS/10; + if (sd->flags & SD_SERIALIZE) { if (!spin_trylock(&balancing)) @@ -3085,8 +3078,10 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) if (sd->flags & SD_SERIALIZE) spin_unlock(&balancing); out: - if (time_after(next_balance, sd->last_balance + interval)) + if (time_after(next_balance, sd->last_balance + interval)) { next_balance = sd->last_balance + interval; + update_next_balance = 1; + } /* * Stop the load balance at this level. There is another @@ -3096,7 +3091,14 @@ out: if (!balance) break; } - rq->next_balance = next_balance; + + /* + * next_balance will be updated only when there is a need. + * When the cpu is attached to null domain for ex, it will not be + * updated. + */ + if (likely(update_next_balance)) + rq->next_balance = next_balance; } /* @@ -3106,11 +3108,12 @@ out: */ static void run_rebalance_domains(struct softirq_action *h) { - int local_cpu = smp_processor_id(); - struct rq *local_rq = cpu_rq(local_cpu); - enum cpu_idle_type idle = local_rq->idle_at_tick ? CPU_IDLE : CPU_NOT_IDLE; + int this_cpu = smp_processor_id(); + struct rq *this_rq = cpu_rq(this_cpu); + enum cpu_idle_type idle = this_rq->idle_at_tick ? + CPU_IDLE : CPU_NOT_IDLE; - rebalance_domains(local_cpu, idle); + rebalance_domains(this_cpu, idle); #ifdef CONFIG_NO_HZ /* @@ -3118,13 +3121,13 @@ static void run_rebalance_domains(struct softirq_action *h) * balancing on behalf of the other idle cpus whose ticks are * stopped. */ - if (local_rq->idle_at_tick && - atomic_read(&nohz.load_balancer) == local_cpu) { + if (this_rq->idle_at_tick && + atomic_read(&nohz.load_balancer) == this_cpu) { cpumask_t cpus = nohz.cpu_mask; struct rq *rq; int balance_cpu; - cpu_clear(local_cpu, cpus); + cpu_clear(this_cpu, cpus); for_each_cpu_mask(balance_cpu, cpus) { /* * If this cpu gets work to do, stop the load balancing @@ -3137,8 +3140,8 @@ static void run_rebalance_domains(struct softirq_action *h) rebalance_domains(balance_cpu, CPU_IDLE); rq = cpu_rq(balance_cpu); - if (time_after(local_rq->next_balance, rq->next_balance)) - local_rq->next_balance = rq->next_balance; + if (time_after(this_rq->next_balance, rq->next_balance)) + this_rq->next_balance = rq->next_balance; } } #endif @@ -3151,9 +3154,8 @@ static void run_rebalance_domains(struct softirq_action *h) * idle load balancing owner or decide to stop the periodic load balancing, * if the whole system is idle. */ -static inline void trigger_load_balance(int cpu) +static inline void trigger_load_balance(struct rq *rq, int cpu) { - struct rq *rq = cpu_rq(cpu); #ifdef CONFIG_NO_HZ /* * If we were in the nohz mode recently and busy at the current @@ -3205,13 +3207,28 @@ static inline void trigger_load_balance(int cpu) if (time_after_eq(jiffies, rq->next_balance)) raise_softirq(SCHED_SOFTIRQ); } -#else + +#else /* CONFIG_SMP */ + /* * on UP we do not need to balance between CPUs: */ static inline void idle_balance(int cpu, struct rq *rq) { } + +/* Avoid "used but not defined" warning on UP */ +static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_nr_move, unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, unsigned long *load_moved, + int *this_best_prio, struct rq_iterator *iterator) +{ + *load_moved = 0; + + return 0; +} + #endif DEFINE_PER_CPU(struct kernel_stat, kstat); @@ -3231,7 +3248,8 @@ unsigned long long task_sched_runtime(struct task_struct *p) rq = task_rq_lock(p, &flags); ns = p->se.sum_exec_runtime; if (rq->curr == p) { - delta_exec = rq_clock(rq) - p->se.exec_start; + update_rq_clock(rq); + delta_exec = rq->clock - p->se.exec_start; if ((s64)delta_exec > 0) ns += delta_exec; } @@ -3241,27 +3259,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) } /* - * We place interactive tasks back into the active array, if possible. - * - * To guarantee that this does not starve expired tasks we ignore the - * interactivity of a task if the first expired task had to wait more - * than a 'reasonable' amount of time. This deadline timeout is - * load-dependent, as the frequency of array switched decreases with - * increasing number of running tasks. We also ignore the interactivity - * if a better static_prio task has expired: - */ -static inline int expired_starving(struct rq *rq) -{ - if (rq->curr->static_prio > rq->best_expired_prio) - return 1; - if (!STARVATION_LIMIT || !rq->expired_timestamp) - return 0; - if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) - return 1; - return 0; -} - -/* * Account user cpu time to a process. * @p: the process that the cpu time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() @@ -3334,81 +3331,6 @@ void account_steal_time(struct task_struct *p, cputime_t steal) cpustat->steal = cputime64_add(cpustat->steal, tmp); } -static void task_running_tick(struct rq *rq, struct task_struct *p) -{ - if (p->array != rq->active) { - /* Task has expired but was not scheduled yet */ - set_tsk_need_resched(p); - return; - } - spin_lock(&rq->lock); - /* - * The task was running during this tick - update the - * time slice counter. Note: we do not update a thread's - * priority until it either goes to sleep or uses up its - * timeslice. This makes it possible for interactive tasks - * to use up their timeslices at their highest priority levels. - */ - if (rt_task(p)) { - /* - * RR tasks need a special form of timeslice management. - * FIFO tasks have no timeslices. - */ - if ((p->policy == SCHED_RR) && !--p->time_slice) { - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - set_tsk_need_resched(p); - - /* put it at the end of the queue: */ - requeue_task(p, rq->active); - } - goto out_unlock; - } - if (!--p->time_slice) { - dequeue_task(p, rq->active); - set_tsk_need_resched(p); - p->prio = effective_prio(p); - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - - if (!rq->expired_timestamp) - rq->expired_timestamp = jiffies; - if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { - enqueue_task(p, rq->expired); - if (p->static_prio < rq->best_expired_prio) - rq->best_expired_prio = p->static_prio; - } else - enqueue_task(p, rq->active); - } else { - /* - * Prevent a too long timeslice allowing a task to monopolize - * the CPU. We do this by splitting up the timeslice into - * smaller pieces. - * - * Note: this does not mean the task's timeslices expire or - * get lost in any way, they just might be preempted by - * another task of equal priority. (one with higher - * priority would have preempted this task already.) We - * requeue this task to the end of the list on this priority - * level, which is in essence a round-robin of tasks with - * equal priority. - * - * This only applies to tasks in the interactive - * delta range with at least TIMESLICE_GRANULARITY to requeue. - */ - if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - - p->time_slice) % TIMESLICE_GRANULARITY(p)) && - (p->time_slice >= TIMESLICE_GRANULARITY(p)) && - (p->array == rq->active)) { - - requeue_task(p, rq->active); - set_tsk_need_resched(p); - } - } -out_unlock: - spin_unlock(&rq->lock); -} - /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. @@ -3418,17 +3340,27 @@ out_unlock: */ void scheduler_tick(void) { - struct task_struct *p = current; int cpu = smp_processor_id(); - int idle_at_tick = idle_cpu(cpu); struct rq *rq = cpu_rq(cpu); + struct task_struct *curr = rq->curr; + u64 next_tick = rq->tick_timestamp + TICK_NSEC; + + spin_lock(&rq->lock); + __update_rq_clock(rq); + /* + * Let rq->clock advance by at least TICK_NSEC: + */ + if (unlikely(rq->clock < next_tick)) + rq->clock = next_tick; + rq->tick_timestamp = rq->clock; + update_cpu_load(rq); + if (curr != rq->idle) /* FIXME: needed? */ + curr->sched_class->task_tick(rq, curr); + spin_unlock(&rq->lock); - if (!idle_at_tick) - task_running_tick(rq, p); #ifdef CONFIG_SMP - update_load(rq); - rq->idle_at_tick = idle_at_tick; - trigger_load_balance(cpu); + rq->idle_at_tick = idle_cpu(cpu); + trigger_load_balance(rq, cpu); #endif } @@ -3458,17 +3390,80 @@ void fastcall sub_preempt_count(int val) if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) return; /* - * Is the spinlock portion underflowing? + * Is the spinlock portion underflowing? + */ + if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && + !(preempt_count() & PREEMPT_MASK))) + return; + + preempt_count() -= val; +} +EXPORT_SYMBOL(sub_preempt_count); + +#endif + +/* + * Print scheduling while atomic bug: + */ +static noinline void __schedule_bug(struct task_struct *prev) +{ + printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n", + prev->comm, preempt_count(), prev->pid); + debug_show_held_locks(prev); + if (irqs_disabled()) + print_irqtrace_events(prev); + dump_stack(); +} + +/* + * Various schedule()-time debugging checks and statistics: + */ +static inline void schedule_debug(struct task_struct *prev) +{ + /* + * Test if we are atomic. Since do_exit() needs to call into + * schedule() atomically, we ignore that path for now. + * Otherwise, whine if we are scheduling when we should not be. + */ + if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state)) + __schedule_bug(prev); + + profile_hit(SCHED_PROFILING, __builtin_return_address(0)); + + schedstat_inc(this_rq(), sched_cnt); +} + +/* + * Pick up the highest-prio task: + */ +static inline struct task_struct * +pick_next_task(struct rq *rq, struct task_struct *prev) +{ + struct sched_class *class; + struct task_struct *p; + + /* + * Optimization: we know that if all tasks are in + * the fair class we can call that function directly: */ - if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && - !(preempt_count() & PREEMPT_MASK))) - return; + if (likely(rq->nr_running == rq->cfs.nr_running)) { + p = fair_sched_class.pick_next_task(rq); + if (likely(p)) + return p; + } - preempt_count() -= val; + class = sched_class_highest; + for ( ; ; ) { + p = class->pick_next_task(rq); + if (p) + return p; + /* + * Will never be NULL as the idle class always + * returns a non-NULL p: + */ + class = class->next; + } } -EXPORT_SYMBOL(sub_preempt_count); - -#endif /* * schedule() is the main scheduler function. @@ -3476,138 +3471,59 @@ EXPORT_SYMBOL(sub_preempt_count); asmlinkage void __sched schedule(void) { struct task_struct *prev, *next; - struct prio_array *array; - struct list_head *queue; - unsigned long long now; - unsigned long run_time; - int cpu, idx; long *switch_count; struct rq *rq; - - /* - * Test if we are atomic. Since do_exit() needs to call into - * schedule() atomically, we ignore that path for now. - * Otherwise, whine if we are scheduling when we should not be. - */ - if (unlikely(in_atomic() && !current->exit_state)) { - printk(KERN_ERR "BUG: scheduling while atomic: " - "%s/0x%08x/%d\n", - current->comm, preempt_count(), current->pid); - debug_show_held_locks(current); - if (irqs_disabled()) - print_irqtrace_events(current); - dump_stack(); - } - profile_hit(SCHED_PROFILING, __builtin_return_address(0)); + int cpu; need_resched: preempt_disable(); - prev = current; + cpu = smp_processor_id(); + rq = cpu_rq(cpu); + rcu_qsctr_inc(cpu); + prev = rq->curr; + switch_count = &prev->nivcsw; + release_kernel_lock(prev); need_resched_nonpreemptible: - rq = this_rq(); - - /* - * The idle thread is not allowed to schedule! - * Remove this check after it has been exercised a bit. - */ - if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) { - printk(KERN_ERR "bad: scheduling from the idle thread!\n"); - dump_stack(); - } - - schedstat_inc(rq, sched_cnt); - now = sched_clock(); - if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { - run_time = now - prev->timestamp; - if (unlikely((long long)(now - prev->timestamp) < 0)) - run_time = 0; - } else - run_time = NS_MAX_SLEEP_AVG; - /* - * Tasks charged proportionately less run_time at high sleep_avg to - * delay them losing their interactive status - */ - run_time /= (CURRENT_BONUS(prev) ? : 1); + schedule_debug(prev); spin_lock_irq(&rq->lock); + clear_tsk_need_resched(prev); + __update_rq_clock(rq); - switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - switch_count = &prev->nvcsw; if (unlikely((prev->state & TASK_INTERRUPTIBLE) && - unlikely(signal_pending(prev)))) + unlikely(signal_pending(prev)))) { prev->state = TASK_RUNNING; - else { - if (prev->state == TASK_UNINTERRUPTIBLE) - rq->nr_uninterruptible++; - deactivate_task(prev, rq); + } else { + deactivate_task(rq, prev, 1); } + switch_count = &prev->nvcsw; } - cpu = smp_processor_id(); - if (unlikely(!rq->nr_running)) { + if (unlikely(!rq->nr_running)) idle_balance(cpu, rq); - if (!rq->nr_running) { - next = rq->idle; - rq->expired_timestamp = 0; - goto switch_tasks; - } - } - - array = rq->active; - if (unlikely(!array->nr_active)) { - /* - * Switch the active and expired arrays. - */ - schedstat_inc(rq, sched_switch); - rq->active = rq->expired; - rq->expired = array; - array = rq->active; - rq->expired_timestamp = 0; - rq->best_expired_prio = MAX_PRIO; - } - - idx = sched_find_first_bit(array->bitmap); - queue = array->queue + idx; - next = list_entry(queue->next, struct task_struct, run_list); - -switch_tasks: - if (next == rq->idle) - schedstat_inc(rq, sched_goidle); - prefetch(next); - prefetch_stack(next); - clear_tsk_need_resched(prev); - rcu_qsctr_inc(task_cpu(prev)); - prev->sleep_avg -= run_time; - if ((long)prev->sleep_avg <= 0) - prev->sleep_avg = 0; - prev->timestamp = prev->last_ran = now; + prev->sched_class->put_prev_task(rq, prev); + next = pick_next_task(rq, prev); sched_info_switch(prev, next); + if (likely(prev != next)) { - next->timestamp = next->last_ran = now; rq->nr_switches++; rq->curr = next; ++*switch_count; - prepare_task_switch(rq, next); - prev = context_switch(rq, prev, next); - barrier(); - /* - * this_rq must be evaluated again because prev may have moved - * CPUs since it called schedule(), thus the 'rq' on its stack - * frame will be invalid. - */ - finish_task_switch(this_rq(), prev); + context_switch(rq, prev, next); /* unlocks the rq */ } else spin_unlock_irq(&rq->lock); - prev = current; - if (unlikely(reacquire_kernel_lock(prev) < 0)) + if (unlikely(reacquire_kernel_lock(current) < 0)) { + cpu = smp_processor_id(); + rq = cpu_rq(cpu); goto need_resched_nonpreemptible; + } preempt_enable_no_resched(); if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) goto need_resched; @@ -3935,74 +3851,85 @@ out: } EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); - -#define SLEEP_ON_VAR \ - unsigned long flags; \ - wait_queue_t wait; \ - init_waitqueue_entry(&wait, current); - -#define SLEEP_ON_HEAD \ - spin_lock_irqsave(&q->lock,flags); \ - __add_wait_queue(q, &wait); \ +static inline void +sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) +{ + spin_lock_irqsave(&q->lock, *flags); + __add_wait_queue(q, wait); spin_unlock(&q->lock); +} -#define SLEEP_ON_TAIL \ - spin_lock_irq(&q->lock); \ - __remove_wait_queue(q, &wait); \ - spin_unlock_irqrestore(&q->lock, flags); +static inline void +sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) +{ + spin_lock_irq(&q->lock); + __remove_wait_queue(q, wait); + spin_unlock_irqrestore(&q->lock, *flags); +} -void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) +void __sched interruptible_sleep_on(wait_queue_head_t *q) { - SLEEP_ON_VAR + unsigned long flags; + wait_queue_t wait; + + init_waitqueue_entry(&wait, current); current->state = TASK_INTERRUPTIBLE; - SLEEP_ON_HEAD + sleep_on_head(q, &wait, &flags); schedule(); - SLEEP_ON_TAIL + sleep_on_tail(q, &wait, &flags); } EXPORT_SYMBOL(interruptible_sleep_on); -long fastcall __sched +long __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) { - SLEEP_ON_VAR + unsigned long flags; + wait_queue_t wait; + + init_waitqueue_entry(&wait, current); current->state = TASK_INTERRUPTIBLE; - SLEEP_ON_HEAD + sleep_on_head(q, &wait, &flags); timeout = schedule_timeout(timeout); - SLEEP_ON_TAIL + sleep_on_tail(q, &wait, &flags); return timeout; } EXPORT_SYMBOL(interruptible_sleep_on_timeout); -void fastcall __sched sleep_on(wait_queue_head_t *q) +void __sched sleep_on(wait_queue_head_t *q) { - SLEEP_ON_VAR + unsigned long flags; + wait_queue_t wait; + + init_waitqueue_entry(&wait, current); current->state = TASK_UNINTERRUPTIBLE; - SLEEP_ON_HEAD + sleep_on_head(q, &wait, &flags); schedule(); - SLEEP_ON_TAIL + sleep_on_tail(q, &wait, &flags); } EXPORT_SYMBOL(sleep_on); -long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) +long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) { - SLEEP_ON_VAR + unsigned long flags; + wait_queue_t wait; + + init_waitqueue_entry(&wait, current); current->state = TASK_UNINTERRUPTIBLE; - SLEEP_ON_HEAD + sleep_on_head(q, &wait, &flags); timeout = schedule_timeout(timeout); - SLEEP_ON_TAIL + sleep_on_tail(q, &wait, &flags); return timeout; } - EXPORT_SYMBOL(sleep_on_timeout); #ifdef CONFIG_RT_MUTEXES @@ -4019,29 +3946,29 @@ EXPORT_SYMBOL(sleep_on_timeout); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - struct prio_array *array; unsigned long flags; + int oldprio, on_rq; struct rq *rq; - int oldprio; BUG_ON(prio < 0 || prio > MAX_PRIO); rq = task_rq_lock(p, &flags); + update_rq_clock(rq); oldprio = p->prio; - array = p->array; - if (array) - dequeue_task(p, array); + on_rq = p->se.on_rq; + if (on_rq) + dequeue_task(rq, p, 0); + + if (rt_prio(prio)) + p->sched_class = &rt_sched_class; + else + p->sched_class = &fair_sched_class; + p->prio = prio; - if (array) { - /* - * If changing to an RT priority then queue it - * in the active array! - */ - if (rt_task(p)) - array = rq->active; - enqueue_task(p, array); + if (on_rq) { + enqueue_task(rq, p, 0); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on @@ -4050,8 +3977,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (task_running(rq, p)) { if (p->prio > oldprio) resched_task(rq->curr); - } else if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); + } else { + check_preempt_curr(rq, p); + } } task_rq_unlock(rq, &flags); } @@ -4060,8 +3988,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) void set_user_nice(struct task_struct *p, long nice) { - struct prio_array *array; - int old_prio, delta; + int old_prio, delta, on_rq; unsigned long flags; struct rq *rq; @@ -4072,20 +3999,21 @@ void set_user_nice(struct task_struct *p, long nice) * the task might be in the middle of scheduling on another CPU. */ rq = task_rq_lock(p, &flags); + update_rq_clock(rq); /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected * it wont have any effect on scheduling until the task is - * not SCHED_NORMAL/SCHED_BATCH: + * SCHED_FIFO/SCHED_RR: */ if (task_has_rt_policy(p)) { p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } - array = p->array; - if (array) { - dequeue_task(p, array); - dec_raw_weighted_load(rq, p); + on_rq = p->se.on_rq; + if (on_rq) { + dequeue_task(rq, p, 0); + dec_load(rq, p); } p->static_prio = NICE_TO_PRIO(nice); @@ -4094,9 +4022,9 @@ void set_user_nice(struct task_struct *p, long nice) p->prio = effective_prio(p); delta = p->prio - old_prio; - if (array) { - enqueue_task(p, array); - inc_raw_weighted_load(rq, p); + if (on_rq) { + enqueue_task(rq, p, 0); + inc_load(rq, p); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -4216,20 +4144,28 @@ static inline struct task_struct *find_process_by_pid(pid_t pid) } /* Actually do priority change: must hold rq lock. */ -static void __setscheduler(struct task_struct *p, int policy, int prio) +static void +__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) { - BUG_ON(p->array); + BUG_ON(p->se.on_rq); p->policy = policy; + switch (p->policy) { + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_IDLE: + p->sched_class = &fair_sched_class; + break; + case SCHED_FIFO: + case SCHED_RR: + p->sched_class = &rt_sched_class; + break; + } + p->rt_priority = prio; p->normal_prio = normal_prio(p); /* we are holding p->pi_lock already */ p->prio = rt_mutex_getprio(p); - /* - * SCHED_BATCH tasks are treated as perpetual CPU hogs: - */ - if (policy == SCHED_BATCH) - p->sleep_avg = 0; set_load_weight(p); } @@ -4244,8 +4180,7 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) { - int retval, oldprio, oldpolicy = -1; - struct prio_array *array; + int retval, oldprio, oldpolicy = -1, on_rq; unsigned long flags; struct rq *rq; @@ -4256,12 +4191,13 @@ recheck: if (policy < 0) policy = oldpolicy = p->policy; else if (policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_NORMAL && policy != SCHED_BATCH) + policy != SCHED_NORMAL && policy != SCHED_BATCH && + policy != SCHED_IDLE) return -EINVAL; /* * Valid priorities for SCHED_FIFO and SCHED_RR are - * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and - * SCHED_BATCH is 0. + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, + * SCHED_BATCH and SCHED_IDLE is 0. */ if (param->sched_priority < 0 || (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || @@ -4276,7 +4212,6 @@ recheck: if (!capable(CAP_SYS_NICE)) { if (rt_policy(policy)) { unsigned long rlim_rtprio; - unsigned long flags; if (!lock_task_sighand(p, &flags)) return -ESRCH; @@ -4292,6 +4227,12 @@ recheck: param->sched_priority > rlim_rtprio) return -EPERM; } + /* + * Like positive nice levels, dont allow tasks to + * move out of SCHED_IDLE either: + */ + if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) + return -EPERM; /* can't change other user's priorities */ if ((current->euid != p->euid) && @@ -4319,13 +4260,14 @@ recheck: spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } - array = p->array; - if (array) - deactivate_task(p, rq); + update_rq_clock(rq); + on_rq = p->se.on_rq; + if (on_rq) + deactivate_task(rq, p, 0); oldprio = p->prio; - __setscheduler(p, policy, param->sched_priority); - if (array) { - __activate_task(p, rq); + __setscheduler(rq, p, policy, param->sched_priority); + if (on_rq) { + activate_task(rq, p, 0); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on @@ -4334,8 +4276,9 @@ recheck: if (task_running(rq, p)) { if (p->prio > oldprio) resched_task(rq->curr); - } else if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); + } else { + check_preempt_curr(rq, p); + } } __task_rq_unlock(rq); spin_unlock_irqrestore(&p->pi_lock, flags); @@ -4573,10 +4516,8 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) out_unlock: read_unlock(&tasklist_lock); mutex_unlock(&sched_hotcpu_mutex); - if (retval) - return retval; - return 0; + return retval; } /** @@ -4607,41 +4548,15 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, /** * sys_sched_yield - yield the current processor to other threads. * - * This function yields the current CPU by moving the calling thread - * to the expired array. If there are no other threads running on this - * CPU then this function will return. + * This function yields the current CPU to other tasks. If there are no + * other threads running on this CPU then this function will return. */ asmlinkage long sys_sched_yield(void) { struct rq *rq = this_rq_lock(); - struct prio_array *array = current->array, *target = rq->expired; schedstat_inc(rq, yld_cnt); - /* - * We implement yielding by moving the task into the expired - * queue. - * - * (special rule: RT tasks will just roundrobin in the active - * array.) - */ - if (rt_task(current)) - target = rq->active; - - if (array->nr_active == 1) { - schedstat_inc(rq, yld_act_empty); - if (!rq->expired->nr_active) - schedstat_inc(rq, yld_both_empty); - } else if (!rq->expired->nr_active) - schedstat_inc(rq, yld_exp_empty); - - if (array != target) { - dequeue_task(current, array); - enqueue_task(current, target); - } else - /* - * requeue_task is cheaper so perform that if possible. - */ - requeue_task(current, array); + current->sched_class->yield_task(rq, current); /* * Since we are going to call schedule() anyway, there's @@ -4792,6 +4707,7 @@ asmlinkage long sys_sched_get_priority_max(int policy) break; case SCHED_NORMAL: case SCHED_BATCH: + case SCHED_IDLE: ret = 0; break; } @@ -4816,6 +4732,7 @@ asmlinkage long sys_sched_get_priority_min(int policy) break; case SCHED_NORMAL: case SCHED_BATCH: + case SCHED_IDLE: ret = 0; } return ret; @@ -4850,7 +4767,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) goto out_unlock; jiffies_to_timespec(p->policy == SCHED_FIFO ? - 0 : task_timeslice(p), &t); + 0 : static_prio_timeslice(p->static_prio), &t); read_unlock(&tasklist_lock); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; out_nounlock: @@ -4870,14 +4787,14 @@ static void show_task(struct task_struct *p) state = p->state ? __ffs(p->state) + 1 : 0; printk("%-13.13s %c", p->comm, state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); -#if (BITS_PER_LONG == 32) +#if BITS_PER_LONG == 32 if (state == TASK_RUNNING) - printk(" running "); + printk(" running "); else - printk(" %08lX ", thread_saved_pc(p)); + printk(" %08lx ", thread_saved_pc(p)); #else if (state == TASK_RUNNING) - printk(" running task "); + printk(" running task "); else printk(" %016lx ", thread_saved_pc(p)); #endif @@ -4889,11 +4806,7 @@ static void show_task(struct task_struct *p) free = (unsigned long)n - (unsigned long)end_of_stack(p); } #endif - printk("%5lu %5d %6d", free, p->pid, p->parent->pid); - if (!p->mm) - printk(" (L-TLB)\n"); - else - printk(" (NOTLB)\n"); + printk("%5lu %5d %6d\n", free, p->pid, p->parent->pid); if (state != TASK_RUNNING) show_stack(p, NULL); @@ -4903,14 +4816,12 @@ void show_state_filter(unsigned long state_filter) { struct task_struct *g, *p; -#if (BITS_PER_LONG == 32) - printk("\n" - " free sibling\n"); - printk(" task PC stack pid father child younger older\n"); +#if BITS_PER_LONG == 32 + printk(KERN_INFO + " task PC stack pid father\n"); #else - printk("\n" - " free sibling\n"); - printk(" task PC stack pid father child younger older\n"); + printk(KERN_INFO + " task PC stack pid father\n"); #endif read_lock(&tasklist_lock); do_each_thread(g, p) { @@ -4925,6 +4836,9 @@ void show_state_filter(unsigned long state_filter) touch_all_softlockup_watchdogs(); +#ifdef CONFIG_SCHED_DEBUG + sysrq_sched_debug_show(); +#endif read_unlock(&tasklist_lock); /* * Only show locks if all tasks are dumped: @@ -4935,7 +4849,7 @@ void show_state_filter(unsigned long state_filter) void __cpuinit init_idle_bootup_task(struct task_struct *idle) { - /* nothing yet */ + idle->sched_class = &idle_sched_class; } /** @@ -4951,13 +4865,12 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - idle->timestamp = sched_clock(); - idle->sleep_avg = 0; - idle->array = NULL; + __sched_fork(idle); + idle->se.exec_start = sched_clock(); + idle->prio = idle->normal_prio = MAX_PRIO; - idle->state = TASK_RUNNING; idle->cpus_allowed = cpumask_of_cpu(cpu); - set_task_cpu(idle, cpu); + __set_task_cpu(idle, cpu); spin_lock_irqsave(&rq->lock, flags); rq->curr = rq->idle = idle; @@ -4972,6 +4885,10 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) #else task_thread_info(idle)->preempt_count = 0; #endif + /* + * The idle tasks have their own, simple scheduling class: + */ + idle->sched_class = &idle_sched_class; } /* @@ -4983,6 +4900,32 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) */ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; +/* + * Increase the granularity value when there are more CPUs, + * because with more CPUs the 'effective latency' as visible + * to users decreases. But the relationship is not linear, + * so pick a second-best guess by going with the log2 of the + * number of CPUs. + * + * This idea comes from the SD scheduler of Con Kolivas: + */ +static inline void sched_init_granularity(void) +{ + unsigned int factor = 1 + ilog2(num_online_cpus()); + const unsigned long limit = 100000000; + + sysctl_sched_min_granularity *= factor; + if (sysctl_sched_min_granularity > limit) + sysctl_sched_min_granularity = limit; + + sysctl_sched_latency *= factor; + if (sysctl_sched_latency > limit) + sysctl_sched_latency = limit; + + sysctl_sched_runtime_limit = sysctl_sched_latency; + sysctl_sched_wakeup_granularity = sysctl_sched_min_granularity / 2; +} + #ifdef CONFIG_SMP /* * This is how migration works: @@ -5056,7 +4999,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed); static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { struct rq *rq_dest, *rq_src; - int ret = 0; + int ret = 0, on_rq; if (unlikely(cpu_is_offline(dest_cpu))) return ret; @@ -5072,20 +5015,14 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (!cpu_isset(dest_cpu, p->cpus_allowed)) goto out; + on_rq = p->se.on_rq; + if (on_rq) + deactivate_task(rq_src, p, 0); + set_task_cpu(p, dest_cpu); - if (p->array) { - /* - * Sync timestamp with rq_dest's before activating. - * The same thing could be achieved by doing this step - * afterwards, and pretending it was a local activate. - * This way is cleaner and logically correct. - */ - p->timestamp = p->timestamp - rq_src->most_recent_timestamp - + rq_dest->most_recent_timestamp; - deactivate_task(p, rq_src); - __activate_task(p, rq_dest); - if (TASK_PREEMPTS_CURR(p, rq_dest)) - resched_task(rq_dest->curr); + if (on_rq) { + activate_task(rq_dest, p, 0); + check_preempt_curr(rq_dest, p); } ret = 1; out: @@ -5111,8 +5048,6 @@ static int migration_thread(void *data) struct migration_req *req; struct list_head *head; - try_to_freeze(); - spin_lock_irq(&rq->lock); if (cpu_is_offline(cpu)) { @@ -5237,7 +5172,8 @@ static void migrate_live_tasks(int src_cpu) write_unlock_irq(&tasklist_lock); } -/* Schedules idle task to be the next runnable task on current CPU. +/* + * Schedules idle task to be the next runnable task on current CPU. * It does so by boosting its priority to highest possible and adding it to * the _front_ of the runqueue. Used by CPU offline code. */ @@ -5257,10 +5193,10 @@ void sched_idle_next(void) */ spin_lock_irqsave(&rq->lock, flags); - __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); + __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); /* Add idle task to the _front_ of its priority queue: */ - __activate_idle_task(p, rq); + activate_idle_task(p, rq); spin_unlock_irqrestore(&rq->lock, flags); } @@ -5310,20 +5246,142 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) static void migrate_dead_tasks(unsigned int dead_cpu) { struct rq *rq = cpu_rq(dead_cpu); - unsigned int arr, i; + struct task_struct *next; - for (arr = 0; arr < 2; arr++) { - for (i = 0; i < MAX_PRIO; i++) { - struct list_head *list = &rq->arrays[arr].queue[i]; + for ( ; ; ) { + if (!rq->nr_running) + break; + update_rq_clock(rq); + next = pick_next_task(rq, rq->curr); + if (!next) + break; + migrate_dead(dead_cpu, next); - while (!list_empty(list)) - migrate_dead(dead_cpu, list_entry(list->next, - struct task_struct, run_list)); - } } } #endif /* CONFIG_HOTPLUG_CPU */ +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) + +static struct ctl_table sd_ctl_dir[] = { + { + .procname = "sched_domain", + .mode = 0555, + }, + {0,}, +}; + +static struct ctl_table sd_ctl_root[] = { + { + .ctl_name = CTL_KERN, + .procname = "kernel", + .mode = 0555, + .child = sd_ctl_dir, + }, + {0,}, +}; + +static struct ctl_table *sd_alloc_ctl_entry(int n) +{ + struct ctl_table *entry = + kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL); + + BUG_ON(!entry); + memset(entry, 0, n * sizeof(struct ctl_table)); + + return entry; +} + +static void +set_table_entry(struct ctl_table *entry, + const char *procname, void *data, int maxlen, + mode_t mode, proc_handler *proc_handler) +{ + entry->procname = procname; + entry->data = data; + entry->maxlen = maxlen; + entry->mode = mode; + entry->proc_handler = proc_handler; +} + +static struct ctl_table * +sd_alloc_ctl_domain_table(struct sched_domain *sd) +{ + struct ctl_table *table = sd_alloc_ctl_entry(14); + + set_table_entry(&table[0], "min_interval", &sd->min_interval, + sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[1], "max_interval", &sd->max_interval, + sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[2], "busy_idx", &sd->busy_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[3], "idle_idx", &sd->idle_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[5], "wake_idx", &sd->wake_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[7], "busy_factor", &sd->busy_factor, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[10], "cache_nice_tries", + &sd->cache_nice_tries, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[12], "flags", &sd->flags, + sizeof(int), 0644, proc_dointvec_minmax); + + return table; +} + +static ctl_table *sd_alloc_ctl_cpu_table(int cpu) +{ + struct ctl_table *entry, *table; + struct sched_domain *sd; + int domain_num = 0, i; + char buf[32]; + + for_each_domain(cpu, sd) + domain_num++; + entry = table = sd_alloc_ctl_entry(domain_num + 1); + + i = 0; + for_each_domain(cpu, sd) { + snprintf(buf, 32, "domain%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_domain_table(sd); + entry++; + i++; + } + return table; +} + +static struct ctl_table_header *sd_sysctl_header; +static void init_sched_domain_sysctl(void) +{ + int i, cpu_num = num_online_cpus(); + struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); + char buf[32]; + + sd_ctl_dir[0].child = entry; + + for (i = 0; i < cpu_num; i++, entry++) { + snprintf(buf, 32, "cpu%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_cpu_table(i); + } + sd_sysctl_header = register_sysctl_table(sd_ctl_root); +} +#else +static void init_sched_domain_sysctl(void) +{ +} +#endif + /* * migration_call - callback that gets triggered when a CPU is added. * Here we can start up the necessary migration thread for the new CPU. @@ -5343,14 +5401,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); + p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); if (IS_ERR(p)) return NOTIFY_BAD; - p->flags |= PF_NOFREEZE; kthread_bind(p, cpu); /* Must be high prio: stop_machine expects to yield to it. */ rq = task_rq_lock(p, &flags); - __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); + __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); task_rq_unlock(rq, &flags); cpu_rq(cpu)->migration_thread = p; break; @@ -5381,9 +5438,11 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) rq->migration_thread = NULL; /* Idle task back to normal (off runqueue, low prio) */ rq = task_rq_lock(rq->idle, &flags); - deactivate_task(rq->idle, rq); + update_rq_clock(rq); + deactivate_task(rq, rq->idle, 0); rq->idle->static_prio = MAX_PRIO; - __setscheduler(rq->idle, SCHED_NORMAL, 0); + __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); + rq->idle->sched_class = &idle_sched_class; migrate_dead_tasks(cpu); task_rq_unlock(rq, &flags); migrate_nr_uninterruptible(rq); @@ -5992,7 +6051,6 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) static int build_sched_domains(const cpumask_t *cpu_map) { int i; - struct sched_domain *sd; #ifdef CONFIG_NUMA struct sched_group **sched_group_nodes = NULL; int sd_allnodes = 0; @@ -6000,7 +6058,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) /* * Allocate the per-node list of sched groups */ - sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, + sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES, GFP_KERNEL); if (!sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); @@ -6019,8 +6077,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) cpus_and(nodemask, nodemask, *cpu_map); #ifdef CONFIG_NUMA - if (cpus_weight(*cpu_map) - > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { + if (cpus_weight(*cpu_map) > + SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { sd = &per_cpu(allnodes_domains, i); *sd = SD_ALLNODES_INIT; sd->span = *cpu_map; @@ -6079,7 +6137,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) if (i != first_cpu(this_sibling_map)) continue; - init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group); + init_sched_build_groups(this_sibling_map, cpu_map, + &cpu_to_cpu_group); } #endif @@ -6090,11 +6149,11 @@ static int build_sched_domains(const cpumask_t *cpu_map) cpus_and(this_core_map, this_core_map, *cpu_map); if (i != first_cpu(this_core_map)) continue; - init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group); + init_sched_build_groups(this_core_map, cpu_map, + &cpu_to_core_group); } #endif - /* Set up physical groups */ for (i = 0; i < MAX_NUMNODES; i++) { cpumask_t nodemask = node_to_cpumask(i); @@ -6109,7 +6168,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) #ifdef CONFIG_NUMA /* Set up node groups */ if (sd_allnodes) - init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group); + init_sched_build_groups(*cpu_map, cpu_map, + &cpu_to_allnodes_group); for (i = 0; i < MAX_NUMNODES; i++) { /* Set up node groups */ @@ -6137,6 +6197,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) sched_group_nodes[i] = sg; for_each_cpu_mask(j, nodemask) { struct sched_domain *sd; + sd = &per_cpu(node_domains, j); sd->groups = sg; } @@ -6181,19 +6242,22 @@ static int build_sched_domains(const cpumask_t *cpu_map) /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT for_each_cpu_mask(i, *cpu_map) { - sd = &per_cpu(cpu_domains, i); + struct sched_domain *sd = &per_cpu(cpu_domains, i); + init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC for_each_cpu_mask(i, *cpu_map) { - sd = &per_cpu(core_domains, i); + struct sched_domain *sd = &per_cpu(core_domains, i); + init_sched_groups_power(i, sd); } #endif for_each_cpu_mask(i, *cpu_map) { - sd = &per_cpu(phys_domains, i); + struct sched_domain *sd = &per_cpu(phys_domains, i); + init_sched_groups_power(i, sd); } @@ -6297,7 +6361,7 @@ int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -int arch_reinit_sched_domains(void) +static int arch_reinit_sched_domains(void) { int err; @@ -6326,24 +6390,6 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) return ret ? ret : count; } -int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) -{ - int err = 0; - -#ifdef CONFIG_SCHED_SMT - if (smt_capable()) - err = sysfs_create_file(&cls->kset.kobj, - &attr_sched_smt_power_savings.attr); -#endif -#ifdef CONFIG_SCHED_MC - if (!err && mc_capable()) - err = sysfs_create_file(&cls->kset.kobj, - &attr_sched_mc_power_savings.attr); -#endif - return err; -} -#endif - #ifdef CONFIG_SCHED_MC static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) { @@ -6354,8 +6400,8 @@ static ssize_t sched_mc_power_savings_store(struct sys_device *dev, { return sched_power_savings_store(buf, count, 0); } -SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, - sched_mc_power_savings_store); +static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, + sched_mc_power_savings_store); #endif #ifdef CONFIG_SCHED_SMT @@ -6368,8 +6414,26 @@ static ssize_t sched_smt_power_savings_store(struct sys_device *dev, { return sched_power_savings_store(buf, count, 1); } -SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, - sched_smt_power_savings_store); +static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, + sched_smt_power_savings_store); +#endif + +int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) +{ + int err = 0; + +#ifdef CONFIG_SCHED_SMT + if (smt_capable()) + err = sysfs_create_file(&cls->kset.kobj, + &attr_sched_smt_power_savings.attr); +#endif +#ifdef CONFIG_SCHED_MC + if (!err && mc_capable()) + err = sysfs_create_file(&cls->kset.kobj, + &attr_sched_mc_power_savings.attr); +#endif + return err; +} #endif /* @@ -6424,13 +6488,17 @@ void __init sched_init_smp(void) /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); + init_sched_domain_sysctl(); + /* Move init over to a non-isolated CPU */ if (set_cpus_allowed(current, non_isolated_cpus) < 0) BUG(); + sched_init_granularity(); } #else void __init sched_init_smp(void) { + sched_init_granularity(); } #endif /* CONFIG_SMP */ @@ -6444,28 +6512,51 @@ int in_sched_functions(unsigned long addr) && addr < (unsigned long)__sched_text_end); } +static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) +{ + cfs_rq->tasks_timeline = RB_ROOT; + cfs_rq->fair_clock = 1; +#ifdef CONFIG_FAIR_GROUP_SCHED + cfs_rq->rq = rq; +#endif +} + void __init sched_init(void) { - int i, j, k; + u64 now = sched_clock(); int highest_cpu = 0; + int i, j; + + /* + * Link up the scheduling class hierarchy: + */ + rt_sched_class.next = &fair_sched_class; + fair_sched_class.next = &idle_sched_class; + idle_sched_class.next = NULL; for_each_possible_cpu(i) { - struct prio_array *array; + struct rt_prio_array *array; struct rq *rq; rq = cpu_rq(i); spin_lock_init(&rq->lock); lockdep_set_class(&rq->lock, &rq->rq_lock_key); rq->nr_running = 0; - rq->active = rq->arrays; - rq->expired = rq->arrays + 1; - rq->best_expired_prio = MAX_PRIO; + rq->clock = 1; + init_cfs_rq(&rq->cfs, rq); +#ifdef CONFIG_FAIR_GROUP_SCHED + INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); + list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); +#endif + rq->ls.load_update_last = now; + rq->ls.load_update_start = now; + for (j = 0; j < CPU_LOAD_IDX_MAX; j++) + rq->cpu_load[j] = 0; #ifdef CONFIG_SMP rq->sd = NULL; - for (j = 1; j < 3; j++) - rq->cpu_load[j] = 0; rq->active_balance = 0; + rq->next_balance = jiffies; rq->push_cpu = 0; rq->cpu = i; rq->migration_thread = NULL; @@ -6473,20 +6564,22 @@ void __init sched_init(void) #endif atomic_set(&rq->nr_iowait, 0); - for (j = 0; j < 2; j++) { - array = rq->arrays + j; - for (k = 0; k < MAX_PRIO; k++) { - INIT_LIST_HEAD(array->queue + k); - __clear_bit(k, array->bitmap); - } - // delimiter for bitsearch - __set_bit(MAX_PRIO, array->bitmap); + array = &rq->rt.active; + for (j = 0; j < MAX_RT_PRIO; j++) { + INIT_LIST_HEAD(array->queue + j); + __clear_bit(j, array->bitmap); } highest_cpu = i; + /* delimiter for bitsearch: */ + __set_bit(MAX_RT_PRIO, array->bitmap); } set_load_weight(&init_task); +#ifdef CONFIG_PREEMPT_NOTIFIERS + INIT_HLIST_HEAD(&init_task.preempt_notifiers); +#endif + #ifdef CONFIG_SMP nr_cpu_ids = highest_cpu + 1; open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); @@ -6509,6 +6602,10 @@ void __init sched_init(void) * when this runqueue becomes "idle". */ init_idle(current, smp_processor_id()); + /* + * During early bootup we pretend to be a normal task: + */ + current->sched_class = &fair_sched_class; } #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP @@ -6539,29 +6636,58 @@ EXPORT_SYMBOL(__might_sleep); #ifdef CONFIG_MAGIC_SYSRQ void normalize_rt_tasks(void) { - struct prio_array *array; struct task_struct *g, *p; unsigned long flags; struct rq *rq; + int on_rq; read_lock_irq(&tasklist_lock); - do_each_thread(g, p) { - if (!rt_task(p)) + p->se.fair_key = 0; + p->se.wait_runtime = 0; + p->se.exec_start = 0; + p->se.wait_start_fair = 0; + p->se.sleep_start_fair = 0; +#ifdef CONFIG_SCHEDSTATS + p->se.wait_start = 0; + p->se.sleep_start = 0; + p->se.block_start = 0; +#endif + task_rq(p)->cfs.fair_clock = 0; + task_rq(p)->clock = 0; + + if (!rt_task(p)) { + /* + * Renice negative nice level userspace + * tasks back to 0: + */ + if (TASK_NICE(p) < 0 && p->mm) + set_user_nice(p, 0); continue; + } spin_lock_irqsave(&p->pi_lock, flags); rq = __task_rq_lock(p); +#ifdef CONFIG_SMP + /* + * Do not touch the migration thread: + */ + if (p == rq->migration_thread) + goto out_unlock; +#endif - array = p->array; - if (array) - deactivate_task(p, task_rq(p)); - __setscheduler(p, SCHED_NORMAL, 0); - if (array) { - __activate_task(p, task_rq(p)); + update_rq_clock(rq); + on_rq = p->se.on_rq; + if (on_rq) + deactivate_task(rq, p, 0); + __setscheduler(rq, p, SCHED_NORMAL, 0); + if (on_rq) { + activate_task(rq, p, 0); resched_task(rq->curr); } - +#ifdef CONFIG_SMP + out_unlock: +#endif __task_rq_unlock(rq); spin_unlock_irqrestore(&p->pi_lock, flags); } while_each_thread(g, p);