X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=kernel%2Fsched_fair.c;h=6c091d6e159d01fb23c0dd786495c533cb005c5b;hb=d9ae90ac4bdce769ddb27c2e24c3351a30c3daf8;hp=895fef74d99a7c8c8843c546be35be62996022b8;hpb=9014623c0e3545be58a7f19f55793f6517bdc274;p=safe%2Fjmp%2Flinux-2.6 diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 895fef7..6c091d6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -20,33 +20,38 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ +#include + /* * Targeted preemption latency for CPU-bound tasks: - * (default: 20ms, units: nanoseconds) + * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) * * NOTE: this latency value is not the same as the concept of - * 'timeslice length' - timeslices in CFS are of variable length. - * (to see the precise effective timeslice length of your workload, - * run vmstat and monitor the context-switches field) + * 'timeslice length' - timeslices in CFS are of variable length + * and have no persistent notion like in traditional, time-slice + * based scheduling concepts. * - * On SMP systems the value of this is multiplied by the log2 of the - * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way - * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) - * Targeted preemption latency for CPU-bound tasks: + * (to see the precise effective timeslice length of your workload, + * run vmstat and monitor the context-switches (cs) field) */ -const_debug unsigned int sysctl_sched_latency = 20000000ULL; +unsigned int sysctl_sched_latency = 20000000ULL; /* - * After fork, child runs first. (default) If set to 0 then - * parent will (try to) run first. + * Minimal preemption granularity for CPU-bound tasks: + * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -const_debug unsigned int sysctl_sched_child_runs_first = 1; +unsigned int sysctl_sched_min_granularity = 4000000ULL; /* - * Minimal preemption granularity for CPU-bound tasks: - * (default: 2 msec, units: nanoseconds) + * is kept at sysctl_sched_latency / sysctl_sched_min_granularity + */ +static unsigned int sched_nr_latency = 5; + +/* + * After fork, child runs first. (default) If set to 0 then + * parent will (try to) run first. */ -unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL; +const_debug unsigned int sysctl_sched_child_runs_first = 1; /* * sys_sched_yield() compat mode @@ -58,27 +63,25 @@ unsigned int __read_mostly sysctl_sched_compat_yield; /* * SCHED_BATCH wake-up granularity. - * (default: 25 msec, units: nanoseconds) + * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 25000000UL; +unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; /* * SCHED_OTHER wake-up granularity. - * (default: 1 msec, units: nanoseconds) + * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -const_debug unsigned int sysctl_sched_wakeup_granularity = 2000000UL; - -unsigned int sysctl_sched_runtime_limit __read_mostly; +unsigned int sysctl_sched_wakeup_granularity = 10000000UL; -extern struct sched_class fair_sched_class; +const_debug unsigned int sysctl_sched_migration_cost = 500000UL; /************************************************************** * CFS operations on generic schedulable entities: @@ -116,31 +119,33 @@ static inline struct task_struct *task_of(struct sched_entity *se) * Scheduling class tree data structure manipulation methods: */ -static inline void -set_leftmost(struct cfs_rq *cfs_rq, struct rb_node *leftmost) +static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime) { - struct sched_entity *se; + s64 delta = (s64)(vruntime - min_vruntime); + if (delta > 0) + min_vruntime = vruntime; - cfs_rq->rb_leftmost = leftmost; - if (leftmost) { - se = rb_entry(leftmost, struct sched_entity, run_node); - if ((se->vruntime > cfs_rq->min_vruntime) || - (cfs_rq->min_vruntime > (1ULL << 61) && - se->vruntime < (1ULL << 50))) - cfs_rq->min_vruntime = se->vruntime; - } + return min_vruntime; } -s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) +static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) { - return se->fair_key - cfs_rq->min_vruntime; + s64 delta = (s64)(vruntime - min_vruntime); + if (delta < 0) + min_vruntime = vruntime; + + return min_vruntime; +} + +static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return se->vruntime - cfs_rq->min_vruntime; } /* * Enqueue an entity into the rb-tree: */ -static void -__enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; struct rb_node *parent = NULL; @@ -171,29 +176,18 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * used): */ if (leftmost) - set_leftmost(cfs_rq, &se->run_node); + cfs_rq->rb_leftmost = &se->run_node; rb_link_node(&se->run_node, parent, link); rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); - update_load_add(&cfs_rq->load, se->load.weight); - cfs_rq->nr_running++; - se->on_rq = 1; - - schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); } -static void -__dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (cfs_rq->rb_leftmost == &se->run_node) - set_leftmost(cfs_rq, rb_next(&se->run_node)); + cfs_rq->rb_leftmost = rb_next(&se->run_node); rb_erase(&se->run_node, &cfs_rq->tasks_timeline); - update_load_sub(&cfs_rq->load, se->load.weight); - cfs_rq->nr_running--; - se->on_rq = 0; - - schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); } static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) @@ -225,65 +219,84 @@ static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) * Scheduling class statistics methods: */ +#ifdef CONFIG_SCHED_DEBUG +int sched_nr_latency_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + + if (ret || !write) + return ret; + + sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, + sysctl_sched_min_granularity); + + return 0; +} +#endif + +/* + * The idea is to set a period in which each task runs once. + * + * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch + * this period because otherwise the slices get too small. + * + * p = (nr <= nl) ? l : l*nr/nl + */ static u64 __sched_period(unsigned long nr_running) { u64 period = sysctl_sched_latency; - unsigned long nr_latency = - sysctl_sched_latency / sysctl_sched_min_granularity; + unsigned long nr_latency = sched_nr_latency; if (unlikely(nr_running > nr_latency)) { + period = sysctl_sched_min_granularity; period *= nr_running; - do_div(period, nr_latency); } return period; } +/* + * We calculate the wall-time slice from the period by taking a part + * proportional to the weight. + * + * s = p*w/rw + */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 period = __sched_period(cfs_rq->nr_running); + u64 slice = __sched_period(cfs_rq->nr_running); - period *= se->load.weight; - do_div(period, cfs_rq->load.weight); + slice *= se->load.weight; + do_div(slice, cfs_rq->load.weight); - return period; + return slice; } -static inline void -limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se) +/* + * We calculate the vruntime slice. + * + * vs = s/w = p/rw + */ +static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) { - long limit = sysctl_sched_runtime_limit; + u64 vslice = __sched_period(nr_running); - /* - * Niced tasks have the same history dynamic range as - * non-niced tasks: - */ - if (unlikely(se->wait_runtime > limit)) { - se->wait_runtime = limit; - schedstat_inc(se, wait_runtime_overruns); - schedstat_inc(cfs_rq, wait_runtime_overruns); - } - if (unlikely(se->wait_runtime < -limit)) { - se->wait_runtime = -limit; - schedstat_inc(se, wait_runtime_underruns); - schedstat_inc(cfs_rq, wait_runtime_underruns); - } + vslice *= NICE_0_LOAD; + do_div(vslice, rq_weight); + + return vslice; } -static inline void -__add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) +static u64 sched_vslice(struct cfs_rq *cfs_rq) { - se->wait_runtime += delta; - schedstat_add(se, sum_wait_runtime, delta); - limit_wait_runtime(cfs_rq, se); + return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running); } -static void -add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) +static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) { - schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); - __add_wait_runtime(cfs_rq, se, delta); - schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); + return __sched_vslice(cfs_rq->load.weight + se->load.weight, + cfs_rq->nr_running + 1); } /* @@ -294,14 +307,13 @@ static inline void __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, unsigned long delta_exec) { - unsigned long delta, delta_fair, delta_mine, delta_exec_weighted; - struct load_weight *lw = &cfs_rq->load; - unsigned long load = lw->weight; + unsigned long delta_exec_weighted; + u64 vruntime; schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); curr->sum_exec_runtime += delta_exec; - cfs_rq->exec_clock += delta_exec; + schedstat_add(cfs_rq, exec_clock, delta_exec); delta_exec_weighted = delta_exec; if (unlikely(curr->load.weight != NICE_0_LOAD)) { delta_exec_weighted = calc_delta_fair(delta_exec_weighted, @@ -309,32 +321,18 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, } curr->vruntime += delta_exec_weighted; - if (!sched_feat(FAIR_SLEEPERS)) - return; - - if (unlikely(!load)) - return; - - delta_fair = calc_delta_fair(delta_exec, lw); - delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); - - if (cfs_rq->sleeper_bonus > sysctl_sched_min_granularity) { - delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); - delta = min(delta, (unsigned long)( - (long)sysctl_sched_runtime_limit - curr->wait_runtime)); - cfs_rq->sleeper_bonus -= delta; - delta_mine -= delta; - } - - cfs_rq->fair_clock += delta_fair; /* - * We executed delta_exec amount of time on the CPU, - * but we were only entitled to delta_mine amount of - * time during that period (if nr_running == 1 then - * the two values are equal) - * [Note: delta_mine - delta_exec is negative]: + * maintain cfs_rq->min_vruntime to be a monotonic increasing + * value tracking the leftmost vruntime in the tree. */ - add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec); + if (first_fair(cfs_rq)) { + vruntime = min_vruntime(curr->vruntime, + __pick_next_entity(cfs_rq)->vruntime); + } else + vruntime = curr->vruntime; + + cfs_rq->min_vruntime = + max_vruntime(cfs_rq->min_vruntime, vruntime); } static void update_curr(struct cfs_rq *cfs_rq) @@ -355,26 +353,20 @@ static void update_curr(struct cfs_rq *cfs_rq) __update_curr(cfs_rq, curr, delta_exec); curr->exec_start = now; + + if (entity_is_task(curr)) { + struct task_struct *curtask = task_of(curr); + + cpuacct_charge(curtask, delta_exec); + } } static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - se->wait_start_fair = cfs_rq->fair_clock; schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); } -static inline unsigned long -calc_weighted(unsigned long delta, struct sched_entity *se) -{ - unsigned long weight = se->load.weight; - - if (unlikely(weight != NICE_0_LOAD)) - return (u64)delta * se->load.weight >> NICE_0_SHIFT; - else - return delta; -} - /* * Task is being enqueued - update stats: */ @@ -386,48 +378,22 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) */ if (se != cfs_rq->curr) update_stats_wait_start(cfs_rq, se); - /* - * Update the key: - */ - se->fair_key = se->vruntime; -} - -/* - * Note: must be called with a freshly updated rq->fair_clock. - */ -static inline void -__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, - unsigned long delta_fair) -{ - schedstat_set(se->wait_max, max(se->wait_max, - rq_of(cfs_rq)->clock - se->wait_start)); - - delta_fair = calc_weighted(delta_fair, se); - - add_wait_runtime(cfs_rq, se, delta_fair); } static void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned long delta_fair; - - if (unlikely(!se->wait_start_fair)) - return; - - delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), - (u64)(cfs_rq->fair_clock - se->wait_start_fair)); - - __update_stats_wait_end(cfs_rq, se, delta_fair); - - se->wait_start_fair = 0; + schedstat_set(se->wait_max, max(se->wait_max, + rq_of(cfs_rq)->clock - se->wait_start)); + schedstat_set(se->wait_count, se->wait_count + 1); + schedstat_set(se->wait_sum, se->wait_sum + + rq_of(cfs_rq)->clock - se->wait_start); schedstat_set(se->wait_start, 0); } static inline void update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { - update_curr(cfs_rq); /* * Mark the end of the wait period if dequeueing a * waiting task: @@ -448,74 +414,32 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) se->exec_start = rq_of(cfs_rq)->clock; } -/* - * We are descheduling a task - update its stats: - */ -static inline void -update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - se->exec_start = 0; -} - /************************************************** * Scheduling class queueing methods: */ -static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, - unsigned long delta_fair) +static void +account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned long load = cfs_rq->load.weight; - long prev_runtime; - - /* - * Do not boost sleepers if there's too much bonus 'in flight' - * already: - */ - if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit)) - return; - - if (sched_feat(SLEEPER_LOAD_AVG)) - load = rq_of(cfs_rq)->cpu_load[2]; - - /* - * Fix up delta_fair with the effect of us running - * during the whole sleep period: - */ - if (sched_feat(SLEEPER_AVG)) - delta_fair = div64_likely32((u64)delta_fair * load, - load + se->load.weight); - - delta_fair = calc_weighted(delta_fair, se); - - prev_runtime = se->wait_runtime; - __add_wait_runtime(cfs_rq, se, delta_fair); - delta_fair = se->wait_runtime - prev_runtime; + update_load_add(&cfs_rq->load, se->load.weight); + cfs_rq->nr_running++; + se->on_rq = 1; +} - /* - * Track the amount of bonus we've given to sleepers: - */ - cfs_rq->sleeper_bonus += delta_fair; +static void +account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + update_load_sub(&cfs_rq->load, se->load.weight); + cfs_rq->nr_running--; + se->on_rq = 0; } static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct task_struct *tsk = task_of(se); - unsigned long delta_fair; - - if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) || - !sched_feat(FAIR_SLEEPERS)) - return; - - delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), - (u64)(cfs_rq->fair_clock - se->sleep_start_fair)); - - __enqueue_sleeper(cfs_rq, se, delta_fair); - - se->sleep_start_fair = 0; - #ifdef CONFIG_SCHEDSTATS if (se->sleep_start) { u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; + struct task_struct *tsk = task_of(se); if ((s64)delta < 0) delta = 0; @@ -525,9 +449,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) se->sleep_start = 0; se->sum_sleep_runtime += delta; + + account_scheduler_latency(tsk, delta >> 10, 1); } if (se->block_start) { u64 delta = rq_of(cfs_rq)->clock - se->block_start; + struct task_struct *tsk = task_of(se); if ((s64)delta < 0) delta = 0; @@ -544,49 +471,70 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) * time that the task spent sleeping: */ if (unlikely(prof_on == SLEEP_PROFILING)) { + profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), delta >> 20); } + account_scheduler_latency(tsk, delta >> 10, 0); } #endif } +static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +#ifdef CONFIG_SCHED_DEBUG + s64 d = se->vruntime - cfs_rq->min_vruntime; + + if (d < 0) + d = -d; + + if (d > 3*sysctl_sched_latency) + schedstat_inc(cfs_rq, nr_spread_over); +#endif +} + static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { - u64 min_runtime, latency; + u64 vruntime; - min_runtime = cfs_rq->min_vruntime; + vruntime = cfs_rq->min_vruntime; - if (sched_feat(USE_TREE_AVG)) { + if (sched_feat(TREE_AVG)) { struct sched_entity *last = __pick_last_entity(cfs_rq); if (last) { - min_runtime = __pick_next_entity(cfs_rq)->vruntime; - min_runtime += last->vruntime; - min_runtime >>= 1; + vruntime += last->vruntime; + vruntime >>= 1; } - } else if (sched_feat(APPROX_AVG)) - min_runtime += sysctl_sched_latency/2; + } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running) + vruntime += sched_vslice(cfs_rq)/2; + /* + * The 'current' period is already promised to the current tasks, + * however the extra weight of the new task will slow them down a + * little, place the new task so that it fits in the slot that + * stays open at the end. + */ if (initial && sched_feat(START_DEBIT)) - min_runtime += sched_slice(cfs_rq, se); + vruntime += sched_vslice_add(cfs_rq, se); - if (!initial && sched_feat(NEW_FAIR_SLEEPERS)) { - latency = sysctl_sched_latency; - if (min_runtime > latency) - min_runtime -= latency; - else - min_runtime = 0; + if (!initial) { + /* sleeps upto a single latency don't count. */ + if (sched_feat(NEW_FAIR_SLEEPERS)) + vruntime -= sysctl_sched_latency; + + /* ensure we never gain time by being placed backwards. */ + vruntime = max_vruntime(se->vruntime, vruntime); } - se->vruntime = max(se->vruntime, min_runtime); + se->vruntime = vruntime; } static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) { /* - * Update the fair clock. + * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); @@ -596,15 +544,22 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) } update_stats_enqueue(cfs_rq, se); - __enqueue_entity(cfs_rq, se); + check_spread(cfs_rq, se); + if (se != cfs_rq->curr) + __enqueue_entity(cfs_rq, se); + account_entity_enqueue(cfs_rq, se); } static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + update_stats_dequeue(cfs_rq, se); if (sleep) { - se->sleep_start_fair = cfs_rq->fair_clock; #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); @@ -616,7 +571,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) } #endif } - __dequeue_entity(cfs_rq, se); + + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + account_entity_dequeue(cfs_rq, se); } /* @@ -633,17 +591,20 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) resched_task(rq_of(cfs_rq)->curr); } -static inline void +static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - /* - * Any task has to be enqueued before it get to execute on - * a CPU. So account for the time it spent waiting on the - * runqueue. (note, here we rely on pick_next_task() having - * done a put_prev_task_fair() shortly before this, which - * updated rq->fair_clock - used by update_stats_wait_end()) - */ - update_stats_wait_end(cfs_rq, se); + /* 'current' is not kept within the tree. */ + if (se->on_rq) { + /* + * Any task has to be enqueued before it get to execute on + * a CPU. So account for the time it spent waiting on the + * runqueue. + */ + update_stats_wait_end(cfs_rq, se); + __dequeue_entity(cfs_rq, se); + } + update_stats_curr_start(cfs_rq, se); cfs_rq->curr = se; #ifdef CONFIG_SCHEDSTATS @@ -652,7 +613,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * least twice that of our own weight (i.e. dont track it * when there are only lesser-weight tasks around): */ - if (rq_of(cfs_rq)->ls.load.weight >= 2*se->load.weight) { + if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { se->slice_max = max(se->slice_max, se->sum_exec_runtime - se->prev_sum_exec_runtime); } @@ -662,9 +623,12 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { - struct sched_entity *se = __pick_next_entity(cfs_rq); + struct sched_entity *se = NULL; - set_next_entity(cfs_rq, se); + if (first_fair(cfs_rq)) { + se = __pick_next_entity(cfs_rq); + set_next_entity(cfs_rq, se); + } return se; } @@ -678,23 +642,39 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) if (prev->on_rq) update_curr(cfs_rq); - update_stats_curr_end(cfs_rq, prev); - - if (prev->on_rq) + check_spread(cfs_rq, prev); + if (prev->on_rq) { update_stats_wait_start(cfs_rq, prev); + /* Put 'current' back into the tree. */ + __enqueue_entity(cfs_rq, prev); + } cfs_rq->curr = NULL; } -static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) +static void +entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) { /* - * Dequeue and enqueue the task to update its - * position within the tree: + * Update run-time statistics of the 'current'. */ - dequeue_entity(cfs_rq, curr, 0); - enqueue_entity(cfs_rq, curr, 0); + update_curr(cfs_rq); - if (cfs_rq->nr_running > 1) +#ifdef CONFIG_SCHED_HRTICK + /* + * queued ticks are scheduled to match the slice, so don't bother + * validating it and just reschedule. + */ + if (queued) + return resched_task(rq_of(cfs_rq)->curr); + /* + * don't let the period tick interfere with the hrtick preemption + */ + if (!sched_feat(DOUBLE_TICK) && + hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) + return; +#endif + + if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) check_preempt_tick(cfs_rq, curr); } @@ -730,23 +710,30 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) */ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) { - /* A later patch will take group into account */ - return &cpu_rq(this_cpu)->cfs; + return cfs_rq->tg->cfs_rq[this_cpu]; } /* Iterate thr' all leaf cfs_rq's on a runqueue */ #define for_each_leaf_cfs_rq(rq, cfs_rq) \ - list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) + list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) -/* Do the two (enqueued) tasks belong to the same group ? */ -static inline int is_same_group(struct task_struct *curr, struct task_struct *p) +/* Do the two (enqueued) entities belong to the same group ? */ +static inline int +is_same_group(struct sched_entity *se, struct sched_entity *pse) { - if (curr->se.cfs_rq == p->se.cfs_rq) + if (se->cfs_rq == pse->cfs_rq) return 1; return 0; } +static inline struct sched_entity *parent_entity(struct sched_entity *se) +{ + return se->parent; +} + +#define GROUP_IMBALANCE_PCT 20 + #else /* CONFIG_FAIR_GROUP_SCHED */ #define for_each_sched_entity(se) \ @@ -779,13 +766,56 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) #define for_each_leaf_cfs_rq(rq, cfs_rq) \ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) -static inline int is_same_group(struct task_struct *curr, struct task_struct *p) +static inline int +is_same_group(struct sched_entity *se, struct sched_entity *pse) { return 1; } +static inline struct sched_entity *parent_entity(struct sched_entity *se) +{ + return NULL; +} + #endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_SCHED_HRTICK +static void hrtick_start_fair(struct rq *rq, struct task_struct *p) +{ + int requeue = rq->curr == p; + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + WARN_ON(task_rq(p) != rq); + + if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) { + u64 slice = sched_slice(cfs_rq, se); + u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; + s64 delta = slice - ran; + + if (delta < 0) { + if (rq->curr == p) + resched_task(p); + return; + } + + /* + * Don't schedule slices shorter than 10000ns, that just + * doesn't make sense. Rely on vruntime for fairness. + */ + if (!requeue) + delta = max(10000LL, delta); + + hrtick_start(rq, delta, requeue); + } +} +#else +static inline void +hrtick_start_fair(struct rq *rq, struct task_struct *p) +{ +} +#endif + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -794,14 +824,28 @@ static inline int is_same_group(struct task_struct *curr, struct task_struct *p) static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) { struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se; + struct sched_entity *se = &p->se, + *topse = NULL; /* Highest schedulable entity */ + int incload = 1; for_each_sched_entity(se) { - if (se->on_rq) + topse = se; + if (se->on_rq) { + incload = 0; break; + } cfs_rq = cfs_rq_of(se); enqueue_entity(cfs_rq, se, wakeup); + wakeup = 1; } + /* Increment cpu load if we just enqueued the first task of a group on + * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs + * at the highest grouping level. + */ + if (incload) + inc_cpu_load(rq, topse->load.weight); + + hrtick_start_fair(rq, rq->curr); } /* @@ -812,15 +856,30 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) { struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se; + struct sched_entity *se = &p->se, + *topse = NULL; /* Highest schedulable entity */ + int decload = 1; for_each_sched_entity(se) { + topse = se; cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, sleep); /* Don't dequeue parent if it has other entities besides us */ - if (cfs_rq->load.weight) + if (cfs_rq->load.weight) { + if (parent_entity(se)) + decload = 0; break; + } + sleep = 1; } + /* Decrement cpu load if we just dequeued the last task of a group on + * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs + * at the highest grouping level. + */ + if (decload) + dec_cpu_load(rq, topse->load.weight); + + hrtick_start_fair(rq, rq->curr); } /* @@ -828,12 +887,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) * * If compat_yield is turned on then we requeue to the end of the tree. */ -static void yield_task_fair(struct rq *rq, struct task_struct *p) +static void yield_task_fair(struct rq *rq) { - struct cfs_rq *cfs_rq = task_cfs_rq(p); - struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; - struct sched_entity *rightmost, *se = &p->se; - struct rb_node *parent; + struct task_struct *curr = rq->curr; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); + struct sched_entity *rightmost, *se = &curr->se; /* * Are we the only task in the tree? @@ -841,46 +899,180 @@ static void yield_task_fair(struct rq *rq, struct task_struct *p) if (unlikely(cfs_rq->nr_running == 1)) return; - if (likely(!sysctl_sched_compat_yield)) { + if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { __update_rq_clock(rq); /* - * Dequeue and enqueue the task to update its - * position within the tree: + * Update run-time statistics of the 'current'. */ - dequeue_entity(cfs_rq, &p->se, 0); - enqueue_entity(cfs_rq, &p->se, 0); + update_curr(cfs_rq); return; } /* * Find the rightmost entry in the rbtree: */ - do { - parent = *link; - link = &parent->rb_right; - } while (*link); - - rightmost = rb_entry(parent, struct sched_entity, run_node); + rightmost = __pick_last_entity(cfs_rq); /* * Already in the rightmost position? */ - if (unlikely(rightmost == se)) + if (unlikely(rightmost->vruntime < se->vruntime)) return; /* * Minimally necessary key value to be last in the tree: + * Upon rescheduling, sched_class::put_prev_task() will place + * 'current' within the tree based on its new key value. */ - se->fair_key = rightmost->fair_key + 1; + se->vruntime = rightmost->vruntime + 1; +} + +/* + * wake_idle() will wake a task on an idle cpu if task->cpu is + * not idle and an idle cpu is available. The span of cpus to + * search starts with cpus closest then further out as needed, + * so we always favor a closer, idle cpu. + * + * Returns the CPU we should wake onto. + */ +#if defined(ARCH_HAS_SCHED_WAKE_IDLE) +static int wake_idle(int cpu, struct task_struct *p) +{ + cpumask_t tmp; + struct sched_domain *sd; + int i; - if (cfs_rq->rb_leftmost == &se->run_node) - cfs_rq->rb_leftmost = rb_next(&se->run_node); /* - * Relink the task to the rightmost position: + * If it is idle, then it is the best cpu to run this task. + * + * This cpu is also the best, if it has more than one task already. + * Siblings must be also busy(in most cases) as they didn't already + * pickup the extra load from this cpu and hence we need not check + * sibling runqueue info. This will avoid the checks and cache miss + * penalities associated with that. */ - rb_erase(&se->run_node, &cfs_rq->tasks_timeline); - rb_link_node(&se->run_node, parent, link); - rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); + if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) + return cpu; + + for_each_domain(cpu, sd) { + if (sd->flags & SD_WAKE_IDLE) { + cpus_and(tmp, sd->span, p->cpus_allowed); + for_each_cpu_mask(i, tmp) { + if (idle_cpu(i)) { + if (i != task_cpu(p)) { + schedstat_inc(p, + se.nr_wakeups_idle); + } + return i; + } + } + } else { + break; + } + } + return cpu; +} +#else +static inline int wake_idle(int cpu, struct task_struct *p) +{ + return cpu; } +#endif + +#ifdef CONFIG_SMP +static int select_task_rq_fair(struct task_struct *p, int sync) +{ + int cpu, this_cpu; + struct rq *rq; + struct sched_domain *sd, *this_sd = NULL; + int new_cpu; + + cpu = task_cpu(p); + rq = task_rq(p); + this_cpu = smp_processor_id(); + new_cpu = cpu; + + if (cpu == this_cpu) + goto out_set_cpu; + + for_each_domain(this_cpu, sd) { + if (cpu_isset(cpu, sd->span)) { + this_sd = sd; + break; + } + } + + if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) + goto out_set_cpu; + + /* + * Check for affine wakeup and passive balancing possibilities. + */ + if (this_sd) { + int idx = this_sd->wake_idx; + unsigned int imbalance; + unsigned long load, this_load; + + imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; + + load = source_load(cpu, idx); + this_load = target_load(this_cpu, idx); + + new_cpu = this_cpu; /* Wake to this CPU if we can */ + + if (this_sd->flags & SD_WAKE_AFFINE) { + unsigned long tl = this_load; + unsigned long tl_per_task; + + /* + * Attract cache-cold tasks on sync wakeups: + */ + if (sync && !task_hot(p, rq->clock, this_sd)) + goto out_set_cpu; + + schedstat_inc(p, se.nr_wakeups_affine_attempts); + tl_per_task = cpu_avg_load_per_task(this_cpu); + + /* + * If sync wakeup then subtract the (maximum possible) + * effect of the currently running task from the load + * of the current CPU: + */ + if (sync) + tl -= current->se.load.weight; + + if ((tl <= load && + tl + target_load(cpu, idx) <= tl_per_task) || + 100*(tl + p->se.load.weight) <= imbalance*load) { + /* + * This domain has SD_WAKE_AFFINE and + * p is cache cold in this domain, and + * there is no bad imbalance. + */ + schedstat_inc(this_sd, ttwu_move_affine); + schedstat_inc(p, se.nr_wakeups_affine); + goto out_set_cpu; + } + } + + /* + * Start passive balancing when half the imbalance_pct + * limit is reached. + */ + if (this_sd->flags & SD_WAKE_BALANCE) { + if (imbalance*this_load <= 100*load) { + schedstat_inc(this_sd, ttwu_move_balance); + schedstat_inc(p, se.nr_wakeups_passive); + goto out_set_cpu; + } + } + } + + new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ +out_set_cpu: + return wake_idle(new_cpu, p); +} +#endif /* CONFIG_SMP */ + /* * Preempt the current task with a newly woken task if needed: @@ -889,6 +1081,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) { struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); + struct sched_entity *se = &curr->se, *pse = &p->se; + unsigned long gran; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -896,16 +1090,36 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) resched_task(curr); return; } - if (is_same_group(curr, p)) { - s64 delta = curr->se.vruntime - p->se.vruntime; + /* + * Batch tasks do not preempt (their preemption is driven by + * the tick): + */ + if (unlikely(p->policy == SCHED_BATCH)) + return; + + if (!sched_feat(WAKEUP_PREEMPT)) + return; - if (delta > (s64)sysctl_sched_wakeup_granularity) - resched_task(curr); + while (!is_same_group(se, pse)) { + se = parent_entity(se); + pse = parent_entity(pse); } + + gran = sysctl_sched_wakeup_granularity; + /* + * More easily preempt - nice tasks, while not making + * it harder for + nice tasks. + */ + if (unlikely(se->load.weight > NICE_0_LOAD)) + gran = calc_delta_fair(gran, &se->load); + + if (pse->vruntime + gran < se->vruntime) + resched_task(curr); } static struct task_struct *pick_next_task_fair(struct rq *rq) { + struct task_struct *p; struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; @@ -917,7 +1131,10 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) cfs_rq = group_cfs_rq(se); } while (cfs_rq); - return task_of(se); + p = task_of(se); + hrtick_start_fair(rq, p); + + return p; } /* @@ -934,6 +1151,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) } } +#ifdef CONFIG_SMP /************************************************** * Fair scheduling class load-balancing methods: */ @@ -945,7 +1163,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) * achieve that by always pre-iterating before returning * the current task: */ -static inline struct task_struct * +static struct task_struct * __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) { struct task_struct *p; @@ -973,91 +1191,136 @@ static struct task_struct *load_balance_next_fair(void *arg) return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); } -#ifdef CONFIG_FAIR_GROUP_SCHED -static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) -{ - struct sched_entity *curr; - struct task_struct *p; - - if (!cfs_rq->nr_running) - return MAX_PRIO; - - curr = __pick_next_entity(cfs_rq); - p = task_of(curr); - - return p->prio; -} -#endif - static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_nr_move, unsigned long max_load_move, + unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio) { struct cfs_rq *busy_cfs_rq; - unsigned long load_moved, total_nr_moved = 0, nr_moved; long rem_load_move = max_load_move; struct rq_iterator cfs_rq_iterator; + unsigned long load_moved; cfs_rq_iterator.start = load_balance_start_fair; cfs_rq_iterator.next = load_balance_next_fair; for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { #ifdef CONFIG_FAIR_GROUP_SCHED - struct cfs_rq *this_cfs_rq; - long imbalance; - unsigned long maxload; + struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu]; + unsigned long maxload, task_load, group_weight; + unsigned long thisload, per_task_load; + struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu]; - this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); + task_load = busy_cfs_rq->load.weight; + group_weight = se->load.weight; - imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; - /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ - if (imbalance <= 0) + /* + * 'group_weight' is contributed by tasks of total weight + * 'task_load'. To move 'rem_load_move' worth of weight only, + * we need to move a maximum task load of: + * + * maxload = (remload / group_weight) * task_load; + */ + maxload = (rem_load_move * task_load) / group_weight; + + if (!maxload || !task_load) continue; - /* Don't pull more than imbalance/2 */ - imbalance /= 2; - maxload = min(rem_load_move, imbalance); + per_task_load = task_load / busy_cfs_rq->nr_running; + /* + * balance_tasks will try to forcibly move atleast one task if + * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if + * maxload is less than GROUP_IMBALANCE_FUZZ% the per_task_load. + */ + if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load) + continue; - *this_best_prio = cfs_rq_best_prio(this_cfs_rq); + /* Disable priority-based load balance */ + *this_best_prio = 0; + thisload = this_cfs_rq->load.weight; #else # define maxload rem_load_move #endif - /* pass busy_cfs_rq argument into + /* + * pass busy_cfs_rq argument into * load_balance_[start|next]_fair iterators */ cfs_rq_iterator.arg = busy_cfs_rq; - nr_moved = balance_tasks(this_rq, this_cpu, busiest, - max_nr_move, maxload, sd, idle, all_pinned, - &load_moved, this_best_prio, &cfs_rq_iterator); + load_moved = balance_tasks(this_rq, this_cpu, busiest, + maxload, sd, idle, all_pinned, + this_best_prio, + &cfs_rq_iterator); + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * load_moved holds the task load that was moved. The + * effective (group) weight moved would be: + * load_moved_eff = load_moved/task_load * group_weight; + */ + load_moved = (group_weight * load_moved) / task_load; + + /* Adjust shares on both cpus to reflect load_moved */ + group_weight -= load_moved; + set_se_shares(se, group_weight); + + se = busy_cfs_rq->tg->se[this_cpu]; + if (!thisload) + group_weight = load_moved; + else + group_weight = se->load.weight + load_moved; + set_se_shares(se, group_weight); +#endif - total_nr_moved += nr_moved; - max_nr_move -= nr_moved; rem_load_move -= load_moved; - if (max_nr_move <= 0 || rem_load_move <= 0) + if (rem_load_move <= 0) break; } return max_load_move - rem_load_move; } +static int +move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle) +{ + struct cfs_rq *busy_cfs_rq; + struct rq_iterator cfs_rq_iterator; + + cfs_rq_iterator.start = load_balance_start_fair; + cfs_rq_iterator.next = load_balance_next_fair; + + for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { + /* + * pass busy_cfs_rq argument into + * load_balance_[start|next]_fair iterators + */ + cfs_rq_iterator.arg = busy_cfs_rq; + if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, + &cfs_rq_iterator)) + return 1; + } + + return 0; +} +#endif + /* * scheduler tick hitting a task of our scheduling class: */ -static void task_tick_fair(struct rq *rq, struct task_struct *curr) +static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) { struct cfs_rq *cfs_rq; struct sched_entity *se = &curr->se; for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - entity_tick(cfs_rq, se); + entity_tick(cfs_rq, se, queued); } } -#define swap(a,b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0) +#define swap(a, b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0) /* * Share the fairness runtime between parent and child, thus the @@ -1070,33 +1333,63 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) { struct cfs_rq *cfs_rq = task_cfs_rq(p); struct sched_entity *se = &p->se, *curr = cfs_rq->curr; + int this_cpu = smp_processor_id(); sched_info_queued(p); update_curr(cfs_rq); place_entity(cfs_rq, se, 1); - /* - * The statistical average of wait_runtime is about - * -granularity/2, so initialize the task with that: - */ - if (sched_feat(START_DEBIT)) - se->wait_runtime = -(__sched_period(cfs_rq->nr_running+1) / 2); - - if (sysctl_sched_child_runs_first && - curr->vruntime < se->vruntime) { - - dequeue_entity(cfs_rq, curr, 0); + /* 'curr' will be NULL if the child belongs to a different group */ + if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && + curr && curr->vruntime < se->vruntime) { + /* + * Upon rescheduling, sched_class::put_prev_task() will place + * 'current' within the tree based on its new key value. + */ swap(curr->vruntime, se->vruntime); - enqueue_entity(cfs_rq, curr, 0); } - update_stats_enqueue(cfs_rq, se); - __enqueue_entity(cfs_rq, se); + enqueue_task_fair(rq, p, 0); resched_task(rq->curr); } -#ifdef CONFIG_FAIR_GROUP_SCHED +/* + * Priority of the task has changed. Check to see if we preempt + * the current task. + */ +static void prio_changed_fair(struct rq *rq, struct task_struct *p, + int oldprio, int running) +{ + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (running) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else + check_preempt_curr(rq, p); +} + +/* + * We switched to the sched_fair class. + */ +static void switched_to_fair(struct rq *rq, struct task_struct *p, + int running) +{ + /* + * We were most likely switched from sched_rt, so + * kick off the schedule if running, otherwise just see + * if we can still preempt the current task. + */ + if (running) + resched_task(rq->curr); + else + check_preempt_curr(rq, p); +} + /* Account for a task changing its policy or group. * * This routine is mostly called to set cfs_rq->curr field when a task @@ -1109,30 +1402,35 @@ static void set_curr_task_fair(struct rq *rq) for_each_sched_entity(se) set_next_entity(cfs_rq_of(se), se); } -#else -static void set_curr_task_fair(struct rq *rq) -{ -} -#endif /* * All the scheduling class methods: */ -struct sched_class fair_sched_class __read_mostly = { +static const struct sched_class fair_sched_class = { + .next = &idle_sched_class, .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_fair, +#endif /* CONFIG_SMP */ .check_preempt_curr = check_preempt_wakeup, .pick_next_task = pick_next_task_fair, .put_prev_task = put_prev_task_fair, +#ifdef CONFIG_SMP .load_balance = load_balance_fair, + .move_one_task = move_one_task_fair, +#endif .set_curr_task = set_curr_task_fair, .task_tick = task_tick_fair, .task_new = task_new_fair, + + .prio_changed = prio_changed_fair, + .switched_to = switched_to_fair, }; #ifdef CONFIG_SCHED_DEBUG @@ -1140,7 +1438,12 @@ static void print_cfs_stats(struct seq_file *m, int cpu) { struct cfs_rq *cfs_rq; +#ifdef CONFIG_FAIR_GROUP_SCHED + print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); +#endif + rcu_read_lock(); for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) print_cfs_rq(m, cpu, cfs_rq); + rcu_read_unlock(); } #endif