X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=kernel%2Fsched_fair.c;h=217e4a9393e42c2f5dfdfae058625601c15e235b;hb=1f522509c77a5dea8dc384b735314f03908a6415;hp=10408323794e1e79adf53b262fb61a75d323ed2c;hpb=1af3ed3ddf27499c3f57662c4c29871e2b95e5f9;p=safe%2Fjmp%2Flinux-2.6 diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 1040832..217e4a9 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -35,8 +35,8 @@ * (to see the precise effective timeslice length of your workload, * run vmstat and monitor the context-switches (cs) field) */ -unsigned int sysctl_sched_latency = 5000000ULL; -unsigned int normalized_sysctl_sched_latency = 5000000ULL; +unsigned int sysctl_sched_latency = 6000000ULL; +unsigned int normalized_sysctl_sched_latency = 6000000ULL; /* * The initial- and re-scaling of tunables is configurable @@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling /* * Minimal preemption granularity for CPU-bound tasks: - * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_min_granularity = 1000000ULL; -unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL; +unsigned int sysctl_sched_min_granularity = 2000000ULL; +unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL; /* * is kept at sysctl_sched_latency / sysctl_sched_min_granularity */ -static unsigned int sched_nr_latency = 5; +static unsigned int sched_nr_latency = 3; /* * After fork, child runs first. If set to 0 (default) then @@ -505,7 +505,8 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, { unsigned long delta_exec_weighted; - schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); + schedstat_set(curr->statistics.exec_max, + max((u64)delta_exec, curr->statistics.exec_max)); curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq, exec_clock, delta_exec); @@ -548,7 +549,7 @@ static void update_curr(struct cfs_rq *cfs_rq) static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); + schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock); } /* @@ -567,18 +568,18 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) static void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { - schedstat_set(se->wait_max, max(se->wait_max, - rq_of(cfs_rq)->clock - se->wait_start)); - schedstat_set(se->wait_count, se->wait_count + 1); - schedstat_set(se->wait_sum, se->wait_sum + - rq_of(cfs_rq)->clock - se->wait_start); + schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, + rq_of(cfs_rq)->clock - se->statistics.wait_start)); + schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); + schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + + rq_of(cfs_rq)->clock - se->statistics.wait_start); #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { trace_sched_stat_wait(task_of(se), - rq_of(cfs_rq)->clock - se->wait_start); + rq_of(cfs_rq)->clock - se->statistics.wait_start); } #endif - schedstat_set(se->wait_start, 0); + schedstat_set(se->statistics.wait_start, 0); } static inline void @@ -657,39 +658,39 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) if (entity_is_task(se)) tsk = task_of(se); - if (se->sleep_start) { - u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; + if (se->statistics.sleep_start) { + u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start; if ((s64)delta < 0) delta = 0; - if (unlikely(delta > se->sleep_max)) - se->sleep_max = delta; + if (unlikely(delta > se->statistics.sleep_max)) + se->statistics.sleep_max = delta; - se->sleep_start = 0; - se->sum_sleep_runtime += delta; + se->statistics.sleep_start = 0; + se->statistics.sum_sleep_runtime += delta; if (tsk) { account_scheduler_latency(tsk, delta >> 10, 1); trace_sched_stat_sleep(tsk, delta); } } - if (se->block_start) { - u64 delta = rq_of(cfs_rq)->clock - se->block_start; + if (se->statistics.block_start) { + u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start; if ((s64)delta < 0) delta = 0; - if (unlikely(delta > se->block_max)) - se->block_max = delta; + if (unlikely(delta > se->statistics.block_max)) + se->statistics.block_max = delta; - se->block_start = 0; - se->sum_sleep_runtime += delta; + se->statistics.block_start = 0; + se->statistics.sum_sleep_runtime += delta; if (tsk) { if (tsk->in_iowait) { - se->iowait_sum += delta; - se->iowait_count++; + se->statistics.iowait_sum += delta; + se->statistics.iowait_count++; trace_sched_stat_iowait(tsk, delta); } @@ -737,20 +738,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) vruntime += sched_vslice(cfs_rq, se); /* sleeps up to a single latency don't count. */ - if (!initial && sched_feat(FAIR_SLEEPERS)) { + if (!initial) { unsigned long thresh = sysctl_sched_latency; /* - * Convert the sleeper threshold into virtual time. - * SCHED_IDLE is a special sub-class. We care about - * fairness only relative to other SCHED_IDLE tasks, - * all of which have the same weight. - */ - if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) || - task_of(se)->policy != SCHED_IDLE)) - thresh = calc_delta_fair(thresh, se); - - /* * Halve their sleep time's effect, to allow * for a gentler effect of sleepers: */ @@ -766,9 +757,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) se->vruntime = vruntime; } -#define ENQUEUE_WAKEUP 1 -#define ENQUEUE_MIGRATE 2 - static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { @@ -776,7 +764,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update the normalized vruntime before updating min_vruntime * through callig update_curr(). */ - if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE)) + if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) se->vruntime += cfs_rq->min_vruntime; /* @@ -812,7 +800,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) } static void -dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) +dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { /* * Update run-time statistics of the 'current'. @@ -820,15 +808,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) update_curr(cfs_rq); update_stats_dequeue(cfs_rq, se); - if (sleep) { + if (flags & DEQUEUE_SLEEP) { #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); if (tsk->state & TASK_INTERRUPTIBLE) - se->sleep_start = rq_of(cfs_rq)->clock; + se->statistics.sleep_start = rq_of(cfs_rq)->clock; if (tsk->state & TASK_UNINTERRUPTIBLE) - se->block_start = rq_of(cfs_rq)->clock; + se->statistics.block_start = rq_of(cfs_rq)->clock; } #endif } @@ -845,7 +833,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) * update can refer to the ->curr item and we need to reflect this * movement in our normalized position. */ - if (!sleep) + if (!(flags & DEQUEUE_SLEEP)) se->vruntime -= cfs_rq->min_vruntime; } @@ -912,7 +900,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * when there are only lesser-weight tasks around): */ if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { - se->slice_max = max(se->slice_max, + se->statistics.slice_max = max(se->statistics.slice_max, se->sum_exec_runtime - se->prev_sum_exec_runtime); } #endif @@ -1053,16 +1041,11 @@ static inline void hrtick_update(struct rq *rq) * increased. Here we update the fair scheduling stats and * then put the task into the rbtree: */ -static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) +static void +enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; - int flags = 0; - - if (wakeup) - flags |= ENQUEUE_WAKEUP; - if (p->state == TASK_WAKING) - flags |= ENQUEUE_MIGRATE; for_each_sched_entity(se) { if (se->on_rq) @@ -1080,18 +1063,18 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) * decreased. We remove the task from the rbtree and * update the fair scheduling stats: */ -static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) +static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - dequeue_entity(cfs_rq, se, sleep); + dequeue_entity(cfs_rq, se, flags); /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) break; - sleep = 1; + flags |= DEQUEUE_SLEEP; } hrtick_update(rq); @@ -1239,7 +1222,6 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { - struct task_struct *curr = current; unsigned long this_load, load; int idx, this_cpu, prev_cpu; unsigned long tl_per_task; @@ -1254,18 +1236,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); - if (sync) { - if (sched_feat(SYNC_LESS) && - (curr->se.avg_overlap > sysctl_sched_migration_cost || - p->se.avg_overlap > sysctl_sched_migration_cost)) - sync = 0; - } else { - if (sched_feat(SYNC_MORE) && - (curr->se.avg_overlap < sysctl_sched_migration_cost && - p->se.avg_overlap < sysctl_sched_migration_cost)) - sync = 1; - } - /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load @@ -1305,7 +1275,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) if (sync && balanced) return 1; - schedstat_inc(p, se.nr_wakeups_affine_attempts); + schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); if (balanced || @@ -1317,7 +1287,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * there is no bad imbalance. */ schedstat_inc(sd, ttwu_move_affine); - schedstat_inc(p, se.nr_wakeups_affine); + schedstat_inc(p, se.statistics.nr_wakeups_affine); return 1; } @@ -1405,29 +1375,48 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) /* * Try and locate an idle CPU in the sched_domain. */ -static int -select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target) +static int select_idle_sibling(struct task_struct *p, int target) { int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); + struct sched_domain *sd; int i; /* - * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE - * test in select_task_rq_fair) and the prev_cpu is idle then that's - * always a better target than the current cpu. + * If the task is going to be woken-up on this cpu and if it is + * already idle, then it is the right target. + */ + if (target == cpu && idle_cpu(cpu)) + return cpu; + + /* + * If the task is going to be woken-up on the cpu where it previously + * ran and if it is currently idle, then it the right target. */ - if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running) + if (target == prev_cpu && idle_cpu(prev_cpu)) return prev_cpu; /* - * Otherwise, iterate the domain and find an elegible idle cpu. + * Otherwise, iterate the domains and find an elegible idle cpu. */ - for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { - if (!cpu_rq(i)->cfs.nr_running) { - target = i; + for_each_domain(target, sd) { + if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) break; + + for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { + if (idle_cpu(i)) { + target = i; + break; + } } + + /* + * Lets stop looking for an idle sibling when we reached + * the domain that spans the current cpu and prev_cpu. + */ + if (cpumask_test_cpu(cpu, sched_domain_span(sd)) && + cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) + break; } return target; @@ -1444,7 +1433,8 @@ select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target) * * preempt must be disabled. */ -static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) +static int +select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); @@ -1455,8 +1445,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) { - if (sched_feat(AFFINE_WAKEUPS) && - cpumask_test_cpu(cpu, &p->cpus_allowed)) + if (cpumask_test_cpu(cpu, &p->cpus_allowed)) want_affine = 1; new_cpu = prev_cpu; } @@ -1490,34 +1479,13 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag } /* - * While iterating the domains looking for a spanning - * WAKE_AFFINE domain, adjust the affine target to any idle cpu - * in cache sharing domains along the way. + * If both cpu and prev_cpu are part of this domain, + * cpu is a valid SD_WAKE_AFFINE target. */ - if (want_affine) { - int target = -1; - - /* - * If both cpu and prev_cpu are part of this domain, - * cpu is a valid SD_WAKE_AFFINE target. - */ - if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) - target = cpu; - - /* - * If there's an idle sibling in this domain, make that - * the wake_affine target instead of the current cpu. - */ - if (tmp->flags & SD_PREFER_SIBLING) - target = select_idle_sibling(p, tmp, target); - - if (target >= 0) { - if (tmp->flags & SD_WAKE_AFFINE) { - affine_sd = tmp; - want_affine = 0; - } - cpu = target; - } + if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && + cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { + affine_sd = tmp; + want_affine = 0; } if (!want_sd && !want_affine) @@ -1530,22 +1498,29 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag sd = tmp; } +#ifdef CONFIG_FAIR_GROUP_SCHED if (sched_feat(LB_SHARES_UPDATE)) { /* * Pick the largest domain to update shares over */ tmp = sd; - if (affine_sd && (!tmp || - cpumask_weight(sched_domain_span(affine_sd)) > - cpumask_weight(sched_domain_span(sd)))) + if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight)) tmp = affine_sd; - if (tmp) + if (tmp) { + raw_spin_unlock(&rq->lock); update_shares(tmp); + raw_spin_lock(&rq->lock); + } } +#endif - if (affine_sd && wake_affine(affine_sd, p, sync)) - return cpu; + if (affine_sd) { + if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) + return select_idle_sibling(p, cpu); + else + return select_idle_sibling(p, prev_cpu); + } while (sd) { int load_idx = sd->forkexec_idx; @@ -1575,10 +1550,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag /* Now try balancing at a lower domain level of new_cpu */ cpu = new_cpu; - weight = cpumask_weight(sched_domain_span(sd)); + weight = sd->span_weight; sd = NULL; for_each_domain(cpu, tmp) { - if (weight <= cpumask_weight(sched_domain_span(tmp))) + if (weight <= tmp->span_weight) break; if (tmp->flags & sd_flag) sd = tmp; @@ -1590,63 +1565,26 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag } #endif /* CONFIG_SMP */ -/* - * Adaptive granularity - * - * se->avg_wakeup gives the average time a task runs until it does a wakeup, - * with the limit of wakeup_gran -- when it never does a wakeup. - * - * So the smaller avg_wakeup is the faster we want this task to preempt, - * but we don't want to treat the preemptee unfairly and therefore allow it - * to run for at least the amount of time we'd like to run. - * - * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one - * - * NOTE: we use *nr_running to scale with load, this nicely matches the - * degrading latency on load. - */ -static unsigned long -adaptive_gran(struct sched_entity *curr, struct sched_entity *se) -{ - u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; - u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running; - u64 gran = 0; - - if (this_run < expected_wakeup) - gran = expected_wakeup - this_run; - - return min_t(s64, gran, sysctl_sched_wakeup_granularity); -} - static unsigned long wakeup_gran(struct sched_entity *curr, struct sched_entity *se) { unsigned long gran = sysctl_sched_wakeup_granularity; - if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN)) - gran = adaptive_gran(curr, se); - /* * Since its curr running now, convert the gran from real-time * to virtual-time in his units. + * + * By using 'se' instead of 'curr' we penalize light tasks, so + * they get preempted easier. That is, if 'se' < 'curr' then + * the resulting gran will be larger, therefore penalizing the + * lighter, if otoh 'se' > 'curr' then the resulting gran will + * be smaller, again penalizing the lighter task. + * + * This is especially important for buddies when the leftmost + * task is higher priority than the buddy. */ - if (sched_feat(ASYM_GRAN)) { - /* - * By using 'se' instead of 'curr' we penalize light tasks, so - * they get preempted easier. That is, if 'se' < 'curr' then - * the resulting gran will be larger, therefore penalizing the - * lighter, if otoh 'se' > 'curr' then the resulting gran will - * be smaller, again penalizing the lighter task. - * - * This is especially important for buddies when the leftmost - * task is higher priority than the buddy. - */ - if (unlikely(se->load.weight != NICE_0_LOAD)) - gran = calc_delta_fair(gran, se); - } else { - if (unlikely(curr->load.weight != NICE_0_LOAD)) - gran = calc_delta_fair(gran, curr); - } + if (unlikely(se->load.weight != NICE_0_LOAD)) + gran = calc_delta_fair(gran, se); return gran; } @@ -1704,7 +1642,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); - int sync = wake_flags & WF_SYNC; int scale = cfs_rq->nr_running >= sched_nr_latency; if (unlikely(rt_prio(p->prio))) @@ -1737,14 +1674,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (unlikely(curr->policy == SCHED_IDLE)) goto preempt; - if (sched_feat(WAKEUP_SYNC) && sync) - goto preempt; - - if (sched_feat(WAKEUP_OVERLAP) && - se->avg_overlap < sysctl_sched_migration_cost && - pse->avg_overlap < sysctl_sched_migration_cost) - goto preempt; - if (!sched_feat(WAKEUP_PREEMPT)) return; @@ -1843,13 +1772,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 3) are cache-hot on their current CPU. */ if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { - schedstat_inc(p, se.nr_failed_migrations_affine); + schedstat_inc(p, se.statistics.nr_failed_migrations_affine); return 0; } *all_pinned = 0; if (task_running(rq, p)) { - schedstat_inc(p, se.nr_failed_migrations_running); + schedstat_inc(p, se.statistics.nr_failed_migrations_running); return 0; } @@ -1865,14 +1794,14 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, #ifdef CONFIG_SCHEDSTATS if (tsk_cache_hot) { schedstat_inc(sd, lb_hot_gained[idle]); - schedstat_inc(p, se.nr_forced_migrations); + schedstat_inc(p, se.statistics.nr_forced_migrations); } #endif return 1; } if (tsk_cache_hot) { - schedstat_inc(p, se.nr_failed_migrations_hot); + schedstat_inc(p, se.statistics.nr_failed_migrations_hot); return 0; } return 1; @@ -2096,6 +2025,7 @@ struct sd_lb_stats { unsigned long max_load; unsigned long busiest_load_per_task; unsigned long busiest_nr_running; + unsigned long busiest_group_capacity; int group_imb; /* Is there imbalance in this sd */ #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) @@ -2309,7 +2239,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) { - unsigned long weight = cpumask_weight(sched_domain_span(sd)); + unsigned long weight = sd->span_weight; unsigned long smt_gain = sd->smt_gain; smt_gain /= weight; @@ -2342,7 +2272,7 @@ unsigned long scale_rt_power(int cpu) static void update_cpu_power(struct sched_domain *sd, int cpu) { - unsigned long weight = cpumask_weight(sched_domain_span(sd)); + unsigned long weight = sd->span_weight; unsigned long power = SCHED_LOAD_SCALE; struct sched_group *sdg = sd->groups; @@ -2415,17 +2345,12 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, unsigned long load, max_cpu_load, min_cpu_load; int i; unsigned int balance_cpu = -1, first_idle_cpu = 0; - unsigned long sum_avg_load_per_task; - unsigned long avg_load_per_task; + unsigned long avg_load_per_task = 0; - if (local_group) { + if (local_group) balance_cpu = group_first_cpu(group); - if (balance_cpu == this_cpu) - update_group_power(sd, this_cpu); - } /* Tally up the load of all CPUs in the group */ - sum_avg_load_per_task = avg_load_per_task = 0; max_cpu_load = 0; min_cpu_load = ~0UL; @@ -2455,7 +2380,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, sgs->sum_nr_running += rq->nr_running; sgs->sum_weighted_load += weighted_cpuload(i); - sum_avg_load_per_task += cpu_avg_load_per_task(i); } /* @@ -2465,15 +2389,16 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, * to do the newly idle load balance. */ if (idle != CPU_NEWLY_IDLE && local_group && - balance_cpu != this_cpu && balance) { + balance_cpu != this_cpu) { *balance = 0; return; } + update_group_power(sd, this_cpu); + /* Adjust by relative CPU power of the group */ sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; - /* * Consider the group unbalanced when the imbalance is larger * than the average weight of two tasks. @@ -2483,8 +2408,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, * normalized nr_running number somewhere that negates * the hierarchy? */ - avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) / - group->cpu_power; + if (sgs->sum_nr_running) + avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) sgs->group_imb = 1; @@ -2528,7 +2453,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, local_group, cpus, balance, &sgs); - if (local_group && balance && !(*balance)) + if (local_group && !(*balance)) return; sds->total_load += sgs.group_load; @@ -2553,6 +2478,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, sds->max_load = sgs.avg_load; sds->busiest = group; sds->busiest_nr_running = sgs.sum_nr_running; + sds->busiest_group_capacity = sgs.group_capacity; sds->busiest_load_per_task = sgs.sum_weighted_load; sds->group_imb = sgs.group_imb; } @@ -2575,6 +2501,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, { unsigned long tmp, pwr_now = 0, pwr_move = 0; unsigned int imbn = 2; + unsigned long scaled_busy_load_per_task; if (sds->this_nr_running) { sds->this_load_per_task /= sds->this_nr_running; @@ -2585,8 +2512,12 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, sds->this_load_per_task = cpu_avg_load_per_task(this_cpu); - if (sds->max_load - sds->this_load + sds->busiest_load_per_task >= - sds->busiest_load_per_task * imbn) { + scaled_busy_load_per_task = sds->busiest_load_per_task + * SCHED_LOAD_SCALE; + scaled_busy_load_per_task /= sds->busiest->cpu_power; + + if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= + (scaled_busy_load_per_task * imbn)) { *imbalance = sds->busiest_load_per_task; return; } @@ -2637,7 +2568,14 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, unsigned long *imbalance) { - unsigned long max_pull; + unsigned long max_pull, load_above_capacity = ~0UL; + + sds->busiest_load_per_task /= sds->busiest_nr_running; + if (sds->group_imb) { + sds->busiest_load_per_task = + min(sds->busiest_load_per_task, sds->avg_load); + } + /* * In the presence of smp nice balancing, certain scenarios can have * max load less than avg load(as we skip the groups at or below @@ -2648,9 +2586,29 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, return fix_small_imbalance(sds, this_cpu, imbalance); } - /* Don't want to pull so many tasks that a group would go idle */ - max_pull = min(sds->max_load - sds->avg_load, - sds->max_load - sds->busiest_load_per_task); + if (!sds->group_imb) { + /* + * Don't want to pull so many tasks that a group would go idle. + */ + load_above_capacity = (sds->busiest_nr_running - + sds->busiest_group_capacity); + + load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE); + + load_above_capacity /= sds->busiest->cpu_power; + } + + /* + * We're trying to get all the cpus to the average_load, so we don't + * want to push ourselves above the average load, nor do we wish to + * reduce the max loaded cpu below the average load. At the same time, + * we also don't want to reduce the group load below the group capacity + * (so that we can implement power-savings policies etc). Thus we look + * for the minimum possible imbalance. + * Be careful of negative numbers as they'll appear as very large values + * with unsigned longs. + */ + max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); /* How much load to actually move to equalise the imbalance */ *imbalance = min(max_pull * sds->busiest->cpu_power, @@ -2718,9 +2676,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * 4) This group is more busy than the avg busieness at this * sched_domain. * 5) The imbalance is within the specified limit. - * 6) Any rebalance would lead to ping-pong */ - if (balance && !(*balance)) + if (!(*balance)) goto ret; if (!sds.busiest || sds.busiest_nr_running == 0) @@ -2737,25 +2694,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) goto out_balanced; - sds.busiest_load_per_task /= sds.busiest_nr_running; - if (sds.group_imb) - sds.busiest_load_per_task = - min(sds.busiest_load_per_task, sds.avg_load); - - /* - * We're trying to get all the cpus to the average_load, so we don't - * want to push ourselves above the average load, nor do we wish to - * reduce the max loaded cpu below the average load, as either of these - * actions would just result in more rebalancing later, and ping-pong - * tasks around. Thus we look for the minimum possible imbalance. - * Negative imbalances (*we* are more loaded than anyone else) will - * be counted as no imbalance for these purposes -- we can't fix that - * by pulling tasks to us. Be careful of negative numbers as they'll - * appear as very large values with unsigned longs. - */ - if (sds.max_load <= sds.busiest_load_per_task) - goto out_balanced; - /* Looks like there is an imbalance. Compute it */ calculate_imbalance(&sds, this_cpu, imbalance); return sds.busiest; @@ -2792,12 +2730,23 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, continue; rq = cpu_rq(i); - wl = weighted_cpuload(i) * SCHED_LOAD_SCALE; - wl /= power; + wl = weighted_cpuload(i); + /* + * When comparing with imbalance, use weighted_cpuload() + * which is not scaled with the cpu power. + */ if (capacity && rq->nr_running == 1 && wl > imbalance) continue; + /* + * For the load comparisons with the other cpu's, consider + * the weighted_cpuload() scaled with the cpu power, so that + * the load can be moved away from the cpu that is potentially + * running at a lower capacity. + */ + wl = (wl * SCHED_LOAD_SCALE) / power; + if (wl > max_load) { max_load = wl; busiest = rq; @@ -2849,6 +2798,8 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle) return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } +static int active_load_balance_cpu_stop(void *data); + /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. @@ -2938,8 +2889,9 @@ redo: if (need_active_balance(sd, sd_idle, idle)) { raw_spin_lock_irqsave(&busiest->lock, flags); - /* don't kick the migration_thread, if the curr - * task on busiest cpu can't be moved to this_cpu + /* don't kick the active_load_balance_cpu_stop, + * if the curr task on busiest cpu can't be + * moved to this_cpu */ if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { @@ -2949,14 +2901,22 @@ redo: goto out_one_pinned; } + /* + * ->active_balance synchronizes accesses to + * ->active_balance_work. Once set, it's cleared + * only after active load balance is finished. + */ if (!busiest->active_balance) { busiest->active_balance = 1; busiest->push_cpu = this_cpu; active_balance = 1; } raw_spin_unlock_irqrestore(&busiest->lock, flags); + if (active_balance) - wake_up_process(busiest->migration_thread); + stop_one_cpu_nowait(cpu_of(busiest), + active_load_balance_cpu_stop, busiest, + &busiest->active_balance_work); /* * We've kicked active balancing, reset the failure @@ -3010,125 +2970,6 @@ out: } /* - * Check this_cpu to ensure it is balanced within domain. Attempt to move - * tasks if there is an imbalance. - * - * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE). - * this_rq is locked. - */ -static int -load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) -{ - struct sched_group *group; - struct rq *busiest = NULL; - unsigned long imbalance; - int ld_moved = 0; - int sd_idle = 0; - int all_pinned = 0; - struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - - cpumask_copy(cpus, cpu_active_mask); - - /* - * When power savings policy is enabled for the parent domain, idle - * sibling can pick up load irrespective of busy siblings. In this case, - * let the state of idle sibling percolate up as IDLE, instead of - * portraying it as CPU_NOT_IDLE. - */ - if (sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - sd_idle = 1; - - schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); -redo: - update_shares_locked(this_rq, sd); - group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, - &sd_idle, cpus, NULL); - if (!group) { - schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); - goto out_balanced; - } - - busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus); - if (!busiest) { - schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); - goto out_balanced; - } - - BUG_ON(busiest == this_rq); - - schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); - - ld_moved = 0; - if (busiest->nr_running > 1) { - /* Attempt to move tasks */ - double_lock_balance(this_rq, busiest); - /* this_rq->clock is already updated */ - update_rq_clock(busiest); - ld_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, CPU_NEWLY_IDLE, - &all_pinned); - double_unlock_balance(this_rq, busiest); - - if (unlikely(all_pinned)) { - cpumask_clear_cpu(cpu_of(busiest), cpus); - if (!cpumask_empty(cpus)) - goto redo; - } - } - - if (!ld_moved) { - int active_balance = 0; - - schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); - sd->nr_balance_failed++; - - if (need_active_balance(sd, sd_idle, CPU_NEWLY_IDLE)) { - double_lock_balance(this_rq, busiest); - - /* - * don't kick the migration_thread, if the curr - * task on busiest cpu can't be moved to this_cpu - */ - if (!cpumask_test_cpu(this_cpu, - &busiest->curr->cpus_allowed)) { - double_unlock_balance(this_rq, busiest); - all_pinned = 1; - return ld_moved; - } - - if (!busiest->active_balance) { - busiest->active_balance = 1; - busiest->push_cpu = this_cpu; - active_balance = 1; - } - - double_unlock_balance(this_rq, busiest); - /* - * Should not call ttwu while holding a rq->lock - */ - raw_spin_unlock(&this_rq->lock); - if (active_balance) - wake_up_process(busiest->migration_thread); - raw_spin_lock(&this_rq->lock); - } - } else - sd->nr_balance_failed = 0; - - update_shares_locked(this_rq, sd); - return ld_moved; - -out_balanced: - schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return -1; - sd->nr_balance_failed = 0; - - return 0; -} - -/* * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ @@ -3143,16 +2984,23 @@ static void idle_balance(int this_cpu, struct rq *this_rq) if (this_rq->avg_idle < sysctl_sched_migration_cost) return; + /* + * Drop the rq->lock, but keep IRQ/preempt disabled. + */ + raw_spin_unlock(&this_rq->lock); + for_each_domain(this_cpu, sd) { unsigned long interval; + int balance = 1; if (!(sd->flags & SD_LOAD_BALANCE)) continue; - if (sd->flags & SD_BALANCE_NEWIDLE) + if (sd->flags & SD_BALANCE_NEWIDLE) { /* If we've pulled tasks over stop searching: */ - pulled_task = load_balance_newidle(this_cpu, this_rq, - sd); + pulled_task = load_balance(this_cpu, this_rq, + sd, CPU_NEWLY_IDLE, &balance); + } interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) @@ -3162,6 +3010,9 @@ static void idle_balance(int this_cpu, struct rq *this_rq) break; } } + + raw_spin_lock(&this_rq->lock); + if (pulled_task || time_after(jiffies, this_rq->next_balance)) { /* * We are going idle. next_balance may be set based on @@ -3172,24 +3023,29 @@ static void idle_balance(int this_cpu, struct rq *this_rq) } /* - * active_load_balance is run by migration threads. It pushes running tasks - * off the busiest CPU onto idle CPUs. It requires at least 1 task to be - * running on each physical CPU where possible, and avoids physical / - * logical imbalances. - * - * Called with busiest_rq locked. + * active_load_balance_cpu_stop is run by cpu stopper. It pushes + * running tasks off the busiest CPU onto idle CPUs. It requires at + * least 1 task to be running on each physical CPU where possible, and + * avoids physical / logical imbalances. */ -static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) +static int active_load_balance_cpu_stop(void *data) { + struct rq *busiest_rq = data; + int busiest_cpu = cpu_of(busiest_rq); int target_cpu = busiest_rq->push_cpu; + struct rq *target_rq = cpu_rq(target_cpu); struct sched_domain *sd; - struct rq *target_rq; + + raw_spin_lock_irq(&busiest_rq->lock); + + /* make sure the requested cpu hasn't gone down in the meantime */ + if (unlikely(busiest_cpu != smp_processor_id() || + !busiest_rq->active_balance)) + goto out_unlock; /* Is there any task to move? */ if (busiest_rq->nr_running <= 1) - return; - - target_rq = cpu_rq(target_cpu); + goto out_unlock; /* * This condition is "impossible", if it occurs @@ -3200,8 +3056,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) /* move a task from busiest_rq to target_rq */ double_lock_balance(busiest_rq, target_rq); - update_rq_clock(busiest_rq); - update_rq_clock(target_rq); /* Search for an sd spanning us and the target CPU. */ for_each_domain(target_cpu, sd) { @@ -3220,6 +3074,10 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) schedstat_inc(sd, alb_failed); } double_unlock_balance(busiest_rq, target_rq); +out_unlock: + busiest_rq->active_balance = 0; + raw_spin_unlock_irq(&busiest_rq->lock); + return 0; } #ifdef CONFIG_NO_HZ @@ -3564,7 +3422,7 @@ static void run_rebalance_domains(struct softirq_action *h) static inline int on_null_domain(int cpu) { - return !rcu_dereference(cpu_rq(cpu)->sd); + return !rcu_dereference_sched(cpu_rq(cpu)->sd); } /*