* (to see the precise effective timeslice length of your workload,
* run vmstat and monitor the context-switches (cs) field)
*/
-unsigned int sysctl_sched_latency = 5000000ULL;
-unsigned int normalized_sysctl_sched_latency = 5000000ULL;
+unsigned int sysctl_sched_latency = 6000000ULL;
+unsigned int normalized_sysctl_sched_latency = 6000000ULL;
/*
* The initial- and re-scaling of tunables is configurable
/*
* Minimal preemption granularity for CPU-bound tasks:
- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_min_granularity = 1000000ULL;
-unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;
+unsigned int sysctl_sched_min_granularity = 2000000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL;
/*
* is kept at sysctl_sched_latency / sysctl_sched_min_granularity
*/
-static unsigned int sched_nr_latency = 5;
+static unsigned int sched_nr_latency = 3;
/*
* After fork, child runs first. If set to 0 (default) then
{
unsigned long delta_exec_weighted;
- schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
+ schedstat_set(curr->statistics.exec_max,
+ max((u64)delta_exec, curr->statistics.exec_max));
curr->sum_exec_runtime += delta_exec;
schedstat_add(cfs_rq, exec_clock, delta_exec);
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- schedstat_set(se->wait_start, rq_of(cfs_rq)->clock);
+ schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
}
/*
static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- schedstat_set(se->wait_max, max(se->wait_max,
- rq_of(cfs_rq)->clock - se->wait_start));
- schedstat_set(se->wait_count, se->wait_count + 1);
- schedstat_set(se->wait_sum, se->wait_sum +
- rq_of(cfs_rq)->clock - se->wait_start);
+ schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
+ rq_of(cfs_rq)->clock - se->statistics.wait_start));
+ schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
+ schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
+ rq_of(cfs_rq)->clock - se->statistics.wait_start);
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
trace_sched_stat_wait(task_of(se),
- rq_of(cfs_rq)->clock - se->wait_start);
+ rq_of(cfs_rq)->clock - se->statistics.wait_start);
}
#endif
- schedstat_set(se->wait_start, 0);
+ schedstat_set(se->statistics.wait_start, 0);
}
static inline void
if (entity_is_task(se))
tsk = task_of(se);
- if (se->sleep_start) {
- u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
+ if (se->statistics.sleep_start) {
+ u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
if ((s64)delta < 0)
delta = 0;
- if (unlikely(delta > se->sleep_max))
- se->sleep_max = delta;
+ if (unlikely(delta > se->statistics.sleep_max))
+ se->statistics.sleep_max = delta;
- se->sleep_start = 0;
- se->sum_sleep_runtime += delta;
+ se->statistics.sleep_start = 0;
+ se->statistics.sum_sleep_runtime += delta;
if (tsk) {
account_scheduler_latency(tsk, delta >> 10, 1);
trace_sched_stat_sleep(tsk, delta);
}
}
- if (se->block_start) {
- u64 delta = rq_of(cfs_rq)->clock - se->block_start;
+ if (se->statistics.block_start) {
+ u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
if ((s64)delta < 0)
delta = 0;
- if (unlikely(delta > se->block_max))
- se->block_max = delta;
+ if (unlikely(delta > se->statistics.block_max))
+ se->statistics.block_max = delta;
- se->block_start = 0;
- se->sum_sleep_runtime += delta;
+ se->statistics.block_start = 0;
+ se->statistics.sum_sleep_runtime += delta;
if (tsk) {
if (tsk->in_iowait) {
- se->iowait_sum += delta;
- se->iowait_count++;
+ se->statistics.iowait_sum += delta;
+ se->statistics.iowait_count++;
trace_sched_stat_iowait(tsk, delta);
}
vruntime += sched_vslice(cfs_rq, se);
/* sleeps up to a single latency don't count. */
- if (!initial && sched_feat(FAIR_SLEEPERS)) {
+ if (!initial) {
unsigned long thresh = sysctl_sched_latency;
/*
- * Convert the sleeper threshold into virtual time.
- * SCHED_IDLE is a special sub-class. We care about
- * fairness only relative to other SCHED_IDLE tasks,
- * all of which have the same weight.
- */
- if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) ||
- task_of(se)->policy != SCHED_IDLE))
- thresh = calc_delta_fair(thresh, se);
-
- /*
* Halve their sleep time's effect, to allow
* for a gentler effect of sleepers:
*/
se->vruntime = vruntime;
}
-#define ENQUEUE_WAKEUP 1
-#define ENQUEUE_MIGRATE 2
-
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
* Update the normalized vruntime before updating min_vruntime
* through callig update_curr().
*/
- if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE))
+ if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
se->vruntime += cfs_rq->min_vruntime;
/*
}
static void
-dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
+dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
* Update run-time statistics of the 'current'.
update_curr(cfs_rq);
update_stats_dequeue(cfs_rq, se);
- if (sleep) {
+ if (flags & DEQUEUE_SLEEP) {
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
struct task_struct *tsk = task_of(se);
if (tsk->state & TASK_INTERRUPTIBLE)
- se->sleep_start = rq_of(cfs_rq)->clock;
+ se->statistics.sleep_start = rq_of(cfs_rq)->clock;
if (tsk->state & TASK_UNINTERRUPTIBLE)
- se->block_start = rq_of(cfs_rq)->clock;
+ se->statistics.block_start = rq_of(cfs_rq)->clock;
}
#endif
}
* update can refer to the ->curr item and we need to reflect this
* movement in our normalized position.
*/
- if (!sleep)
+ if (!(flags & DEQUEUE_SLEEP))
se->vruntime -= cfs_rq->min_vruntime;
}
* when there are only lesser-weight tasks around):
*/
if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
- se->slice_max = max(se->slice_max,
+ se->statistics.slice_max = max(se->statistics.slice_max,
se->sum_exec_runtime - se->prev_sum_exec_runtime);
}
#endif
* increased. Here we update the fair scheduling stats and
* then put the task into the rbtree:
*/
-static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
+static void
+enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
- int flags = 0;
-
- if (wakeup)
- flags |= ENQUEUE_WAKEUP;
- if (p->state == TASK_WAKING)
- flags |= ENQUEUE_MIGRATE;
for_each_sched_entity(se) {
if (se->on_rq)
* decreased. We remove the task from the rbtree and
* update the fair scheduling stats:
*/
-static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
+static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- dequeue_entity(cfs_rq, se, sleep);
+ dequeue_entity(cfs_rq, se, flags);
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight)
break;
- sleep = 1;
+ flags |= DEQUEUE_SLEEP;
}
hrtick_update(rq);
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
{
- struct task_struct *curr = current;
unsigned long this_load, load;
int idx, this_cpu, prev_cpu;
unsigned long tl_per_task;
- unsigned int imbalance;
struct task_group *tg;
unsigned long weight;
int balanced;
load = source_load(prev_cpu, idx);
this_load = target_load(this_cpu, idx);
- if (sync) {
- if (sched_feat(SYNC_LESS) &&
- (curr->se.avg_overlap > sysctl_sched_migration_cost ||
- p->se.avg_overlap > sysctl_sched_migration_cost))
- sync = 0;
- } else {
- if (sched_feat(SYNC_MORE) &&
- (curr->se.avg_overlap < sysctl_sched_migration_cost &&
- p->se.avg_overlap < sysctl_sched_migration_cost))
- sync = 1;
- }
-
/*
* If sync wakeup then subtract the (maximum possible)
* effect of the currently running task from the load
tg = task_group(p);
weight = p->se.load.weight;
- imbalance = 100 + (sd->imbalance_pct - 100) / 2;
-
/*
* In low-load situations, where prev_cpu is idle and this_cpu is idle
* due to the sync cause above having dropped this_load to 0, we'll
* Otherwise check if either cpus are near enough in load to allow this
* task to be woken on this_cpu.
*/
- balanced = !this_load ||
- 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
- imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
+ if (this_load) {
+ unsigned long this_eff_load, prev_eff_load;
+
+ this_eff_load = 100;
+ this_eff_load *= power_of(prev_cpu);
+ this_eff_load *= this_load +
+ effective_load(tg, this_cpu, weight, weight);
+
+ prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+ prev_eff_load *= power_of(this_cpu);
+ prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
+
+ balanced = this_eff_load <= prev_eff_load;
+ } else
+ balanced = true;
/*
* If the currently running task will sleep within
if (sync && balanced)
return 1;
- schedstat_inc(p, se.nr_wakeups_affine_attempts);
+ schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
tl_per_task = cpu_avg_load_per_task(this_cpu);
if (balanced ||
* there is no bad imbalance.
*/
schedstat_inc(sd, ttwu_move_affine);
- schedstat_inc(p, se.nr_wakeups_affine);
+ schedstat_inc(p, se.statistics.nr_wakeups_affine);
return 1;
}
/*
* Try and locate an idle CPU in the sched_domain.
*/
-static int
-select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_sibling(struct task_struct *p, int target)
{
int cpu = smp_processor_id();
int prev_cpu = task_cpu(p);
+ struct sched_domain *sd;
int i;
/*
- * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
- * test in select_task_rq_fair) and the prev_cpu is idle then that's
- * always a better target than the current cpu.
+ * If the task is going to be woken-up on this cpu and if it is
+ * already idle, then it is the right target.
+ */
+ if (target == cpu && idle_cpu(cpu))
+ return cpu;
+
+ /*
+ * If the task is going to be woken-up on the cpu where it previously
+ * ran and if it is currently idle, then it the right target.
*/
- if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
+ if (target == prev_cpu && idle_cpu(prev_cpu))
return prev_cpu;
/*
- * Otherwise, iterate the domain and find an elegible idle cpu.
+ * Otherwise, iterate the domains and find an elegible idle cpu.
*/
- for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
- if (!cpu_rq(i)->cfs.nr_running) {
- target = i;
+ for_each_domain(target, sd) {
+ if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
break;
+
+ for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
+ if (idle_cpu(i)) {
+ target = i;
+ break;
+ }
}
+
+ /*
+ * Lets stop looking for an idle sibling when we reached
+ * the domain that spans the current cpu and prev_cpu.
+ */
+ if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
+ cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
+ break;
}
return target;
*
* preempt must be disabled.
*/
-static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
+static int
+select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
{
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
int cpu = smp_processor_id();
int sync = wake_flags & WF_SYNC;
if (sd_flag & SD_BALANCE_WAKE) {
- if (sched_feat(AFFINE_WAKEUPS) &&
- cpumask_test_cpu(cpu, &p->cpus_allowed))
+ if (cpumask_test_cpu(cpu, &p->cpus_allowed))
want_affine = 1;
new_cpu = prev_cpu;
}
}
/*
- * While iterating the domains looking for a spanning
- * WAKE_AFFINE domain, adjust the affine target to any idle cpu
- * in cache sharing domains along the way.
+ * If both cpu and prev_cpu are part of this domain,
+ * cpu is a valid SD_WAKE_AFFINE target.
*/
- if (want_affine) {
- int target = -1;
-
- /*
- * If both cpu and prev_cpu are part of this domain,
- * cpu is a valid SD_WAKE_AFFINE target.
- */
- if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
- target = cpu;
-
- /*
- * If there's an idle sibling in this domain, make that
- * the wake_affine target instead of the current cpu.
- */
- if (tmp->flags & SD_PREFER_SIBLING)
- target = select_idle_sibling(p, tmp, target);
-
- if (target >= 0) {
- if (tmp->flags & SD_WAKE_AFFINE) {
- affine_sd = tmp;
- want_affine = 0;
- }
- cpu = target;
- }
+ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
+ cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+ affine_sd = tmp;
+ want_affine = 0;
}
if (!want_sd && !want_affine)
sd = tmp;
}
+#ifdef CONFIG_FAIR_GROUP_SCHED
if (sched_feat(LB_SHARES_UPDATE)) {
/*
* Pick the largest domain to update shares over
*/
tmp = sd;
- if (affine_sd && (!tmp ||
- cpumask_weight(sched_domain_span(affine_sd)) >
- cpumask_weight(sched_domain_span(sd))))
+ if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
tmp = affine_sd;
- if (tmp)
+ if (tmp) {
+ raw_spin_unlock(&rq->lock);
update_shares(tmp);
+ raw_spin_lock(&rq->lock);
+ }
}
+#endif
- if (affine_sd && wake_affine(affine_sd, p, sync))
- return cpu;
+ if (affine_sd) {
+ if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
+ return select_idle_sibling(p, cpu);
+ else
+ return select_idle_sibling(p, prev_cpu);
+ }
while (sd) {
int load_idx = sd->forkexec_idx;
/* Now try balancing at a lower domain level of new_cpu */
cpu = new_cpu;
- weight = cpumask_weight(sched_domain_span(sd));
+ weight = sd->span_weight;
sd = NULL;
for_each_domain(cpu, tmp) {
- if (weight <= cpumask_weight(sched_domain_span(tmp)))
+ if (weight <= tmp->span_weight)
break;
if (tmp->flags & sd_flag)
sd = tmp;
}
#endif /* CONFIG_SMP */
-/*
- * Adaptive granularity
- *
- * se->avg_wakeup gives the average time a task runs until it does a wakeup,
- * with the limit of wakeup_gran -- when it never does a wakeup.
- *
- * So the smaller avg_wakeup is the faster we want this task to preempt,
- * but we don't want to treat the preemptee unfairly and therefore allow it
- * to run for at least the amount of time we'd like to run.
- *
- * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one
- *
- * NOTE: we use *nr_running to scale with load, this nicely matches the
- * degrading latency on load.
- */
-static unsigned long
-adaptive_gran(struct sched_entity *curr, struct sched_entity *se)
-{
- u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
- u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running;
- u64 gran = 0;
-
- if (this_run < expected_wakeup)
- gran = expected_wakeup - this_run;
-
- return min_t(s64, gran, sysctl_sched_wakeup_granularity);
-}
-
static unsigned long
wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
{
unsigned long gran = sysctl_sched_wakeup_granularity;
- if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN))
- gran = adaptive_gran(curr, se);
-
/*
* Since its curr running now, convert the gran from real-time
* to virtual-time in his units.
+ *
+ * By using 'se' instead of 'curr' we penalize light tasks, so
+ * they get preempted easier. That is, if 'se' < 'curr' then
+ * the resulting gran will be larger, therefore penalizing the
+ * lighter, if otoh 'se' > 'curr' then the resulting gran will
+ * be smaller, again penalizing the lighter task.
+ *
+ * This is especially important for buddies when the leftmost
+ * task is higher priority than the buddy.
*/
- if (sched_feat(ASYM_GRAN)) {
- /*
- * By using 'se' instead of 'curr' we penalize light tasks, so
- * they get preempted easier. That is, if 'se' < 'curr' then
- * the resulting gran will be larger, therefore penalizing the
- * lighter, if otoh 'se' > 'curr' then the resulting gran will
- * be smaller, again penalizing the lighter task.
- *
- * This is especially important for buddies when the leftmost
- * task is higher priority than the buddy.
- */
- if (unlikely(se->load.weight != NICE_0_LOAD))
- gran = calc_delta_fair(gran, se);
- } else {
- if (unlikely(curr->load.weight != NICE_0_LOAD))
- gran = calc_delta_fair(gran, curr);
- }
+ if (unlikely(se->load.weight != NICE_0_LOAD))
+ gran = calc_delta_fair(gran, se);
return gran;
}
struct task_struct *curr = rq->curr;
struct sched_entity *se = &curr->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
- int sync = wake_flags & WF_SYNC;
int scale = cfs_rq->nr_running >= sched_nr_latency;
if (unlikely(rt_prio(p->prio)))
if (unlikely(curr->policy == SCHED_IDLE))
goto preempt;
- if (sched_feat(WAKEUP_SYNC) && sync)
- goto preempt;
-
- if (sched_feat(WAKEUP_OVERLAP) &&
- se->avg_overlap < sysctl_sched_migration_cost &&
- pse->avg_overlap < sysctl_sched_migration_cost)
- goto preempt;
-
if (!sched_feat(WAKEUP_PREEMPT))
return;
*/
/*
- * Load-balancing iterator. Note: while the runqueue stays locked
- * during the whole iteration, the current task might be
- * dequeued so the iterator has to be dequeue-safe. Here we
- * achieve that by always pre-iterating before returning
- * the current task:
- */
-static struct task_struct *
-__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
-{
- struct task_struct *p = NULL;
- struct sched_entity *se;
-
- if (next == &cfs_rq->tasks)
- return NULL;
-
- se = list_entry(next, struct sched_entity, group_node);
- p = task_of(se);
- cfs_rq->balance_iterator = next->next;
-
- return p;
-}
-
-static struct task_struct *load_balance_start_fair(void *arg)
-{
- struct cfs_rq *cfs_rq = arg;
-
- return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next);
-}
-
-static struct task_struct *load_balance_next_fair(void *arg)
-{
- struct cfs_rq *cfs_rq = arg;
-
- return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
-}
-
-/*
- * runqueue iterator, to support SMP load-balancing between different
- * scheduling classes, without having to expose their internal data
- * structures to the load-balancing proper:
- */
-struct rq_iterator {
- void *arg;
- struct task_struct *(*start)(void *);
- struct task_struct *(*next)(void *);
-};
-
-static unsigned long
-balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
- unsigned long max_load_move, struct sched_domain *sd,
- enum cpu_idle_type idle, int *all_pinned,
- int *this_best_prio, struct rq_iterator *iterator);
-
-
-static unsigned long
-__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
- unsigned long max_load_move, struct sched_domain *sd,
- enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
- struct cfs_rq *cfs_rq)
-{
- struct rq_iterator cfs_rq_iterator;
-
- cfs_rq_iterator.start = load_balance_start_fair;
- cfs_rq_iterator.next = load_balance_next_fair;
- cfs_rq_iterator.arg = cfs_rq;
-
- return balance_tasks(this_rq, this_cpu, busiest,
- max_load_move, sd, idle, all_pinned,
- this_best_prio, &cfs_rq_iterator);
-}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static unsigned long
-load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
- unsigned long max_load_move,
- struct sched_domain *sd, enum cpu_idle_type idle,
- int *all_pinned, int *this_best_prio)
-{
- long rem_load_move = max_load_move;
- int busiest_cpu = cpu_of(busiest);
- struct task_group *tg;
-
- rcu_read_lock();
- update_h_load(busiest_cpu);
-
- list_for_each_entry_rcu(tg, &task_groups, list) {
- struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
- unsigned long busiest_h_load = busiest_cfs_rq->h_load;
- unsigned long busiest_weight = busiest_cfs_rq->load.weight;
- u64 rem_load, moved_load;
-
- /*
- * empty group
- */
- if (!busiest_cfs_rq->task_weight)
- continue;
-
- rem_load = (u64)rem_load_move * busiest_weight;
- rem_load = div_u64(rem_load, busiest_h_load + 1);
-
- moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
- rem_load, sd, idle, all_pinned, this_best_prio,
- tg->cfs_rq[busiest_cpu]);
-
- if (!moved_load)
- continue;
-
- moved_load *= busiest_h_load;
- moved_load = div_u64(moved_load, busiest_weight + 1);
-
- rem_load_move -= moved_load;
- if (rem_load_move < 0)
- break;
- }
- rcu_read_unlock();
-
- return max_load_move - rem_load_move;
-}
-#else
-static unsigned long
-load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
- unsigned long max_load_move,
- struct sched_domain *sd, enum cpu_idle_type idle,
- int *all_pinned, int *this_best_prio)
-{
- return __load_balance_fair(this_rq, this_cpu, busiest,
- max_load_move, sd, idle, all_pinned,
- this_best_prio, &busiest->cfs);
-}
-#endif
-
-static int
-iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
- struct sched_domain *sd, enum cpu_idle_type idle,
- struct rq_iterator *iterator);
-
-/*
- * move_one_task tries to move exactly one task from busiest to this_rq, as
- * part of active balancing operations within "domain".
- * Returns 1 if successful and 0 otherwise.
- *
- * Called with both runqueues locked.
- */
-static int
-move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
- struct sched_domain *sd, enum cpu_idle_type idle)
-{
- struct cfs_rq *busy_cfs_rq;
- struct rq_iterator cfs_rq_iterator;
-
- cfs_rq_iterator.start = load_balance_start_fair;
- cfs_rq_iterator.next = load_balance_next_fair;
-
- for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
- /*
- * pass busy_cfs_rq argument into
- * load_balance_[start|next]_fair iterators
- */
- cfs_rq_iterator.arg = busy_cfs_rq;
- if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
- &cfs_rq_iterator))
- return 1;
- }
-
- return 0;
-}
-
-/*
* pull_task - move a task from a remote runqueue to the local runqueue.
* Both runqueues must be locked.
*/
* 3) are cache-hot on their current CPU.
*/
if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
- schedstat_inc(p, se.nr_failed_migrations_affine);
+ schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
return 0;
}
*all_pinned = 0;
if (task_running(rq, p)) {
- schedstat_inc(p, se.nr_failed_migrations_running);
+ schedstat_inc(p, se.statistics.nr_failed_migrations_running);
return 0;
}
#ifdef CONFIG_SCHEDSTATS
if (tsk_cache_hot) {
schedstat_inc(sd, lb_hot_gained[idle]);
- schedstat_inc(p, se.nr_forced_migrations);
+ schedstat_inc(p, se.statistics.nr_forced_migrations);
}
#endif
return 1;
}
if (tsk_cache_hot) {
- schedstat_inc(p, se.nr_failed_migrations_hot);
+ schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
return 0;
}
return 1;
}
+/*
+ * move_one_task tries to move exactly one task from busiest to this_rq, as
+ * part of active balancing operations within "domain".
+ * Returns 1 if successful and 0 otherwise.
+ *
+ * Called with both runqueues locked.
+ */
+static int
+move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ struct sched_domain *sd, enum cpu_idle_type idle)
+{
+ struct task_struct *p, *n;
+ struct cfs_rq *cfs_rq;
+ int pinned = 0;
+
+ for_each_leaf_cfs_rq(busiest, cfs_rq) {
+ list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
+
+ if (!can_migrate_task(p, busiest, this_cpu,
+ sd, idle, &pinned))
+ continue;
+
+ pull_task(busiest, p, this_rq, this_cpu);
+ /*
+ * Right now, this is only the second place pull_task()
+ * is called, so we can safely collect pull_task()
+ * stats here rather than inside pull_task().
+ */
+ schedstat_inc(sd, lb_gained[idle]);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
static unsigned long
balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move, struct sched_domain *sd,
enum cpu_idle_type idle, int *all_pinned,
- int *this_best_prio, struct rq_iterator *iterator)
+ int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
{
int loops = 0, pulled = 0, pinned = 0;
- struct task_struct *p;
long rem_load_move = max_load_move;
+ struct task_struct *p, *n;
if (max_load_move == 0)
goto out;
pinned = 1;
- /*
- * Start the load-balancing iterator:
- */
- p = iterator->start(iterator->arg);
-next:
- if (!p || loops++ > sysctl_sched_nr_migrate)
- goto out;
+ list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
+ if (loops++ > sysctl_sched_nr_migrate)
+ break;
- if ((p->se.load.weight >> 1) > rem_load_move ||
- !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
- p = iterator->next(iterator->arg);
- goto next;
- }
+ if ((p->se.load.weight >> 1) > rem_load_move ||
+ !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
+ continue;
- pull_task(busiest, p, this_rq, this_cpu);
- pulled++;
- rem_load_move -= p->se.load.weight;
+ pull_task(busiest, p, this_rq, this_cpu);
+ pulled++;
+ rem_load_move -= p->se.load.weight;
#ifdef CONFIG_PREEMPT
- /*
- * NEWIDLE balancing is a source of latency, so preemptible kernels
- * will stop after the first task is pulled to minimize the critical
- * section.
- */
- if (idle == CPU_NEWLY_IDLE)
- goto out;
+ /*
+ * NEWIDLE balancing is a source of latency, so preemptible
+ * kernels will stop after the first task is pulled to minimize
+ * the critical section.
+ */
+ if (idle == CPU_NEWLY_IDLE)
+ break;
#endif
- /*
- * We only want to steal up to the prescribed amount of weighted load.
- */
- if (rem_load_move > 0) {
+ /*
+ * We only want to steal up to the prescribed amount of
+ * weighted load.
+ */
+ if (rem_load_move <= 0)
+ break;
+
if (p->prio < *this_best_prio)
*this_best_prio = p->prio;
- p = iterator->next(iterator->arg);
- goto next;
}
out:
/*
return max_load_move - rem_load_move;
}
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static unsigned long
+load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ unsigned long max_load_move,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ int *all_pinned, int *this_best_prio)
+{
+ long rem_load_move = max_load_move;
+ int busiest_cpu = cpu_of(busiest);
+ struct task_group *tg;
+
+ rcu_read_lock();
+ update_h_load(busiest_cpu);
+
+ list_for_each_entry_rcu(tg, &task_groups, list) {
+ struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
+ unsigned long busiest_h_load = busiest_cfs_rq->h_load;
+ unsigned long busiest_weight = busiest_cfs_rq->load.weight;
+ u64 rem_load, moved_load;
+
+ /*
+ * empty group
+ */
+ if (!busiest_cfs_rq->task_weight)
+ continue;
+
+ rem_load = (u64)rem_load_move * busiest_weight;
+ rem_load = div_u64(rem_load, busiest_h_load + 1);
+
+ moved_load = balance_tasks(this_rq, this_cpu, busiest,
+ rem_load, sd, idle, all_pinned, this_best_prio,
+ busiest_cfs_rq);
+
+ if (!moved_load)
+ continue;
+
+ moved_load *= busiest_h_load;
+ moved_load = div_u64(moved_load, busiest_weight + 1);
+
+ rem_load_move -= moved_load;
+ if (rem_load_move < 0)
+ break;
+ }
+ rcu_read_unlock();
+
+ return max_load_move - rem_load_move;
+}
+#else
+static unsigned long
+load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ unsigned long max_load_move,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ int *all_pinned, int *this_best_prio)
+{
+ return balance_tasks(this_rq, this_cpu, busiest,
+ max_load_move, sd, idle, all_pinned,
+ this_best_prio, &busiest->cfs);
+}
+#endif
+
/*
* move_tasks tries to move up to max_load_move weighted load from busiest to
* this_rq, as part of a balancing operation within domain "sd".
*/
if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
break;
+
+ if (raw_spin_is_contended(&this_rq->lock) ||
+ raw_spin_is_contended(&busiest->lock))
+ break;
#endif
} while (load_moved && max_load_move > total_load_moved);
return total_load_moved > 0;
}
-static int
-iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
- struct sched_domain *sd, enum cpu_idle_type idle,
- struct rq_iterator *iterator)
-{
- struct task_struct *p = iterator->start(iterator->arg);
- int pinned = 0;
-
- while (p) {
- if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
- pull_task(busiest, p, this_rq, this_cpu);
- /*
- * Right now, this is only the second place pull_task()
- * is called, so we can safely collect pull_task()
- * stats here rather than inside pull_task().
- */
- schedstat_inc(sd, lb_gained[idle]);
-
- return 1;
- }
- p = iterator->next(iterator->arg);
- }
-
- return 0;
-}
-
/********** Helpers for find_busiest_group ************************/
/*
* sd_lb_stats - Structure to store the statistics of a sched_domain
unsigned long max_load;
unsigned long busiest_load_per_task;
unsigned long busiest_nr_running;
+ unsigned long busiest_group_capacity;
int group_imb; /* Is there imbalance in this sd */
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
{
- unsigned long weight = cpumask_weight(sched_domain_span(sd));
+ unsigned long weight = sd->span_weight;
unsigned long smt_gain = sd->smt_gain;
smt_gain /= weight;
static void update_cpu_power(struct sched_domain *sd, int cpu)
{
- unsigned long weight = cpumask_weight(sched_domain_span(sd));
+ unsigned long weight = sd->span_weight;
unsigned long power = SCHED_LOAD_SCALE;
struct sched_group *sdg = sd->groups;
if (!power)
power = 1;
+ cpu_rq(cpu)->cpu_power = power;
sdg->cpu_power = power;
}
unsigned long load, max_cpu_load, min_cpu_load;
int i;
unsigned int balance_cpu = -1, first_idle_cpu = 0;
- unsigned long sum_avg_load_per_task;
- unsigned long avg_load_per_task;
+ unsigned long avg_load_per_task = 0;
- if (local_group) {
+ if (local_group)
balance_cpu = group_first_cpu(group);
- if (balance_cpu == this_cpu)
- update_group_power(sd, this_cpu);
- }
/* Tally up the load of all CPUs in the group */
- sum_avg_load_per_task = avg_load_per_task = 0;
max_cpu_load = 0;
min_cpu_load = ~0UL;
sgs->sum_nr_running += rq->nr_running;
sgs->sum_weighted_load += weighted_cpuload(i);
- sum_avg_load_per_task += cpu_avg_load_per_task(i);
}
/*
* to do the newly idle load balance.
*/
if (idle != CPU_NEWLY_IDLE && local_group &&
- balance_cpu != this_cpu && balance) {
+ balance_cpu != this_cpu) {
*balance = 0;
return;
}
+ update_group_power(sd, this_cpu);
+
/* Adjust by relative CPU power of the group */
sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
-
/*
* Consider the group unbalanced when the imbalance is larger
* than the average weight of two tasks.
* normalized nr_running number somewhere that negates
* the hierarchy?
*/
- avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
- group->cpu_power;
+ if (sgs->sum_nr_running)
+ avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
sgs->group_imb = 1;
update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
local_group, cpus, balance, &sgs);
- if (local_group && balance && !(*balance))
+ if (local_group && !(*balance))
return;
sds->total_load += sgs.group_load;
sds->max_load = sgs.avg_load;
sds->busiest = group;
sds->busiest_nr_running = sgs.sum_nr_running;
+ sds->busiest_group_capacity = sgs.group_capacity;
sds->busiest_load_per_task = sgs.sum_weighted_load;
sds->group_imb = sgs.group_imb;
}
{
unsigned long tmp, pwr_now = 0, pwr_move = 0;
unsigned int imbn = 2;
+ unsigned long scaled_busy_load_per_task;
if (sds->this_nr_running) {
sds->this_load_per_task /= sds->this_nr_running;
sds->this_load_per_task =
cpu_avg_load_per_task(this_cpu);
- if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
- sds->busiest_load_per_task * imbn) {
+ scaled_busy_load_per_task = sds->busiest_load_per_task
+ * SCHED_LOAD_SCALE;
+ scaled_busy_load_per_task /= sds->busiest->cpu_power;
+
+ if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
+ (scaled_busy_load_per_task * imbn)) {
*imbalance = sds->busiest_load_per_task;
return;
}
static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
unsigned long *imbalance)
{
- unsigned long max_pull;
+ unsigned long max_pull, load_above_capacity = ~0UL;
+
+ sds->busiest_load_per_task /= sds->busiest_nr_running;
+ if (sds->group_imb) {
+ sds->busiest_load_per_task =
+ min(sds->busiest_load_per_task, sds->avg_load);
+ }
+
/*
* In the presence of smp nice balancing, certain scenarios can have
* max load less than avg load(as we skip the groups at or below
return fix_small_imbalance(sds, this_cpu, imbalance);
}
- /* Don't want to pull so many tasks that a group would go idle */
- max_pull = min(sds->max_load - sds->avg_load,
- sds->max_load - sds->busiest_load_per_task);
+ if (!sds->group_imb) {
+ /*
+ * Don't want to pull so many tasks that a group would go idle.
+ */
+ load_above_capacity = (sds->busiest_nr_running -
+ sds->busiest_group_capacity);
+
+ load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
+
+ load_above_capacity /= sds->busiest->cpu_power;
+ }
+
+ /*
+ * We're trying to get all the cpus to the average_load, so we don't
+ * want to push ourselves above the average load, nor do we wish to
+ * reduce the max loaded cpu below the average load. At the same time,
+ * we also don't want to reduce the group load below the group capacity
+ * (so that we can implement power-savings policies etc). Thus we look
+ * for the minimum possible imbalance.
+ * Be careful of negative numbers as they'll appear as very large values
+ * with unsigned longs.
+ */
+ max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
/* How much load to actually move to equalise the imbalance */
*imbalance = min(max_pull * sds->busiest->cpu_power,
* 4) This group is more busy than the avg busieness at this
* sched_domain.
* 5) The imbalance is within the specified limit.
- * 6) Any rebalance would lead to ping-pong
*/
- if (balance && !(*balance))
+ if (!(*balance))
goto ret;
if (!sds.busiest || sds.busiest_nr_running == 0)
if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
goto out_balanced;
- sds.busiest_load_per_task /= sds.busiest_nr_running;
- if (sds.group_imb)
- sds.busiest_load_per_task =
- min(sds.busiest_load_per_task, sds.avg_load);
-
- /*
- * We're trying to get all the cpus to the average_load, so we don't
- * want to push ourselves above the average load, nor do we wish to
- * reduce the max loaded cpu below the average load, as either of these
- * actions would just result in more rebalancing later, and ping-pong
- * tasks around. Thus we look for the minimum possible imbalance.
- * Negative imbalances (*we* are more loaded than anyone else) will
- * be counted as no imbalance for these purposes -- we can't fix that
- * by pulling tasks to us. Be careful of negative numbers as they'll
- * appear as very large values with unsigned longs.
- */
- if (sds.max_load <= sds.busiest_load_per_task)
- goto out_balanced;
-
/* Looks like there is an imbalance. Compute it */
calculate_imbalance(&sds, this_cpu, imbalance);
return sds.busiest;
continue;
rq = cpu_rq(i);
- wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
- wl /= power;
+ wl = weighted_cpuload(i);
+ /*
+ * When comparing with imbalance, use weighted_cpuload()
+ * which is not scaled with the cpu power.
+ */
if (capacity && rq->nr_running == 1 && wl > imbalance)
continue;
+ /*
+ * For the load comparisons with the other cpu's, consider
+ * the weighted_cpuload() scaled with the cpu power, so that
+ * the load can be moved away from the cpu that is potentially
+ * running at a lower capacity.
+ */
+ wl = (wl * SCHED_LOAD_SCALE) / power;
+
if (wl > max_load) {
max_load = wl;
busiest = rq;
/* Working cpumask for load_balance and load_balance_newidle. */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
+{
+ if (idle == CPU_NEWLY_IDLE) {
+ /*
+ * The only task running in a non-idle cpu can be moved to this
+ * cpu in an attempt to completely freeup the other CPU
+ * package.
+ *
+ * The package power saving logic comes from
+ * find_busiest_group(). If there are no imbalance, then
+ * f_b_g() will return NULL. However when sched_mc={1,2} then
+ * f_b_g() will select a group from which a running task may be
+ * pulled to this cpu in order to make the other package idle.
+ * If there is no opportunity to make a package idle and if
+ * there are no imbalance, then f_b_g() will return NULL and no
+ * action will be taken in load_balance_newidle().
+ *
+ * Under normal task pull operation due to imbalance, there
+ * will be more than one task in the source run queue and
+ * move_tasks() will succeed. ld_moved will be true and this
+ * active balance code will not be triggered.
+ */
+ if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+ !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
+ return 0;
+
+ if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
+ return 0;
+ }
+
+ return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
+}
+
+static int active_load_balance_cpu_stop(void *data);
+
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
schedstat_inc(sd, lb_failed[idle]);
sd->nr_balance_failed++;
- if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
-
+ if (need_active_balance(sd, sd_idle, idle)) {
raw_spin_lock_irqsave(&busiest->lock, flags);
- /* don't kick the migration_thread, if the curr
- * task on busiest cpu can't be moved to this_cpu
+ /* don't kick the active_load_balance_cpu_stop,
+ * if the curr task on busiest cpu can't be
+ * moved to this_cpu
*/
if (!cpumask_test_cpu(this_cpu,
&busiest->curr->cpus_allowed)) {
goto out_one_pinned;
}
+ /*
+ * ->active_balance synchronizes accesses to
+ * ->active_balance_work. Once set, it's cleared
+ * only after active load balance is finished.
+ */
if (!busiest->active_balance) {
busiest->active_balance = 1;
busiest->push_cpu = this_cpu;
active_balance = 1;
}
raw_spin_unlock_irqrestore(&busiest->lock, flags);
+
if (active_balance)
- wake_up_process(busiest->migration_thread);
+ stop_one_cpu_nowait(cpu_of(busiest),
+ active_load_balance_cpu_stop, busiest,
+ &busiest->active_balance_work);
/*
* We've kicked active balancing, reset the failure
}
/*
- * Check this_cpu to ensure it is balanced within domain. Attempt to move
- * tasks if there is an imbalance.
- *
- * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
- * this_rq is locked.
- */
-static int
-load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
-{
- struct sched_group *group;
- struct rq *busiest = NULL;
- unsigned long imbalance;
- int ld_moved = 0;
- int sd_idle = 0;
- int all_pinned = 0;
- struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
-
- cpumask_copy(cpus, cpu_active_mask);
-
- /*
- * When power savings policy is enabled for the parent domain, idle
- * sibling can pick up load irrespective of busy siblings. In this case,
- * let the state of idle sibling percolate up as IDLE, instead of
- * portraying it as CPU_NOT_IDLE.
- */
- if (sd->flags & SD_SHARE_CPUPOWER &&
- !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- sd_idle = 1;
-
- schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
-redo:
- update_shares_locked(this_rq, sd);
- group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
- &sd_idle, cpus, NULL);
- if (!group) {
- schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
- goto out_balanced;
- }
-
- busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
- if (!busiest) {
- schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
- goto out_balanced;
- }
-
- BUG_ON(busiest == this_rq);
-
- schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
-
- ld_moved = 0;
- if (busiest->nr_running > 1) {
- /* Attempt to move tasks */
- double_lock_balance(this_rq, busiest);
- /* this_rq->clock is already updated */
- update_rq_clock(busiest);
- ld_moved = move_tasks(this_rq, this_cpu, busiest,
- imbalance, sd, CPU_NEWLY_IDLE,
- &all_pinned);
- double_unlock_balance(this_rq, busiest);
-
- if (unlikely(all_pinned)) {
- cpumask_clear_cpu(cpu_of(busiest), cpus);
- if (!cpumask_empty(cpus))
- goto redo;
- }
- }
-
- if (!ld_moved) {
- int active_balance = 0;
-
- schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
- if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
- !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- return -1;
-
- if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
- return -1;
-
- if (sd->nr_balance_failed++ < 2)
- return -1;
-
- /*
- * The only task running in a non-idle cpu can be moved to this
- * cpu in an attempt to completely freeup the other CPU
- * package. The same method used to move task in load_balance()
- * have been extended for load_balance_newidle() to speedup
- * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
- *
- * The package power saving logic comes from
- * find_busiest_group(). If there are no imbalance, then
- * f_b_g() will return NULL. However when sched_mc={1,2} then
- * f_b_g() will select a group from which a running task may be
- * pulled to this cpu in order to make the other package idle.
- * If there is no opportunity to make a package idle and if
- * there are no imbalance, then f_b_g() will return NULL and no
- * action will be taken in load_balance_newidle().
- *
- * Under normal task pull operation due to imbalance, there
- * will be more than one task in the source run queue and
- * move_tasks() will succeed. ld_moved will be true and this
- * active balance code will not be triggered.
- */
-
- /* Lock busiest in correct order while this_rq is held */
- double_lock_balance(this_rq, busiest);
-
- /*
- * don't kick the migration_thread, if the curr
- * task on busiest cpu can't be moved to this_cpu
- */
- if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
- double_unlock_balance(this_rq, busiest);
- all_pinned = 1;
- return ld_moved;
- }
-
- if (!busiest->active_balance) {
- busiest->active_balance = 1;
- busiest->push_cpu = this_cpu;
- active_balance = 1;
- }
-
- double_unlock_balance(this_rq, busiest);
- /*
- * Should not call ttwu while holding a rq->lock
- */
- raw_spin_unlock(&this_rq->lock);
- if (active_balance)
- wake_up_process(busiest->migration_thread);
- raw_spin_lock(&this_rq->lock);
-
- } else
- sd->nr_balance_failed = 0;
-
- update_shares_locked(this_rq, sd);
- return ld_moved;
-
-out_balanced:
- schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
- if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
- !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- return -1;
- sd->nr_balance_failed = 0;
-
- return 0;
-}
-
-/*
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*/
if (this_rq->avg_idle < sysctl_sched_migration_cost)
return;
+ /*
+ * Drop the rq->lock, but keep IRQ/preempt disabled.
+ */
+ raw_spin_unlock(&this_rq->lock);
+
for_each_domain(this_cpu, sd) {
unsigned long interval;
+ int balance = 1;
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
- if (sd->flags & SD_BALANCE_NEWIDLE)
+ if (sd->flags & SD_BALANCE_NEWIDLE) {
/* If we've pulled tasks over stop searching: */
- pulled_task = load_balance_newidle(this_cpu, this_rq,
- sd);
+ pulled_task = load_balance(this_cpu, this_rq,
+ sd, CPU_NEWLY_IDLE, &balance);
+ }
interval = msecs_to_jiffies(sd->balance_interval);
if (time_after(next_balance, sd->last_balance + interval))
break;
}
}
+
+ raw_spin_lock(&this_rq->lock);
+
if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
/*
* We are going idle. next_balance may be set based on
}
/*
- * active_load_balance is run by migration threads. It pushes running tasks
- * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
- * running on each physical CPU where possible, and avoids physical /
- * logical imbalances.
- *
- * Called with busiest_rq locked.
+ * active_load_balance_cpu_stop is run by cpu stopper. It pushes
+ * running tasks off the busiest CPU onto idle CPUs. It requires at
+ * least 1 task to be running on each physical CPU where possible, and
+ * avoids physical / logical imbalances.
*/
-static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
+static int active_load_balance_cpu_stop(void *data)
{
+ struct rq *busiest_rq = data;
+ int busiest_cpu = cpu_of(busiest_rq);
int target_cpu = busiest_rq->push_cpu;
+ struct rq *target_rq = cpu_rq(target_cpu);
struct sched_domain *sd;
- struct rq *target_rq;
+
+ raw_spin_lock_irq(&busiest_rq->lock);
+
+ /* make sure the requested cpu hasn't gone down in the meantime */
+ if (unlikely(busiest_cpu != smp_processor_id() ||
+ !busiest_rq->active_balance))
+ goto out_unlock;
/* Is there any task to move? */
if (busiest_rq->nr_running <= 1)
- return;
-
- target_rq = cpu_rq(target_cpu);
+ goto out_unlock;
/*
* This condition is "impossible", if it occurs
/* move a task from busiest_rq to target_rq */
double_lock_balance(busiest_rq, target_rq);
- update_rq_clock(busiest_rq);
- update_rq_clock(target_rq);
/* Search for an sd spanning us and the target CPU. */
for_each_domain(target_cpu, sd) {
schedstat_inc(sd, alb_failed);
}
double_unlock_balance(busiest_rq, target_rq);
+out_unlock:
+ busiest_rq->active_balance = 0;
+ raw_spin_unlock_irq(&busiest_rq->lock);
+ return 0;
}
#ifdef CONFIG_NO_HZ
static inline int on_null_domain(int cpu)
{
- return !rcu_dereference(cpu_rq(cpu)->sd);
+ return !rcu_dereference_sched(cpu_rq(cpu)->sd);
}
/*