X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=kernel%2Fsched_fair.c;h=217e4a9393e42c2f5dfdfae058625601c15e235b;hb=aa679c36756003f1fabdb9fc6f00eb159559f7c3;hp=69e582020ff89ac66908276dc8f9ae3b93831b37;hpb=a64692a3afd85fe048551ab89142fd5ca99a0dbd;p=safe%2Fjmp%2Flinux-2.6 diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 69e5820..217e4a9 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -35,8 +35,8 @@ * (to see the precise effective timeslice length of your workload, * run vmstat and monitor the context-switches (cs) field) */ -unsigned int sysctl_sched_latency = 5000000ULL; -unsigned int normalized_sysctl_sched_latency = 5000000ULL; +unsigned int sysctl_sched_latency = 6000000ULL; +unsigned int normalized_sysctl_sched_latency = 6000000ULL; /* * The initial- and re-scaling of tunables is configurable @@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling /* * Minimal preemption granularity for CPU-bound tasks: - * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_min_granularity = 1000000ULL; -unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL; +unsigned int sysctl_sched_min_granularity = 2000000ULL; +unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL; /* * is kept at sysctl_sched_latency / sysctl_sched_min_granularity */ -static unsigned int sched_nr_latency = 5; +static unsigned int sched_nr_latency = 3; /* * After fork, child runs first. If set to 0 (default) then @@ -738,20 +738,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) vruntime += sched_vslice(cfs_rq, se); /* sleeps up to a single latency don't count. */ - if (!initial && sched_feat(FAIR_SLEEPERS)) { + if (!initial) { unsigned long thresh = sysctl_sched_latency; /* - * Convert the sleeper threshold into virtual time. - * SCHED_IDLE is a special sub-class. We care about - * fairness only relative to other SCHED_IDLE tasks, - * all of which have the same weight. - */ - if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) || - task_of(se)->policy != SCHED_IDLE)) - thresh = calc_delta_fair(thresh, se); - - /* * Halve their sleep time's effect, to allow * for a gentler effect of sleepers: */ @@ -767,9 +757,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) se->vruntime = vruntime; } -#define ENQUEUE_WAKEUP 1 -#define ENQUEUE_MIGRATE 2 - static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { @@ -777,7 +764,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update the normalized vruntime before updating min_vruntime * through callig update_curr(). */ - if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE)) + if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) se->vruntime += cfs_rq->min_vruntime; /* @@ -813,7 +800,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) } static void -dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) +dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { /* * Update run-time statistics of the 'current'. @@ -821,7 +808,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) update_curr(cfs_rq); update_stats_dequeue(cfs_rq, se); - if (sleep) { + if (flags & DEQUEUE_SLEEP) { #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); @@ -846,7 +833,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) * update can refer to the ->curr item and we need to reflect this * movement in our normalized position. */ - if (!sleep) + if (!(flags & DEQUEUE_SLEEP)) se->vruntime -= cfs_rq->min_vruntime; } @@ -1055,16 +1042,10 @@ static inline void hrtick_update(struct rq *rq) * then put the task into the rbtree: */ static void -enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head) +enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; - int flags = 0; - - if (wakeup) - flags |= ENQUEUE_WAKEUP; - if (p->state == TASK_WAKING) - flags |= ENQUEUE_MIGRATE; for_each_sched_entity(se) { if (se->on_rq) @@ -1082,18 +1063,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head) * decreased. We remove the task from the rbtree and * update the fair scheduling stats: */ -static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) +static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - dequeue_entity(cfs_rq, se, sleep); + dequeue_entity(cfs_rq, se, flags); /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) break; - sleep = 1; + flags |= DEQUEUE_SLEEP; } hrtick_update(rq); @@ -1394,29 +1375,48 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) /* * Try and locate an idle CPU in the sched_domain. */ -static int -select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target) +static int select_idle_sibling(struct task_struct *p, int target) { int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); + struct sched_domain *sd; int i; /* - * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE - * test in select_task_rq_fair) and the prev_cpu is idle then that's - * always a better target than the current cpu. + * If the task is going to be woken-up on this cpu and if it is + * already idle, then it is the right target. */ - if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running) + if (target == cpu && idle_cpu(cpu)) + return cpu; + + /* + * If the task is going to be woken-up on the cpu where it previously + * ran and if it is currently idle, then it the right target. + */ + if (target == prev_cpu && idle_cpu(prev_cpu)) return prev_cpu; /* - * Otherwise, iterate the domain and find an elegible idle cpu. + * Otherwise, iterate the domains and find an elegible idle cpu. */ - for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { - if (!cpu_rq(i)->cfs.nr_running) { - target = i; + for_each_domain(target, sd) { + if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) break; + + for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { + if (idle_cpu(i)) { + target = i; + break; + } } + + /* + * Lets stop looking for an idle sibling when we reached + * the domain that spans the current cpu and prev_cpu. + */ + if (cpumask_test_cpu(cpu, sched_domain_span(sd)) && + cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) + break; } return target; @@ -1433,7 +1433,8 @@ select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target) * * preempt must be disabled. */ -static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) +static int +select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); @@ -1444,8 +1445,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) { - if (sched_feat(AFFINE_WAKEUPS) && - cpumask_test_cpu(cpu, &p->cpus_allowed)) + if (cpumask_test_cpu(cpu, &p->cpus_allowed)) want_affine = 1; new_cpu = prev_cpu; } @@ -1479,34 +1479,13 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag } /* - * While iterating the domains looking for a spanning - * WAKE_AFFINE domain, adjust the affine target to any idle cpu - * in cache sharing domains along the way. + * If both cpu and prev_cpu are part of this domain, + * cpu is a valid SD_WAKE_AFFINE target. */ - if (want_affine) { - int target = -1; - - /* - * If both cpu and prev_cpu are part of this domain, - * cpu is a valid SD_WAKE_AFFINE target. - */ - if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) - target = cpu; - - /* - * If there's an idle sibling in this domain, make that - * the wake_affine target instead of the current cpu. - */ - if (tmp->flags & SD_SHARE_PKG_RESOURCES) - target = select_idle_sibling(p, tmp, target); - - if (target >= 0) { - if (tmp->flags & SD_WAKE_AFFINE) { - affine_sd = tmp; - want_affine = 0; - } - cpu = target; - } + if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && + cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { + affine_sd = tmp; + want_affine = 0; } if (!want_sd && !want_affine) @@ -1519,22 +1498,29 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag sd = tmp; } +#ifdef CONFIG_FAIR_GROUP_SCHED if (sched_feat(LB_SHARES_UPDATE)) { /* * Pick the largest domain to update shares over */ tmp = sd; - if (affine_sd && (!tmp || - cpumask_weight(sched_domain_span(affine_sd)) > - cpumask_weight(sched_domain_span(sd)))) + if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight)) tmp = affine_sd; - if (tmp) + if (tmp) { + raw_spin_unlock(&rq->lock); update_shares(tmp); + raw_spin_lock(&rq->lock); + } } +#endif - if (affine_sd && wake_affine(affine_sd, p, sync)) - return cpu; + if (affine_sd) { + if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) + return select_idle_sibling(p, cpu); + else + return select_idle_sibling(p, prev_cpu); + } while (sd) { int load_idx = sd->forkexec_idx; @@ -1564,10 +1550,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag /* Now try balancing at a lower domain level of new_cpu */ cpu = new_cpu; - weight = cpumask_weight(sched_domain_span(sd)); + weight = sd->span_weight; sd = NULL; for_each_domain(cpu, tmp) { - if (weight <= cpumask_weight(sched_domain_span(tmp))) + if (weight <= tmp->span_weight) break; if (tmp->flags & sd_flag) sd = tmp; @@ -1587,24 +1573,18 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se) /* * Since its curr running now, convert the gran from real-time * to virtual-time in his units. + * + * By using 'se' instead of 'curr' we penalize light tasks, so + * they get preempted easier. That is, if 'se' < 'curr' then + * the resulting gran will be larger, therefore penalizing the + * lighter, if otoh 'se' > 'curr' then the resulting gran will + * be smaller, again penalizing the lighter task. + * + * This is especially important for buddies when the leftmost + * task is higher priority than the buddy. */ - if (sched_feat(ASYM_GRAN)) { - /* - * By using 'se' instead of 'curr' we penalize light tasks, so - * they get preempted easier. That is, if 'se' < 'curr' then - * the resulting gran will be larger, therefore penalizing the - * lighter, if otoh 'se' > 'curr' then the resulting gran will - * be smaller, again penalizing the lighter task. - * - * This is especially important for buddies when the leftmost - * task is higher priority than the buddy. - */ - if (unlikely(se->load.weight != NICE_0_LOAD)) - gran = calc_delta_fair(gran, se); - } else { - if (unlikely(curr->load.weight != NICE_0_LOAD)) - gran = calc_delta_fair(gran, curr); - } + if (unlikely(se->load.weight != NICE_0_LOAD)) + gran = calc_delta_fair(gran, se); return gran; } @@ -1662,7 +1642,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); - int sync = wake_flags & WF_SYNC; int scale = cfs_rq->nr_running >= sched_nr_latency; if (unlikely(rt_prio(p->prio))) @@ -1695,9 +1674,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (unlikely(curr->policy == SCHED_IDLE)) goto preempt; - if (sched_feat(WAKEUP_SYNC) && sync) - goto preempt; - if (!sched_feat(WAKEUP_PREEMPT)) return; @@ -2263,7 +2239,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) { - unsigned long weight = cpumask_weight(sched_domain_span(sd)); + unsigned long weight = sd->span_weight; unsigned long smt_gain = sd->smt_gain; smt_gain /= weight; @@ -2296,7 +2272,7 @@ unsigned long scale_rt_power(int cpu) static void update_cpu_power(struct sched_domain *sd, int cpu) { - unsigned long weight = cpumask_weight(sched_domain_span(sd)); + unsigned long weight = sd->span_weight; unsigned long power = SCHED_LOAD_SCALE; struct sched_group *sdg = sd->groups; @@ -2822,6 +2798,8 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle) return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } +static int active_load_balance_cpu_stop(void *data); + /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. @@ -2911,8 +2889,9 @@ redo: if (need_active_balance(sd, sd_idle, idle)) { raw_spin_lock_irqsave(&busiest->lock, flags); - /* don't kick the migration_thread, if the curr - * task on busiest cpu can't be moved to this_cpu + /* don't kick the active_load_balance_cpu_stop, + * if the curr task on busiest cpu can't be + * moved to this_cpu */ if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { @@ -2922,14 +2901,22 @@ redo: goto out_one_pinned; } + /* + * ->active_balance synchronizes accesses to + * ->active_balance_work. Once set, it's cleared + * only after active load balance is finished. + */ if (!busiest->active_balance) { busiest->active_balance = 1; busiest->push_cpu = this_cpu; active_balance = 1; } raw_spin_unlock_irqrestore(&busiest->lock, flags); + if (active_balance) - wake_up_process(busiest->migration_thread); + stop_one_cpu_nowait(cpu_of(busiest), + active_load_balance_cpu_stop, busiest, + &busiest->active_balance_work); /* * We've kicked active balancing, reset the failure @@ -3036,24 +3023,29 @@ static void idle_balance(int this_cpu, struct rq *this_rq) } /* - * active_load_balance is run by migration threads. It pushes running tasks - * off the busiest CPU onto idle CPUs. It requires at least 1 task to be - * running on each physical CPU where possible, and avoids physical / - * logical imbalances. - * - * Called with busiest_rq locked. + * active_load_balance_cpu_stop is run by cpu stopper. It pushes + * running tasks off the busiest CPU onto idle CPUs. It requires at + * least 1 task to be running on each physical CPU where possible, and + * avoids physical / logical imbalances. */ -static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) +static int active_load_balance_cpu_stop(void *data) { + struct rq *busiest_rq = data; + int busiest_cpu = cpu_of(busiest_rq); int target_cpu = busiest_rq->push_cpu; + struct rq *target_rq = cpu_rq(target_cpu); struct sched_domain *sd; - struct rq *target_rq; + + raw_spin_lock_irq(&busiest_rq->lock); + + /* make sure the requested cpu hasn't gone down in the meantime */ + if (unlikely(busiest_cpu != smp_processor_id() || + !busiest_rq->active_balance)) + goto out_unlock; /* Is there any task to move? */ if (busiest_rq->nr_running <= 1) - return; - - target_rq = cpu_rq(target_cpu); + goto out_unlock; /* * This condition is "impossible", if it occurs @@ -3082,6 +3074,10 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) schedstat_inc(sd, alb_failed); } double_unlock_balance(busiest_rq, target_rq); +out_unlock: + busiest_rq->active_balance = 0; + raw_spin_unlock_irq(&busiest_rq->lock); + return 0; } #ifdef CONFIG_NO_HZ @@ -3426,7 +3422,7 @@ static void run_rebalance_domains(struct softirq_action *h) static inline int on_null_domain(int cpu) { - return !rcu_dereference(cpu_rq(cpu)->sd); + return !rcu_dereference_sched(cpu_rq(cpu)->sd); } /*