X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=kernel%2Fsched.c;h=3c11ae0a948d9337732ce9d0e37076d84c324ff6;hb=995bcd3dc1924095ddda45d0f8ece6bf6124d74e;hp=71e8dcaf2c790eefc3ca429466061e419eb8b9d7;hpb=a021dc03376707c55a3483e32c16b8986d4414cc;p=safe%2Fjmp%2Flinux-2.6 diff --git a/kernel/sched.c b/kernel/sched.c index 71e8dca..3c11ae0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -63,22 +64,22 @@ #include #include #include -#include #include #include #include #include -#include #include #include #include -#include #include #include #include "sched_cpupri.h" +#define CREATE_TRACE_POINTS +#include + /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -118,36 +119,6 @@ */ #define RUNTIME_INF ((u64)~0ULL) -DEFINE_TRACE(sched_wait_task); -DEFINE_TRACE(sched_wakeup); -DEFINE_TRACE(sched_wakeup_new); -DEFINE_TRACE(sched_switch); -DEFINE_TRACE(sched_migrate_task); - -#ifdef CONFIG_SMP - -static void double_rq_lock(struct rq *rq1, struct rq *rq2); - -/* - * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) - * Since cpu_power is a 'constant', we can use a reciprocal divide. 
- */ -static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) -{ - return reciprocal_divide(load, sg->reciprocal_cpu_power); -} - -/* - * Each time a sched group cpu_power is changed, - * we must compute its reciprocal value - */ -static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) -{ - sg->__cpu_power += val; - sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); -} -#endif - static inline int rt_policy(int policy) { if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) @@ -231,13 +202,20 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) spin_lock(&rt_b->rt_runtime_lock); for (;;) { + unsigned long delta; + ktime_t soft, hard; + if (hrtimer_active(&rt_b->rt_period_timer)) break; now = hrtimer_cb_get_time(&rt_b->rt_period_timer); hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); - hrtimer_start_expires(&rt_b->rt_period_timer, - HRTIMER_MODE_ABS); + + soft = hrtimer_get_softexpires(&rt_b->rt_period_timer); + hard = hrtimer_get_expires(&rt_b->rt_period_timer); + delta = ktime_to_ns(ktime_sub(hard, soft)); + __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, + HRTIMER_MODE_ABS_PINNED, 0); } spin_unlock(&rt_b->rt_runtime_lock); } @@ -306,8 +284,8 @@ void set_tg_uid(struct user_struct *user) /* * Root task group. - * Every UID task group (including init_task_group aka UID-0) will - * be a child to this group. + * Every UID task group (including init_task_group aka UID-0) will + * be a child to this group. 
*/ struct task_group root_task_group; @@ -315,12 +293,12 @@ struct task_group root_task_group; /* Default task group's sched entity on each cpu */ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ -static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); -static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); #endif /* CONFIG_RT_GROUP_SCHED */ #else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group @@ -331,6 +309,8 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; */ static DEFINE_SPINLOCK(task_group_lock); +#ifdef CONFIG_FAIR_GROUP_SCHED + #ifdef CONFIG_SMP static int root_task_group_empty(void) { @@ -338,7 +318,6 @@ static int root_task_group_empty(void) } #endif -#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_USER_SCHED # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) #else /* !CONFIG_USER_SCHED */ @@ -398,13 +377,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #else -#ifdef CONFIG_SMP -static int root_task_group_empty(void) -{ - return 1; -} -#endif - static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } static inline struct task_group *task_group(struct task_struct *p) { @@ -490,6 +462,7 @@ struct rt_rq { #endif #ifdef CONFIG_SMP unsigned long rt_nr_migratory; + unsigned long rt_nr_total; int overloaded; struct plist_head pushable_tasks; #endif @@ -533,14 +506,6 @@ struct root_domain { #ifdef CONFIG_SMP struct cpupri cpupri; #endif -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - /* - * Preferred wake up cpu nominated by sched_mc balance that will be - * used when most cpus are idle in the 
system indicating overall very - * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2) - */ - unsigned int sched_mc_preferred_wakeup_cpu; -#endif }; /* @@ -577,6 +542,7 @@ struct rq { struct load_weight load; unsigned long nr_load_updates; u64 nr_switches; + u64 nr_migrations_in; struct cfs_rq cfs; struct rt_rq rt; @@ -611,6 +577,7 @@ struct rq { unsigned char idle_at_tick; /* For active balancing */ + int post_schedule; int active_balance; int push_cpu; /* cpu of this runqueue: */ @@ -621,8 +588,15 @@ struct rq { struct task_struct *migration_thread; struct list_head migration_queue; + + u64 rt_avg; + u64 age_stamp; #endif + /* calc_load related fields */ + unsigned long calc_load_update; + long calc_load_active; + #ifdef CONFIG_SCHED_HRTICK #ifdef CONFIG_SMP int hrtick_csd_pending; @@ -656,9 +630,10 @@ struct rq { static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) +static inline +void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) { - rq->curr->sched_class->check_preempt_curr(rq, p, sync); + rq->curr->sched_class->check_preempt_curr(rq, p, flags); } static inline int cpu_of(struct rq *rq) @@ -684,8 +659,9 @@ static inline int cpu_of(struct rq *rq) #define this_rq() (&__get_cpu_var(runqueues)) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +#define raw_rq() (&__raw_get_cpu_var(runqueues)) -static inline void update_rq_clock(struct rq *rq) +inline void update_rq_clock(struct rq *rq) { rq->clock = sched_clock_cpu(cpu_of(rq)); } @@ -701,20 +677,15 @@ static inline void update_rq_clock(struct rq *rq) /** * runqueue_is_locked + * @cpu: the processor in question. * * Returns true if the current cpu runqueue is locked. * This interface allows printk to be called with the runqueue lock * held and know whether or not it is OK to wake up the klogd. 
*/ -int runqueue_is_locked(void) +int runqueue_is_locked(int cpu) { - int cpu = get_cpu(); - struct rq *rq = cpu_rq(cpu); - int ret; - - ret = spin_is_locked(&rq->lock); - put_cpu(); - return ret; + return spin_is_locked(&cpu_rq(cpu)->lock); } /* @@ -811,7 +782,7 @@ static int sched_feat_open(struct inode *inode, struct file *filp) return single_open(filp, sched_feat_show, NULL); } -static struct file_operations sched_feat_fops = { +static const struct file_operations sched_feat_fops = { .open = sched_feat_open, .write = sched_feat_write, .read = seq_read, @@ -852,6 +823,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000; unsigned int sysctl_sched_shares_thresh = 4; /* + * period over which we average the RT time consumption, measured + * in ms. + * + * default: 1s + */ +const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; + +/* * period over which we measure -rt task cpu usage in us. * default: 1s */ @@ -1110,7 +1089,7 @@ static void hrtick_start(struct rq *rq, u64 delay) if (rq == this_rq()) { hrtimer_restart(timer); } else if (!rq->hrtick_csd_pending) { - __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); + __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); rq->hrtick_csd_pending = 1; } } @@ -1146,7 +1125,8 @@ static __init void init_hrtick(void) */ static void hrtick_start(struct rq *rq, u64 delay) { - hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); + __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, + HRTIMER_MODE_REL_PINNED, 0); } static inline void init_hrtick(void) @@ -1268,12 +1248,37 @@ void wake_up_idle_cpu(int cpu) } #endif /* CONFIG_NO_HZ */ +static u64 sched_avg_period(void) +{ + return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; +} + +static void sched_avg_update(struct rq *rq) +{ + s64 period = sched_avg_period(); + + while ((s64)(rq->clock - rq->age_stamp) > period) { + rq->age_stamp += period; + rq->rt_avg /= 2; + } +} + +static void sched_rt_avg_update(struct rq *rq, u64 
rt_delta) +{ + rq->rt_avg += rt_delta; + sched_avg_update(rq); +} + #else /* !CONFIG_SMP */ static void resched_task(struct task_struct *p) { assert_spin_locked(&task_rq(p)->lock); set_tsk_need_resched(p); } + +static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) +{ +} #endif /* CONFIG_SMP */ #if BITS_PER_LONG == 32 @@ -1410,10 +1415,22 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, struct rq_iterator *iterator); #endif +/* Time spent by the tasks of the cpu accounting group executing in ... */ +enum cpuacct_stat_index { + CPUACCT_STAT_USER, /* ... user mode */ + CPUACCT_STAT_SYSTEM, /* ... kernel mode */ + + CPUACCT_STAT_NSTATS, +}; + #ifdef CONFIG_CGROUP_CPUACCT static void cpuacct_charge(struct task_struct *tsk, u64 cputime); +static void cpuacct_update_stats(struct task_struct *tsk, + enum cpuacct_stat_index idx, cputime_t val); #else static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} +static inline void cpuacct_update_stats(struct task_struct *tsk, + enum cpuacct_stat_index idx, cputime_t val) {} #endif static inline void inc_cpu_load(struct rq *rq, unsigned long load) @@ -1472,8 +1489,65 @@ static int tg_nop(struct task_group *tg, void *data) #endif #ifdef CONFIG_SMP -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); +/* Used instead of source_load when we know the type == 0 */ +static unsigned long weighted_cpuload(const int cpu) +{ + return cpu_rq(cpu)->load.weight; +} + +/* + * Return a low guess at the load of a migration-source cpu weighted + * according to the scheduling class and "nice" value. + * + * We want to under-estimate the load of migration sources, to + * balance conservatively. 
+ */ +static unsigned long source_load(int cpu, int type) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); + + if (type == 0 || !sched_feat(LB_BIAS)) + return total; + + return min(rq->cpu_load[type-1], total); +} + +/* + * Return a high guess at the load of a migration-target cpu weighted + * according to the scheduling class and "nice" value. + */ +static unsigned long target_load(int cpu, int type) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); + + if (type == 0 || !sched_feat(LB_BIAS)) + return total; + + return max(rq->cpu_load[type-1], total); +} + +static struct sched_group *group_of(int cpu) +{ + struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); + + if (!sd) + return NULL; + + return sd->groups; +} + +static unsigned long power_of(int cpu) +{ + struct sched_group *group = group_of(cpu); + + if (!group) + return SCHED_LOAD_SCALE; + + return group->cpu_power; +} + static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); static unsigned long cpu_avg_load_per_task(int cpu) @@ -1491,28 +1565,31 @@ static unsigned long cpu_avg_load_per_task(int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED +static __read_mostly unsigned long *update_shares_data; + static void __set_se_shares(struct sched_entity *se, unsigned long shares); /* * Calculate and set the cpu's group shares. 
*/ -static void -update_group_shares_cpu(struct task_group *tg, int cpu, - unsigned long sd_shares, unsigned long sd_rq_weight) +static void update_group_shares_cpu(struct task_group *tg, int cpu, + unsigned long sd_shares, + unsigned long sd_rq_weight, + unsigned long *usd_rq_weight) { - unsigned long shares; - unsigned long rq_weight; - - if (!tg->se[cpu]) - return; + unsigned long shares, rq_weight; + int boost = 0; - rq_weight = tg->cfs_rq[cpu]->rq_weight; + rq_weight = usd_rq_weight[cpu]; + if (!rq_weight) { + boost = 1; + rq_weight = NICE_0_LOAD; + } /* - * \Sum shares * rq_weight - * shares = ----------------------- - * \Sum rq_weight - * + * \Sum_j shares_j * rq_weight_i + * shares_i = ----------------------------- + * \Sum_j rq_weight_j */ shares = (sd_shares * rq_weight) / sd_rq_weight; shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); @@ -1523,8 +1600,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long flags; spin_lock_irqsave(&rq->lock, flags); - tg->cfs_rq[cpu]->shares = shares; - + tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; + tg->cfs_rq[cpu]->shares = boost ? 
0 : shares; __set_se_shares(tg->se[cpu], shares); spin_unlock_irqrestore(&rq->lock, flags); } @@ -1537,22 +1614,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu, */ static int tg_shares_up(struct task_group *tg, void *data) { - unsigned long weight, rq_weight = 0; - unsigned long shares = 0; + unsigned long weight, rq_weight = 0, shares = 0; + unsigned long *usd_rq_weight; struct sched_domain *sd = data; + unsigned long flags; int i; + if (!tg->se[0]) + return 0; + + local_irq_save(flags); + usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); + for_each_cpu(i, sched_domain_span(sd)) { + weight = tg->cfs_rq[i]->load.weight; + usd_rq_weight[i] = weight; + /* * If there are currently no tasks on the cpu pretend there * is one of average load so that when a new task gets to * run here it will not get delayed by group starvation. */ - weight = tg->cfs_rq[i]->load.weight; if (!weight) weight = NICE_0_LOAD; - tg->cfs_rq[i]->rq_weight = weight; rq_weight += weight; shares += tg->cfs_rq[i]->shares; } @@ -1564,7 +1649,9 @@ static int tg_shares_up(struct task_group *tg, void *data) shares = tg->shares; for_each_cpu(i, sched_domain_span(sd)) - update_group_shares_cpu(tg, i, shares, rq_weight); + update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); + + local_irq_restore(flags); return 0; } @@ -1594,8 +1681,14 @@ static int tg_load_down(struct task_group *tg, void *data) static void update_shares(struct sched_domain *sd) { - u64 now = cpu_clock(raw_smp_processor_id()); - s64 elapsed = now - sd->last_update; + s64 elapsed; + u64 now; + + if (root_task_group_empty()) + return; + + now = cpu_clock(raw_smp_processor_id()); + elapsed = now - sd->last_update; if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { sd->last_update = now; @@ -1605,6 +1698,9 @@ static void update_shares(struct sched_domain *sd) static void update_shares_locked(struct rq *rq, struct sched_domain *sd) { + if (root_task_group_empty()) + return; + 
spin_unlock(&rq->lock); update_shares(sd); spin_lock(&rq->lock); @@ -1612,6 +1708,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) static void update_h_load(long cpu) { + if (root_task_group_empty()) + return; + walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } @@ -1629,6 +1728,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) #ifdef CONFIG_PREEMPT +static void double_rq_lock(struct rq *rq1, struct rq *rq2); + /* * fair double_lock_balance: Safely acquires both rq->locks in a fair * way at the expense of forcing extra atomic operations in all @@ -1708,6 +1809,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) } #endif +static void calc_load_account_active(struct rq *this_rq); + #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" @@ -1890,14 +1993,39 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, p->sched_class->prio_changed(rq, p, oldprio, running); } -#ifdef CONFIG_SMP - -/* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) +/** + * kthread_bind - bind a just-created kthread to a cpu. + * @p: thread created by kthread_create(). + * @cpu: cpu (might not be online, must be possible) for @k to run on. + * + * Description: This function is equivalent to set_cpus_allowed(), + * except that @cpu doesn't need to be online, and the thread must be + * stopped (i.e., just returned from kthread_create()). + * + * Function lives here instead of kthread.c because it messes with + * scheduler internals which require locking. 
+ */ +void kthread_bind(struct task_struct *p, unsigned int cpu) { - return cpu_rq(cpu)->load.weight; + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + /* Must have done schedule() in kthread() before we set_task_cpu */ + if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { + WARN_ON(1); + return; + } + + spin_lock_irqsave(&rq->lock, flags); + set_task_cpu(p, cpu); + p->cpus_allowed = cpumask_of_cpu(cpu); + p->rt.nr_cpus_allowed = 1; + p->flags |= PF_THREAD_BOUND; + spin_unlock_irqrestore(&rq->lock, flags); } +EXPORT_SYMBOL(kthread_bind); +#ifdef CONFIG_SMP /* * Is this task likely cache-hot: */ @@ -1909,7 +2037,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) /* * Buddy candidates are cache hot: */ - if (sched_feat(CACHE_HOT_BUDDY) && + if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && (&p->se == cfs_rq_of(&p->se)->next || &p->se == cfs_rq_of(&p->se)->last)) return 1; @@ -1938,7 +2066,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) clock_offset = old_rq->clock - new_rq->clock; - trace_sched_migrate_task(p, task_cpu(p), new_cpu); + trace_sched_migrate_task(p, new_cpu); #ifdef CONFIG_SCHEDSTATS if (p->se.wait_start) @@ -1947,12 +2075,17 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) p->se.sleep_start -= clock_offset; if (p->se.block_start) p->se.block_start -= clock_offset; +#endif if (old_cpu != new_cpu) { - schedstat_inc(p, se.nr_migrations); + p->se.nr_migrations++; + new_rq->nr_migrations_in++; +#ifdef CONFIG_SCHEDSTATS if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); - } #endif + perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, + 1, 1, NULL, 0); + } p->se.vruntime -= old_cfsrq->min_vruntime - new_cfsrq->min_vruntime; @@ -1995,6 +2128,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) } /* + * wait_task_context_switch - wait for a thread to complete at least one + * context switch. + * + * @p must not be current. 
+ */ +void wait_task_context_switch(struct task_struct *p) +{ + unsigned long nvcsw, nivcsw, flags; + int running; + struct rq *rq; + + nvcsw = p->nvcsw; + nivcsw = p->nivcsw; + for (;;) { + /* + * The runqueue is assigned before the actual context + * switch. We need to take the runqueue lock. + * + * We could check initially without the lock but it is + * very likely that we need to take the lock in every + * iteration. + */ + rq = task_rq_lock(p, &flags); + running = task_running(rq, p); + task_rq_unlock(rq, &flags); + + if (likely(!running)) + break; + /* + * The switch count is incremented before the actual + * context switch. We thus wait for two switches to be + * sure at least one completed. + */ + if ((p->nvcsw - nvcsw) > 1) + break; + if ((p->nivcsw - nivcsw) > 1) + break; + + cpu_relax(); + } +} + +/* * wait_task_inactive - wait for a thread to unschedule. * * If @match_state is nonzero, it's the @p->state value just checked and @@ -2122,261 +2298,94 @@ void kick_process(struct task_struct *p) smp_send_reschedule(cpu); preempt_enable(); } +EXPORT_SYMBOL_GPL(kick_process); +#endif /* CONFIG_SMP */ -/* - * Return a low guess at the load of a migration-source cpu weighted - * according to the scheduling class and "nice" value. +/** + * task_oncpu_function_call - call a function on the cpu on which a task runs + * @p: the task to evaluate + * @func: the function to be called + * @info: the function call argument * - * We want to under-estimate the load of migration sources, to - * balance conservatively. + * Calls the function @func when the task is currently running. 
This might + * be on the current CPU, which just calls the function directly */ -static unsigned long source_load(int cpu, int type) +void task_oncpu_function_call(struct task_struct *p, + void (*func) (void *info), void *info) { - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; + int cpu; - return min(rq->cpu_load[type-1], total); + preempt_disable(); + cpu = task_cpu(p); + if (task_curr(p)) + smp_call_function_single(cpu, func, info, 1); + preempt_enable(); } -/* - * Return a high guess at the load of a migration-target cpu weighted - * according to the scheduling class and "nice" value. +/*** + * try_to_wake_up - wake up a thread + * @p: the to-be-woken-up thread + * @state: the mask of task states that can be woken + * @sync: do a synchronous wakeup? + * + * Put it on the run-queue if it's not already there. The "current" + * thread is always on the run-queue (except when the actual + * re-schedule is in progress), and as such you're allowed to do + * the simpler "current->state = TASK_RUNNING" to mark yourself + * runnable without the overhead of this. + * + * returns failure only if the task is already active. */ -static unsigned long target_load(int cpu, int type) +static int try_to_wake_up(struct task_struct *p, unsigned int state, + int wake_flags) { - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; + int cpu, orig_cpu, this_cpu, success = 0; + unsigned long flags; + struct rq *rq, *orig_rq; - return max(rq->cpu_load[type-1], total); -} + if (!sched_feat(SYNC_WAKEUPS)) + wake_flags &= ~WF_SYNC; -/* - * find_idlest_group finds and returns the least busy CPU group within the - * domain. 
- */ -static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) -{ - struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; - unsigned long min_load = ULONG_MAX, this_load = 0; - int load_idx = sd->forkexec_idx; - int imbalance = 100 + (sd->imbalance_pct-100)/2; + this_cpu = get_cpu(); - do { - unsigned long load, avg_load; - int local_group; - int i; + smp_wmb(); + rq = orig_rq = task_rq_lock(p, &flags); + update_rq_clock(rq); + if (!(p->state & state)) + goto out; - /* Skip over this group if it has no CPUs allowed */ - if (!cpumask_intersects(sched_group_cpus(group), - &p->cpus_allowed)) - continue; + if (p->se.on_rq) + goto out_running; - local_group = cpumask_test_cpu(this_cpu, - sched_group_cpus(group)); + cpu = task_cpu(p); + orig_cpu = cpu; - /* Tally up the load of all CPUs in the group */ - avg_load = 0; +#ifdef CONFIG_SMP + if (unlikely(task_running(rq, p))) + goto out_activate; - for_each_cpu(i, sched_group_cpus(group)) { - /* Bias balancing toward cpus of our domain */ - if (local_group) - load = source_load(i, load_idx); - else - load = target_load(i, load_idx); - - avg_load += load; - } - - /* Adjust by relative CPU power of the group */ - avg_load = sg_div_cpu_power(group, - avg_load * SCHED_LOAD_SCALE); - - if (local_group) { - this_load = avg_load; - this = group; - } else if (avg_load < min_load) { - min_load = avg_load; - idlest = group; - } - } while (group = group->next, group != sd->groups); - - if (!idlest || 100*this_load < imbalance*min_load) - return NULL; - return idlest; -} - -/* - * find_idlest_cpu - find the idlest cpu among the cpus in group. 
- */ -static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) -{ - unsigned long load, min_load = ULONG_MAX; - int idlest = -1; - int i; - - /* Traverse only the allowed CPUs */ - for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { - load = weighted_cpuload(i); - - if (load < min_load || (load == min_load && i == this_cpu)) { - min_load = load; - idlest = i; - } - } - - return idlest; -} - -/* - * sched_balance_self: balance the current task (running on cpu) in domains - * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and - * SD_BALANCE_EXEC. - * - * Balance, ie. select the least loaded group. - * - * Returns the target CPU number, or the same CPU if no balancing is needed. - * - * preempt must be disabled. - */ -static int sched_balance_self(int cpu, int flag) -{ - struct task_struct *t = current; - struct sched_domain *tmp, *sd = NULL; - - for_each_domain(cpu, tmp) { - /* - * If power savings logic is enabled for a domain, stop there. 
- */ - if (tmp->flags & SD_POWERSAVINGS_BALANCE) - break; - if (tmp->flags & flag) - sd = tmp; - } - - if (sd) - update_shares(sd); - - while (sd) { - struct sched_group *group; - int new_cpu, weight; - - if (!(sd->flags & flag)) { - sd = sd->child; - continue; - } - - group = find_idlest_group(sd, t, cpu); - if (!group) { - sd = sd->child; - continue; - } - - new_cpu = find_idlest_cpu(group, t, cpu); - if (new_cpu == -1 || new_cpu == cpu) { - /* Now try balancing at a lower domain level of cpu */ - sd = sd->child; - continue; - } - - /* Now try balancing at a lower domain level of new_cpu */ - cpu = new_cpu; - weight = cpumask_weight(sched_domain_span(sd)); - sd = NULL; - for_each_domain(cpu, tmp) { - if (weight <= cpumask_weight(sched_domain_span(tmp))) - break; - if (tmp->flags & flag) - sd = tmp; - } - /* while loop will break here if sd == NULL */ - } - - return cpu; -} - -#endif /* CONFIG_SMP */ - -/*** - * try_to_wake_up - wake up a thread - * @p: the to-be-woken-up thread - * @state: the mask of task states that can be woken - * @sync: do a synchronous wakeup? - * - * Put it on the run-queue if it's not already there. The "current" - * thread is always on the run-queue (except when the actual - * re-schedule is in progress), and as such you're allowed to do - * the simpler "current->state = TASK_RUNNING" to mark yourself - * runnable without the overhead of this. - * - * returns failure only if the task is already active. - */ -static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) -{ - int cpu, orig_cpu, this_cpu, success = 0; - unsigned long flags; - long old_state; - struct rq *rq; - - if (!sched_feat(SYNC_WAKEUPS)) - sync = 0; - -#ifdef CONFIG_SMP - if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) { - struct sched_domain *sd; - - this_cpu = raw_smp_processor_id(); - cpu = task_cpu(p); + /* + * In order to handle concurrent wakeups and release the rq->lock + * we put the task in TASK_WAKING state. 
+ * + * First fix up the nr_uninterruptible count: + */ + if (task_contributes_to_load(p)) + rq->nr_uninterruptible--; + p->state = TASK_WAKING; + task_rq_unlock(rq, &flags); - for_each_domain(this_cpu, sd) { - if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - update_shares(sd); - break; - } - } - } -#endif + cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); + if (cpu != orig_cpu) + set_task_cpu(p, cpu); - smp_wmb(); rq = task_rq_lock(p, &flags); - update_rq_clock(rq); - old_state = p->state; - if (!(old_state & state)) - goto out; - if (p->se.on_rq) - goto out_running; + if (rq != orig_rq) + update_rq_clock(rq); + WARN_ON(p->state != TASK_WAKING); cpu = task_cpu(p); - orig_cpu = cpu; - this_cpu = smp_processor_id(); - -#ifdef CONFIG_SMP - if (unlikely(task_running(rq, p))) - goto out_activate; - - cpu = p->sched_class->select_task_rq(p, sync); - if (cpu != orig_cpu) { - set_task_cpu(p, cpu); - task_rq_unlock(rq, &flags); - /* might preempt at this point */ - rq = task_rq_lock(p, &flags); - old_state = p->state; - if (!(old_state & state)) - goto out; - if (p->se.on_rq) - goto out_running; - - this_cpu = smp_processor_id(); - cpu = task_cpu(p); - } #ifdef CONFIG_SCHEDSTATS schedstat_inc(rq, ttwu_count); @@ -2396,7 +2405,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) out_activate: #endif /* CONFIG_SMP */ schedstat_inc(p, se.nr_wakeups); - if (sync) + if (wake_flags & WF_SYNC) schedstat_inc(p, se.nr_wakeups_sync); if (orig_cpu != cpu) schedstat_inc(p, se.nr_wakeups_migrate); @@ -2425,7 +2434,7 @@ out_activate: out_running: trace_sched_wakeup(rq, p, success); - check_preempt_curr(rq, p, sync); + check_preempt_curr(rq, p, wake_flags); p->state = TASK_RUNNING; #ifdef CONFIG_SMP @@ -2434,10 +2443,22 @@ out_running: #endif out: task_rq_unlock(rq, &flags); + put_cpu(); return success; } +/** + * wake_up_process - Wake up a specific process + * @p: The process to be woken up. 
+ * + * Attempt to wake up the nominated process and move it to the set of runnable + * processes. Returns 1 if the process was woken up, 0 if it was already + * running. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ int wake_up_process(struct task_struct *p) { return try_to_wake_up(p, TASK_ALL, 0); @@ -2460,21 +2481,45 @@ static void __sched_fork(struct task_struct *p) p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; + p->se.nr_migrations = 0; p->se.last_wakeup = 0; p->se.avg_overlap = 0; p->se.start_runtime = 0; p->se.avg_wakeup = sysctl_sched_wakeup_granularity; + p->se.avg_running = 0; #ifdef CONFIG_SCHEDSTATS - p->se.wait_start = 0; - p->se.sum_sleep_runtime = 0; - p->se.sleep_start = 0; - p->se.block_start = 0; - p->se.sleep_max = 0; - p->se.block_max = 0; - p->se.exec_max = 0; - p->se.slice_max = 0; - p->se.wait_max = 0; + p->se.wait_start = 0; + p->se.wait_max = 0; + p->se.wait_count = 0; + p->se.wait_sum = 0; + + p->se.sleep_start = 0; + p->se.sleep_max = 0; + p->se.sum_sleep_runtime = 0; + + p->se.block_start = 0; + p->se.block_max = 0; + p->se.exec_max = 0; + p->se.slice_max = 0; + + p->se.nr_migrations_cold = 0; + p->se.nr_failed_migrations_affine = 0; + p->se.nr_failed_migrations_running = 0; + p->se.nr_failed_migrations_hot = 0; + p->se.nr_forced_migrations = 0; + p->se.nr_forced2_migrations = 0; + + p->se.nr_wakeups = 0; + p->se.nr_wakeups_sync = 0; + p->se.nr_wakeups_migrate = 0; + p->se.nr_wakeups_local = 0; + p->se.nr_wakeups_remote = 0; + p->se.nr_wakeups_affine = 0; + p->se.nr_wakeups_affine_attempts = 0; + p->se.nr_wakeups_passive = 0; + p->se.nr_wakeups_idle = 0; + #endif INIT_LIST_HEAD(&p->rt.run_list); @@ -2503,18 +2548,41 @@ void sched_fork(struct task_struct *p, int clone_flags) __sched_fork(p); -#ifdef CONFIG_SMP - cpu = sched_balance_self(cpu, SD_BALANCE_FORK); -#endif - set_task_cpu(p, cpu); + /* 
+ * Revert to default priority/policy on fork if requested. + */ + if (unlikely(p->sched_reset_on_fork)) { + if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { + p->policy = SCHED_NORMAL; + p->normal_prio = p->static_prio; + } + + if (PRIO_TO_NICE(p->static_prio) < 0) { + p->static_prio = NICE_TO_PRIO(0); + p->normal_prio = p->static_prio; + set_load_weight(p); + } + + /* + * We don't need the reset flag anymore after the fork. It has + * fulfilled its duty: + */ + p->sched_reset_on_fork = 0; + } /* - * Make sure we do not leak PI boosting priority to the child: + * Make sure we do not leak PI boosting priority to the child. */ p->prio = current->normal_prio; + if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; +#ifdef CONFIG_SMP + cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); +#endif + set_task_cpu(p, cpu); + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -2547,8 +2615,6 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) BUG_ON(p->state != TASK_RUNNING); update_rq_clock(rq); - p->prio = effective_prio(p); - if (!p->sched_class->task_new || !current->se.on_rq) { activate_task(rq, p, 0); } else { @@ -2560,7 +2626,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) inc_nr_running(rq); } trace_sched_wakeup_new(rq, p, 1); - check_preempt_curr(rq, p, 0); + check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) p->sched_class->task_wake_up(rq, p); @@ -2668,12 +2734,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) { struct mm_struct *mm = rq->prev_mm; long prev_state; -#ifdef CONFIG_SMP - int post_schedule = 0; - - if (current->sched_class->needs_post_schedule) - post_schedule = current->sched_class->needs_post_schedule(rq); -#endif rq->prev_mm = NULL; @@ -2690,11 +2750,8 @@ static void finish_task_switch(struct rq *rq, struct 
task_struct *prev) */ prev_state = prev->state; finish_arch_switch(prev); + perf_event_task_sched_in(current, cpu_of(rq)); finish_lock_switch(rq, prev); -#ifdef CONFIG_SMP - if (post_schedule) - current->sched_class->post_schedule(rq); -#endif fire_sched_in_preempt_notifiers(current); if (mm) @@ -2709,6 +2766,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) } } +#ifdef CONFIG_SMP + +/* assumes rq->lock is held */ +static inline void pre_schedule(struct rq *rq, struct task_struct *prev) +{ + if (prev->sched_class->pre_schedule) + prev->sched_class->pre_schedule(rq, prev); +} + +/* rq->lock is NOT held, but preemption is disabled */ +static inline void post_schedule(struct rq *rq) +{ + if (rq->post_schedule) { + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + if (rq->curr->sched_class->post_schedule) + rq->curr->sched_class->post_schedule(rq); + spin_unlock_irqrestore(&rq->lock, flags); + + rq->post_schedule = 0; + } +} + +#else + +static inline void pre_schedule(struct rq *rq, struct task_struct *p) +{ +} + +static inline void post_schedule(struct rq *rq) +{ +} + +#endif + /** * schedule_tail - first thing a freshly forked thread must call. * @prev: the thread we just switched away from. @@ -2719,6 +2812,13 @@ asmlinkage void schedule_tail(struct task_struct *prev) struct rq *rq = this_rq(); finish_task_switch(rq, prev); + + /* + * FIXME: do we need to worry about rq being invalidated by the + * task_switch? + */ + post_schedule(rq); + #ifdef __ARCH_WANT_UNLOCKED_CTXSW /* In this case, finish_task_switch does not reenable preemption */ preempt_enable(); @@ -2746,7 +2846,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * combine the page table reload and the switch backend into * one hypercall. 
*/ - arch_enter_lazy_cpu_mode(); + arch_start_context_switch(prev); if (unlikely(!mm)) { next->active_mm = oldmm; @@ -2836,19 +2936,94 @@ unsigned long nr_iowait(void) return sum; } -unsigned long nr_active(void) +unsigned long nr_iowait_cpu(void) { - unsigned long i, running = 0, uninterruptible = 0; + struct rq *this = this_rq(); + return atomic_read(&this->nr_iowait); +} - for_each_online_cpu(i) { - running += cpu_rq(i)->nr_running; - uninterruptible += cpu_rq(i)->nr_uninterruptible; - } +unsigned long this_cpu_load(void) +{ + struct rq *this = this_rq(); + return this->cpu_load[0]; +} + + +/* Variables and functions for calc_load */ +static atomic_long_t calc_load_tasks; +static unsigned long calc_load_update; +unsigned long avenrun[3]; +EXPORT_SYMBOL(avenrun); + +/** + * get_avenrun - get the load average array + * @loads: pointer to dest load array + * @offset: offset to add + * @shift: shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +{ + loads[0] = (avenrun[0] + offset) << shift; + loads[1] = (avenrun[1] + offset) << shift; + loads[2] = (avenrun[2] + offset) << shift; +} + +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ + load *= exp; + load += active * (FIXED_1 - exp); + return load >> FSHIFT; +} + +/* + * calc_load - update the avenrun load estimates 10 ticks after the + * CPUs have updated calc_load_tasks. + */ +void calc_global_load(void) +{ + unsigned long upd = calc_load_update + 10; + long active; + + if (time_before(jiffies, upd)) + return; + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? 
active * FIXED_1 : 0; + + avenrun[0] = calc_load(avenrun[0], EXP_1, active); + avenrun[1] = calc_load(avenrun[1], EXP_5, active); + avenrun[2] = calc_load(avenrun[2], EXP_15, active); + + calc_load_update += LOAD_FREQ; +} + +/* + * Either called from update_cpu_load() or from a cpu going idle + */ +static void calc_load_account_active(struct rq *this_rq) +{ + long nr_active, delta; - if (unlikely((long)uninterruptible < 0)) - uninterruptible = 0; + nr_active = this_rq->nr_running; + nr_active += (long) this_rq->nr_uninterruptible; - return running + uninterruptible; + if (nr_active != this_rq->calc_load_active) { + delta = nr_active - this_rq->calc_load_active; + this_rq->calc_load_active = nr_active; + atomic_long_add(delta, &calc_load_tasks); + } +} + +/* + * Externally visible per-cpu scheduler statistics: + * cpu_nr_migrations(cpu) - number of migrations into that cpu + */ +u64 cpu_nr_migrations(int cpu) +{ + return cpu_rq(cpu)->nr_migrations_in; } /* @@ -2879,6 +3054,11 @@ static void update_cpu_load(struct rq *this_rq) new_load += scale-1; this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; } + + if (time_after_eq(jiffies, this_rq->calc_load_update)) { + this_rq->calc_load_update += LOAD_FREQ; + calc_load_account_active(this_rq); + } } #ifdef CONFIG_SMP @@ -2968,7 +3148,7 @@ out: void sched_exec(void) { int new_cpu, this_cpu = get_cpu(); - new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); + new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0); put_cpu(); if (new_cpu != this_cpu) sched_migrate_task(current, new_cpu); @@ -3183,14 +3363,15 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, { const struct sched_class *class; - for (class = sched_class_highest; class; class = class->next) + for_each_class(class) { if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) return 1; + } return 0; } /********** Helpers for find_busiest_group ************************/ -/** +/* * sd_lb_stats - 
Structure to store the statistics of a sched_domain * during load balancing. */ @@ -3222,7 +3403,7 @@ struct sd_lb_stats { #endif }; -/** +/* * sg_lb_stats - stats of a sched_group required for load_balancing */ struct sg_lb_stats { @@ -3270,8 +3451,246 @@ static inline int get_sd_load_idx(struct sched_domain *sd, } +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +/** + * init_sd_power_savings_stats - Initialize power savings statistics for + * the given sched_domain, during load balancing. + * + * @sd: Sched domain whose power-savings statistics are to be initialized. + * @sds: Variable containing the statistics for sd. + * @idle: Idle status of the CPU at which we're performing load-balancing. + */ +static inline void init_sd_power_savings_stats(struct sched_domain *sd, + struct sd_lb_stats *sds, enum cpu_idle_type idle) +{ + /* + * Busy processors will not participate in power savings + * balance. + */ + if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + sds->power_savings_balance = 0; + else { + sds->power_savings_balance = 1; + sds->min_nr_running = ULONG_MAX; + sds->leader_nr_running = 0; + } +} + +/** + * update_sd_power_savings_stats - Update the power saving stats for a + * sched_domain while performing load balancing. + * + * @group: sched_group belonging to the sched_domain under consideration. + * @sds: Variable containing the statistics of the sched_domain + * @local_group: Does group contain the CPU for which we're performing + * load balancing ? + * @sgs: Variable containing the statistics of the group. 
+ */ +static inline void update_sd_power_savings_stats(struct sched_group *group, + struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) +{ + + if (!sds->power_savings_balance) + return; + + /* + * If the local group is idle or completely loaded + * no need to do power savings balance at this domain + */ + if (local_group && (sds->this_nr_running >= sgs->group_capacity || + !sds->this_nr_running)) + sds->power_savings_balance = 0; + + /* + * If a group is already running at full capacity or idle, + * don't include that group in power savings calculations + */ + if (!sds->power_savings_balance || + sgs->sum_nr_running >= sgs->group_capacity || + !sgs->sum_nr_running) + return; + + /* + * Calculate the group which has the least non-idle load. + * This is the group from where we need to pick up the load + * for saving power + */ + if ((sgs->sum_nr_running < sds->min_nr_running) || + (sgs->sum_nr_running == sds->min_nr_running && + group_first_cpu(group) > group_first_cpu(sds->group_min))) { + sds->group_min = group; + sds->min_nr_running = sgs->sum_nr_running; + sds->min_load_per_task = sgs->sum_weighted_load / + sgs->sum_nr_running; + } + + /* + * Calculate the group which is almost near its + * capacity but still has some space to pick up some load + * from other group and save more power + */ + if (sgs->sum_nr_running + 1 > sgs->group_capacity) + return; + + if (sgs->sum_nr_running > sds->leader_nr_running || + (sgs->sum_nr_running == sds->leader_nr_running && + group_first_cpu(group) < group_first_cpu(sds->group_leader))) { + sds->group_leader = group; + sds->leader_nr_running = sgs->sum_nr_running; + } +} + +/** + * check_power_save_busiest_group - see if there is potential for some power-savings balance + * @sds: Variable containing the statistics of the sched_domain + * under consideration. + * @this_cpu: Cpu at which we're currently performing load-balancing. + * @imbalance: Variable to store the imbalance. 
+ * + * Description: + * Check if we have potential to perform some power-savings balance. + * If yes, set the busiest group to be the least loaded group in the + * sched_domain, so that it's CPUs can be put to idle. + * + * Returns 1 if there is potential to perform power-savings balance. + * Else returns 0. + */ +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, + int this_cpu, unsigned long *imbalance) +{ + if (!sds->power_savings_balance) + return 0; + + if (sds->this != sds->group_leader || + sds->group_leader == sds->group_min) + return 0; + + *imbalance = sds->min_load_per_task; + sds->busiest = sds->group_min; + + return 1; + +} +#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ +static inline void init_sd_power_savings_stats(struct sched_domain *sd, + struct sd_lb_stats *sds, enum cpu_idle_type idle) +{ + return; +} + +static inline void update_sd_power_savings_stats(struct sched_group *group, + struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) +{ + return; +} + +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, + int this_cpu, unsigned long *imbalance) +{ + return 0; +} +#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ + + +unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) +{ + return SCHED_LOAD_SCALE; +} + +unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) +{ + return default_scale_freq_power(sd, cpu); +} + +unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) +{ + unsigned long weight = cpumask_weight(sched_domain_span(sd)); + unsigned long smt_gain = sd->smt_gain; + + smt_gain /= weight; + + return smt_gain; +} + +unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) +{ + return default_scale_smt_power(sd, cpu); +} + +unsigned long scale_rt_power(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + u64 total, available; + + sched_avg_update(rq); + + total = sched_avg_period() + (rq->clock - rq->age_stamp); + 
available = total - rq->rt_avg; + + if (unlikely((s64)total < SCHED_LOAD_SCALE)) + total = SCHED_LOAD_SCALE; + + total >>= SCHED_LOAD_SHIFT; + + return div_u64(available, total); +} + +static void update_cpu_power(struct sched_domain *sd, int cpu) +{ + unsigned long weight = cpumask_weight(sched_domain_span(sd)); + unsigned long power = SCHED_LOAD_SCALE; + struct sched_group *sdg = sd->groups; + + if (sched_feat(ARCH_POWER)) + power *= arch_scale_freq_power(sd, cpu); + else + power *= default_scale_freq_power(sd, cpu); + + power >>= SCHED_LOAD_SHIFT; + + if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { + if (sched_feat(ARCH_POWER)) + power *= arch_scale_smt_power(sd, cpu); + else + power *= default_scale_smt_power(sd, cpu); + + power >>= SCHED_LOAD_SHIFT; + } + + power *= scale_rt_power(cpu); + power >>= SCHED_LOAD_SHIFT; + + if (!power) + power = 1; + + sdg->cpu_power = power; +} + +static void update_group_power(struct sched_domain *sd, int cpu) +{ + struct sched_domain *child = sd->child; + struct sched_group *group, *sdg = sd->groups; + unsigned long power; + + if (!child) { + update_cpu_power(sd, cpu); + return; + } + + power = 0; + + group = child->groups; + do { + power += group->cpu_power; + group = group->next; + } while (group != child->groups); + + sdg->cpu_power = power; +} + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. + * @sd: The sched_domain whose statistics are to be updated. * @group: sched_group whose statistics are to be updated. * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu @@ -3282,7 +3701,8 @@ static inline int get_sd_load_idx(struct sched_domain *sd, * @balance: Should we balance. * @sgs: variable to hold the statistics for this group. 
*/ -static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, +static inline void update_sg_lb_stats(struct sched_domain *sd, + struct sched_group *group, int this_cpu, enum cpu_idle_type idle, int load_idx, int *sd_idle, int local_group, const struct cpumask *cpus, int *balance, struct sg_lb_stats *sgs) @@ -3293,8 +3713,11 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, unsigned long sum_avg_load_per_task; unsigned long avg_load_per_task; - if (local_group) + if (local_group) { balance_cpu = group_first_cpu(group); + if (balance_cpu == this_cpu) + update_group_power(sd, this_cpu); + } /* Tally up the load of all CPUs in the group */ sum_avg_load_per_task = avg_load_per_task = 0; @@ -3343,8 +3766,7 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, } /* Adjust by relative CPU power of the group */ - sgs->avg_load = sg_div_cpu_power(group, - sgs->group_load * SCHED_LOAD_SCALE); + sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; /* @@ -3356,14 +3778,14 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, * normalized nr_running number somewhere that negates * the hierarchy? 
*/ - avg_load_per_task = sg_div_cpu_power(group, - sum_avg_load_per_task * SCHED_LOAD_SCALE); + avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) / + group->cpu_power; if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) sgs->group_imb = 1; - sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; - + sgs->group_capacity = + DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); } /** @@ -3381,23 +3803,15 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, const struct cpumask *cpus, int *balance, struct sd_lb_stats *sds) { + struct sched_domain *child = sd->child; struct sched_group *group = sd->groups; struct sg_lb_stats sgs; - int load_idx; + int load_idx, prefer_sibling = 0; -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - /* - * Busy processors will not participate in power savings - * balance. - */ - if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) - sds->power_savings_balance = 0; - else { - sds->power_savings_balance = 1; - sds->min_nr_running = ULONG_MAX; - sds->leader_nr_running = 0; - } -#endif + if (child && child->flags & SD_PREFER_SIBLING) + prefer_sibling = 1; + + init_sd_power_savings_stats(sd, sds, idle); load_idx = get_sd_load_idx(sd, idle); do { @@ -3406,14 +3820,22 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(group)); memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, + update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, local_group, cpus, balance, &sgs); if (local_group && balance && !(*balance)) return; sds->total_load += sgs.group_load; - sds->total_pwr += group->__cpu_power; + sds->total_pwr += group->cpu_power; + + /* + * In case the child domain prefers tasks go to siblings + * first, lower the group capacity to one so that we'll try + * and move all the excess tasks away. 
+ */ + if (prefer_sibling) + sgs.group_capacity = min(sgs.group_capacity, 1UL); if (local_group) { sds->this_load = sgs.avg_load; @@ -3430,64 +3852,9 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, sds->group_imb = sgs.group_imb; } -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - - if (!sds->power_savings_balance) - goto group_next; - - /* - * If the local group is idle or completely loaded - * no need to do power savings balance at this domain - */ - if (local_group && - (sds->this_nr_running >= sgs.group_capacity || - !sds->this_nr_running)) - sds->power_savings_balance = 0; - - /* - * If a group is already running at full capacity or idle, - * don't include that group in power savings calculations - */ - if (!sds->power_savings_balance || - sgs.sum_nr_running >= sgs.group_capacity || - !sgs.sum_nr_running) - goto group_next; - - /* - * Calculate the group which has the least non-idle load. - * This is the group from where we need to pick up the load - * for saving power - */ - if ((sgs.sum_nr_running < sds->min_nr_running) || - (sgs.sum_nr_running == sds->min_nr_running && - group_first_cpu(group) > - group_first_cpu(sds->group_min))) { - sds->group_min = group; - sds->min_nr_running = sgs.sum_nr_running; - sds->min_load_per_task = sgs.sum_weighted_load / - sgs.sum_nr_running; - } - - /* - * Calculate the group which is almost near its - * capacity but still has some space to pick up some load - * from other group and save more power - */ - if (sgs.sum_nr_running > sgs.group_capacity - 1) - goto group_next; - - if (sgs.sum_nr_running > sds->leader_nr_running || - (sgs.sum_nr_running == sds->leader_nr_running && - group_first_cpu(group) < - group_first_cpu(sds->group_leader))) { - sds->group_leader = group; - sds->leader_nr_running = sgs.sum_nr_running; - } -group_next: -#endif + update_sd_power_savings_stats(group, sds, local_group, &sgs); group = group->next; } while (group != sd->groups); - } /** @@ -3525,28 +3892,28 
@@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, * moving them. */ - pwr_now += sds->busiest->__cpu_power * + pwr_now += sds->busiest->cpu_power * min(sds->busiest_load_per_task, sds->max_load); - pwr_now += sds->this->__cpu_power * + pwr_now += sds->this->cpu_power * min(sds->this_load_per_task, sds->this_load); pwr_now /= SCHED_LOAD_SCALE; /* Amount of load we'd subtract */ - tmp = sg_div_cpu_power(sds->busiest, - sds->busiest_load_per_task * SCHED_LOAD_SCALE); + tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / + sds->busiest->cpu_power; if (sds->max_load > tmp) - pwr_move += sds->busiest->__cpu_power * + pwr_move += sds->busiest->cpu_power * min(sds->busiest_load_per_task, sds->max_load - tmp); /* Amount of load we'd add */ - if (sds->max_load * sds->busiest->__cpu_power < + if (sds->max_load * sds->busiest->cpu_power < sds->busiest_load_per_task * SCHED_LOAD_SCALE) - tmp = sg_div_cpu_power(sds->this, - sds->max_load * sds->busiest->__cpu_power); + tmp = (sds->max_load * sds->busiest->cpu_power) / + sds->this->cpu_power; else - tmp = sg_div_cpu_power(sds->this, - sds->busiest_load_per_task * SCHED_LOAD_SCALE); - pwr_move += sds->this->__cpu_power * + tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / + sds->this->cpu_power; + pwr_move += sds->this->cpu_power * min(sds->this_load_per_task, sds->this_load + tmp); pwr_move /= SCHED_LOAD_SCALE; @@ -3581,8 +3948,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, sds->max_load - sds->busiest_load_per_task); /* How much load to actually move to equalise the imbalance */ - *imbalance = min(max_pull * sds->busiest->__cpu_power, - (sds->avg_load - sds->this_load) * sds->this->__cpu_power) + *imbalance = min(max_pull * sds->busiest->cpu_power, + (sds->avg_load - sds->this_load) * sds->this->cpu_power) / SCHED_LOAD_SCALE; /* @@ -3597,10 +3964,30 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, } /******* find_busiest_group() 
helpers end here *********************/ -/* - * find_busiest_group finds and returns the busiest CPU group within the - * domain. It calculates and returns the amount of weighted load which - * should be moved to restore balance via the imbalance parameter. +/** + * find_busiest_group - Returns the busiest group within the sched_domain + * if there is an imbalance. If there isn't an imbalance, and + * the user has opted for power-savings, it returns a group whose + * CPUs can be put to idle by rebalancing those tasks elsewhere, if + * such a group exists. + * + * Also calculates the amount of weighted load which should be moved + * to restore balance. + * + * @sd: The sched_domain whose busiest group is to be returned. + * @this_cpu: The cpu for which load balancing is currently being performed. + * @imbalance: Variable which stores amount of weighted load which should + * be moved to restore balance/put a group to idle. + * @idle: The idle status of this_cpu. + * @sd_idle: The idleness of sd + * @cpus: The set of CPUs under consideration for load-balancing. + * @balance: Pointer to a variable indicating if this_cpu + * is the appropriate cpu to perform load balancing at this_level. + * + * Returns: - the busiest group if imbalance exists. + * - If no imbalance and user has opted for power-savings balance, + * return the least loaded group whose CPUs can be + * put to idle by rebalancing its tasks onto our group. */ static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, @@ -3618,17 +4005,31 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, balance, &sds); + /* Cases where imbalance does not exist from POV of this_cpu */ + /* 1) this_cpu is not the appropriate cpu to perform load balancing + * at this level. + * 2) There is no busy sibling group to pull from. + * 3) This group is the busiest group. 
+ * 4) This group is more busy than the avg busieness at this + * sched_domain. + * 5) The imbalance is within the specified limit. + * 6) Any rebalance would lead to ping-pong + */ if (balance && !(*balance)) goto ret; - if (!sds.busiest || sds.this_load >= sds.max_load - || sds.busiest_nr_running == 0) + if (!sds.busiest || sds.busiest_nr_running == 0) + goto out_balanced; + + if (sds.this_load >= sds.max_load) goto out_balanced; sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; - if (sds.this_load >= sds.avg_load || - 100*sds.max_load <= sd->imbalance_pct * sds.this_load) + if (sds.this_load >= sds.avg_load) + goto out_balanced; + + if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) goto out_balanced; sds.busiest_load_per_task /= sds.busiest_nr_running; @@ -3655,21 +4056,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, return sds.busiest; out_balanced: -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - if (!sds.power_savings_balance) - goto ret; - - if (sds.this != sds.group_leader || sds.group_leader == sds.group_min) - goto ret; - - *imbalance = sds.min_load_per_task; - if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { - cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = - group_first_cpu(sds.group_leader); - } - return sds.group_min; - -#endif + /* + * There is no obvious imbalance. But check if we can do some balancing + * to save power. 
+ */ + if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) + return sds.busiest; ret: *imbalance = 0; return NULL; @@ -3687,15 +4079,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, int i; for_each_cpu(i, sched_group_cpus(group)) { + unsigned long power = power_of(i); + unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); unsigned long wl; if (!cpumask_test_cpu(i, cpus)) continue; rq = cpu_rq(i); - wl = weighted_cpuload(i); + wl = weighted_cpuload(i) * SCHED_LOAD_SCALE; + wl /= power; - if (rq->nr_running == 1 && wl > imbalance) + if (capacity && rq->nr_running == 1 && wl > imbalance) continue; if (wl > max_load) { @@ -3713,19 +4108,23 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, */ #define MAX_PINNED_INTERVAL 512 +/* Working cpumask for load_balance and load_balance_newidle. */ +static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); + /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. */ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, - int *balance, struct cpumask *cpus) + int *balance) { int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; unsigned long flags; + struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); cpumask_setall(cpus); @@ -3880,8 +4279,7 @@ out: * this_rq is locked. 
*/ static int -load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, - struct cpumask *cpus) +load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) { struct sched_group *group; struct rq *busiest = NULL; @@ -3889,6 +4287,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, int ld_moved = 0; int sd_idle = 0; int all_pinned = 0; + struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); cpumask_setall(cpus); @@ -4029,10 +4428,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq) struct sched_domain *sd; int pulled_task = 0; unsigned long next_balance = jiffies + HZ; - cpumask_var_t tmpmask; - - if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC)) - return; for_each_domain(this_cpu, sd) { unsigned long interval; @@ -4043,7 +4438,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) if (sd->flags & SD_BALANCE_NEWIDLE) /* If we've pulled tasks over stop searching: */ pulled_task = load_balance_newidle(this_cpu, this_rq, - sd, tmpmask); + sd); interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) @@ -4058,7 +4453,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq) */ this_rq->next_balance = next_balance; } - free_cpumask_var(tmpmask); } /* @@ -4116,10 +4510,131 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) static struct { atomic_t load_balancer; cpumask_var_t cpu_mask; + cpumask_var_t ilb_grp_nohz_mask; } nohz ____cacheline_aligned = { .load_balancer = ATOMIC_INIT(-1), }; +int get_nohz_load_balancer(void) +{ + return atomic_read(&nohz.load_balancer); +} + +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +/** + * lowest_flag_domain - Return lowest sched_domain containing flag. + * @cpu: The cpu whose lowest level of sched domain is to + * be returned. + * @flag: The flag to check for the lowest sched_domain + * for the given cpu. 
+ * + * Returns the lowest sched_domain of a cpu which contains the given flag. + */ +static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) +{ + struct sched_domain *sd; + + for_each_domain(cpu, sd) + if (sd && (sd->flags & flag)) + break; + + return sd; +} + +/** + * for_each_flag_domain - Iterates over sched_domains containing the flag. + * @cpu: The cpu whose domains we're iterating over. + * @sd: variable holding the value of the power_savings_sd + * for cpu. + * @flag: The flag to filter the sched_domains to be iterated. + * + * Iterates over all the scheduler domains for a given cpu that has the 'flag' + * set, starting from the lowest sched_domain to the highest. + */ +#define for_each_flag_domain(cpu, sd, flag) \ + for (sd = lowest_flag_domain(cpu, flag); \ + (sd && (sd->flags & flag)); sd = sd->parent) + +/** + * is_semi_idle_group - Checks if the given sched_group is semi-idle. + * @ilb_group: group to be checked for semi-idleness + * + * Returns: 1 if the group is semi-idle. 0 otherwise. + * + * We define a sched_group to be semi idle if it has atleast one idle-CPU + * and atleast one non-idle CPU. This helper function checks if the given + * sched_group is semi-idle or not. + */ +static inline int is_semi_idle_group(struct sched_group *ilb_group) +{ + cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, + sched_group_cpus(ilb_group)); + + /* + * A sched_group is semi-idle when it has atleast one busy cpu + * and atleast one idle cpu. + */ + if (cpumask_empty(nohz.ilb_grp_nohz_mask)) + return 0; + + if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) + return 0; + + return 1; +} +/** + * find_new_ilb - Finds the optimum idle load balancer for nomination. + * @cpu: The cpu which is nominating a new idle_load_balancer. + * + * Returns: Returns the id of the idle load balancer if it exists, + * Else, returns >= nr_cpu_ids. 
+ * + * This algorithm picks the idle load balancer such that it belongs to a + * semi-idle powersavings sched_domain. The idea is to try and avoid + * completely idle packages/cores just for the purpose of idle load balancing + * when there are other idle cpu's which are better suited for that job. + */ +static int find_new_ilb(int cpu) +{ + struct sched_domain *sd; + struct sched_group *ilb_group; + + /* + * Have idle load balancer selection from semi-idle packages only + * when power-aware load balancing is enabled + */ + if (!(sched_smt_power_savings || sched_mc_power_savings)) + goto out_done; + + /* + * Optimize for the case when we have no idle CPUs or only one + * idle CPU. Don't walk the sched_domain hierarchy in such cases + */ + if (cpumask_weight(nohz.cpu_mask) < 2) + goto out_done; + + for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { + ilb_group = sd->groups; + + do { + if (is_semi_idle_group(ilb_group)) + return cpumask_first(nohz.ilb_grp_nohz_mask); + + ilb_group = ilb_group->next; + + } while (ilb_group != sd->groups); + } + +out_done: + return cpumask_first(nohz.cpu_mask); +} +#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ +static inline int find_new_ilb(int call_cpu) +{ + return cpumask_first(nohz.cpu_mask); +} +#endif + /* * This routine will try to nominate the ilb (idle load balancing) * owner among the cpus whose ticks are stopped. ilb owner will do the idle @@ -4174,8 +4689,24 @@ int select_nohz_load_balancer(int stop_tick) /* make me the ilb owner */ if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) return 1; - } else if (atomic_read(&nohz.load_balancer) == cpu) + } else if (atomic_read(&nohz.load_balancer) == cpu) { + int new_ilb; + + if (!(sched_smt_power_savings || + sched_mc_power_savings)) + return 1; + /* + * Check to see if there is a more power-efficient + * ilb. 
+ */ + new_ilb = find_new_ilb(cpu); + if (new_ilb < nr_cpu_ids && new_ilb != cpu) { + atomic_set(&nohz.load_balancer, -1); + resched_cpu(new_ilb); + return 0; + } return 1; + } } else { if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) return 0; @@ -4208,11 +4739,6 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; int need_serialize; - cpumask_var_t tmp; - - /* Fails alloc? Rebalancing probably not a priority right now. */ - if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) - return; for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -4237,7 +4763,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &balance, tmp)) { + if (load_balance(cpu, rq, sd, idle, &balance)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@ -4271,8 +4797,6 @@ out: */ if (likely(update_next_balance)) rq->next_balance = next_balance; - - free_cpumask_var(tmp); } /* @@ -4351,15 +4875,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) } if (atomic_read(&nohz.load_balancer) == -1) { - /* - * simple selection for now: Nominate the - * first cpu in the nohz list to be the next - * ilb owner. - * - * TBD: Traverse the sched domains and nominate - * the nearest cpu in the nohz.cpu_mask. - */ - int ilb = cpumask_first(nohz.cpu_mask); + int ilb = find_new_ilb(cpu); if (ilb < nr_cpu_ids) resched_cpu(ilb); @@ -4406,9 +4922,25 @@ DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); /* - * Return any ns on the sched_clock that have not yet been banked in + * Return any ns on the sched_clock that have not yet been accounted in * @p in case that task is currently running. + * + * Called with task_rq_lock() held on @rq. 
*/ +static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) +{ + u64 ns = 0; + + if (task_current(rq, p)) { + update_rq_clock(rq); + ns = rq->clock - p->se.exec_start; + if ((s64)ns < 0) + ns = 0; + } + + return ns; +} + unsigned long long task_delta_exec(struct task_struct *p) { unsigned long flags; @@ -4416,16 +4948,49 @@ unsigned long long task_delta_exec(struct task_struct *p) u64 ns = 0; rq = task_rq_lock(p, &flags); + ns = do_task_delta_exec(p, rq); + task_rq_unlock(rq, &flags); - if (task_current(rq, p)) { - u64 delta_exec; + return ns; +} - update_rq_clock(rq); - delta_exec = rq->clock - p->se.exec_start; - if ((s64)delta_exec > 0) - ns = delta_exec; - } +/* + * Return accounted runtime for the task. + * In case the task is currently running, return the runtime plus current's + * pending runtime that have not been accounted yet. + */ +unsigned long long task_sched_runtime(struct task_struct *p) +{ + unsigned long flags; + struct rq *rq; + u64 ns = 0; + + rq = task_rq_lock(p, &flags); + ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); + task_rq_unlock(rq, &flags); + + return ns; +} + +/* + * Return sum_exec_runtime for the thread group. + * In case the task is currently running, return the sum plus current's + * pending runtime that have not been accounted yet. + * + * Note that the thread group might have other running tasks as well, + * so the return value not includes other pending runtime that other + * running tasks might have. 
+ */ +unsigned long long thread_group_sched_runtime(struct task_struct *p) +{ + struct task_cputime totals; + unsigned long flags; + struct rq *rq; + u64 ns; + rq = task_rq_lock(p, &flags); + thread_group_cputime(p, &totals); + ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); task_rq_unlock(rq, &flags); return ns; @@ -4454,6 +5019,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime, cpustat->nice = cputime64_add(cpustat->nice, tmp); else cpustat->user = cputime64_add(cpustat->user, tmp); + + cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime); /* Account for user time used */ acct_update_integrals(p); } @@ -4515,6 +5082,8 @@ void account_system_time(struct task_struct *p, int hardirq_offset, else cpustat->system = cputime64_add(cpustat->system, tmp); + cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); + /* Account for system time used */ acct_update_integrals(p); } @@ -4556,17 +5125,16 @@ void account_idle_time(cputime_t cputime) */ void account_process_tick(struct task_struct *p, int user_tick) { - cputime_t one_jiffy = jiffies_to_cputime(1); - cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy); + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); struct rq *rq = this_rq(); if (user_tick) - account_user_time(p, one_jiffy, one_jiffy_scaled); - else if (p != rq->idle) - account_system_time(p, HARDIRQ_OFFSET, one_jiffy, + account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) + account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, one_jiffy_scaled); else - account_idle_time(one_jiffy); + account_idle_time(cputime_one_jiffy); } /* @@ -4670,16 +5238,15 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); spin_unlock(&rq->lock); + perf_event_task_tick(curr, cpu); + #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); trigger_load_balance(rq, cpu); #endif } -#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ - 
defined(CONFIG_PREEMPT_TRACER)) - -static inline unsigned long get_parent_ip(unsigned long addr) +notrace unsigned long get_parent_ip(unsigned long addr) { if (in_lock_functions(addr)) { addr = CALLER_ADDR2; @@ -4689,6 +5256,9 @@ static inline unsigned long get_parent_ip(unsigned long addr) return addr; } +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ + defined(CONFIG_PREEMPT_TRACER)) + void __kprobes add_preempt_count(int val) { #ifdef CONFIG_DEBUG_PREEMPT @@ -4780,14 +5350,13 @@ static inline void schedule_debug(struct task_struct *prev) #endif } -static void put_prev_task(struct rq *rq, struct task_struct *prev) +static void put_prev_task(struct rq *rq, struct task_struct *p) { - if (prev->state == TASK_RUNNING) { - u64 runtime = prev->se.sum_exec_runtime; + u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; - runtime -= prev->se.prev_sum_exec_runtime; - runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); + update_avg(&p->se.avg_running, runtime); + if (p->state == TASK_RUNNING) { /* * In order to avoid avg_overlap growing stale when we are * indeed overlapping and hence not getting put to sleep, grow @@ -4797,9 +5366,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev) * correlates to the amount of cache footprint a task can * build up. 
*/ - update_avg(&prev->se.avg_overlap, runtime); + runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); + update_avg(&p->se.avg_overlap, runtime); + } else { + update_avg(&p->se.avg_running, 0); } - prev->sched_class->put_prev_task(rq, prev); + p->sched_class->put_prev_task(rq, p); } /* @@ -4848,7 +5420,7 @@ need_resched: preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); - rcu_qsctr_inc(cpu); + rcu_sched_qs(cpu); prev = rq->curr; switch_count = &prev->nivcsw; @@ -4872,10 +5444,7 @@ need_resched_nonpreemptible: switch_count = &prev->nvcsw; } -#ifdef CONFIG_SMP - if (prev->sched_class->pre_schedule) - prev->sched_class->pre_schedule(rq, prev); -#endif + pre_schedule(rq, prev); if (unlikely(!rq->nr_running)) idle_balance(cpu, rq); @@ -4885,6 +5454,7 @@ need_resched_nonpreemptible: if (likely(prev != next)) { sched_info_switch(prev, next); + perf_event_task_sched_out(prev, next, cpu); rq->nr_switches++; rq->curr = next; @@ -4900,15 +5470,78 @@ need_resched_nonpreemptible: } else spin_unlock_irq(&rq->lock); + post_schedule(rq); + if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; preempt_enable_no_resched(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + if (need_resched()) goto need_resched; } EXPORT_SYMBOL(schedule); +#ifdef CONFIG_SMP +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. + */ +int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) +{ + unsigned int cpu; + struct rq *rq; + + if (!sched_feat(OWNER_SPIN)) + return 0; + +#ifdef CONFIG_DEBUG_PAGEALLOC + /* + * Need to access the cpu field knowing that + * DEBUG_PAGEALLOC could have unmapped it if + * the mutex owner just released it and exited. + */ + if (probe_kernel_address(&owner->cpu, cpu)) + goto out; +#else + cpu = owner->cpu; +#endif + + /* + * Even if the access succeeded (likely case), + * the cpu field may no longer be valid. 
+ */ + if (cpu >= nr_cpumask_bits) + goto out; + + /* + * We need to validate that we can do a + * get_cpu() and that we have the percpu area. + */ + if (!cpu_online(cpu)) + goto out; + + rq = cpu_rq(cpu); + + for (;;) { + /* + * Owner changed, break to re-assess state. + */ + if (lock->owner != owner) + break; + + /* + * Is that owner really running on that cpu? + */ + if (task_thread_info(rq->curr) != owner || need_resched()) + return 0; + + cpu_relax(); + } +out: + return 1; +} +#endif + #ifdef CONFIG_PREEMPT /* * this is the entry point to schedule() from in-kernel preemption @@ -4970,10 +5603,10 @@ asmlinkage void __sched preempt_schedule_irq(void) #endif /* CONFIG_PREEMPT */ -int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, +int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, void *key) { - return try_to_wake_up(curr->private, mode, sync); + return try_to_wake_up(curr->private, mode, wake_flags); } EXPORT_SYMBOL(default_wake_function); @@ -4986,15 +5619,15 @@ EXPORT_SYMBOL(default_wake_function); * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns * zero in this (rare) case, and we handle it by continuing to scan the queue. 
*/ -void __wake_up_common(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, int sync, void *key) +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, int wake_flags, void *key) { wait_queue_t *curr, *next; list_for_each_entry_safe(curr, next, &q->task_list, task_list) { unsigned flags = curr->flags; - if (curr->func(curr, mode, sync, key) && + if (curr->func(curr, mode, wake_flags, key) && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) break; } @@ -5006,6 +5639,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode, * @mode: which threads * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: is directly passed to the wakeup function + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) @@ -5026,11 +5662,17 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) __wake_up_common(q, mode, 1, 0, NULL); } +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) +{ + __wake_up_common(q, mode, 1, 0, key); +} + /** - * __wake_up_sync - wake up threads blocked on a waitqueue. + * __wake_up_sync_key - wake up threads blocked on a waitqueue. * @q: the waitqueue * @mode: which threads * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: opaque value to be passed to wakeup targets * * The sync wakeup differs that the waker knows that it will schedule * away soon, so while the target thread will be woken up, it will not @@ -5038,23 +5680,35 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) * with each other. This can prevent needless bouncing between CPUs. * * On UP it can prevent extra preemption. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. 
*/ -void -__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, void *key) { unsigned long flags; - int sync = 1; + int wake_flags = WF_SYNC; if (unlikely(!q)) return; if (unlikely(!nr_exclusive)) - sync = 0; + wake_flags = 0; spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, sync, NULL); + __wake_up_common(q, mode, nr_exclusive, wake_flags, key); spin_unlock_irqrestore(&q->lock, flags); } +EXPORT_SYMBOL_GPL(__wake_up_sync_key); + +/* + * __wake_up_sync - see __wake_up_sync_key() + */ +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +{ + __wake_up_sync_key(q, mode, nr_exclusive, NULL); +} EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ /** @@ -5065,6 +5719,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ * awakened in the same order in which they were queued. * * See also complete_all(), wait_for_completion() and related routines. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void complete(struct completion *x) { @@ -5082,6 +5739,9 @@ EXPORT_SYMBOL(complete); * @x: holds the state of this particular completion * * This will wake up all threads waiting on this particular completion event. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. 
*/ void complete_all(struct completion *x) { @@ -5557,17 +6217,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy, unsigned long flags; const struct sched_class *prev_class = p->sched_class; struct rq *rq; + int reset_on_fork; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); recheck: /* double check policy once rq lock held */ - if (policy < 0) + if (policy < 0) { + reset_on_fork = p->sched_reset_on_fork; policy = oldpolicy = p->policy; - else if (policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_NORMAL && policy != SCHED_BATCH && - policy != SCHED_IDLE) - return -EINVAL; + } else { + reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); + policy &= ~SCHED_RESET_ON_FORK; + + if (policy != SCHED_FIFO && policy != SCHED_RR && + policy != SCHED_NORMAL && policy != SCHED_BATCH && + policy != SCHED_IDLE) + return -EINVAL; + } + /* * Valid priorities for SCHED_FIFO and SCHED_RR are * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, @@ -5611,6 +6279,10 @@ recheck: /* can't change other user's priorities */ if (!check_same_owner(p)) return -EPERM; + + /* Normal users shall not reset the sched_reset_on_fork flag */ + if (p->sched_reset_on_fork && !reset_on_fork) + return -EPERM; } if (user) { @@ -5654,6 +6326,8 @@ recheck: if (running) p->sched_class->put_prev_task(rq, p); + p->sched_reset_on_fork = reset_on_fork; + oldprio = p->prio; __setscheduler(rq, p, policy, param->sched_priority); @@ -5770,14 +6444,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) if (p) { retval = security_task_getscheduler(p); if (!retval) - retval = p->policy; + retval = p->policy + | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); } read_unlock(&tasklist_lock); return retval; } /** - * sys_sched_getscheduler - get the RT priority of a thread + * sys_sched_getparam - get the RT priority of a thread * @pid: the pid in question. * @param: structure containing the RT priority. 
*/ @@ -5998,27 +6673,21 @@ SYSCALL_DEFINE0(sched_yield) return 0; } +static inline int should_resched(void) +{ + return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); +} + static void __cond_resched(void) { -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP - __might_sleep(__FILE__, __LINE__); -#endif - /* - * The BKS might be reacquired before we have dropped - * PREEMPT_ACTIVE, which could trigger a second - * cond_resched() call. - */ - do { - add_preempt_count(PREEMPT_ACTIVE); - schedule(); - sub_preempt_count(PREEMPT_ACTIVE); - } while (need_resched()); + add_preempt_count(PREEMPT_ACTIVE); + schedule(); + sub_preempt_count(PREEMPT_ACTIVE); } int __sched _cond_resched(void) { - if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && - system_state == SYSTEM_RUNNING) { + if (should_resched()) { __cond_resched(); return 1; } @@ -6027,21 +6696,23 @@ int __sched _cond_resched(void) EXPORT_SYMBOL(_cond_resched); /* - * cond_resched_lock() - if a reschedule is pending, drop the given lock, + * __cond_resched_lock() - if a reschedule is pending, drop the given lock, * call schedule, and on return reacquire the lock. * * This works OK both with and without CONFIG_PREEMPT. We do strange low-level * operations here to prevent schedule() from being called twice (once via * spin_unlock(), once by hand). 
*/ -int cond_resched_lock(spinlock_t *lock) +int __cond_resched_lock(spinlock_t *lock) { - int resched = need_resched() && system_state == SYSTEM_RUNNING; + int resched = should_resched(); int ret = 0; + lockdep_assert_held(lock); + if (spin_needbreak(lock) || resched) { spin_unlock(lock); - if (resched && need_resched()) + if (resched) __cond_resched(); else cpu_relax(); @@ -6050,13 +6721,13 @@ int cond_resched_lock(spinlock_t *lock) } return ret; } -EXPORT_SYMBOL(cond_resched_lock); +EXPORT_SYMBOL(__cond_resched_lock); -int __sched cond_resched_softirq(void) +int __sched __cond_resched_softirq(void) { BUG_ON(!in_softirq()); - if (need_resched() && system_state == SYSTEM_RUNNING) { + if (should_resched()) { local_bh_enable(); __cond_resched(); local_bh_disable(); @@ -6064,7 +6735,7 @@ int __sched cond_resched_softirq(void) } return 0; } -EXPORT_SYMBOL(cond_resched_softirq); +EXPORT_SYMBOL(__cond_resched_softirq); /** * yield - yield the current processor to other threads. @@ -6082,17 +6753,16 @@ EXPORT_SYMBOL(yield); /* * This task is about to go to sleep on IO. Increment rq->nr_iowait so * that process accounting knows that this is a task in IO wait state. 
- * - * But don't do that if it is a deliberate, throttling IO wait (this task - * has set its backing_dev_info: the queue against which it should throttle) */ void __sched io_schedule(void) { - struct rq *rq = &__raw_get_cpu_var(runqueues); + struct rq *rq = raw_rq(); delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + current->in_iowait = 1; schedule(); + current->in_iowait = 0; atomic_dec(&rq->nr_iowait); delayacct_blkio_end(); } @@ -6100,12 +6770,14 @@ EXPORT_SYMBOL(io_schedule); long __sched io_schedule_timeout(long timeout) { - struct rq *rq = &__raw_get_cpu_var(runqueues); + struct rq *rq = raw_rq(); long ret; delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + current->in_iowait = 1; ret = schedule_timeout(timeout); + current->in_iowait = 0; atomic_dec(&rq->nr_iowait); delayacct_blkio_end(); return ret; @@ -6189,23 +6861,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, if (retval) goto out_unlock; - /* - * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER - * tasks that are on an otherwise idle runqueue: - */ - time_slice = 0; - if (p->policy == SCHED_RR) { - time_slice = DEF_TIMESLICE; - } else if (p->policy != SCHED_FIFO) { - struct sched_entity *se = &p->se; - unsigned long flags; - struct rq *rq; + time_slice = p->sched_class->get_rr_interval(p); - rq = task_rq_lock(p, &flags); - if (rq->cfs.load.weight) - time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); - task_rq_unlock(rq, &flags); - } read_unlock(&tasklist_lock); jiffies_to_timespec(time_slice, &t); retval = copy_to_user(interval, &t, sizeof(t)) ? 
-EFAULT : 0; @@ -6238,15 +6895,11 @@ void sched_show_task(struct task_struct *p) printk(KERN_CONT " %016lx ", thread_saved_pc(p)); #endif #ifdef CONFIG_DEBUG_STACK_USAGE - { - unsigned long *n = end_of_stack(p); - while (!*n) - n++; - free = (unsigned long)n - (unsigned long)end_of_stack(p); - } + free = stack_not_used(p); #endif - printk(KERN_CONT "%5lu %5d %6d\n", free, - task_pid_nr(p), task_pid_nr(p->real_parent)); + printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, + task_pid_nr(p), task_pid_nr(p->real_parent), + (unsigned long)task_thread_info(p)->flags); show_stack(p, NULL); } @@ -6426,8 +7079,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ + struct task_struct *mt = rq->migration_thread; + + get_task_struct(mt); task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); + put_task_struct(mt); wait_for_completion(&req.done); tlb_migrate_finish(p->mm); return 0; @@ -6485,6 +7142,11 @@ fail: return ret; } +#define RCU_MIGRATION_IDLE 0 +#define RCU_MIGRATION_NEED_QS 1 +#define RCU_MIGRATION_GOT_QS 2 +#define RCU_MIGRATION_MUST_SYNC 3 + /* * migration_thread - this is a highprio system thread that performs * thread migration by bumping thread off CPU then 'pushing' onto @@ -6492,6 +7154,7 @@ fail: */ static int migration_thread(void *data) { + int badcpu; int cpu = (long)data; struct rq *rq; @@ -6507,7 +7170,7 @@ static int migration_thread(void *data) if (cpu_is_offline(cpu)) { spin_unlock_irq(&rq->lock); - goto wait_to_die; + break; } if (rq->active_balance) { @@ -6526,23 +7189,23 @@ static int migration_thread(void *data) req = list_entry(head->next, struct migration_req, list); list_del_init(head->next); - spin_unlock(&rq->lock); - __migrate_task(req->task, cpu, req->dest_cpu); + if (req->task != NULL) { + spin_unlock(&rq->lock); + __migrate_task(req->task, cpu, req->dest_cpu); + } 
else if (likely(cpu == (badcpu = smp_processor_id()))) { + req->dest_cpu = RCU_MIGRATION_GOT_QS; + spin_unlock(&rq->lock); + } else { + req->dest_cpu = RCU_MIGRATION_MUST_SYNC; + spin_unlock(&rq->lock); + WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); + } local_irq_enable(); complete(&req->done); } __set_current_state(TASK_RUNNING); - return 0; -wait_to_die: - /* Wait for kthread_stop */ - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { - schedule(); - set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); return 0; } @@ -6725,6 +7388,15 @@ static void migrate_dead_tasks(unsigned int dead_cpu) } } + +/* + * remove the tasks which were accounted by rq from calc_load_tasks. + */ +static void calc_global_load_remove(struct rq *rq) +{ + atomic_long_sub(rq->calc_load_active, &calc_load_tasks); + rq->calc_load_active = 0; +} #endif /* CONFIG_HOTPLUG_CPU */ #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -6948,7 +7620,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) rq = task_rq_lock(p, &flags); __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); task_rq_unlock(rq, &flags); + get_task_struct(p); cpu_rq(cpu)->migration_thread = p; + rq->calc_load_update = calc_load_update; break; case CPU_ONLINE: @@ -6976,6 +7650,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) kthread_bind(cpu_rq(cpu)->migration_thread, cpumask_any(cpu_online_mask)); kthread_stop(cpu_rq(cpu)->migration_thread); + put_task_struct(cpu_rq(cpu)->migration_thread); cpu_rq(cpu)->migration_thread = NULL; break; @@ -6985,6 +7660,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) migrate_live_tasks(cpu); rq = cpu_rq(cpu); kthread_stop(rq->migration_thread); + put_task_struct(rq->migration_thread); rq->migration_thread = NULL; /* Idle task back to normal (off runqueue, low prio) */ spin_lock_irq(&rq->lock); @@ -6998,7 +7674,7 @@ 
migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) cpuset_unlock(); migrate_nr_uninterruptible(rq); BUG_ON(rq->nr_running != 0); - + calc_global_load_remove(rq); /* * No need to migrate the tasks: it was best-effort if * they didn't take sched_hotcpu_mutex. Just wake up @@ -7034,8 +7710,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_OK; } -/* Register at highest priority so that task migration (migrate_all_tasks) - * happens before everything else. +/* + * Register at high priority so that task migration (migrate_all_tasks) + * happens before everything else. This has to be lower priority than + * the notifier in the perf_event subsystem, though. */ static struct notifier_block __cpuinitdata migration_notifier = { .notifier_call = migration_call, @@ -7053,7 +7731,7 @@ static int __init migration_init(void) migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); - return err; + return 0; } early_initcall(migration_init); #endif @@ -7100,7 +7778,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!group->__cpu_power) { + if (!group->cpu_power) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: domain->cpu_power not " "set\n"); @@ -7122,7 +7800,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_or(groupmask, groupmask, sched_group_cpus(group)); cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); + printk(KERN_CONT " %s", str); + if (group->cpu_power != SCHED_LOAD_SCALE) { + printk(KERN_CONT " (cpu_power = %d)", + group->cpu_power); + } group = group->next; } while (group != sd->groups); @@ -7186,9 +7869,7 @@ static int sd_degenerate(struct sched_domain *sd) } /* Following flags don't use groups */ - if (sd->flags & (SD_WAKE_IDLE | - SD_WAKE_AFFINE | - SD_WAKE_BALANCE)) + if (sd->flags & (SD_WAKE_AFFINE)) return 0; return 1; @@ -7205,10 +7886,6 @@ 
sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) return 0; - /* Does parent contain flags not in child? */ - /* WAKE_BALANCE is a subset of WAKE_AFFINE */ - if (cflags & SD_WAKE_AFFINE) - pflags &= ~SD_WAKE_BALANCE; /* Flags needing groups don't count if only 1 group in parent */ if (parent->groups == parent->groups->next) { pflags &= ~(SD_LOAD_BALANCE | @@ -7264,7 +7941,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) rq->rd = rd; cpumask_set_cpu(rq->cpu, rd->span); - if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) + if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq); spin_unlock_irqrestore(&rq->lock, flags); @@ -7273,26 +7950,23 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) free_rootdomain(old_rd); } -static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) +static int init_rootdomain(struct root_domain *rd, bool bootmem) { + gfp_t gfp = GFP_KERNEL; + memset(rd, 0, sizeof(*rd)); - if (bootmem) { - alloc_bootmem_cpumask_var(&def_root_domain.span); - alloc_bootmem_cpumask_var(&def_root_domain.online); - alloc_bootmem_cpumask_var(&def_root_domain.rto_mask); - cpupri_init(&rd->cpupri, true); - return 0; - } + if (bootmem) + gfp = GFP_NOWAIT; - if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) + if (!alloc_cpumask_var(&rd->span, gfp)) goto out; - if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) + if (!alloc_cpumask_var(&rd->online, gfp)) goto free_span; - if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + if (!alloc_cpumask_var(&rd->rto_mask, gfp)) goto free_online; - if (cpupri_init(&rd->cpupri, false) != 0) + if (cpupri_init(&rd->cpupri, bootmem) != 0) goto free_rto_mask; return 0; @@ -7409,7 +8083,7 @@ init_sched_build_groups(const struct cpumask *span, continue; cpumask_clear(sched_group_cpus(sg)); - sg->__cpu_power = 0; + sg->cpu_power = 0; for_each_cpu(j, span) { if (group_fn(j, cpu_map, 
NULL, tmpmask) != group) @@ -7503,8 +8177,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; /* * The cpus mask in sched_group and sched_domain hangs off the end. - * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space - * for nr_cpu_ids < CONFIG_NR_CPUS. + * + * ( See the the comments in include/linux/sched.h:struct sched_group + * and struct sched_domain. ) */ struct static_sched_group { struct sched_group sg; @@ -7516,6 +8191,39 @@ struct static_sched_domain { DECLARE_BITMAP(span, CONFIG_NR_CPUS); }; +struct s_data { +#ifdef CONFIG_NUMA + int sd_allnodes; + cpumask_var_t domainspan; + cpumask_var_t covered; + cpumask_var_t notcovered; +#endif + cpumask_var_t nodemask; + cpumask_var_t this_sibling_map; + cpumask_var_t this_core_map; + cpumask_var_t send_covered; + cpumask_var_t tmpmask; + struct sched_group **sched_group_nodes; + struct root_domain *rd; +}; + +enum s_alloc { + sa_sched_groups = 0, + sa_rootdomain, + sa_tmpmask, + sa_send_covered, + sa_this_core_map, + sa_this_sibling_map, + sa_nodemask, + sa_sched_group_nodes, +#ifdef CONFIG_NUMA + sa_notcovered, + sa_covered, + sa_domainspan, +#endif + sa_none, +}; + /* * SMT sched-domains: */ @@ -7548,7 +8256,7 @@ cpu_to_core_group(int cpu, const struct cpumask *cpu_map, { int group; - cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); group = cpumask_first(mask); if (sg) *sg = &per_cpu(sched_group_core, group).sg; @@ -7577,7 +8285,7 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); group = cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) - cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); group = cpumask_first(mask); #else group = cpu; @@ -7625,7 +8333,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) struct sched_domain *sd; sd = 
&per_cpu(phys_domains, j).sd; - if (j != cpumask_first(sched_group_cpus(sd->groups))) { + if (j != group_first_cpu(sd->groups)) { /* * Only add "power" once for each * physical package. @@ -7633,11 +8341,76 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) continue; } - sg_inc_cpu_power(sg, sd->groups->__cpu_power); + sg->cpu_power += sd->groups->cpu_power; } sg = sg->next; } while (sg != group_head); } + +static int build_numa_sched_groups(struct s_data *d, + const struct cpumask *cpu_map, int num) +{ + struct sched_domain *sd; + struct sched_group *sg, *prev; + int n, j; + + cpumask_clear(d->covered); + cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); + if (cpumask_empty(d->nodemask)) { + d->sched_group_nodes[num] = NULL; + goto out; + } + + sched_domain_node_span(num, d->domainspan); + cpumask_and(d->domainspan, d->domainspan, cpu_map); + + sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, num); + if (!sg) { + printk(KERN_WARNING "Can not alloc domain group for node %d\n", + num); + return -ENOMEM; + } + d->sched_group_nodes[num] = sg; + + for_each_cpu(j, d->nodemask) { + sd = &per_cpu(node_domains, j).sd; + sd->groups = sg; + } + + sg->cpu_power = 0; + cpumask_copy(sched_group_cpus(sg), d->nodemask); + sg->next = sg; + cpumask_or(d->covered, d->covered, d->nodemask); + + prev = sg; + for (j = 0; j < nr_node_ids; j++) { + n = (num + j) % nr_node_ids; + cpumask_complement(d->notcovered, d->covered); + cpumask_and(d->tmpmask, d->notcovered, cpu_map); + cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); + if (cpumask_empty(d->tmpmask)) + break; + cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); + if (cpumask_empty(d->tmpmask)) + continue; + sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, num); + if (!sg) { + printk(KERN_WARNING + "Can not alloc domain group for node %d\n", j); + return -ENOMEM; + } + sg->cpu_power = 0; + cpumask_copy(sched_group_cpus(sg), d->tmpmask); 
+ sg->next = prev->next; + cpumask_or(d->covered, d->covered, d->tmpmask); + prev->next = sg; + prev = sg; + } +out: + return 0; +} #endif /* CONFIG_NUMA */ #ifdef CONFIG_NUMA @@ -7691,45 +8464,47 @@ static void free_sched_groups(const struct cpumask *cpu_map, * there are asymmetries in the topology. If there are asymmetries, group * having more cpu_power will pickup more load compared to the group having * less cpu_power. - * - * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents - * the maximum number of tasks a group can handle in the presence of other idle - * or lightly loaded groups in the same sched domain. */ static void init_sched_groups_power(int cpu, struct sched_domain *sd) { struct sched_domain *child; struct sched_group *group; + long power; + int weight; WARN_ON(!sd || !sd->groups); - if (cpu != cpumask_first(sched_group_cpus(sd->groups))) + if (cpu != group_first_cpu(sd->groups)) return; child = sd->child; - sd->groups->__cpu_power = 0; + sd->groups->cpu_power = 0; - /* - * For perf policy, if the groups in child domain share resources - * (for example cores sharing some portions of the cache hierarchy - * or SMT), then set this domain groups cpu_power such that each group - * can handle only one task, when there are other idle groups in the - * same sched domain. - */ - if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && - (child->flags & - (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { - sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); + if (!child) { + power = SCHED_LOAD_SCALE; + weight = cpumask_weight(sched_domain_span(sd)); + /* + * SMT siblings share the power of a single core. + * Usually multiple threads get a better yield out of + * that one core than a single thread would have, + * reflect that in sd->smt_gain. 
+ */ + if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { + power *= sd->smt_gain; + power /= weight; + power >>= SCHED_LOAD_SHIFT; + } + sd->groups->cpu_power += power; return; } /* - * add cpu_power of each child group to this groups cpu_power + * Add cpu_power of each child group to this groups cpu_power. */ group = child->groups; do { - sg_inc_cpu_power(sd->groups, group->__cpu_power); + sd->groups->cpu_power += group->cpu_power; group = group->next; } while (group != child->groups); } @@ -7778,28 +8553,221 @@ static int __init setup_relax_domain_level(char *str) if (val < SD_LV_MAX) default_relax_domain_level = val; - return 1; + return 1; +} +__setup("relax_domain_level=", setup_relax_domain_level); + +static void set_domain_attribute(struct sched_domain *sd, + struct sched_domain_attr *attr) +{ + int request; + + if (!attr || attr->relax_domain_level < 0) { + if (default_relax_domain_level < 0) + return; + else + request = default_relax_domain_level; + } else + request = attr->relax_domain_level; + if (request < sd->level) { + /* turn off idle balance on this domain */ + sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); + } else { + /* turn on idle balance on this domain */ + sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); + } +} + +static void __free_domain_allocs(struct s_data *d, enum s_alloc what, + const struct cpumask *cpu_map) +{ + switch (what) { + case sa_sched_groups: + free_sched_groups(cpu_map, d->tmpmask); /* fall through */ + d->sched_group_nodes = NULL; + case sa_rootdomain: + free_rootdomain(d->rd); /* fall through */ + case sa_tmpmask: + free_cpumask_var(d->tmpmask); /* fall through */ + case sa_send_covered: + free_cpumask_var(d->send_covered); /* fall through */ + case sa_this_core_map: + free_cpumask_var(d->this_core_map); /* fall through */ + case sa_this_sibling_map: + free_cpumask_var(d->this_sibling_map); /* fall through */ + case sa_nodemask: + free_cpumask_var(d->nodemask); /* fall through */ + case sa_sched_group_nodes: 
+#ifdef CONFIG_NUMA + kfree(d->sched_group_nodes); /* fall through */ + case sa_notcovered: + free_cpumask_var(d->notcovered); /* fall through */ + case sa_covered: + free_cpumask_var(d->covered); /* fall through */ + case sa_domainspan: + free_cpumask_var(d->domainspan); /* fall through */ +#endif + case sa_none: + break; + } +} + +static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, + const struct cpumask *cpu_map) +{ +#ifdef CONFIG_NUMA + if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) + return sa_none; + if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) + return sa_domainspan; + if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) + return sa_covered; + /* Allocate the per-node list of sched groups */ + d->sched_group_nodes = kcalloc(nr_node_ids, + sizeof(struct sched_group *), GFP_KERNEL); + if (!d->sched_group_nodes) { + printk(KERN_WARNING "Can not alloc sched group node list\n"); + return sa_notcovered; + } + sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; +#endif + if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) + return sa_sched_group_nodes; + if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) + return sa_nodemask; + if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) + return sa_this_sibling_map; + if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) + return sa_this_core_map; + if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) + return sa_send_covered; + d->rd = alloc_rootdomain(); + if (!d->rd) { + printk(KERN_WARNING "Cannot alloc root domain\n"); + return sa_tmpmask; + } + return sa_rootdomain; +} + +static struct sched_domain *__build_numa_sched_domains(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) +{ + struct sched_domain *sd = NULL; +#ifdef CONFIG_NUMA + struct sched_domain *parent; + + d->sd_allnodes = 0; + if (cpumask_weight(cpu_map) > + SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { + sd = &per_cpu(allnodes_domains, i).sd; + SD_INIT(sd, 
ALLNODES); + set_domain_attribute(sd, attr); + cpumask_copy(sched_domain_span(sd), cpu_map); + cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); + d->sd_allnodes = 1; + } + parent = sd; + + sd = &per_cpu(node_domains, i).sd; + SD_INIT(sd, NODE); + set_domain_attribute(sd, attr); + sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); + sd->parent = parent; + if (parent) + parent->child = sd; + cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); +#endif + return sd; +} + +static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *parent, int i) +{ + struct sched_domain *sd; + sd = &per_cpu(phys_domains, i).sd; + SD_INIT(sd, CPU); + set_domain_attribute(sd, attr); + cpumask_copy(sched_domain_span(sd), d->nodemask); + sd->parent = parent; + if (parent) + parent->child = sd; + cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); + return sd; +} + +static struct sched_domain *__build_mc_sched_domain(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *parent, int i) +{ + struct sched_domain *sd = parent; +#ifdef CONFIG_SCHED_MC + sd = &per_cpu(core_domains, i).sd; + SD_INIT(sd, MC); + set_domain_attribute(sd, attr); + cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); + sd->parent = parent; + parent->child = sd; + cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); +#endif + return sd; } -__setup("relax_domain_level=", setup_relax_domain_level); -static void set_domain_attribute(struct sched_domain *sd, - struct sched_domain_attr *attr) +static struct sched_domain *__build_smt_sched_domain(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *parent, int i) { - int request; + struct sched_domain *sd = parent; +#ifdef CONFIG_SCHED_SMT + sd = &per_cpu(cpu_domains, i).sd; + SD_INIT(sd, SIBLING); + 
set_domain_attribute(sd, attr); + cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); + sd->parent = parent; + parent->child = sd; + cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); +#endif + return sd; +} - if (!attr || attr->relax_domain_level < 0) { - if (default_relax_domain_level < 0) - return; - else - request = default_relax_domain_level; - } else - request = attr->relax_domain_level; - if (request < sd->level) { - /* turn off idle balance on this domain */ - sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); - } else { - /* turn on idle balance on this domain */ - sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); +static void build_sched_groups(struct s_data *d, enum sched_domain_level l, + const struct cpumask *cpu_map, int cpu) +{ + switch (l) { +#ifdef CONFIG_SCHED_SMT + case SD_LV_SIBLING: /* set up CPU (sibling) groups */ + cpumask_and(d->this_sibling_map, cpu_map, + topology_thread_cpumask(cpu)); + if (cpu == cpumask_first(d->this_sibling_map)) + init_sched_build_groups(d->this_sibling_map, cpu_map, + &cpu_to_cpu_group, + d->send_covered, d->tmpmask); + break; +#endif +#ifdef CONFIG_SCHED_MC + case SD_LV_MC: /* set up multi-core groups */ + cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); + if (cpu == cpumask_first(d->this_core_map)) + init_sched_build_groups(d->this_core_map, cpu_map, + &cpu_to_core_group, + d->send_covered, d->tmpmask); + break; +#endif + case SD_LV_CPU: /* set up physical groups */ + cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); + if (!cpumask_empty(d->nodemask)) + init_sched_build_groups(d->nodemask, cpu_map, + &cpu_to_phys_group, + d->send_covered, d->tmpmask); + break; +#ifdef CONFIG_NUMA + case SD_LV_ALLNODES: + init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, + d->send_covered, d->tmpmask); + break; +#endif + default: + break; } } @@ -7810,273 +8778,85 @@ static void set_domain_attribute(struct sched_domain *sd, static int __build_sched_domains(const struct 
cpumask *cpu_map, struct sched_domain_attr *attr) { - int i, err = -ENOMEM; - struct root_domain *rd; - cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, - tmpmask; -#ifdef CONFIG_NUMA - cpumask_var_t domainspan, covered, notcovered; - struct sched_group **sched_group_nodes = NULL; - int sd_allnodes = 0; - - if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) - goto out; - if (!alloc_cpumask_var(&covered, GFP_KERNEL)) - goto free_domainspan; - if (!alloc_cpumask_var(&notcovered, GFP_KERNEL)) - goto free_covered; -#endif - - if (!alloc_cpumask_var(&nodemask, GFP_KERNEL)) - goto free_notcovered; - if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL)) - goto free_nodemask; - if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL)) - goto free_this_sibling_map; - if (!alloc_cpumask_var(&send_covered, GFP_KERNEL)) - goto free_this_core_map; - if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) - goto free_send_covered; - + enum s_alloc alloc_state = sa_none; + struct s_data d; + struct sched_domain *sd; + int i; #ifdef CONFIG_NUMA - /* - * Allocate the per-node list of sched groups - */ - sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), - GFP_KERNEL); - if (!sched_group_nodes) { - printk(KERN_WARNING "Can not alloc sched group node list\n"); - goto free_tmpmask; - } + d.sd_allnodes = 0; #endif - rd = alloc_rootdomain(); - if (!rd) { - printk(KERN_WARNING "Cannot alloc root domain\n"); - goto free_sched_groups; - } - -#ifdef CONFIG_NUMA - sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; -#endif + alloc_state = __visit_domain_allocation_hell(&d, cpu_map); + if (alloc_state != sa_rootdomain) + goto error; + alloc_state = sa_sched_groups; /* * Set up domains for cpus specified by the cpu_map.
*/ for_each_cpu(i, cpu_map) { - struct sched_domain *sd = NULL, *p; - - cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); - -#ifdef CONFIG_NUMA - if (cpumask_weight(cpu_map) > - SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { - sd = &per_cpu(allnodes_domains, i).sd; - SD_INIT(sd, ALLNODES); - set_domain_attribute(sd, attr); - cpumask_copy(sched_domain_span(sd), cpu_map); - cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); - p = sd; - sd_allnodes = 1; - } else - p = NULL; - - sd = &per_cpu(node_domains, i).sd; - SD_INIT(sd, NODE); - set_domain_attribute(sd, attr); - sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); - sd->parent = p; - if (p) - p->child = sd; - cpumask_and(sched_domain_span(sd), - sched_domain_span(sd), cpu_map); -#endif - - p = sd; - sd = &per_cpu(phys_domains, i).sd; - SD_INIT(sd, CPU); - set_domain_attribute(sd, attr); - cpumask_copy(sched_domain_span(sd), nodemask); - sd->parent = p; - if (p) - p->child = sd; - cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); - -#ifdef CONFIG_SCHED_MC - p = sd; - sd = &per_cpu(core_domains, i).sd; - SD_INIT(sd, MC); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), cpu_map, - cpu_coregroup_mask(i)); - sd->parent = p; - p->child = sd; - cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); -#endif - -#ifdef CONFIG_SCHED_SMT - p = sd; - sd = &per_cpu(cpu_domains, i).sd; - SD_INIT(sd, SIBLING); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), - &per_cpu(cpu_sibling_map, i), cpu_map); - sd->parent = p; - p->child = sd; - cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); -#endif - } - -#ifdef CONFIG_SCHED_SMT - /* Set up CPU (sibling) groups */ - for_each_cpu(i, cpu_map) { - cpumask_and(this_sibling_map, - &per_cpu(cpu_sibling_map, i), cpu_map); - if (i != cpumask_first(this_sibling_map)) - continue; + cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), + cpu_map); - init_sched_build_groups(this_sibling_map, cpu_map, - 
&cpu_to_cpu_group, - send_covered, tmpmask); + sd = __build_numa_sched_domains(&d, cpu_map, attr, i); + sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); + sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); + sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); } -#endif -#ifdef CONFIG_SCHED_MC - /* Set up multi-core groups */ for_each_cpu(i, cpu_map) { - cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); - if (i != cpumask_first(this_core_map)) - continue; - - init_sched_build_groups(this_core_map, cpu_map, - &cpu_to_core_group, - send_covered, tmpmask); + build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); + build_sched_groups(&d, SD_LV_MC, cpu_map, i); } -#endif /* Set up physical groups */ - for (i = 0; i < nr_node_ids; i++) { - cpumask_and(nodemask, cpumask_of_node(i), cpu_map); - if (cpumask_empty(nodemask)) - continue; - - init_sched_build_groups(nodemask, cpu_map, - &cpu_to_phys_group, - send_covered, tmpmask); - } + for (i = 0; i < nr_node_ids; i++) + build_sched_groups(&d, SD_LV_CPU, cpu_map, i); #ifdef CONFIG_NUMA /* Set up node groups */ - if (sd_allnodes) { - init_sched_build_groups(cpu_map, cpu_map, - &cpu_to_allnodes_group, - send_covered, tmpmask); - } - - for (i = 0; i < nr_node_ids; i++) { - /* Set up node groups */ - struct sched_group *sg, *prev; - int j; - - cpumask_clear(covered); - cpumask_and(nodemask, cpumask_of_node(i), cpu_map); - if (cpumask_empty(nodemask)) { - sched_group_nodes[i] = NULL; - continue; - } - - sched_domain_node_span(i, domainspan); - cpumask_and(domainspan, domainspan, cpu_map); + if (d.sd_allnodes) + build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); - sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, i); - if (!sg) { - printk(KERN_WARNING "Can not alloc domain group for " - "node %d\n", i); + for (i = 0; i < nr_node_ids; i++) + if (build_numa_sched_groups(&d, cpu_map, i)) goto error; - } - sched_group_nodes[i] = sg; - for_each_cpu(j, nodemask) { - struct sched_domain 
*sd; - - sd = &per_cpu(node_domains, j).sd; - sd->groups = sg; - } - sg->__cpu_power = 0; - cpumask_copy(sched_group_cpus(sg), nodemask); - sg->next = sg; - cpumask_or(covered, covered, nodemask); - prev = sg; - - for (j = 0; j < nr_node_ids; j++) { - int n = (i + j) % nr_node_ids; - - cpumask_complement(notcovered, covered); - cpumask_and(tmpmask, notcovered, cpu_map); - cpumask_and(tmpmask, tmpmask, domainspan); - if (cpumask_empty(tmpmask)) - break; - - cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); - if (cpumask_empty(tmpmask)) - continue; - - sg = kmalloc_node(sizeof(struct sched_group) + - cpumask_size(), - GFP_KERNEL, i); - if (!sg) { - printk(KERN_WARNING - "Can not alloc domain group for node %d\n", j); - goto error; - } - sg->__cpu_power = 0; - cpumask_copy(sched_group_cpus(sg), tmpmask); - sg->next = prev->next; - cpumask_or(covered, covered, tmpmask); - prev->next = sg; - prev = sg; - } - } #endif /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT for_each_cpu(i, cpu_map) { - struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; - + sd = &per_cpu(cpu_domains, i).sd; init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC for_each_cpu(i, cpu_map) { - struct sched_domain *sd = &per_cpu(core_domains, i).sd; - + sd = &per_cpu(core_domains, i).sd; init_sched_groups_power(i, sd); } #endif for_each_cpu(i, cpu_map) { - struct sched_domain *sd = &per_cpu(phys_domains, i).sd; - + sd = &per_cpu(phys_domains, i).sd; init_sched_groups_power(i, sd); } #ifdef CONFIG_NUMA for (i = 0; i < nr_node_ids; i++) - init_numa_sched_groups_power(sched_group_nodes[i]); + init_numa_sched_groups_power(d.sched_group_nodes[i]); - if (sd_allnodes) { + if (d.sd_allnodes) { struct sched_group *sg; cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, - tmpmask); + d.tmpmask); init_numa_sched_groups_power(sg); } #endif /* Attach the domains */ for_each_cpu(i, cpu_map) { - struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT sd = 
&per_cpu(cpu_domains, i).sd; #elif defined(CONFIG_SCHED_MC) @@ -8084,44 +8864,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map, #else sd = &per_cpu(phys_domains, i).sd; #endif - cpu_attach_domain(sd, rd, i); + cpu_attach_domain(sd, d.rd, i); } - err = 0; - -free_tmpmask: - free_cpumask_var(tmpmask); -free_send_covered: - free_cpumask_var(send_covered); -free_this_core_map: - free_cpumask_var(this_core_map); -free_this_sibling_map: - free_cpumask_var(this_sibling_map); -free_nodemask: - free_cpumask_var(nodemask); -free_notcovered: -#ifdef CONFIG_NUMA - free_cpumask_var(notcovered); -free_covered: - free_cpumask_var(covered); -free_domainspan: - free_cpumask_var(domainspan); -out: -#endif - return err; - -free_sched_groups: -#ifdef CONFIG_NUMA - kfree(sched_group_nodes); -#endif - goto free_tmpmask; + d.sched_group_nodes = NULL; /* don't free this we still need it */ + __free_domain_allocs(&d, sa_tmpmask, cpu_map); + return 0; -#ifdef CONFIG_NUMA error: - free_sched_groups(cpu_map, tmpmask); - free_rootdomain(rd); - goto free_tmpmask; -#endif + __free_domain_allocs(&d, alloc_state, cpu_map); + return -ENOMEM; } static int build_sched_domains(const struct cpumask *cpu_map) @@ -8440,6 +9192,7 @@ void __init sched_init_smp(void) cpumask_var_t non_isolated_cpus; alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); + alloc_cpumask_var(&fallback_doms, GFP_KERNEL); #if defined(CONFIG_NUMA) sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), @@ -8471,7 +9224,6 @@ void __init sched_init_smp(void) sched_init_granularity(); free_cpumask_var(non_isolated_cpus); - alloc_cpumask_var(&fallback_doms, GFP_KERNEL); init_sched_rt_class(); } #else @@ -8481,6 +9233,8 @@ void __init sched_init_smp(void) } #endif /* CONFIG_SMP */ +const_debug unsigned int sysctl_timer_migration = 1; + int in_sched_functions(unsigned long addr) { return in_lock_functions(addr) || @@ -8520,7 +9274,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) #ifdef CONFIG_SMP 
rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; - plist_head_init(&rq->rt.pushable_tasks, &rq->lock); + plist_head_init(&rt_rq->pushable_tasks, &rq->lock); #endif rt_rq->rt_time = 0; @@ -8607,12 +9361,15 @@ void __init sched_init(void) #ifdef CONFIG_USER_SCHED alloc_size *= 2; #endif +#ifdef CONFIG_CPUMASK_OFFSTACK + alloc_size += num_possible_cpus() * cpumask_size(); +#endif /* * As sched_init() is called before page_alloc is setup, * we use alloc_bootmem(). */ if (alloc_size) { - ptr = (unsigned long)alloc_bootmem(alloc_size); + ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); #ifdef CONFIG_FAIR_GROUP_SCHED init_task_group.se = (struct sched_entity **)ptr; @@ -8644,6 +9401,12 @@ void __init sched_init(void) ptr += nr_cpu_ids * sizeof(void **); #endif /* CONFIG_USER_SCHED */ #endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_CPUMASK_OFFSTACK + for_each_possible_cpu(i) { + per_cpu(load_balance_tmpmask, i) = (void *)ptr; + ptr += cpumask_size(); + } +#endif /* CONFIG_CPUMASK_OFFSTACK */ } #ifdef CONFIG_SMP @@ -8673,12 +9436,18 @@ void __init sched_init(void) #endif /* CONFIG_USER_SCHED */ #endif /* CONFIG_GROUP_SCHED */ +#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP + update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), + __alignof__(unsigned long)); +#endif for_each_possible_cpu(i) { struct rq *rq; rq = cpu_rq(i); spin_lock_init(&rq->lock); rq->nr_running = 0; + rq->calc_load_active = 0; + rq->calc_load_update = jiffies + LOAD_FREQ; init_cfs_rq(&rq->cfs, rq); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -8699,7 +9468,7 @@ void __init sched_init(void) * 1024) and two child groups A0 and A1 (of weight 1024 each), * then A0's share of the cpu resource is: * - * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% + * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% * * We achieve this by letting init_task_group's tasks sit * directly in rq->cfs (i.e init_task_group->se[] = NULL). 
@@ -8716,11 +9485,11 @@ void __init sched_init(void) * system cpu resource, based on the weight assigned to root * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished * by letting tasks of init_task_group sit in a separate cfs_rq - * (init_cfs_rq) and having one entity represent this group of + * (init_tg_cfs_rq) and having one entity represent this group of * tasks in rq->cfs (i.e init_task_group->se[] != NULL). */ init_tg_cfs_entry(&init_task_group, - &per_cpu(init_cfs_rq, i), + &per_cpu(init_tg_cfs_rq, i), &per_cpu(init_sched_entity, i), i, 1, root_task_group.se[i]); @@ -8746,6 +9515,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; + rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; rq->push_cpu = 0; @@ -8786,31 +9556,44 @@ void __init sched_init(void) * when this runqueue becomes "idle". */ init_idle(current, smp_processor_id()); + + calc_load_update = jiffies + LOAD_FREQ; + /* * During early bootup we pretend to be a normal task: */ current->sched_class = &fair_sched_class; /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ - alloc_bootmem_cpumask_var(&nohz_cpu_mask); + zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ - alloc_bootmem_cpumask_var(&nohz.cpu_mask); + zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); + alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); #endif - alloc_bootmem_cpumask_var(&cpu_isolated_map); + zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); #endif /* SMP */ + perf_event_init(); + scheduler_running = 1; } #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP -void __might_sleep(char *file, int line) +static inline int preempt_count_equals(int preempt_offset) +{ + int nested = preempt_count() & ~PREEMPT_ACTIVE; + + return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); +} + +void __might_sleep(char *file, int line, int preempt_offset) { #ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ - if ((!in_atomic() && 
!irqs_disabled()) || - system_state != SYSTEM_RUNNING || oops_in_progress) + if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || + system_state != SYSTEM_RUNNING || oops_in_progress) return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; @@ -9541,6 +10324,13 @@ static int sched_rt_global_constraints(void) if (sysctl_sched_rt_period <= 0) return -EINVAL; + /* + * There's always some RT tasks in the root group + * -- migration, kstopmachine etc.. + */ + if (sysctl_sched_rt_runtime == 0) + return -EBUSY; + spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { struct rt_rq *rt_rq = &cpu_rq(i)->rt; @@ -9556,7 +10346,7 @@ static int sched_rt_global_constraints(void) #endif /* CONFIG_RT_GROUP_SCHED */ int sched_rt_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -9567,7 +10357,7 @@ int sched_rt_handler(struct ctl_table *table, int write, old_period = sysctl_sched_rt_period; old_runtime = sysctl_sched_rt_runtime; - ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret && write) { ret = sched_rt_global_constraints(); @@ -9621,8 +10411,7 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) } static int -cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct task_struct *tsk) +cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) @@ -9632,15 +10421,45 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, if (tsk->sched_class != &fair_sched_class) return -EINVAL; #endif + return 0; +} +static int +cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct task_struct *tsk, bool threadgroup) +{ + int retval = cpu_cgroup_can_attach_task(cgrp, tsk); + if 
(retval) + return retval; + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { + retval = cpu_cgroup_can_attach_task(cgrp, c); + if (retval) { + rcu_read_unlock(); + return retval; + } + } + rcu_read_unlock(); + } return 0; } static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cont, struct task_struct *tsk) + struct cgroup *old_cont, struct task_struct *tsk, + bool threadgroup) { sched_move_task(tsk); + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { + sched_move_task(c); + } + rcu_read_unlock(); + } } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -9736,6 +10555,7 @@ struct cpuacct { struct cgroup_subsys_state css; /* cpuusage holds pointer to a u64-type object on every cpu */ u64 *cpuusage; + struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; struct cpuacct *parent; }; @@ -9760,20 +10580,32 @@ static struct cgroup_subsys_state *cpuacct_create( struct cgroup_subsys *ss, struct cgroup *cgrp) { struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); + int i; if (!ca) - return ERR_PTR(-ENOMEM); + goto out; ca->cpuusage = alloc_percpu(u64); - if (!ca->cpuusage) { - kfree(ca); - return ERR_PTR(-ENOMEM); - } + if (!ca->cpuusage) + goto out_free_ca; + + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) + if (percpu_counter_init(&ca->cpustat[i], 0)) + goto out_free_counters; if (cgrp->parent) ca->parent = cgroup_ca(cgrp->parent); return &ca->css; + +out_free_counters: + while (--i >= 0) + percpu_counter_destroy(&ca->cpustat[i]); + free_percpu(ca->cpuusage); +out_free_ca: + kfree(ca); +out: + return ERR_PTR(-ENOMEM); } /* destroy an existing cpu accounting group */ @@ -9781,14 +10613,17 @@ static void cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) { struct cpuacct *ca = cgroup_ca(cgrp); + int i; + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) + 
percpu_counter_destroy(&ca->cpustat[i]); free_percpu(ca->cpuusage); kfree(ca); } static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); u64 data; #ifndef CONFIG_64BIT @@ -9807,7 +10642,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); #ifndef CONFIG_64BIT /* @@ -9868,6 +10703,25 @@ static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, return 0; } +static const char *cpuacct_stat_desc[] = { + [CPUACCT_STAT_USER] = "user", + [CPUACCT_STAT_SYSTEM] = "system", +}; + +static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int i; + + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { + s64 val = percpu_counter_read(&ca->cpustat[i]); + val = cputime64_to_clock_t(val); + cb->fill(cb, cpuacct_stat_desc[i], val); + } + return 0; +} + static struct cftype files[] = { { .name = "usage", @@ -9878,7 +10732,10 @@ static struct cftype files[] = { .name = "usage_percpu", .read_seq_string = cpuacct_percpu_seq_read, }, - + { + .name = "stat", + .read_map = cpuacct_stats_show, + }, }; static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) @@ -9900,12 +10757,38 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) return; cpu = task_cpu(tsk); + + rcu_read_lock(); + ca = task_ca(tsk); for (; ca; ca = ca->parent) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); *cpuusage += cputime; } + + rcu_read_unlock(); +} + +/* + * Charge the system/user time to the task's accounting group. 
+ */ +static void cpuacct_update_stats(struct task_struct *tsk, + enum cpuacct_stat_index idx, cputime_t val) +{ + struct cpuacct *ca; + + if (unlikely(!cpuacct_subsys.active)) + return; + + rcu_read_lock(); + ca = task_ca(tsk); + + do { + percpu_counter_add(&ca->cpustat[idx], val); + ca = ca->parent; + } while (ca); + rcu_read_unlock(); } struct cgroup_subsys cpuacct_subsys = { @@ -9916,3 +10799,113 @@ struct cgroup_subsys cpuacct_subsys = { .subsys_id = cpuacct_subsys_id, }; #endif /* CONFIG_CGROUP_CPUACCT */ + +#ifndef CONFIG_SMP + +int rcu_expedited_torture_stats(char *page) +{ + return 0; +} +EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); + +void synchronize_sched_expedited(void) +{ +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#else /* #ifndef CONFIG_SMP */ + +static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); +static DEFINE_MUTEX(rcu_sched_expedited_mutex); + +#define RCU_EXPEDITED_STATE_POST -2 +#define RCU_EXPEDITED_STATE_IDLE -1 + +static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; + +int rcu_expedited_torture_stats(char *page) +{ + int cnt = 0; + int cpu; + + cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); + for_each_online_cpu(cpu) { + cnt += sprintf(&page[cnt], " %d:%d", + cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); + } + cnt += sprintf(&page[cnt], "\n"); + return cnt; +} +EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); + +static long synchronize_sched_expedited_count; + +/* + * Wait for an rcu-sched grace period to elapse, but use "big hammer" + * approach to force grace period to end quickly. This consumes + * significant time on all CPUs, and is thus not recommended for + * any sort of common-case code. + * + * Note that it is illegal to call this function while holding any + * lock that is acquired by a CPU-hotplug notifier. Failing to + * observe this restriction will result in deadlock. 
+ */ +void synchronize_sched_expedited(void) +{ + int cpu; + unsigned long flags; + bool need_full_sync = 0; + struct rq *rq; + struct migration_req *req; + long snap; + int trycount = 0; + + smp_mb(); /* ensure prior mod happens before capturing snap. */ + snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; + get_online_cpus(); + while (!mutex_trylock(&rcu_sched_expedited_mutex)) { + put_online_cpus(); + if (trycount++ < 10) + udelay(trycount * num_online_cpus()); + else { + synchronize_sched(); + return; + } + if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { + smp_mb(); /* ensure test happens before caller kfree */ + return; + } + get_online_cpus(); + } + rcu_expedited_state = RCU_EXPEDITED_STATE_POST; + for_each_online_cpu(cpu) { + rq = cpu_rq(cpu); + req = &per_cpu(rcu_migration_req, cpu); + init_completion(&req->done); + req->task = NULL; + req->dest_cpu = RCU_MIGRATION_NEED_QS; + spin_lock_irqsave(&rq->lock, flags); + list_add(&req->list, &rq->migration_queue); + spin_unlock_irqrestore(&rq->lock, flags); + wake_up_process(rq->migration_thread); + } + for_each_online_cpu(cpu) { + rcu_expedited_state = cpu; + req = &per_cpu(rcu_migration_req, cpu); + rq = cpu_rq(cpu); + wait_for_completion(&req->done); + spin_lock_irqsave(&rq->lock, flags); + if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) + need_full_sync = 1; + req->dest_cpu = RCU_MIGRATION_IDLE; + spin_unlock_irqrestore(&rq->lock, flags); + } + rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; + mutex_unlock(&rcu_sched_expedited_mutex); + put_online_cpus(); + if (need_full_sync) + synchronize_sched(); +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#endif /* #else #ifndef CONFIG_SMP */