diff --git a/kernel/sched.c b/kernel/sched.c
index f96be93..9ab3cd7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -602,6 +602,11 @@ static inline int cpu_of(struct rq *rq)
 #endif
 }
 
+#define rcu_dereference_check_sched_domain(p) \
+	rcu_dereference_check((p), \
+			      rcu_read_lock_sched_held() || \
+			      lockdep_is_held(&sched_domains_mutex))
+
 /*
  * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
  * See detach_destroy_domains: synchronize_sched for details.
@@ -610,7 +615,7 @@ static inline int cpu_of(struct rq *rq)
  * preempt-disabled sections.
  */
 #define for_each_domain(cpu, __sd) \
-	for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
+	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
 
 #define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
 #define this_rq()		(&__get_cpu_var(runqueues))
@@ -898,16 +903,33 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
 /*
+ * Check whether the task is waking, we use this to synchronize against
+ * ttwu() so that task_cpu() reports a stable number.
+ *
+ * We need to make an exception for PF_STARTING tasks because the fork
+ * path might require task_rq_lock() to work, eg. it can call
+ * set_cpus_allowed_ptr() from the cpuset clone_ns code.
+ */
+static inline int task_is_waking(struct task_struct *p)
+{
+	return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
+}
+
+/*
  * __task_rq_lock - lock the runqueue a given task resides on.
  * Must be called interrupts disabled.
  */
 static inline struct rq *__task_rq_lock(struct task_struct *p)
 	__acquires(rq->lock)
 {
+	struct rq *rq;
+
 	for (;;) {
-		struct rq *rq = task_rq(p);
+		while (task_is_waking(p))
+			cpu_relax();
+		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
-		if (likely(rq == task_rq(p)))
+		if (likely(rq == task_rq(p) && !task_is_waking(p)))
 			return rq;
 		raw_spin_unlock(&rq->lock);
 	}
@@ -924,10 +946,12 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
 	struct rq *rq;
 
 	for (;;) {
+		while (task_is_waking(p))
+			cpu_relax();
 		local_irq_save(*flags);
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
-		if (likely(rq == task_rq(p)))
+		if (likely(rq == task_rq(p) && !task_is_waking(p)))
 			return rq;
 		raw_spin_unlock_irqrestore(&rq->lock, *flags);
 	}
@@ -1462,7 +1486,7 @@ static unsigned long target_load(int cpu, int type)
 
 static struct sched_group *group_of(int cpu)
 {
-	struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
+	struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
 
 	if (!sd)
 		return NULL;
@@ -1497,7 +1521,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-static __read_mostly unsigned long *update_shares_data;
+static __read_mostly unsigned long __percpu *update_shares_data;
 
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 
@@ -2335,7 +2359,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 {
 	int cpu, orig_cpu, this_cpu, success = 0;
 	unsigned long flags;
-	struct rq *rq, *orig_rq;
+	struct rq *rq;
 
 	if (!sched_feat(SYNC_WAKEUPS))
 		wake_flags &= ~WF_SYNC;
@@ -2343,7 +2367,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 	this_cpu = get_cpu();
 
 	smp_wmb();
-	rq = orig_rq = task_rq_lock(p, &flags);
+	rq = task_rq_lock(p, &flags);
 	update_rq_clock(rq);
 	if (!(p->state & state))
 		goto out;
@@ -2374,14 +2398,27 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 	__task_rq_unlock(rq);
 
 	cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
-	if (cpu != orig_cpu)
+	if (cpu != orig_cpu) {
+		/*
+		 * Since we migrate the task without holding any rq->lock,
+		 * we need to be careful with task_rq_lock(), since that
+		 * might end up locking an invalid rq.
+		 */
 		set_task_cpu(p, cpu);
+	}
 
-	rq = __task_rq_lock(p);
+	rq = cpu_rq(cpu);
+	raw_spin_lock(&rq->lock);
 	update_rq_clock(rq);
 
+	/*
+	 * We migrated the task without holding either rq->lock, however
+	 * since the task is not on the task list itself, nobody else
+	 * will try and migrate the task, hence the rq should match the
+	 * cpu we just moved it to.
+	 */
+	WARN_ON(task_cpu(p) != cpu);
 	WARN_ON(p->state != TASK_WAKING);
-	cpu = task_cpu(p);
 
 #ifdef CONFIG_SCHEDSTATS
 	schedstat_inc(rq, ttwu_count);
@@ -2613,7 +2650,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 {
 	unsigned long flags;
 	struct rq *rq;
-	int cpu __maybe_unused = get_cpu();
+	int cpu = get_cpu();
 
 #ifdef CONFIG_SMP
 	/*
@@ -2629,7 +2666,13 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 	set_task_cpu(p, cpu);
 #endif
 
-	rq = task_rq_lock(p, &flags);
+	/*
+	 * Since the task is not on the rq and we still have TASK_WAKING set
+	 * nobody else will migrate this task.
+	 */
+	rq = cpu_rq(cpu);
+	raw_spin_lock_irqsave(&rq->lock, flags);
+
 	BUG_ON(p->state != TASK_WAKING);
 	p->state = TASK_RUNNING;
 	update_rq_clock(rq);
@@ -2760,7 +2803,13 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	 */
 	prev_state = prev->state;
 	finish_arch_switch(prev);
-	perf_event_task_sched_in(current, cpu_of(rq));
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+	local_irq_disable();
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+	perf_event_task_sched_in(current);
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+	local_irq_enable();
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
 	finish_lock_switch(rq, prev);
 
 	fire_sched_in_preempt_notifiers(current);
@@ -3466,7 +3515,7 @@ void scheduler_tick(void)
 	curr->sched_class->task_tick(rq, curr, 0);
 	raw_spin_unlock(&rq->lock);
 
-	perf_event_task_tick(curr, cpu);
+	perf_event_task_tick(curr);
 
 #ifdef CONFIG_SMP
 	rq->idle_at_tick = idle_cpu(cpu);
@@ -3680,7 +3729,7 @@ need_resched_nonpreemptible:
 
 	if (likely(prev != next)) {
 		sched_info_switch(prev, next);
-		perf_event_task_sched_out(prev, next, cpu);
+		perf_event_task_sched_out(prev, next);
 
 		rq->nr_switches++;
 		rq->curr = next;
@@ -4211,7 +4260,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	unsigned long flags;
 	int oldprio, on_rq, running;
 	struct rq *rq;
-	const struct sched_class *prev_class = p->sched_class;
+	const struct sched_class *prev_class;
 
 	BUG_ON(prio < 0 || prio > MAX_PRIO);
 
@@ -4219,6 +4268,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	update_rq_clock(rq);
 
 	oldprio = p->prio;
+	prev_class = p->sched_class;
 	on_rq = p->se.on_rq;
 	running = task_current(rq, p);
 	if (on_rq)
@@ -4303,7 +4353,7 @@ int can_nice(const struct task_struct *p, const int nice)
 	/* convert nice value [19,-20] to rlimit style value [1,40] */
 	int nice_rlim = 20 - nice;
 
-	return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
+	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
 		capable(CAP_SYS_NICE));
 }
 
@@ -4438,7 +4488,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
 {
 	int retval, oldprio, oldpolicy = -1, on_rq, running;
 	unsigned long flags;
-	const struct sched_class *prev_class = p->sched_class;
+	const struct sched_class *prev_class;
 	struct rq *rq;
 	int reset_on_fork;
 
@@ -4480,7 +4530,7 @@ recheck:
 
 		if (!lock_task_sighand(p, &flags))
 			return -ESRCH;
-		rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
+		rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
 		unlock_task_sighand(p, &flags);
 
 		/* can't set/change the rt policy */
@@ -4552,6 +4602,7 @@ recheck:
 	p->sched_reset_on_fork = reset_on_fork;
 
 	oldprio = p->prio;
+	prev_class = p->sched_class;
 	__setscheduler(rq, p, policy, param->sched_priority);
 
 	if (running)
@@ -5302,27 +5353,8 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 	struct rq *rq;
 	int ret = 0;
-	/*
-	 * Since we rely on wake-ups to migrate sleeping tasks, don't change
-	 * the ->cpus_allowed mask from under waking tasks, which would be
-	 * possible when we change rq->lock in ttwu(), so synchronize against
-	 * TASK_WAKING to avoid that.
-	 *
-	 * Make an exception for freshly cloned tasks, since cpuset namespaces
-	 * might move the task about, we have to validate the target in
-	 * wake_up_new_task() anyway since the cpu might have gone away.
-	 */
-again:
-	while (p->state == TASK_WAKING && !(p->flags & PF_STARTING))
-		cpu_relax();
-
 	rq = task_rq_lock(p, &flags);
 
-	if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) {
-		task_rq_unlock(rq, &flags);
-		goto again;
-	}
-
 	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -7374,11 +7406,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
 
 #ifdef CONFIG_SCHED_MC
 static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
+					   struct sysdev_class_attribute *attr,
 					   char *page)
 {
 	return sprintf(page, "%u\n", sched_mc_power_savings);
 }
 static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
+					    struct sysdev_class_attribute *attr,
 					    const char *buf, size_t count)
 {
 	return sched_power_savings_store(buf, count, 0);
@@ -7390,11 +7424,13 @@ static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
 
 #ifdef CONFIG_SCHED_SMT
 static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
+					    struct sysdev_class_attribute *attr,
 					    char *page)
 {
 	return sprintf(page, "%u\n", sched_smt_power_savings);
 }
 static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
+					     struct sysdev_class_attribute *attr,
 					     const char *buf, size_t count)
 {
 	return sched_power_savings_store(buf, count, 1);
@@ -8781,7 +8817,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
 struct cpuacct {
 	struct cgroup_subsys_state css;
 	/* cpuusage holds pointer to a u64-type object on every cpu */
-	u64 *cpuusage;
+	u64 __percpu *cpuusage;
 	struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
 	struct cpuacct *parent;
 };
@@ -8998,12 +9034,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 }
 
 /*
+ * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
+ * in cputime_t units. As a result, cpuacct_update_stats calls
+ * percpu_counter_add with values large enough to always overflow the
+ * per cpu batch limit causing bad SMP scalability.
+ *
+ * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
+ * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
+ * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
+ */
+#ifdef CONFIG_SMP
+#define CPUACCT_BATCH \
+	min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
+#else
+#define CPUACCT_BATCH	0
+#endif
+
+/*
  * Charge the system/user time to the task's accounting group.
  */
 static void cpuacct_update_stats(struct task_struct *tsk,
 		enum cpuacct_stat_index idx, cputime_t val)
 {
 	struct cpuacct *ca;
+	int batch = CPUACCT_BATCH;
 
 	if (unlikely(!cpuacct_subsys.active))
 		return;
@@ -9012,7 +9066,7 @@ static void cpuacct_update_stats(struct task_struct *tsk,
 	ca = task_ca(tsk);
 
 	do {
-		percpu_counter_add(&ca->cpustat[idx], val);
+		__percpu_counter_add(&ca->cpustat[idx], val, batch);
 		ca = ca->parent;
 	} while (ca);
 	rcu_read_unlock();
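
Editor's note: the batching change in the last two hunks is easy to try outside the kernel. The sketch below models, in plain userspace C, the idea behind __percpu_counter_add(): each CPU accumulates charges in a local delta and folds them into the shared count only when the delta reaches the batch threshold, so a batch scaled by the per-jiffy cputime keeps the fold frequency roughly the same whether cputime_t counts jiffies or nanoseconds. This is a minimal sketch, not the kernel implementation; NR_CPUS, the percpu_counter_batch default of 32, and the 10 ms cputime_one_jiffy are illustrative assumptions, not values taken from this tree.

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 4			/* assumed, for illustration */

struct percpu_counter {
	int64_t count;			/* shared, approximate total */
	int64_t cpu_count[NR_CPUS];	/* per-CPU deltas, folded lazily */
};

/* Fold a per-CPU delta into the shared count once it reaches 'batch'. */
static void counter_add(struct percpu_counter *fbc, int cpu,
			int64_t amount, int32_t batch)
{
	int64_t count = fbc->cpu_count[cpu] + amount;

	if (count >= batch || count <= -batch) {
		fbc->count += count;	/* the kernel takes fbc->lock here */
		fbc->cpu_count[cpu] = 0;
	} else {
		fbc->cpu_count[cpu] = count;
	}
}

int main(void)
{
	struct percpu_counter c = { 0 };
	const long percpu_counter_batch = 32;	 /* assumed default */
	const long cputime_one_jiffy = 10000000; /* 10 ms in ns, assumed */
	long batch;
	int i;

	/* The CPUACCT_BATCH computation from the patch. */
	batch = percpu_counter_batch * cputime_one_jiffy;
	if (batch > INT_MAX)
		batch = INT_MAX;

	/*
	 * With the raw batch of 32, every 10000000-unit charge would
	 * spill into the shared count; with the scaled batch all 100
	 * charges below stay in the per-CPU deltas.
	 */
	for (i = 0; i < 100; i++)
		counter_add(&c, i % NR_CPUS, cputime_one_jiffy,
			    (int32_t)batch);

	printf("shared count after 100 charges: %lld\n", (long long)c.count);
	return 0;
}

Passing the raw percpu_counter_batch instead of the scaled batch makes every charge take the shared-count path, which is the cross-CPU contention the comment in the hunk describes.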