X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=kernel%2Fsched.c;h=3c2a54f70ffed7ca1e0954666e3d333e43d36b95;hb=8213466596bf10b75887754773ee13c10cf86f5c;hp=bae6fcfe6d758539fa5d53b924607308b9f2b638;hpb=fa535a77bd3fa32b9215ba375d6a202fe73e1dd6;p=safe%2Fjmp%2Flinux-2.6 diff --git a/kernel/sched.c b/kernel/sched.c index bae6fcf..3c2a54f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -71,6 +71,7 @@ #include #include #include +#include #include #include @@ -322,6 +323,15 @@ static inline struct task_group *task_group(struct task_struct *p) /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { + /* + * Strictly speaking this rcu_read_lock() is not needed since the + * task_group is tied to the cgroup, which in turn can never go away + * as long as there are tasks attached to it. + * + * However since task_group() uses task_subsys_state() which is an + * rcu_dereference() user, this quiets CONFIG_PROVE_RCU. + */ + rcu_read_lock(); #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; p->se.parent = task_group(p)->se[cpu]; @@ -331,6 +341,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) p->rt.rt_rq = task_group(p)->rt_rq[cpu]; p->rt.parent = task_group(p)->rt_se[cpu]; #endif + rcu_read_unlock(); } #else @@ -602,6 +613,11 @@ static inline int cpu_of(struct rq *rq) #endif } +#define rcu_dereference_check_sched_domain(p) \ + rcu_dereference_check((p), \ + rcu_read_lock_sched_held() || \ + lockdep_is_held(&sched_domains_mutex)) + /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. * See detach_destroy_domains: synchronize_sched for details. @@ -610,7 +626,7 @@ static inline int cpu_of(struct rq *rq) * preempt-disabled sections. */ #define for_each_domain(cpu, __sd) \ - for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) + for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() (&__get_cpu_var(runqueues)) @@ -898,16 +914,33 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ /* + * Check whether the task is waking, we use this to synchronize against + * ttwu() so that task_cpu() reports a stable number. + * + * We need to make an exception for PF_STARTING tasks because the fork + * path might require task_rq_lock() to work, eg. it can call + * set_cpus_allowed_ptr() from the cpuset clone_ns code. + */ +static inline int task_is_waking(struct task_struct *p) +{ + return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING)); +} + +/* * __task_rq_lock - lock the runqueue a given task resides on. * Must be called interrupts disabled. */ static inline struct rq *__task_rq_lock(struct task_struct *p) __acquires(rq->lock) { + struct rq *rq; + for (;;) { - struct rq *rq = task_rq(p); + while (task_is_waking(p)) + cpu_relax(); + rq = task_rq(p); raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p))) + if (likely(rq == task_rq(p) && !task_is_waking(p))) return rq; raw_spin_unlock(&rq->lock); } @@ -924,10 +957,12 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) struct rq *rq; for (;;) { + while (task_is_waking(p)) + cpu_relax(); local_irq_save(*flags); rq = task_rq(p); raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p))) + if (likely(rq == task_rq(p) && !task_is_waking(p))) return rq; raw_spin_unlock_irqrestore(&rq->lock, *flags); } @@ -1462,7 +1497,7 @@ static unsigned long target_load(int cpu, int type) static struct sched_group *group_of(int cpu) { - struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); + struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd); if (!sd) return NULL; @@ -1497,7 +1532,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED -static __read_mostly unsigned long *update_shares_data; +static __read_mostly unsigned long __percpu *update_shares_data; static void __set_se_shares(struct sched_entity *se, unsigned long shares); @@ -2335,7 +2370,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, { int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; - struct rq *rq, *orig_rq; + struct rq *rq; if (!sched_feat(SYNC_WAKEUPS)) wake_flags &= ~WF_SYNC; @@ -2343,7 +2378,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, this_cpu = get_cpu(); smp_wmb(); - rq = orig_rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &flags); update_rq_clock(rq); if (!(p->state & state)) goto out; @@ -2374,14 +2409,27 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, __task_rq_unlock(rq); cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); - if (cpu != orig_cpu) + if (cpu != orig_cpu) { + /* + * Since we migrate the task without holding any rq->lock, + * we need to be careful with task_rq_lock(), since that + * might end up locking an invalid rq. + */ set_task_cpu(p, cpu); + } - rq = __task_rq_lock(p); + rq = cpu_rq(cpu); + raw_spin_lock(&rq->lock); update_rq_clock(rq); + /* + * We migrated the task without holding either rq->lock, however + * since the task is not on the task list itself, nobody else + * will try and migrate the task, hence the rq should match the + * cpu we just moved it to. + */ + WARN_ON(task_cpu(p) != cpu); WARN_ON(p->state != TASK_WAKING); - cpu = task_cpu(p); #ifdef CONFIG_SCHEDSTATS schedstat_inc(rq, ttwu_count); @@ -2629,7 +2677,13 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) set_task_cpu(p, cpu); #endif - rq = task_rq_lock(p, &flags); + /* + * Since the task is not on the rq and we still have TASK_WAKING set + * nobody else will migrate this task. + */ + rq = cpu_rq(cpu); + raw_spin_lock_irqsave(&rq->lock, flags); + BUG_ON(p->state != TASK_WAKING); p->state = TASK_RUNNING; update_rq_clock(rq); @@ -2760,7 +2814,13 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) */ prev_state = prev->state; finish_arch_switch(prev); - perf_event_task_sched_in(current, cpu_of(rq)); +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_disable(); +#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ + perf_event_task_sched_in(current); +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_enable(); +#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ finish_lock_switch(rq, prev); fire_sched_in_preempt_notifiers(current); @@ -3466,7 +3526,7 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); raw_spin_unlock(&rq->lock); - perf_event_task_tick(curr, cpu); + perf_event_task_tick(curr); #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); @@ -3680,7 +3740,7 @@ need_resched_nonpreemptible: if (likely(prev != next)) { sched_info_switch(prev, next); - perf_event_task_sched_out(prev, next, cpu); + perf_event_task_sched_out(prev, next); rq->nr_switches++; rq->curr = next; @@ -3730,7 +3790,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) * the mutex owner just released it and exited. */ if (probe_kernel_address(&owner->cpu, cpu)) - goto out; + return 0; #else cpu = owner->cpu; #endif @@ -3740,14 +3800,14 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) * the cpu field may no longer be valid. */ if (cpu >= nr_cpumask_bits) - goto out; + return 0; /* * We need to validate that we can do a * get_cpu() and that we have the percpu area. */ if (!cpu_online(cpu)) - goto out; + return 0; rq = cpu_rq(cpu); @@ -3766,7 +3826,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) cpu_relax(); } -out: + return 1; } #endif @@ -4211,7 +4271,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) unsigned long flags; int oldprio, on_rq, running; struct rq *rq; - const struct sched_class *prev_class = p->sched_class; + const struct sched_class *prev_class; BUG_ON(prio < 0 || prio > MAX_PRIO); @@ -4219,6 +4279,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) update_rq_clock(rq); oldprio = p->prio; + prev_class = p->sched_class; on_rq = p->se.on_rq; running = task_current(rq, p); if (on_rq) @@ -4303,7 +4364,7 @@ int can_nice(const struct task_struct *p, const int nice) /* convert nice value [19,-20] to rlimit style value [1,40] */ int nice_rlim = 20 - nice; - return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || + return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || capable(CAP_SYS_NICE)); } @@ -4438,7 +4499,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy, { int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; - const struct sched_class *prev_class = p->sched_class; + const struct sched_class *prev_class; struct rq *rq; int reset_on_fork; @@ -4480,7 +4541,7 @@ recheck: if (!lock_task_sighand(p, &flags)) return -ESRCH; - rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; + rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); unlock_task_sighand(p, &flags); /* can't set/change the rt policy */ @@ -4552,6 +4613,7 @@ recheck: p->sched_reset_on_fork = reset_on_fork; oldprio = p->prio; + prev_class = p->sched_class; __setscheduler(rq, p, policy, param->sched_priority); if (running) @@ -4851,7 +4913,9 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, int ret; cpumask_var_t mask; - if (len < cpumask_size()) + if ((len * BITS_PER_BYTE) < nr_cpu_ids) + return -EINVAL; + if (len & (sizeof(unsigned long)-1)) return -EINVAL; if (!alloc_cpumask_var(&mask, GFP_KERNEL)) @@ -4859,10 +4923,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ret = sched_getaffinity(pid, mask); if (ret == 0) { - if (copy_to_user(user_mask_ptr, mask, cpumask_size())) + size_t retlen = min_t(size_t, len, cpumask_size()); + + if (copy_to_user(user_mask_ptr, mask, retlen)) ret = -EFAULT; else - ret = cpumask_size(); + ret = retlen; } free_cpumask_var(mask); @@ -5302,27 +5368,8 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) struct rq *rq; int ret = 0; - /* - * Since we rely on wake-ups to migrate sleeping tasks, don't change - * the ->cpus_allowed mask from under waking tasks, which would be - * possible when we change rq->lock in ttwu(), so synchronize against - * TASK_WAKING to avoid that. - * - * Make an exception for freshly cloned tasks, since cpuset namespaces - * might move the task about, we have to validate the target in - * wake_up_new_task() anyway since the cpu might have gone away. - */ -again: - while (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) - cpu_relax(); - rq = task_rq_lock(p, &flags); - if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) { - task_rq_unlock(rq, &flags); - goto again; - } - if (!cpumask_intersects(new_mask, cpu_active_mask)) { ret = -EINVAL; goto out; @@ -5351,7 +5398,7 @@ again: get_task_struct(mt); task_rq_unlock(rq, &flags); - wake_up_process(rq->migration_thread); + wake_up_process(mt); put_task_struct(mt); wait_for_completion(&req.done); tlb_migrate_finish(p->mm); @@ -7374,11 +7421,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) #ifdef CONFIG_SCHED_MC static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, + struct sysdev_class_attribute *attr, char *page) { return sprintf(page, "%u\n", sched_mc_power_savings); } static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, + struct sysdev_class_attribute *attr, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 0); @@ -7390,11 +7439,13 @@ static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, #ifdef CONFIG_SCHED_SMT static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, + struct sysdev_class_attribute *attr, char *page) { return sprintf(page, "%u\n", sched_smt_power_savings); } static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, + struct sysdev_class_attribute *attr, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 1); @@ -8781,7 +8832,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { struct cpuacct { struct cgroup_subsys_state css; /* cpuusage holds pointer to a u64-type object on every cpu */ - u64 *cpuusage; + u64 __percpu *cpuusage; struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; struct cpuacct *parent; };