#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
+#include <linux/slab.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
+ /*
+ * Strictly speaking this rcu_read_lock() is not needed since the
+ * task_group is tied to the cgroup, which in turn can never go away
+ * as long as there are tasks attached to it.
+ *
+ * However since task_group() uses task_subsys_state() which is an
+ * rcu_dereference() user, this quiets CONFIG_PROVE_RCU.
+ */
+ rcu_read_lock();
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
p->se.parent = task_group(p)->se[cpu];
p->rt.rt_rq = task_group(p)->rt_rq[cpu];
p->rt.parent = task_group(p)->rt_se[cpu];
#endif
+ rcu_read_unlock();
}
#else
#endif
}
+#define rcu_dereference_check_sched_domain(p) \
+ rcu_dereference_check((p), \
+ rcu_read_lock_sched_held() || \
+ lockdep_is_held(&sched_domains_mutex))
+
/*
* The domain tree (rq->sd) is protected by RCU's quiescent state transition.
* See detach_destroy_domains: synchronize_sched for details.
* preempt-disabled sections.
*/
#define for_each_domain(cpu, __sd) \
- for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
+ for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
#define this_rq() (&__get_cpu_var(runqueues))
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
/*
+ * Check whether the task is waking, we use this to synchronize against
+ * ttwu() so that task_cpu() reports a stable number.
+ *
+ * We need to make an exception for PF_STARTING tasks because the fork
+ * path might require task_rq_lock() to work, eg. it can call
+ * set_cpus_allowed_ptr() from the cpuset clone_ns code.
+ */
+static inline int task_is_waking(struct task_struct *p)
+{
+ return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
+}
+
+/*
* __task_rq_lock - lock the runqueue a given task resides on.
* Must be called interrupts disabled.
*/
static inline struct rq *__task_rq_lock(struct task_struct *p)
__acquires(rq->lock)
{
+ struct rq *rq;
+
for (;;) {
- struct rq *rq = task_rq(p);
+ while (task_is_waking(p))
+ cpu_relax();
+ rq = task_rq(p);
raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p)))
+ if (likely(rq == task_rq(p) && !task_is_waking(p)))
return rq;
raw_spin_unlock(&rq->lock);
}
struct rq *rq;
for (;;) {
+ while (task_is_waking(p))
+ cpu_relax();
local_irq_save(*flags);
rq = task_rq(p);
raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p)))
+ if (likely(rq == task_rq(p) && !task_is_waking(p)))
return rq;
raw_spin_unlock_irqrestore(&rq->lock, *flags);
}
static struct sched_group *group_of(int cpu)
{
- struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
+ struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
if (!sd)
return NULL;
#ifdef CONFIG_FAIR_GROUP_SCHED
-static __read_mostly unsigned long *update_shares_data;
+static __read_mostly unsigned long __percpu *update_shares_data;
static void __set_se_shares(struct sched_entity *se, unsigned long shares);
{
int cpu, orig_cpu, this_cpu, success = 0;
unsigned long flags;
- struct rq *rq, *orig_rq;
+ struct rq *rq;
if (!sched_feat(SYNC_WAKEUPS))
wake_flags &= ~WF_SYNC;
this_cpu = get_cpu();
smp_wmb();
- rq = orig_rq = task_rq_lock(p, &flags);
+ rq = task_rq_lock(p, &flags);
update_rq_clock(rq);
if (!(p->state & state))
goto out;
__task_rq_unlock(rq);
cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
- if (cpu != orig_cpu)
+ if (cpu != orig_cpu) {
+ /*
+ * Since we migrate the task without holding any rq->lock,
+ * we need to be careful with task_rq_lock(), since that
+ * might end up locking an invalid rq.
+ */
set_task_cpu(p, cpu);
+ }
- rq = __task_rq_lock(p);
+ rq = cpu_rq(cpu);
+ raw_spin_lock(&rq->lock);
update_rq_clock(rq);
+ /*
+ * We migrated the task without holding either rq->lock, however
+ * since the task is not on the task list itself, nobody else
+ * will try and migrate the task, hence the rq should match the
+ * cpu we just moved it to.
+ */
+ WARN_ON(task_cpu(p) != cpu);
WARN_ON(p->state != TASK_WAKING);
- cpu = task_cpu(p);
#ifdef CONFIG_SCHEDSTATS
schedstat_inc(rq, ttwu_count);
set_task_cpu(p, cpu);
#endif
- rq = task_rq_lock(p, &flags);
+ /*
+ * Since the task is not on the rq and we still have TASK_WAKING set
+ * nobody else will migrate this task.
+ */
+ rq = cpu_rq(cpu);
+ raw_spin_lock_irqsave(&rq->lock, flags);
+
BUG_ON(p->state != TASK_WAKING);
p->state = TASK_RUNNING;
update_rq_clock(rq);
*/
prev_state = prev->state;
finish_arch_switch(prev);
- perf_event_task_sched_in(current, cpu_of(rq));
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ local_irq_disable();
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+ perf_event_task_sched_in(current);
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ local_irq_enable();
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
finish_lock_switch(rq, prev);
fire_sched_in_preempt_notifiers(current);
curr->sched_class->task_tick(rq, curr, 0);
raw_spin_unlock(&rq->lock);
- perf_event_task_tick(curr, cpu);
+ perf_event_task_tick(curr);
#ifdef CONFIG_SMP
rq->idle_at_tick = idle_cpu(cpu);
if (likely(prev != next)) {
sched_info_switch(prev, next);
- perf_event_task_sched_out(prev, next, cpu);
+ perf_event_task_sched_out(prev, next);
rq->nr_switches++;
rq->curr = next;
* the mutex owner just released it and exited.
*/
if (probe_kernel_address(&owner->cpu, cpu))
- goto out;
+ return 0;
#else
cpu = owner->cpu;
#endif
* the cpu field may no longer be valid.
*/
if (cpu >= nr_cpumask_bits)
- goto out;
+ return 0;
/*
* We need to validate that we can do a
* get_cpu() and that we have the percpu area.
*/
if (!cpu_online(cpu))
- goto out;
+ return 0;
rq = cpu_rq(cpu);
cpu_relax();
}
-out:
+
return 1;
}
#endif
unsigned long flags;
int oldprio, on_rq, running;
struct rq *rq;
- const struct sched_class *prev_class = p->sched_class;
+ const struct sched_class *prev_class;
BUG_ON(prio < 0 || prio > MAX_PRIO);
update_rq_clock(rq);
oldprio = p->prio;
+ prev_class = p->sched_class;
on_rq = p->se.on_rq;
running = task_current(rq, p);
if (on_rq)
/* convert nice value [19,-20] to rlimit style value [1,40] */
int nice_rlim = 20 - nice;
- return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
capable(CAP_SYS_NICE));
}
{
int retval, oldprio, oldpolicy = -1, on_rq, running;
unsigned long flags;
- const struct sched_class *prev_class = p->sched_class;
+ const struct sched_class *prev_class;
struct rq *rq;
int reset_on_fork;
if (!lock_task_sighand(p, &flags))
return -ESRCH;
- rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
+ rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
unlock_task_sighand(p, &flags);
/* can't set/change the rt policy */
p->sched_reset_on_fork = reset_on_fork;
oldprio = p->prio;
+ prev_class = p->sched_class;
__setscheduler(rq, p, policy, param->sched_priority);
if (running)
int ret;
cpumask_var_t mask;
- if (len < cpumask_size())
+ if ((len * BITS_PER_BYTE) < nr_cpu_ids)
+ return -EINVAL;
+ if (len & (sizeof(unsigned long)-1))
return -EINVAL;
if (!alloc_cpumask_var(&mask, GFP_KERNEL))
ret = sched_getaffinity(pid, mask);
if (ret == 0) {
- if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
+ size_t retlen = min_t(size_t, len, cpumask_size());
+
+ if (copy_to_user(user_mask_ptr, mask, retlen))
ret = -EFAULT;
else
- ret = cpumask_size();
+ ret = retlen;
}
free_cpumask_var(mask);
struct rq *rq;
int ret = 0;
- /*
- * Since we rely on wake-ups to migrate sleeping tasks, don't change
- * the ->cpus_allowed mask from under waking tasks, which would be
- * possible when we change rq->lock in ttwu(), so synchronize against
- * TASK_WAKING to avoid that.
- *
- * Make an exception for freshly cloned tasks, since cpuset namespaces
- * might move the task about, we have to validate the target in
- * wake_up_new_task() anyway since the cpu might have gone away.
- */
-again:
- while (p->state == TASK_WAKING && !(p->flags & PF_STARTING))
- cpu_relax();
-
rq = task_rq_lock(p, &flags);
- if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) {
- task_rq_unlock(rq, &flags);
- goto again;
- }
-
if (!cpumask_intersects(new_mask, cpu_active_mask)) {
ret = -EINVAL;
goto out;
get_task_struct(mt);
task_rq_unlock(rq, &flags);
- wake_up_process(rq->migration_thread);
+ wake_up_process(mt);
put_task_struct(mt);
wait_for_completion(&req.done);
tlb_migrate_finish(p->mm);
#ifdef CONFIG_SCHED_MC
static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
+ struct sysdev_class_attribute *attr,
char *page)
{
return sprintf(page, "%u\n", sched_mc_power_savings);
}
static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
+ struct sysdev_class_attribute *attr,
const char *buf, size_t count)
{
return sched_power_savings_store(buf, count, 0);
#ifdef CONFIG_SCHED_SMT
static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
+ struct sysdev_class_attribute *attr,
char *page)
{
return sprintf(page, "%u\n", sched_smt_power_savings);
}
static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
+ struct sysdev_class_attribute *attr,
const char *buf, size_t count)
{
return sched_power_savings_store(buf, count, 1);
struct cpuacct {
struct cgroup_subsys_state css;
/* cpuusage holds pointer to a u64-type object on every cpu */
- u64 *cpuusage;
+ u64 __percpu *cpuusage;
struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
struct cpuacct *parent;
};
}
/*
+ * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
+ * in cputime_t units. As a result, cpuacct_update_stats calls
+ * percpu_counter_add with values large enough to always overflow the
+ * per cpu batch limit causing bad SMP scalability.
+ *
+ * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
+ * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
+ * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
+ */
+#ifdef CONFIG_SMP
+#define CPUACCT_BATCH \
+ min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
+#else
+#define CPUACCT_BATCH 0
+#endif
+
+/*
* Charge the system/user time to the task's accounting group.
*/
static void cpuacct_update_stats(struct task_struct *tsk,
enum cpuacct_stat_index idx, cputime_t val)
{
struct cpuacct *ca;
+ int batch = CPUACCT_BATCH;
if (unlikely(!cpuacct_subsys.active))
return;
ca = task_ca(tsk);
do {
- percpu_counter_add(&ca->cpustat[idx], val);
+ __percpu_counter_add(&ca->cpustat[idx], val, batch);
ca = ca->parent;
} while (ca);
rcu_read_unlock();