#define SCALE_PRIO(x, prio) \
max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
-static inline unsigned int task_timeslice(task_t *p)
+static unsigned int task_timeslice(task_t *p)
{
if (p->static_prio < NICE_TO_PRIO(0))
return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
*/
unsigned long nr_running;
#ifdef CONFIG_SMP
- unsigned long cpu_load;
+ unsigned long cpu_load[3];
#endif
unsigned long long nr_switches;
static DEFINE_PER_CPU(struct runqueue, runqueues);
+/*
+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+ * See detach_destroy_domains: synchronize_sched for details.
+ *
+ * The domain tree of any CPU may only be accessed from within
+ * preempt-disabled sections.
+ */
#define for_each_domain(cpu, domain) \
- for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
+for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent)
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
#define this_rq() (&__get_cpu_var(runqueues))
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
-/*
- * Default context-switch locking:
- */
#ifndef prepare_arch_switch
-# define prepare_arch_switch(rq, next) do { } while (0)
-# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock)
-# define task_running(rq, p) ((rq)->curr == (p))
+# define prepare_arch_switch(next) do { } while (0)
+#endif
+#ifndef finish_arch_switch
+# define finish_arch_switch(prev) do { } while (0)
+#endif
+
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
+static inline int task_running(runqueue_t *rq, task_t *p)
+{
+ return rq->curr == p;
+}
+
+static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
+{
+}
+
+static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
+{
+#ifdef CONFIG_DEBUG_SPINLOCK
+ /* this is a valid case when another task releases the spinlock */
+ rq->lock.owner = current;
+#endif
+ spin_unlock_irq(&rq->lock);
+}
+
+#else /* __ARCH_WANT_UNLOCKED_CTXSW */
+static inline int task_running(runqueue_t *rq, task_t *p)
+{
+#ifdef CONFIG_SMP
+ return p->oncpu;
+#else
+ return rq->curr == p;
#endif
+}
+
+static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
+{
+#ifdef CONFIG_SMP
+ /*
+ * We can optimise this out completely for !SMP, because the
+ * SMP rebalancing from interrupt is the only thing that cares
+ * here.
+ */
+ next->oncpu = 1;
+#endif
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ spin_unlock_irq(&rq->lock);
+#else
+ spin_unlock(&rq->lock);
+#endif
+}
+
+static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
+{
+#ifdef CONFIG_SMP
+ /*
+ * After ->oncpu is cleared, the task can be moved to a different CPU.
+ * We must ensure this doesn't happen until the switch is completely
+ * finished.
+ */
+ smp_wmb();
+ prev->oncpu = 0;
+#endif
+#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ local_irq_enable();
+#endif
+}
+#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
/*
* task_rq_lock - lock the runqueue a given task resides on and disable
* bump this up when changing the output format or the meaning of an existing
* format, so that tools can adapt (or abort)
*/
-#define SCHEDSTAT_VERSION 11
+#define SCHEDSTAT_VERSION 12
static int show_schedstat(struct seq_file *seq, void *v)
{
#ifdef CONFIG_SMP
/* domain-specific stats */
+ preempt_disable();
for_each_domain(cpu, sd) {
enum idle_type itype;
char mask_str[NR_CPUS];
sd->lb_nobusyq[itype],
sd->lb_nobusyg[itype]);
}
- seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n",
+ seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
- sd->sbe_pushed, sd->sbe_attempts,
+ sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
+ sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
}
+ preempt_enable();
#endif
}
return 0;
return rq;
}
-#ifdef CONFIG_SCHED_SMT
-static int cpu_and_siblings_are_idle(int cpu)
-{
- int sib;
- for_each_cpu_mask(sib, cpu_sibling_map[cpu]) {
- if (idle_cpu(sib))
- continue;
- return 0;
- }
-
- return 1;
-}
-#else
-#define cpu_and_siblings_are_idle(A) idle_cpu(A)
-#endif
-
#ifdef CONFIG_SCHEDSTATS
/*
* Called when a process is dequeued from the active array and given
rq->nr_running++;
}
-static void recalc_task_prio(task_t *p, unsigned long long now)
+static int recalc_task_prio(task_t *p, unsigned long long now)
{
/* Caller must always ensure 'now >= p->timestamp' */
unsigned long long __sleep_time = now - p->timestamp;
}
}
- p->prio = effective_prio(p);
+ return effective_prio(p);
}
/*
}
#endif
- recalc_task_prio(p, now);
+ p->prio = recalc_task_prio(p, now);
/*
* This checks to make sure it's not an uninterruptible task
}
#ifdef CONFIG_SMP
-enum request_type {
- REQ_MOVE_TASK,
- REQ_SET_DOMAIN,
-};
-
typedef struct {
struct list_head list;
- enum request_type type;
- /* For REQ_MOVE_TASK */
task_t *task;
int dest_cpu;
- /* For REQ_SET_DOMAIN */
- struct sched_domain *sd;
-
struct completion done;
} migration_req_t;
}
init_completion(&req->done);
- req->type = REQ_MOVE_TASK;
req->task = p;
req->dest_cpu = dest_cpu;
list_add(&req->list, &rq->migration_queue);
* smp_call_function() if an IPI is sent by the same process we are
* waiting to become inactive.
*/
-void wait_task_inactive(task_t * p)
+void wait_task_inactive(task_t *p)
{
unsigned long flags;
runqueue_t *rq;
* We want to under-estimate the load of migration sources, to
* balance conservatively.
*/
-static inline unsigned long source_load(int cpu)
+static inline unsigned long source_load(int cpu, int type)
{
runqueue_t *rq = cpu_rq(cpu);
unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+ if (type == 0)
+ return load_now;
- return min(rq->cpu_load, load_now);
+ return min(rq->cpu_load[type-1], load_now);
}
/*
* Return a high guess at the load of a migration-target cpu
*/
-static inline unsigned long target_load(int cpu)
+static inline unsigned long target_load(int cpu, int type)
{
runqueue_t *rq = cpu_rq(cpu);
unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+ if (type == 0)
+ return load_now;
- return max(rq->cpu_load, load_now);
+ return max(rq->cpu_load[type-1], load_now);
}
-#endif
+/*
+ * find_idlest_group finds and returns the least busy CPU group within the
+ * domain.
+ */
+static struct sched_group *
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+{
+ struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
+ unsigned long min_load = ULONG_MAX, this_load = 0;
+ int load_idx = sd->forkexec_idx;
+ int imbalance = 100 + (sd->imbalance_pct-100)/2;
+
+ do {
+ unsigned long load, avg_load;
+ int local_group;
+ int i;
+
+ /* Skip over this group if it has no CPUs allowed */
+ if (!cpus_intersects(group->cpumask, p->cpus_allowed))
+ goto nextgroup;
+
+ local_group = cpu_isset(this_cpu, group->cpumask);
+
+ /* Tally up the load of all CPUs in the group */
+ avg_load = 0;
+
+ for_each_cpu_mask(i, group->cpumask) {
+ /* Bias balancing toward cpus of our domain */
+ if (local_group)
+ load = source_load(i, load_idx);
+ else
+ load = target_load(i, load_idx);
+
+ avg_load += load;
+ }
+
+ /* Adjust by relative CPU power of the group */
+ avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+
+ if (local_group) {
+ this_load = avg_load;
+ this = group;
+ } else if (avg_load < min_load) {
+ min_load = avg_load;
+ idlest = group;
+ }
+nextgroup:
+ group = group->next;
+ } while (group != sd->groups);
+
+ if (!idlest || 100*this_load < imbalance*min_load)
+ return NULL;
+ return idlest;
+}
+
+/*
+ * find_idlest_queue - find the idlest runqueue among the cpus in group.
+ */
+static int
+find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+{
+ cpumask_t tmp;
+ unsigned long load, min_load = ULONG_MAX;
+ int idlest = -1;
+ int i;
+
+ /* Traverse only the allowed CPUs */
+ cpus_and(tmp, group->cpumask, p->cpus_allowed);
+
+ for_each_cpu_mask(i, tmp) {
+ load = source_load(i, 0);
+
+ if (load < min_load || (load == min_load && i == this_cpu)) {
+ min_load = load;
+ idlest = i;
+ }
+ }
+
+ return idlest;
+}
+
+/*
+ * sched_balance_self: balance the current task (running on cpu) in domains
+ * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
+ * SD_BALANCE_EXEC.
+ *
+ * Balance, ie. select the least loaded group.
+ *
+ * Returns the target CPU number, or the same CPU if no balancing is needed.
+ *
+ * preempt must be disabled.
+ */
+static int sched_balance_self(int cpu, int flag)
+{
+ struct task_struct *t = current;
+ struct sched_domain *tmp, *sd = NULL;
+
+ for_each_domain(cpu, tmp)
+ if (tmp->flags & flag)
+ sd = tmp;
+
+ while (sd) {
+ cpumask_t span;
+ struct sched_group *group;
+ int new_cpu;
+ int weight;
+
+ span = sd->span;
+ group = find_idlest_group(sd, t, cpu);
+ if (!group)
+ goto nextlevel;
+
+ new_cpu = find_idlest_cpu(group, t, cpu);
+ if (new_cpu == -1 || new_cpu == cpu)
+ goto nextlevel;
+
+ /* Now try balancing at a lower domain level */
+ cpu = new_cpu;
+nextlevel:
+ sd = NULL;
+ weight = cpus_weight(span);
+ for_each_domain(cpu, tmp) {
+ if (weight <= cpus_weight(tmp->span))
+ break;
+ if (tmp->flags & flag)
+ sd = tmp;
+ }
+ /* while loop will break here if sd == NULL */
+ }
+
+ return cpu;
+}
+
+#endif /* CONFIG_SMP */
/*
* wake_idle() will wake a task on an idle cpu if task->cpu is
for_each_domain(cpu, sd) {
if (sd->flags & SD_WAKE_IDLE) {
- cpus_and(tmp, sd->span, cpu_online_map);
- cpus_and(tmp, tmp, p->cpus_allowed);
+ cpus_and(tmp, sd->span, p->cpus_allowed);
for_each_cpu_mask(i, tmp) {
if (idle_cpu(i))
return i;
}
}
- else break;
+ else
+ break;
}
return cpu;
}
*
* returns failure only if the task is already active.
*/
-static int try_to_wake_up(task_t * p, unsigned int state, int sync)
+static int try_to_wake_up(task_t *p, unsigned int state, int sync)
{
int cpu, this_cpu, success = 0;
unsigned long flags;
runqueue_t *rq;
#ifdef CONFIG_SMP
unsigned long load, this_load;
- struct sched_domain *sd;
+ struct sched_domain *sd, *this_sd = NULL;
int new_cpu;
#endif
if (unlikely(task_running(rq, p)))
goto out_activate;
-#ifdef CONFIG_SCHEDSTATS
+ new_cpu = cpu;
+
schedstat_inc(rq, ttwu_cnt);
if (cpu == this_cpu) {
schedstat_inc(rq, ttwu_local);
- } else {
- for_each_domain(this_cpu, sd) {
- if (cpu_isset(cpu, sd->span)) {
- schedstat_inc(sd, ttwu_wake_remote);
- break;
- }
+ goto out_set_cpu;
+ }
+
+ for_each_domain(this_cpu, sd) {
+ if (cpu_isset(cpu, sd->span)) {
+ schedstat_inc(sd, ttwu_wake_remote);
+ this_sd = sd;
+ break;
}
}
-#endif
- new_cpu = cpu;
- if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+ if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
goto out_set_cpu;
- load = source_load(cpu);
- this_load = target_load(this_cpu);
-
/*
- * If sync wakeup then subtract the (maximum possible) effect of
- * the currently running task from the load of the current CPU:
+ * Check for affine wakeup and passive balancing possibilities.
*/
- if (sync)
- this_load -= SCHED_LOAD_SCALE;
+ if (this_sd) {
+ int idx = this_sd->wake_idx;
+ unsigned int imbalance;
- /* Don't pull the task off an idle CPU to a busy one */
- if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
- goto out_set_cpu;
+ imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
- new_cpu = this_cpu; /* Wake to this CPU if we can */
+ load = source_load(cpu, idx);
+ this_load = target_load(this_cpu, idx);
- /*
- * Scan domains for affine wakeup and passive balancing
- * possibilities.
- */
- for_each_domain(this_cpu, sd) {
- unsigned int imbalance;
- /*
- * Start passive balancing when half the imbalance_pct
- * limit is reached.
- */
- imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
+ new_cpu = this_cpu; /* Wake to this CPU if we can */
- if ((sd->flags & SD_WAKE_AFFINE) &&
- !task_hot(p, rq->timestamp_last_tick, sd)) {
+ if (this_sd->flags & SD_WAKE_AFFINE) {
+ unsigned long tl = this_load;
/*
- * This domain has SD_WAKE_AFFINE and p is cache cold
- * in this domain.
+ * If sync wakeup then subtract the (maximum possible)
+ * effect of the currently running task from the load
+ * of the current CPU:
*/
- if (cpu_isset(cpu, sd->span)) {
- schedstat_inc(sd, ttwu_move_affine);
+ if (sync)
+ tl -= SCHED_LOAD_SCALE;
+
+ if ((tl <= load &&
+ tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
+ 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) {
+ /*
+ * This domain has SD_WAKE_AFFINE and
+ * p is cache cold in this domain, and
+ * there is no bad imbalance.
+ */
+ schedstat_inc(this_sd, ttwu_move_affine);
goto out_set_cpu;
}
- } else if ((sd->flags & SD_WAKE_BALANCE) &&
- imbalance*this_load <= 100*load) {
- /*
- * This domain has SD_WAKE_BALANCE and there is
- * an imbalance.
- */
- if (cpu_isset(cpu, sd->span)) {
- schedstat_inc(sd, ttwu_move_balance);
+ }
+
+ /*
+ * Start passive balancing when half the imbalance_pct
+ * limit is reached.
+ */
+ if (this_sd->flags & SD_WAKE_BALANCE) {
+ if (imbalance*this_load <= 100*load) {
+ schedstat_inc(this_sd, ttwu_move_balance);
goto out_set_cpu;
}
}
}
/*
+ * Tasks that have marked their sleep as noninteractive get
+ * woken up without updating their sleep average. (i.e. their
+ * sleep is handled in a priority-neutral manner, no priority
+ * boost and no penalty.)
+ */
+ if (old_state & TASK_NONINTERACTIVE)
+ __activate_task(p, rq);
+ else
+ activate_task(p, rq, cpu == this_cpu);
+ /*
* Sync wakeups (i.e. those types of wakeups where the waker
* has indicated that it will leave the CPU in short order)
* don't trigger a preemption, if the woken up task will run on
* the waker guarantees that the freshly woken up task is going
* to be considered on this CPU.)
*/
- activate_task(p, rq, cpu == this_cpu);
if (!sync || cpu != this_cpu) {
if (TASK_PREEMPTS_CURR(p, rq))
resched_task(rq->curr);
return success;
}
-int fastcall wake_up_process(task_t * p)
+int fastcall wake_up_process(task_t *p)
{
return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
return try_to_wake_up(p, state, 0);
}
-#ifdef CONFIG_SMP
-static int find_idlest_cpu(struct task_struct *p, int this_cpu,
- struct sched_domain *sd);
-#endif
-
/*
* Perform scheduler related setup for a newly forked process p.
* p is forked by current.
*/
-void fastcall sched_fork(task_t *p)
+void fastcall sched_fork(task_t *p, int clone_flags)
{
+ int cpu = get_cpu();
+
+#ifdef CONFIG_SMP
+ cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
+#endif
+ set_task_cpu(p, cpu);
+
/*
* We mark the process as running here, but have not actually
* inserted it onto the runqueue yet. This guarantees that
p->state = TASK_RUNNING;
INIT_LIST_HEAD(&p->run_list);
p->array = NULL;
- spin_lock_init(&p->switch_lock);
#ifdef CONFIG_SCHEDSTATS
memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+ p->oncpu = 0;
+#endif
#ifdef CONFIG_PREEMPT
- /*
- * During context-switch we hold precisely one spinlock, which
- * schedule_tail drops. (in the common case it's this_rq()->lock,
- * but it also can be p->switch_lock.) So we compensate with a count
- * of 1. Also, we want to start with kernel preemption disabled.
- */
+ /* Want to start with kernel preemption disabled. */
p->thread_info->preempt_count = 1;
#endif
/*
* runqueue lock is not a problem.
*/
current->time_slice = 1;
- preempt_disable();
scheduler_tick();
- local_irq_enable();
- preempt_enable();
- } else
- local_irq_enable();
+ }
+ local_irq_enable();
+ put_cpu();
}
/*
* that must be done for every newly created context, then puts the task
* on the runqueue and wakes it.
*/
-void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
+void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
{
unsigned long flags;
int this_cpu, cpu;
runqueue_t *rq, *this_rq;
rq = task_rq_lock(p, &flags);
- cpu = task_cpu(p);
- this_cpu = smp_processor_id();
-
BUG_ON(p->state != TASK_RUNNING);
+ this_cpu = smp_processor_id();
+ cpu = task_cpu(p);
/*
* We decrease the sleep average of forking parents
* artificially, because any timeslice recovered here
* was given away by the parent in the first place.)
*/
-void fastcall sched_exit(task_t * p)
+void fastcall sched_exit(task_t *p)
{
unsigned long flags;
runqueue_t *rq;
* the sleep_avg of the parent as well.
*/
rq = task_rq_lock(p->parent, &flags);
- if (p->first_time_slice) {
+ if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
p->parent->time_slice += p->time_slice;
if (unlikely(p->parent->time_slice > task_timeslice(p)))
p->parent->time_slice = task_timeslice(p);
}
/**
+ * prepare_task_switch - prepare to switch tasks
+ * @rq: the runqueue preparing to switch
+ * @next: the task we are going to switch to.
+ *
+ * This is called with the rq lock held and interrupts off. It must
+ * be paired with a subsequent finish_task_switch after the context
+ * switch.
+ *
+ * prepare_task_switch sets up locking and calls architecture specific
+ * hooks.
+ */
+static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
+{
+ prepare_lock_switch(rq, next);
+ prepare_arch_switch(next);
+}
+
+/**
* finish_task_switch - clean up after a task-switch
+ * @rq: runqueue associated with task-switch
* @prev: the thread we just switched away from.
*
- * We enter this with the runqueue still locked, and finish_arch_switch()
- * will unlock it along with doing any other architecture-specific cleanup
- * actions.
+ * finish_task_switch must be called after the context switch, paired
+ * with a prepare_task_switch call before the context switch.
+ * finish_task_switch will reconcile locking set up by prepare_task_switch,
+ * and do any other architecture-specific cleanup actions.
*
* Note that we may have delayed dropping an mm in context_switch(). If
* so, we finish that here outside of the runqueue lock. (Doing it
* with the lock held can cause deadlocks; see schedule() for
* details.)
*/
-static inline void finish_task_switch(task_t *prev)
+static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
__releases(rq->lock)
{
- runqueue_t *rq = this_rq();
struct mm_struct *mm = rq->prev_mm;
unsigned long prev_task_flags;
* Manfred Spraul <manfred@colorfullife.com>
*/
prev_task_flags = prev->flags;
- finish_arch_switch(rq, prev);
+ finish_arch_switch(prev);
+ finish_lock_switch(rq, prev);
if (mm)
mmdrop(mm);
if (unlikely(prev_task_flags & PF_DEAD))
asmlinkage void schedule_tail(task_t *prev)
__releases(rq->lock)
{
- finish_task_switch(prev);
-
+ runqueue_t *rq = this_rq();
+ finish_task_switch(rq, prev);
+#ifdef __ARCH_WANT_UNLOCKED_CTXSW
+ /* In this case, finish_task_switch does not reenable preemption */
+ preempt_enable();
+#endif
if (current->set_child_tid)
put_user(current->pid, current->set_child_tid);
}
}
/*
- * find_idlest_cpu - find the least busy runqueue.
- */
-static int find_idlest_cpu(struct task_struct *p, int this_cpu,
- struct sched_domain *sd)
-{
- unsigned long load, min_load, this_load;
- int i, min_cpu;
- cpumask_t mask;
-
- min_cpu = UINT_MAX;
- min_load = ULONG_MAX;
-
- cpus_and(mask, sd->span, p->cpus_allowed);
-
- for_each_cpu_mask(i, mask) {
- load = target_load(i);
-
- if (load < min_load) {
- min_cpu = i;
- min_load = load;
-
- /* break out early on an idle CPU: */
- if (!min_load)
- break;
- }
- }
-
- /* add +1 to account for the new task */
- this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
-
- /*
- * Would with the addition of the new task to the
- * current CPU there be an imbalance between this
- * CPU and the idlest CPU?
- *
- * Use half of the balancing threshold - new-context is
- * a good opportunity to balance.
- */
- if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
- return min_cpu;
-
- return this_cpu;
-}
-
-/*
* If dest_cpu is allowed for this process, migrate the task to it.
* This is accomplished by forcing the cpu_allowed mask to only
* allow dest_cpu, which will force the cpu onto dest_cpu. Then
}
/*
- * sched_exec(): find the highest-level, exec-balance-capable
- * domain and try to migrate the task to the least loaded CPU.
- *
- * execve() is a valuable balancing opportunity, because at this point
- * the task has the smallest effective memory and cache footprint.
+ * sched_exec - execve() is a valuable balancing opportunity, because at
+ * this point the task has the smallest effective memory and cache footprint.
*/
void sched_exec(void)
{
- struct sched_domain *tmp, *sd = NULL;
int new_cpu, this_cpu = get_cpu();
-
- /* Prefer the current CPU if there's only this task running */
- if (this_rq()->nr_running <= 1)
- goto out;
-
- for_each_domain(this_cpu, tmp)
- if (tmp->flags & SD_BALANCE_EXEC)
- sd = tmp;
-
- if (sd) {
- schedstat_inc(sd, sbe_attempts);
- new_cpu = find_idlest_cpu(current, this_cpu, sd);
- if (new_cpu != this_cpu) {
- schedstat_inc(sd, sbe_pushed);
- put_cpu();
- sched_migrate_task(current, new_cpu);
- return;
- }
- }
-out:
+ new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
put_cpu();
+ if (new_cpu != this_cpu)
+ sched_migrate_task(current, new_cpu);
}
/*
*/
static inline
int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
- struct sched_domain *sd, enum idle_type idle)
+ struct sched_domain *sd, enum idle_type idle,
+ int *all_pinned)
{
/*
* We do not migrate tasks that are:
* 2) cannot be migrated to this CPU due to cpus_allowed, or
* 3) are cache-hot on their current CPU.
*/
- if (task_running(rq, p))
- return 0;
if (!cpu_isset(this_cpu, p->cpus_allowed))
return 0;
+ *all_pinned = 0;
+
+ if (task_running(rq, p))
+ return 0;
/*
* Aggressive migration if:
- * 1) the [whole] cpu is idle, or
+ * 1) task is cache cold, or
* 2) too many balance attempts have failed.
*/
- if (cpu_and_siblings_are_idle(this_cpu) || \
- sd->nr_balance_failed > sd->cache_nice_tries)
+ if (sd->nr_balance_failed > sd->cache_nice_tries)
return 1;
if (task_hot(p, rq->timestamp_last_tick, sd))
- return 0;
+ return 0;
return 1;
}
*/
static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
unsigned long max_nr_move, struct sched_domain *sd,
- enum idle_type idle)
+ enum idle_type idle, int *all_pinned)
{
prio_array_t *array, *dst_array;
struct list_head *head, *curr;
- int idx, pulled = 0;
+ int idx, pulled = 0, pinned = 0;
task_t *tmp;
- if (max_nr_move <= 0 || busiest->nr_running <= 1)
+ if (max_nr_move == 0)
goto out;
+ pinned = 1;
+
/*
* We first consider expired tasks. Those will likely not be
* executed in the near future, and they are most likely to
curr = curr->prev;
- if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
+ if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
if (curr != head)
goto skip_queue;
idx++;
* inside pull_task().
*/
schedstat_add(sd, lb_gained[idle], pulled);
+
+ if (all_pinned)
+ *all_pinned = pinned;
return pulled;
}
*/
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
- unsigned long *imbalance, enum idle_type idle)
+ unsigned long *imbalance, enum idle_type idle, int *sd_idle)
{
struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+ unsigned long max_pull;
+ int load_idx;
max_load = this_load = total_load = total_pwr = 0;
+ if (idle == NOT_IDLE)
+ load_idx = sd->busy_idx;
+ else if (idle == NEWLY_IDLE)
+ load_idx = sd->newidle_idx;
+ else
+ load_idx = sd->idle_idx;
do {
unsigned long load;
avg_load = 0;
for_each_cpu_mask(i, group->cpumask) {
+ if (*sd_idle && !idle_cpu(i))
+ *sd_idle = 0;
+
/* Bias balancing toward cpus of our domain */
if (local_group)
- load = target_load(i);
+ load = target_load(i, load_idx);
else
- load = source_load(i);
+ load = source_load(i, load_idx);
avg_load += load;
}
if (local_group) {
this_load = avg_load;
this = group;
- goto nextgroup;
} else if (avg_load > max_load) {
max_load = avg_load;
busiest = group;
}
-nextgroup:
group = group->next;
} while (group != sd->groups);
- if (!busiest || this_load >= max_load)
+ if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE)
goto out_balanced;
avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
* by pulling tasks to us. Be careful of negative numbers as they'll
* appear as very large values with unsigned longs.
*/
+
+ /* Don't want to pull so many tasks that a group would go idle */
+ max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE);
+
/* How much load to actually move to equalise the imbalance */
- *imbalance = min((max_load - avg_load) * busiest->cpu_power,
+ *imbalance = min(max_pull * busiest->cpu_power,
(avg_load - this_load) * this->cpu_power)
/ SCHED_LOAD_SCALE;
/* Get rid of the scaling factor, rounding down as we divide */
*imbalance = *imbalance / SCHED_LOAD_SCALE;
-
return busiest;
out_balanced:
- if (busiest && (idle == NEWLY_IDLE ||
- (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) {
- *imbalance = 1;
- return busiest;
- }
*imbalance = 0;
return NULL;
int i;
for_each_cpu_mask(i, group->cpumask) {
- load = source_load(i);
+ load = source_load(i, 0);
if (load > max_load) {
max_load = load;
}
/*
+ * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
+ * so long as it is large enough.
+ */
+#define MAX_PINNED_INTERVAL 512
+
+/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*
struct sched_group *group;
runqueue_t *busiest;
unsigned long imbalance;
- int nr_moved;
+ int nr_moved, all_pinned = 0;
+ int active_balance = 0;
+ int sd_idle = 0;
+
+ if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER)
+ sd_idle = 1;
- spin_lock(&this_rq->lock);
schedstat_inc(sd, lb_cnt[idle]);
- group = find_busiest_group(sd, this_cpu, &imbalance, idle);
+ group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle);
if (!group) {
schedstat_inc(sd, lb_nobusyg[idle]);
goto out_balanced;
goto out_balanced;
}
- /*
- * This should be "impossible", but since load
- * balancing is inherently racy and statistical,
- * it could happen in theory.
- */
- if (unlikely(busiest == this_rq)) {
- WARN_ON(1);
- goto out_balanced;
- }
+ BUG_ON(busiest == this_rq);
schedstat_add(sd, lb_imbalance[idle], imbalance);
* still unbalanced. nr_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
- double_lock_balance(this_rq, busiest);
+ double_rq_lock(this_rq, busiest);
nr_moved = move_tasks(this_rq, this_cpu, busiest,
- imbalance, sd, idle);
- spin_unlock(&busiest->lock);
+ imbalance, sd, idle, &all_pinned);
+ double_rq_unlock(this_rq, busiest);
+
+ /* All tasks on this runqueue were pinned by CPU affinity */
+ if (unlikely(all_pinned))
+ goto out_balanced;
}
- spin_unlock(&this_rq->lock);
if (!nr_moved) {
schedstat_inc(sd, lb_failed[idle]);
sd->nr_balance_failed++;
if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
- int wake = 0;
spin_lock(&busiest->lock);
+
+ /* don't kick the migration_thread, if the curr
+ * task on busiest cpu can't be moved to this_cpu
+ */
+ if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+ spin_unlock(&busiest->lock);
+ all_pinned = 1;
+ goto out_one_pinned;
+ }
+
if (!busiest->active_balance) {
busiest->active_balance = 1;
busiest->push_cpu = this_cpu;
- wake = 1;
+ active_balance = 1;
}
spin_unlock(&busiest->lock);
- if (wake)
+ if (active_balance)
wake_up_process(busiest->migration_thread);
/*
* We've kicked active balancing, reset the failure
* counter.
*/
- sd->nr_balance_failed = sd->cache_nice_tries;
+ sd->nr_balance_failed = sd->cache_nice_tries+1;
}
-
- /*
- * We were unbalanced, but unsuccessful in move_tasks(),
- * so bump the balance_interval to lessen the lock contention.
- */
- if (sd->balance_interval < sd->max_interval)
- sd->balance_interval++;
- } else {
+ } else
sd->nr_balance_failed = 0;
+ if (likely(!active_balance)) {
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
+ } else {
+ /*
+ * If we've begun active balancing, start to back off. This
+ * case may not be covered by the all_pinned logic if there
+ * is only 1 task on the busy runqueue (because we don't call
+ * move_tasks).
+ */
+ if (sd->balance_interval < sd->max_interval)
+ sd->balance_interval *= 2;
}
+ if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+ return -1;
return nr_moved;
out_balanced:
- spin_unlock(&this_rq->lock);
-
schedstat_inc(sd, lb_balanced[idle]);
+ sd->nr_balance_failed = 0;
+
+out_one_pinned:
/* tune up the balancing interval */
- if (sd->balance_interval < sd->max_interval)
+ if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
+ (sd->balance_interval < sd->max_interval))
sd->balance_interval *= 2;
+ if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+ return -1;
return 0;
}
runqueue_t *busiest = NULL;
unsigned long imbalance;
int nr_moved = 0;
+ int sd_idle = 0;
+
+ if (sd->flags & SD_SHARE_CPUPOWER)
+ sd_idle = 1;
schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
- group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
+ group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle);
if (!group) {
- schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
- goto out;
+ goto out_balanced;
}
busiest = find_busiest_queue(group);
- if (!busiest || busiest == this_rq) {
- schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+ if (!busiest) {
schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
- goto out;
+ goto out_balanced;
}
- /* Attempt to move tasks */
- double_lock_balance(this_rq, busiest);
+ BUG_ON(busiest == this_rq);
schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
- nr_moved = move_tasks(this_rq, this_cpu, busiest,
- imbalance, sd, NEWLY_IDLE);
- if (!nr_moved)
- schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
- spin_unlock(&busiest->lock);
+ nr_moved = 0;
+ if (busiest->nr_running > 1) {
+ /* Attempt to move tasks */
+ double_lock_balance(this_rq, busiest);
+ nr_moved = move_tasks(this_rq, this_cpu, busiest,
+ imbalance, sd, NEWLY_IDLE, NULL);
+ spin_unlock(&busiest->lock);
+ }
+
+ if (!nr_moved) {
+ schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
+ if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+ return -1;
+ } else
+ sd->nr_balance_failed = 0;
-out:
return nr_moved;
+
+out_balanced:
+ schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+ if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+ return -1;
+ sd->nr_balance_failed = 0;
+ return 0;
}
/*
static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
{
struct sched_domain *sd;
- struct sched_group *cpu_group;
runqueue_t *target_rq;
- cpumask_t visited_cpus;
- int cpu;
+ int target_cpu = busiest_rq->push_cpu;
+
+ if (busiest_rq->nr_running <= 1)
+ /* no task to move */
+ return;
+
+ target_rq = cpu_rq(target_cpu);
/*
- * Search for suitable CPUs to push tasks to in successively higher
- * domains with SD_LOAD_BALANCE set.
+ * This condition is "impossible", if it occurs
+ * we need to fix it. Originally reported by
+ * Bjorn Helgaas on a 128-cpu setup.
*/
- visited_cpus = CPU_MASK_NONE;
- for_each_domain(busiest_cpu, sd) {
- if (!(sd->flags & SD_LOAD_BALANCE))
- /* no more domains to search */
- break;
+ BUG_ON(busiest_rq == target_rq);
- schedstat_inc(sd, alb_cnt);
+ /* move a task from busiest_rq to target_rq */
+ double_lock_balance(busiest_rq, target_rq);
- cpu_group = sd->groups;
- do {
- for_each_cpu_mask(cpu, cpu_group->cpumask) {
- if (busiest_rq->nr_running <= 1)
- /* no more tasks left to move */
- return;
- if (cpu_isset(cpu, visited_cpus))
- continue;
- cpu_set(cpu, visited_cpus);
- if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu)
- continue;
-
- target_rq = cpu_rq(cpu);
- /*
- * This condition is "impossible", if it occurs
- * we need to fix it. Originally reported by
- * Bjorn Helgaas on a 128-cpu setup.
- */
- BUG_ON(busiest_rq == target_rq);
-
- /* move a task from busiest_rq to target_rq */
- double_lock_balance(busiest_rq, target_rq);
- if (move_tasks(target_rq, cpu, busiest_rq,
- 1, sd, SCHED_IDLE)) {
- schedstat_inc(sd, alb_pushed);
- } else {
- schedstat_inc(sd, alb_failed);
- }
- spin_unlock(&target_rq->lock);
- }
- cpu_group = cpu_group->next;
- } while (cpu_group != sd->groups);
- }
+ /* Search for an sd spanning us and the target CPU. */
+ for_each_domain(target_cpu, sd)
+ if ((sd->flags & SD_LOAD_BALANCE) &&
+ cpu_isset(busiest_cpu, sd->span))
+ break;
+
+ if (unlikely(sd == NULL))
+ goto out;
+
+ schedstat_inc(sd, alb_cnt);
+
+ if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
+ schedstat_inc(sd, alb_pushed);
+ else
+ schedstat_inc(sd, alb_failed);
+out:
+ spin_unlock(&target_rq->lock);
}
/*
unsigned long old_load, this_load;
unsigned long j = jiffies + CPU_OFFSET(this_cpu);
struct sched_domain *sd;
+ int i;
- /* Update our load */
- old_load = this_rq->cpu_load;
this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
- /*
- * Round up the averaging division if load is increasing. This
- * prevents us from getting stuck on 9 if the load is 10, for
- * example.
- */
- if (this_load > old_load)
- old_load++;
- this_rq->cpu_load = (old_load + this_load) / 2;
+ /* Update our load */
+ for (i = 0; i < 3; i++) {
+ unsigned long new_load = this_load;
+ int scale = 1 << i;
+ old_load = this_rq->cpu_load[i];
+ /*
+ * Round up the averaging division if load is increasing. This
+ * prevents us from getting stuck on 9 if the load is 10, for
+ * example.
+ */
+ if (new_load > old_load)
+ new_load += scale-1;
+ this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
+ }
for_each_domain(this_cpu, sd) {
unsigned long interval;
if (j - sd->last_balance >= interval) {
if (load_balance(this_cpu, this_rq, sd, idle)) {
- /* We've pulled tasks over so no longer idle */
+ /*
+ * We've pulled tasks over so either we're no
+ * longer idle, or one of our SMT siblings is
+ * not idle.
+ */
idle = NOT_IDLE;
}
sd->last_balance += interval;
cpustat->idle = cputime64_add(cpustat->idle, tmp);
/* Account for system time used */
acct_update_integrals(p);
- /* Update rss highwater mark */
- update_mem_hiwater(p);
}
/*
}
#ifdef CONFIG_SCHED_SMT
+static inline void wakeup_busy_runqueue(runqueue_t *rq)
+{
+ /* If an SMT runqueue is sleeping due to priority reasons wake it up */
+ if (rq->curr == rq->idle && rq->nr_running)
+ resched_task(rq->idle);
+}
+
static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
{
- struct sched_domain *sd = this_rq->sd;
+ struct sched_domain *tmp, *sd = NULL;
cpumask_t sibling_map;
int i;
- if (!(sd->flags & SD_SHARE_CPUPOWER))
+ for_each_domain(this_cpu, tmp)
+ if (tmp->flags & SD_SHARE_CPUPOWER)
+ sd = tmp;
+
+ if (!sd)
return;
/*
for_each_cpu_mask(i, sibling_map) {
runqueue_t *smt_rq = cpu_rq(i);
- /*
- * If an SMT sibling task is sleeping due to priority
- * reasons wake it up now.
- */
- if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
- resched_task(smt_rq->idle);
+ wakeup_busy_runqueue(smt_rq);
}
for_each_cpu_mask(i, sibling_map)
*/
}
+/*
+ * number of 'lost' timeslices this task wont be able to fully
+ * utilize, if another task runs on a sibling. This models the
+ * slowdown effect of other tasks running on siblings:
+ */
+static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
+{
+ return p->time_slice * (100 - sd->per_cpu_gain) / 100;
+}
+
static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
{
- struct sched_domain *sd = this_rq->sd;
+ struct sched_domain *tmp, *sd = NULL;
cpumask_t sibling_map;
prio_array_t *array;
int ret = 0, i;
task_t *p;
- if (!(sd->flags & SD_SHARE_CPUPOWER))
+ for_each_domain(this_cpu, tmp)
+ if (tmp->flags & SD_SHARE_CPUPOWER)
+ sd = tmp;
+
+ if (!sd)
return 0;
/*
runqueue_t *smt_rq = cpu_rq(i);
task_t *smt_curr = smt_rq->curr;
+ /* Kernel threads do not participate in dependent sleeping */
+ if (!p->mm || !smt_curr->mm || rt_task(p))
+ goto check_smt_task;
+
/*
* If a user task with lower static priority than the
* running task on the SMT sibling is trying to schedule,
* task from using an unfair proportion of the
* physical cpu's resources. -ck
*/
- if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
- task_timeslice(p) || rt_task(smt_curr)) &&
- p->mm && smt_curr->mm && !rt_task(p))
- ret = 1;
+ if (rt_task(smt_curr)) {
+ /*
+ * With real time tasks we run non-rt tasks only
+ * per_cpu_gain% of the time.
+ */
+ if ((jiffies % DEF_TIMESLICE) >
+ (sd->per_cpu_gain * DEF_TIMESLICE / 100))
+ ret = 1;
+ } else
+ if (smt_curr->static_prio < p->static_prio &&
+ !TASK_PREEMPTS_CURR(p, smt_rq) &&
+ smt_slice(smt_curr, sd) > task_timeslice(p))
+ ret = 1;
+
+check_smt_task:
+ if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
+ rt_task(smt_curr))
+ continue;
+ if (!p->mm) {
+ wakeup_busy_runqueue(smt_rq);
+ continue;
+ }
/*
- * Reschedule a lower priority task on the SMT sibling,
- * or wake it up if it has been put to sleep for priority
- * reasons.
+ * Reschedule a lower priority task on the SMT sibling for
+ * it to be put to sleep, or wake it up if it has been put to
+ * sleep for priority reasons to see if it should run now.
*/
- if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) >
- task_timeslice(smt_curr) || rt_task(p)) &&
- smt_curr->mm && p->mm && !rt_task(smt_curr)) ||
- (smt_curr == smt_rq->idle && smt_rq->nr_running))
- resched_task(smt_curr);
+ if (rt_task(p)) {
+ if ((jiffies % DEF_TIMESLICE) >
+ (sd->per_cpu_gain * DEF_TIMESLICE / 100))
+ resched_task(smt_curr);
+ } else {
+ if (TASK_PREEMPTS_CURR(p, smt_rq) &&
+ smt_slice(p, sd) > task_timeslice(smt_curr))
+ resched_task(smt_curr);
+ else
+ wakeup_busy_runqueue(smt_rq);
+ }
}
out_unlock:
for_each_cpu_mask(i, sibling_map)
struct list_head *queue;
unsigned long long now;
unsigned long run_time;
- int cpu, idx;
+ int cpu, idx, new_prio;
/*
* Test if we are atomic. Since do_exit() needs to call into
delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
array = next->array;
- dequeue_task(next, array);
- recalc_task_prio(next, next->timestamp + delta);
- enqueue_task(next, array);
+ new_prio = recalc_task_prio(next, next->timestamp + delta);
+
+ if (unlikely(next->prio != new_prio)) {
+ dequeue_task(next, array);
+ next->prio = new_prio;
+ enqueue_task(next, array);
+ } else
+ requeue_task(next, array);
}
next->activated = 0;
switch_tasks:
if (next == rq->idle)
schedstat_inc(rq, sched_goidle);
prefetch(next);
+ prefetch_stack(next);
clear_tsk_need_resched(prev);
rcu_qsctr_inc(task_cpu(prev));
rq->curr = next;
++*switch_count;
- prepare_arch_switch(rq, next);
+ prepare_task_switch(rq, next);
prev = context_switch(rq, prev, next);
barrier();
-
- finish_task_switch(prev);
+ /*
+ * this_rq must be evaluated again because prev may have moved
+ * CPUs since it called schedule(), thus the 'rq' on its stack
+ * frame will be invalid.
+ */
+ finish_task_switch(this_rq(), prev);
} else
spin_unlock_irq(&rq->lock);
#endif /* CONFIG_PREEMPT */
-int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
+int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
+ void *key)
{
task_t *p = curr->private;
return try_to_wake_up(p, mode, sync);
* @key: is directly passed to the wakeup function
*/
void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
- int nr_exclusive, void *key)
+ int nr_exclusive, void *key)
{
unsigned long flags;
*
* On UP it can prevent extra preemption.
*/
-void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+void fastcall
+__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
{
unsigned long flags;
int sync = 1;
EXPORT_SYMBOL(interruptible_sleep_on);
-long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
+long fastcall __sched
+interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
SLEEP_ON_VAR
*/
int can_nice(const task_t *p, const int nice)
{
- /* convert nice value [19,-20] to rlimit style value [0,39] */
- int nice_rlim = 19 - nice;
+ /* convert nice value [19,-20] to rlimit style value [1,40] */
+ int nice_rlim = 20 - nice;
return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
capable(CAP_SYS_NICE));
}
{
return TASK_NICE(p);
}
-
-/*
- * The only users of task_nice are binfmt_elf and binfmt_elf32.
- * binfmt_elf is no longer modular, but binfmt_elf32 still is.
- * Therefore, task_nice is needed if there is a compat_mode.
- */
-#ifdef CONFIG_COMPAT
EXPORT_SYMBOL_GPL(task_nice);
-#endif
/**
* idle_cpu - is a given cpu idle currently?
p->policy = policy;
p->rt_priority = prio;
if (policy != SCHED_NORMAL)
- p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
+ p->prio = MAX_RT_PRIO-1 - p->rt_priority;
else
p->prio = p->static_prio;
}
* @policy: new policy.
* @param: structure containing the new RT priority.
*/
-int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param)
+int sched_setscheduler(struct task_struct *p, int policy,
+ struct sched_param *param)
{
int retval;
int oldprio, oldpolicy = -1;
* 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
*/
if (param->sched_priority < 0 ||
- param->sched_priority > MAX_USER_RT_PRIO-1)
+ (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
+ (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
return -EINVAL;
if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
return -EINVAL;
- if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
- param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur &&
- !capable(CAP_SYS_NICE))
- return -EPERM;
- if ((current->euid != p->euid) && (current->euid != p->uid) &&
- !capable(CAP_SYS_NICE))
- return -EPERM;
+ /*
+ * Allow unprivileged RT tasks to decrease priority:
+ */
+ if (!capable(CAP_SYS_NICE)) {
+ /* can't change policy */
+ if (policy != p->policy &&
+ !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
+ return -EPERM;
+ /* can't increase priority */
+ if (policy != SCHED_NORMAL &&
+ param->sched_priority > p->rt_priority &&
+ param->sched_priority >
+ p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
+ return -EPERM;
+ /* can't change other user's priorities */
+ if ((current->euid != p->euid) &&
+ (current->euid != p->uid))
+ return -EPERM;
+ }
retval = security_task_setscheduler(p, policy, param);
if (retval)
}
EXPORT_SYMBOL_GPL(sched_setscheduler);
-static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+static int
+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
{
int retval;
struct sched_param lparam;
if (rt_task(current))
target = rq->active;
- if (current->array->nr_active == 1) {
+ if (array->nr_active == 1) {
schedstat_inc(rq, yld_act_empty);
if (!rq->expired->nr_active)
schedstat_inc(rq, yld_both_empty);
static inline void __cond_resched(void)
{
+ /*
+ * The BKS might be reacquired before we have dropped
+ * PREEMPT_ACTIVE, which could trigger a second
+ * cond_resched() call.
+ */
+ if (unlikely(preempt_count()))
+ return;
do {
add_preempt_count(PREEMPT_ACTIVE);
schedule();
* operations here to prevent schedule() from being called twice (once via
* spin_unlock(), once by hand).
*/
-int cond_resched_lock(spinlock_t * lock)
+int cond_resched_lock(spinlock_t *lock)
{
int ret = 0;
return list_entry(p->sibling.next,struct task_struct,sibling);
}
-static void show_task(task_t * p)
+static void show_task(task_t *p)
{
task_t *relative;
unsigned state;
#endif
#ifdef CONFIG_DEBUG_STACK_USAGE
{
- unsigned long * n = (unsigned long *) (p->thread_info+1);
+ unsigned long *n = (unsigned long *) (p->thread_info+1);
while (!*n)
n++;
free = (unsigned long) n - (unsigned long)(p->thread_info+1);
read_unlock(&tasklist_lock);
}
+/**
+ * init_idle - set up an idle thread for a given CPU
+ * @idle: task in question
+ * @cpu: cpu the idle task belongs to
+ *
+ * NOTE: this function does not set the idle thread's NEED_RESCHED
+ * flag, to make booting more robust.
+ */
void __devinit init_idle(task_t *idle, int cpu)
{
runqueue_t *rq = cpu_rq(cpu);
spin_lock_irqsave(&rq->lock, flags);
rq->curr = rq->idle = idle;
- set_tsk_need_resched(idle);
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+ idle->oncpu = 1;
+#endif
spin_unlock_irqrestore(&rq->lock, flags);
/* Set the preempt count _outside_ the spinlocks! */
* thread migration by bumping thread off CPU then 'pushing' onto
* another runqueue.
*/
-static int migration_thread(void * data)
+static int migration_thread(void *data)
{
runqueue_t *rq;
int cpu = (long)data;
req = list_entry(head->next, migration_req_t, list);
list_del_init(head->next);
- if (req->type == REQ_MOVE_TASK) {
- spin_unlock(&rq->lock);
- __migrate_task(req->task, cpu, req->dest_cpu);
- local_irq_enable();
- } else if (req->type == REQ_SET_DOMAIN) {
- rq->sd = req->sd;
- spin_unlock_irq(&rq->lock);
- } else {
- spin_unlock_irq(&rq->lock);
- WARN_ON(1);
- }
+ spin_unlock(&rq->lock);
+ __migrate_task(req->task, cpu, req->dest_cpu);
+ local_irq_enable();
complete(&req->done);
}
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
/* Unbind it from offline cpu so it can run. Fall thru. */
- kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
+ kthread_bind(cpu_rq(cpu)->migration_thread,
+ any_online_cpu(cpu_online_map));
kthread_stop(cpu_rq(cpu)->migration_thread);
cpu_rq(cpu)->migration_thread = NULL;
break;
migration_req_t *req;
req = list_entry(rq->migration_queue.next,
migration_req_t, list);
- BUG_ON(req->type != REQ_MOVE_TASK);
list_del_init(&req->list);
complete(&req->done);
}
#endif
#ifdef CONFIG_SMP
-#define SCHED_DOMAIN_DEBUG
+#undef SCHED_DOMAIN_DEBUG
#ifdef SCHED_DOMAIN_DEBUG
static void sched_domain_debug(struct sched_domain *sd, int cpu)
{
int level = 0;
+ if (!sd) {
+ printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
+ return;
+ }
+
printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
do {
#define sched_domain_debug(sd, cpu) {}
#endif
+static int sd_degenerate(struct sched_domain *sd)
+{
+ if (cpus_weight(sd->span) == 1)
+ return 1;
+
+ /* Following flags need at least 2 groups */
+ if (sd->flags & (SD_LOAD_BALANCE |
+ SD_BALANCE_NEWIDLE |
+ SD_BALANCE_FORK |
+ SD_BALANCE_EXEC)) {
+ if (sd->groups != sd->groups->next)
+ return 0;
+ }
+
+ /* Following flags don't use groups */
+ if (sd->flags & (SD_WAKE_IDLE |
+ SD_WAKE_AFFINE |
+ SD_WAKE_BALANCE))
+ return 0;
+
+ return 1;
+}
+
+static int sd_parent_degenerate(struct sched_domain *sd,
+ struct sched_domain *parent)
+{
+ unsigned long cflags = sd->flags, pflags = parent->flags;
+
+ if (sd_degenerate(parent))
+ return 1;
+
+ if (!cpus_equal(sd->span, parent->span))
+ return 0;
+
+ /* Does parent contain flags not in child? */
+ /* WAKE_BALANCE is a subset of WAKE_AFFINE */
+ if (cflags & SD_WAKE_AFFINE)
+ pflags &= ~SD_WAKE_BALANCE;
+ /* Flags needing groups don't count if only 1 group in parent */
+ if (parent->groups == parent->groups->next) {
+ pflags &= ~(SD_LOAD_BALANCE |
+ SD_BALANCE_NEWIDLE |
+ SD_BALANCE_FORK |
+ SD_BALANCE_EXEC);
+ }
+ if (~cflags & pflags)
+ return 0;
+
+ return 1;
+}
+
/*
* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
* hold the hotplug lock.
*/
-void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu)
+static void cpu_attach_domain(struct sched_domain *sd, int cpu)
{
- migration_req_t req;
- unsigned long flags;
runqueue_t *rq = cpu_rq(cpu);
- int local = 1;
-
- sched_domain_debug(sd, cpu);
-
- spin_lock_irqsave(&rq->lock, flags);
+ struct sched_domain *tmp;
- if (cpu == smp_processor_id() || !cpu_online(cpu)) {
- rq->sd = sd;
- } else {
- init_completion(&req.done);
- req.type = REQ_SET_DOMAIN;
- req.sd = sd;
- list_add(&req.list, &rq->migration_queue);
- local = 0;
+ /* Remove the sched domains which do not contribute to scheduling. */
+ for (tmp = sd; tmp; tmp = tmp->parent) {
+ struct sched_domain *parent = tmp->parent;
+ if (!parent)
+ break;
+ if (sd_parent_degenerate(tmp, parent))
+ tmp->parent = parent->parent;
}
- spin_unlock_irqrestore(&rq->lock, flags);
+ if (sd && sd_degenerate(sd))
+ sd = sd->parent;
- if (!local) {
- wake_up_process(rq->migration_thread);
- wait_for_completion(&req.done);
- }
+ sched_domain_debug(sd, cpu);
+
+ rcu_assign_pointer(rq->sd, sd);
}
/* cpus with isolated domains */
-cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
+static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
/* Setup the mask of cpus configured for isolated domains */
static int __init isolated_cpu_setup(char *str)
* covered by the given span, and will set each group's ->cpumask correctly,
* and ->cpu_power to 0.
*/
-void __devinit init_sched_build_groups(struct sched_group groups[],
- cpumask_t span, int (*group_fn)(int cpu))
+static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
+ int (*group_fn)(int cpu))
{
struct sched_group *first = NULL, *last = NULL;
cpumask_t covered = CPU_MASK_NONE;
last->next = first;
}
+#define SD_NODES_PER_DOMAIN 16
-#ifdef ARCH_HAS_SCHED_DOMAIN
-extern void __devinit arch_init_sched_domains(void);
-extern void __devinit arch_destroy_sched_domains(void);
-#else
+#ifdef CONFIG_NUMA
+/**
+ * find_next_best_node - find the next node to include in a sched_domain
+ * @node: node whose sched_domain we're building
+ * @used_nodes: nodes already in the sched_domain
+ *
+ * Find the next node to include in a given scheduling domain. Simply
+ * finds the closest node not already in the @used_nodes map.
+ *
+ * Should use nodemask_t.
+ */
+static int find_next_best_node(int node, unsigned long *used_nodes)
+{
+ int i, n, val, min_val, best_node = 0;
+
+ min_val = INT_MAX;
+
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ /* Start at @node */
+ n = (node + i) % MAX_NUMNODES;
+
+ if (!nr_cpus_node(n))
+ continue;
+
+ /* Skip already used nodes */
+ if (test_bit(n, used_nodes))
+ continue;
+
+ /* Simple min distance search */
+ val = node_distance(node, n);
+
+ if (val < min_val) {
+ min_val = val;
+ best_node = n;
+ }
+ }
+
+ set_bit(best_node, used_nodes);
+ return best_node;
+}
+
+/**
+ * sched_domain_node_span - get a cpumask for a node's sched_domain
+ * @node: node whose cpumask we're constructing
+ * @size: number of nodes to include in this span
+ *
+ * Given a node, construct a good cpumask for its sched_domain to span. It
+ * should be one that prevents unnecessary balancing, but also spreads tasks
+ * out optimally.
+ */
+static cpumask_t sched_domain_node_span(int node)
+{
+ int i;
+ cpumask_t span, nodemask;
+ DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
+
+ cpus_clear(span);
+ bitmap_zero(used_nodes, MAX_NUMNODES);
+
+ nodemask = node_to_cpumask(node);
+ cpus_or(span, span, nodemask);
+ set_bit(node, used_nodes);
+
+ for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
+ int next_node = find_next_best_node(node, used_nodes);
+ nodemask = node_to_cpumask(next_node);
+ cpus_or(span, span, nodemask);
+ }
+
+ return span;
+}
+#endif
+
+/*
+ * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
+ * can switch it on easily if needed.
+ */
#ifdef CONFIG_SCHED_SMT
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
static struct sched_group sched_group_cpus[NR_CPUS];
-static int __devinit cpu_to_cpu_group(int cpu)
+static int cpu_to_cpu_group(int cpu)
{
return cpu;
}
static DEFINE_PER_CPU(struct sched_domain, phys_domains);
static struct sched_group sched_group_phys[NR_CPUS];
-static int __devinit cpu_to_phys_group(int cpu)
+static int cpu_to_phys_group(int cpu)
{
#ifdef CONFIG_SCHED_SMT
return first_cpu(cpu_sibling_map[cpu]);
}
#ifdef CONFIG_NUMA
-
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
-static struct sched_group sched_group_nodes[MAX_NUMNODES];
-static int __devinit cpu_to_node_group(int cpu)
-{
- return cpu_to_node(cpu);
-}
-#endif
-
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
/*
- * The domains setup code relies on siblings not spanning
- * multiple nodes. Make sure the architecture has a proper
- * siblings map:
+ * The init_sched_build_groups can't handle what we want to do with node
+ * groups, so roll our own. Now each node has its own list of groups which
+ * gets dynamically allocated.
*/
-static void check_sibling_maps(void)
-{
- int i, j;
+static DEFINE_PER_CPU(struct sched_domain, node_domains);
+static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
- for_each_online_cpu(i) {
- for_each_cpu_mask(j, cpu_sibling_map[i]) {
- if (cpu_to_node(i) != cpu_to_node(j)) {
- printk(KERN_INFO "warning: CPU %d siblings map "
- "to different node - isolating "
- "them.\n", i);
- cpu_sibling_map[i] = cpumask_of_cpu(i);
- break;
- }
- }
- }
+static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
+static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
+
+static int cpu_to_allnodes_group(int cpu)
+{
+ return cpu_to_node(cpu);
}
#endif
/*
- * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ * Build sched domains for a given set of cpus and attach the sched domains
+ * to the individual cpus
*/
-static void __devinit arch_init_sched_domains(void)
+void build_sched_domains(const cpumask_t *cpu_map)
{
int i;
- cpumask_t cpu_default_map;
+#ifdef CONFIG_NUMA
+ struct sched_group **sched_group_nodes = NULL;
+ struct sched_group *sched_group_allnodes = NULL;
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
- check_sibling_maps();
-#endif
/*
- * Setup mask for cpus without special case scheduling requirements.
- * For now this just excludes isolated cpus, but could be used to
- * exclude other special cases in the future.
+ * Allocate the per-node list of sched groups
*/
- cpus_complement(cpu_default_map, cpu_isolated_map);
- cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
+ sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
+ GFP_ATOMIC);
+ if (!sched_group_nodes) {
+ printk(KERN_WARNING "Can not alloc sched group node list\n");
+ return;
+ }
+ sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
+#endif
/*
- * Set up domains. Isolated domains just stay on the dummy domain.
+ * Set up domains for cpus specified by the cpu_map.
*/
- for_each_cpu_mask(i, cpu_default_map) {
+ for_each_cpu_mask(i, *cpu_map) {
int group;
struct sched_domain *sd = NULL, *p;
cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
- cpus_and(nodemask, nodemask, cpu_default_map);
+ cpus_and(nodemask, nodemask, *cpu_map);
#ifdef CONFIG_NUMA
+ if (cpus_weight(*cpu_map)
+ > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
+ if (!sched_group_allnodes) {
+ sched_group_allnodes
+ = kmalloc(sizeof(struct sched_group)
+ * MAX_NUMNODES,
+ GFP_KERNEL);
+ if (!sched_group_allnodes) {
+ printk(KERN_WARNING
+ "Can not alloc allnodes sched group\n");
+ break;
+ }
+ sched_group_allnodes_bycpu[i]
+ = sched_group_allnodes;
+ }
+ sd = &per_cpu(allnodes_domains, i);
+ *sd = SD_ALLNODES_INIT;
+ sd->span = *cpu_map;
+ group = cpu_to_allnodes_group(i);
+ sd->groups = &sched_group_allnodes[group];
+ p = sd;
+ } else
+ p = NULL;
+
sd = &per_cpu(node_domains, i);
- group = cpu_to_node_group(i);
*sd = SD_NODE_INIT;
- sd->span = cpu_default_map;
- sd->groups = &sched_group_nodes[group];
+ sd->span = sched_domain_node_span(cpu_to_node(i));
+ sd->parent = p;
+ cpus_and(sd->span, sd->span, *cpu_map);
#endif
p = sd;
group = cpu_to_cpu_group(i);
*sd = SD_SIBLING_INIT;
sd->span = cpu_sibling_map[i];
- cpus_and(sd->span, sd->span, cpu_default_map);
+ cpus_and(sd->span, sd->span, *cpu_map);
sd->parent = p;
sd->groups = &sched_group_cpus[group];
#endif
#ifdef CONFIG_SCHED_SMT
/* Set up CPU (sibling) groups */
- for_each_online_cpu(i) {
+ for_each_cpu_mask(i, *cpu_map) {
cpumask_t this_sibling_map = cpu_sibling_map[i];
- cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
+ cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
if (i != first_cpu(this_sibling_map))
continue;
for (i = 0; i < MAX_NUMNODES; i++) {
cpumask_t nodemask = node_to_cpumask(i);
- cpus_and(nodemask, nodemask, cpu_default_map);
+ cpus_and(nodemask, nodemask, *cpu_map);
if (cpus_empty(nodemask))
continue;
#ifdef CONFIG_NUMA
/* Set up node groups */
- init_sched_build_groups(sched_group_nodes, cpu_default_map,
- &cpu_to_node_group);
+ if (sched_group_allnodes)
+ init_sched_build_groups(sched_group_allnodes, *cpu_map,
+ &cpu_to_allnodes_group);
+
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ /* Set up node groups */
+ struct sched_group *sg, *prev;
+ cpumask_t nodemask = node_to_cpumask(i);
+ cpumask_t domainspan;
+ cpumask_t covered = CPU_MASK_NONE;
+ int j;
+
+ cpus_and(nodemask, nodemask, *cpu_map);
+ if (cpus_empty(nodemask)) {
+ sched_group_nodes[i] = NULL;
+ continue;
+ }
+
+ domainspan = sched_domain_node_span(i);
+ cpus_and(domainspan, domainspan, *cpu_map);
+
+ sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+ sched_group_nodes[i] = sg;
+ for_each_cpu_mask(j, nodemask) {
+ struct sched_domain *sd;
+ sd = &per_cpu(node_domains, j);
+ sd->groups = sg;
+ if (sd->groups == NULL) {
+ /* Turn off balancing if we have no groups */
+ sd->flags = 0;
+ }
+ }
+ if (!sg) {
+ printk(KERN_WARNING
+ "Can not alloc domain group for node %d\n", i);
+ continue;
+ }
+ sg->cpu_power = 0;
+ sg->cpumask = nodemask;
+ cpus_or(covered, covered, nodemask);
+ prev = sg;
+
+ for (j = 0; j < MAX_NUMNODES; j++) {
+ cpumask_t tmp, notcovered;
+ int n = (i + j) % MAX_NUMNODES;
+
+ cpus_complement(notcovered, covered);
+ cpus_and(tmp, notcovered, *cpu_map);
+ cpus_and(tmp, tmp, domainspan);
+ if (cpus_empty(tmp))
+ break;
+
+ nodemask = node_to_cpumask(n);
+ cpus_and(tmp, tmp, nodemask);
+ if (cpus_empty(tmp))
+ continue;
+
+ sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+ if (!sg) {
+ printk(KERN_WARNING
+ "Can not alloc domain group for node %d\n", j);
+ break;
+ }
+ sg->cpu_power = 0;
+ sg->cpumask = tmp;
+ cpus_or(covered, covered, tmp);
+ prev->next = sg;
+ prev = sg;
+ }
+ prev->next = sched_group_nodes[i];
+ }
#endif
/* Calculate CPU power for physical packages and nodes */
- for_each_cpu_mask(i, cpu_default_map) {
+ for_each_cpu_mask(i, *cpu_map) {
int power;
struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
sd->groups->cpu_power = power;
#ifdef CONFIG_NUMA
- if (i == first_cpu(sd->groups->cpumask)) {
- /* Only add "power" once for each physical package. */
- sd = &per_cpu(node_domains, i);
- sd->groups->cpu_power += power;
+ sd = &per_cpu(allnodes_domains, i);
+ if (sd->groups) {
+ power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+ (cpus_weight(sd->groups->cpumask)-1) / 10;
+ sd->groups->cpu_power = power;
}
#endif
}
+#ifdef CONFIG_NUMA
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ struct sched_group *sg = sched_group_nodes[i];
+ int j;
+
+ if (sg == NULL)
+ continue;
+next_sg:
+ for_each_cpu_mask(j, sg->cpumask) {
+ struct sched_domain *sd;
+ int power;
+
+ sd = &per_cpu(phys_domains, j);
+ if (j != first_cpu(sd->groups->cpumask)) {
+ /*
+ * Only add "power" once for each
+ * physical package.
+ */
+ continue;
+ }
+ power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+ (cpus_weight(sd->groups->cpumask)-1) / 10;
+
+ sg->cpu_power += power;
+ }
+ sg = sg->next;
+ if (sg != sched_group_nodes[i])
+ goto next_sg;
+ }
+#endif
+
/* Attach the domains */
- for_each_online_cpu(i) {
+ for_each_cpu_mask(i, *cpu_map) {
struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
sd = &per_cpu(cpu_domains, i);
cpu_attach_domain(sd, i);
}
}
-
-#ifdef CONFIG_HOTPLUG_CPU
-static void __devinit arch_destroy_sched_domains(void)
+/*
+ * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ */
+static void arch_init_sched_domains(const cpumask_t *cpu_map)
{
- /* Do nothing: everything is statically allocated. */
+ cpumask_t cpu_default_map;
+
+ /*
+ * Setup mask for cpus without special case scheduling requirements.
+ * For now this just excludes isolated cpus, but could be used to
+ * exclude other special cases in the future.
+ */
+ cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
+
+ build_sched_domains(&cpu_default_map);
}
+
+static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
+{
+#ifdef CONFIG_NUMA
+ int i;
+ int cpu;
+
+ for_each_cpu_mask(cpu, *cpu_map) {
+ struct sched_group *sched_group_allnodes
+ = sched_group_allnodes_bycpu[cpu];
+ struct sched_group **sched_group_nodes
+ = sched_group_nodes_bycpu[cpu];
+
+ if (sched_group_allnodes) {
+ kfree(sched_group_allnodes);
+ sched_group_allnodes_bycpu[cpu] = NULL;
+ }
+
+ if (!sched_group_nodes)
+ continue;
+
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ cpumask_t nodemask = node_to_cpumask(i);
+ struct sched_group *oldsg, *sg = sched_group_nodes[i];
+
+ cpus_and(nodemask, nodemask, *cpu_map);
+ if (cpus_empty(nodemask))
+ continue;
+
+ if (sg == NULL)
+ continue;
+ sg = sg->next;
+next_sg:
+ oldsg = sg;
+ sg = sg->next;
+ kfree(oldsg);
+ if (oldsg != sched_group_nodes[i])
+ goto next_sg;
+ }
+ kfree(sched_group_nodes);
+ sched_group_nodes_bycpu[cpu] = NULL;
+ }
#endif
+}
+
+/*
+ * Detach sched domains from a group of cpus specified in cpu_map
+ * These cpus will now be attached to the NULL domain
+ */
+static inline void detach_destroy_domains(const cpumask_t *cpu_map)
+{
+ int i;
-#endif /* ARCH_HAS_SCHED_DOMAIN */
+ for_each_cpu_mask(i, *cpu_map)
+ cpu_attach_domain(NULL, i);
+ synchronize_sched();
+ arch_destroy_sched_domains(cpu_map);
+}
/*
- * Initial dummy domain for early boot and for hotplug cpu. Being static,
- * it is initialized to zero, so all balancing flags are cleared which is
- * what we want.
+ * Partition sched domains as specified by the cpumasks below.
+ * This attaches all cpus from the cpumasks to the NULL domain,
+ * waits for a RCU quiescent period, recalculates sched
+ * domain information and then attaches them back to the
+ * correct sched domains
+ * Call with hotplug lock held
*/
-static struct sched_domain sched_domain_dummy;
+void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
+{
+ cpumask_t change_map;
+
+ cpus_and(*partition1, *partition1, cpu_online_map);
+ cpus_and(*partition2, *partition2, cpu_online_map);
+ cpus_or(change_map, *partition1, *partition2);
+
+ /* Detach sched domains from all of the affected cpus */
+ detach_destroy_domains(&change_map);
+ if (!cpus_empty(*partition1))
+ build_sched_domains(partition1);
+ if (!cpus_empty(*partition2))
+ build_sched_domains(partition2);
+}
#ifdef CONFIG_HOTPLUG_CPU
/*
* Force a reinitialization of the sched domains hierarchy. The domains
* and groups cannot be updated in place without racing with the balancing
- * code, so we temporarily attach all running cpus to a "dummy" domain
+ * code, so we temporarily attach all running cpus to the NULL domain
* which will prevent rebalancing while the sched domains are recalculated.
*/
static int update_sched_domains(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
- int i;
-
switch (action) {
case CPU_UP_PREPARE:
case CPU_DOWN_PREPARE:
- for_each_online_cpu(i)
- cpu_attach_domain(&sched_domain_dummy, i);
- arch_destroy_sched_domains();
+ detach_destroy_domains(&cpu_online_map);
return NOTIFY_OK;
case CPU_UP_CANCELED:
}
/* The hotplug lock is already held by cpu_up/cpu_down */
- arch_init_sched_domains();
+ arch_init_sched_domains(&cpu_online_map);
return NOTIFY_OK;
}
void __init sched_init_smp(void)
{
lock_cpu_hotplug();
- arch_init_sched_domains();
+ arch_init_sched_domains(&cpu_online_map);
unlock_cpu_hotplug();
/* XXX: Theoretical race here - CPU may be hotplugged now */
hotcpu_notifier(update_sched_domains, 0);
rq = cpu_rq(i);
spin_lock_init(&rq->lock);
+ rq->nr_running = 0;
rq->active = rq->arrays;
rq->expired = rq->arrays + 1;
rq->best_expired_prio = MAX_PRIO;
#ifdef CONFIG_SMP
- rq->sd = &sched_domain_dummy;
- rq->cpu_load = 0;
+ rq->sd = NULL;
+ for (j = 1; j < 3; j++)
+ rq->cpu_load[j] = 0;
rq->active_balance = 0;
rq->push_cpu = 0;
rq->migration_thread = NULL;
}
#endif /* CONFIG_MAGIC_SYSRQ */
+
+#ifdef CONFIG_IA64
+/*
+ * These functions are only useful for the IA64 MCA handling.
+ *
+ * They can only be called when the whole system has been
+ * stopped - every CPU needs to be quiescent, and no scheduling
+ * activity can take place. Using them for anything else would
+ * be a serious bug, and as a result, they aren't even visible
+ * under any other configuration.
+ */
+
+/**
+ * curr_task - return the current task for a given cpu.
+ * @cpu: the processor in question.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ */
+task_t *curr_task(int cpu)
+{
+ return cpu_curr(cpu);
+}
+
+/**
+ * set_curr_task - set the current task for a given cpu.
+ * @cpu: the processor in question.
+ * @p: the task pointer to set.
+ *
+ * Description: This function must only be used when non-maskable interrupts
+ * are serviced on a separate stack. It allows the architecture to switch the
+ * notion of the current task on a cpu in a non-blocking manner. This function
+ * must be called with all CPU's synchronized, and interrupts disabled, the
+ * and caller must save the original value of the current task (see
+ * curr_task() above) and restore that value before reenabling interrupts and
+ * re-starting the system.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ */
+void set_curr_task(int cpu, task_t *p)
+{
+ cpu_curr(cpu) = p;
+}
+
+#endif