#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
+#include <linux/perf_counter.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
-#include <linux/bootmem.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
-#include <trace/sched.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
#include "sched_cpupri.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+
/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
* to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
*/
#define RUNTIME_INF ((u64)~0ULL)
-DEFINE_TRACE(sched_wait_task);
-DEFINE_TRACE(sched_wakeup);
-DEFINE_TRACE(sched_wakeup_new);
-DEFINE_TRACE(sched_switch);
-DEFINE_TRACE(sched_migrate_task);
-
#ifdef CONFIG_SMP
static void double_rq_lock(struct rq *rq1, struct rq *rq2);
hard = hrtimer_get_expires(&rt_b->rt_period_timer);
delta = ktime_to_ns(ktime_sub(hard, soft));
__hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
- HRTIMER_MODE_ABS, 0);
+ HRTIMER_MODE_ABS_PINNED, 0);
}
spin_unlock(&rt_b->rt_runtime_lock);
}
#endif
#ifdef CONFIG_SMP
unsigned long rt_nr_migratory;
+ unsigned long rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
#endif
struct load_weight load;
unsigned long nr_load_updates;
u64 nr_switches;
+ u64 nr_migrations_in;
struct cfs_rq cfs;
struct rt_rq rt;
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
-static inline void update_rq_clock(struct rq *rq)
+inline void update_rq_clock(struct rq *rq)
{
rq->clock = sched_clock_cpu(cpu_of(rq));
}
static void hrtick_start(struct rq *rq, u64 delay)
{
__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
- HRTIMER_MODE_REL, 0);
+ HRTIMER_MODE_REL_PINNED, 0);
}
static inline void init_hrtick(void)
clock_offset = old_rq->clock - new_rq->clock;
- trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+ trace_sched_migrate_task(p, new_cpu);
#ifdef CONFIG_SCHEDSTATS
if (p->se.wait_start)
p->se.sleep_start -= clock_offset;
if (p->se.block_start)
p->se.block_start -= clock_offset;
+#endif
if (old_cpu != new_cpu) {
- schedstat_inc(p, se.nr_migrations);
+ p->se.nr_migrations++;
+ new_rq->nr_migrations_in++;
+#ifdef CONFIG_SCHEDSTATS
if (task_hot(p, old_rq->clock, NULL))
schedstat_inc(p, se.nr_forced2_migrations);
- }
#endif
+ perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS,
+ 1, 1, NULL, 0);
+ }
p->se.vruntime -= old_cfsrq->min_vruntime -
new_cfsrq->min_vruntime;
}
/*
+ * wait_task_context_switch - wait for a thread to complete at least one
+ * context switch.
+ *
+ * @p must not be current.
+ */
+void wait_task_context_switch(struct task_struct *p)
+{
+ unsigned long nvcsw, nivcsw, flags;
+ int running;
+ struct rq *rq;
+
+ nvcsw = p->nvcsw;
+ nivcsw = p->nivcsw;
+ for (;;) {
+ /*
+ * The runqueue is assigned before the actual context
+ * switch. We need to take the runqueue lock.
+ *
+ * We could check initially without the lock but it is
+ * very likely that we need to take the lock in every
+ * iteration.
+ */
+ rq = task_rq_lock(p, &flags);
+ running = task_running(rq, p);
+ task_rq_unlock(rq, &flags);
+
+ if (likely(!running))
+ break;
+ /*
+ * The switch count is incremented before the actual
+ * context switch. We thus wait for two switches to be
+ * sure at least one completed.
+ */
+ if ((p->nvcsw - nvcsw) > 1)
+ break;
+ if ((p->nivcsw - nivcsw) > 1)
+ break;
+
+ cpu_relax();
+ }
+}
+
+/*
* wait_task_inactive - wait for a thread to unschedule.
*
* If @match_state is nonzero, it's the @p->state value just checked and
smp_send_reschedule(cpu);
preempt_enable();
}
+EXPORT_SYMBOL_GPL(kick_process);
/*
* Return a low guess at the load of a migration-source cpu weighted
#endif /* CONFIG_SMP */
+/**
+ * task_oncpu_function_call - call a function on the cpu on which a task runs
+ * @p: the task to evaluate
+ * @func: the function to be called
+ * @info: the function call argument
+ *
+ * Calls the function @func when the task is currently running. This might
+ * be on the current CPU, which just calls the function directly
+ */
+void task_oncpu_function_call(struct task_struct *p,
+ void (*func) (void *info), void *info)
+{
+ int cpu;
+
+ preempt_disable();
+ cpu = task_cpu(p);
+ if (task_curr(p))
+ smp_call_function_single(cpu, func, info, 1);
+ preempt_enable();
+}
+
/***
* try_to_wake_up - wake up a thread
* @p: the to-be-woken-up thread
return success;
}
+/**
+ * wake_up_process - Wake up a specific process
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes. Returns 1 if the process was woken up, 0 if it was already
+ * running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
int wake_up_process(struct task_struct *p)
{
return try_to_wake_up(p, TASK_ALL, 0);
p->se.exec_start = 0;
p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0;
+ p->se.nr_migrations = 0;
p->se.last_wakeup = 0;
p->se.avg_overlap = 0;
p->se.start_runtime = 0;
p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
#ifdef CONFIG_SCHEDSTATS
- p->se.wait_start = 0;
- p->se.sum_sleep_runtime = 0;
- p->se.sleep_start = 0;
- p->se.block_start = 0;
- p->se.sleep_max = 0;
- p->se.block_max = 0;
- p->se.exec_max = 0;
- p->se.slice_max = 0;
- p->se.wait_max = 0;
+ p->se.wait_start = 0;
+ p->se.wait_max = 0;
+ p->se.wait_count = 0;
+ p->se.wait_sum = 0;
+
+ p->se.sleep_start = 0;
+ p->se.sleep_max = 0;
+ p->se.sum_sleep_runtime = 0;
+
+ p->se.block_start = 0;
+ p->se.block_max = 0;
+ p->se.exec_max = 0;
+ p->se.slice_max = 0;
+
+ p->se.nr_migrations_cold = 0;
+ p->se.nr_failed_migrations_affine = 0;
+ p->se.nr_failed_migrations_running = 0;
+ p->se.nr_failed_migrations_hot = 0;
+ p->se.nr_forced_migrations = 0;
+ p->se.nr_forced2_migrations = 0;
+
+ p->se.nr_wakeups = 0;
+ p->se.nr_wakeups_sync = 0;
+ p->se.nr_wakeups_migrate = 0;
+ p->se.nr_wakeups_local = 0;
+ p->se.nr_wakeups_remote = 0;
+ p->se.nr_wakeups_affine = 0;
+ p->se.nr_wakeups_affine_attempts = 0;
+ p->se.nr_wakeups_passive = 0;
+ p->se.nr_wakeups_idle = 0;
+
#endif
INIT_LIST_HEAD(&p->rt.run_list);
*/
prev_state = prev->state;
finish_arch_switch(prev);
+ perf_counter_task_sched_in(current, cpu_of(rq));
finish_lock_switch(rq, prev);
#ifdef CONFIG_SMP
if (post_schedule)
* combine the page table reload and the switch backend into
* one hypercall.
*/
- arch_enter_lazy_cpu_mode();
+ arch_start_context_switch(prev);
if (unlikely(!mm)) {
next->active_mm = oldmm;
}
/*
+ * Externally visible per-cpu scheduler statistics:
+ * cpu_nr_migrations(cpu) - number of migrations into that cpu
+ */
+u64 cpu_nr_migrations(int cpu)
+{
+ return cpu_rq(cpu)->nr_migrations_in;
+}
+
+/*
* Update rq->cpu_load[] statistics. This function is usually called every
* scheduler tick (TICK_NSEC).
*/
.load_balancer = ATOMIC_INIT(-1),
};
+int get_nohz_load_balancer(void)
+{
+ return atomic_read(&nohz.load_balancer);
+}
+
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
/**
* lowest_flag_domain - Return lowest sched_domain containing flag.
curr->sched_class->task_tick(rq, curr, 0);
spin_unlock(&rq->lock);
+ perf_counter_task_tick(curr, cpu);
+
#ifdef CONFIG_SMP
rq->idle_at_tick = idle_cpu(cpu);
trigger_load_balance(rq, cpu);
if (likely(prev != next)) {
sched_info_switch(prev, next);
+ perf_counter_task_sched_out(prev, next, cpu);
rq->nr_switches++;
rq->curr = next;
* @mode: which threads
* @nr_exclusive: how many wake-one or wake-many threads to wake up
* @key: is directly passed to the wakeup function
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
*/
void __wake_up(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, void *key)
* with each other. This can prevent needless bouncing between CPUs.
*
* On UP it can prevent extra preemption.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
*/
void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, void *key)
* awakened in the same order in which they were queued.
*
* See also complete_all(), wait_for_completion() and related routines.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
*/
void complete(struct completion *x)
{
* @x: holds the state of this particular completion
*
* This will wake up all threads waiting on this particular completion event.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
*/
void complete_all(struct completion *x)
{
return 0;
}
+static inline int should_resched(void)
+{
+ return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
+}
+
static void __cond_resched(void)
{
#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
int __sched _cond_resched(void)
{
- if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
- system_state == SYSTEM_RUNNING) {
+ if (should_resched()) {
__cond_resched();
return 1;
}
*/
int cond_resched_lock(spinlock_t *lock)
{
- int resched = need_resched() && system_state == SYSTEM_RUNNING;
+ int resched = should_resched();
int ret = 0;
if (spin_needbreak(lock) || resched) {
spin_unlock(lock);
- if (resched && need_resched())
+ if (resched)
__cond_resched();
else
cpu_relax();
{
BUG_ON(!in_softirq());
- if (need_resched() && system_state == SYSTEM_RUNNING) {
+ if (should_resched()) {
local_bh_enable();
__cond_resched();
local_bh_disable();
if (cpu_is_offline(cpu)) {
spin_unlock_irq(&rq->lock);
- goto wait_to_die;
+ break;
}
if (rq->active_balance) {
complete(&req->done);
}
__set_current_state(TASK_RUNNING);
- return 0;
-wait_to_die:
- /* Wait for kthread_stop */
- set_current_state(TASK_INTERRUPTIBLE);
- while (!kthread_should_stop()) {
- schedule();
- set_current_state(TASK_INTERRUPTIBLE);
- }
- __set_current_state(TASK_RUNNING);
return 0;
}
static void calc_global_load_remove(struct rq *rq)
{
atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+ rq->calc_load_active = 0;
}
#endif /* CONFIG_HOTPLUG_CPU */
rq = task_rq_lock(p, &flags);
__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
task_rq_unlock(rq, &flags);
+ get_task_struct(p);
cpu_rq(cpu)->migration_thread = p;
+ rq->calc_load_update = calc_load_update;
break;
case CPU_ONLINE:
/* Update our root-domain */
rq = cpu_rq(cpu);
spin_lock_irqsave(&rq->lock, flags);
- rq->calc_load_update = calc_load_update;
- rq->calc_load_active = 0;
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
kthread_bind(cpu_rq(cpu)->migration_thread,
cpumask_any(cpu_online_mask));
kthread_stop(cpu_rq(cpu)->migration_thread);
+ put_task_struct(cpu_rq(cpu)->migration_thread);
cpu_rq(cpu)->migration_thread = NULL;
break;
migrate_live_tasks(cpu);
rq = cpu_rq(cpu);
kthread_stop(rq->migration_thread);
+ put_task_struct(rq->migration_thread);
rq->migration_thread = NULL;
/* Idle task back to normal (off runqueue, low prio) */
spin_lock_irq(&rq->lock);
return NOTIFY_OK;
}
-/* Register at highest priority so that task migration (migrate_all_tasks)
- * happens before everything else.
+/*
+ * Register at high priority so that task migration (migrate_all_tasks)
+ * happens before everything else. This has to be lower priority than
+ * the notifier in the perf_counter subsystem, though.
*/
static struct notifier_block __cpuinitdata migration_notifier = {
.notifier_call = migration_call,
free_rootdomain(old_rd);
}
-static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
+static int init_rootdomain(struct root_domain *rd, bool bootmem)
{
+ gfp_t gfp = GFP_KERNEL;
+
memset(rd, 0, sizeof(*rd));
- if (bootmem) {
- alloc_bootmem_cpumask_var(&def_root_domain.span);
- alloc_bootmem_cpumask_var(&def_root_domain.online);
- alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
- cpupri_init(&rd->cpupri, true);
- return 0;
- }
+ if (bootmem)
+ gfp = GFP_NOWAIT;
- if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+ if (!alloc_cpumask_var(&rd->span, gfp))
goto out;
- if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+ if (!alloc_cpumask_var(&rd->online, gfp))
goto free_span;
- if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+ if (!alloc_cpumask_var(&rd->rto_mask, gfp))
goto free_online;
- if (cpupri_init(&rd->cpupri, false) != 0)
+ if (cpupri_init(&rd->cpupri, bootmem) != 0)
goto free_rto_mask;
return 0;
/*
* The cpus mask in sched_group and sched_domain hangs off the end.
- * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
- * for nr_cpu_ids < CONFIG_NR_CPUS.
+ *
+ * ( See the the comments in include/linux/sched.h:struct sched_group
+ * and struct sched_domain. )
*/
struct static_sched_group {
struct sched_group sg;
}
#endif /* CONFIG_SMP */
+const_debug unsigned int sysctl_timer_migration = 1;
+
int in_sched_functions(unsigned long addr)
{
return in_lock_functions(addr) ||
#ifdef CONFIG_SMP
rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
- plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
+ plist_head_init(&rt_rq->pushable_tasks, &rq->lock);
#endif
rt_rq->rt_time = 0;
* we use alloc_bootmem().
*/
if (alloc_size) {
- ptr = (unsigned long)alloc_bootmem(alloc_size);
+ ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
#ifdef CONFIG_FAIR_GROUP_SCHED
init_task_group.se = (struct sched_entity **)ptr;
* 1024) and two child groups A0 and A1 (of weight 1024 each),
* then A0's share of the cpu resource is:
*
- * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
+ * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
*
* We achieve this by letting init_task_group's tasks sit
* directly in rq->cfs (i.e init_task_group->se[] = NULL).
current->sched_class = &fair_sched_class;
/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
- alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+ alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ
- alloc_bootmem_cpumask_var(&nohz.cpu_mask);
- alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask);
+ alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
+ alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
#endif
- alloc_bootmem_cpumask_var(&cpu_isolated_map);
+ alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
#endif /* SMP */
+ perf_counter_init();
+
scheduler_running = 1;
}