X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=kernel%2Fsched.c;h=d906f72b42d23ae1d8c2355d9b605e5fd0761eaa;hb=92b29b86fe2e183d44eb467e5e74a5f718ef2e43;hp=827c1b416da6ea9e80cdb8459bd81463f33e1b50;hpb=032f82786f9be4635acaa5f77feca175a4ac5fe1;p=safe%2Fjmp%2Flinux-2.6 diff --git a/kernel/sched.c b/kernel/sched.c index 827c1b4..d906f72 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -70,6 +70,8 @@ #include #include #include +#include +#include #include #include @@ -200,14 +202,19 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rt_b->rt_period_timer.function = sched_rt_period_timer; - rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; +} + +static inline int rt_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; } static void start_rt_bandwidth(struct rt_bandwidth *rt_b) { ktime_t now; - if (rt_b->rt_runtime == RUNTIME_INF) + if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) return; if (hrtimer_active(&rt_b->rt_period_timer)) @@ -297,9 +304,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; #endif /* CONFIG_RT_GROUP_SCHED */ -#else /* !CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_USER_SCHED */ /* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. @@ -570,8 +577,10 @@ struct rq { #endif #ifdef CONFIG_SCHED_HRTICK - unsigned long hrtick_flags; - ktime_t hrtick_expire; +#ifdef CONFIG_SMP + int hrtick_csd_pending; + struct call_single_data hrtick_csd; +#endif struct hrtimer hrtick_timer; #endif @@ -597,14 +606,13 @@ struct rq { /* BKL stats */ unsigned int bkl_count; #endif - struct lock_class_key rq_lock_key; }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) +static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) { - rq->curr->sched_class->check_preempt_curr(rq, p); + rq->curr->sched_class->check_preempt_curr(rq, p, sync); } static inline int cpu_of(struct rq *rq) @@ -645,6 +653,24 @@ static inline void update_rq_clock(struct rq *rq) # define const_debug static const #endif +/** + * runqueue_is_locked + * + * Returns true if the current cpu runqueue is locked. + * This interface allows printk to be called with the runqueue lock + * held and know whether or not it is OK to wake up the klogd. + */ +int runqueue_is_locked(void) +{ + int cpu = get_cpu(); + struct rq *rq = cpu_rq(cpu); + int ret; + + ret = spin_is_locked(&rq->lock); + put_cpu(); + return ret; +} + /* * Debugging: various feature bits */ @@ -788,9 +814,9 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; /* * ratelimit for updating the group shares. - * default: 0.5ms + * default: 0.25ms */ -const_debug unsigned int sysctl_sched_shares_ratelimit = 500000; +unsigned int sysctl_sched_shares_ratelimit = 250000; /* * period over which we measure -rt task cpu usage in us. @@ -813,7 +839,7 @@ static inline u64 global_rt_period(void) static inline u64 global_rt_runtime(void) { - if (sysctl_sched_rt_period < 0) + if (sysctl_sched_rt_runtime < 0) return RUNTIME_INF; return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; @@ -964,13 +990,6 @@ static struct rq *this_rq_lock(void) return rq; } -static void __resched_task(struct task_struct *p, int tif_bit); - -static inline void resched_task(struct task_struct *p) -{ - __resched_task(p, TIF_NEED_RESCHED); -} - #ifdef CONFIG_SCHED_HRTICK /* * Use HR-timers to deliver accurate preemption points. @@ -982,25 +1001,6 @@ static inline void resched_task(struct task_struct *p) * When we get rescheduled we reprogram the hrtick_timer outside of the * rq->lock. */ -static inline void resched_hrt(struct task_struct *p) -{ - __resched_task(p, TIF_HRTICK_RESCHED); -} - -static inline void resched_rq(struct rq *rq) -{ - unsigned long flags; - - spin_lock_irqsave(&rq->lock, flags); - resched_task(rq->curr); - spin_unlock_irqrestore(&rq->lock, flags); -} - -enum { - HRTICK_SET, /* re-programm hrtick_timer */ - HRTICK_RESET, /* not a new slice */ - HRTICK_BLOCK, /* stop hrtick operations */ -}; /* * Use hrtick when: @@ -1011,40 +1011,11 @@ static inline int hrtick_enabled(struct rq *rq) { if (!sched_feat(HRTICK)) return 0; - if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags))) + if (!cpu_active(cpu_of(rq))) return 0; return hrtimer_is_hres_active(&rq->hrtick_timer); } -/* - * Called to set the hrtick timer state. - * - * called with rq->lock held and irqs disabled - */ -static void hrtick_start(struct rq *rq, u64 delay, int reset) -{ - assert_spin_locked(&rq->lock); - - /* - * preempt at: now + delay - */ - rq->hrtick_expire = - ktime_add_ns(rq->hrtick_timer.base->get_time(), delay); - /* - * indicate we need to program the timer - */ - __set_bit(HRTICK_SET, &rq->hrtick_flags); - if (reset) - __set_bit(HRTICK_RESET, &rq->hrtick_flags); - - /* - * New slices are called from the schedule path and don't need a - * forced reschedule. - */ - if (reset) - resched_hrt(rq->curr); -} - static void hrtick_clear(struct rq *rq) { if (hrtimer_active(&rq->hrtick_timer)) @@ -1052,32 +1023,6 @@ static void hrtick_clear(struct rq *rq) } /* - * Update the timer from the possible pending state. - */ -static void hrtick_set(struct rq *rq) -{ - ktime_t time; - int set, reset; - unsigned long flags; - - WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); - - spin_lock_irqsave(&rq->lock, flags); - set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags); - reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags); - time = rq->hrtick_expire; - clear_thread_flag(TIF_HRTICK_RESCHED); - spin_unlock_irqrestore(&rq->lock, flags); - - if (set) { - hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS); - if (reset && !hrtimer_active(&rq->hrtick_timer)) - resched_rq(rq); - } else - hrtick_clear(rq); -} - -/* * High-resolution timer tick. * Runs from hardirq context with interrupts disabled. */ @@ -1096,27 +1041,37 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) } #ifdef CONFIG_SMP -static void hotplug_hrtick_disable(int cpu) +/* + * called from hardirq (IPI) context + */ +static void __hrtick_start(void *arg) { - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - spin_lock_irqsave(&rq->lock, flags); - rq->hrtick_flags = 0; - __set_bit(HRTICK_BLOCK, &rq->hrtick_flags); - spin_unlock_irqrestore(&rq->lock, flags); + struct rq *rq = arg; - hrtick_clear(rq); + spin_lock(&rq->lock); + hrtimer_restart(&rq->hrtick_timer); + rq->hrtick_csd_pending = 0; + spin_unlock(&rq->lock); } -static void hotplug_hrtick_enable(int cpu) +/* + * Called to set the hrtick timer state. + * + * called with rq->lock held and irqs disabled + */ +static void hrtick_start(struct rq *rq, u64 delay) { - struct rq *rq = cpu_rq(cpu); - unsigned long flags; + struct hrtimer *timer = &rq->hrtick_timer; + ktime_t time = ktime_add_ns(timer->base->get_time(), delay); - spin_lock_irqsave(&rq->lock, flags); - __clear_bit(HRTICK_BLOCK, &rq->hrtick_flags); - spin_unlock_irqrestore(&rq->lock, flags); + timer->expires = time; + + if (rq == this_rq()) { + hrtimer_restart(timer); + } else if (!rq->hrtick_csd_pending) { + __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); + rq->hrtick_csd_pending = 1; + } } static int @@ -1131,70 +1086,60 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DOWN_PREPARE_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - hotplug_hrtick_disable(cpu); - return NOTIFY_OK; - - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - hotplug_hrtick_enable(cpu); + hrtick_clear(cpu_rq(cpu)); return NOTIFY_OK; } return NOTIFY_DONE; } -static void init_hrtick(void) +static __init void init_hrtick(void) { hotcpu_notifier(hotplug_hrtick, 0); } -#endif /* CONFIG_SMP */ +#else +/* + * Called to set the hrtick timer state. + * + * called with rq->lock held and irqs disabled + */ +static void hrtick_start(struct rq *rq, u64 delay) +{ + hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); +} -static void init_rq_hrtick(struct rq *rq) +static inline void init_hrtick(void) { - rq->hrtick_flags = 0; - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - rq->hrtick_timer.function = hrtick; - rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; } +#endif /* CONFIG_SMP */ -void hrtick_resched(void) +static void init_rq_hrtick(struct rq *rq) { - struct rq *rq; - unsigned long flags; +#ifdef CONFIG_SMP + rq->hrtick_csd_pending = 0; - if (!test_thread_flag(TIF_HRTICK_RESCHED)) - return; + rq->hrtick_csd.flags = 0; + rq->hrtick_csd.func = __hrtick_start; + rq->hrtick_csd.info = rq; +#endif - local_irq_save(flags); - rq = cpu_rq(smp_processor_id()); - hrtick_set(rq); - local_irq_restore(flags); + hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rq->hrtick_timer.function = hrtick; + rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; } -#else +#else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) { } -static inline void hrtick_set(struct rq *rq) -{ -} - static inline void init_rq_hrtick(struct rq *rq) { } -void hrtick_resched(void) -{ -} - static inline void init_hrtick(void) { } -#endif +#endif /* CONFIG_SCHED_HRTICK */ /* * resched_task - mark a task 'to be rescheduled now'. @@ -1209,16 +1154,16 @@ static inline void init_hrtick(void) #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) #endif -static void __resched_task(struct task_struct *p, int tif_bit) +static void resched_task(struct task_struct *p) { int cpu; assert_spin_locked(&task_rq(p)->lock); - if (unlikely(test_tsk_thread_flag(p, tif_bit))) + if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) return; - set_tsk_thread_flag(p, tif_bit); + set_tsk_thread_flag(p, TIF_NEED_RESCHED); cpu = task_cpu(p); if (cpu == smp_processor_id()) @@ -1284,10 +1229,10 @@ void wake_up_idle_cpu(int cpu) #endif /* CONFIG_NO_HZ */ #else /* !CONFIG_SMP */ -static void __resched_task(struct task_struct *p, int tif_bit) +static void resched_task(struct task_struct *p) { assert_spin_locked(&task_rq(p)->lock); - set_tsk_thread_flag(p, tif_bit); + set_tsk_need_resched(p); } #endif /* CONFIG_SMP */ @@ -1441,38 +1386,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) update_load_sub(&rq->load, load); } -#ifdef CONFIG_SMP -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); -static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); - -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - if (rq->nr_running) - rq->avg_load_per_task = rq->load.weight / rq->nr_running; - - return rq->avg_load_per_task; -} - -#ifdef CONFIG_FAIR_GROUP_SCHED - -typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); +#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) +typedef int (*tg_visitor)(struct task_group *, void *); /* * Iterate the full tree, calling @down when first entering a node and @up when * leaving it for the final time. */ -static void -walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd) +static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) { struct task_group *parent, *child; + int ret; rcu_read_lock(); parent = &root_task_group; down: - (*down)(parent, cpu, sd); + ret = (*down)(parent, data); + if (ret) + goto out_unlock; list_for_each_entry_rcu(child, &parent->children, siblings) { parent = child; goto down; @@ -1480,15 +1411,43 @@ down: up: continue; } - (*up)(parent, cpu, sd); + ret = (*up)(parent, data); + if (ret) + goto out_unlock; child = parent; parent = parent->parent; if (parent) goto up; +out_unlock: rcu_read_unlock(); + + return ret; +} + +static int tg_nop(struct task_group *tg, void *data) +{ + return 0; +} +#endif + +#ifdef CONFIG_SMP +static unsigned long source_load(int cpu, int type); +static unsigned long target_load(int cpu, int type); +static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); + +static unsigned long cpu_avg_load_per_task(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->nr_running) + rq->avg_load_per_task = rq->load.weight / rq->nr_running; + + return rq->avg_load_per_task; } +#ifdef CONFIG_FAIR_GROUP_SCHED + static void __set_se_shares(struct sched_entity *se, unsigned long shares); /* @@ -1547,11 +1506,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu, * This needs to be done in a bottom-up fashion because the rq weight of a * parent group depends on the shares of its child groups. */ -static void -tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) +static int tg_shares_up(struct task_group *tg, void *data) { unsigned long rq_weight = 0; unsigned long shares = 0; + struct sched_domain *sd = data; int i; for_each_cpu_mask(i, sd->span) { @@ -1576,6 +1535,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) __update_group_shares_cpu(tg, i, shares, rq_weight); spin_unlock_irqrestore(&rq->lock, flags); } + + return 0; } /* @@ -1583,10 +1544,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) * This needs to be done in a top-down fashion because the load of a child * group is a fraction of its parents load. */ -static void -tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) +static int tg_load_down(struct task_group *tg, void *data) { unsigned long load; + long cpu = (long)data; if (!tg->parent) { load = cpu_rq(cpu)->load.weight; @@ -1597,11 +1558,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) } tg->cfs_rq[cpu]->h_load = load; -} -static void -tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) -{ + return 0; } static void update_shares(struct sched_domain *sd) @@ -1611,7 +1569,7 @@ static void update_shares(struct sched_domain *sd) if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { sd->last_update = now; - walk_tg_tree(tg_nop, tg_shares_up, 0, sd); + walk_tg_tree(tg_nop, tg_shares_up, sd); } } @@ -1622,9 +1580,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) spin_lock(&rq->lock); } -static void update_h_load(int cpu) +static void update_h_load(long cpu) { - walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); + walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } #else @@ -1927,16 +1885,24 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) /* * wait_task_inactive - wait for a thread to unschedule. * + * If @match_state is nonzero, it's the @p->state value just checked and + * not expected to change. If it changes, i.e. @p might have woken up, + * then return zero. When we succeed in waiting for @p to be off its CPU, + * we return a positive number (its total switch count). If a second call + * a short while later returns the same number, the caller can be sure that + * @p has remained unscheduled the whole time. + * * The caller must ensure that the task *will* unschedule sometime soon, * else this function might spin for a *long* time. This function can't * be called with interrupts off, or it may introduce deadlock with * smp_call_function() if an IPI is sent by the same process we are * waiting to become inactive. */ -void wait_task_inactive(struct task_struct *p) +unsigned long wait_task_inactive(struct task_struct *p, long match_state) { unsigned long flags; int running, on_rq; + unsigned long ncsw; struct rq *rq; for (;;) { @@ -1959,8 +1925,11 @@ void wait_task_inactive(struct task_struct *p) * return false if the runqueue has changed and p * is actually now running somewhere else! */ - while (task_running(rq, p)) + while (task_running(rq, p)) { + if (match_state && unlikely(p->state != match_state)) + return 0; cpu_relax(); + } /* * Ok, time to look more closely! We need the rq @@ -1968,11 +1937,21 @@ void wait_task_inactive(struct task_struct *p) * just go back and repeat. */ rq = task_rq_lock(p, &flags); + trace_sched_wait_task(rq, p); running = task_running(rq, p); on_rq = p->se.on_rq; + ncsw = 0; + if (!match_state || p->state == match_state) + ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ task_rq_unlock(rq, &flags); /* + * If it changed from the expected state, bail out now. + */ + if (unlikely(!ncsw)) + break; + + /* * Was it really running after all now that we * checked with the proper locks actually held? * @@ -2004,6 +1983,8 @@ void wait_task_inactive(struct task_struct *p) */ break; } + + return ncsw; } /*** @@ -2089,7 +2070,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) /* Tally up the load of all CPUs in the group */ avg_load = 0; - for_each_cpu_mask(i, group->cpumask) { + for_each_cpu_mask_nr(i, group->cpumask) { /* Bias balancing toward cpus of our domain */ if (local_group) load = source_load(i, load_idx); @@ -2131,7 +2112,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, /* Traverse only the allowed CPUs */ cpus_and(*tmp, group->cpumask, p->cpus_allowed); - for_each_cpu_mask(i, *tmp) { + for_each_cpu_mask_nr(i, *tmp) { load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { @@ -2318,7 +2299,8 @@ out_activate: success = 1; out_running: - check_preempt_curr(rq, p); + trace_sched_wakeup(rq, p); + check_preempt_curr(rq, p, sync); p->state = TASK_RUNNING; #ifdef CONFIG_SMP @@ -2450,7 +2432,8 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_new(rq, p); inc_nr_running(rq); } - check_preempt_curr(rq, p); + trace_sched_wakeup_new(rq, p); + check_preempt_curr(rq, p, 0); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) p->sched_class->task_wake_up(rq, p); @@ -2622,6 +2605,7 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); + trace_sched_switch(rq, prev, next); mm = next->mm; oldmm = prev->active_mm; /* @@ -2783,10 +2767,10 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2) } else { if (rq1 < rq2) { spin_lock(&rq1->lock); - spin_lock(&rq2->lock); + spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); } else { spin_lock(&rq2->lock); - spin_lock(&rq1->lock); + spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); } } update_rq_clock(rq1); @@ -2829,14 +2813,21 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest) if (busiest < this_rq) { spin_unlock(&this_rq->lock); spin_lock(&busiest->lock); - spin_lock(&this_rq->lock); + spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); ret = 1; } else - spin_lock(&busiest->lock); + spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); } return ret; } +static void double_unlock_balance(struct rq *this_rq, struct rq *busiest) + __releases(busiest->lock) +{ + spin_unlock(&busiest->lock); + lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); +} + /* * If dest_cpu is allowed for this process, migrate the task to it. * This is accomplished by forcing the cpu_allowed mask to only @@ -2851,9 +2842,10 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) rq = task_rq_lock(p, &flags); if (!cpu_isset(dest_cpu, p->cpus_allowed) - || unlikely(cpu_is_offline(dest_cpu))) + || unlikely(!cpu_active(dest_cpu))) goto out; + trace_sched_migrate_task(rq, p, dest_cpu); /* force the process onto the specified CPU */ if (migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread (might exit: take ref). */ @@ -2898,7 +2890,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. */ - check_preempt_curr(this_rq, p); + check_preempt_curr(this_rq, p, 0); } /* @@ -3138,7 +3130,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, max_cpu_load = 0; min_cpu_load = ~0UL; - for_each_cpu_mask(i, group->cpumask) { + for_each_cpu_mask_nr(i, group->cpumask) { struct rq *rq; if (!cpu_isset(i, *cpus)) @@ -3417,7 +3409,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, unsigned long max_load = 0; int i; - for_each_cpu_mask(i, group->cpumask) { + for_each_cpu_mask_nr(i, group->cpumask) { unsigned long wl; if (!cpu_isset(i, *cpus)) @@ -3661,7 +3653,7 @@ redo: ld_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, CPU_NEWLY_IDLE, &all_pinned); - spin_unlock(&busiest->lock); + double_unlock_balance(this_rq, busiest); if (unlikely(all_pinned)) { cpu_clear(cpu_of(busiest), *cpus); @@ -3776,7 +3768,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) else schedstat_inc(sd, alb_failed); } - spin_unlock(&target_rq->lock); + double_unlock_balance(busiest_rq, target_rq); } #ifdef CONFIG_NO_HZ @@ -3819,7 +3811,7 @@ int select_nohz_load_balancer(int stop_tick) /* * If we are going offline and still the leader, give up! */ - if (cpu_is_offline(cpu) && + if (!cpu_active(cpu) && atomic_read(&nohz.load_balancer) == cpu) { if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) BUG(); @@ -3959,7 +3951,7 @@ static void run_rebalance_domains(struct softirq_action *h) int balance_cpu; cpu_clear(this_cpu, cpus); - for_each_cpu_mask(balance_cpu, cpus) { + for_each_cpu_mask_nr(balance_cpu, cpus) { /* * If this cpu gets work to do, stop the load balancing * work being done for other cpus. Next load @@ -4055,23 +4047,26 @@ DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); /* - * Return p->sum_exec_runtime plus any more ns on the sched_clock - * that have not yet been banked in case the task is currently running. + * Return any ns on the sched_clock that have not yet been banked in + * @p in case that task is currently running. */ -unsigned long long task_sched_runtime(struct task_struct *p) +unsigned long long task_delta_exec(struct task_struct *p) { unsigned long flags; - u64 ns, delta_exec; struct rq *rq; + u64 ns = 0; rq = task_rq_lock(p, &flags); - ns = p->se.sum_exec_runtime; + if (task_current(rq, p)) { + u64 delta_exec; + update_rq_clock(rq); delta_exec = rq->clock - p->se.exec_start; if ((s64)delta_exec > 0) - ns += delta_exec; + ns = delta_exec; } + task_rq_unlock(rq, &flags); return ns; @@ -4088,6 +4083,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime) cputime64_t tmp; p->utime = cputime_add(p->utime, cputime); + account_group_user_time(p, cputime); /* Add user time to cpustat. */ tmp = cputime_to_cputime64(cputime); @@ -4095,6 +4091,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime) cpustat->nice = cputime64_add(cpustat->nice, tmp); else cpustat->user = cputime64_add(cpustat->user, tmp); + /* Account for user time used */ + acct_update_integrals(p); } /* @@ -4110,6 +4108,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime) tmp = cputime_to_cputime64(cputime); p->utime = cputime_add(p->utime, cputime); + account_group_user_time(p, cputime); p->gtime = cputime_add(p->gtime, cputime); cpustat->user = cputime64_add(cpustat->user, tmp); @@ -4145,6 +4144,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, } p->stime = cputime_add(p->stime, cputime); + account_group_system_time(p, cputime); /* Add system time to cpustat. */ tmp = cputime_to_cputime64(cputime); @@ -4186,6 +4186,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal) if (p == rq->idle) { p->stime = cputime_add(p->stime, steal); + account_group_system_time(p, steal); if (atomic_read(&rq->nr_iowait) > 0) cpustat->iowait = cputime64_add(cpustat->iowait, tmp); else @@ -4195,6 +4196,65 @@ void account_steal_time(struct task_struct *p, cputime_t steal) } /* + * Use precise platform statistics if available: + */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +cputime_t task_utime(struct task_struct *p) +{ + return p->utime; +} + +cputime_t task_stime(struct task_struct *p) +{ + return p->stime; +} +#else +cputime_t task_utime(struct task_struct *p) +{ + clock_t utime = cputime_to_clock_t(p->utime), + total = utime + cputime_to_clock_t(p->stime); + u64 temp; + + /* + * Use CFS's precise accounting: + */ + temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); + + if (total) { + temp *= utime; + do_div(temp, total); + } + utime = (clock_t)temp; + + p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); + return p->prev_utime; +} + +cputime_t task_stime(struct task_struct *p) +{ + clock_t stime; + + /* + * Use CFS's precise accounting. (we subtract utime from + * the total, to make sure the total observed by userspace + * grows monotonically - apps rely on that): + */ + stime = nsec_to_clock_t(p->se.sum_exec_runtime) - + cputime_to_clock_t(task_utime(p)); + + if (stime >= 0) + p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); + + return p->prev_stime; +} +#endif + +inline cputime_t task_gtime(struct task_struct *p) +{ + return p->gtime; +} + +/* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. * @@ -4221,26 +4281,44 @@ void scheduler_tick(void) #endif } -#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ + defined(CONFIG_PREEMPT_TRACER)) + +static inline unsigned long get_parent_ip(unsigned long addr) +{ + if (in_lock_functions(addr)) { + addr = CALLER_ADDR2; + if (in_lock_functions(addr)) + addr = CALLER_ADDR3; + } + return addr; +} void __kprobes add_preempt_count(int val) { +#ifdef CONFIG_DEBUG_PREEMPT /* * Underflow? */ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) return; +#endif preempt_count() += val; +#ifdef CONFIG_DEBUG_PREEMPT /* * Spinlock count overflowing soon? */ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); +#endif + if (preempt_count() == val) + trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); } EXPORT_SYMBOL(add_preempt_count); void __kprobes sub_preempt_count(int val) { +#ifdef CONFIG_DEBUG_PREEMPT /* * Underflow? */ @@ -4252,7 +4330,10 @@ void __kprobes sub_preempt_count(int val) if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK))) return; +#endif + if (preempt_count() == val) + trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); preempt_count() -= val; } EXPORT_SYMBOL(sub_preempt_count); @@ -4344,7 +4425,7 @@ asmlinkage void __sched schedule(void) struct task_struct *prev, *next; unsigned long *switch_count; struct rq *rq; - int cpu, hrtick = sched_feat(HRTICK); + int cpu; need_resched: preempt_disable(); @@ -4359,7 +4440,7 @@ need_resched_nonpreemptible: schedule_debug(prev); - if (hrtick) + if (sched_feat(HRTICK)) hrtick_clear(rq); /* @@ -4406,9 +4487,6 @@ need_resched_nonpreemptible: } else spin_unlock_irq(&rq->lock); - if (hrtick) - hrtick_set(rq); - if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; @@ -4566,6 +4644,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ +/** + * complete: - signals a single thread waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up a single thread waiting on this completion. Threads will be + * awakened in the same order in which they were queued. + * + * See also complete_all(), wait_for_completion() and related routines. + */ void complete(struct completion *x) { unsigned long flags; @@ -4577,6 +4664,12 @@ void complete(struct completion *x) } EXPORT_SYMBOL(complete); +/** + * complete_all: - signals all threads waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up all threads waiting on this particular completion event. + */ void complete_all(struct completion *x) { unsigned long flags; @@ -4597,10 +4690,7 @@ do_wait_for_common(struct completion *x, long timeout, int state) wait.flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue_tail(&x->wait, &wait); do { - if ((state == TASK_INTERRUPTIBLE && - signal_pending(current)) || - (state == TASK_KILLABLE && - fatal_signal_pending(current))) { + if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; break; } @@ -4628,12 +4718,31 @@ wait_for_common(struct completion *x, long timeout, int state) return timeout; } +/** + * wait_for_completion: - waits for completion of a task + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. + * + * See also similar routines (i.e. wait_for_completion_timeout()) with timeout + * and interrupt capability. Also see complete(). + */ void __sched wait_for_completion(struct completion *x) { wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(wait_for_completion); +/** + * wait_for_completion_timeout: - waits for completion of a task (w/timeout) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. + */ unsigned long __sched wait_for_completion_timeout(struct completion *x, unsigned long timeout) { @@ -4641,6 +4750,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout) } EXPORT_SYMBOL(wait_for_completion_timeout); +/** + * wait_for_completion_interruptible: - waits for completion of a task (w/intr) + * @x: holds the state of this particular completion + * + * This waits for completion of a specific task to be signaled. It is + * interruptible. + */ int __sched wait_for_completion_interruptible(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); @@ -4650,6 +4766,14 @@ int __sched wait_for_completion_interruptible(struct completion *x) } EXPORT_SYMBOL(wait_for_completion_interruptible); +/** + * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. It is interruptible. The timeout is in jiffies. + */ unsigned long __sched wait_for_completion_interruptible_timeout(struct completion *x, unsigned long timeout) @@ -4658,6 +4782,13 @@ wait_for_completion_interruptible_timeout(struct completion *x, } EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); +/** + * wait_for_completion_killable: - waits for completion of a task (killable) + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It can be + * interrupted by a kill signal. + */ int __sched wait_for_completion_killable(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); @@ -4667,6 +4798,52 @@ int __sched wait_for_completion_killable(struct completion *x) } EXPORT_SYMBOL(wait_for_completion_killable); +/** + * try_wait_for_completion - try to decrement a completion without blocking + * @x: completion structure + * + * Returns: 0 if a decrement cannot be done without blocking + * 1 if a decrement succeeded. + * + * If a completion is being used as a counting completion, + * attempt to decrement the counter without blocking. This + * enables us to avoid waiting if the resource the completion + * is protecting is not available. + */ +bool try_wait_for_completion(struct completion *x) +{ + int ret = 1; + + spin_lock_irq(&x->wait.lock); + if (!x->done) + ret = 0; + else + x->done--; + spin_unlock_irq(&x->wait.lock); + return ret; +} +EXPORT_SYMBOL(try_wait_for_completion); + +/** + * completion_done - Test to see if a completion has any waiters + * @x: completion structure + * + * Returns: 0 if there are waiters (wait_for_completion() in progress) + * 1 if there are no waiters. + * + */ +bool completion_done(struct completion *x) +{ + int ret = 1; + + spin_lock_irq(&x->wait.lock); + if (!x->done) + ret = 0; + spin_unlock_irq(&x->wait.lock); + return ret; +} +EXPORT_SYMBOL(completion_done); + static long __sched sleep_on_common(wait_queue_head_t *q, int state, long timeout) { @@ -4944,16 +5121,8 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) set_load_weight(p); } -/** - * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. - * @p: the task in question. - * @policy: new policy. - * @param: structure containing the new RT priority. - * - * NOTE that the task may be already dead. - */ -int sched_setscheduler(struct task_struct *p, int policy, - struct sched_param *param) +static int __sched_setscheduler(struct task_struct *p, int policy, + struct sched_param *param, bool user) { int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; @@ -4985,7 +5154,7 @@ recheck: /* * Allow unprivileged RT tasks to decrease priority: */ - if (!capable(CAP_SYS_NICE)) { + if (user && !capable(CAP_SYS_NICE)) { if (rt_policy(policy)) { unsigned long rlim_rtprio; @@ -5016,18 +5185,22 @@ recheck: return -EPERM; } + if (user) { #ifdef CONFIG_RT_GROUP_SCHED - /* - * Do not allow realtime tasks into groups that have no runtime - * assigned. - */ - if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) - return -EPERM; + /* + * Do not allow realtime tasks into groups that have no runtime + * assigned. + */ + if (rt_bandwidth_enabled() && rt_policy(policy) && + task_group(p)->rt_bandwidth.rt_runtime == 0) + return -EPERM; #endif - retval = security_task_setscheduler(p, policy, param); - if (retval) - return retval; + retval = security_task_setscheduler(p, policy, param); + if (retval) + return retval; + } + /* * make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: @@ -5070,8 +5243,39 @@ recheck: return 0; } + +/** + * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * NOTE that the task may be already dead. + */ +int sched_setscheduler(struct task_struct *p, int policy, + struct sched_param *param) +{ + return __sched_setscheduler(p, policy, param, true); +} EXPORT_SYMBOL_GPL(sched_setscheduler); +/** + * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * Just like sched_setscheduler, only don't bother checking if the + * current context has permission. For example, this is needed in + * stop_machine(): we create temporary high priority worker threads, + * but our caller might not have that capability. + */ +int sched_setscheduler_nocheck(struct task_struct *p, int policy, + struct sched_param *param) +{ + return __sched_setscheduler(p, policy, param, false); +} + static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { @@ -5566,7 +5770,7 @@ out_unlock: return retval; } -static const char stat_nam[] = "RSDTtZX"; +static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; void sched_show_task(struct task_struct *p) { @@ -5712,6 +5916,8 @@ static inline void sched_init_granularity(void) sysctl_sched_latency = limit; sysctl_sched_wakeup_granularity *= factor; + + sysctl_sched_shares_ratelimit *= factor; } #ifdef CONFIG_SMP @@ -5801,7 +6007,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) struct rq *rq_dest, *rq_src; int ret = 0, on_rq; - if (unlikely(cpu_is_offline(dest_cpu))) + if (unlikely(!cpu_active(dest_cpu))) return ret; rq_src = cpu_rq(src_cpu); @@ -5810,10 +6016,10 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) double_rq_lock(rq_src, rq_dest); /* Already moved. */ if (task_cpu(p) != src_cpu) - goto out; + goto done; /* Affinity changed (again). */ if (!cpu_isset(dest_cpu, p->cpus_allowed)) - goto out; + goto fail; on_rq = p->se.on_rq; if (on_rq) @@ -5822,10 +6028,11 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) set_task_cpu(p, dest_cpu); if (on_rq) { activate_task(rq_dest, p, 0); - check_preempt_curr(rq_dest, p); + check_preempt_curr(rq_dest, p, 0); } +done: ret = 1; -out: +fail: double_rq_unlock(rq_src, rq_dest); return ret; } @@ -6146,7 +6353,7 @@ set_table_entry(struct ctl_table *entry, static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(12); + struct ctl_table *table = sd_alloc_ctl_entry(13); if (table == NULL) return NULL; @@ -6174,7 +6381,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[10], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); - /* &table[11] is terminator */ + set_table_entry(&table[11], "name", sd->name, + CORENAME_MAX_SIZE, 0444, proc_dostring); + /* &table[12] is terminator */ return table; } @@ -6393,7 +6602,7 @@ static struct notifier_block __cpuinitdata migration_notifier = { .priority = 10 }; -void __init migration_init(void) +static int __init migration_init(void) { void *cpu = (void *)(long)smp_processor_id(); int err; @@ -6403,7 +6612,10 @@ void __init migration_init(void) BUG_ON(err == NOTIFY_BAD); migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); + + return err; } +early_initcall(migration_init); #endif #ifdef CONFIG_SMP @@ -6692,7 +6904,8 @@ static cpumask_t cpu_isolated_map = CPU_MASK_NONE; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { - int ints[NR_CPUS], i; + static int __initdata ints[NR_CPUS]; + int i; str = get_options(str, ARRAY_SIZE(ints), ints); cpus_clear(cpu_isolated_map); @@ -6726,7 +6939,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, cpus_clear(*covered); - for_each_cpu_mask(i, *span) { + for_each_cpu_mask_nr(i, *span) { struct sched_group *sg; int group = group_fn(i, cpu_map, &sg, tmpmask); int j; @@ -6737,7 +6950,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, cpus_clear(sg->cpumask); sg->__cpu_power = 0; - for_each_cpu_mask(j, *span) { + for_each_cpu_mask_nr(j, *span) { if (group_fn(j, cpu_map, NULL, tmpmask) != group) continue; @@ -6773,9 +6986,9 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) min_val = INT_MAX; - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < nr_node_ids; i++) { /* Start at @node */ - n = (node + i) % MAX_NUMNODES; + n = (node + i) % nr_node_ids; if (!nr_cpus_node(n)) continue; @@ -6937,7 +7150,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) if (!sg) return; do { - for_each_cpu_mask(j, sg->cpumask) { + for_each_cpu_mask_nr(j, sg->cpumask) { struct sched_domain *sd; sd = &per_cpu(phys_domains, j); @@ -6962,14 +7175,14 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) { int cpu, i; - for_each_cpu_mask(cpu, *cpu_map) { + for_each_cpu_mask_nr(cpu, *cpu_map) { struct sched_group **sched_group_nodes = sched_group_nodes_bycpu[cpu]; if (!sched_group_nodes) continue; - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < nr_node_ids; i++) { struct sched_group *oldsg, *sg = sched_group_nodes[i]; *nodemask = node_to_cpumask(i); @@ -7054,13 +7267,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) * Non-inlined to reduce accumulated stack pressure in build_sched_domains() */ +#ifdef CONFIG_SCHED_DEBUG +# define SD_INIT_NAME(sd, type) sd->name = #type +#else +# define SD_INIT_NAME(sd, type) do { } while (0) +#endif + #define SD_INIT(sd, type) sd_init_##type(sd) + #define SD_INIT_FUNC(type) \ static noinline void sd_init_##type(struct sched_domain *sd) \ { \ memset(sd, 0, sizeof(*sd)); \ *sd = SD_##type##_INIT; \ sd->level = SD_LV_##type; \ + SD_INIT_NAME(sd, type); \ } SD_INIT_FUNC(CPU) @@ -7162,7 +7383,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* * Allocate the per-node list of sched groups */ - sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), + sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), GFP_KERNEL); if (!sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); @@ -7201,7 +7422,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* * Set up domains for cpus specified by the cpu_map. */ - for_each_cpu_mask(i, *cpu_map) { + for_each_cpu_mask_nr(i, *cpu_map) { struct sched_domain *sd = NULL, *p; SCHED_CPUMASK_VAR(nodemask, allmasks); @@ -7268,7 +7489,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ - for_each_cpu_mask(i, *cpu_map) { + for_each_cpu_mask_nr(i, *cpu_map) { SCHED_CPUMASK_VAR(this_sibling_map, allmasks); SCHED_CPUMASK_VAR(send_covered, allmasks); @@ -7285,7 +7506,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ - for_each_cpu_mask(i, *cpu_map) { + for_each_cpu_mask_nr(i, *cpu_map) { SCHED_CPUMASK_VAR(this_core_map, allmasks); SCHED_CPUMASK_VAR(send_covered, allmasks); @@ -7301,7 +7522,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #endif /* Set up physical groups */ - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < nr_node_ids; i++) { SCHED_CPUMASK_VAR(nodemask, allmasks); SCHED_CPUMASK_VAR(send_covered, allmasks); @@ -7325,7 +7546,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, send_covered, tmpmask); } - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < nr_node_ids; i++) { /* Set up node groups */ struct sched_group *sg, *prev; SCHED_CPUMASK_VAR(nodemask, allmasks); @@ -7352,7 +7573,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, goto error; } sched_group_nodes[i] = sg; - for_each_cpu_mask(j, *nodemask) { + for_each_cpu_mask_nr(j, *nodemask) { struct sched_domain *sd; sd = &per_cpu(node_domains, j); @@ -7364,9 +7585,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map, cpus_or(*covered, *covered, *nodemask); prev = sg; - for (j = 0; j < MAX_NUMNODES; j++) { + for (j = 0; j < nr_node_ids; j++) { SCHED_CPUMASK_VAR(notcovered, allmasks); - int n = (i + j) % MAX_NUMNODES; + int n = (i + j) % nr_node_ids; node_to_cpumask_ptr(pnodemask, n); cpus_complement(*notcovered, *covered); @@ -7398,28 +7619,28 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT - for_each_cpu_mask(i, *cpu_map) { + for_each_cpu_mask_nr(i, *cpu_map) { struct sched_domain *sd = &per_cpu(cpu_domains, i); init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC - for_each_cpu_mask(i, *cpu_map) { + for_each_cpu_mask_nr(i, *cpu_map) { struct sched_domain *sd = &per_cpu(core_domains, i); init_sched_groups_power(i, sd); } #endif - for_each_cpu_mask(i, *cpu_map) { + for_each_cpu_mask_nr(i, *cpu_map) { struct sched_domain *sd = &per_cpu(phys_domains, i); init_sched_groups_power(i, sd); } #ifdef CONFIG_NUMA - for (i = 0; i < MAX_NUMNODES; i++) + for (i = 0; i < nr_node_ids; i++) init_numa_sched_groups_power(sched_group_nodes[i]); if (sd_allnodes) { @@ -7432,7 +7653,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #endif /* Attach the domains */ - for_each_cpu_mask(i, *cpu_map) { + for_each_cpu_mask_nr(i, *cpu_map) { struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i); @@ -7477,18 +7698,6 @@ void __attribute__((weak)) arch_update_cpu_topology(void) } /* - * Free current domain masks. - * Called after all cpus are attached to NULL domain. - */ -static void free_sched_domains(void) -{ - ndoms_cur = 0; - if (doms_cur != &fallback_doms) - kfree(doms_cur); - doms_cur = &fallback_doms; -} - -/* * Set up scheduler domains and groups. Callers must hold the hotplug lock. * For now this just excludes isolated cpus, but could be used to * exclude other special cases in the future. @@ -7527,7 +7736,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) unregister_sched_domain_sysctl(); - for_each_cpu_mask(i, *cpu_map) + for_each_cpu_mask_nr(i, *cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); synchronize_sched(); arch_destroy_sched_domains(cpu_map, &tmpmask); @@ -7566,30 +7775,29 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * ownership of it and will kfree it when done with it. If the caller * failed the kmalloc call, then it can pass in doms_new == NULL, * and partition_sched_domains() will fallback to the single partition - * 'fallback_doms'. + * 'fallback_doms', it also forces the domains to be rebuilt. + * + * If doms_new==NULL it will be replaced with cpu_online_map. + * ndoms_new==0 is a special case for destroying existing domains. + * It will not create the default domain. * * Call with hotplug lock held */ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, struct sched_domain_attr *dattr_new) { - int i, j; + int i, j, n; mutex_lock(&sched_domains_mutex); /* always unregister in case we don't destroy any domains */ unregister_sched_domain_sysctl(); - if (doms_new == NULL) { - ndoms_new = 1; - doms_new = &fallback_doms; - cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); - dattr_new = NULL; - } + n = doms_new ? ndoms_new : 0; /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { - for (j = 0; j < ndoms_new; j++) { + for (j = 0; j < n; j++) { if (cpus_equal(doms_cur[i], doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; @@ -7600,6 +7808,13 @@ match1: ; } + if (doms_new == NULL) { + ndoms_cur = 0; + doms_new = &fallback_doms; + cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); + dattr_new = NULL; + } + /* Build new domains */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < ndoms_cur; j++) { @@ -7630,17 +7845,15 @@ match2: #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) int arch_reinit_sched_domains(void) { - int err; - get_online_cpus(); - mutex_lock(&sched_domains_mutex); - detach_destroy_domains(&cpu_online_map); - free_sched_domains(); - err = arch_init_sched_domains(&cpu_online_map); - mutex_unlock(&sched_domains_mutex); + + /* Destroy domains first to force the rebuild */ + partition_sched_domains(0, NULL, NULL); + + rebuild_sched_domains(); put_online_cpus(); - return err; + return 0; } static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) @@ -7661,30 +7874,34 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) } #ifdef CONFIG_SCHED_MC -static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) +static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, + char *page) { return sprintf(page, "%u\n", sched_mc_power_savings); } -static ssize_t sched_mc_power_savings_store(struct sys_device *dev, +static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 0); } -static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, - sched_mc_power_savings_store); +static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, + sched_mc_power_savings_show, + sched_mc_power_savings_store); #endif #ifdef CONFIG_SCHED_SMT -static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) +static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, + char *page) { return sprintf(page, "%u\n", sched_smt_power_savings); } -static ssize_t sched_smt_power_savings_store(struct sys_device *dev, +static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 1); } -static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, +static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, + sched_smt_power_savings_show, sched_smt_power_savings_store); #endif @@ -7706,59 +7923,49 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) } #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ +#ifndef CONFIG_CPUSETS /* - * Force a reinitialization of the sched domains hierarchy. The domains - * and groups cannot be updated in place without racing with the balancing - * code, so we temporarily attach all running cpus to the NULL domain - * which will prevent rebalancing while the sched domains are recalculated. + * Add online and remove offline CPUs from the scheduler domains. + * When cpusets are enabled they take over this function. */ static int update_sched_domains(struct notifier_block *nfb, unsigned long action, void *hcpu) { + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + partition_sched_domains(1, NULL, NULL); + return NOTIFY_OK; + + default: + return NOTIFY_DONE; + } +} +#endif + +static int update_runtime(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ int cpu = (int)(long)hcpu; switch (action) { case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: disable_runtime(cpu_rq(cpu)); - /* fall-through */ - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - detach_destroy_domains(&cpu_online_map); - free_sched_domains(); return NOTIFY_OK; - case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: case CPU_ONLINE: case CPU_ONLINE_FROZEN: enable_runtime(cpu_rq(cpu)); - /* fall-through */ - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - /* - * Fall through and re-initialise the domains. - */ - break; + return NOTIFY_OK; + default: return NOTIFY_DONE; } - -#ifndef CONFIG_CPUSETS - /* - * Create default domain partitioning if cpusets are disabled. - * Otherwise we let cpusets rebuild the domains based on the - * current setup. - */ - - /* The hotplug lock is already held by cpu_up/cpu_down */ - arch_init_sched_domains(&cpu_online_map); -#endif - - return NOTIFY_OK; } void __init sched_init_smp(void) @@ -7778,8 +7985,15 @@ void __init sched_init_smp(void) cpu_set(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); put_online_cpus(); + +#ifndef CONFIG_CPUSETS /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); +#endif + + /* RT runtime code needs to handle some hotplug events */ + hotcpu_notifier(update_runtime, 0); + init_hrtick(); /* Move init over to a non-isolated CPU */ @@ -7987,7 +8201,6 @@ void __init sched_init(void) rq = cpu_rq(i); spin_lock_init(&rq->lock); - lockdep_set_class(&rq->lock, &rq->rq_lock_key); rq->nr_running = 0; init_cfs_rq(&rq->cfs, rq); init_rt_rq(&rq->rt, rq); @@ -8076,7 +8289,7 @@ void __init sched_init(void) #endif #ifdef CONFIG_SMP - open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); #endif #ifdef CONFIG_RT_MUTEXES @@ -8110,20 +8323,25 @@ void __might_sleep(char *file, int line) #ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ - if ((in_atomic() || irqs_disabled()) && - system_state == SYSTEM_RUNNING && !oops_in_progress) { - if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) - return; - prev_jiffy = jiffies; - printk(KERN_ERR "BUG: sleeping function called from invalid" - " context at %s:%d\n", file, line); - printk("in_atomic():%d, irqs_disabled():%d\n", - in_atomic(), irqs_disabled()); - debug_show_held_locks(current); - if (irqs_disabled()) - print_irqtrace_events(current); - dump_stack(); - } + if ((!in_atomic() && !irqs_disabled()) || + system_state != SYSTEM_RUNNING || oops_in_progress) + return; + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + + printk(KERN_ERR + "BUG: sleeping function called from invalid context at %s:%d\n", + file, line); + printk(KERN_ERR + "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), + current->pid, current->comm); + + debug_show_held_locks(current); + if (irqs_disabled()) + print_irqtrace_events(current); + dump_stack(); #endif } EXPORT_SYMBOL(__might_sleep); @@ -8444,8 +8662,8 @@ struct task_group *sched_create_group(struct task_group *parent) WARN_ON(!parent); /* root should already exist */ tg->parent = parent; - list_add_rcu(&tg->siblings, &parent->children); INIT_LIST_HEAD(&tg->children); + list_add_rcu(&tg->siblings, &parent->children); spin_unlock_irqrestore(&task_group_lock, flags); return tg; @@ -8621,73 +8839,95 @@ static DEFINE_MUTEX(rt_constraints_mutex); static unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) - return 1ULL << 16; + return 1ULL << 20; - return div64_u64(runtime << 16, period); + return div64_u64(runtime << 20, period); } -#ifdef CONFIG_CGROUP_SCHED -static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) +/* Must be called with tasklist_lock held */ +static inline int tg_has_rt_tasks(struct task_group *tg) { - struct task_group *tgi, *parent = tg->parent; - unsigned long total = 0; + struct task_struct *g, *p; - if (!parent) { - if (global_rt_period() < period) - return 0; + do_each_thread(g, p) { + if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) + return 1; + } while_each_thread(g, p); - return to_ratio(period, runtime) < - to_ratio(global_rt_period(), global_rt_runtime()); - } + return 0; +} - if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) - return 0; +struct rt_schedulable_data { + struct task_group *tg; + u64 rt_period; + u64 rt_runtime; +}; - rcu_read_lock(); - list_for_each_entry_rcu(tgi, &parent->children, siblings) { - if (tgi == tg) - continue; +static int tg_schedulable(struct task_group *tg, void *data) +{ + struct rt_schedulable_data *d = data; + struct task_group *child; + unsigned long total, sum = 0; + u64 period, runtime; - total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), - tgi->rt_bandwidth.rt_runtime); + period = ktime_to_ns(tg->rt_bandwidth.rt_period); + runtime = tg->rt_bandwidth.rt_runtime; + + if (tg == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; } - rcu_read_unlock(); - return total + to_ratio(period, runtime) <= - to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), - parent->rt_bandwidth.rt_runtime); -} -#elif defined CONFIG_USER_SCHED -static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) -{ - struct task_group *tgi; - unsigned long total = 0; - unsigned long global_ratio = - to_ratio(global_rt_period(), global_rt_runtime()); + /* + * Cannot have more runtime than the period. + */ + if (runtime > period && runtime != RUNTIME_INF) + return -EINVAL; - rcu_read_lock(); - list_for_each_entry_rcu(tgi, &task_groups, list) { - if (tgi == tg) - continue; + /* + * Ensure we don't starve existing RT tasks. + */ + if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) + return -EBUSY; + + total = to_ratio(period, runtime); + + /* + * Nobody can have more than the global setting allows. + */ + if (total > to_ratio(global_rt_period(), global_rt_runtime())) + return -EINVAL; - total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), - tgi->rt_bandwidth.rt_runtime); + /* + * The sum of our children's runtime should not exceed our own. + */ + list_for_each_entry_rcu(child, &tg->children, siblings) { + period = ktime_to_ns(child->rt_bandwidth.rt_period); + runtime = child->rt_bandwidth.rt_runtime; + + if (child == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; + } + + sum += to_ratio(period, runtime); } - rcu_read_unlock(); - return total + to_ratio(period, runtime) < global_ratio; + if (sum > total) + return -EINVAL; + + return 0; } -#endif -/* Must be called with tasklist_lock held */ -static inline int tg_has_rt_tasks(struct task_group *tg) +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { - struct task_struct *g, *p; - do_each_thread(g, p) { - if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) - return 1; - } while_each_thread(g, p); - return 0; + struct rt_schedulable_data data = { + .tg = tg, + .rt_period = period, + .rt_runtime = runtime, + }; + + return walk_tg_tree(tg_schedulable, tg_nop, &data); } static int tg_set_bandwidth(struct task_group *tg, @@ -8697,14 +8937,9 @@ static int tg_set_bandwidth(struct task_group *tg, mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); - if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { - err = -EBUSY; + err = __rt_schedulable(tg, rt_period, rt_runtime); + if (err) goto unlock; - } - if (!__rt_schedulable(tg, rt_period, rt_runtime)) { - err = -EINVAL; - goto unlock; - } spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); @@ -8773,16 +9008,25 @@ long sched_group_rt_period(struct task_group *tg) static int sched_rt_global_constraints(void) { - struct task_group *tg = &root_task_group; - u64 rt_runtime, rt_period; + u64 runtime, period; int ret = 0; - rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); - rt_runtime = tg->rt_bandwidth.rt_runtime; + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + + runtime = global_rt_runtime(); + period = global_rt_period(); + + /* + * Sanity check on the sysctl variables. + */ + if (runtime > period && runtime != RUNTIME_INF) + return -EINVAL; mutex_lock(&rt_constraints_mutex); - if (!__rt_schedulable(tg, rt_period, rt_runtime)) - ret = -EINVAL; + read_lock(&tasklist_lock); + ret = __rt_schedulable(NULL, 0, 0); + read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); return ret; @@ -8793,6 +9037,9 @@ static int sched_rt_global_constraints(void) unsigned long flags; int i; + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { struct rt_rq *rt_rq = &cpu_rq(i)->rt; @@ -8853,7 +9100,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) if (!cgrp->parent) { /* This is early initialization for the top cgroup */ - init_task_group.css.cgroup = cgrp; return &init_task_group.css; } @@ -8862,9 +9108,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); - /* Bind the cgroup to task_group object we just created */ - tg->css.cgroup = cgrp; - return &tg->css; }