X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=kernel%2Fsched.c;h=cc1f81b50b82dddb19658dc4d10dd419087a59fc;hb=2d91f94174ce42378aafc7bedda2588a913d3eab;hp=28c73f07efb2f7506f70be5ceb7a44c134770afa;hpb=22e52b072dd87faa9b2559fe89d4e8f2370f81ca;p=safe%2Fjmp%2Flinux-2.6 diff --git a/kernel/sched.c b/kernel/sched.c index 28c73f0..cc1f81b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -66,19 +66,16 @@ #include #include #include +#include +#include +#include +#include +#include #include #include -/* - * Scheduler clock - returns current time in nanosec units. - * This is default implementation. - * Architectures and sub-architectures can override this. - */ -unsigned long long __attribute__((weak)) sched_clock(void) -{ - return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); -} +#include "sched_cpupri.h" /* * Convert user-nice values [ -20 ... 0 ... 19 ] @@ -114,6 +111,11 @@ unsigned long long __attribute__((weak)) sched_clock(void) */ #define DEF_TIMESLICE (100 * HZ / 1000) +/* + * single value that denotes runtime == period, ie unlimited time. + */ +#define RUNTIME_INF ((u64)~0ULL) + #ifdef CONFIG_SMP /* * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) @@ -137,7 +139,7 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) static inline int rt_policy(int policy) { - if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) + if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) return 1; return 0; } @@ -155,6 +157,90 @@ struct rt_prio_array { struct list_head queue[MAX_RT_PRIO]; }; +struct rt_bandwidth { + /* nests inside the rq lock: */ + spinlock_t rt_runtime_lock; + ktime_t rt_period; + u64 rt_runtime; + struct hrtimer rt_period_timer; +}; + +static struct rt_bandwidth def_rt_bandwidth; + +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); + +static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) +{ + struct rt_bandwidth *rt_b = + container_of(timer, struct rt_bandwidth, rt_period_timer); + ktime_t now; + int overrun; + int idle = 0; + + for (;;) { + now = hrtimer_cb_get_time(timer); + overrun = hrtimer_forward(timer, now, rt_b->rt_period); + + if (!overrun) + break; + + idle = do_sched_rt_period_timer(rt_b, overrun); + } + + return idle ? 
HRTIMER_NORESTART : HRTIMER_RESTART; +} + +static +void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) +{ + rt_b->rt_period = ns_to_ktime(period); + rt_b->rt_runtime = runtime; + + spin_lock_init(&rt_b->rt_runtime_lock); + + hrtimer_init(&rt_b->rt_period_timer, + CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rt_b->rt_period_timer.function = sched_rt_period_timer; + rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; +} + +static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + ktime_t now; + + if (rt_b->rt_runtime == RUNTIME_INF) + return; + + if (hrtimer_active(&rt_b->rt_period_timer)) + return; + + spin_lock(&rt_b->rt_runtime_lock); + for (;;) { + if (hrtimer_active(&rt_b->rt_period_timer)) + break; + + now = hrtimer_cb_get_time(&rt_b->rt_period_timer); + hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); + hrtimer_start(&rt_b->rt_period_timer, + rt_b->rt_period_timer.expires, + HRTIMER_MODE_ABS); + } + spin_unlock(&rt_b->rt_runtime_lock); +} + +#ifdef CONFIG_RT_GROUP_SCHED +static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + hrtimer_cancel(&rt_b->rt_period_timer); +} +#endif + +/* + * sched_domains_mutex serializes calls to arch_init_sched_domains, + * detach_destroy_domains and partition_sched_domains. + */ +static DEFINE_MUTEX(sched_domains_mutex); + #ifdef CONFIG_GROUP_SCHED #include @@ -181,45 +267,63 @@ struct task_group { struct sched_rt_entity **rt_se; struct rt_rq **rt_rq; - u64 rt_runtime; + struct rt_bandwidth rt_bandwidth; #endif struct rcu_head rcu; struct list_head list; + + struct task_group *parent; + struct list_head siblings; + struct list_head children; }; +#ifdef CONFIG_USER_SCHED + +/* + * Root task group. + * Every UID task group (including init_task_group aka UID-0) will + * be a child to this group. + */ +struct task_group root_task_group; + #ifdef CONFIG_FAIR_GROUP_SCHED /* Default task group's sched entity on each cpu */ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; - -static struct sched_entity *init_sched_entity_p[NR_CPUS]; -static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; - -static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; -static struct rt_rq *init_rt_rq_p[NR_CPUS]; -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED */ +#define root_task_group init_task_group +#endif /* CONFIG_FAIR_GROUP_SCHED */ /* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. */ static DEFINE_SPINLOCK(task_group_lock); -/* doms_cur_mutex serializes access to doms_cur[] array */ -static DEFINE_MUTEX(doms_cur_mutex); - #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_USER_SCHED # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) -#else +#else /* !CONFIG_USER_SCHED */ # define INIT_TASK_GROUP_LOAD NICE_0_LOAD -#endif +#endif /* CONFIG_USER_SCHED */ + +/* + * A weight of 0 or 1 can cause arithmetics problems. + * A weight of a cfs_rq is the sum of weights of which entities + * are queued on this cfs_rq, so a weight of a entity should not be + * too large, so as the shares value of a task group. + * (The default weight is 1024 - so there's no practical + * limitation from this.) 
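+ * Shares values computed outside [MIN_SHARES, MAX_SHARES] are clamped
+ * to that range before being applied, see __update_group_shares_cpu()
+ * below.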
+ */ +#define MIN_SHARES 2 +#define MAX_SHARES (1UL << 18) static int init_task_group_load = INIT_TASK_GROUP_LOAD; #endif @@ -227,17 +331,7 @@ static int init_task_group_load = INIT_TASK_GROUP_LOAD; /* Default task group. * Every task in system belong to this group at bootup. */ -struct task_group init_task_group = { -#ifdef CONFIG_FAIR_GROUP_SCHED - .se = init_sched_entity_p, - .cfs_rq = init_cfs_rq_p, -#endif - -#ifdef CONFIG_RT_GROUP_SCHED - .rt_se = init_sched_rt_entity_p, - .rt_rq = init_rt_rq_p, -#endif -}; +struct task_group init_task_group; /* return group to which a task belongs */ static inline struct task_group *task_group(struct task_struct *p) @@ -269,21 +363,13 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #endif } -static inline void lock_doms_cur(void) -{ - mutex_lock(&doms_cur_mutex); -} - -static inline void unlock_doms_cur(void) -{ - mutex_unlock(&doms_cur_mutex); -} - #else static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } -static inline void lock_doms_cur(void) { } -static inline void unlock_doms_cur(void) { } +static inline struct task_group *task_group(struct task_struct *p) +{ + return NULL; +} #endif /* CONFIG_GROUP_SCHED */ @@ -294,11 +380,16 @@ struct cfs_rq { u64 exec_clock; u64 min_vruntime; + u64 pair_start; struct rb_root tasks_timeline; struct rb_node *rb_leftmost; - struct rb_node *rb_load_balance_curr; - /* 'curr' points to currently running entity on this cfs_rq. + + struct list_head tasks; + struct list_head *balance_iterator; + + /* + * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ struct sched_entity *curr, *next; @@ -318,6 +409,31 @@ struct cfs_rq { */ struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ + +#ifdef CONFIG_SMP + /* + * the part of load.weight contributed by tasks + */ + unsigned long task_weight; + + /* + * h_load = weight * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group. 
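+	 * It is recomputed top-down by update_h_load()/tg_load_down():
+	 * each level scales the parent's h_load by this group's share
+	 * of the parent's runqueue weight.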
+ */ + unsigned long h_load; + + /* + * this cpu's part of tg->shares + */ + unsigned long shares; + + /* + * load.weight at the time we set shares + */ + unsigned long rq_weight; +#endif #endif }; @@ -334,6 +450,9 @@ struct rt_rq { #endif int rt_throttled; u64 rt_time; + u64 rt_runtime; + /* Nests inside the rq lock: */ + spinlock_t rt_runtime_lock; #ifdef CONFIG_RT_GROUP_SCHED unsigned long rt_nr_boosted; @@ -366,6 +485,9 @@ struct root_domain { */ cpumask_t rto_mask; atomic_t rto_count; +#ifdef CONFIG_SMP + struct cpupri cpupri; +#endif }; /* @@ -396,6 +518,7 @@ struct rq { unsigned long cpu_load[CPU_LOAD_IDX_MAX]; unsigned char idle_at_tick; #ifdef CONFIG_NO_HZ + unsigned long last_tick_seen; unsigned char in_nohz_recently; #endif /* capture load from *all* tasks on this cpu: */ @@ -405,8 +528,6 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; - u64 rt_period_expire; - int rt_throttled; #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ @@ -428,13 +549,7 @@ struct rq { unsigned long next_balance; struct mm_struct *prev_mm; - u64 clock, prev_clock_raw; - s64 clock_max_delta; - - unsigned int clock_warps, clock_overflows, clock_underflows; - u64 idle_clock; - unsigned int clock_deep_idle_events; - u64 tick_timestamp; + u64 clock; atomic_t nr_iowait; @@ -447,14 +562,19 @@ struct rq { int push_cpu; /* cpu of this runqueue: */ int cpu; + int online; + + unsigned long avg_load_per_task; struct task_struct *migration_thread; struct list_head migration_queue; #endif #ifdef CONFIG_SCHED_HRTICK - unsigned long hrtick_flags; - ktime_t hrtick_expire; +#ifdef CONFIG_SMP + int hrtick_csd_pending; + struct call_single_data hrtick_csd; +#endif struct hrtimer hrtick_timer; #endif @@ -480,7 +600,6 @@ struct rq { /* BKL stats */ unsigned int bkl_count; #endif - struct lock_class_key rq_lock_key; }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -500,53 +619,6 @@ static inline int cpu_of(struct rq *rq) } /* - * Update the per-runqueue clock, as finegrained as the platform can give - * us, but without assuming monotonicity, etc.: - */ -static void __update_rq_clock(struct rq *rq) -{ - u64 prev_raw = rq->prev_clock_raw; - u64 now = sched_clock(); - s64 delta = now - prev_raw; - u64 clock = rq->clock; - -#ifdef CONFIG_SCHED_DEBUG - WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); -#endif - /* - * Protect against sched_clock() occasionally going backwards: - */ - if (unlikely(delta < 0)) { - clock++; - rq->clock_warps++; - } else { - /* - * Catch too large forward jumps too: - */ - if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) { - if (clock < rq->tick_timestamp + TICK_NSEC) - clock = rq->tick_timestamp + TICK_NSEC; - else - clock++; - rq->clock_overflows++; - } else { - if (unlikely(delta > rq->clock_max_delta)) - rq->clock_max_delta = delta; - clock += delta; - } - } - - rq->prev_clock_raw = now; - rq->clock = clock; -} - -static void update_rq_clock(struct rq *rq) -{ - if (likely(smp_processor_id() == cpu_of(rq))) - __update_rq_clock(rq); -} - -/* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. * See detach_destroy_domains: synchronize_sched for details. 
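 * (the tree may therefore only be walked from within a
 * preempt-disabled section)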
* @@ -561,21 +633,9 @@ static void update_rq_clock(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -unsigned long rt_needs_cpu(int cpu) +static inline void update_rq_clock(struct rq *rq) { - struct rq *rq = cpu_rq(cpu); - u64 delta; - - if (!rq->rt_throttled) - return 0; - - if (rq->clock > rq->rt_period_expire) - return 1; - - delta = rq->rt_period_expire - rq->clock; - do_div(delta, NSEC_PER_SEC / HZ); - - return (unsigned long)delta; + rq->clock = sched_clock_cpu(cpu_of(rq)); } /* @@ -587,25 +647,158 @@ unsigned long rt_needs_cpu(int cpu) # define const_debug static const #endif +/** + * runqueue_is_locked + * + * Returns true if the current cpu runqueue is locked. + * This interface allows printk to be called with the runqueue lock + * held and know whether or not it is OK to wake up the klogd. + */ +int runqueue_is_locked(void) +{ + int cpu = get_cpu(); + struct rq *rq = cpu_rq(cpu); + int ret; + + ret = spin_is_locked(&rq->lock); + put_cpu(); + return ret; +} + /* * Debugging: various feature bits */ + +#define SCHED_FEAT(name, enabled) \ + __SCHED_FEAT_##name , + enum { - SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, - SCHED_FEAT_WAKEUP_PREEMPT = 2, - SCHED_FEAT_START_DEBIT = 4, - SCHED_FEAT_HRTICK = 8, - SCHED_FEAT_DOUBLE_TICK = 16, +#include "sched_features.h" }; +#undef SCHED_FEAT + +#define SCHED_FEAT(name, enabled) \ + (1UL << __SCHED_FEAT_##name) * enabled | + const_debug unsigned int sysctl_sched_features = - SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 | - SCHED_FEAT_WAKEUP_PREEMPT * 1 | - SCHED_FEAT_START_DEBIT * 1 | - SCHED_FEAT_HRTICK * 1 | - SCHED_FEAT_DOUBLE_TICK * 0; +#include "sched_features.h" + 0; + +#undef SCHED_FEAT + +#ifdef CONFIG_SCHED_DEBUG +#define SCHED_FEAT(name, enabled) \ + #name , + +static __read_mostly char *sched_feat_names[] = { +#include "sched_features.h" + NULL +}; + +#undef SCHED_FEAT + +static int sched_feat_open(struct inode *inode, struct file *filp) +{ + filp->private_data = inode->i_private; + return 0; +} + +static ssize_t +sched_feat_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char *buf; + int r = 0; + int len = 0; + int i; + + for (i = 0; sched_feat_names[i]; i++) { + len += strlen(sched_feat_names[i]); + len += 4; + } + + buf = kmalloc(len + 2, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + for (i = 0; sched_feat_names[i]; i++) { + if (sysctl_sched_features & (1UL << i)) + r += sprintf(buf + r, "%s ", sched_feat_names[i]); + else + r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]); + } + + r += sprintf(buf + r, "\n"); + WARN_ON(r >= len + 2); + + r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + + kfree(buf); + + return r; +} + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp = buf; + int neg = 0; + int i; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; -#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) + if (strncmp(buf, "NO_", 3) == 0) { + neg = 1; + cmp += 3; + } + + for (i = 0; sched_feat_names[i]; i++) { + int len = strlen(sched_feat_names[i]); + + if (strncmp(cmp, sched_feat_names[i], len) == 0) { + if (neg) + sysctl_sched_features &= ~(1UL << i); + else + sysctl_sched_features |= (1UL << i); + break; + } + } + + if (!sched_feat_names[i]) + return -EINVAL; + + filp->f_pos += cnt; + + return cnt; +} + +static struct file_operations sched_feat_fops = { + .open = sched_feat_open, + .read = sched_feat_read, + 
.write = sched_feat_write, +}; + +static __init int sched_init_debug(void) +{ + debugfs_create_file("sched_features", 0644, NULL, NULL, + &sched_feat_fops); + + return 0; +} +late_initcall(sched_init_debug); + +#endif + +#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) /* * Number of tasks to iterate in a single balance run. @@ -614,6 +807,12 @@ const_debug unsigned int sysctl_sched_features = const_debug unsigned int sysctl_sched_nr_migrate = 32; /* + * ratelimit for updating the group shares. + * default: 0.25ms + */ +unsigned int sysctl_sched_shares_ratelimit = 250000; + +/* * period over which we measure -rt task cpu usage in us. * default: 1s */ @@ -627,37 +826,18 @@ static __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; -/* - * single value that denotes runtime == period, ie unlimited time. - */ -#define RUNTIME_INF ((u64)~0ULL) - -/* - * For kernel-internal use: high-speed (but slightly incorrect) per-cpu - * clock constructed from sched_clock(): - */ -unsigned long long cpu_clock(int cpu) +static inline u64 global_rt_period(void) { - unsigned long long now; - unsigned long flags; - struct rq *rq; - - /* - * Only call sched_clock() if the scheduler has already been - * initialized (some code might call cpu_clock() very early): - */ - if (unlikely(!scheduler_running)) - return 0; + return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; +} - local_irq_save(flags); - rq = cpu_rq(cpu); - update_rq_clock(rq); - now = rq->clock; - local_irq_restore(flags); +static inline u64 global_rt_runtime(void) +{ + if (sysctl_sched_rt_runtime < 0) + return RUNTIME_INF; - return now; + return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } -EXPORT_SYMBOL_GPL(cpu_clock); #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) @@ -804,50 +984,6 @@ static struct rq *this_rq_lock(void) return rq; } -/* - * We are going deep-idle (irqs are disabled): - */ -void sched_clock_idle_sleep_event(void) -{ - struct rq *rq = cpu_rq(smp_processor_id()); - - spin_lock(&rq->lock); - __update_rq_clock(rq); - spin_unlock(&rq->lock); - rq->clock_deep_idle_events++; -} -EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); - -/* - * We just idled delta nanoseconds (called with irqs disabled): - */ -void sched_clock_idle_wakeup_event(u64 delta_ns) -{ - struct rq *rq = cpu_rq(smp_processor_id()); - u64 now = sched_clock(); - - rq->idle_clock += delta_ns; - /* - * Override the previous timestamp and ignore all - * sched_clock() deltas that occured while we idled, - * and use the PM-provided delta_ns to advance the - * rq clock: - */ - spin_lock(&rq->lock); - rq->prev_clock_raw = now; - rq->clock += delta_ns; - spin_unlock(&rq->lock); - touch_softlockup_watchdog(); -} -EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); - -static void __resched_task(struct task_struct *p, int tif_bit); - -static inline void resched_task(struct task_struct *p) -{ - __resched_task(p, TIF_NEED_RESCHED); -} - #ifdef CONFIG_SCHED_HRTICK /* * Use HR-timers to deliver accurate preemption points. @@ -859,24 +995,6 @@ static inline void resched_task(struct task_struct *p) * When we get rescheduled we reprogram the hrtick_timer outside of the * rq->lock. 
*/ -static inline void resched_hrt(struct task_struct *p) -{ - __resched_task(p, TIF_HRTICK_RESCHED); -} - -static inline void resched_rq(struct rq *rq) -{ - unsigned long flags; - - spin_lock_irqsave(&rq->lock, flags); - resched_task(rq->curr); - spin_unlock_irqrestore(&rq->lock, flags); -} - -enum { - HRTICK_SET, /* re-programm hrtick_timer */ - HRTICK_RESET, /* not a new slice */ -}; /* * Use hrtick when: @@ -887,38 +1005,11 @@ static inline int hrtick_enabled(struct rq *rq) { if (!sched_feat(HRTICK)) return 0; + if (!cpu_active(cpu_of(rq))) + return 0; return hrtimer_is_hres_active(&rq->hrtick_timer); } -/* - * Called to set the hrtick timer state. - * - * called with rq->lock held and irqs disabled - */ -static void hrtick_start(struct rq *rq, u64 delay, int reset) -{ - assert_spin_locked(&rq->lock); - - /* - * preempt at: now + delay - */ - rq->hrtick_expire = - ktime_add_ns(rq->hrtick_timer.base->get_time(), delay); - /* - * indicate we need to program the timer - */ - __set_bit(HRTICK_SET, &rq->hrtick_flags); - if (reset) - __set_bit(HRTICK_RESET, &rq->hrtick_flags); - - /* - * New slices are called from the schedule path and don't need a - * forced reschedule. - */ - if (reset) - resched_hrt(rq->curr); -} - static void hrtick_clear(struct rq *rq) { if (hrtimer_active(&rq->hrtick_timer)) @@ -926,32 +1017,6 @@ static void hrtick_clear(struct rq *rq) } /* - * Update the timer from the possible pending state. - */ -static void hrtick_set(struct rq *rq) -{ - ktime_t time; - int set, reset; - unsigned long flags; - - WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); - - spin_lock_irqsave(&rq->lock, flags); - set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags); - reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags); - time = rq->hrtick_expire; - clear_thread_flag(TIF_HRTICK_RESCHED); - spin_unlock_irqrestore(&rq->lock, flags); - - if (set) { - hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS); - if (reset && !hrtimer_active(&rq->hrtick_timer)) - resched_rq(rq); - } else - hrtick_clear(rq); -} - -/* * High-resolution timer tick. * Runs from hardirq context with interrupts disabled. */ @@ -962,40 +1027,102 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); spin_lock(&rq->lock); - __update_rq_clock(rq); + update_rq_clock(rq); rq->curr->sched_class->task_tick(rq, rq->curr, 1); spin_unlock(&rq->lock); return HRTIMER_NORESTART; } -static inline void init_rq_hrtick(struct rq *rq) +#ifdef CONFIG_SMP +/* + * called from hardirq (IPI) context + */ +static void __hrtick_start(void *arg) { - rq->hrtick_flags = 0; - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - rq->hrtick_timer.function = hrtick; - rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + struct rq *rq = arg; + + spin_lock(&rq->lock); + hrtimer_restart(&rq->hrtick_timer); + rq->hrtick_csd_pending = 0; + spin_unlock(&rq->lock); } -void hrtick_resched(void) +/* + * Called to set the hrtick timer state. 
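+ * If @rq is not the local runqueue the timer is restarted on the
+ * owning CPU via rq->hrtick_csd; hrtick_csd_pending avoids queueing
+ * that IPI twice.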
+ * + * called with rq->lock held and irqs disabled + */ +static void hrtick_start(struct rq *rq, u64 delay) { - struct rq *rq; - unsigned long flags; + struct hrtimer *timer = &rq->hrtick_timer; + ktime_t time = ktime_add_ns(timer->base->get_time(), delay); - if (!test_thread_flag(TIF_HRTICK_RESCHED)) - return; + timer->expires = time; - local_irq_save(flags); - rq = cpu_rq(smp_processor_id()); - hrtick_set(rq); - local_irq_restore(flags); + if (rq == this_rq()) { + hrtimer_restart(timer); + } else if (!rq->hrtick_csd_pending) { + __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); + rq->hrtick_csd_pending = 1; + } +} + +static int +hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int cpu = (int)(long)hcpu; + + switch (action) { + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + hrtick_clear(cpu_rq(cpu)); + return NOTIFY_OK; + } + + return NOTIFY_DONE; +} + +static void init_hrtick(void) +{ + hotcpu_notifier(hotplug_hrtick, 0); } #else -static inline void hrtick_clear(struct rq *rq) +/* + * Called to set the hrtick timer state. + * + * called with rq->lock held and irqs disabled + */ +static void hrtick_start(struct rq *rq, u64 delay) +{ + hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); +} + +static void init_hrtick(void) { } +#endif /* CONFIG_SMP */ + +static void init_rq_hrtick(struct rq *rq) +{ +#ifdef CONFIG_SMP + rq->hrtick_csd_pending = 0; + + rq->hrtick_csd.flags = 0; + rq->hrtick_csd.func = __hrtick_start; + rq->hrtick_csd.info = rq; +#endif -static inline void hrtick_set(struct rq *rq) + hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rq->hrtick_timer.function = hrtick; + rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; +} +#else +static inline void hrtick_clear(struct rq *rq) { } @@ -1003,7 +1130,7 @@ static inline void init_rq_hrtick(struct rq *rq) { } -void hrtick_resched(void) +static inline void init_hrtick(void) { } #endif @@ -1021,16 +1148,16 @@ void hrtick_resched(void) #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) #endif -static void __resched_task(struct task_struct *p, int tif_bit) +static void resched_task(struct task_struct *p) { int cpu; assert_spin_locked(&task_rq(p)->lock); - if (unlikely(test_tsk_thread_flag(p, tif_bit))) + if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) return; - set_tsk_thread_flag(p, tif_bit); + set_tsk_thread_flag(p, TIF_NEED_RESCHED); cpu = task_cpu(p); if (cpu == smp_processor_id()) @@ -1052,13 +1179,56 @@ static void resched_cpu(int cpu) resched_task(cpu_curr(cpu)); spin_unlock_irqrestore(&rq->lock, flags); } -#else -static void __resched_task(struct task_struct *p, int tif_bit) + +#ifdef CONFIG_NO_HZ +/* + * When add_timer_on() enqueues a timer into the timer wheel of an + * idle CPU then this timer might expire before the next timer event + * which is scheduled to wake up that CPU. In case of a completely + * idle system the next event might even be infinite time into the + * future. wake_up_idle_cpu() ensures that the CPU is woken up and + * leaves the inner idle loop so the newly added timer is taken into + * account when the CPU goes back to idle and evaluates the timer + * wheel for the next timer event. 
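+ * The wakeup itself only sets TIF_NEED_RESCHED on the remote idle
+ * task and, if needed, sends a reschedule IPI; no new work is queued
+ * on that CPU.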
+ */ +void wake_up_idle_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (cpu == smp_processor_id()) + return; + + /* + * This is safe, as this function is called with the timer + * wheel base lock of (cpu) held. When the CPU is on the way + * to idle and has not yet set rq->curr to idle then it will + * be serialized on the timer wheel base lock and take the new + * timer into account automatically. + */ + if (rq->curr != rq->idle) + return; + + /* + * We can set TIF_RESCHED on the idle task of the other CPU + * lockless. The worst case is that the other CPU runs the + * idle task through an additional NOOP schedule() + */ + set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED); + + /* NEED_RESCHED must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(rq->idle)) + smp_send_reschedule(cpu); +} +#endif /* CONFIG_NO_HZ */ + +#else /* !CONFIG_SMP */ +static void resched_task(struct task_struct *p) { assert_spin_locked(&task_rq(p)->lock); - set_tsk_thread_flag(p, tif_bit); + set_tsk_need_resched(p); } -#endif +#endif /* CONFIG_SMP */ #if BITS_PER_LONG == 32 # define WMULT_CONST (~0UL) @@ -1073,14 +1243,22 @@ static void __resched_task(struct task_struct *p, int tif_bit) */ #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) +/* + * delta *= weight / lw + */ static unsigned long calc_delta_mine(unsigned long delta_exec, unsigned long weight, struct load_weight *lw) { u64 tmp; - if (unlikely(!lw->inv_weight)) - lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1); + if (!lw->inv_weight) { + if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) + lw->inv_weight = 1; + else + lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) + / (lw->weight+1); + } tmp = (u64)delta_exec * weight; /* @@ -1095,12 +1273,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); } -static inline unsigned long -calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) -{ - return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); -} - static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; @@ -1198,12 +1370,224 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime); static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} #endif +static inline void inc_cpu_load(struct rq *rq, unsigned long load) +{ + update_load_add(&rq->load, load); +} + +static inline void dec_cpu_load(struct rq *rq, unsigned long load) +{ + update_load_sub(&rq->load, load); +} + #ifdef CONFIG_SMP static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); -static unsigned long cpu_avg_load_per_task(int cpu); static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); -#endif /* CONFIG_SMP */ + +static unsigned long cpu_avg_load_per_task(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->nr_running) + rq->avg_load_per_task = rq->load.weight / rq->nr_running; + + return rq->avg_load_per_task; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED + +typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); + +/* + * Iterate the full tree, calling @down when first entering a node and @up when + * leaving it for the final time. 
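+ * The walk is iterative (goto based, no recursion) and runs under
+ * rcu_read_lock(); e.g. update_h_load() uses it as
+ * walk_tg_tree(tg_load_down, tg_nop, cpu, NULL).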
+ */ +static void +walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd) +{ + struct task_group *parent, *child; + + rcu_read_lock(); + parent = &root_task_group; +down: + (*down)(parent, cpu, sd); + list_for_each_entry_rcu(child, &parent->children, siblings) { + parent = child; + goto down; + +up: + continue; + } + (*up)(parent, cpu, sd); + + child = parent; + parent = parent->parent; + if (parent) + goto up; + rcu_read_unlock(); +} + +static void __set_se_shares(struct sched_entity *se, unsigned long shares); + +/* + * Calculate and set the cpu's group shares. + */ +static void +__update_group_shares_cpu(struct task_group *tg, int cpu, + unsigned long sd_shares, unsigned long sd_rq_weight) +{ + int boost = 0; + unsigned long shares; + unsigned long rq_weight; + + if (!tg->se[cpu]) + return; + + rq_weight = tg->cfs_rq[cpu]->load.weight; + + /* + * If there are currently no tasks on the cpu pretend there is one of + * average load so that when a new task gets to run here it will not + * get delayed by group starvation. + */ + if (!rq_weight) { + boost = 1; + rq_weight = NICE_0_LOAD; + } + + if (unlikely(rq_weight > sd_rq_weight)) + rq_weight = sd_rq_weight; + + /* + * \Sum shares * rq_weight + * shares = ----------------------- + * \Sum rq_weight + * + */ + shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); + + /* + * record the actual number of shares, not the boosted amount. + */ + tg->cfs_rq[cpu]->shares = boost ? 0 : shares; + tg->cfs_rq[cpu]->rq_weight = rq_weight; + + if (shares < MIN_SHARES) + shares = MIN_SHARES; + else if (shares > MAX_SHARES) + shares = MAX_SHARES; + + __set_se_shares(tg->se[cpu], shares); +} + +/* + * Re-compute the task group their per cpu shares over the given domain. + * This needs to be done in a bottom-up fashion because the rq weight of a + * parent group depends on the shares of its child groups. + */ +static void +tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) +{ + unsigned long rq_weight = 0; + unsigned long shares = 0; + int i; + + for_each_cpu_mask(i, sd->span) { + rq_weight += tg->cfs_rq[i]->load.weight; + shares += tg->cfs_rq[i]->shares; + } + + if ((!shares && rq_weight) || shares > tg->shares) + shares = tg->shares; + + if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) + shares = tg->shares; + + if (!rq_weight) + rq_weight = cpus_weight(sd->span) * NICE_0_LOAD; + + for_each_cpu_mask(i, sd->span) { + struct rq *rq = cpu_rq(i); + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __update_group_shares_cpu(tg, i, shares, rq_weight); + spin_unlock_irqrestore(&rq->lock, flags); + } +} + +/* + * Compute the cpu's hierarchical load factor for each task group. + * This needs to be done in a top-down fashion because the load of a child + * group is a fraction of its parents load. 
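+ *
+ *	h_load(tg) = h_load(parent) * shares(tg) / weight(parent)
+ *
+ * with the root group's h_load being the raw runqueue weight (the
+ * divisor gets a +1 in the code to avoid a division by zero).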
+ */ +static void +tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) +{ + unsigned long load; + + if (!tg->parent) { + load = cpu_rq(cpu)->load.weight; + } else { + load = tg->parent->cfs_rq[cpu]->h_load; + load *= tg->cfs_rq[cpu]->shares; + load /= tg->parent->cfs_rq[cpu]->load.weight + 1; + } + + tg->cfs_rq[cpu]->h_load = load; +} + +static void +tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) +{ +} + +static void update_shares(struct sched_domain *sd) +{ + u64 now = cpu_clock(raw_smp_processor_id()); + s64 elapsed = now - sd->last_update; + + if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { + sd->last_update = now; + walk_tg_tree(tg_nop, tg_shares_up, 0, sd); + } +} + +static void update_shares_locked(struct rq *rq, struct sched_domain *sd) +{ + spin_unlock(&rq->lock); + update_shares(sd); + spin_lock(&rq->lock); +} + +static void update_h_load(int cpu) +{ + walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); +} + +#else + +static inline void update_shares(struct sched_domain *sd) +{ +} + +static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) +{ +} + +#endif + +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED +static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) +{ +#ifdef CONFIG_SMP + cfs_rq->shares = shares; +#endif +} +#endif #include "sched_stats.h" #include "sched_idletask.c" @@ -1214,27 +1598,17 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); #endif #define sched_class_highest (&rt_sched_class) +#define for_each_class(class) \ + for (class = sched_class_highest; class; class = class->next) -static inline void inc_load(struct rq *rq, const struct task_struct *p) -{ - update_load_add(&rq->load, p->se.load.weight); -} - -static inline void dec_load(struct rq *rq, const struct task_struct *p) -{ - update_load_sub(&rq->load, p->se.load.weight); -} - -static void inc_nr_running(struct task_struct *p, struct rq *rq) +static void inc_nr_running(struct rq *rq) { rq->nr_running++; - inc_load(rq, p); } -static void dec_nr_running(struct task_struct *p, struct rq *rq) +static void dec_nr_running(struct rq *rq) { rq->nr_running--; - dec_load(rq, p); } static void set_load_weight(struct task_struct *p) @@ -1258,6 +1632,12 @@ static void set_load_weight(struct task_struct *p) p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; } +static void update_avg(u64 *avg, u64 sample) +{ + s64 diff = sample - *avg; + *avg += diff >> 3; +} + static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) { sched_info_queued(p); @@ -1267,6 +1647,13 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) { + if (sleep && p->se.last_wakeup) { + update_avg(&p->se.avg_overlap, + p->se.sum_exec_runtime - p->se.last_wakeup); + p->se.last_wakeup = 0; + } + + sched_info_dequeued(p); p->sched_class->dequeue_task(rq, p, sleep); p->se.on_rq = 0; } @@ -1326,7 +1713,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) rq->nr_uninterruptible--; enqueue_task(rq, p, wakeup); - inc_nr_running(p, rq); + inc_nr_running(rq); } /* @@ -1338,7 +1725,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) rq->nr_uninterruptible++; dequeue_task(rq, p, sleep); - dec_nr_running(p, rq); + dec_nr_running(rq); } /** @@ -1350,12 +1737,6 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } -/* Used instead of 
source_load when we know the type == 0 */ -unsigned long weighted_cpuload(const int cpu) -{ - return cpu_rq(cpu)->load.weight; -} - static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { set_task_rq(p, cpu); @@ -1384,6 +1765,12 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, #ifdef CONFIG_SMP +/* Used instead of source_load when we know the type == 0 */ +static unsigned long weighted_cpuload(const int cpu) +{ + return cpu_rq(cpu)->load.weight; +} + /* * Is this task likely cache-hot: */ @@ -1395,7 +1782,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) /* * Buddy candidates are cache hot: */ - if (&p->se == cfs_rq_of(&p->se)->next) + if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next)) return 1; if (p->sched_class != &fair_sched_class) @@ -1479,16 +1866,24 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) /* * wait_task_inactive - wait for a thread to unschedule. * + * If @match_state is nonzero, it's the @p->state value just checked and + * not expected to change. If it changes, i.e. @p might have woken up, + * then return zero. When we succeed in waiting for @p to be off its CPU, + * we return a positive number (its total switch count). If a second call + * a short while later returns the same number, the caller can be sure that + * @p has remained unscheduled the whole time. + * * The caller must ensure that the task *will* unschedule sometime soon, * else this function might spin for a *long* time. This function can't * be called with interrupts off, or it may introduce deadlock with * smp_call_function() if an IPI is sent by the same process we are * waiting to become inactive. */ -void wait_task_inactive(struct task_struct *p) +unsigned long wait_task_inactive(struct task_struct *p, long match_state) { unsigned long flags; int running, on_rq; + unsigned long ncsw; struct rq *rq; for (;;) { @@ -1511,8 +1906,11 @@ void wait_task_inactive(struct task_struct *p) * return false if the runqueue has changed and p * is actually now running somewhere else! */ - while (task_running(rq, p)) + while (task_running(rq, p)) { + if (match_state && unlikely(p->state != match_state)) + return 0; cpu_relax(); + } /* * Ok, time to look more closely! We need the rq @@ -1522,9 +1920,21 @@ void wait_task_inactive(struct task_struct *p) rq = task_rq_lock(p, &flags); running = task_running(rq, p); on_rq = p->se.on_rq; + ncsw = 0; + if (!match_state || p->state == match_state) { + ncsw = p->nivcsw + p->nvcsw; + if (unlikely(!ncsw)) + ncsw = 1; + } task_rq_unlock(rq, &flags); /* + * If it changed from the expected state, bail out now. + */ + if (unlikely(!ncsw)) + break; + + /* * Was it really running after all now that we * checked with the proper locks actually held? 
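	 * If it was, we simply retry: the caller guarantees that @p
	 * will unschedule soon, so the loop terminates.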
* @@ -1556,6 +1966,8 @@ void wait_task_inactive(struct task_struct *p) */ break; } + + return ncsw; } /*** @@ -1594,7 +2006,7 @@ static unsigned long source_load(int cpu, int type) struct rq *rq = cpu_rq(cpu); unsigned long total = weighted_cpuload(cpu); - if (type == 0) + if (type == 0 || !sched_feat(LB_BIAS)) return total; return min(rq->cpu_load[type-1], total); @@ -1609,25 +2021,13 @@ static unsigned long target_load(int cpu, int type) struct rq *rq = cpu_rq(cpu); unsigned long total = weighted_cpuload(cpu); - if (type == 0) + if (type == 0 || !sched_feat(LB_BIAS)) return total; return max(rq->cpu_load[type-1], total); } /* - * Return the average load per task on the cpu's run queue - */ -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - unsigned long n = rq->nr_running; - - return n ? total / n : SCHED_LOAD_SCALE; -} - -/* * find_idlest_group finds and returns the least busy CPU group within the * domain. */ @@ -1653,7 +2053,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) /* Tally up the load of all CPUs in the group */ avg_load = 0; - for_each_cpu_mask(i, group->cpumask) { + for_each_cpu_mask_nr(i, group->cpumask) { /* Bias balancing toward cpus of our domain */ if (local_group) load = source_load(i, load_idx); @@ -1685,17 +2085,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) * find_idlest_cpu - find the idlest cpu among the cpus in group. */ static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, + cpumask_t *tmp) { - cpumask_t tmp; unsigned long load, min_load = ULONG_MAX; int idlest = -1; int i; /* Traverse only the allowed CPUs */ - cpus_and(tmp, group->cpumask, p->cpus_allowed); + cpus_and(*tmp, group->cpumask, p->cpus_allowed); - for_each_cpu_mask(i, tmp) { + for_each_cpu_mask_nr(i, *tmp) { load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { @@ -1733,8 +2133,11 @@ static int sched_balance_self(int cpu, int flag) sd = tmp; } + if (sd) + update_shares(sd); + while (sd) { - cpumask_t span; + cpumask_t span, tmpmask; struct sched_group *group; int new_cpu, weight; @@ -1750,7 +2153,7 @@ static int sched_balance_self(int cpu, int flag) continue; } - new_cpu = find_idlest_cpu(group, t, cpu); + new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask); if (new_cpu == -1 || new_cpu == cpu) { /* Now try balancing at a lower domain level of cpu */ sd = sd->child; @@ -1796,6 +2199,25 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) long old_state; struct rq *rq; + if (!sched_feat(SYNC_WAKEUPS)) + sync = 0; + +#ifdef CONFIG_SMP + if (sched_feat(LB_WAKEUP_UPDATE)) { + struct sched_domain *sd; + + this_cpu = raw_smp_processor_id(); + cpu = task_cpu(p); + + for_each_domain(this_cpu, sd) { + if (cpu_isset(cpu, sd->span)) { + update_shares(sd); + break; + } + } + } +#endif + smp_wmb(); rq = task_rq_lock(p, &flags); old_state = p->state; @@ -1842,7 +2264,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) } } } -#endif +#endif /* CONFIG_SCHEDSTATS */ out_activate: #endif /* CONFIG_SMP */ @@ -1860,6 +2282,9 @@ out_activate: success = 1; out_running: + trace_mark(kernel_sched_wakeup, + "pid %d state %ld ## rq %p task %p rq->curr %p", + p->pid, p->state, rq, p, rq->curr); check_preempt_curr(rq, p); p->state = TASK_RUNNING; @@ -1868,6 
+2293,8 @@ out_running: p->sched_class->task_wake_up(rq, p); #endif out: + current->se.last_wakeup = current->se.sum_exec_runtime; + task_rq_unlock(rq, &flags); return success; @@ -1912,6 +2339,7 @@ static void __sched_fork(struct task_struct *p) INIT_LIST_HEAD(&p->rt.run_list); p->se.on_rq = 0; + INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); @@ -1987,8 +2415,11 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * management (if any): */ p->sched_class->task_new(rq, p); - inc_nr_running(p, rq); + inc_nr_running(rq); } + trace_mark(kernel_sched_wakeup_new, + "pid %d state %ld ## rq %p task %p rq->curr %p", + p->pid, p->state, rq, p, rq->curr); check_preempt_curr(rq, p); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) @@ -2041,7 +2472,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, notifier->ops->sched_out(notifier, next); } -#else +#else /* !CONFIG_PREEMPT_NOTIFIERS */ static void fire_sched_in_preempt_notifiers(struct task_struct *curr) { @@ -2053,7 +2484,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, { } -#endif +#endif /* CONFIG_PREEMPT_NOTIFIERS */ /** * prepare_task_switch - prepare to switch tasks @@ -2161,6 +2592,11 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); + trace_mark(kernel_sched_schedule, + "prev_pid %d next_pid %d prev_state %ld " + "## rq %p prev %p next %p", + prev->pid, next->pid, prev->state, + rq, prev, next); mm = next->mm; oldmm = prev->active_mm; /* @@ -2322,10 +2758,10 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2) } else { if (rq1 < rq2) { spin_lock(&rq1->lock); - spin_lock(&rq2->lock); + spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); } else { spin_lock(&rq2->lock); - spin_lock(&rq1->lock); + spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); } } update_rq_clock(rq1); @@ -2368,14 +2804,21 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest) if (busiest < this_rq) { spin_unlock(&this_rq->lock); spin_lock(&busiest->lock); - spin_lock(&this_rq->lock); + spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); ret = 1; } else - spin_lock(&busiest->lock); + spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); } return ret; } +static void double_unlock_balance(struct rq *this_rq, struct rq *busiest) + __releases(busiest->lock) +{ + spin_unlock(&busiest->lock); + lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); +} + /* * If dest_cpu is allowed for this process, migrate the task to it. 
* This is accomplished by forcing the cpu_allowed mask to only @@ -2390,7 +2833,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) rq = task_rq_lock(p, &flags); if (!cpu_isset(dest_cpu, p->cpus_allowed) - || unlikely(cpu_is_offline(dest_cpu))) + || unlikely(!cpu_active(dest_cpu))) goto out; /* force the process onto the specified CPU */ @@ -2495,7 +2938,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, struct rq_iterator *iterator) { - int loops = 0, pulled = 0, pinned = 0, skip_for_load; + int loops = 0, pulled = 0, pinned = 0; struct task_struct *p; long rem_load_move = max_load_move; @@ -2511,14 +2954,8 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, next: if (!p || loops++ > sysctl_sched_nr_migrate) goto out; - /* - * To help distribute high priority tasks across CPUs we don't - * skip a task if it will be the highest priority task (i.e. smallest - * prio value) on its new queue regardless of its load weight - */ - skip_for_load = (p->se.load.weight >> 1) > rem_load_move + - SCHED_LOAD_SCALE_FUZZ; - if ((skip_for_load && p->prio >= *this_best_prio) || + + if ((p->se.load.weight >> 1) > rem_load_move || !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { p = iterator->next(iterator->arg); goto next; @@ -2573,6 +3010,10 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, max_load_move - total_load_moved, sd, idle, all_pinned, &this_best_prio); class = class->next; + + if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) + break; + } while (class && max_load_move > total_load_moved); return total_load_moved > 0; @@ -2631,7 +3072,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, cpumask_t *cpus, int *balance) + int *sd_idle, const cpumask_t *cpus, int *balance) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; @@ -2649,6 +3090,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, max_load = this_load = total_load = total_pwr = 0; busiest_load_per_task = busiest_nr_running = 0; this_load_per_task = this_nr_running = 0; + if (idle == CPU_NOT_IDLE) load_idx = sd->busy_idx; else if (idle == CPU_NEWLY_IDLE) @@ -2663,6 +3105,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, int __group_imb = 0; unsigned int balance_cpu = -1, first_idle_cpu = 0; unsigned long sum_nr_running, sum_weighted_load; + unsigned long sum_avg_load_per_task; + unsigned long avg_load_per_task; local_group = cpu_isset(this_cpu, group->cpumask); @@ -2671,10 +3115,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, /* Tally up the load of all CPUs in the group */ sum_weighted_load = sum_nr_running = avg_load = 0; + sum_avg_load_per_task = avg_load_per_task = 0; + max_cpu_load = 0; min_cpu_load = ~0UL; - for_each_cpu_mask(i, group->cpumask) { + for_each_cpu_mask_nr(i, group->cpumask) { struct rq *rq; if (!cpu_isset(i, *cpus)) @@ -2704,6 +3150,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, avg_load += load; sum_nr_running += rq->nr_running; sum_weighted_load += weighted_cpuload(i); + + sum_avg_load_per_task += cpu_avg_load_per_task(i); } /* @@ -2725,7 +3173,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, avg_load = 
sg_div_cpu_power(group, avg_load * SCHED_LOAD_SCALE); - if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE) + + /* + * Consider the group unbalanced when the imbalance is larger + * than the average weight of two tasks. + * + * APZ: with cgroup the avg task weight can vary wildly and + * might not be a suitable number - should we keep a + * normalized nr_running number somewhere that negates + * the hierarchy? + */ + avg_load_per_task = sg_div_cpu_power(group, + sum_avg_load_per_task * SCHED_LOAD_SCALE); + + if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) __group_imb = 1; group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; @@ -2866,9 +3327,9 @@ small_imbalance: if (busiest_load_per_task > this_load_per_task) imbn = 1; } else - this_load_per_task = SCHED_LOAD_SCALE; + this_load_per_task = cpu_avg_load_per_task(this_cpu); - if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= + if (max_load - this_load + 2*busiest_load_per_task >= busiest_load_per_task * imbn) { *imbalance = busiest_load_per_task; return busiest; @@ -2932,13 +3393,13 @@ ret: */ static struct rq * find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, - unsigned long imbalance, cpumask_t *cpus) + unsigned long imbalance, const cpumask_t *cpus) { struct rq *busiest = NULL, *rq; unsigned long max_load = 0; int i; - for_each_cpu_mask(i, group->cpumask) { + for_each_cpu_mask_nr(i, group->cpumask) { unsigned long wl; if (!cpu_isset(i, *cpus)) @@ -2971,15 +3432,16 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, */ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, - int *balance) + int *balance, cpumask_t *cpus) { int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; - cpumask_t cpus = CPU_MASK_ALL; unsigned long flags; + cpus_setall(*cpus); + /* * When power savings policy is enabled for the parent domain, idle * sibling can pick up load irrespective of busy siblings. In this case, @@ -2993,8 +3455,9 @@ static int load_balance(int this_cpu, struct rq *this_rq, schedstat_inc(sd, lb_count[idle]); redo: + update_shares(sd); group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, - &cpus, balance); + cpus, balance); if (*balance == 0) goto out_balanced; @@ -3004,7 +3467,7 @@ redo: goto out_balanced; } - busiest = find_busiest_queue(group, idle, imbalance, &cpus); + busiest = find_busiest_queue(group, idle, imbalance, cpus); if (!busiest) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; @@ -3037,8 +3500,8 @@ redo: /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), cpus); - if (!cpus_empty(cpus)) + cpu_clear(cpu_of(busiest), *cpus); + if (!cpus_empty(*cpus)) goto redo; goto out_balanced; } @@ -3095,8 +3558,9 @@ redo: if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return -1; - return ld_moved; + ld_moved = -1; + + goto out; out_balanced: schedstat_inc(sd, lb_balanced[idle]); @@ -3111,8 +3575,13 @@ out_one_pinned: if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return -1; - return 0; + ld_moved = -1; + else + ld_moved = 0; +out: + if (ld_moved) + update_shares(sd); + return ld_moved; } /* @@ -3123,7 +3592,8 @@ out_one_pinned: * this_rq is locked. 
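 * The cpumask scratch space used while balancing is supplied by the
 * caller through @cpus.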
*/ static int -load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) +load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, + cpumask_t *cpus) { struct sched_group *group; struct rq *busiest = NULL; @@ -3131,7 +3601,8 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) int ld_moved = 0; int sd_idle = 0; int all_pinned = 0; - cpumask_t cpus = CPU_MASK_ALL; + + cpus_setall(*cpus); /* * When power savings policy is enabled for the parent domain, idle @@ -3145,15 +3616,15 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); redo: + update_shares_locked(this_rq, sd); group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, - &sd_idle, &cpus, NULL); + &sd_idle, cpus, NULL); if (!group) { schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); goto out_balanced; } - busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, - &cpus); + busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus); if (!busiest) { schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); goto out_balanced; @@ -3172,11 +3643,11 @@ redo: ld_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, CPU_NEWLY_IDLE, &all_pinned); - spin_unlock(&busiest->lock); + double_unlock_balance(this_rq, busiest); if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), cpus); - if (!cpus_empty(cpus)) + cpu_clear(cpu_of(busiest), *cpus); + if (!cpus_empty(*cpus)) goto redo; } } @@ -3189,6 +3660,7 @@ redo: } else sd->nr_balance_failed = 0; + update_shares_locked(this_rq, sd); return ld_moved; out_balanced: @@ -3210,6 +3682,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) struct sched_domain *sd; int pulled_task = -1; unsigned long next_balance = jiffies + HZ; + cpumask_t tmpmask; for_each_domain(this_cpu, sd) { unsigned long interval; @@ -3219,8 +3692,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq) if (sd->flags & SD_BALANCE_NEWIDLE) /* If we've pulled tasks over stop searching: */ - pulled_task = load_balance_newidle(this_cpu, - this_rq, sd); + pulled_task = load_balance_newidle(this_cpu, this_rq, + sd, &tmpmask); interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) @@ -3285,7 +3758,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) else schedstat_inc(sd, alb_failed); } - spin_unlock(&target_rq->lock); + double_unlock_balance(busiest_rq, target_rq); } #ifdef CONFIG_NO_HZ @@ -3328,7 +3801,7 @@ int select_nohz_load_balancer(int stop_tick) /* * If we are going offline and still the leader, give up! 
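 * (a CPU that has started going down is already !cpu_active())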
*/ - if (cpu_is_offline(cpu) && + if (!cpu_active(cpu) && atomic_read(&nohz.load_balancer) == cpu) { if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) BUG(); @@ -3379,6 +3852,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; + int need_serialize; + cpumask_t tmp; for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -3395,14 +3870,15 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) if (interval > HZ*NR_CPUS/10) interval = HZ*NR_CPUS/10; + need_serialize = sd->flags & SD_SERIALIZE; - if (sd->flags & SD_SERIALIZE) { + if (need_serialize) { if (!spin_trylock(&balancing)) goto out; } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &balance)) { + if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@ -3412,7 +3888,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) } sd->last_balance = jiffies; } - if (sd->flags & SD_SERIALIZE) + if (need_serialize) spin_unlock(&balancing); out: if (time_after(next_balance, sd->last_balance + interval)) { @@ -3465,7 +3941,7 @@ static void run_rebalance_domains(struct softirq_action *h) int balance_cpu; cpu_clear(this_cpu, cpus); - for_each_cpu_mask(balance_cpu, cpus) { + for_each_cpu_mask_nr(balance_cpu, cpus) { /* * If this cpu gets work to do, stop the load balancing * work being done for other cpus. Next load @@ -3518,7 +3994,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) */ int ilb = first_cpu(nohz.cpu_mask); - if (ilb != NR_CPUS) + if (ilb < nr_cpu_ids) resched_cpu(ilb); } } @@ -3601,6 +4077,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime) cpustat->nice = cputime64_add(cpustat->nice, tmp); else cpustat->user = cputime64_add(cpustat->user, tmp); + /* Account for user time used */ + acct_update_integrals(p); } /* @@ -3645,8 +4123,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset, struct rq *rq = this_rq(); cputime64_t tmp; - if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) - return account_guest_time(p, cputime); + if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { + account_guest_time(p, cputime); + return; + } p->stime = cputime_add(p->stime, cputime); @@ -3688,14 +4168,73 @@ void account_steal_time(struct task_struct *p, cputime_t steal) cputime64_t tmp = cputime_to_cputime64(steal); struct rq *rq = this_rq(); - if (p == rq->idle) { - p->stime = cputime_add(p->stime, steal); - if (atomic_read(&rq->nr_iowait) > 0) - cpustat->iowait = cputime64_add(cpustat->iowait, tmp); - else - cpustat->idle = cputime64_add(cpustat->idle, tmp); - } else - cpustat->steal = cputime64_add(cpustat->steal, tmp); + if (p == rq->idle) { + p->stime = cputime_add(p->stime, steal); + if (atomic_read(&rq->nr_iowait) > 0) + cpustat->iowait = cputime64_add(cpustat->iowait, tmp); + else + cpustat->idle = cputime64_add(cpustat->idle, tmp); + } else + cpustat->steal = cputime64_add(cpustat->steal, tmp); +} + +/* + * Use precise platform statistics if available: + */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +cputime_t task_utime(struct task_struct *p) +{ + return p->utime; +} + +cputime_t task_stime(struct task_struct *p) +{ + return p->stime; +} +#else +cputime_t task_utime(struct task_struct *p) +{ + clock_t utime = cputime_to_clock_t(p->utime), + total = 
utime + cputime_to_clock_t(p->stime); + u64 temp; + + /* + * Use CFS's precise accounting: + */ + temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); + + if (total) { + temp *= utime; + do_div(temp, total); + } + utime = (clock_t)temp; + + p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); + return p->prev_utime; +} + +cputime_t task_stime(struct task_struct *p) +{ + clock_t stime; + + /* + * Use CFS's precise accounting. (we subtract utime from + * the total, to make sure the total observed by userspace + * grows monotonically - apps rely on that): + */ + stime = nsec_to_clock_t(p->se.sum_exec_runtime) - + cputime_to_clock_t(task_utime(p)); + + if (stime >= 0) + p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); + + return p->prev_stime; +} +#endif + +inline cputime_t task_gtime(struct task_struct *p) +{ + return p->gtime; } /* @@ -3710,21 +4249,13 @@ void scheduler_tick(void) int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; - u64 next_tick = rq->tick_timestamp + TICK_NSEC; + + sched_clock_tick(); spin_lock(&rq->lock); - __update_rq_clock(rq); - /* - * Let rq->clock advance by at least TICK_NSEC: - */ - if (unlikely(rq->clock < next_tick)) { - rq->clock = next_tick; - rq->clock_underflows++; - } - rq->tick_timestamp = rq->clock; + update_rq_clock(rq); update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); - update_sched_rt_period(rq); spin_unlock(&rq->lock); #ifdef CONFIG_SMP @@ -3733,26 +4264,44 @@ void scheduler_tick(void) #endif } -#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ + defined(CONFIG_PREEMPT_TRACER)) + +static inline unsigned long get_parent_ip(unsigned long addr) +{ + if (in_lock_functions(addr)) { + addr = CALLER_ADDR2; + if (in_lock_functions(addr)) + addr = CALLER_ADDR3; + } + return addr; +} void __kprobes add_preempt_count(int val) { +#ifdef CONFIG_DEBUG_PREEMPT /* * Underflow? */ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) return; +#endif preempt_count() += val; +#ifdef CONFIG_DEBUG_PREEMPT /* * Spinlock count overflowing soon? */ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); +#endif + if (preempt_count() == val) + trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); } EXPORT_SYMBOL(add_preempt_count); void __kprobes sub_preempt_count(int val) { +#ifdef CONFIG_DEBUG_PREEMPT /* * Underflow? */ @@ -3764,7 +4313,10 @@ void __kprobes sub_preempt_count(int val) if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK))) return; +#endif + if (preempt_count() == val) + trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); preempt_count() -= val; } EXPORT_SYMBOL(sub_preempt_count); @@ -3782,6 +4334,7 @@ static noinline void __schedule_bug(struct task_struct *prev) prev->comm, prev->pid, preempt_count()); debug_show_held_locks(prev); + print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); @@ -3801,7 +4354,7 @@ static inline void schedule_debug(struct task_struct *prev) * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. 
*/ - if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state)) + if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) __schedule_bug(prev); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -3870,23 +4423,22 @@ need_resched_nonpreemptible: schedule_debug(prev); - hrtick_clear(rq); + if (sched_feat(HRTICK)) + hrtick_clear(rq); /* * Do the rq-clock update outside the rq lock: */ local_irq_disable(); - __update_rq_clock(rq); + update_rq_clock(rq); spin_lock(&rq->lock); clear_tsk_need_resched(prev); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - if (unlikely((prev->state & TASK_INTERRUPTIBLE) && - signal_pending(prev))) { + if (unlikely(signal_pending_state(prev->state, prev))) prev->state = TASK_RUNNING; - } else { + else deactivate_task(rq, prev, 1); - } switch_count = &prev->nvcsw; } @@ -3901,9 +4453,9 @@ need_resched_nonpreemptible: prev->sched_class->put_prev_task(rq, prev); next = pick_next_task(rq, prev); - sched_info_switch(prev, next); - if (likely(prev != next)) { + sched_info_switch(prev, next); + rq->nr_switches++; rq->curr = next; ++*switch_count; @@ -3918,8 +4470,6 @@ need_resched_nonpreemptible: } else spin_unlock_irq(&rq->lock); - hrtick_set(rq); - if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; @@ -3938,8 +4488,6 @@ EXPORT_SYMBOL(schedule); asmlinkage void __sched preempt_schedule(void) { struct thread_info *ti = current_thread_info(); - struct task_struct *task = current; - int saved_lock_depth; /* * If there is a non-zero preempt_count or interrupts are disabled, @@ -3950,16 +4498,7 @@ asmlinkage void __sched preempt_schedule(void) do { add_preempt_count(PREEMPT_ACTIVE); - - /* - * We keep the big kernel semaphore locked, but we - * clear ->lock_depth so that schedule() doesnt - * auto-release the semaphore: - */ - saved_lock_depth = task->lock_depth; - task->lock_depth = -1; schedule(); - task->lock_depth = saved_lock_depth; sub_preempt_count(PREEMPT_ACTIVE); /* @@ -3980,26 +4519,15 @@ EXPORT_SYMBOL(preempt_schedule); asmlinkage void __sched preempt_schedule_irq(void) { struct thread_info *ti = current_thread_info(); - struct task_struct *task = current; - int saved_lock_depth; /* Catch callers which need to be fixed */ BUG_ON(ti->preempt_count || !irqs_disabled()); do { add_preempt_count(PREEMPT_ACTIVE); - - /* - * We keep the big kernel semaphore locked, but we - * clear ->lock_depth so that schedule() doesnt - * auto-release the semaphore: - */ - saved_lock_depth = task->lock_depth; - task->lock_depth = -1; local_irq_enable(); schedule(); local_irq_disable(); - task->lock_depth = saved_lock_depth; sub_preempt_count(PREEMPT_ACTIVE); /* @@ -4134,22 +4662,20 @@ do_wait_for_common(struct completion *x, long timeout, int state) signal_pending(current)) || (state == TASK_KILLABLE && fatal_signal_pending(current))) { - __remove_wait_queue(&x->wait, &wait); - return -ERESTARTSYS; + timeout = -ERESTARTSYS; + break; } __set_current_state(state); spin_unlock_irq(&x->wait.lock); timeout = schedule_timeout(timeout); spin_lock_irq(&x->wait.lock); - if (!timeout) { - __remove_wait_queue(&x->wait, &wait); - return timeout; - } - } while (!x->done); + } while (!x->done && timeout); __remove_wait_queue(&x->wait, &wait); + if (!x->done) + return timeout; } x->done--; - return timeout; + return timeout ?: 1; } static long __sched @@ -4202,6 +4728,52 @@ int __sched wait_for_completion_killable(struct completion *x) } EXPORT_SYMBOL(wait_for_completion_killable); +/** + * try_wait_for_completion - try to decrement 
a completion without blocking + * @x: completion structure + * + * Returns: 0 if a decrement cannot be done without blocking + * 1 if a decrement succeeded. + * + * If a completion is being used as a counting completion, + * attempt to decrement the counter without blocking. This + * enables us to avoid waiting if the resource the completion + * is protecting is not available. + */ +bool try_wait_for_completion(struct completion *x) +{ + int ret = 1; + + spin_lock_irq(&x->wait.lock); + if (!x->done) + ret = 0; + else + x->done--; + spin_unlock_irq(&x->wait.lock); + return ret; +} +EXPORT_SYMBOL(try_wait_for_completion); + +/** + * completion_done - Test to see if a completion has any waiters + * @x: completion structure + * + * Returns: 0 if there are waiters (wait_for_completion() in progress) + * 1 if there are no waiters. + * + */ +bool completion_done(struct completion *x) +{ + int ret = 1; + + spin_lock_irq(&x->wait.lock); + if (!x->done) + ret = 0; + spin_unlock_irq(&x->wait.lock); + return ret; +} +EXPORT_SYMBOL(completion_done); + static long __sched sleep_on_common(wait_queue_head_t *q, int state, long timeout) { @@ -4324,10 +4896,8 @@ void set_user_nice(struct task_struct *p, long nice) goto out_unlock; } on_rq = p->se.on_rq; - if (on_rq) { + if (on_rq) dequeue_task(rq, p, 0); - dec_load(rq, p); - } p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); @@ -4337,7 +4907,6 @@ void set_user_nice(struct task_struct *p, long nice) if (on_rq) { enqueue_task(rq, p, 0); - inc_load(rq, p); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -4482,16 +5051,8 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) set_load_weight(p); } -/** - * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. - * @p: the task in question. - * @policy: new policy. - * @param: structure containing the new RT priority. - * - * NOTE that the task may be already dead. - */ -int sched_setscheduler(struct task_struct *p, int policy, - struct sched_param *param) +static int __sched_setscheduler(struct task_struct *p, int policy, + struct sched_param *param, bool user) { int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; @@ -4523,7 +5084,7 @@ recheck: /* * Allow unprivileged RT tasks to decrease priority: */ - if (!capable(CAP_SYS_NICE)) { + if (user && !capable(CAP_SYS_NICE)) { if (rt_policy(policy)) { unsigned long rlim_rtprio; @@ -4554,18 +5115,21 @@ recheck: return -EPERM; } + if (user) { #ifdef CONFIG_RT_GROUP_SCHED - /* - * Do not allow realtime tasks into groups that have no runtime - * assigned. - */ - if (rt_policy(policy) && task_group(p)->rt_runtime == 0) - return -EPERM; + /* + * Do not allow realtime tasks into groups that have no runtime + * assigned. + */ + if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) + return -EPERM; #endif - retval = security_task_setscheduler(p, policy, param); - if (retval) - return retval; + retval = security_task_setscheduler(p, policy, param); + if (retval) + return retval; + } + /* * make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: @@ -4608,8 +5172,39 @@ recheck: return 0; } + +/** + * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * NOTE that the task may be already dead. 
+ */ +int sched_setscheduler(struct task_struct *p, int policy, + struct sched_param *param) +{ + return __sched_setscheduler(p, policy, param, true); +} EXPORT_SYMBOL_GPL(sched_setscheduler); +/** + * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * Just like sched_setscheduler, only don't bother checking if the + * current context has permission. For example, this is needed in + * stop_machine(): we create temporary high priority worker threads, + * but our caller might not have that capability. + */ +int sched_setscheduler_nocheck(struct task_struct *p, int policy, + struct sched_param *param) +{ + return __sched_setscheduler(p, policy, param, false); +} + static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { @@ -4721,9 +5316,10 @@ out_unlock: return retval; } -long sched_setaffinity(pid_t pid, cpumask_t new_mask) +long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) { cpumask_t cpus_allowed; + cpumask_t new_mask = *in_mask; struct task_struct *p; int retval; @@ -4754,13 +5350,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) if (retval) goto out_unlock; - cpus_allowed = cpuset_cpus_allowed(p); + cpuset_cpus_allowed(p, &cpus_allowed); cpus_and(new_mask, new_mask, cpus_allowed); again: - retval = set_cpus_allowed(p, new_mask); + retval = set_cpus_allowed_ptr(p, &new_mask); if (!retval) { - cpus_allowed = cpuset_cpus_allowed(p); + cpuset_cpus_allowed(p, &cpus_allowed); if (!cpus_subset(new_mask, cpus_allowed)) { /* * We must have raced with a concurrent cpuset @@ -4804,27 +5400,9 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, if (retval) return retval; - return sched_setaffinity(pid, new_mask); + return sched_setaffinity(pid, &new_mask); } -/* - * Represents all cpu's present in the system - * In systems capable of hotplug, this map could dynamically grow - * as new cpu's are detected in the system via any platform specific - * method, such as ACPI for e.g. - */ - -cpumask_t cpu_present_map __read_mostly; -EXPORT_SYMBOL(cpu_present_map); - -#ifndef CONFIG_SMP -cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; -EXPORT_SYMBOL(cpu_online_map); - -cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; -EXPORT_SYMBOL(cpu_possible_map); -#endif - long sched_getaffinity(pid_t pid, cpumask_t *mask) { struct task_struct *p; @@ -4920,7 +5498,6 @@ static void __cond_resched(void) } while (need_resched()); } -#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY) int __sched _cond_resched(void) { if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && @@ -4931,7 +5508,6 @@ int __sched _cond_resched(void) return 0; } EXPORT_SYMBOL(_cond_resched); -#endif /* * cond_resched_lock() - if a reschedule is pending, drop the given lock, @@ -5123,7 +5699,7 @@ out_unlock: return retval; } -static const char stat_nam[] = "RSDTtZX"; +static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; void sched_show_task(struct task_struct *p) { @@ -5226,8 +5802,11 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! 
*/ +#if defined(CONFIG_PREEMPT) + task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); +#else task_thread_info(idle)->preempt_count = 0; - +#endif /* * The idle tasks have their own, simple scheduling class: */ @@ -5266,7 +5845,8 @@ static inline void sched_init_granularity(void) sysctl_sched_latency = limit; sysctl_sched_wakeup_granularity *= factor; - sysctl_sched_batch_wakeup_granularity *= factor; + + sysctl_sched_shares_ratelimit *= factor; } #ifdef CONFIG_SMP @@ -5295,7 +5875,7 @@ static inline void sched_init_granularity(void) * task must not exit() & deallocate itself prematurely. The * call is not atomic; no spinlocks may be held. */ -int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) +int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) { struct migration_req req; unsigned long flags; @@ -5303,23 +5883,29 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) int ret = 0; rq = task_rq_lock(p, &flags); - if (!cpus_intersects(new_mask, cpu_online_map)) { + if (!cpus_intersects(*new_mask, cpu_online_map)) { + ret = -EINVAL; + goto out; + } + + if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && + !cpus_equal(p->cpus_allowed, *new_mask))) { ret = -EINVAL; goto out; } if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, &new_mask); + p->sched_class->set_cpus_allowed(p, new_mask); else { - p->cpus_allowed = new_mask; - p->rt.nr_cpus_allowed = cpus_weight(new_mask); + p->cpus_allowed = *new_mask; + p->rt.nr_cpus_allowed = cpus_weight(*new_mask); } /* Can the task run on the task's current CPU? If so, we're done */ - if (cpu_isset(task_cpu(p), new_mask)) + if (cpu_isset(task_cpu(p), *new_mask)) goto out; - if (migrate_task(p, any_online_cpu(new_mask), &req)) { + if (migrate_task(p, any_online_cpu(*new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); @@ -5332,7 +5918,7 @@ out: return ret; } -EXPORT_SYMBOL_GPL(set_cpus_allowed); +EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); /* * Move (not current) task off this cpu, onto dest cpu. We're doing @@ -5350,7 +5936,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) struct rq *rq_dest, *rq_src; int ret = 0, on_rq; - if (unlikely(cpu_is_offline(dest_cpu))) + if (unlikely(!cpu_active(dest_cpu))) return ret; rq_src = cpu_rq(src_cpu); @@ -5359,10 +5945,10 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) double_rq_lock(rq_src, rq_dest); /* Already moved. */ if (task_cpu(p) != src_cpu) - goto out; + goto done; /* Affinity changed (again). */ if (!cpu_isset(dest_cpu, p->cpus_allowed)) - goto out; + goto fail; on_rq = p->se.on_rq; if (on_rq) @@ -5373,8 +5959,9 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) activate_task(rq_dest, p, 0); check_preempt_curr(rq_dest, p); } +done: ret = 1; -out: +fail: double_rq_unlock(rq_src, rq_dest); return ret; } @@ -5470,12 +6057,14 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) dest_cpu = any_online_cpu(mask); /* On any allowed CPU? */ - if (dest_cpu == NR_CPUS) + if (dest_cpu >= nr_cpu_ids) dest_cpu = any_online_cpu(p->cpus_allowed); /* No more Mr. Nice Guy. 
*/ - if (dest_cpu == NR_CPUS) { - cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p); + if (dest_cpu >= nr_cpu_ids) { + cpumask_t cpus_allowed; + + cpuset_cpus_allowed_locked(p, &cpus_allowed); /* * Try to stay on the same cpuset, where the * current cpuset may be a subset of all cpus. @@ -5511,7 +6100,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) */ static void migrate_nr_uninterruptible(struct rq *rq_src) { - struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); + struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR)); unsigned long flags; local_irq_save(flags); @@ -5622,6 +6211,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu) next = pick_next_task(rq, rq->curr); if (!next) break; + next->sched_class->put_prev_task(rq, next); migrate_dead(dead_cpu, next); } @@ -5793,6 +6383,36 @@ static void unregister_sched_domain_sysctl(void) } #endif +static void set_rq_online(struct rq *rq) +{ + if (!rq->online) { + const struct sched_class *class; + + cpu_set(rq->cpu, rq->rd->online); + rq->online = 1; + + for_each_class(class) { + if (class->rq_online) + class->rq_online(rq); + } + } +} + +static void set_rq_offline(struct rq *rq) +{ + if (rq->online) { + const struct sched_class *class; + + for_each_class(class) { + if (class->rq_offline) + class->rq_offline(rq); + } + + cpu_clear(rq->cpu, rq->rd->online); + rq->online = 0; + } +} + /* * migration_call - callback that gets triggered when a CPU is added. * Here we can start up the necessary migration thread for the new CPU. @@ -5830,7 +6450,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpu_isset(cpu, rq->rd->span)); - cpu_set(cpu, rq->rd->online); + + set_rq_online(rq); } spin_unlock_irqrestore(&rq->lock, flags); break; @@ -5891,7 +6512,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpu_isset(cpu, rq->rd->span)); - cpu_clear(cpu, rq->rd->online); + set_rq_offline(rq); } spin_unlock_irqrestore(&rq->lock, flags); break; @@ -5908,7 +6529,7 @@ static struct notifier_block __cpuinitdata migration_notifier = { .priority = 10 }; -void __init migration_init(void) +static int __init migration_init(void) { void *cpu = (void *)(long)smp_processor_id(); int err; @@ -5918,25 +6539,46 @@ void __init migration_init(void) BUG_ON(err == NOTIFY_BAD); migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); + + return err; } +early_initcall(migration_init); #endif #ifdef CONFIG_SMP -/* Number of possible processor ids */ -int nr_cpu_ids __read_mostly = NR_CPUS; -EXPORT_SYMBOL(nr_cpu_ids); - #ifdef CONFIG_SCHED_DEBUG -static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) +static inline const char *sd_level_to_string(enum sched_domain_level lvl) +{ + switch (lvl) { + case SD_LV_NONE: + return "NONE"; + case SD_LV_SIBLING: + return "SIBLING"; + case SD_LV_MC: + return "MC"; + case SD_LV_CPU: + return "CPU"; + case SD_LV_NODE: + return "NODE"; + case SD_LV_ALLNODES: + return "ALLNODES"; + case SD_LV_MAX: + return "MAX"; + + } + return "MAX"; +} + +static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, + cpumask_t *groupmask) { struct sched_group *group = sd->groups; - cpumask_t groupmask; - char str[NR_CPUS]; + char str[256]; - cpumask_scnprintf(str, NR_CPUS, sd->span); - cpus_clear(groupmask); + cpulist_scnprintf(str, sizeof(str), 
sd->span); + cpus_clear(*groupmask); printk(KERN_DEBUG "%*s domain %d: ", level, "", level); @@ -5948,7 +6590,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) return -1; } - printk(KERN_CONT "span %s\n", str); + printk(KERN_CONT "span %s level %s\n", + str, sd_level_to_string(sd->level)); if (!cpu_isset(cpu, sd->span)) { printk(KERN_ERR "ERROR: domain->span does not contain " @@ -5980,25 +6623,25 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) break; } - if (cpus_intersects(groupmask, group->cpumask)) { + if (cpus_intersects(*groupmask, group->cpumask)) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); break; } - cpus_or(groupmask, groupmask, group->cpumask); + cpus_or(*groupmask, *groupmask, group->cpumask); - cpumask_scnprintf(str, NR_CPUS, group->cpumask); + cpulist_scnprintf(str, sizeof(str), group->cpumask); printk(KERN_CONT " %s", str); group = group->next; } while (group != sd->groups); printk(KERN_CONT "\n"); - if (!cpus_equal(sd->span, groupmask)) + if (!cpus_equal(sd->span, *groupmask)) printk(KERN_ERR "ERROR: groups don't span domain->span\n"); - if (sd->parent && !cpus_subset(groupmask, sd->parent->span)) + if (sd->parent && !cpus_subset(*groupmask, sd->parent->span)) printk(KERN_ERR "ERROR: parent span is not a superset " "of domain->span\n"); return 0; @@ -6006,6 +6649,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) static void sched_domain_debug(struct sched_domain *sd, int cpu) { + cpumask_t *groupmask; int level = 0; if (!sd) { @@ -6015,18 +6659,25 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + if (!groupmask) { + printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); + return; + } + for (;;) { - if (sched_domain_debug_one(sd, cpu, level)) + if (sched_domain_debug_one(sd, cpu, level, groupmask)) break; level++; sd = sd->parent; if (!sd) break; } + kfree(groupmask); } -#else +#else /* !CONFIG_SCHED_DEBUG */ # define sched_domain_debug(sd, cpu) do { } while (0) -#endif +#endif /* CONFIG_SCHED_DEBUG */ static int sd_degenerate(struct sched_domain *sd) { @@ -6086,20 +6737,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) static void rq_attach_root(struct rq *rq, struct root_domain *rd) { unsigned long flags; - const struct sched_class *class; spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { struct root_domain *old_rd = rq->rd; - for (class = sched_class_highest; class; class = class->next) { - if (class->leave_domain) - class->leave_domain(rq); - } + if (cpu_isset(rq->cpu, old_rd->online)) + set_rq_offline(rq); cpu_clear(rq->cpu, old_rd->span); - cpu_clear(rq->cpu, old_rd->online); if (atomic_dec_and_test(&old_rd->refcount)) kfree(old_rd); @@ -6110,12 +6757,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) cpu_set(rq->cpu, rd->span); if (cpu_isset(rq->cpu, cpu_online_map)) - cpu_set(rq->cpu, rd->online); - - for (class = sched_class_highest; class; class = class->next) { - if (class->join_domain) - class->join_domain(rq); - } + set_rq_online(rq); spin_unlock_irqrestore(&rq->lock, flags); } @@ -6126,6 +6768,8 @@ static void init_rootdomain(struct root_domain *rd) cpus_clear(rd->span); cpus_clear(rd->online); + + cpupri_init(&rd->cpupri); } static void init_defrootdomain(void) @@ -6187,7 +6831,8 @@ static cpumask_t cpu_isolated_map = CPU_MASK_NONE; /* Setup the mask 
of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { - int ints[NR_CPUS], i; + static int __initdata ints[NR_CPUS]; + int i; str = get_options(str, ARRAY_SIZE(ints), ints); cpus_clear(cpu_isolated_map); @@ -6210,30 +6855,33 @@ __setup("isolcpus=", isolated_cpu_setup); * and ->cpu_power to 0. */ static void -init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, +init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, int (*group_fn)(int cpu, const cpumask_t *cpu_map, - struct sched_group **sg)) + struct sched_group **sg, + cpumask_t *tmpmask), + cpumask_t *covered, cpumask_t *tmpmask) { struct sched_group *first = NULL, *last = NULL; - cpumask_t covered = CPU_MASK_NONE; int i; - for_each_cpu_mask(i, span) { + cpus_clear(*covered); + + for_each_cpu_mask_nr(i, *span) { struct sched_group *sg; - int group = group_fn(i, cpu_map, &sg); + int group = group_fn(i, cpu_map, &sg, tmpmask); int j; - if (cpu_isset(i, covered)) + if (cpu_isset(i, *covered)) continue; - sg->cpumask = CPU_MASK_NONE; + cpus_clear(sg->cpumask); sg->__cpu_power = 0; - for_each_cpu_mask(j, span) { - if (group_fn(j, cpu_map, NULL) != group) + for_each_cpu_mask_nr(j, *span) { + if (group_fn(j, cpu_map, NULL, tmpmask) != group) continue; - cpu_set(j, covered); + cpu_set(j, *covered); cpu_set(j, sg->cpumask); } if (!first) @@ -6259,21 +6907,21 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, * * Should use nodemask_t. */ -static int find_next_best_node(int node, unsigned long *used_nodes) +static int find_next_best_node(int node, nodemask_t *used_nodes) { int i, n, val, min_val, best_node = 0; min_val = INT_MAX; - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < nr_node_ids; i++) { /* Start at @node */ - n = (node + i) % MAX_NUMNODES; + n = (node + i) % nr_node_ids; if (!nr_cpus_node(n)) continue; /* Skip already used nodes */ - if (test_bit(n, used_nodes)) + if (node_isset(n, *used_nodes)) continue; /* Simple min distance search */ @@ -6285,42 +6933,39 @@ static int find_next_best_node(int node, unsigned long *used_nodes) } } - set_bit(best_node, used_nodes); + node_set(best_node, *used_nodes); return best_node; } /** * sched_domain_node_span - get a cpumask for a node's sched_domain * @node: node whose cpumask we're constructing - * @size: number of nodes to include in this span + * @span: resulting cpumask * * Given a node, construct a good cpumask for its sched_domain to span. It * should be one that prevents unnecessary balancing, but also spreads tasks * out optimally. 
*/ -static cpumask_t sched_domain_node_span(int node) +static void sched_domain_node_span(int node, cpumask_t *span) { - DECLARE_BITMAP(used_nodes, MAX_NUMNODES); - cpumask_t span, nodemask; + nodemask_t used_nodes; + node_to_cpumask_ptr(nodemask, node); int i; - cpus_clear(span); - bitmap_zero(used_nodes, MAX_NUMNODES); + cpus_clear(*span); + nodes_clear(used_nodes); - nodemask = node_to_cpumask(node); - cpus_or(span, span, nodemask); - set_bit(node, used_nodes); + cpus_or(*span, *span, *nodemask); + node_set(node, used_nodes); for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { - int next_node = find_next_best_node(node, used_nodes); + int next_node = find_next_best_node(node, &used_nodes); - nodemask = node_to_cpumask(next_node); - cpus_or(span, span, nodemask); + node_to_cpumask_ptr_next(nodemask, next_node); + cpus_or(*span, *span, *nodemask); } - - return span; } -#endif +#endif /* CONFIG_NUMA */ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; @@ -6332,13 +6977,14 @@ static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); static int -cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) +cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, + cpumask_t *unused) { if (sg) *sg = &per_cpu(sched_group_cpus, cpu); return cpu; } -#endif +#endif /* CONFIG_SCHED_SMT */ /* * multi-core sched-domains: @@ -6346,23 +6992,26 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) #ifdef CONFIG_SCHED_MC static DEFINE_PER_CPU(struct sched_domain, core_domains); static DEFINE_PER_CPU(struct sched_group, sched_group_core); -#endif +#endif /* CONFIG_SCHED_MC */ #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) static int -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) +cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, + cpumask_t *mask) { int group; - cpumask_t mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(mask, mask, *cpu_map); - group = first_cpu(mask); + + *mask = per_cpu(cpu_sibling_map, cpu); + cpus_and(*mask, *mask, *cpu_map); + group = first_cpu(*mask); if (sg) *sg = &per_cpu(sched_group_core, group); return group; } #elif defined(CONFIG_SCHED_MC) static int -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) +cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, + cpumask_t *unused) { if (sg) *sg = &per_cpu(sched_group_core, cpu); @@ -6374,17 +7023,18 @@ static DEFINE_PER_CPU(struct sched_domain, phys_domains); static DEFINE_PER_CPU(struct sched_group, sched_group_phys); static int -cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) +cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, + cpumask_t *mask) { int group; #ifdef CONFIG_SCHED_MC - cpumask_t mask = cpu_coregroup_map(cpu); - cpus_and(mask, mask, *cpu_map); - group = first_cpu(mask); + *mask = cpu_coregroup_map(cpu); + cpus_and(*mask, *mask, *cpu_map); + group = first_cpu(*mask); #elif defined(CONFIG_SCHED_SMT) - cpumask_t mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(mask, mask, *cpu_map); - group = first_cpu(mask); + *mask = per_cpu(cpu_sibling_map, cpu); + cpus_and(*mask, *mask, *cpu_map); + group = first_cpu(*mask); #else group = cpu; #endif @@ -6400,19 +7050,19 @@ cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) * gets dynamically allocated. 
*/ static DEFINE_PER_CPU(struct sched_domain, node_domains); -static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; +static struct sched_group ***sched_group_nodes_bycpu; static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, - struct sched_group **sg) + struct sched_group **sg, cpumask_t *nodemask) { - cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); int group; - cpus_and(nodemask, nodemask, *cpu_map); - group = first_cpu(nodemask); + *nodemask = node_to_cpumask(cpu_to_node(cpu)); + cpus_and(*nodemask, *nodemask, *cpu_map); + group = first_cpu(*nodemask); if (sg) *sg = &per_cpu(sched_group_allnodes, group); @@ -6427,7 +7077,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) if (!sg) return; do { - for_each_cpu_mask(j, sg->cpumask) { + for_each_cpu_mask_nr(j, sg->cpumask) { struct sched_domain *sd; sd = &per_cpu(phys_domains, j); @@ -6444,27 +7094,27 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) sg = sg->next; } while (sg != group_head); } -#endif +#endif /* CONFIG_NUMA */ #ifdef CONFIG_NUMA /* Free memory allocated for various sched_group structures */ -static void free_sched_groups(const cpumask_t *cpu_map) +static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) { int cpu, i; - for_each_cpu_mask(cpu, *cpu_map) { + for_each_cpu_mask_nr(cpu, *cpu_map) { struct sched_group **sched_group_nodes = sched_group_nodes_bycpu[cpu]; if (!sched_group_nodes) continue; - for (i = 0; i < MAX_NUMNODES; i++) { - cpumask_t nodemask = node_to_cpumask(i); + for (i = 0; i < nr_node_ids; i++) { struct sched_group *oldsg, *sg = sched_group_nodes[i]; - cpus_and(nodemask, nodemask, *cpu_map); - if (cpus_empty(nodemask)) + *nodemask = node_to_cpumask(i); + cpus_and(*nodemask, *nodemask, *cpu_map); + if (cpus_empty(*nodemask)) continue; if (sg == NULL) @@ -6481,11 +7131,11 @@ next_sg: sched_group_nodes_bycpu[cpu] = NULL; } } -#else -static void free_sched_groups(const cpumask_t *cpu_map) +#else /* !CONFIG_NUMA */ +static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) { } -#endif +#endif /* CONFIG_NUMA */ /* * Initialize sched groups cpu_power. @@ -6540,13 +7190,111 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) } /* + * Initializers for schedule domains + * Non-inlined to reduce accumulated stack pressure in build_sched_domains() + */ + +#define SD_INIT(sd, type) sd_init_##type(sd) +#define SD_INIT_FUNC(type) \ +static noinline void sd_init_##type(struct sched_domain *sd) \ +{ \ + memset(sd, 0, sizeof(*sd)); \ + *sd = SD_##type##_INIT; \ + sd->level = SD_LV_##type; \ +} + +SD_INIT_FUNC(CPU) +#ifdef CONFIG_NUMA + SD_INIT_FUNC(ALLNODES) + SD_INIT_FUNC(NODE) +#endif +#ifdef CONFIG_SCHED_SMT + SD_INIT_FUNC(SIBLING) +#endif +#ifdef CONFIG_SCHED_MC + SD_INIT_FUNC(MC) +#endif + +/* + * To minimize stack usage kmalloc room for cpumasks and share the + * space as the usage in build_sched_domains() dictates. Used only + * if the amount of space is significant. 
+ */ +struct allmasks { + cpumask_t tmpmask; /* make this one first */ + union { + cpumask_t nodemask; + cpumask_t this_sibling_map; + cpumask_t this_core_map; + }; + cpumask_t send_covered; + +#ifdef CONFIG_NUMA + cpumask_t domainspan; + cpumask_t covered; + cpumask_t notcovered; +#endif +}; + +#if NR_CPUS > 128 +#define SCHED_CPUMASK_ALLOC 1 +#define SCHED_CPUMASK_FREE(v) kfree(v) +#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v +#else +#define SCHED_CPUMASK_ALLOC 0 +#define SCHED_CPUMASK_FREE(v) +#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v +#endif + +#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ + ((unsigned long)(a) + offsetof(struct allmasks, v)) + +static int default_relax_domain_level = -1; + +static int __init setup_relax_domain_level(char *str) +{ + unsigned long val; + + val = simple_strtoul(str, NULL, 0); + if (val < SD_LV_MAX) + default_relax_domain_level = val; + + return 1; +} +__setup("relax_domain_level=", setup_relax_domain_level); + +static void set_domain_attribute(struct sched_domain *sd, + struct sched_domain_attr *attr) +{ + int request; + + if (!attr || attr->relax_domain_level < 0) { + if (default_relax_domain_level < 0) + return; + else + request = default_relax_domain_level; + } else + request = attr->relax_domain_level; + if (request < sd->level) { + /* turn off idle balance on this domain */ + sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); + } else { + /* turn on idle balance on this domain */ + sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); + } +} + +/* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus */ -static int build_sched_domains(const cpumask_t *cpu_map) +static int __build_sched_domains(const cpumask_t *cpu_map, + struct sched_domain_attr *attr) { int i; struct root_domain *rd; + SCHED_CPUMASK_DECLARE(allmasks); + cpumask_t *tmpmask; #ifdef CONFIG_NUMA struct sched_group **sched_group_nodes = NULL; int sd_allnodes = 0; @@ -6554,45 +7302,69 @@ static int build_sched_domains(const cpumask_t *cpu_map) /* * Allocate the per-node list of sched groups */ - sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), + sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), GFP_KERNEL); if (!sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); return -ENOMEM; } - sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; #endif rd = alloc_rootdomain(); if (!rd) { printk(KERN_WARNING "Cannot alloc root domain\n"); +#ifdef CONFIG_NUMA + kfree(sched_group_nodes); +#endif + return -ENOMEM; + } + +#if SCHED_CPUMASK_ALLOC + /* get space for all scratch cpumask variables */ + allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL); + if (!allmasks) { + printk(KERN_WARNING "Cannot alloc cpumask array\n"); + kfree(rd); +#ifdef CONFIG_NUMA + kfree(sched_group_nodes); +#endif return -ENOMEM; } +#endif + tmpmask = (cpumask_t *)allmasks; + + +#ifdef CONFIG_NUMA + sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; +#endif /* * Set up domains for cpus specified by the cpu_map. 
*/ - for_each_cpu_mask(i, *cpu_map) { + for_each_cpu_mask_nr(i, *cpu_map) { struct sched_domain *sd = NULL, *p; - cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); + SCHED_CPUMASK_VAR(nodemask, allmasks); - cpus_and(nodemask, nodemask, *cpu_map); + *nodemask = node_to_cpumask(cpu_to_node(i)); + cpus_and(*nodemask, *nodemask, *cpu_map); #ifdef CONFIG_NUMA if (cpus_weight(*cpu_map) > - SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { + SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { sd = &per_cpu(allnodes_domains, i); - *sd = SD_ALLNODES_INIT; + SD_INIT(sd, ALLNODES); + set_domain_attribute(sd, attr); sd->span = *cpu_map; - cpu_to_allnodes_group(i, cpu_map, &sd->groups); + cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; sd_allnodes = 1; } else p = NULL; sd = &per_cpu(node_domains, i); - *sd = SD_NODE_INIT; - sd->span = sched_domain_node_span(cpu_to_node(i)); + SD_INIT(sd, NODE); + set_domain_attribute(sd, attr); + sched_domain_node_span(cpu_to_node(i), &sd->span); sd->parent = p; if (p) p->child = sd; @@ -6601,94 +7373,117 @@ static int build_sched_domains(const cpumask_t *cpu_map) p = sd; sd = &per_cpu(phys_domains, i); - *sd = SD_CPU_INIT; - sd->span = nodemask; + SD_INIT(sd, CPU); + set_domain_attribute(sd, attr); + sd->span = *nodemask; sd->parent = p; if (p) p->child = sd; - cpu_to_phys_group(i, cpu_map, &sd->groups); + cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); #ifdef CONFIG_SCHED_MC p = sd; sd = &per_cpu(core_domains, i); - *sd = SD_MC_INIT; + SD_INIT(sd, MC); + set_domain_attribute(sd, attr); sd->span = cpu_coregroup_map(i); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; - cpu_to_core_group(i, cpu_map, &sd->groups); + cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); #endif #ifdef CONFIG_SCHED_SMT p = sd; sd = &per_cpu(cpu_domains, i); - *sd = SD_SIBLING_INIT; + SD_INIT(sd, SIBLING); + set_domain_attribute(sd, attr); sd->span = per_cpu(cpu_sibling_map, i); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; - cpu_to_cpu_group(i, cpu_map, &sd->groups); + cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); #endif } #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ - for_each_cpu_mask(i, *cpu_map) { - cpumask_t this_sibling_map = per_cpu(cpu_sibling_map, i); - cpus_and(this_sibling_map, this_sibling_map, *cpu_map); - if (i != first_cpu(this_sibling_map)) + for_each_cpu_mask_nr(i, *cpu_map) { + SCHED_CPUMASK_VAR(this_sibling_map, allmasks); + SCHED_CPUMASK_VAR(send_covered, allmasks); + + *this_sibling_map = per_cpu(cpu_sibling_map, i); + cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map); + if (i != first_cpu(*this_sibling_map)) continue; init_sched_build_groups(this_sibling_map, cpu_map, - &cpu_to_cpu_group); + &cpu_to_cpu_group, + send_covered, tmpmask); } #endif #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ - for_each_cpu_mask(i, *cpu_map) { - cpumask_t this_core_map = cpu_coregroup_map(i); - cpus_and(this_core_map, this_core_map, *cpu_map); - if (i != first_cpu(this_core_map)) + for_each_cpu_mask_nr(i, *cpu_map) { + SCHED_CPUMASK_VAR(this_core_map, allmasks); + SCHED_CPUMASK_VAR(send_covered, allmasks); + + *this_core_map = cpu_coregroup_map(i); + cpus_and(*this_core_map, *this_core_map, *cpu_map); + if (i != first_cpu(*this_core_map)) continue; + init_sched_build_groups(this_core_map, cpu_map, - &cpu_to_core_group); + &cpu_to_core_group, + send_covered, tmpmask); } #endif /* Set up physical groups */ - for (i = 0; i < MAX_NUMNODES; i++) { - cpumask_t nodemask = node_to_cpumask(i); + for (i 
= 0; i < nr_node_ids; i++) { + SCHED_CPUMASK_VAR(nodemask, allmasks); + SCHED_CPUMASK_VAR(send_covered, allmasks); - cpus_and(nodemask, nodemask, *cpu_map); - if (cpus_empty(nodemask)) + *nodemask = node_to_cpumask(i); + cpus_and(*nodemask, *nodemask, *cpu_map); + if (cpus_empty(*nodemask)) continue; - init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); + init_sched_build_groups(nodemask, cpu_map, + &cpu_to_phys_group, + send_covered, tmpmask); } #ifdef CONFIG_NUMA /* Set up node groups */ - if (sd_allnodes) - init_sched_build_groups(*cpu_map, cpu_map, - &cpu_to_allnodes_group); + if (sd_allnodes) { + SCHED_CPUMASK_VAR(send_covered, allmasks); - for (i = 0; i < MAX_NUMNODES; i++) { + init_sched_build_groups(cpu_map, cpu_map, + &cpu_to_allnodes_group, + send_covered, tmpmask); + } + + for (i = 0; i < nr_node_ids; i++) { /* Set up node groups */ struct sched_group *sg, *prev; - cpumask_t nodemask = node_to_cpumask(i); - cpumask_t domainspan; - cpumask_t covered = CPU_MASK_NONE; + SCHED_CPUMASK_VAR(nodemask, allmasks); + SCHED_CPUMASK_VAR(domainspan, allmasks); + SCHED_CPUMASK_VAR(covered, allmasks); int j; - cpus_and(nodemask, nodemask, *cpu_map); - if (cpus_empty(nodemask)) { + *nodemask = node_to_cpumask(i); + cpus_clear(*covered); + + cpus_and(*nodemask, *nodemask, *cpu_map); + if (cpus_empty(*nodemask)) { sched_group_nodes[i] = NULL; continue; } - domainspan = sched_domain_node_span(i); - cpus_and(domainspan, domainspan, *cpu_map); + sched_domain_node_span(i, domainspan); + cpus_and(*domainspan, *domainspan, *cpu_map); sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); if (!sg) { @@ -6697,31 +7492,31 @@ static int build_sched_domains(const cpumask_t *cpu_map) goto error; } sched_group_nodes[i] = sg; - for_each_cpu_mask(j, nodemask) { + for_each_cpu_mask_nr(j, *nodemask) { struct sched_domain *sd; sd = &per_cpu(node_domains, j); sd->groups = sg; } sg->__cpu_power = 0; - sg->cpumask = nodemask; + sg->cpumask = *nodemask; sg->next = sg; - cpus_or(covered, covered, nodemask); + cpus_or(*covered, *covered, *nodemask); prev = sg; - for (j = 0; j < MAX_NUMNODES; j++) { - cpumask_t tmp, notcovered; - int n = (i + j) % MAX_NUMNODES; + for (j = 0; j < nr_node_ids; j++) { + SCHED_CPUMASK_VAR(notcovered, allmasks); + int n = (i + j) % nr_node_ids; + node_to_cpumask_ptr(pnodemask, n); - cpus_complement(notcovered, covered); - cpus_and(tmp, notcovered, *cpu_map); - cpus_and(tmp, tmp, domainspan); - if (cpus_empty(tmp)) + cpus_complement(*notcovered, *covered); + cpus_and(*tmpmask, *notcovered, *cpu_map); + cpus_and(*tmpmask, *tmpmask, *domainspan); + if (cpus_empty(*tmpmask)) break; - nodemask = node_to_cpumask(n); - cpus_and(tmp, tmp, nodemask); - if (cpus_empty(tmp)) + cpus_and(*tmpmask, *tmpmask, *pnodemask); + if (cpus_empty(*tmpmask)) continue; sg = kmalloc_node(sizeof(struct sched_group), @@ -6732,9 +7527,9 @@ static int build_sched_domains(const cpumask_t *cpu_map) goto error; } sg->__cpu_power = 0; - sg->cpumask = tmp; + sg->cpumask = *tmpmask; sg->next = prev->next; - cpus_or(covered, covered, tmp); + cpus_or(*covered, *covered, *tmpmask); prev->next = sg; prev = sg; } @@ -6743,40 +7538,41 @@ static int build_sched_domains(const cpumask_t *cpu_map) /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT - for_each_cpu_mask(i, *cpu_map) { + for_each_cpu_mask_nr(i, *cpu_map) { struct sched_domain *sd = &per_cpu(cpu_domains, i); init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC - for_each_cpu_mask(i, *cpu_map) { + 
for_each_cpu_mask_nr(i, *cpu_map) { struct sched_domain *sd = &per_cpu(core_domains, i); init_sched_groups_power(i, sd); } #endif - for_each_cpu_mask(i, *cpu_map) { + for_each_cpu_mask_nr(i, *cpu_map) { struct sched_domain *sd = &per_cpu(phys_domains, i); init_sched_groups_power(i, sd); } #ifdef CONFIG_NUMA - for (i = 0; i < MAX_NUMNODES; i++) + for (i = 0; i < nr_node_ids; i++) init_numa_sched_groups_power(sched_group_nodes[i]); if (sd_allnodes) { struct sched_group *sg; - cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); + cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg, + tmpmask); init_numa_sched_groups_power(sg); } #endif /* Attach the domains */ - for_each_cpu_mask(i, *cpu_map) { + for_each_cpu_mask_nr(i, *cpu_map) { struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i); @@ -6788,17 +7584,26 @@ static int build_sched_domains(const cpumask_t *cpu_map) cpu_attach_domain(sd, rd, i); } + SCHED_CPUMASK_FREE((void *)allmasks); return 0; #ifdef CONFIG_NUMA error: - free_sched_groups(cpu_map); + free_sched_groups(cpu_map, tmpmask); + SCHED_CPUMASK_FREE((void *)allmasks); return -ENOMEM; #endif } +static int build_sched_domains(const cpumask_t *cpu_map) +{ + return __build_sched_domains(cpu_map, NULL); +} + static cpumask_t *doms_cur; /* current sched domains */ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ +static struct sched_domain_attr *dattr_cur; + /* attribues of custom domains in 'doms_cur' */ /* * Special case: If a kmalloc of a doms_cur partition (array of @@ -6826,15 +7631,17 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) if (!doms_cur) doms_cur = &fallback_doms; cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); + dattr_cur = NULL; err = build_sched_domains(doms_cur); register_sched_domain_sysctl(); return err; } -static void arch_destroy_sched_domains(const cpumask_t *cpu_map) +static void arch_destroy_sched_domains(const cpumask_t *cpu_map, + cpumask_t *tmpmask) { - free_sched_groups(cpu_map); + free_sched_groups(cpu_map, tmpmask); } /* @@ -6843,14 +7650,31 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map) */ static void detach_destroy_domains(const cpumask_t *cpu_map) { + cpumask_t tmpmask; int i; unregister_sched_domain_sysctl(); - for_each_cpu_mask(i, *cpu_map) + for_each_cpu_mask_nr(i, *cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); synchronize_sched(); - arch_destroy_sched_domains(cpu_map); + arch_destroy_sched_domains(cpu_map, &tmpmask); +} + +/* handle null as "default" */ +static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, + struct sched_domain_attr *new, int idx_new) +{ + struct sched_domain_attr tmp; + + /* fast path */ + if (!new && !cur) + return 1; + + tmp = SD_ATTR_INIT; + return !memcmp(cur ? (cur + idx_cur) : &tmp, + new ? (new + idx_new) : &tmp, + sizeof(struct sched_domain_attr)); } /* @@ -6870,29 +7694,31 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) * ownership of it and will kfree it when done with it. If the caller * failed the kmalloc call, then it can pass in doms_new == NULL, * and partition_sched_domains() will fallback to the single partition - * 'fallback_doms'. + * 'fallback_doms', it also forces the domains to be rebuilt. + * + * If doms_new==NULL it will be replaced with cpu_online_map. + * ndoms_new==0 is a special case for destroying existing domains. + * It will not create the default domain. 
* * Call with hotplug lock held */ -void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) +void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, + struct sched_domain_attr *dattr_new) { - int i, j; + int i, j, n; - lock_doms_cur(); + mutex_lock(&sched_domains_mutex); /* always unregister in case we don't destroy any domains */ unregister_sched_domain_sysctl(); - if (doms_new == NULL) { - ndoms_new = 1; - doms_new = &fallback_doms; - cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); - } + n = doms_new ? ndoms_new : 0; /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { - for (j = 0; j < ndoms_new; j++) { - if (cpus_equal(doms_cur[i], doms_new[j])) + for (j = 0; j < n; j++) { + if (cpus_equal(doms_cur[i], doms_new[j]) + && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; } /* no match - a current sched domain not in new doms_new[] */ @@ -6901,14 +7727,23 @@ match1: ; } + if (doms_new == NULL) { + ndoms_cur = 0; + doms_new = &fallback_doms; + cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); + dattr_new = NULL; + } + /* Build new domains */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < ndoms_cur; j++) { - if (cpus_equal(doms_new[i], doms_cur[j])) + if (cpus_equal(doms_new[i], doms_cur[j]) + && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; } /* no match - add a new doms_new */ - build_sched_domains(doms_new + i); + __build_sched_domains(doms_new + i, + dattr_new ? dattr_new + i : NULL); match2: ; } @@ -6916,25 +7751,28 @@ match2: /* Remember the new sched domains */ if (doms_cur != &fallback_doms) kfree(doms_cur); + kfree(dattr_cur); /* kfree(NULL) is safe */ doms_cur = doms_new; + dattr_cur = dattr_new; ndoms_cur = ndoms_new; register_sched_domain_sysctl(); - unlock_doms_cur(); + mutex_unlock(&sched_domains_mutex); } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) int arch_reinit_sched_domains(void) { - int err; - get_online_cpus(); - detach_destroy_domains(&cpu_online_map); - err = arch_init_sched_domains(&cpu_online_map); + + /* Destroy domains first to force the rebuild */ + partition_sched_domains(0, NULL, NULL); + + rebuild_sched_domains(); put_online_cpus(); - return err; + return 0; } static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) @@ -6955,30 +7793,34 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) } #ifdef CONFIG_SCHED_MC -static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) +static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, + char *page) { return sprintf(page, "%u\n", sched_mc_power_savings); } -static ssize_t sched_mc_power_savings_store(struct sys_device *dev, +static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 0); } -static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, - sched_mc_power_savings_store); +static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, + sched_mc_power_savings_show, + sched_mc_power_savings_store); #endif #ifdef CONFIG_SCHED_SMT -static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) +static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, + char *page) { return sprintf(page, "%u\n", sched_smt_power_savings); } -static ssize_t sched_smt_power_savings_store(struct sys_device *dev, +static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, const char *buf, size_t count) { return 
sched_power_savings_store(buf, count, 1); } -static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, +static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, + sched_smt_power_savings_show, sched_smt_power_savings_store); #endif @@ -6998,62 +7840,83 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) #endif return err; } -#endif +#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ +#ifndef CONFIG_CPUSETS /* - * Force a reinitialization of the sched domains hierarchy. The domains - * and groups cannot be updated in place without racing with the balancing - * code, so we temporarily attach all running cpus to the NULL domain - * which will prevent rebalancing while the sched domains are recalculated. + * Add online and remove offline CPUs from the scheduler domains. + * When cpusets are enabled they take over this function. */ static int update_sched_domains(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + partition_sched_domains(1, NULL, NULL); + return NOTIFY_OK; + + default: + return NOTIFY_DONE; + } +} +#endif + +static int update_runtime(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + int cpu = (int)(long)hcpu; + + switch (action) { case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: - detach_destroy_domains(&cpu_online_map); + disable_runtime(cpu_rq(cpu)); return NOTIFY_OK; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: case CPU_ONLINE: case CPU_ONLINE_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - /* - * Fall through and re-initialise the domains. 
- */ - break; + enable_runtime(cpu_rq(cpu)); + return NOTIFY_OK; + default: return NOTIFY_DONE; } - - /* The hotplug lock is already held by cpu_up/cpu_down */ - arch_init_sched_domains(&cpu_online_map); - - return NOTIFY_OK; } void __init sched_init_smp(void) { cpumask_t non_isolated_cpus; +#if defined(CONFIG_NUMA) + sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), + GFP_KERNEL); + BUG_ON(sched_group_nodes_bycpu == NULL); +#endif get_online_cpus(); + mutex_lock(&sched_domains_mutex); arch_init_sched_domains(&cpu_online_map); cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); if (cpus_empty(non_isolated_cpus)) cpu_set(smp_processor_id(), non_isolated_cpus); + mutex_unlock(&sched_domains_mutex); put_online_cpus(); + +#ifndef CONFIG_CPUSETS /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); +#endif + + /* RT runtime code needs to handle some hotplug events */ + hotcpu_notifier(update_runtime, 0); + + init_hrtick(); /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed(current, non_isolated_cpus) < 0) + if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) BUG(); sched_init_granularity(); } @@ -7074,6 +7937,7 @@ int in_sched_functions(unsigned long addr) static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) { cfs_rq->tasks_timeline = RB_ROOT; + INIT_LIST_HEAD(&cfs_rq->tasks); #ifdef CONFIG_FAIR_GROUP_SCHED cfs_rq->rq = rq; #endif @@ -7103,6 +7967,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) rt_rq->rt_time = 0; rt_rq->rt_throttled = 0; + rt_rq->rt_runtime = 0; + spin_lock_init(&rt_rq->rt_runtime_lock); #ifdef CONFIG_RT_GROUP_SCHED rt_rq->rt_nr_boosted = 0; @@ -7111,10 +7977,11 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, - struct cfs_rq *cfs_rq, struct sched_entity *se, - int cpu, int add) +static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, + struct sched_entity *se, int cpu, int add, + struct sched_entity *parent) { + struct rq *rq = cpu_rq(cpu); tg->cfs_rq[cpu] = cfs_rq; init_cfs_rq(cfs_rq, rq); cfs_rq->tg = tg; @@ -7122,75 +7989,199 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); tg->se[cpu] = se; - se->cfs_rq = &rq->cfs; + /* se could be NULL for init_task_group */ + if (!se) + return; + + if (!parent) + se->cfs_rq = &rq->cfs; + else + se->cfs_rq = parent->my_q; + se->my_q = cfs_rq; se->load.weight = tg->shares; - se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); - se->parent = NULL; + se->load.inv_weight = 0; + se->parent = parent; } #endif #ifdef CONFIG_RT_GROUP_SCHED -static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, - struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, - int cpu, int add) +static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, + struct sched_rt_entity *rt_se, int cpu, int add, + struct sched_rt_entity *parent) { + struct rq *rq = cpu_rq(cpu); + tg->rt_rq[cpu] = rt_rq; init_rt_rq(rt_rq, rq); rt_rq->tg = tg; rt_rq->rt_se = rt_se; + rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; if (add) list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); tg->rt_se[cpu] = rt_se; - rt_se->rt_rq = &rq->rt; + if (!rt_se) + return; + + if (!parent) + rt_se->rt_rq = &rq->rt; + else + rt_se->rt_rq = parent->my_q; + rt_se->my_q = rt_rq; - rt_se->parent = NULL; + rt_se->parent = parent; 
INIT_LIST_HEAD(&rt_se->run_list); } #endif void __init sched_init(void) { - int highest_cpu = 0; int i, j; + unsigned long alloc_size = 0, ptr; + +#ifdef CONFIG_FAIR_GROUP_SCHED + alloc_size += 2 * nr_cpu_ids * sizeof(void **); +#endif +#ifdef CONFIG_RT_GROUP_SCHED + alloc_size += 2 * nr_cpu_ids * sizeof(void **); +#endif +#ifdef CONFIG_USER_SCHED + alloc_size *= 2; +#endif + /* + * As sched_init() is called before page_alloc is setup, + * we use alloc_bootmem(). + */ + if (alloc_size) { + ptr = (unsigned long)alloc_bootmem(alloc_size); + +#ifdef CONFIG_FAIR_GROUP_SCHED + init_task_group.se = (struct sched_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + init_task_group.cfs_rq = (struct cfs_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + +#ifdef CONFIG_USER_SCHED + root_task_group.se = (struct sched_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + root_task_group.cfs_rq = (struct cfs_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); +#endif /* CONFIG_USER_SCHED */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_RT_GROUP_SCHED + init_task_group.rt_se = (struct sched_rt_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + init_task_group.rt_rq = (struct rt_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + +#ifdef CONFIG_USER_SCHED + root_task_group.rt_se = (struct sched_rt_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + root_task_group.rt_rq = (struct rt_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); +#endif /* CONFIG_USER_SCHED */ +#endif /* CONFIG_RT_GROUP_SCHED */ + } #ifdef CONFIG_SMP init_defrootdomain(); #endif + init_rt_bandwidth(&def_rt_bandwidth, + global_rt_period(), global_rt_runtime()); + +#ifdef CONFIG_RT_GROUP_SCHED + init_rt_bandwidth(&init_task_group.rt_bandwidth, + global_rt_period(), global_rt_runtime()); +#ifdef CONFIG_USER_SCHED + init_rt_bandwidth(&root_task_group.rt_bandwidth, + global_rt_period(), RUNTIME_INF); +#endif /* CONFIG_USER_SCHED */ +#endif /* CONFIG_RT_GROUP_SCHED */ + #ifdef CONFIG_GROUP_SCHED list_add(&init_task_group.list, &task_groups); -#endif + INIT_LIST_HEAD(&init_task_group.children); + +#ifdef CONFIG_USER_SCHED + INIT_LIST_HEAD(&root_task_group.children); + init_task_group.parent = &root_task_group; + list_add(&init_task_group.siblings, &root_task_group.children); +#endif /* CONFIG_USER_SCHED */ +#endif /* CONFIG_GROUP_SCHED */ for_each_possible_cpu(i) { struct rq *rq; rq = cpu_rq(i); spin_lock_init(&rq->lock); - lockdep_set_class(&rq->lock, &rq->rq_lock_key); rq->nr_running = 0; - rq->clock = 1; init_cfs_rq(&rq->cfs, rq); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED init_task_group.shares = init_task_group_load; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); - init_tg_cfs_entry(rq, &init_task_group, +#ifdef CONFIG_CGROUP_SCHED + /* + * How much cpu bandwidth does init_task_group get? + * + * In case of task-groups formed thr' the cgroup filesystem, it + * gets 100% of the cpu resources in the system. This overall + * system cpu resource is divided among the tasks of + * init_task_group and its child task-groups in a fair manner, + * based on each entity's (task or task-group's) weight + * (se->load.weight). + * + * In other words, if init_task_group has 10 tasks of weight + * 1024) and two child groups A0 and A1 (of weight 1024 each), + * then A0's share of the cpu resource is: + * + * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% + * + * We achieve this by letting init_task_group's tasks sit + * directly in rq->cfs (i.e init_task_group->se[] = NULL). 
+ */ + init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); +#elif defined CONFIG_USER_SCHED + root_task_group.shares = NICE_0_LOAD; + init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL); + /* + * In case of task-groups formed thr' the user id of tasks, + * init_task_group represents tasks belonging to root user. + * Hence it forms a sibling of all subsequent groups formed. + * In this case, init_task_group gets only a fraction of overall + * system cpu resource, based on the weight assigned to root + * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished + * by letting tasks of init_task_group sit in a separate cfs_rq + * (init_cfs_rq) and having one entity represent this group of + * tasks in rq->cfs (i.e init_task_group->se[] != NULL). + */ + init_tg_cfs_entry(&init_task_group, &per_cpu(init_cfs_rq, i), - &per_cpu(init_sched_entity, i), i, 1); + &per_cpu(init_sched_entity, i), i, 1, + root_task_group.se[i]); #endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ + + rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #ifdef CONFIG_RT_GROUP_SCHED - init_task_group.rt_runtime = - sysctl_sched_rt_runtime * NSEC_PER_USEC; INIT_LIST_HEAD(&rq->leaf_rt_rq_list); - init_tg_rt_entry(rq, &init_task_group, +#ifdef CONFIG_CGROUP_SCHED + init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); +#elif defined CONFIG_USER_SCHED + init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); + init_tg_rt_entry(&init_task_group, &per_cpu(init_rt_rq, i), - &per_cpu(init_sched_rt_entity, i), i, 1); + &per_cpu(init_sched_rt_entity, i), i, 1, + root_task_group.rt_se[i]); +#endif #endif - rq->rt_period_expire = 0; - rq->rt_throttled = 0; for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; @@ -7201,13 +8192,13 @@ void __init sched_init(void) rq->next_balance = jiffies; rq->push_cpu = 0; rq->cpu = i; + rq->online = 0; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); rq_attach_root(rq, &def_root_domain); #endif init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); - highest_cpu = i; } set_load_weight(&init_task); @@ -7217,8 +8208,7 @@ void __init sched_init(void) #endif #ifdef CONFIG_SMP - nr_cpu_ids = highest_cpu + 1; - open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); #endif #ifdef CONFIG_RT_MUTEXES @@ -7275,6 +8265,7 @@ EXPORT_SYMBOL(__might_sleep); static void normalize_task(struct rq *rq, struct task_struct *p) { int on_rq; + update_rq_clock(rq); on_rq = p->se.on_rq; if (on_rq) @@ -7306,7 +8297,6 @@ void normalize_rt_tasks(void) p->se.sleep_start = 0; p->se.block_start = 0; #endif - task_rq(p)->clock = 0; if (!rt_task(p)) { /* @@ -7376,8 +8366,6 @@ void set_curr_task(int cpu, struct task_struct *p) #endif -#ifdef CONFIG_GROUP_SCHED - #ifdef CONFIG_FAIR_GROUP_SCHED static void free_fair_sched_group(struct task_group *tg) { @@ -7394,17 +8382,18 @@ static void free_fair_sched_group(struct task_group *tg) kfree(tg->se); } -static int alloc_fair_sched_group(struct task_group *tg) +static +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct cfs_rq *cfs_rq; - struct sched_entity *se; + struct sched_entity *se, *parent_se; struct rq *rq; int i; - tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); + tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); if (!tg->cfs_rq) goto err; - tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); + tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); if (!tg->se) goto err; @@ -7423,7 +8412,8 @@ static int 
@@ -7423,7 +8412,8 @@ static int alloc_fair_sched_group(struct task_group *tg)
 		if (!se)
 			goto err;
 
-		init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
+		parent_se = parent ? parent->se[i] : NULL;
+		init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
 	}
 
 	return 1;
@@ -7442,12 +8432,13 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
 {
 	list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
 }
-#else
+#else /* !CONFG_FAIR_GROUP_SCHED */
 static inline void free_fair_sched_group(struct task_group *tg)
 {
 }
 
-static inline int alloc_fair_sched_group(struct task_group *tg)
+static inline
+int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
 	return 1;
 }
@@ -7459,13 +8450,15 @@ static inline void register_fair_sched_group(struct task_group *tg, int cpu)
 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
 {
 }
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
 static void free_rt_sched_group(struct task_group *tg)
 {
 	int i;
 
+	destroy_rt_bandwidth(&tg->rt_bandwidth);
+
 	for_each_possible_cpu(i) {
 		if (tg->rt_rq)
 			kfree(tg->rt_rq[i]);
@@ -7477,21 +8470,23 @@ static void free_rt_sched_group(struct task_group *tg)
 	kfree(tg->rt_se);
 }
 
-static int alloc_rt_sched_group(struct task_group *tg)
+static
+int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 {
 	struct rt_rq *rt_rq;
-	struct sched_rt_entity *rt_se;
+	struct sched_rt_entity *rt_se, *parent_se;
 	struct rq *rq;
 	int i;
 
-	tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
+	tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
 	if (!tg->rt_rq)
 		goto err;
-	tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
+	tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
 	if (!tg->rt_se)
 		goto err;
 
-	tg->rt_runtime = 0;
+	init_rt_bandwidth(&tg->rt_bandwidth,
+			ktime_to_ns(def_rt_bandwidth.rt_period), 0);
 
 	for_each_possible_cpu(i) {
 		rq = cpu_rq(i);
@@ -7506,7 +8501,8 @@ static int alloc_rt_sched_group(struct task_group *tg)
 		if (!rt_se)
 			goto err;
 
-		init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0);
+		parent_se = parent ? parent->rt_se[i] : NULL;
+		init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
 	}
 
 	return 1;
@@ -7525,12 +8521,13 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
 {
 	list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
 }
-#else
+#else /* !CONFIG_RT_GROUP_SCHED */
 static inline void free_rt_sched_group(struct task_group *tg)
 {
 }
 
-static inline int alloc_rt_sched_group(struct task_group *tg)
+static inline
+int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 {
 	return 1;
 }
@@ -7542,8 +8539,9 @@ static inline void register_rt_sched_group(struct task_group *tg, int cpu)
 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
 {
 }
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
 
+#ifdef CONFIG_GROUP_SCHED
 static void free_sched_group(struct task_group *tg)
 {
 	free_fair_sched_group(tg);
@@ -7552,7 +8550,7 @@ static void free_sched_group(struct task_group *tg)
 }
 
 /* allocate runqueue etc for a new task group */
-struct task_group *sched_create_group(void)
+struct task_group *sched_create_group(struct task_group *parent)
 {
 	struct task_group *tg;
 	unsigned long flags;
@@ -7562,10 +8560,10 @@ struct task_group *sched_create_group(void)
 	if (!tg)
 		return ERR_PTR(-ENOMEM);
 
-	if (!alloc_fair_sched_group(tg))
+	if (!alloc_fair_sched_group(tg, parent))
 		goto err;
 
-	if (!alloc_rt_sched_group(tg))
+	if (!alloc_rt_sched_group(tg, parent))
 		goto err;
 
 	spin_lock_irqsave(&task_group_lock, flags);
@@ -7574,6 +8572,12 @@ struct task_group *sched_create_group(void)
 		register_rt_sched_group(tg, i);
 	}
 	list_add_rcu(&tg->list, &task_groups);
+
+	WARN_ON(!parent); /* root should already exist */
+
+	tg->parent = parent;
+	INIT_LIST_HEAD(&tg->children);
+	list_add_rcu(&tg->siblings, &parent->children);
 	spin_unlock_irqrestore(&task_group_lock, flags);
 
 	return tg;
@@ -7602,6 +8606,7 @@ void sched_destroy_group(struct task_group *tg)
 		unregister_rt_sched_group(tg, i);
 	}
 	list_del_rcu(&tg->list);
+	list_del_rcu(&tg->siblings);
 	spin_unlock_irqrestore(&task_group_lock, flags);
 
 	/* wait for possible concurrent references to cfs_rqs complete */
@@ -7645,27 +8650,34 @@ void sched_move_task(struct task_struct *tsk)
 
 	task_rq_unlock(rq, &flags);
 }
+#endif /* CONFIG_GROUP_SCHED */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
+static void __set_se_shares(struct sched_entity *se, unsigned long shares)
 {
 	struct cfs_rq *cfs_rq = se->cfs_rq;
-	struct rq *rq = cfs_rq->rq;
 	int on_rq;
 
-	spin_lock_irq(&rq->lock);
-
 	on_rq = se->on_rq;
 	if (on_rq)
 		dequeue_entity(cfs_rq, se, 0);
 
 	se->load.weight = shares;
-	se->load.inv_weight = div64_64((1ULL<<32), shares);
+	se->load.inv_weight = 0;
 
 	if (on_rq)
 		enqueue_entity(cfs_rq, se, 0);
+}
 
-	spin_unlock_irq(&rq->lock);
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
+{
+	struct cfs_rq *cfs_rq = se->cfs_rq;
+	struct rq *rq = cfs_rq->rq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rq->lock, flags);
+	__set_se_shares(se, shares);
+	spin_unlock_irqrestore(&rq->lock, flags);
 }
 
 static DEFINE_MUTEX(shares_mutex);
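With the parent pointer threaded through sched_create_group(), an in-kernel caller (the cgroup hooks further below are the real one) would create and configure a child group roughly as follows; the weight and budget are purely illustrative and assume both CONFIG_FAIR_GROUP_SCHED and CONFIG_RT_GROUP_SCHED:

static int example_make_child_group(void)
{
	struct task_group *tg;

	tg = sched_create_group(&init_task_group);	/* child of the default group */
	if (IS_ERR(tg))
		return PTR_ERR(tg);

	sched_group_set_shares(tg, 2048);	/* 2x the default CFS weight of 1024 */

	/*
	 * alloc_rt_sched_group() starts every new group with rt_runtime == 0,
	 * so realtime tasks cannot be attached until some budget is granted,
	 * e.g. 100ms out of every period:
	 */
	return sched_group_set_rt_runtime(tg, 100000);
}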
@@ -7676,12 +8688,15 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	unsigned long flags;
 
 	/*
-	 * A weight of 0 or 1 can cause arithmetics problems.
-	 * (The default weight is 1024 - so there's no practical
-	 * limitation from this.)
+	 * We can't change the weight of the root cgroup.
 	 */
-	if (shares < 2)
-		shares = 2;
+	if (!tg->se[0])
+		return -EINVAL;
+
+	if (shares < MIN_SHARES)
+		shares = MIN_SHARES;
+	else if (shares > MAX_SHARES)
+		shares = MAX_SHARES;
 
 	mutex_lock(&shares_mutex);
 	if (tg->shares == shares)
@@ -7690,6 +8705,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	spin_lock_irqsave(&task_group_lock, flags);
 	for_each_possible_cpu(i)
 		unregister_fair_sched_group(tg, i);
+	list_del_rcu(&tg->siblings);
 	spin_unlock_irqrestore(&task_group_lock, flags);
 
 	/* wait for any ongoing reference to this group to finish */
@@ -7700,8 +8716,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	 * w/o tripping rebalance_share or load_balance_fair.
 	 */
 	tg->shares = shares;
-	for_each_possible_cpu(i)
+	for_each_possible_cpu(i) {
+		/*
+		 * force a rebalance
+		 */
+		cfs_rq_set_shares(tg->cfs_rq[i], 0);
 		set_se_shares(tg->se[i], shares);
+	}
 
 	/*
 	 * Enable load balance activity on this group, by inserting it back on
@@ -7710,6 +8731,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	spin_lock_irqsave(&task_group_lock, flags);
 	for_each_possible_cpu(i)
 		register_fair_sched_group(tg, i);
+	list_add_rcu(&tg->siblings, &tg->parent->children);
 	spin_unlock_irqrestore(&task_group_lock, flags);
 done:
 	mutex_unlock(&shares_mutex);
@@ -7733,29 +8755,61 @@ static unsigned long to_ratio(u64 period, u64 runtime)
 	if (runtime == RUNTIME_INF)
 		return 1ULL << 16;
 
-	return div64_64(runtime << 16, period);
+	return div64_u64(runtime << 16, period);
 }
 
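to_ratio() expresses runtime/period as a 16-bit fixed-point fraction, now computed with div64_u64(). A worked example with a 1s period and 950ms runtime (the common defaults; values here are illustrative):

	/*
	 * to_ratio(1000000000, 950000000)
	 *	= (950000000 << 16) / 1000000000
	 *	= 62259			(~0.95 * 65536)
	 * to_ratio(period, RUNTIME_INF) short-circuits to 1 << 16 = 65536,
	 * i.e. a full 100% of the period.
	 */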
+#ifdef CONFIG_CGROUP_SCHED
+static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+{
+	struct task_group *tgi, *parent = tg->parent;
+	unsigned long total = 0;
+
+	if (!parent) {
+		if (global_rt_period() < period)
+			return 0;
+
+		return to_ratio(period, runtime) <
+			to_ratio(global_rt_period(), global_rt_runtime());
+	}
+
+	if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
+		return 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tgi, &parent->children, siblings) {
+		if (tgi == tg)
+			continue;
+
+		total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
+				tgi->rt_bandwidth.rt_runtime);
+	}
+	rcu_read_unlock();
+
+	return total + to_ratio(period, runtime) <=
+		to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
+				parent->rt_bandwidth.rt_runtime);
+}
+#elif defined CONFIG_USER_SCHED
 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 {
 	struct task_group *tgi;
 	unsigned long total = 0;
 	unsigned long global_ratio =
-		to_ratio(sysctl_sched_rt_period,
-			sysctl_sched_rt_runtime < 0 ?
-				RUNTIME_INF : sysctl_sched_rt_runtime);
+		to_ratio(global_rt_period(), global_rt_runtime());
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(tgi, &task_groups, list) {
 		if (tgi == tg)
 			continue;
 
-		total += to_ratio(period, tgi->rt_runtime);
+		total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
+				tgi->rt_bandwidth.rt_runtime);
 	}
 	rcu_read_unlock();
 
 	return total + to_ratio(period, runtime) < global_ratio;
 }
+#endif
 
 /* Must be called with tasklist_lock held */
 static inline int tg_has_rt_tasks(struct task_group *tg)
@@ -7768,19 +8822,14 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
 	return 0;
 }
 
-int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
+static int tg_set_bandwidth(struct task_group *tg,
+		u64 rt_period, u64 rt_runtime)
 {
-	u64 rt_runtime, rt_period;
-	int err = 0;
-
-	rt_period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
-	rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
-	if (rt_runtime_us == -1)
-		rt_runtime = RUNTIME_INF;
+	int i, err = 0;
 
 	mutex_lock(&rt_constraints_mutex);
 	read_lock(&tasklist_lock);
-	if (rt_runtime_us == 0 && tg_has_rt_tasks(tg)) {
+	if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
 		err = -EBUSY;
 		goto unlock;
 	}
@@ -7788,7 +8837,19 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
 		err = -EINVAL;
 		goto unlock;
 	}
-	tg->rt_runtime = rt_runtime;
+
+	spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
+	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
+	tg->rt_bandwidth.rt_runtime = rt_runtime;
+
+	for_each_possible_cpu(i) {
+		struct rt_rq *rt_rq = tg->rt_rq[i];
+
+		spin_lock(&rt_rq->rt_runtime_lock);
+		rt_rq->rt_runtime = rt_runtime;
+		spin_unlock(&rt_rq->rt_runtime_lock);
+	}
+	spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
 unlock:
 	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
@@ -7796,19 +8857,117 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
 	return err;
 }
 
+int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
+{
+	u64 rt_runtime, rt_period;
+
+	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+	rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
+	if (rt_runtime_us < 0)
+		rt_runtime = RUNTIME_INF;
+
+	return tg_set_bandwidth(tg, rt_period, rt_runtime);
+}
+
 long sched_group_rt_runtime(struct task_group *tg)
 {
 	u64 rt_runtime_us;
 
-	if (tg->rt_runtime == RUNTIME_INF)
+	if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
 		return -1;
 
-	rt_runtime_us = tg->rt_runtime;
+	rt_runtime_us = tg->rt_bandwidth.rt_runtime;
 	do_div(rt_runtime_us, NSEC_PER_USEC);
 	return rt_runtime_us;
 }
-#endif
-#endif /* CONFIG_GROUP_SCHED */
+
+int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
+{
+	u64 rt_runtime, rt_period;
+
+	rt_period = (u64)rt_period_us * NSEC_PER_USEC;
+	rt_runtime = tg->rt_bandwidth.rt_runtime;
+
+	if (rt_period == 0)
+		return -EINVAL;
+
+	return tg_set_bandwidth(tg, rt_period, rt_runtime);
+}
+
+long sched_group_rt_period(struct task_group *tg)
+{
+	u64 rt_period_us;
+
+	rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
+	do_div(rt_period_us, NSEC_PER_USEC);
+	return rt_period_us;
+}
+
+static int sched_rt_global_constraints(void)
+{
+	struct task_group *tg = &root_task_group;
+	u64 rt_runtime, rt_period;
+	int ret = 0;
+
+	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+	rt_runtime = tg->rt_bandwidth.rt_runtime;
+
+	mutex_lock(&rt_constraints_mutex);
+	if (!__rt_schedulable(tg, rt_period, rt_runtime))
+		ret = -EINVAL;
+	mutex_unlock(&rt_constraints_mutex);
+
+	return ret;
+}
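The __rt_schedulable() checks above boil down to comparing these fixed-point utilizations: a group's children, plus the new request, may not claim more bandwidth than the group itself holds. A worked example, assuming 1s periods everywhere (numbers are illustrative only):

	/*
	 * parent holds 950ms       -> to_ratio = 62259
	 * sibling A holds 600ms    -> to_ratio = 39321
	 * child B requests 400ms   -> to_ratio = 26214
	 *
	 * 39321 + 26214 = 65535 > 62259, so the request fails
	 * __rt_schedulable() and tg_set_bandwidth() returns -EINVAL.
	 */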
+#else /* !CONFIG_RT_GROUP_SCHED */
+static int sched_rt_global_constraints(void)
+{
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
+	for_each_possible_cpu(i) {
+		struct rt_rq *rt_rq = &cpu_rq(i)->rt;
+
+		spin_lock(&rt_rq->rt_runtime_lock);
+		rt_rq->rt_runtime = global_rt_runtime();
+		spin_unlock(&rt_rq->rt_runtime_lock);
+	}
+	spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
+
+	return 0;
+}
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+int sched_rt_handler(struct ctl_table *table, int write,
+		struct file *filp, void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int ret;
+	int old_period, old_runtime;
+	static DEFINE_MUTEX(mutex);
+
+	mutex_lock(&mutex);
+	old_period = sysctl_sched_rt_period;
+	old_runtime = sysctl_sched_rt_runtime;
+
+	ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+
+	if (!ret && write) {
+		ret = sched_rt_global_constraints();
+		if (ret) {
+			sysctl_sched_rt_period = old_period;
+			sysctl_sched_rt_runtime = old_runtime;
+		} else {
+			def_rt_bandwidth.rt_runtime = global_rt_runtime();
+			def_rt_bandwidth.rt_period =
+				ns_to_ktime(global_rt_period());
+		}
+	}
+	mutex_unlock(&mutex);
+
+	return ret;
+}
 
 #ifdef CONFIG_CGROUP_SCHED
 
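sched_rt_handler() backs the sched_rt_period_us and sched_rt_runtime_us sysctls (visible under /proc/sys/kernel/). With the usual defaults of 1000000us and 950000us, realtime tasks may consume at most 95% of every second; a negative runtime selects RUNTIME_INF, i.e. no RT throttling. A sketch of the expected behaviour (paths and values assume the standard sysctl layout):

	/*
	 * echo 1000000 > /proc/sys/kernel/sched_rt_period_us    (1s period)
	 * echo  950000 > /proc/sys/kernel/sched_rt_runtime_us   (95% RT budget)
	 * echo      -1 > /proc/sys/kernel/sched_rt_runtime_us   (unlimited)
	 *
	 * If sched_rt_global_constraints() rejects the new combination (for
	 * instance because child groups already hold more bandwidth than it
	 * would allow), the previous period/runtime pair is restored and the
	 * write fails with the returned error.
	 */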
@@ -7822,7 +8981,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
 static struct cgroup_subsys_state *
 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
-	struct task_group *tg;
+	struct task_group *tg, *parent;
 
 	if (!cgrp->parent) {
 		/* This is early initialization for the top cgroup */
@@ -7830,11 +8989,8 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
 		return &init_task_group.css;
 	}
 
-	/* we support only 1-level deep hierarchical scheduler atm */
-	if (cgrp->parent->parent)
-		return ERR_PTR(-EINVAL);
-
-	tg = sched_create_group();
+	parent = cgroup_tg(cgrp->parent);
+	tg = sched_create_group(parent);
 	if (IS_ERR(tg))
 		return ERR_PTR(-ENOMEM);
 
@@ -7858,7 +9014,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 {
 #ifdef CONFIG_RT_GROUP_SCHED
 	/* Don't accept realtime tasks when there is no way for them to run */
-	if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0)
+	if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
 		return -EINVAL;
 #else
 	/* We don't support RT-tasks being in separate groups */
@@ -7877,80 +9033,62 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype,
+static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
 				u64 shareval)
 {
 	return sched_group_set_shares(cgroup_tg(cgrp), shareval);
 }
 
-static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
+static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
 {
 	struct task_group *tg = cgroup_tg(cgrp);
 
 	return (u64) tg->shares;
 }
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
-				struct file *file,
-				const char __user *userbuf,
-				size_t nbytes, loff_t *unused_ppos)
+				s64 val)
 {
-	char buffer[64];
-	int retval = 0;
-	s64 val;
-	char *end;
-
-	if (!nbytes)
-		return -EINVAL;
-	if (nbytes >= sizeof(buffer))
-		return -E2BIG;
-	if (copy_from_user(buffer, userbuf, nbytes))
-		return -EFAULT;
-
-	buffer[nbytes] = 0;	/* nul-terminate */
-
-	/* strip newline if necessary */
-	if (nbytes && (buffer[nbytes-1] == '\n'))
-		buffer[nbytes-1] = 0;
-	val = simple_strtoll(buffer, &end, 0);
-	if (*end)
-		return -EINVAL;
+	return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
+}
 
-	/* Pass to subsystem */
-	retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
-	if (!retval)
-		retval = nbytes;
-	return retval;
+static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
+{
+	return sched_group_rt_runtime(cgroup_tg(cgrp));
 }
 
-static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft,
-				   struct file *file,
-				   char __user *buf, size_t nbytes,
-				   loff_t *ppos)
+static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
+		u64 rt_period_us)
 {
-	char tmp[64];
-	long val = sched_group_rt_runtime(cgroup_tg(cgrp));
-	int len = sprintf(tmp, "%ld\n", val);
+	return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
+}
 
-	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
+static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
+{
+	return sched_group_rt_period(cgroup_tg(cgrp));
 }
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
 
 static struct cftype cpu_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
 		.name = "shares",
-		.read_uint = cpu_shares_read_uint,
-		.write_uint = cpu_shares_write_uint,
+		.read_u64 = cpu_shares_read_u64,
+		.write_u64 = cpu_shares_write_u64,
 	},
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
 	{
 		.name = "rt_runtime_us",
-		.read = cpu_rt_runtime_read,
-		.write = cpu_rt_runtime_write,
+		.read_s64 = cpu_rt_runtime_read,
+		.write_s64 = cpu_rt_runtime_write,
+	},
+	{
+		.name = "rt_period_us",
+		.read_u64 = cpu_rt_period_read_uint,
+		.write_u64 = cpu_rt_period_write_uint,
 	},
 #endif
 };
@@ -7992,9 +9130,9 @@ struct cpuacct {
 struct cgroup_subsys cpuacct_subsys;
 
 /* return cpu accounting group corresponding to this container */
-static inline struct cpuacct *cgroup_ca(struct cgroup *cont)
+static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
 {
-	return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id),
+	return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
 			    struct cpuacct, css);
 }
 
@@ -8007,7 +9145,7 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk)
 
 /* create a new cpu accounting group */
 static struct cgroup_subsys_state *cpuacct_create(
-	struct cgroup_subsys *ss, struct cgroup *cont)
+	struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
 	struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
 
@@ -8025,18 +9163,18 @@ static struct cgroup_subsys_state *cpuacct_create(
 
 /* destroy an existing cpu accounting group */
 static void
-cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
+cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
-	struct cpuacct *ca = cgroup_ca(cont);
+	struct cpuacct *ca = cgroup_ca(cgrp);
 
 	free_percpu(ca->cpuusage);
 	kfree(ca);
 }
 
 /* return total cpu usage (in nanoseconds) of a group */
-static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft)
+static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
 {
-	struct cpuacct *ca = cgroup_ca(cont);
+	struct cpuacct *ca = cgroup_ca(cgrp);
 	u64 totalcpuusage = 0;
 	int i;
 
@@ -8055,16 +9193,40 @@ static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft)
 	return totalcpuusage;
 }
 
+static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
+								u64 reset)
+{
+	struct cpuacct *ca = cgroup_ca(cgrp);
+	int err = 0;
+	int i;
+
+	if (reset) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	for_each_possible_cpu(i) {
+		u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
+
+		spin_lock_irq(&cpu_rq(i)->lock);
+		*cpuusage = 0;
+		spin_unlock_irq(&cpu_rq(i)->lock);
+	}
+out:
+	return err;
+}
+
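After the conversion to the generic read_u64/write_u64/read_s64/write_s64 cftype helpers, each cpu cgroup exposes plain numeric files (cpu.shares, cpu.rt_runtime_us, cpu.rt_period_us, and a now-resettable cpuacct.usage). A userspace sketch; the /cgroup mount point and group name are assumptions, not something this patch mandates:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/* write a decimal value to one of the per-group scheduler knobs */
static int write_knob(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0) {
		close(fd);
		return -1;	/* e.g. EINVAL if __rt_schedulable() refused the value */
	}
	return close(fd);
}

int main(void)
{
	write_knob("/cgroup/mygrp/cpu.shares", "2048");			/* 2x default weight */
	write_knob("/cgroup/mygrp/cpu.rt_period_us", "1000000");	/* 1s RT period */
	write_knob("/cgroup/mygrp/cpu.rt_runtime_us", "100000");	/* 100ms RT budget */
	write_knob("/cgroup/mygrp/cpuacct.usage", "0");			/* reset accounting */
	return 0;
}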
 static struct cftype files[] = {
 	{
 		.name = "usage",
-		.read_uint = cpuusage_read,
+		.read_u64 = cpuusage_read,
+		.write_u64 = cpuusage_write,
 	},
 };
 
-static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont)
+static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
-	return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
+	return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
 }
 
 /*