X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=kernel%2Fsched.c;h=cda8b81f880131c90a1890cd9ef4c668588988dd;hb=fa08661af834875c9bd6f7f0b1b9388dc72a6585;hp=196d48babbef87c088214e4f252b4397d2bf25be;hpb=c4e1aa67ed9e4e542a064bc271ddbf152b677e91;p=safe%2Fjmp%2Flinux-2.6 diff --git a/kernel/sched.c b/kernel/sched.c index 196d48b..cda8b81 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -68,17 +69,18 @@ #include #include #include -#include #include #include #include -#include #include #include #include "sched_cpupri.h" +#define CREATE_TRACE_POINTS +#include + /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -118,12 +120,6 @@ */ #define RUNTIME_INF ((u64)~0ULL) -DEFINE_TRACE(sched_wait_task); -DEFINE_TRACE(sched_wakeup); -DEFINE_TRACE(sched_wakeup_new); -DEFINE_TRACE(sched_switch); -DEFINE_TRACE(sched_migrate_task); - #ifdef CONFIG_SMP static void double_rq_lock(struct rq *rq1, struct rq *rq2); @@ -231,13 +227,20 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) spin_lock(&rt_b->rt_runtime_lock); for (;;) { + unsigned long delta; + ktime_t soft, hard; + if (hrtimer_active(&rt_b->rt_period_timer)) break; now = hrtimer_cb_get_time(&rt_b->rt_period_timer); hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); - hrtimer_start_expires(&rt_b->rt_period_timer, - HRTIMER_MODE_ABS); + + soft = hrtimer_get_softexpires(&rt_b->rt_period_timer); + hard = hrtimer_get_expires(&rt_b->rt_period_timer); + delta = ktime_to_ns(ktime_sub(hard, soft)); + __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, + HRTIMER_MODE_ABS_PINNED, 0); } spin_unlock(&rt_b->rt_runtime_lock); } @@ -490,6 +493,7 @@ struct rt_rq { #endif #ifdef CONFIG_SMP unsigned long rt_nr_migratory; + unsigned long rt_nr_total; int overloaded; struct plist_head pushable_tasks; #endif @@ -577,6 +581,7 @@ struct rq { struct load_weight load; unsigned long nr_load_updates; u64 nr_switches; + u64 nr_migrations_in; struct cfs_rq cfs; struct rt_rq rt; @@ -623,6 +628,10 @@ struct rq { struct list_head migration_queue; #endif + /* calc_load related fields */ + unsigned long calc_load_update; + long calc_load_active; + #ifdef CONFIG_SCHED_HRTICK #ifdef CONFIG_SMP int hrtick_csd_pending; @@ -685,7 +694,7 @@ static inline int cpu_of(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -static inline void update_rq_clock(struct rq *rq) +inline void update_rq_clock(struct rq *rq) { rq->clock = sched_clock_cpu(cpu_of(rq)); } @@ -1110,7 +1119,7 @@ static void hrtick_start(struct rq *rq, u64 delay) if (rq == this_rq()) { hrtimer_restart(timer); } else if (!rq->hrtick_csd_pending) { - __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); + __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); rq->hrtick_csd_pending = 1; } } @@ -1146,7 +1155,8 @@ static __init void init_hrtick(void) */ static void hrtick_start(struct rq *rq, u64 delay) { - hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); + __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, + HRTIMER_MODE_REL_PINNED, 0); } static inline void init_hrtick(void) @@ -1410,10 +1420,22 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, struct rq_iterator *iterator); #endif +/* Time spent by the tasks of the cpu accounting group executing in ... */ +enum cpuacct_stat_index { + CPUACCT_STAT_USER, /* ... user mode */ + CPUACCT_STAT_SYSTEM, /* ... kernel mode */ + + CPUACCT_STAT_NSTATS, +}; + #ifdef CONFIG_CGROUP_CPUACCT static void cpuacct_charge(struct task_struct *tsk, u64 cputime); +static void cpuacct_update_stats(struct task_struct *tsk, + enum cpuacct_stat_index idx, cputime_t val); #else static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} +static inline void cpuacct_update_stats(struct task_struct *tsk, + enum cpuacct_stat_index idx, cputime_t val) {} #endif static inline void inc_cpu_load(struct rq *rq, unsigned long load) @@ -1708,6 +1730,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) } #endif +static void calc_load_account_active(struct rq *this_rq); + #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" @@ -1938,7 +1962,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) clock_offset = old_rq->clock - new_rq->clock; - trace_sched_migrate_task(p, task_cpu(p), new_cpu); + trace_sched_migrate_task(p, new_cpu); #ifdef CONFIG_SCHEDSTATS if (p->se.wait_start) @@ -1947,12 +1971,17 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) p->se.sleep_start -= clock_offset; if (p->se.block_start) p->se.block_start -= clock_offset; +#endif if (old_cpu != new_cpu) { - schedstat_inc(p, se.nr_migrations); + p->se.nr_migrations++; + new_rq->nr_migrations_in++; +#ifdef CONFIG_SCHEDSTATS if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); - } #endif + perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS, + 1, 1, NULL, 0); + } p->se.vruntime -= old_cfsrq->min_vruntime - new_cfsrq->min_vruntime; @@ -1995,6 +2024,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) } /* + * wait_task_context_switch - wait for a thread to complete at least one + * context switch. + * + * @p must not be current. + */ +void wait_task_context_switch(struct task_struct *p) +{ + unsigned long nvcsw, nivcsw, flags; + int running; + struct rq *rq; + + nvcsw = p->nvcsw; + nivcsw = p->nivcsw; + for (;;) { + /* + * The runqueue is assigned before the actual context + * switch. We need to take the runqueue lock. + * + * We could check initially without the lock but it is + * very likely that we need to take the lock in every + * iteration. + */ + rq = task_rq_lock(p, &flags); + running = task_running(rq, p); + task_rq_unlock(rq, &flags); + + if (likely(!running)) + break; + /* + * The switch count is incremented before the actual + * context switch. We thus wait for two switches to be + * sure at least one completed. + */ + if ((p->nvcsw - nvcsw) > 1) + break; + if ((p->nivcsw - nivcsw) > 1) + break; + + cpu_relax(); + } +} + +/* * wait_task_inactive - wait for a thread to unschedule. * * If @match_state is nonzero, it's the @p->state value just checked and @@ -2122,6 +2194,7 @@ void kick_process(struct task_struct *p) smp_send_reschedule(cpu); preempt_enable(); } +EXPORT_SYMBOL_GPL(kick_process); /* * Return a low guess at the load of a migration-source cpu weighted @@ -2304,6 +2377,27 @@ static int sched_balance_self(int cpu, int flag) #endif /* CONFIG_SMP */ +/** + * task_oncpu_function_call - call a function on the cpu on which a task runs + * @p: the task to evaluate + * @func: the function to be called + * @info: the function call argument + * + * Calls the function @func when the task is currently running. This might + * be on the current CPU, which just calls the function directly + */ +void task_oncpu_function_call(struct task_struct *p, + void (*func) (void *info), void *info) +{ + int cpu; + + preempt_disable(); + cpu = task_cpu(p); + if (task_curr(p)) + smp_call_function_single(cpu, func, info, 1); + preempt_enable(); +} + /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread @@ -2438,6 +2532,17 @@ out: return success; } +/** + * wake_up_process - Wake up a specific process + * @p: The process to be woken up. + * + * Attempt to wake up the nominated process and move it to the set of runnable + * processes. Returns 1 if the process was woken up, 0 if it was already + * running. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ int wake_up_process(struct task_struct *p) { return try_to_wake_up(p, TASK_ALL, 0); @@ -2460,21 +2565,44 @@ static void __sched_fork(struct task_struct *p) p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; + p->se.nr_migrations = 0; p->se.last_wakeup = 0; p->se.avg_overlap = 0; p->se.start_runtime = 0; p->se.avg_wakeup = sysctl_sched_wakeup_granularity; #ifdef CONFIG_SCHEDSTATS - p->se.wait_start = 0; - p->se.sum_sleep_runtime = 0; - p->se.sleep_start = 0; - p->se.block_start = 0; - p->se.sleep_max = 0; - p->se.block_max = 0; - p->se.exec_max = 0; - p->se.slice_max = 0; - p->se.wait_max = 0; + p->se.wait_start = 0; + p->se.wait_max = 0; + p->se.wait_count = 0; + p->se.wait_sum = 0; + + p->se.sleep_start = 0; + p->se.sleep_max = 0; + p->se.sum_sleep_runtime = 0; + + p->se.block_start = 0; + p->se.block_max = 0; + p->se.exec_max = 0; + p->se.slice_max = 0; + + p->se.nr_migrations_cold = 0; + p->se.nr_failed_migrations_affine = 0; + p->se.nr_failed_migrations_running = 0; + p->se.nr_failed_migrations_hot = 0; + p->se.nr_forced_migrations = 0; + p->se.nr_forced2_migrations = 0; + + p->se.nr_wakeups = 0; + p->se.nr_wakeups_sync = 0; + p->se.nr_wakeups_migrate = 0; + p->se.nr_wakeups_local = 0; + p->se.nr_wakeups_remote = 0; + p->se.nr_wakeups_affine = 0; + p->se.nr_wakeups_affine_attempts = 0; + p->se.nr_wakeups_passive = 0; + p->se.nr_wakeups_idle = 0; + #endif INIT_LIST_HEAD(&p->rt.run_list); @@ -2690,6 +2818,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) */ prev_state = prev->state; finish_arch_switch(prev); + perf_counter_task_sched_in(current, cpu_of(rq)); finish_lock_switch(rq, prev); #ifdef CONFIG_SMP if (post_schedule) @@ -2746,7 +2875,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * combine the page table reload and the switch backend into * one hypercall. */ - arch_enter_lazy_cpu_mode(); + arch_start_context_switch(prev); if (unlikely(!mm)) { next->active_mm = oldmm; @@ -2836,19 +2965,81 @@ unsigned long nr_iowait(void) return sum; } -unsigned long nr_active(void) +/* Variables and functions for calc_load */ +static atomic_long_t calc_load_tasks; +static unsigned long calc_load_update; +unsigned long avenrun[3]; +EXPORT_SYMBOL(avenrun); + +/** + * get_avenrun - get the load average array + * @loads: pointer to dest load array + * @offset: offset to add + * @shift: shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +{ + loads[0] = (avenrun[0] + offset) << shift; + loads[1] = (avenrun[1] + offset) << shift; + loads[2] = (avenrun[2] + offset) << shift; +} + +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) { - unsigned long i, running = 0, uninterruptible = 0; + load *= exp; + load += active * (FIXED_1 - exp); + return load >> FSHIFT; +} - for_each_online_cpu(i) { - running += cpu_rq(i)->nr_running; - uninterruptible += cpu_rq(i)->nr_uninterruptible; - } +/* + * calc_load - update the avenrun load estimates 10 ticks after the + * CPUs have updated calc_load_tasks. + */ +void calc_global_load(void) +{ + unsigned long upd = calc_load_update + 10; + long active; - if (unlikely((long)uninterruptible < 0)) - uninterruptible = 0; + if (time_before(jiffies, upd)) + return; + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; + + avenrun[0] = calc_load(avenrun[0], EXP_1, active); + avenrun[1] = calc_load(avenrun[1], EXP_5, active); + avenrun[2] = calc_load(avenrun[2], EXP_15, active); + + calc_load_update += LOAD_FREQ; +} + +/* + * Either called from update_cpu_load() or from a cpu going idle + */ +static void calc_load_account_active(struct rq *this_rq) +{ + long nr_active, delta; + + nr_active = this_rq->nr_running; + nr_active += (long) this_rq->nr_uninterruptible; + + if (nr_active != this_rq->calc_load_active) { + delta = nr_active - this_rq->calc_load_active; + this_rq->calc_load_active = nr_active; + atomic_long_add(delta, &calc_load_tasks); + } +} - return running + uninterruptible; +/* + * Externally visible per-cpu scheduler statistics: + * cpu_nr_migrations(cpu) - number of migrations into that cpu + */ +u64 cpu_nr_migrations(int cpu) +{ + return cpu_rq(cpu)->nr_migrations_in; } /* @@ -2879,6 +3070,11 @@ static void update_cpu_load(struct rq *this_rq) new_load += scale-1; this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; } + + if (time_after_eq(jiffies, this_rq->calc_load_update)) { + this_rq->calc_load_update += LOAD_FREQ; + calc_load_account_active(this_rq); + } } #ifdef CONFIG_SMP @@ -3818,19 +4014,23 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, */ #define MAX_PINNED_INTERVAL 512 +/* Working cpumask for load_balance and load_balance_newidle. */ +static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); + /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. */ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, - int *balance, struct cpumask *cpus) + int *balance) { int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; unsigned long flags; + struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); cpumask_setall(cpus); @@ -3985,8 +4185,7 @@ out: * this_rq is locked. */ static int -load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, - struct cpumask *cpus) +load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) { struct sched_group *group; struct rq *busiest = NULL; @@ -3994,6 +4193,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, int ld_moved = 0; int sd_idle = 0; int all_pinned = 0; + struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); cpumask_setall(cpus); @@ -4134,10 +4334,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq) struct sched_domain *sd; int pulled_task = 0; unsigned long next_balance = jiffies + HZ; - cpumask_var_t tmpmask; - - if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC)) - return; for_each_domain(this_cpu, sd) { unsigned long interval; @@ -4148,7 +4344,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) if (sd->flags & SD_BALANCE_NEWIDLE) /* If we've pulled tasks over stop searching: */ pulled_task = load_balance_newidle(this_cpu, this_rq, - sd, tmpmask); + sd); interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) @@ -4163,7 +4359,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq) */ this_rq->next_balance = next_balance; } - free_cpumask_var(tmpmask); } /* @@ -4221,10 +4416,131 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) static struct { atomic_t load_balancer; cpumask_var_t cpu_mask; + cpumask_var_t ilb_grp_nohz_mask; } nohz ____cacheline_aligned = { .load_balancer = ATOMIC_INIT(-1), }; +int get_nohz_load_balancer(void) +{ + return atomic_read(&nohz.load_balancer); +} + +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +/** + * lowest_flag_domain - Return lowest sched_domain containing flag. + * @cpu: The cpu whose lowest level of sched domain is to + * be returned. + * @flag: The flag to check for the lowest sched_domain + * for the given cpu. + * + * Returns the lowest sched_domain of a cpu which contains the given flag. + */ +static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) +{ + struct sched_domain *sd; + + for_each_domain(cpu, sd) + if (sd && (sd->flags & flag)) + break; + + return sd; +} + +/** + * for_each_flag_domain - Iterates over sched_domains containing the flag. + * @cpu: The cpu whose domains we're iterating over. + * @sd: variable holding the value of the power_savings_sd + * for cpu. + * @flag: The flag to filter the sched_domains to be iterated. + * + * Iterates over all the scheduler domains for a given cpu that has the 'flag' + * set, starting from the lowest sched_domain to the highest. + */ +#define for_each_flag_domain(cpu, sd, flag) \ + for (sd = lowest_flag_domain(cpu, flag); \ + (sd && (sd->flags & flag)); sd = sd->parent) + +/** + * is_semi_idle_group - Checks if the given sched_group is semi-idle. + * @ilb_group: group to be checked for semi-idleness + * + * Returns: 1 if the group is semi-idle. 0 otherwise. + * + * We define a sched_group to be semi idle if it has atleast one idle-CPU + * and atleast one non-idle CPU. This helper function checks if the given + * sched_group is semi-idle or not. + */ +static inline int is_semi_idle_group(struct sched_group *ilb_group) +{ + cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, + sched_group_cpus(ilb_group)); + + /* + * A sched_group is semi-idle when it has atleast one busy cpu + * and atleast one idle cpu. + */ + if (cpumask_empty(nohz.ilb_grp_nohz_mask)) + return 0; + + if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) + return 0; + + return 1; +} +/** + * find_new_ilb - Finds the optimum idle load balancer for nomination. + * @cpu: The cpu which is nominating a new idle_load_balancer. + * + * Returns: Returns the id of the idle load balancer if it exists, + * Else, returns >= nr_cpu_ids. + * + * This algorithm picks the idle load balancer such that it belongs to a + * semi-idle powersavings sched_domain. The idea is to try and avoid + * completely idle packages/cores just for the purpose of idle load balancing + * when there are other idle cpu's which are better suited for that job. + */ +static int find_new_ilb(int cpu) +{ + struct sched_domain *sd; + struct sched_group *ilb_group; + + /* + * Have idle load balancer selection from semi-idle packages only + * when power-aware load balancing is enabled + */ + if (!(sched_smt_power_savings || sched_mc_power_savings)) + goto out_done; + + /* + * Optimize for the case when we have no idle CPUs or only one + * idle CPU. Don't walk the sched_domain hierarchy in such cases + */ + if (cpumask_weight(nohz.cpu_mask) < 2) + goto out_done; + + for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { + ilb_group = sd->groups; + + do { + if (is_semi_idle_group(ilb_group)) + return cpumask_first(nohz.ilb_grp_nohz_mask); + + ilb_group = ilb_group->next; + + } while (ilb_group != sd->groups); + } + +out_done: + return cpumask_first(nohz.cpu_mask); +} +#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ +static inline int find_new_ilb(int call_cpu) +{ + return cpumask_first(nohz.cpu_mask); +} +#endif + /* * This routine will try to nominate the ilb (idle load balancing) * owner among the cpus whose ticks are stopped. ilb owner will do the idle @@ -4279,8 +4595,24 @@ int select_nohz_load_balancer(int stop_tick) /* make me the ilb owner */ if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) return 1; - } else if (atomic_read(&nohz.load_balancer) == cpu) + } else if (atomic_read(&nohz.load_balancer) == cpu) { + int new_ilb; + + if (!(sched_smt_power_savings || + sched_mc_power_savings)) + return 1; + /* + * Check to see if there is a more power-efficient + * ilb. + */ + new_ilb = find_new_ilb(cpu); + if (new_ilb < nr_cpu_ids && new_ilb != cpu) { + atomic_set(&nohz.load_balancer, -1); + resched_cpu(new_ilb); + return 0; + } return 1; + } } else { if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) return 0; @@ -4313,11 +4645,6 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; int need_serialize; - cpumask_var_t tmp; - - /* Fails alloc? Rebalancing probably not a priority right now. */ - if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) - return; for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -4342,7 +4669,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &balance, tmp)) { + if (load_balance(cpu, rq, sd, idle, &balance)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@ -4376,8 +4703,6 @@ out: */ if (likely(update_next_balance)) rq->next_balance = next_balance; - - free_cpumask_var(tmp); } /* @@ -4456,15 +4781,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) } if (atomic_read(&nohz.load_balancer) == -1) { - /* - * simple selection for now: Nominate the - * first cpu in the nohz list to be the next - * ilb owner. - * - * TBD: Traverse the sched domains and nominate - * the nearest cpu in the nohz.cpu_mask. - */ - int ilb = cpumask_first(nohz.cpu_mask); + int ilb = find_new_ilb(cpu); if (ilb < nr_cpu_ids) resched_cpu(ilb); @@ -4511,9 +4828,25 @@ DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); /* - * Return any ns on the sched_clock that have not yet been banked in + * Return any ns on the sched_clock that have not yet been accounted in * @p in case that task is currently running. + * + * Called with task_rq_lock() held on @rq. */ +static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) +{ + u64 ns = 0; + + if (task_current(rq, p)) { + update_rq_clock(rq); + ns = rq->clock - p->se.exec_start; + if ((s64)ns < 0) + ns = 0; + } + + return ns; +} + unsigned long long task_delta_exec(struct task_struct *p) { unsigned long flags; @@ -4521,16 +4854,49 @@ unsigned long long task_delta_exec(struct task_struct *p) u64 ns = 0; rq = task_rq_lock(p, &flags); + ns = do_task_delta_exec(p, rq); + task_rq_unlock(rq, &flags); - if (task_current(rq, p)) { - u64 delta_exec; + return ns; +} - update_rq_clock(rq); - delta_exec = rq->clock - p->se.exec_start; - if ((s64)delta_exec > 0) - ns = delta_exec; - } +/* + * Return accounted runtime for the task. + * In case the task is currently running, return the runtime plus current's + * pending runtime that have not been accounted yet. + */ +unsigned long long task_sched_runtime(struct task_struct *p) +{ + unsigned long flags; + struct rq *rq; + u64 ns = 0; + rq = task_rq_lock(p, &flags); + ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); + task_rq_unlock(rq, &flags); + + return ns; +} + +/* + * Return sum_exec_runtime for the thread group. + * In case the task is currently running, return the sum plus current's + * pending runtime that have not been accounted yet. + * + * Note that the thread group might have other running tasks as well, + * so the return value not includes other pending runtime that other + * running tasks might have. + */ +unsigned long long thread_group_sched_runtime(struct task_struct *p) +{ + struct task_cputime totals; + unsigned long flags; + struct rq *rq; + u64 ns; + + rq = task_rq_lock(p, &flags); + thread_group_cputime(p, &totals); + ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); task_rq_unlock(rq, &flags); return ns; @@ -4559,6 +4925,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime, cpustat->nice = cputime64_add(cpustat->nice, tmp); else cpustat->user = cputime64_add(cpustat->user, tmp); + + cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime); /* Account for user time used */ acct_update_integrals(p); } @@ -4620,6 +4988,8 @@ void account_system_time(struct task_struct *p, int hardirq_offset, else cpustat->system = cputime64_add(cpustat->system, tmp); + cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); + /* Account for system time used */ acct_update_integrals(p); } @@ -4667,7 +5037,7 @@ void account_process_tick(struct task_struct *p, int user_tick) if (user_tick) account_user_time(p, one_jiffy, one_jiffy_scaled); - else if (p != rq->idle) + else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) account_system_time(p, HARDIRQ_OFFSET, one_jiffy, one_jiffy_scaled); else @@ -4775,16 +5145,15 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); spin_unlock(&rq->lock); + perf_counter_task_tick(curr, cpu); + #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); trigger_load_balance(rq, cpu); #endif } -#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ - defined(CONFIG_PREEMPT_TRACER)) - -static inline unsigned long get_parent_ip(unsigned long addr) +notrace unsigned long get_parent_ip(unsigned long addr) { if (in_lock_functions(addr)) { addr = CALLER_ADDR2; @@ -4794,6 +5163,9 @@ static inline unsigned long get_parent_ip(unsigned long addr) return addr; } +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ + defined(CONFIG_PREEMPT_TRACER)) + void __kprobes add_preempt_count(int val) { #ifdef CONFIG_DEBUG_PREEMPT @@ -4942,13 +5314,15 @@ pick_next_task(struct rq *rq) /* * schedule() is the main scheduler function. */ -asmlinkage void __sched __schedule(void) +asmlinkage void __sched schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; struct rq *rq; int cpu; +need_resched: + preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); rcu_qsctr_inc(cpu); @@ -4988,6 +5362,7 @@ need_resched_nonpreemptible: if (likely(prev != next)) { sched_info_switch(prev, next); + perf_counter_task_sched_out(prev, next, cpu); rq->nr_switches++; rq->curr = next; @@ -5005,15 +5380,9 @@ need_resched_nonpreemptible: if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; -} -asmlinkage void __sched schedule(void) -{ -need_resched: - preempt_disable(); - __schedule(); preempt_enable_no_resched(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + if (need_resched()) goto need_resched; } EXPORT_SYMBOL(schedule); @@ -5156,7 +5525,7 @@ EXPORT_SYMBOL(default_wake_function); * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns * zero in this (rare) case, and we handle it by continuing to scan the queue. */ -void __wake_up_common(wait_queue_head_t *q, unsigned int mode, +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync, void *key) { wait_queue_t *curr, *next; @@ -5176,6 +5545,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode, * @mode: which threads * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: is directly passed to the wakeup function + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) @@ -5196,11 +5568,17 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) __wake_up_common(q, mode, 1, 0, NULL); } +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) +{ + __wake_up_common(q, mode, 1, 0, key); +} + /** - * __wake_up_sync - wake up threads blocked on a waitqueue. + * __wake_up_sync_key - wake up threads blocked on a waitqueue. * @q: the waitqueue * @mode: which threads * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: opaque value to be passed to wakeup targets * * The sync wakeup differs that the waker knows that it will schedule * away soon, so while the target thread will be woken up, it will not @@ -5208,9 +5586,12 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) * with each other. This can prevent needless bouncing between CPUs. * * On UP it can prevent extra preemption. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ -void -__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, void *key) { unsigned long flags; int sync = 1; @@ -5222,9 +5603,18 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) sync = 0; spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, sync, NULL); + __wake_up_common(q, mode, nr_exclusive, sync, key); spin_unlock_irqrestore(&q->lock, flags); } +EXPORT_SYMBOL_GPL(__wake_up_sync_key); + +/* + * __wake_up_sync - see __wake_up_sync_key() + */ +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +{ + __wake_up_sync_key(q, mode, nr_exclusive, NULL); +} EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ /** @@ -5235,6 +5625,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ * awakened in the same order in which they were queued. * * See also complete_all(), wait_for_completion() and related routines. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void complete(struct completion *x) { @@ -5252,6 +5645,9 @@ EXPORT_SYMBOL(complete); * @x: holds the state of this particular completion * * This will wake up all threads waiting on this particular completion event. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void complete_all(struct completion *x) { @@ -6168,6 +6564,11 @@ SYSCALL_DEFINE0(sched_yield) return 0; } +static inline int should_resched(void) +{ + return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); +} + static void __cond_resched(void) { #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP @@ -6187,8 +6588,7 @@ static void __cond_resched(void) int __sched _cond_resched(void) { - if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && - system_state == SYSTEM_RUNNING) { + if (should_resched()) { __cond_resched(); return 1; } @@ -6206,12 +6606,12 @@ EXPORT_SYMBOL(_cond_resched); */ int cond_resched_lock(spinlock_t *lock) { - int resched = need_resched() && system_state == SYSTEM_RUNNING; + int resched = should_resched(); int ret = 0; if (spin_needbreak(lock) || resched) { spin_unlock(lock); - if (resched && need_resched()) + if (resched) __cond_resched(); else cpu_relax(); @@ -6226,7 +6626,7 @@ int __sched cond_resched_softirq(void) { BUG_ON(!in_softirq()); - if (need_resched() && system_state == SYSTEM_RUNNING) { + if (should_resched()) { local_bh_enable(); __cond_resched(); local_bh_disable(); @@ -6410,8 +6810,9 @@ void sched_show_task(struct task_struct *p) #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif - printk(KERN_CONT "%5lu %5d %6d\n", free, - task_pid_nr(p), task_pid_nr(p->real_parent)); + printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, + task_pid_nr(p), task_pid_nr(p->real_parent), + (unsigned long)task_thread_info(p)->flags); show_stack(p, NULL); } @@ -6650,6 +7051,11 @@ fail: return ret; } +#define RCU_MIGRATION_IDLE 0 +#define RCU_MIGRATION_NEED_QS 1 +#define RCU_MIGRATION_GOT_QS 2 +#define RCU_MIGRATION_MUST_SYNC 3 + /* * migration_thread - this is a highprio system thread that performs * thread migration by bumping thread off CPU then 'pushing' onto @@ -6657,6 +7063,7 @@ fail: */ static int migration_thread(void *data) { + int badcpu; int cpu = (long)data; struct rq *rq; @@ -6672,7 +7079,7 @@ static int migration_thread(void *data) if (cpu_is_offline(cpu)) { spin_unlock_irq(&rq->lock); - goto wait_to_die; + break; } if (rq->active_balance) { @@ -6691,23 +7098,23 @@ static int migration_thread(void *data) req = list_entry(head->next, struct migration_req, list); list_del_init(head->next); - spin_unlock(&rq->lock); - __migrate_task(req->task, cpu, req->dest_cpu); + if (req->task != NULL) { + spin_unlock(&rq->lock); + __migrate_task(req->task, cpu, req->dest_cpu); + } else if (likely(cpu == (badcpu = smp_processor_id()))) { + req->dest_cpu = RCU_MIGRATION_GOT_QS; + spin_unlock(&rq->lock); + } else { + req->dest_cpu = RCU_MIGRATION_MUST_SYNC; + spin_unlock(&rq->lock); + WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); + } local_irq_enable(); complete(&req->done); } __set_current_state(TASK_RUNNING); - return 0; -wait_to_die: - /* Wait for kthread_stop */ - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { - schedule(); - set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); return 0; } @@ -6890,6 +7297,15 @@ static void migrate_dead_tasks(unsigned int dead_cpu) } } + +/* + * remove the tasks which were accounted by rq from calc_load_tasks. + */ +static void calc_global_load_remove(struct rq *rq) +{ + atomic_long_sub(rq->calc_load_active, &calc_load_tasks); + rq->calc_load_active = 0; +} #endif /* CONFIG_HOTPLUG_CPU */ #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -7113,7 +7529,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) rq = task_rq_lock(p, &flags); __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); task_rq_unlock(rq, &flags); + get_task_struct(p); cpu_rq(cpu)->migration_thread = p; + rq->calc_load_update = calc_load_update; break; case CPU_ONLINE: @@ -7141,6 +7559,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) kthread_bind(cpu_rq(cpu)->migration_thread, cpumask_any(cpu_online_mask)); kthread_stop(cpu_rq(cpu)->migration_thread); + put_task_struct(cpu_rq(cpu)->migration_thread); cpu_rq(cpu)->migration_thread = NULL; break; @@ -7150,6 +7569,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) migrate_live_tasks(cpu); rq = cpu_rq(cpu); kthread_stop(rq->migration_thread); + put_task_struct(rq->migration_thread); rq->migration_thread = NULL; /* Idle task back to normal (off runqueue, low prio) */ spin_lock_irq(&rq->lock); @@ -7163,7 +7583,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) cpuset_unlock(); migrate_nr_uninterruptible(rq); BUG_ON(rq->nr_running != 0); - + calc_global_load_remove(rq); /* * No need to migrate the tasks: it was best-effort if * they didn't take sched_hotcpu_mutex. Just wake up @@ -7199,8 +7619,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_OK; } -/* Register at highest priority so that task migration (migrate_all_tasks) - * happens before everything else. +/* + * Register at high priority so that task migration (migrate_all_tasks) + * happens before everything else. This has to be lower priority than + * the notifier in the perf_counter subsystem, though. */ static struct notifier_block __cpuinitdata migration_notifier = { .notifier_call = migration_call, @@ -7287,7 +7709,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_or(groupmask, groupmask, sched_group_cpus(group)); cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); + printk(KERN_CONT " %s", str); + if (group->__cpu_power != SCHED_LOAD_SCALE) { + printk(KERN_CONT " (__cpu_power = %d)", + group->__cpu_power); + } group = group->next; } while (group != sd->groups); @@ -7438,26 +7865,23 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) free_rootdomain(old_rd); } -static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) +static int init_rootdomain(struct root_domain *rd, bool bootmem) { + gfp_t gfp = GFP_KERNEL; + memset(rd, 0, sizeof(*rd)); - if (bootmem) { - alloc_bootmem_cpumask_var(&def_root_domain.span); - alloc_bootmem_cpumask_var(&def_root_domain.online); - alloc_bootmem_cpumask_var(&def_root_domain.rto_mask); - cpupri_init(&rd->cpupri, true); - return 0; - } + if (bootmem) + gfp = GFP_NOWAIT; - if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) + if (!alloc_cpumask_var(&rd->span, gfp)) goto out; - if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) + if (!alloc_cpumask_var(&rd->online, gfp)) goto free_span; - if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + if (!alloc_cpumask_var(&rd->rto_mask, gfp)) goto free_online; - if (cpupri_init(&rd->cpupri, false) != 0) + if (cpupri_init(&rd->cpupri, bootmem) != 0) goto free_rto_mask; return 0; @@ -7668,8 +8092,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; /* * The cpus mask in sched_group and sched_domain hangs off the end. - * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space - * for nr_cpu_ids < CONFIG_NR_CPUS. + * + * ( See the the comments in include/linux/sched.h:struct sched_group + * and struct sched_domain. ) */ struct static_sched_group { struct sched_group sg; @@ -7713,7 +8138,7 @@ cpu_to_core_group(int cpu, const struct cpumask *cpu_map, { int group; - cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); group = cpumask_first(mask); if (sg) *sg = &per_cpu(sched_group_core, group).sg; @@ -7742,7 +8167,7 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); group = cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) - cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); group = cpumask_first(mask); #else group = cpu; @@ -7790,7 +8215,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) struct sched_domain *sd; sd = &per_cpu(phys_domains, j).sd; - if (j != cpumask_first(sched_group_cpus(sd->groups))) { + if (j != group_first_cpu(sd->groups)) { /* * Only add "power" once for each * physical package. @@ -7868,7 +8293,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) WARN_ON(!sd || !sd->groups); - if (cpu != cpumask_first(sched_group_cpus(sd->groups))) + if (cpu != group_first_cpu(sd->groups)) return; child = sd->child; @@ -8085,7 +8510,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr); cpumask_and(sched_domain_span(sd), - &per_cpu(cpu_sibling_map, i), cpu_map); + topology_thread_cpumask(i), cpu_map); sd->parent = p; p->child = sd; cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); @@ -8096,7 +8521,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, /* Set up CPU (sibling) groups */ for_each_cpu(i, cpu_map) { cpumask_and(this_sibling_map, - &per_cpu(cpu_sibling_map, i), cpu_map); + topology_thread_cpumask(i), cpu_map); if (i != cpumask_first(this_sibling_map)) continue; @@ -8646,6 +9071,8 @@ void __init sched_init_smp(void) } #endif /* CONFIG_SMP */ +const_debug unsigned int sysctl_timer_migration = 1; + int in_sched_functions(unsigned long addr) { return in_lock_functions(addr) || @@ -8685,7 +9112,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) #ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; - plist_head_init(&rq->rt.pushable_tasks, &rq->lock); + plist_head_init(&rt_rq->pushable_tasks, &rq->lock); #endif rt_rq->rt_time = 0; @@ -8772,12 +9199,15 @@ void __init sched_init(void) #ifdef CONFIG_USER_SCHED alloc_size *= 2; #endif +#ifdef CONFIG_CPUMASK_OFFSTACK + alloc_size += num_possible_cpus() * cpumask_size(); +#endif /* * As sched_init() is called before page_alloc is setup, * we use alloc_bootmem(). */ if (alloc_size) { - ptr = (unsigned long)alloc_bootmem(alloc_size); + ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); #ifdef CONFIG_FAIR_GROUP_SCHED init_task_group.se = (struct sched_entity **)ptr; @@ -8809,6 +9239,12 @@ void __init sched_init(void) ptr += nr_cpu_ids * sizeof(void **); #endif /* CONFIG_USER_SCHED */ #endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_CPUMASK_OFFSTACK + for_each_possible_cpu(i) { + per_cpu(load_balance_tmpmask, i) = (void *)ptr; + ptr += cpumask_size(); + } +#endif /* CONFIG_CPUMASK_OFFSTACK */ } #ifdef CONFIG_SMP @@ -8844,6 +9280,8 @@ void __init sched_init(void) rq = cpu_rq(i); spin_lock_init(&rq->lock); rq->nr_running = 0; + rq->calc_load_active = 0; + rq->calc_load_update = jiffies + LOAD_FREQ; init_cfs_rq(&rq->cfs, rq); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -8864,7 +9302,7 @@ void __init sched_init(void) * 1024) and two child groups A0 and A1 (of weight 1024 each), * then A0's share of the cpu resource is: * - * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% + * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% * * We achieve this by letting init_task_group's tasks sit * directly in rq->cfs (i.e init_task_group->se[] = NULL). @@ -8951,20 +9389,26 @@ void __init sched_init(void) * when this runqueue becomes "idle". */ init_idle(current, smp_processor_id()); + + calc_load_update = jiffies + LOAD_FREQ; + /* * During early bootup we pretend to be a normal task: */ current->sched_class = &fair_sched_class; /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ - alloc_bootmem_cpumask_var(&nohz_cpu_mask); + alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ - alloc_bootmem_cpumask_var(&nohz.cpu_mask); + alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); + alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); #endif - alloc_bootmem_cpumask_var(&cpu_isolated_map); + alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); #endif /* SMP */ + perf_counter_init(); + scheduler_running = 1; } @@ -9706,6 +10150,13 @@ static int sched_rt_global_constraints(void) if (sysctl_sched_rt_period <= 0) return -EINVAL; + /* + * There's always some RT tasks in the root group + * -- migration, kstopmachine etc.. + */ + if (sysctl_sched_rt_runtime == 0) + return -EBUSY; + spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { struct rt_rq *rt_rq = &cpu_rq(i)->rt; @@ -9901,6 +10352,7 @@ struct cpuacct { struct cgroup_subsys_state css; /* cpuusage holds pointer to a u64-type object on every cpu */ u64 *cpuusage; + struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; struct cpuacct *parent; }; @@ -9925,20 +10377,32 @@ static struct cgroup_subsys_state *cpuacct_create( struct cgroup_subsys *ss, struct cgroup *cgrp) { struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); + int i; if (!ca) - return ERR_PTR(-ENOMEM); + goto out; ca->cpuusage = alloc_percpu(u64); - if (!ca->cpuusage) { - kfree(ca); - return ERR_PTR(-ENOMEM); - } + if (!ca->cpuusage) + goto out_free_ca; + + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) + if (percpu_counter_init(&ca->cpustat[i], 0)) + goto out_free_counters; if (cgrp->parent) ca->parent = cgroup_ca(cgrp->parent); return &ca->css; + +out_free_counters: + while (--i >= 0) + percpu_counter_destroy(&ca->cpustat[i]); + free_percpu(ca->cpuusage); +out_free_ca: + kfree(ca); +out: + return ERR_PTR(-ENOMEM); } /* destroy an existing cpu accounting group */ @@ -9946,7 +10410,10 @@ static void cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) { struct cpuacct *ca = cgroup_ca(cgrp); + int i; + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) + percpu_counter_destroy(&ca->cpustat[i]); free_percpu(ca->cpuusage); kfree(ca); } @@ -10033,6 +10500,25 @@ static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, return 0; } +static const char *cpuacct_stat_desc[] = { + [CPUACCT_STAT_USER] = "user", + [CPUACCT_STAT_SYSTEM] = "system", +}; + +static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int i; + + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { + s64 val = percpu_counter_read(&ca->cpustat[i]); + val = cputime64_to_clock_t(val); + cb->fill(cb, cpuacct_stat_desc[i], val); + } + return 0; +} + static struct cftype files[] = { { .name = "usage", @@ -10043,7 +10529,10 @@ static struct cftype files[] = { .name = "usage_percpu", .read_seq_string = cpuacct_percpu_seq_read, }, - + { + .name = "stat", + .read_map = cpuacct_stats_show, + }, }; static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) @@ -10065,12 +10554,38 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) return; cpu = task_cpu(tsk); + + rcu_read_lock(); + ca = task_ca(tsk); for (; ca; ca = ca->parent) { u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); *cpuusage += cputime; } + + rcu_read_unlock(); +} + +/* + * Charge the system/user time to the task's accounting group. + */ +static void cpuacct_update_stats(struct task_struct *tsk, + enum cpuacct_stat_index idx, cputime_t val) +{ + struct cpuacct *ca; + + if (unlikely(!cpuacct_subsys.active)) + return; + + rcu_read_lock(); + ca = task_ca(tsk); + + do { + percpu_counter_add(&ca->cpustat[idx], val); + ca = ca->parent; + } while (ca); + rcu_read_unlock(); } struct cgroup_subsys cpuacct_subsys = { @@ -10081,3 +10596,113 @@ struct cgroup_subsys cpuacct_subsys = { .subsys_id = cpuacct_subsys_id, }; #endif /* CONFIG_CGROUP_CPUACCT */ + +#ifndef CONFIG_SMP + +int rcu_expedited_torture_stats(char *page) +{ + return 0; +} +EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); + +void synchronize_sched_expedited(void) +{ +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#else /* #ifndef CONFIG_SMP */ + +static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); +static DEFINE_MUTEX(rcu_sched_expedited_mutex); + +#define RCU_EXPEDITED_STATE_POST -2 +#define RCU_EXPEDITED_STATE_IDLE -1 + +static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; + +int rcu_expedited_torture_stats(char *page) +{ + int cnt = 0; + int cpu; + + cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); + for_each_online_cpu(cpu) { + cnt += sprintf(&page[cnt], " %d:%d", + cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); + } + cnt += sprintf(&page[cnt], "\n"); + return cnt; +} +EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); + +static long synchronize_sched_expedited_count; + +/* + * Wait for an rcu-sched grace period to elapse, but use "big hammer" + * approach to force grace period to end quickly. This consumes + * significant time on all CPUs, and is thus not recommended for + * any sort of common-case code. + * + * Note that it is illegal to call this function while holding any + * lock that is acquired by a CPU-hotplug notifier. Failing to + * observe this restriction will result in deadlock. + */ +void synchronize_sched_expedited(void) +{ + int cpu; + unsigned long flags; + bool need_full_sync = 0; + struct rq *rq; + struct migration_req *req; + long snap; + int trycount = 0; + + smp_mb(); /* ensure prior mod happens before capturing snap. */ + snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; + get_online_cpus(); + while (!mutex_trylock(&rcu_sched_expedited_mutex)) { + put_online_cpus(); + if (trycount++ < 10) + udelay(trycount * num_online_cpus()); + else { + synchronize_sched(); + return; + } + if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { + smp_mb(); /* ensure test happens before caller kfree */ + return; + } + get_online_cpus(); + } + rcu_expedited_state = RCU_EXPEDITED_STATE_POST; + for_each_online_cpu(cpu) { + rq = cpu_rq(cpu); + req = &per_cpu(rcu_migration_req, cpu); + init_completion(&req->done); + req->task = NULL; + req->dest_cpu = RCU_MIGRATION_NEED_QS; + spin_lock_irqsave(&rq->lock, flags); + list_add(&req->list, &rq->migration_queue); + spin_unlock_irqrestore(&rq->lock, flags); + wake_up_process(rq->migration_thread); + } + for_each_online_cpu(cpu) { + rcu_expedited_state = cpu; + req = &per_cpu(rcu_migration_req, cpu); + rq = cpu_rq(cpu); + wait_for_completion(&req->done); + spin_lock_irqsave(&rq->lock, flags); + if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) + need_full_sync = 1; + req->dest_cpu = RCU_MIGRATION_IDLE; + spin_unlock_irqrestore(&rq->lock, flags); + } + rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; + mutex_unlock(&rcu_sched_expedited_mutex); + put_online_cpus(); + if (need_full_sync) + synchronize_sched(); +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#endif /* #else #ifndef CONFIG_SMP */