perf_counter: Increase mmap limit
[safe/jmp/linux-2.6] / kernel / sched.c
index 0de2f81..ad079f0 100644 (file)
@@ -39,6 +39,7 @@
 #include <linux/completion.h>
 #include <linux/kernel_stat.h>
 #include <linux/debug_locks.h>
+#include <linux/perf_counter.h>
 #include <linux/security.h>
 #include <linux/notifier.h>
 #include <linux/profile.h>
@@ -231,13 +232,20 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 
        spin_lock(&rt_b->rt_runtime_lock);
        for (;;) {
+               unsigned long delta;
+               ktime_t soft, hard;
+
                if (hrtimer_active(&rt_b->rt_period_timer))
                        break;
 
                now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
                hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
-               hrtimer_start_expires(&rt_b->rt_period_timer,
-                               HRTIMER_MODE_ABS);
+
+               soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
+               hard = hrtimer_get_expires(&rt_b->rt_period_timer);
+               delta = ktime_to_ns(ktime_sub(hard, soft));
+               __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
+                               HRTIMER_MODE_ABS, 0);
        }
        spin_unlock(&rt_b->rt_runtime_lock);
 }
@@ -1147,7 +1155,8 @@ static __init void init_hrtick(void)
  */
 static void hrtick_start(struct rq *rq, u64 delay)
 {
-       hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
+       __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
+                       HRTIMER_MODE_REL, 0);
 }
 
 static inline void init_hrtick(void)
@@ -1411,10 +1420,22 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
                   struct rq_iterator *iterator);
 #endif
 
+/* Time spent by the tasks of the cpu accounting group executing in ... */
+enum cpuacct_stat_index {
+       CPUACCT_STAT_USER,      /* ... user mode */
+       CPUACCT_STAT_SYSTEM,    /* ... kernel mode */
+
+       CPUACCT_STAT_NSTATS,
+};
+
 #ifdef CONFIG_CGROUP_CPUACCT
 static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+static void cpuacct_update_stats(struct task_struct *tsk,
+               enum cpuacct_stat_index idx, cputime_t val);
 #else
 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
+static inline void cpuacct_update_stats(struct task_struct *tsk,
+               enum cpuacct_stat_index idx, cputime_t val) {}
 #endif
 
 static inline void inc_cpu_load(struct rq *rq, unsigned long load)
@@ -4539,49 +4560,75 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
 EXPORT_PER_CPU_SYMBOL(kstat);
 
 /*
- * Return any ns on the sched_clock that have not yet been banked in
+ * Return any ns on the sched_clock that have not yet been accounted in
  * @p in case that task is currently running.
+ *
+ * Called with task_rq_lock() held on @rq.
  */
-unsigned long long __task_delta_exec(struct task_struct *p, int update)
+static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
 {
-       s64 delta_exec;
-       struct rq *rq;
-
-       rq = task_rq(p);
-       WARN_ON_ONCE(!runqueue_is_locked());
-       WARN_ON_ONCE(!task_current(rq, p));
+       u64 ns = 0;
 
-       if (update)
+       if (task_current(rq, p)) {
                update_rq_clock(rq);
+               ns = rq->clock - p->se.exec_start;
+               if ((s64)ns < 0)
+                       ns = 0;
+       }
 
-       delta_exec = rq->clock - p->se.exec_start;
+       return ns;
+}
 
-       WARN_ON_ONCE(delta_exec < 0);
+unsigned long long task_delta_exec(struct task_struct *p)
+{
+       unsigned long flags;
+       struct rq *rq;
+       u64 ns = 0;
 
-       return delta_exec;
+       rq = task_rq_lock(p, &flags);
+       ns = do_task_delta_exec(p, rq);
+       task_rq_unlock(rq, &flags);
+
+       return ns;
 }
 
 /*
- * Return any ns on the sched_clock that have not yet been banked in
- * @p in case that task is currently running.
+ * Return accounted runtime for the task.
+ * In case the task is currently running, return the runtime plus current's
+ * pending runtime that has not been accounted yet.
  */
-unsigned long long task_delta_exec(struct task_struct *p)
+unsigned long long task_sched_runtime(struct task_struct *p)
 {
        unsigned long flags;
        struct rq *rq;
        u64 ns = 0;
 
        rq = task_rq_lock(p, &flags);
+       ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
+       task_rq_unlock(rq, &flags);
 
-       if (task_current(rq, p)) {
-               u64 delta_exec;
+       return ns;
+}
 
-               update_rq_clock(rq);
-               delta_exec = rq->clock - p->se.exec_start;
-               if ((s64)delta_exec > 0)
-                       ns = delta_exec;
-       }
+/*
+ * Return sum_exec_runtime for the thread group.
+ * In case the task is currently running, return the sum plus current's
+ * pending runtime that has not been accounted yet.
+ *
+ * Note that the thread group might have other running tasks as well,
+ * so the return value does not include the pending runtime that those
+ * other running tasks might have.
+ */
+unsigned long long thread_group_sched_runtime(struct task_struct *p)
+{
+       struct task_cputime totals;
+       unsigned long flags;
+       struct rq *rq;
+       u64 ns;
 
+       rq = task_rq_lock(p, &flags);
+       thread_group_cputime(p, &totals);
+       ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
        task_rq_unlock(rq, &flags);
 
        return ns;
@@ -4610,6 +4657,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
                cpustat->nice = cputime64_add(cpustat->nice, tmp);
        else
                cpustat->user = cputime64_add(cpustat->user, tmp);
+
+       cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
        /* Account for user time used */
        acct_update_integrals(p);
 }
@@ -4671,6 +4720,8 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
        else
                cpustat->system = cputime64_add(cpustat->system, tmp);
 
+       cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+
        /* Account for system time used */
        acct_update_integrals(p);
 }
@@ -4718,7 +4769,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
 
        if (user_tick)
                account_user_time(p, one_jiffy, one_jiffy_scaled);
-       else if (p != rq->idle)
+       else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
                account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
                                    one_jiffy_scaled);
        else
@@ -4824,16 +4875,17 @@ void scheduler_tick(void)
        update_rq_clock(rq);
        update_cpu_load(rq);
        curr->sched_class->task_tick(rq, curr, 0);
-       perf_counter_task_tick(curr, cpu);
        spin_unlock(&rq->lock);
 
+       perf_counter_task_tick(curr, cpu);
+
 #ifdef CONFIG_SMP
        rq->idle_at_tick = idle_cpu(cpu);
        trigger_load_balance(rq, cpu);
 #endif
 }
 
-unsigned long get_parent_ip(unsigned long addr)
+notrace unsigned long get_parent_ip(unsigned long addr)
 {
        if (in_lock_functions(addr)) {
                addr = CALLER_ADDR2;
@@ -5040,7 +5092,7 @@ need_resched_nonpreemptible:
 
        if (likely(prev != next)) {
                sched_info_switch(prev, next);
-               perf_counter_task_sched_out(prev, cpu);
+               perf_counter_task_sched_out(prev, next, cpu);
 
                rq->nr_switches++;
                rq->curr = next;
@@ -7355,7 +7407,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                cpumask_or(groupmask, groupmask, sched_group_cpus(group));
 
                cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
+
                printk(KERN_CONT " %s", str);
+               if (group->__cpu_power != SCHED_LOAD_SCALE) {
+                       printk(KERN_CONT " (__cpu_power = %d)",
+                               group->__cpu_power);
+               }
 
                group = group->next;
        } while (group != sd->groups);
@@ -8941,7 +8998,7 @@ void __init sched_init(void)
                 * 1024) and two child groups A0 and A1 (of weight 1024 each),
                 * then A0's share of the cpu resource is:
                 *
-                *      A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
+                *      A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
                 *
                 * We achieve this by letting init_task_group's tasks sit
                 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
@@ -9042,6 +9099,8 @@ void __init sched_init(void)
        alloc_bootmem_cpumask_var(&cpu_isolated_map);
 #endif /* SMP */
 
+       perf_counter_init();
+
        scheduler_running = 1;
 }
 
@@ -9978,6 +10037,7 @@ struct cpuacct {
        struct cgroup_subsys_state css;
        /* cpuusage holds pointer to a u64-type object on every cpu */
        u64 *cpuusage;
+       struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
        struct cpuacct *parent;
 };
 
@@ -10002,20 +10062,32 @@ static struct cgroup_subsys_state *cpuacct_create(
        struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
        struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+       int i;
 
        if (!ca)
-               return ERR_PTR(-ENOMEM);
+               goto out;
 
        ca->cpuusage = alloc_percpu(u64);
-       if (!ca->cpuusage) {
-               kfree(ca);
-               return ERR_PTR(-ENOMEM);
-       }
+       if (!ca->cpuusage)
+               goto out_free_ca;
+
+       for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+               if (percpu_counter_init(&ca->cpustat[i], 0))
+                       goto out_free_counters;
 
        if (cgrp->parent)
                ca->parent = cgroup_ca(cgrp->parent);
 
        return &ca->css;
+
+out_free_counters:
+       while (--i >= 0)
+               percpu_counter_destroy(&ca->cpustat[i]);
+       free_percpu(ca->cpuusage);
+out_free_ca:
+       kfree(ca);
+out:
+       return ERR_PTR(-ENOMEM);
 }
 
 /* destroy an existing cpu accounting group */
@@ -10023,7 +10095,10 @@ static void
 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
        struct cpuacct *ca = cgroup_ca(cgrp);
+       int i;
 
+       for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+               percpu_counter_destroy(&ca->cpustat[i]);
        free_percpu(ca->cpuusage);
        kfree(ca);
 }
@@ -10110,6 +10185,25 @@ static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
        return 0;
 }
 
+static const char *cpuacct_stat_desc[] = {
+       [CPUACCT_STAT_USER] = "user",
+       [CPUACCT_STAT_SYSTEM] = "system",
+};
+
+static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
+               struct cgroup_map_cb *cb)
+{
+       struct cpuacct *ca = cgroup_ca(cgrp);
+       int i;
+
+       for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
+               s64 val = percpu_counter_read(&ca->cpustat[i]);
+               val = cputime64_to_clock_t(val);
+               cb->fill(cb, cpuacct_stat_desc[i], val);
+       }
+       return 0;
+}
+
 static struct cftype files[] = {
        {
                .name = "usage",
@@ -10120,7 +10214,10 @@ static struct cftype files[] = {
                .name = "usage_percpu",
                .read_seq_string = cpuacct_percpu_seq_read,
        },
-
+       {
+               .name = "stat",
+               .read_map = cpuacct_stats_show,
+       },
 };
 
 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -10142,12 +10239,38 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
                return;
 
        cpu = task_cpu(tsk);
+
+       rcu_read_lock();
+
        ca = task_ca(tsk);
 
        for (; ca; ca = ca->parent) {
                u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
                *cpuusage += cputime;
        }
+
+       rcu_read_unlock();
+}
+
+/*
+ * Charge system/user time to the task's accounting group and its ancestors.
+ */
+static void cpuacct_update_stats(struct task_struct *tsk,
+               enum cpuacct_stat_index idx, cputime_t val)
+{
+       struct cpuacct *ca;
+
+       if (unlikely(!cpuacct_subsys.active))
+               return;
+
+       rcu_read_lock();
+       ca = task_ca(tsk);
+
+       do {
+               percpu_counter_add(&ca->cpustat[idx], val);
+               ca = ca->parent;
+       } while (ca);
+       rcu_read_unlock();
 }
 
 struct cgroup_subsys cpuacct_subsys = {