X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=kernel%2Fperf_counter.c;h=534e20d14d631b44cefc136f99200b2257af378f;hb=8cf4e750f8459d51c2e8a035a201da4bf7aa996a;hp=367299f91aaff21fe9948035a43f27bd5ee7a4b5;hpb=329d876d6fd326109f191ae0fb2798b8834fb70b;p=safe%2Fjmp%2Flinux-2.6 diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 367299f..534e20d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -16,8 +16,9 @@ #include #include #include -#include +#include #include +#include #include #include #include @@ -26,7 +27,6 @@ #include #include #include -#include #include @@ -40,13 +40,36 @@ static int perf_reserved_percpu __read_mostly; static int perf_overcommit __read_mostly = 1; static atomic_t nr_counters __read_mostly; -static atomic_t nr_mmap_tracking __read_mostly; -static atomic_t nr_munmap_tracking __read_mostly; -static atomic_t nr_comm_tracking __read_mostly; +static atomic_t nr_mmap_counters __read_mostly; +static atomic_t nr_comm_counters __read_mostly; +static atomic_t nr_task_counters __read_mostly; + +/* + * perf counter paranoia level: + * 0 - not paranoid + * 1 - disallow cpu counters to unpriv + * 2 - disallow kernel profiling to unpriv + */ +int sysctl_perf_counter_paranoid __read_mostly; + +static inline bool perf_paranoid_cpu(void) +{ + return sysctl_perf_counter_paranoid > 0; +} + +static inline bool perf_paranoid_kernel(void) +{ + return sysctl_perf_counter_paranoid > 1; +} -int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ -int sysctl_perf_counter_limit __read_mostly = 100000; /* max NMIs per second */ + +/* + * max perf counter sample rate + */ +int sysctl_perf_counter_sample_rate __read_mostly = 100000; + +static atomic64_t perf_counter_id; /* * Lock for (sysadmin-configurable) counter reservations: @@ -65,7 +88,10 @@ void __weak hw_perf_disable(void) { barrier(); } void __weak hw_perf_enable(void) { barrier(); } void __weak hw_perf_counter_setup(int cpu) { barrier(); } -int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, +void __weak hw_perf_counter_setup_online(int cpu) { barrier(); } + +int __weak +hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx, int cpu) { @@ -100,7 +126,15 @@ void perf_enable(void) static void get_ctx(struct perf_counter_context *ctx) { - atomic_inc(&ctx->refcount); + WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); +} + +static void free_ctx(struct rcu_head *head) +{ + struct perf_counter_context *ctx; + + ctx = container_of(head, struct perf_counter_context, rcu_head); + kfree(ctx); } static void put_ctx(struct perf_counter_context *ctx) @@ -108,8 +142,99 @@ static void put_ctx(struct perf_counter_context *ctx) if (atomic_dec_and_test(&ctx->refcount)) { if (ctx->parent_ctx) put_ctx(ctx->parent_ctx); - kfree(ctx); + if (ctx->task) + put_task_struct(ctx->task); + call_rcu(&ctx->rcu_head, free_ctx); + } +} + +static void unclone_ctx(struct perf_counter_context *ctx) +{ + if (ctx->parent_ctx) { + put_ctx(ctx->parent_ctx); + ctx->parent_ctx = NULL; + } +} + +/* + * If we inherit counters we want to return the parent counter id + * to userspace. + */ +static u64 primary_counter_id(struct perf_counter *counter) +{ + u64 id = counter->id; + + if (counter->parent) + id = counter->parent->id; + + return id; +} + +/* + * Get the perf_counter_context for a task and lock it. + * This has to cope with with the fact that until it is locked, + * the context could get moved to another task. + */ +static struct perf_counter_context * +perf_lock_task_context(struct task_struct *task, unsigned long *flags) +{ + struct perf_counter_context *ctx; + + rcu_read_lock(); + retry: + ctx = rcu_dereference(task->perf_counter_ctxp); + if (ctx) { + /* + * If this context is a clone of another, it might + * get swapped for another underneath us by + * perf_counter_task_sched_out, though the + * rcu_read_lock() protects us from any context + * getting freed. Lock the context and check if it + * got swapped before we could get the lock, and retry + * if so. If we locked the right context, then it + * can't get swapped on us any more. + */ + spin_lock_irqsave(&ctx->lock, *flags); + if (ctx != rcu_dereference(task->perf_counter_ctxp)) { + spin_unlock_irqrestore(&ctx->lock, *flags); + goto retry; + } + + if (!atomic_inc_not_zero(&ctx->refcount)) { + spin_unlock_irqrestore(&ctx->lock, *flags); + ctx = NULL; + } + } + rcu_read_unlock(); + return ctx; +} + +/* + * Get the context for a task and increment its pin_count so it + * can't get swapped to another task. This also increments its + * reference count so that the context can't get freed. + */ +static struct perf_counter_context *perf_pin_task_context(struct task_struct *task) +{ + struct perf_counter_context *ctx; + unsigned long flags; + + ctx = perf_lock_task_context(task, &flags); + if (ctx) { + ++ctx->pin_count; + spin_unlock_irqrestore(&ctx->lock, flags); } + return ctx; +} + +static void perf_unpin_context(struct perf_counter_context *ctx) +{ + unsigned long flags; + + spin_lock_irqsave(&ctx->lock, flags); + --ctx->pin_count; + spin_unlock_irqrestore(&ctx->lock, flags); + put_ctx(ctx); } /* @@ -135,6 +260,8 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) list_add_rcu(&counter->event_entry, &ctx->event_list); ctx->nr_counters++; + if (counter->attr.inherit_stat) + ctx->nr_stat++; } /* @@ -149,6 +276,8 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) if (list_empty(&counter->list_entry)) return; ctx->nr_counters--; + if (counter->attr.inherit_stat) + ctx->nr_stat--; list_del_init(&counter->list_entry); list_del_rcu(&counter->event_entry); @@ -178,6 +307,10 @@ counter_sched_out(struct perf_counter *counter, return; counter->state = PERF_COUNTER_STATE_INACTIVE; + if (counter->pending_disable) { + counter->pending_disable = 0; + counter->state = PERF_COUNTER_STATE_OFF; + } counter->tstamp_stopped = ctx->time; counter->pmu->disable(counter); counter->oncpu = -1; @@ -185,7 +318,7 @@ counter_sched_out(struct perf_counter *counter, if (!is_software_counter(counter)) cpuctx->active_oncpu--; ctx->nr_active--; - if (counter->hw_event.exclusive || !cpuctx->active_oncpu) + if (counter->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; } @@ -207,27 +340,11 @@ group_sched_out(struct perf_counter *group_counter, list_for_each_entry(counter, &group_counter->sibling_list, list_entry) counter_sched_out(counter, cpuctx, ctx); - if (group_counter->hw_event.exclusive) + if (group_counter->attr.exclusive) cpuctx->exclusive = 0; } /* - * Mark this context as not being a clone of another. - * Called when counters are added to or removed from this context. - * We also increment our generation number so that anything that - * was cloned from this context before this will not match anything - * cloned from this context after this. - */ -static void unclone_ctx(struct perf_counter_context *ctx) -{ - ++ctx->generation; - if (!ctx->parent_ctx) - return; - put_ctx(ctx->parent_ctx); - ctx->parent_ctx = NULL; -} - -/* * Cross CPU call to remove a performance counter * * We disable the counter on the hardware level first. After that we @@ -238,7 +355,6 @@ static void __perf_counter_remove_from_context(void *info) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; - unsigned long flags; /* * If this is a task context, we need to check whether it is @@ -248,7 +364,7 @@ static void __perf_counter_remove_from_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock_irqsave(&ctx->lock, flags); + spin_lock(&ctx->lock); /* * Protect the list operation against NMI by disabling the * counters on a global level. @@ -270,7 +386,7 @@ static void __perf_counter_remove_from_context(void *info) } perf_enable(); - spin_unlock_irqrestore(&ctx->lock, flags); + spin_unlock(&ctx->lock); } @@ -281,13 +397,19 @@ static void __perf_counter_remove_from_context(void *info) * * CPU counters are removed with a smp call. For task counters we only * call when the task is on a CPU. + * + * If counter->ctx is a cloned context, callers must make sure that + * every task struct that counter->ctx->task could possibly point to + * remains valid. This is OK when called from perf_release since + * that only calls us on the top-level context, which can't be a clone. + * When called from perf_counter_exit_task, it's OK because the + * context has been detached from its task. */ static void perf_counter_remove_from_context(struct perf_counter *counter) { struct perf_counter_context *ctx = counter->ctx; struct task_struct *task = ctx->task; - unclone_ctx(ctx); if (!task) { /* * Per cpu counters are removed via an smp call and @@ -380,7 +502,6 @@ static void __perf_counter_disable(void *info) struct perf_counter *counter = info; struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter_context *ctx = counter->ctx; - unsigned long flags; /* * If this is a per-task counter, need to check whether this @@ -389,7 +510,7 @@ static void __perf_counter_disable(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock_irqsave(&ctx->lock, flags); + spin_lock(&ctx->lock); /* * If the counter is on, turn it off. @@ -405,11 +526,21 @@ static void __perf_counter_disable(void *info) counter->state = PERF_COUNTER_STATE_OFF; } - spin_unlock_irqrestore(&ctx->lock, flags); + spin_unlock(&ctx->lock); } /* * Disable a counter. + * + * If counter->ctx is a cloned context, callers must make sure that + * every task struct that counter->ctx->task could possibly point to + * remains valid. This condition is satisifed when called through + * perf_counter_for_each_child or perf_counter_for_each because they + * hold the top-level counter's child_mutex, so any descendant that + * goes to exit will block in sync_child_counter. + * When called from perf_pending_counter it's OK because counter->ctx + * is the current context on this CPU and preemption is disabled, + * hence we can't get into perf_counter_task_sched_out for this context. */ static void perf_counter_disable(struct perf_counter *counter) { @@ -477,7 +608,7 @@ counter_sched_in(struct perf_counter *counter, cpuctx->active_oncpu++; ctx->nr_active++; - if (counter->hw_event.exclusive) + if (counter->attr.exclusive) cpuctx->exclusive = 1; return 0; @@ -499,7 +630,6 @@ group_sched_in(struct perf_counter *group_counter, if (ret) return ret < 0 ? ret : 0; - group_counter->prev_state = group_counter->state; if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) return -EAGAIN; @@ -507,7 +637,6 @@ group_sched_in(struct perf_counter *group_counter, * Schedule in siblings as one group (if any): */ list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { - counter->prev_state = counter->state; if (counter_sched_in(counter, cpuctx, ctx, cpu)) { partial_group = counter; goto group_error; @@ -571,7 +700,7 @@ static int group_can_go_on(struct perf_counter *counter, * If this group is exclusive and there are already * counters on the CPU, it can't go on. */ - if (counter->hw_event.exclusive && cpuctx->active_oncpu) + if (counter->attr.exclusive && cpuctx->active_oncpu) return 0; /* * Otherwise, try to add it if all previous groups were able @@ -584,7 +713,6 @@ static void add_counter_to_ctx(struct perf_counter *counter, struct perf_counter_context *ctx) { list_add_counter(counter, ctx); - counter->prev_state = PERF_COUNTER_STATE_OFF; counter->tstamp_enabled = ctx->time; counter->tstamp_running = ctx->time; counter->tstamp_stopped = ctx->time; @@ -602,7 +730,6 @@ static void __perf_install_in_context(void *info) struct perf_counter_context *ctx = counter->ctx; struct perf_counter *leader = counter->group_leader; int cpu = smp_processor_id(); - unsigned long flags; int err; /* @@ -618,7 +745,7 @@ static void __perf_install_in_context(void *info) cpuctx->task_ctx = ctx; } - spin_lock_irqsave(&ctx->lock, flags); + spin_lock(&ctx->lock); ctx->is_active = 1; update_context_time(ctx); @@ -656,7 +783,7 @@ static void __perf_install_in_context(void *info) */ if (leader != counter) group_sched_out(leader, cpuctx, ctx); - if (leader->hw_event.pinned) { + if (leader->attr.pinned) { update_group_times(leader); leader->state = PERF_COUNTER_STATE_ERROR; } @@ -668,7 +795,7 @@ static void __perf_install_in_context(void *info) unlock: perf_enable(); - spin_unlock_irqrestore(&ctx->lock, flags); + spin_unlock(&ctx->lock); } /* @@ -732,7 +859,6 @@ static void __perf_counter_enable(void *info) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter_context *ctx = counter->ctx; struct perf_counter *leader = counter->group_leader; - unsigned long flags; int err; /* @@ -745,11 +871,10 @@ static void __perf_counter_enable(void *info) cpuctx->task_ctx = ctx; } - spin_lock_irqsave(&ctx->lock, flags); + spin_lock(&ctx->lock); ctx->is_active = 1; update_context_time(ctx); - counter->prev_state = counter->state; if (counter->state >= PERF_COUNTER_STATE_INACTIVE) goto unlock; counter->state = PERF_COUNTER_STATE_INACTIVE; @@ -782,18 +907,24 @@ static void __perf_counter_enable(void *info) */ if (leader != counter) group_sched_out(leader, cpuctx, ctx); - if (leader->hw_event.pinned) { + if (leader->attr.pinned) { update_group_times(leader); leader->state = PERF_COUNTER_STATE_ERROR; } } unlock: - spin_unlock_irqrestore(&ctx->lock, flags); + spin_unlock(&ctx->lock); } /* * Enable a counter. + * + * If counter->ctx is a cloned context, callers must make sure that + * every task struct that counter->ctx->task could possibly point to + * remains valid. This condition is satisfied when called through + * perf_counter_for_each_child or perf_counter_for_each as described + * for perf_counter_disable. */ static void perf_counter_enable(struct perf_counter *counter) { @@ -854,7 +985,7 @@ static int perf_counter_refresh(struct perf_counter *counter, int refresh) /* * not supported on inherited counters */ - if (counter->hw_event.inherit) + if (counter->attr.inherit) return -EINVAL; atomic_add(refresh, &counter->event_limit); @@ -903,7 +1034,83 @@ static int context_equiv(struct perf_counter_context *ctx1, struct perf_counter_context *ctx2) { return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx - && ctx1->parent_gen == ctx2->parent_gen; + && ctx1->parent_gen == ctx2->parent_gen + && !ctx1->pin_count && !ctx2->pin_count; +} + +static void __perf_counter_read(void *counter); + +static void __perf_counter_sync_stat(struct perf_counter *counter, + struct perf_counter *next_counter) +{ + u64 value; + + if (!counter->attr.inherit_stat) + return; + + /* + * Update the counter value, we cannot use perf_counter_read() + * because we're in the middle of a context switch and have IRQs + * disabled, which upsets smp_call_function_single(), however + * we know the counter must be on the current CPU, therefore we + * don't need to use it. + */ + switch (counter->state) { + case PERF_COUNTER_STATE_ACTIVE: + __perf_counter_read(counter); + break; + + case PERF_COUNTER_STATE_INACTIVE: + update_counter_times(counter); + break; + + default: + break; + } + + /* + * In order to keep per-task stats reliable we need to flip the counter + * values when we flip the contexts. + */ + value = atomic64_read(&next_counter->count); + value = atomic64_xchg(&counter->count, value); + atomic64_set(&next_counter->count, value); + + swap(counter->total_time_enabled, next_counter->total_time_enabled); + swap(counter->total_time_running, next_counter->total_time_running); + + /* + * Since we swizzled the values, update the user visible data too. + */ + perf_counter_update_userpage(counter); + perf_counter_update_userpage(next_counter); +} + +#define list_next_entry(pos, member) \ + list_entry(pos->member.next, typeof(*pos), member) + +static void perf_counter_sync_stat(struct perf_counter_context *ctx, + struct perf_counter_context *next_ctx) +{ + struct perf_counter *counter, *next_counter; + + if (!ctx->nr_stat) + return; + + counter = list_first_entry(&ctx->event_list, + struct perf_counter, event_entry); + + next_counter = list_first_entry(&next_ctx->event_list, + struct perf_counter, event_entry); + + while (&counter->event_entry != &ctx->event_list && + &next_counter->event_entry != &next_ctx->event_list) { + + __perf_counter_sync_stat(counter, next_counter); + + counter = list_next_entry(counter, event_entry); + next_counter = list_next_entry(next_counter, event_entry); + } } /* @@ -923,39 +1130,78 @@ void perf_counter_task_sched_out(struct task_struct *task, struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); struct perf_counter_context *ctx = task->perf_counter_ctxp; struct perf_counter_context *next_ctx; + struct perf_counter_context *parent; struct pt_regs *regs; + int do_switch = 1; regs = task_pt_regs(task); - perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0); + perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); if (likely(!ctx || !cpuctx->task_ctx)) return; update_context_time(ctx); + + rcu_read_lock(); + parent = rcu_dereference(ctx->parent_ctx); next_ctx = next->perf_counter_ctxp; - if (next_ctx && context_equiv(ctx, next_ctx)) { - task->perf_counter_ctxp = next_ctx; - next->perf_counter_ctxp = ctx; - ctx->task = next; - next_ctx->task = task; - return; - } + if (parent && next_ctx && + rcu_dereference(next_ctx->parent_ctx) == parent) { + /* + * Looks like the two contexts are clones, so we might be + * able to optimize the context switch. We lock both + * contexts and check that they are clones under the + * lock (including re-checking that neither has been + * uncloned in the meantime). It doesn't matter which + * order we take the locks because no other cpu could + * be trying to lock both of these tasks. + */ + spin_lock(&ctx->lock); + spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); + if (context_equiv(ctx, next_ctx)) { + /* + * XXX do we need a memory barrier of sorts + * wrt to rcu_dereference() of perf_counter_ctxp + */ + task->perf_counter_ctxp = next_ctx; + next->perf_counter_ctxp = ctx; + ctx->task = next; + next_ctx->task = task; + do_switch = 0; - __perf_counter_sched_out(ctx, cpuctx); + perf_counter_sync_stat(ctx, next_ctx); + } + spin_unlock(&next_ctx->lock); + spin_unlock(&ctx->lock); + } + rcu_read_unlock(); - cpuctx->task_ctx = NULL; + if (do_switch) { + __perf_counter_sched_out(ctx, cpuctx); + cpuctx->task_ctx = NULL; + } } +/* + * Called with IRQs disabled + */ static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); if (!cpuctx->task_ctx) return; + + if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) + return; + __perf_counter_sched_out(ctx, cpuctx); cpuctx->task_ctx = NULL; } +/* + * Called with IRQs disabled + */ static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx) { __perf_counter_sched_out(&cpuctx->ctx, cpuctx); @@ -983,7 +1229,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, */ list_for_each_entry(counter, &ctx->counter_list, list_entry) { if (counter->state <= PERF_COUNTER_STATE_OFF || - !counter->hw_event.pinned) + !counter->attr.pinned) continue; if (counter->cpu != -1 && counter->cpu != cpu) continue; @@ -1011,7 +1257,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, * ignore pinned counters since we did them already. */ if (counter->state <= PERF_COUNTER_STATE_OFF || - counter->hw_event.pinned) + counter->attr.pinned) continue; /* @@ -1070,46 +1316,87 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_counter *counter, int enable); -static void perf_log_period(struct perf_counter *counter, u64 period); -static void perf_adjust_freq(struct perf_counter_context *ctx) +static void perf_adjust_period(struct perf_counter *counter, u64 events) { - struct perf_counter *counter; - u64 interrupts, irq_period; - u64 events, period; + struct hw_perf_counter *hwc = &counter->hw; + u64 period, sample_period; s64 delta; + events *= hwc->sample_period; + period = div64_u64(events, counter->attr.sample_freq); + + delta = (s64)(period - hwc->sample_period); + delta = (delta + 7) / 8; /* low pass filter */ + + sample_period = hwc->sample_period + delta; + + if (!sample_period) + sample_period = 1; + + hwc->sample_period = sample_period; +} + +static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) +{ + struct perf_counter *counter; + struct hw_perf_counter *hwc; + u64 interrupts, freq; + spin_lock(&ctx->lock); list_for_each_entry(counter, &ctx->counter_list, list_entry) { if (counter->state != PERF_COUNTER_STATE_ACTIVE) continue; - interrupts = counter->hw.interrupts; - counter->hw.interrupts = 0; + hwc = &counter->hw; + + interrupts = hwc->interrupts; + hwc->interrupts = 0; + /* + * unthrottle counters on the tick + */ if (interrupts == MAX_INTERRUPTS) { perf_log_throttle(counter, 1); counter->pmu->unthrottle(counter); - interrupts = 2*sysctl_perf_counter_limit/HZ; + interrupts = 2*sysctl_perf_counter_sample_rate/HZ; } - if (!counter->hw_event.freq || !counter->hw_event.irq_freq) + if (!counter->attr.freq || !counter->attr.sample_freq) continue; - events = HZ * interrupts * counter->hw.irq_period; - period = div64_u64(events, counter->hw_event.irq_freq); + /* + * if the specified freq < HZ then we need to skip ticks + */ + if (counter->attr.sample_freq < HZ) { + freq = counter->attr.sample_freq; - delta = (s64)(1 + period - counter->hw.irq_period); - delta >>= 1; + hwc->freq_count += freq; + hwc->freq_interrupts += interrupts; - irq_period = counter->hw.irq_period + delta; + if (hwc->freq_count < HZ) + continue; - if (!irq_period) - irq_period = 1; + interrupts = hwc->freq_interrupts; + hwc->freq_interrupts = 0; + hwc->freq_count -= HZ; + } else + freq = HZ; - perf_log_period(counter, irq_period); + perf_adjust_period(counter, freq * interrupts); - counter->hw.irq_period = irq_period; + /* + * In order to avoid being stalled by an (accidental) huge + * sample period, force reset the sample period if we didn't + * get any events in this freq period. + */ + if (!interrupts) { + perf_disable(); + counter->pmu->disable(counter); + atomic64_set(&hwc->period_left, 0); + counter->pmu->enable(counter); + perf_enable(); + } } spin_unlock(&ctx->lock); } @@ -1149,9 +1436,9 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) cpuctx = &per_cpu(perf_cpu_context, cpu); ctx = curr->perf_counter_ctxp; - perf_adjust_freq(&cpuctx->ctx); + perf_ctx_adjust_freq(&cpuctx->ctx); if (ctx) - perf_adjust_freq(ctx); + perf_ctx_adjust_freq(ctx); perf_counter_cpu_sched_out(cpuctx); if (ctx) @@ -1167,9 +1454,54 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) } /* + * Enable all of a task's counters that have been marked enable-on-exec. + * This expects task == current. + */ +static void perf_counter_enable_on_exec(struct task_struct *task) +{ + struct perf_counter_context *ctx; + struct perf_counter *counter; + unsigned long flags; + int enabled = 0; + + local_irq_save(flags); + ctx = task->perf_counter_ctxp; + if (!ctx || !ctx->nr_counters) + goto out; + + __perf_counter_task_sched_out(ctx); + + spin_lock(&ctx->lock); + + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (!counter->attr.enable_on_exec) + continue; + counter->attr.enable_on_exec = 0; + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) + continue; + counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_enabled = + ctx->time - counter->total_time_enabled; + enabled = 1; + } + + /* + * Unclone this context if we enabled any counter. + */ + if (enabled) + unclone_ctx(ctx); + + spin_unlock(&ctx->lock); + + perf_counter_task_sched_in(task, smp_processor_id()); + out: + local_irq_restore(flags); +} + +/* * Cross CPU call to read the hardware counter */ -static void __read(void *info) +static void __perf_counter_read(void *info) { struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; @@ -1191,7 +1523,7 @@ static u64 perf_counter_read(struct perf_counter *counter) */ if (counter->state == PERF_COUNTER_STATE_ACTIVE) { smp_call_function_single(counter->oncpu, - __read, counter, 1); + __perf_counter_read, counter, 1); } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { update_counter_times(counter); } @@ -1215,25 +1547,20 @@ __perf_counter_init_context(struct perf_counter_context *ctx, ctx->task = task; } -static void put_context(struct perf_counter_context *ctx) -{ - if (ctx->task) - put_task_struct(ctx->task); -} - static struct perf_counter_context *find_get_context(pid_t pid, int cpu) { - struct perf_cpu_context *cpuctx; struct perf_counter_context *ctx; - struct perf_counter_context *tctx; + struct perf_cpu_context *cpuctx; struct task_struct *task; + unsigned long flags; + int err; /* * If cpu is not a wildcard then this is a percpu counter: */ if (cpu != -1) { /* Must be root to operate on a CPU counter: */ - if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN)) + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); if (cpu < 0 || cpu > num_possible_cpus()) @@ -1249,6 +1576,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) cpuctx = &per_cpu(perf_cpu_context, cpu); ctx = &cpuctx->ctx; + get_ctx(ctx); return ctx; } @@ -1265,37 +1593,49 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) if (!task) return ERR_PTR(-ESRCH); + /* + * Can't attach counters to a dying task. + */ + err = -ESRCH; + if (task->flags & PF_EXITING) + goto errout; + /* Reuse ptrace permission checks for now. */ - if (!ptrace_may_access(task, PTRACE_MODE_READ)) { - put_task_struct(task); - return ERR_PTR(-EACCES); + err = -EACCES; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto errout; + + retry: + ctx = perf_lock_task_context(task, &flags); + if (ctx) { + unclone_ctx(ctx); + spin_unlock_irqrestore(&ctx->lock, flags); } - ctx = task->perf_counter_ctxp; if (!ctx) { ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); - if (!ctx) { - put_task_struct(task); - return ERR_PTR(-ENOMEM); - } + err = -ENOMEM; + if (!ctx) + goto errout; __perf_counter_init_context(ctx, task); - /* - * Make sure other cpus see correct values for *ctx - * once task->perf_counter_ctxp is visible to them. - */ - smp_wmb(); - tctx = cmpxchg(&task->perf_counter_ctxp, NULL, ctx); - if (tctx) { + get_ctx(ctx); + if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) { /* * We raced with some other task; use * the context they set. */ kfree(ctx); - ctx = tctx; + goto retry; } + get_task_struct(task); } + put_task_struct(task); return ctx; + + errout: + put_task_struct(task); + return ERR_PTR(err); } static void free_counter_rcu(struct rcu_head *head) @@ -1303,7 +1643,8 @@ static void free_counter_rcu(struct rcu_head *head) struct perf_counter *counter; counter = container_of(head, struct perf_counter, rcu_head); - put_ctx(counter->ctx); + if (counter->ns) + put_pid_ns(counter->ns); kfree(counter); } @@ -1313,17 +1654,20 @@ static void free_counter(struct perf_counter *counter) { perf_pending_sync(counter); - atomic_dec(&nr_counters); - if (counter->hw_event.mmap) - atomic_dec(&nr_mmap_tracking); - if (counter->hw_event.munmap) - atomic_dec(&nr_munmap_tracking); - if (counter->hw_event.comm) - atomic_dec(&nr_comm_tracking); + if (!counter->parent) { + atomic_dec(&nr_counters); + if (counter->attr.mmap) + atomic_dec(&nr_mmap_counters); + if (counter->attr.comm) + atomic_dec(&nr_comm_counters); + if (counter->attr.task) + atomic_dec(&nr_task_counters); + } if (counter->destroy) counter->destroy(counter); + put_ctx(counter->ctx); call_rcu(&counter->rcu_head, free_counter_rcu); } @@ -1337,6 +1681,7 @@ static int perf_release(struct inode *inode, struct file *file) file->private_data = NULL; + WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); perf_counter_remove_from_context(counter); mutex_unlock(&ctx->mutex); @@ -1347,41 +1692,57 @@ static int perf_release(struct inode *inode, struct file *file) put_task_struct(counter->owner); free_counter(counter); - put_context(ctx); return 0; } -/* - * Read the performance counter - simple non blocking version for now - */ -static ssize_t -perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) +static int perf_counter_read_size(struct perf_counter *counter) { - u64 values[3]; - int n; + int entry = sizeof(u64); /* value */ + int size = 0; + int nr = 1; - /* - * Return end-of-file for a read on a counter that is in - * error state (i.e. because it was pinned but it couldn't be - * scheduled on to the CPU at some point). - */ - if (counter->state == PERF_COUNTER_STATE_ERROR) - return 0; + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + size += sizeof(u64); - mutex_lock(&counter->child_mutex); - values[0] = perf_counter_read(counter); - n = 1; - if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); - if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); - mutex_unlock(&counter->child_mutex); + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + size += sizeof(u64); + + if (counter->attr.read_format & PERF_FORMAT_ID) + entry += sizeof(u64); + + if (counter->attr.read_format & PERF_FORMAT_GROUP) { + nr += counter->group_leader->nr_siblings; + size += sizeof(u64); + } + + size += entry * nr; + + return size; +} + +static u64 perf_counter_read_value(struct perf_counter *counter) +{ + struct perf_counter *child; + u64 total = 0; + + total += perf_counter_read(counter); + list_for_each_entry(child, &counter->child_list, child_list) + total += perf_counter_read(child); + + return total; +} + +static int perf_counter_read_entry(struct perf_counter *counter, + u64 read_format, char __user *buf) +{ + int n = 0, count = 0; + u64 values[2]; + + values[n++] = perf_counter_read_value(counter); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(counter); - if (count < n * sizeof(u64)) - return -EINVAL; count = n * sizeof(u64); if (copy_to_user(buf, values, count)) @@ -1390,6 +1751,101 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) return count; } +static int perf_counter_read_group(struct perf_counter *counter, + u64 read_format, char __user *buf) +{ + struct perf_counter *leader = counter->group_leader, *sub; + int n = 0, size = 0, err = -EFAULT; + u64 values[3]; + + values[n++] = 1 + leader->nr_siblings; + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = leader->total_time_enabled + + atomic64_read(&leader->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = leader->total_time_running + + atomic64_read(&leader->child_total_time_running); + } + + size = n * sizeof(u64); + + if (copy_to_user(buf, values, size)) + return -EFAULT; + + err = perf_counter_read_entry(leader, read_format, buf + size); + if (err < 0) + return err; + + size += err; + + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + err = perf_counter_read_entry(counter, read_format, + buf + size); + if (err < 0) + return err; + + size += err; + } + + return size; +} + +static int perf_counter_read_one(struct perf_counter *counter, + u64 read_format, char __user *buf) +{ + u64 values[4]; + int n = 0; + + values[n++] = perf_counter_read_value(counter); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = counter->total_time_enabled + + atomic64_read(&counter->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = counter->total_time_running + + atomic64_read(&counter->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(counter); + + if (copy_to_user(buf, values, n * sizeof(u64))) + return -EFAULT; + + return n * sizeof(u64); +} + +/* + * Read the performance counter - simple non blocking version for now + */ +static ssize_t +perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) +{ + u64 read_format = counter->attr.read_format; + int ret; + + /* + * Return end-of-file for a read on a counter that is in + * error state (i.e. because it was pinned but it couldn't be + * scheduled on to the CPU at some point). + */ + if (counter->state == PERF_COUNTER_STATE_ERROR) + return 0; + + if (count < perf_counter_read_size(counter)) + return -ENOSPC; + + WARN_ON_ONCE(counter->ctx->parent_ctx); + mutex_lock(&counter->child_mutex); + if (read_format & PERF_FORMAT_GROUP) + ret = perf_counter_read_group(counter, read_format, buf); + else + ret = perf_counter_read_one(counter, read_format, buf); + mutex_unlock(&counter->child_mutex); + + return ret; +} + static ssize_t perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -1422,26 +1878,18 @@ static void perf_counter_reset(struct perf_counter *counter) perf_counter_update_userpage(counter); } -static void perf_counter_for_each_sibling(struct perf_counter *counter, - void (*func)(struct perf_counter *)) -{ - struct perf_counter_context *ctx = counter->ctx; - struct perf_counter *sibling; - - mutex_lock(&ctx->mutex); - counter = counter->group_leader; - - func(counter); - list_for_each_entry(sibling, &counter->sibling_list, list_entry) - func(sibling); - mutex_unlock(&ctx->mutex); -} - +/* + * Holding the top-level counter's child_mutex means that any + * descendant process that has inherited this counter will block + * in sync_child_counter if it goes to exit, thus satisfying the + * task existence requirements of perf_counter_enable/disable. + */ static void perf_counter_for_each_child(struct perf_counter *counter, void (*func)(struct perf_counter *)) { struct perf_counter *child; + WARN_ON_ONCE(counter->ctx->parent_ctx); mutex_lock(&counter->child_mutex); func(counter); list_for_each_entry(child, &counter->child_list, child_list) @@ -1452,13 +1900,53 @@ static void perf_counter_for_each_child(struct perf_counter *counter, static void perf_counter_for_each(struct perf_counter *counter, void (*func)(struct perf_counter *)) { - struct perf_counter *child; + struct perf_counter_context *ctx = counter->ctx; + struct perf_counter *sibling; - mutex_lock(&counter->child_mutex); - perf_counter_for_each_sibling(counter, func); - list_for_each_entry(child, &counter->child_list, child_list) - perf_counter_for_each_sibling(child, func); - mutex_unlock(&counter->child_mutex); + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + counter = counter->group_leader; + + perf_counter_for_each_child(counter, func); + func(counter); + list_for_each_entry(sibling, &counter->sibling_list, list_entry) + perf_counter_for_each_child(counter, func); + mutex_unlock(&ctx->mutex); +} + +static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) +{ + struct perf_counter_context *ctx = counter->ctx; + unsigned long size; + int ret = 0; + u64 value; + + if (!counter->attr.sample_period) + return -EINVAL; + + size = copy_from_user(&value, arg, sizeof(value)); + if (size != sizeof(value)) + return -EFAULT; + + if (!value) + return -EINVAL; + + spin_lock_irq(&ctx->lock); + if (counter->attr.freq) { + if (value > sysctl_perf_counter_sample_rate) { + ret = -EINVAL; + goto unlock; + } + + counter->attr.sample_freq = value; + } else { + counter->attr.sample_period = value; + counter->hw.sample_period = value; + } +unlock: + spin_unlock_irq(&ctx->lock); + + return ret; } static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) @@ -1480,6 +1968,10 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_COUNTER_IOC_REFRESH: return perf_counter_refresh(counter, arg); + + case PERF_COUNTER_IOC_PERIOD: + return perf_counter_period(counter, (u64 __user *)arg); + default: return -ENOTTY; } @@ -1516,6 +2008,14 @@ int perf_counter_task_disable(void) return 0; } +static int perf_counter_index(struct perf_counter *counter) +{ + if (counter->state != PERF_COUNTER_STATE_ACTIVE) + return 0; + + return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET; +} + /* * Callers need to ensure there can be no nesting of this function, otherwise * the seqlock logic goes bad. We can not serialize this because the arch @@ -1523,8 +2023,8 @@ int perf_counter_task_disable(void) */ void perf_counter_update_userpage(struct perf_counter *counter) { - struct perf_mmap_data *data; struct perf_counter_mmap_page *userpg; + struct perf_mmap_data *data; rcu_read_lock(); data = rcu_dereference(counter->data); @@ -1540,11 +2040,17 @@ void perf_counter_update_userpage(struct perf_counter *counter) preempt_disable(); ++userpg->lock; barrier(); - userpg->index = counter->hw.idx; + userpg->index = perf_counter_index(counter); userpg->offset = atomic64_read(&counter->count); if (counter->state == PERF_COUNTER_STATE_ACTIVE) userpg->offset -= atomic64_read(&counter->hw.prev_count); + userpg->time_enabled = counter->total_time_enabled + + atomic64_read(&counter->child_total_time_enabled); + + userpg->time_running = counter->total_time_running + + atomic64_read(&counter->child_total_time_running); + barrier(); ++userpg->lock; preempt_enable(); @@ -1558,6 +2064,12 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct perf_mmap_data *data; int ret = VM_FAULT_SIGBUS; + if (vmf->flags & FAULT_FLAG_MKWRITE) { + if (vmf->pgoff == 0) + ret = 0; + return ret; + } + rcu_read_lock(); data = rcu_dereference(counter->data); if (!data) @@ -1571,9 +2083,16 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if ((unsigned)nr > data->nr_pages) goto unlock; + if (vmf->flags & FAULT_FLAG_WRITE) + goto unlock; + vmf->page = virt_to_page(data->data_pages[nr]); } + get_page(vmf->page); + vmf->page->mapping = vma->vm_file->f_mapping; + vmf->page->index = vmf->pgoff; + ret = 0; unlock: rcu_read_unlock(); @@ -1626,15 +2145,25 @@ fail: return -ENOMEM; } +static void perf_mmap_free_page(unsigned long addr) +{ + struct page *page = virt_to_page((void *)addr); + + page->mapping = NULL; + __free_page(page); +} + static void __perf_mmap_data_free(struct rcu_head *rcu_head) { - struct perf_mmap_data *data = container_of(rcu_head, - struct perf_mmap_data, rcu_head); + struct perf_mmap_data *data; int i; - free_page((unsigned long)data->user_page); + data = container_of(rcu_head, struct perf_mmap_data, rcu_head); + + perf_mmap_free_page((unsigned long)data->user_page); for (i = 0; i < data->nr_pages; i++) - free_page((unsigned long)data->data_pages[i]); + perf_mmap_free_page((unsigned long)data->data_pages[i]); + kfree(data); } @@ -1659,8 +2188,8 @@ static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_counter *counter = vma->vm_file->private_data; - if (atomic_dec_and_mutex_lock(&counter->mmap_count, - &counter->mmap_mutex)) { + WARN_ON_ONCE(counter->ctx->parent_ctx); + if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) { struct user_struct *user = current_user(); atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm); @@ -1671,23 +2200,24 @@ static void perf_mmap_close(struct vm_area_struct *vma) } static struct vm_operations_struct perf_mmap_vmops = { - .open = perf_mmap_open, - .close = perf_mmap_close, - .fault = perf_mmap_fault, + .open = perf_mmap_open, + .close = perf_mmap_close, + .fault = perf_mmap_fault, + .page_mkwrite = perf_mmap_fault, }; static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_counter *counter = file->private_data; + unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); + unsigned long locked, lock_limit; unsigned long vma_size; unsigned long nr_pages; - unsigned long user_locked, user_lock_limit; - unsigned long locked, lock_limit; long user_extra, extra; int ret = 0; - if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) + if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; vma_size = vma->vm_end - vma->vm_start; @@ -1706,6 +2236,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_pgoff != 0) return -EINVAL; + WARN_ON_ONCE(counter->ctx->parent_ctx); mutex_lock(&counter->mmap_mutex); if (atomic_inc_not_zero(&counter->mmap_count)) { if (nr_pages != counter->data->nr_pages) @@ -1745,10 +2276,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) atomic_long_add(user_extra, &user->locked_vm); vma->vm_mm->locked_vm += extra; counter->data->nr_locked = extra; + if (vma->vm_flags & VM_WRITE) + counter->data->writable = 1; + unlock: mutex_unlock(&counter->mmap_mutex); - vma->vm_flags &= ~VM_MAYWRITE; vma->vm_flags |= VM_RESERVED; vma->vm_ops = &perf_mmap_vmops; @@ -1757,8 +2290,8 @@ unlock: static int perf_fasync(int fd, struct file *filp, int on) { - struct perf_counter *counter = filp->private_data; struct inode *inode = filp->f_path.dentry->d_inode; + struct perf_counter *counter = filp->private_data; int retval; mutex_lock(&inode->i_mutex); @@ -1814,7 +2347,7 @@ static void perf_pending_counter(struct perf_pending_entry *entry) if (counter->pending_disable) { counter->pending_disable = 0; - perf_counter_disable(counter); + __perf_counter_disable(counter); } if (counter->pending_wakeup) { @@ -1922,14 +2455,41 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) struct perf_output_handle { struct perf_counter *counter; struct perf_mmap_data *data; - unsigned int offset; - unsigned int head; + unsigned long head; + unsigned long offset; int nmi; - int overflow; + int sample; int locked; unsigned long flags; }; +static bool perf_output_space(struct perf_mmap_data *data, + unsigned int offset, unsigned int head) +{ + unsigned long tail; + unsigned long mask; + + if (!data->writable) + return true; + + mask = (data->nr_pages << PAGE_SHIFT) - 1; + /* + * Userspace could choose to issue a mb() before updating the tail + * pointer. So that all reads will be completed before the write is + * issued. + */ + tail = ACCESS_ONCE(data->user_page->data_tail); + smp_rmb(); + + offset = (offset - tail) & mask; + head = (head - tail) & mask; + + if ((int)(head - offset) < 0) + return false; + + return true; +} + static void perf_output_wakeup(struct perf_output_handle *handle) { atomic_set(&handle->data->poll, POLL_IN); @@ -1977,7 +2537,8 @@ static void perf_output_lock(struct perf_output_handle *handle) static void perf_output_unlock(struct perf_output_handle *handle) { struct perf_mmap_data *data = handle->data; - int head, cpu; + unsigned long head; + int cpu; data->done_head = data->head; @@ -1990,7 +2551,7 @@ again: * before we publish the new head, matched by a rmb() in userspace when * reading this position. */ - while ((head = atomic_xchg(&data->done_head, 0))) + while ((head = atomic_long_xchg(&data->done_head, 0))) data->user_page->data_head = head; /* @@ -2003,7 +2564,7 @@ again: /* * Therefore we have to validate we did not indeed do so. */ - if (unlikely(atomic_read(&data->done_head))) { + if (unlikely(atomic_long_read(&data->done_head))) { /* * Since we had it locked, we can lock it again. */ @@ -2019,12 +2580,57 @@ out: local_irq_restore(handle->flags); } +static void perf_output_copy(struct perf_output_handle *handle, + const void *buf, unsigned int len) +{ + unsigned int pages_mask; + unsigned int offset; + unsigned int size; + void **pages; + + offset = handle->offset; + pages_mask = handle->data->nr_pages - 1; + pages = handle->data->data_pages; + + do { + unsigned int page_offset; + int nr; + + nr = (offset >> PAGE_SHIFT) & pages_mask; + page_offset = offset & (PAGE_SIZE - 1); + size = min_t(unsigned int, PAGE_SIZE - page_offset, len); + + memcpy(pages[nr] + page_offset, buf, size); + + len -= size; + buf += size; + offset += size; + } while (len); + + handle->offset = offset; + + /* + * Check we didn't copy past our reservation window, taking the + * possible unsigned int wrap into account. + */ + WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); +} + +#define perf_output_put(handle, x) \ + perf_output_copy((handle), &(x), sizeof(x)) + static int perf_output_begin(struct perf_output_handle *handle, struct perf_counter *counter, unsigned int size, - int nmi, int overflow) + int nmi, int sample) { struct perf_mmap_data *data; unsigned int offset, head; + int have_lost; + struct { + struct perf_event_header header; + u64 id; + u64 lost; + } lost_event; /* * For inherited counters we send all the output towards the parent. @@ -2037,20 +2643,26 @@ static int perf_output_begin(struct perf_output_handle *handle, if (!data) goto out; - handle->data = data; - handle->counter = counter; - handle->nmi = nmi; - handle->overflow = overflow; + handle->data = data; + handle->counter = counter; + handle->nmi = nmi; + handle->sample = sample; if (!data->nr_pages) goto fail; + have_lost = atomic_read(&data->lost); + if (have_lost) + size += sizeof(lost_event); + perf_output_lock(handle); do { - offset = head = atomic_read(&data->head); + offset = head = atomic_long_read(&data->head); head += size; - } while (atomic_cmpxchg(&data->head, offset, head) != offset); + if (unlikely(!perf_output_space(data, offset, head))) + goto fail; + } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); handle->offset = offset; handle->head = head; @@ -2058,89 +2670,152 @@ static int perf_output_begin(struct perf_output_handle *handle, if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT)) atomic_set(&data->wakeup, 1); + if (have_lost) { + lost_event.header.type = PERF_EVENT_LOST; + lost_event.header.misc = 0; + lost_event.header.size = sizeof(lost_event); + lost_event.id = counter->id; + lost_event.lost = atomic_xchg(&data->lost, 0); + + perf_output_put(handle, lost_event); + } + return 0; fail: - perf_output_wakeup(handle); + atomic_inc(&data->lost); + perf_output_unlock(handle); out: rcu_read_unlock(); return -ENOSPC; } -static void perf_output_copy(struct perf_output_handle *handle, - void *buf, unsigned int len) +static void perf_output_end(struct perf_output_handle *handle) { - unsigned int pages_mask; - unsigned int offset; - unsigned int size; - void **pages; - - offset = handle->offset; - pages_mask = handle->data->nr_pages - 1; - pages = handle->data->data_pages; + struct perf_counter *counter = handle->counter; + struct perf_mmap_data *data = handle->data; - do { - unsigned int page_offset; - int nr; + int wakeup_events = counter->attr.wakeup_events; - nr = (offset >> PAGE_SHIFT) & pages_mask; - page_offset = offset & (PAGE_SIZE - 1); - size = min_t(unsigned int, PAGE_SIZE - page_offset, len); + if (handle->sample && wakeup_events) { + int events = atomic_inc_return(&data->events); + if (events >= wakeup_events) { + atomic_sub(wakeup_events, &data->events); + atomic_set(&data->wakeup, 1); + } + } - memcpy(pages[nr] + page_offset, buf, size); + perf_output_unlock(handle); + rcu_read_unlock(); +} - len -= size; - buf += size; - offset += size; - } while (len); +static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p) +{ + /* + * only top level counters have the pid namespace they were created in + */ + if (counter->parent) + counter = counter->parent; - handle->offset = offset; + return task_tgid_nr_ns(p, counter->ns); +} +static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) +{ /* - * Check we didn't copy past our reservation window, taking the - * possible unsigned int wrap into account. + * only top level counters have the pid namespace they were created in */ - WARN_ON_ONCE(((int)(handle->head - handle->offset)) < 0); + if (counter->parent) + counter = counter->parent; + + return task_pid_nr_ns(p, counter->ns); } -#define perf_output_put(handle, x) \ - perf_output_copy((handle), &(x), sizeof(x)) +static void perf_output_read_one(struct perf_output_handle *handle, + struct perf_counter *counter) +{ + u64 read_format = counter->attr.read_format; + u64 values[4]; + int n = 0; -static void perf_output_end(struct perf_output_handle *handle) + values[n++] = atomic64_read(&counter->count); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = counter->total_time_enabled + + atomic64_read(&counter->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = counter->total_time_running + + atomic64_read(&counter->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(counter); + + perf_output_copy(handle, values, n * sizeof(u64)); +} + +/* + * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult. + */ +static void perf_output_read_group(struct perf_output_handle *handle, + struct perf_counter *counter) { - struct perf_counter *counter = handle->counter; - struct perf_mmap_data *data = handle->data; + struct perf_counter *leader = counter->group_leader, *sub; + u64 read_format = counter->attr.read_format; + u64 values[5]; + int n = 0; - int wakeup_events = counter->hw_event.wakeup_events; + values[n++] = 1 + leader->nr_siblings; - if (handle->overflow && wakeup_events) { - int events = atomic_inc_return(&data->events); - if (events >= wakeup_events) { - atomic_sub(wakeup_events, &data->events); - atomic_set(&data->wakeup, 1); - } + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = leader->total_time_enabled; + + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = leader->total_time_running; + + if (leader != counter) + leader->pmu->read(leader); + + values[n++] = atomic64_read(&leader->count); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(leader); + + perf_output_copy(handle, values, n * sizeof(u64)); + + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + n = 0; + + if (sub != counter) + sub->pmu->read(sub); + + values[n++] = atomic64_read(&sub->count); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(sub); + + perf_output_copy(handle, values, n * sizeof(u64)); } +} - perf_output_unlock(handle); - rcu_read_unlock(); +static void perf_output_read(struct perf_output_handle *handle, + struct perf_counter *counter) +{ + if (counter->attr.read_format & PERF_FORMAT_GROUP) + perf_output_read_group(handle, counter); + else + perf_output_read_one(handle, counter); } -static void perf_counter_output(struct perf_counter *counter, - int nmi, struct pt_regs *regs, u64 addr) +void perf_counter_output(struct perf_counter *counter, int nmi, + struct perf_sample_data *data) { int ret; - u64 record_type = counter->hw_event.record_type; + u64 sample_type = counter->attr.sample_type; struct perf_output_handle handle; struct perf_event_header header; u64 ip; struct { u32 pid, tid; } tid_entry; - struct { - u64 event; - u64 counter; - } group_entry; struct perf_callchain_entry *callchain = NULL; int callchain_size = 0; u64 time; @@ -2148,69 +2823,76 @@ static void perf_counter_output(struct perf_counter *counter, u32 cpu, reserved; } cpu_entry; - header.type = 0; + header.type = PERF_EVENT_SAMPLE; header.size = sizeof(header); - header.misc = PERF_EVENT_MISC_OVERFLOW; - header.misc |= perf_misc_flags(regs); + header.misc = 0; + header.misc |= perf_misc_flags(data->regs); - if (record_type & PERF_RECORD_IP) { - ip = perf_instruction_pointer(regs); - header.type |= PERF_RECORD_IP; + if (sample_type & PERF_SAMPLE_IP) { + ip = perf_instruction_pointer(data->regs); header.size += sizeof(ip); } - if (record_type & PERF_RECORD_TID) { + if (sample_type & PERF_SAMPLE_TID) { /* namespace issues */ - tid_entry.pid = current->group_leader->pid; - tid_entry.tid = current->pid; + tid_entry.pid = perf_counter_pid(counter, current); + tid_entry.tid = perf_counter_tid(counter, current); - header.type |= PERF_RECORD_TID; header.size += sizeof(tid_entry); } - if (record_type & PERF_RECORD_TIME) { + if (sample_type & PERF_SAMPLE_TIME) { /* * Maybe do better on x86 and provide cpu_clock_nmi() */ time = sched_clock(); - header.type |= PERF_RECORD_TIME; header.size += sizeof(u64); } - if (record_type & PERF_RECORD_ADDR) { - header.type |= PERF_RECORD_ADDR; + if (sample_type & PERF_SAMPLE_ADDR) header.size += sizeof(u64); - } - if (record_type & PERF_RECORD_CONFIG) { - header.type |= PERF_RECORD_CONFIG; + if (sample_type & PERF_SAMPLE_ID) header.size += sizeof(u64); - } - if (record_type & PERF_RECORD_CPU) { - header.type |= PERF_RECORD_CPU; + if (sample_type & PERF_SAMPLE_STREAM_ID) + header.size += sizeof(u64); + + if (sample_type & PERF_SAMPLE_CPU) { header.size += sizeof(cpu_entry); cpu_entry.cpu = raw_smp_processor_id(); + cpu_entry.reserved = 0; } - if (record_type & PERF_RECORD_GROUP) { - header.type |= PERF_RECORD_GROUP; - header.size += sizeof(u64) + - counter->nr_siblings * sizeof(group_entry); - } + if (sample_type & PERF_SAMPLE_PERIOD) + header.size += sizeof(u64); - if (record_type & PERF_RECORD_CALLCHAIN) { - callchain = perf_callchain(regs); + if (sample_type & PERF_SAMPLE_READ) + header.size += perf_counter_read_size(counter); + + if (sample_type & PERF_SAMPLE_CALLCHAIN) { + callchain = perf_callchain(data->regs); if (callchain) { callchain_size = (1 + callchain->nr) * sizeof(u64); - - header.type |= PERF_RECORD_CALLCHAIN; header.size += callchain_size; - } + } else + header.size += sizeof(u64); + } + + if (sample_type & PERF_SAMPLE_RAW) { + int size = sizeof(u32); + + if (data->raw) + size += data->raw->size; + else + size += sizeof(u32); + + WARN_ON_ONCE(size & (sizeof(u64)-1)); + header.size += size; } ret = perf_output_begin(&handle, counter, header.size, nmi, 1); @@ -2219,58 +2901,225 @@ static void perf_counter_output(struct perf_counter *counter, perf_output_put(&handle, header); - if (record_type & PERF_RECORD_IP) + if (sample_type & PERF_SAMPLE_IP) perf_output_put(&handle, ip); - if (record_type & PERF_RECORD_TID) + if (sample_type & PERF_SAMPLE_TID) perf_output_put(&handle, tid_entry); - if (record_type & PERF_RECORD_TIME) + if (sample_type & PERF_SAMPLE_TIME) perf_output_put(&handle, time); - if (record_type & PERF_RECORD_ADDR) - perf_output_put(&handle, addr); + if (sample_type & PERF_SAMPLE_ADDR) + perf_output_put(&handle, data->addr); - if (record_type & PERF_RECORD_CONFIG) - perf_output_put(&handle, counter->hw_event.config); + if (sample_type & PERF_SAMPLE_ID) { + u64 id = primary_counter_id(counter); - if (record_type & PERF_RECORD_CPU) - perf_output_put(&handle, cpu_entry); + perf_output_put(&handle, id); + } - /* - * XXX PERF_RECORD_GROUP vs inherited counters seems difficult. - */ - if (record_type & PERF_RECORD_GROUP) { - struct perf_counter *leader, *sub; - u64 nr = counter->nr_siblings; + if (sample_type & PERF_SAMPLE_STREAM_ID) + perf_output_put(&handle, counter->id); + + if (sample_type & PERF_SAMPLE_CPU) + perf_output_put(&handle, cpu_entry); - perf_output_put(&handle, nr); + if (sample_type & PERF_SAMPLE_PERIOD) + perf_output_put(&handle, data->period); - leader = counter->group_leader; - list_for_each_entry(sub, &leader->sibling_list, list_entry) { - if (sub != counter) - sub->pmu->read(sub); + if (sample_type & PERF_SAMPLE_READ) + perf_output_read(&handle, counter); - group_entry.event = sub->hw_event.config; - group_entry.counter = atomic64_read(&sub->count); + if (sample_type & PERF_SAMPLE_CALLCHAIN) { + if (callchain) + perf_output_copy(&handle, callchain, callchain_size); + else { + u64 nr = 0; + perf_output_put(&handle, nr); + } + } - perf_output_put(&handle, group_entry); + if (sample_type & PERF_SAMPLE_RAW) { + if (data->raw) { + perf_output_put(&handle, data->raw->size); + perf_output_copy(&handle, data->raw->data, data->raw->size); + } else { + struct { + u32 size; + u32 data; + } raw = { + .size = sizeof(u32), + .data = 0, + }; + perf_output_put(&handle, raw); } } - if (callchain) - perf_output_copy(&handle, callchain, callchain_size); + perf_output_end(&handle); +} + +/* + * read event + */ + +struct perf_read_event { + struct perf_event_header header; + + u32 pid; + u32 tid; +}; + +static void +perf_counter_read_event(struct perf_counter *counter, + struct task_struct *task) +{ + struct perf_output_handle handle; + struct perf_read_event event = { + .header = { + .type = PERF_EVENT_READ, + .misc = 0, + .size = sizeof(event) + perf_counter_read_size(counter), + }, + .pid = perf_counter_pid(counter, task), + .tid = perf_counter_tid(counter, task), + }; + int ret; + + ret = perf_output_begin(&handle, counter, event.header.size, 0, 0); + if (ret) + return; + + perf_output_put(&handle, event); + perf_output_read(&handle, counter); perf_output_end(&handle); } /* + * task tracking -- fork/exit + * + * enabled by: attr.comm | attr.mmap | attr.task + */ + +struct perf_task_event { + struct task_struct *task; + struct perf_counter_context *task_ctx; + + struct { + struct perf_event_header header; + + u32 pid; + u32 ppid; + u32 tid; + u32 ptid; + } event; +}; + +static void perf_counter_task_output(struct perf_counter *counter, + struct perf_task_event *task_event) +{ + struct perf_output_handle handle; + int size = task_event->event.header.size; + struct task_struct *task = task_event->task; + int ret = perf_output_begin(&handle, counter, size, 0, 0); + + if (ret) + return; + + task_event->event.pid = perf_counter_pid(counter, task); + task_event->event.ppid = perf_counter_pid(counter, current); + + task_event->event.tid = perf_counter_tid(counter, task); + task_event->event.ptid = perf_counter_tid(counter, current); + + perf_output_put(&handle, task_event->event); + perf_output_end(&handle); +} + +static int perf_counter_task_match(struct perf_counter *counter) +{ + if (counter->attr.comm || counter->attr.mmap || counter->attr.task) + return 1; + + return 0; +} + +static void perf_counter_task_ctx(struct perf_counter_context *ctx, + struct perf_task_event *task_event) +{ + struct perf_counter *counter; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { + if (perf_counter_task_match(counter)) + perf_counter_task_output(counter, task_event); + } + rcu_read_unlock(); +} + +static void perf_counter_task_event(struct perf_task_event *task_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_counter_context *ctx = task_event->task_ctx; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_counter_task_ctx(&cpuctx->ctx, task_event); + put_cpu_var(perf_cpu_context); + + rcu_read_lock(); + if (!ctx) + ctx = rcu_dereference(task_event->task->perf_counter_ctxp); + if (ctx) + perf_counter_task_ctx(ctx, task_event); + rcu_read_unlock(); +} + +static void perf_counter_task(struct task_struct *task, + struct perf_counter_context *task_ctx, + int new) +{ + struct perf_task_event task_event; + + if (!atomic_read(&nr_comm_counters) && + !atomic_read(&nr_mmap_counters) && + !atomic_read(&nr_task_counters)) + return; + + task_event = (struct perf_task_event){ + .task = task, + .task_ctx = task_ctx, + .event = { + .header = { + .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT, + .misc = 0, + .size = sizeof(task_event.event), + }, + /* .pid */ + /* .ppid */ + /* .tid */ + /* .ptid */ + }, + }; + + perf_counter_task_event(&task_event); +} + +void perf_counter_fork(struct task_struct *task) +{ + perf_counter_task(task, NULL, 1); +} + +/* * comm tracking */ struct perf_comm_event { - struct task_struct *task; - char *comm; + struct task_struct *task; + char *comm; int comm_size; struct { @@ -2291,17 +3140,18 @@ static void perf_counter_comm_output(struct perf_counter *counter, if (ret) return; + comm_event->event.pid = perf_counter_pid(counter, comm_event->task); + comm_event->event.tid = perf_counter_tid(counter, comm_event->task); + perf_output_put(&handle, comm_event->event); perf_output_copy(&handle, comm_event->comm, comm_event->comm_size); perf_output_end(&handle); } -static int perf_counter_comm_match(struct perf_counter *counter, - struct perf_comm_event *comm_event) +static int perf_counter_comm_match(struct perf_counter *counter) { - if (counter->hw_event.comm && - comm_event->event.header.type == PERF_EVENT_COMM) + if (counter->attr.comm) return 1; return 0; @@ -2317,7 +3167,7 @@ static void perf_counter_comm_ctx(struct perf_counter_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_comm_match(counter, comm_event)) + if (perf_counter_comm_match(counter)) perf_counter_comm_output(counter, comm_event); } rcu_read_unlock(); @@ -2326,9 +3176,12 @@ static void perf_counter_comm_ctx(struct perf_counter_context *ctx, static void perf_counter_comm_event(struct perf_comm_event *comm_event) { struct perf_cpu_context *cpuctx; + struct perf_counter_context *ctx; unsigned int size; - char *comm = comm_event->task->comm; + char comm[TASK_COMM_LEN]; + memset(comm, 0, sizeof(comm)); + strncpy(comm, comm_event->task->comm, sizeof(comm)); size = ALIGN(strlen(comm)+1, sizeof(u64)); comm_event->comm = comm; @@ -2340,24 +3193,39 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event) perf_counter_comm_ctx(&cpuctx->ctx, comm_event); put_cpu_var(perf_cpu_context); - perf_counter_comm_ctx(current->perf_counter_ctxp, comm_event); + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_counter_ctxp); + if (ctx) + perf_counter_comm_ctx(ctx, comm_event); + rcu_read_unlock(); } void perf_counter_comm(struct task_struct *task) { struct perf_comm_event comm_event; - if (!atomic_read(&nr_comm_tracking)) - return; - if (!current->perf_counter_ctxp) + if (task->perf_counter_ctxp) + perf_counter_enable_on_exec(task); + + if (!atomic_read(&nr_comm_counters)) return; comm_event = (struct perf_comm_event){ .task = task, + /* .comm */ + /* .comm_size */ .event = { - .header = { .type = PERF_EVENT_COMM, }, - .pid = task->group_leader->pid, - .tid = task->pid, + .header = { + .type = PERF_EVENT_COMM, + .misc = 0, + /* .size */ + }, + /* .pid */ + /* .tid */ }, }; @@ -2369,9 +3237,10 @@ void perf_counter_comm(struct task_struct *task) */ struct perf_mmap_event { - struct file *file; - char *file_name; - int file_size; + struct vm_area_struct *vma; + + const char *file_name; + int file_size; struct { struct perf_event_header header; @@ -2394,6 +3263,9 @@ static void perf_counter_mmap_output(struct perf_counter *counter, if (ret) return; + mmap_event->event.pid = perf_counter_pid(counter, current); + mmap_event->event.tid = perf_counter_tid(counter, current); + perf_output_put(&handle, mmap_event->event); perf_output_copy(&handle, mmap_event->file_name, mmap_event->file_size); @@ -2403,12 +3275,7 @@ static void perf_counter_mmap_output(struct perf_counter *counter, static int perf_counter_mmap_match(struct perf_counter *counter, struct perf_mmap_event *mmap_event) { - if (counter->hw_event.mmap && - mmap_event->event.header.type == PERF_EVENT_MMAP) - return 1; - - if (counter->hw_event.munmap && - mmap_event->event.header.type == PERF_EVENT_MUNMAP) + if (counter->attr.mmap) return 1; return 0; @@ -2433,14 +3300,23 @@ static void perf_counter_mmap_ctx(struct perf_counter_context *ctx, static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) { struct perf_cpu_context *cpuctx; - struct file *file = mmap_event->file; + struct perf_counter_context *ctx; + struct vm_area_struct *vma = mmap_event->vma; + struct file *file = vma->vm_file; unsigned int size; char tmp[16]; char *buf = NULL; - char *name; + const char *name; + + memset(tmp, 0, sizeof(tmp)); if (file) { - buf = kzalloc(PATH_MAX, GFP_KERNEL); + /* + * d_path works from the end of the buffer backwards, so we + * need to add enough zero bytes after the string to handle + * the 64bit alignment we do later. + */ + buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); if (!buf) { name = strncpy(tmp, "//enomem", sizeof(tmp)); goto got_name; @@ -2451,6 +3327,17 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) goto got_name; } } else { + if (arch_vma_name(mmap_event->vma)) { + name = strncpy(tmp, arch_vma_name(mmap_event->vma), + sizeof(tmp)); + goto got_name; + } + + if (!vma->vm_mm) { + name = strncpy(tmp, "[vdso]", sizeof(tmp)); + goto got_name; + } + name = strncpy(tmp, "//anon", sizeof(tmp)); goto got_name; } @@ -2467,53 +3354,41 @@ got_name: perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event); put_cpu_var(perf_cpu_context); - perf_counter_mmap_ctx(current->perf_counter_ctxp, mmap_event); + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_counter_ctxp); + if (ctx) + perf_counter_mmap_ctx(ctx, mmap_event); + rcu_read_unlock(); kfree(buf); } -void perf_counter_mmap(unsigned long addr, unsigned long len, - unsigned long pgoff, struct file *file) -{ - struct perf_mmap_event mmap_event; - - if (!atomic_read(&nr_mmap_tracking)) - return; - if (!current->perf_counter_ctxp) - return; - - mmap_event = (struct perf_mmap_event){ - .file = file, - .event = { - .header = { .type = PERF_EVENT_MMAP, }, - .pid = current->group_leader->pid, - .tid = current->pid, - .start = addr, - .len = len, - .pgoff = pgoff, - }, - }; - - perf_counter_mmap_event(&mmap_event); -} - -void perf_counter_munmap(unsigned long addr, unsigned long len, - unsigned long pgoff, struct file *file) +void __perf_counter_mmap(struct vm_area_struct *vma) { struct perf_mmap_event mmap_event; - if (!atomic_read(&nr_munmap_tracking)) + if (!atomic_read(&nr_mmap_counters)) return; mmap_event = (struct perf_mmap_event){ - .file = file, + .vma = vma, + /* .file_name */ + /* .file_size */ .event = { - .header = { .type = PERF_EVENT_MUNMAP, }, - .pid = current->group_leader->pid, - .tid = current->pid, - .start = addr, - .len = len, - .pgoff = pgoff, + .header = { + .type = PERF_EVENT_MMAP, + .misc = 0, + /* .size */ + }, + /* .pid */ + /* .tid */ + .start = vma->vm_start, + .len = vma->vm_end - vma->vm_start, + .pgoff = vma->vm_pgoff, }, }; @@ -2521,41 +3396,6 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, } /* - * Log irq_period changes so that analyzing tools can re-normalize the - * event flow. - */ - -static void perf_log_period(struct perf_counter *counter, u64 period) -{ - struct perf_output_handle handle; - int ret; - - struct { - struct perf_event_header header; - u64 time; - u64 period; - } freq_event = { - .header = { - .type = PERF_EVENT_PERIOD, - .misc = 0, - .size = sizeof(freq_event), - }, - .time = sched_clock(), - .period = period, - }; - - if (counter->hw.irq_period == period) - return; - - ret = perf_output_begin(&handle, counter, sizeof(freq_event), 0, 0); - if (ret) - return; - - perf_output_put(&handle, freq_event); - perf_output_end(&handle); -} - -/* * IRQ throttle logging */ @@ -2567,15 +3407,22 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) struct { struct perf_event_header header; u64 time; + u64 id; + u64 stream_id; } throttle_event = { .header = { - .type = PERF_EVENT_THROTTLE + 1, + .type = PERF_EVENT_THROTTLE, .misc = 0, .size = sizeof(throttle_event), }, - .time = sched_clock(), + .time = sched_clock(), + .id = primary_counter_id(counter), + .stream_id = counter->id, }; + if (enable) + throttle_event.header.type = PERF_EVENT_UNTHROTTLE; + ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); if (ret) return; @@ -2585,27 +3432,48 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) } /* - * Generic counter overflow handling. + * Generic counter overflow handling, sampling. */ -int perf_counter_overflow(struct perf_counter *counter, - int nmi, struct pt_regs *regs, u64 addr) +int perf_counter_overflow(struct perf_counter *counter, int nmi, + struct perf_sample_data *data) { int events = atomic_read(&counter->event_limit); int throttle = counter->pmu->unthrottle != NULL; + struct hw_perf_counter *hwc = &counter->hw; int ret = 0; if (!throttle) { - counter->hw.interrupts++; - } else if (counter->hw.interrupts != MAX_INTERRUPTS) { - counter->hw.interrupts++; - if (HZ*counter->hw.interrupts > (u64)sysctl_perf_counter_limit) { - counter->hw.interrupts = MAX_INTERRUPTS; - perf_log_throttle(counter, 0); + hwc->interrupts++; + } else { + if (hwc->interrupts != MAX_INTERRUPTS) { + hwc->interrupts++; + if (HZ * hwc->interrupts > + (u64)sysctl_perf_counter_sample_rate) { + hwc->interrupts = MAX_INTERRUPTS; + perf_log_throttle(counter, 0); + ret = 1; + } + } else { + /* + * Keep re-disabling counters even though on the previous + * pass we disabled it - just in case we raced with a + * sched-in and the counter got enabled again: + */ ret = 1; } } + if (counter->attr.freq) { + u64 now = sched_clock(); + s64 delta = now - hwc->freq_stamp; + + hwc->freq_stamp = now; + + if (delta > 0 && delta < TICK_NSEC) + perf_adjust_period(counter, NSEC_PER_SEC / (int)delta); + } + /* * XXX event_limit might not quite work as expected on inherited * counters @@ -2623,7 +3491,7 @@ int perf_counter_overflow(struct perf_counter *counter, perf_counter_disable(counter); } - perf_counter_output(counter, nmi, regs, addr); + perf_counter_output(counter, nmi, data); return ret; } @@ -2631,122 +3499,140 @@ int perf_counter_overflow(struct perf_counter *counter, * Generic software counter infrastructure */ -static void perf_swcounter_update(struct perf_counter *counter) +/* + * We directly increment counter->count and keep a second value in + * counter->hw.period_left to count intervals. This period counter + * is kept in the range [-sample_period, 0] so that we can use the + * sign as trigger. + */ + +static u64 perf_swcounter_set_period(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; - u64 prev, now; - s64 delta; + u64 period = hwc->last_period; + u64 nr, offset; + s64 old, val; + + hwc->last_period = hwc->sample_period; again: - prev = atomic64_read(&hwc->prev_count); - now = atomic64_read(&hwc->count); - if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev) + old = val = atomic64_read(&hwc->period_left); + if (val < 0) + return 0; + + nr = div64_u64(period + val, period); + offset = nr * period; + val -= offset; + if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) goto again; - delta = now - prev; + return nr; +} + +static void perf_swcounter_overflow(struct perf_counter *counter, + int nmi, struct perf_sample_data *data) +{ + struct hw_perf_counter *hwc = &counter->hw; + u64 overflow; + + data->period = counter->hw.last_period; + overflow = perf_swcounter_set_period(counter); + + if (hwc->interrupts == MAX_INTERRUPTS) + return; + + for (; overflow; overflow--) { + if (perf_counter_overflow(counter, nmi, data)) { + /* + * We inhibit the overflow from happening when + * hwc->interrupts == MAX_INTERRUPTS. + */ + break; + } + } +} - atomic64_add(delta, &counter->count); - atomic64_sub(delta, &hwc->period_left); +static void perf_swcounter_unthrottle(struct perf_counter *counter) +{ + /* + * Nothing to do, we already reset hwc->interrupts. + */ } -static void perf_swcounter_set_period(struct perf_counter *counter) +static void perf_swcounter_add(struct perf_counter *counter, u64 nr, + int nmi, struct perf_sample_data *data) { struct hw_perf_counter *hwc = &counter->hw; - s64 left = atomic64_read(&hwc->period_left); - s64 period = hwc->irq_period; - if (unlikely(left <= -period)) { - left = period; - atomic64_set(&hwc->period_left, left); - } + atomic64_add(nr, &counter->count); - if (unlikely(left <= 0)) { - left += period; - atomic64_add(period, &hwc->period_left); - } + if (!hwc->sample_period) + return; + + if (!data->regs) + return; - atomic64_set(&hwc->prev_count, -left); - atomic64_set(&hwc->count, -left); + if (!atomic64_add_negative(nr, &hwc->period_left)) + perf_swcounter_overflow(counter, nmi, data); } -static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) +static int perf_swcounter_is_counting(struct perf_counter *counter) { - enum hrtimer_restart ret = HRTIMER_RESTART; - struct perf_counter *counter; - struct pt_regs *regs; - u64 period; - - counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); - counter->pmu->read(counter); - - regs = get_irq_regs(); /* - * In case we exclude kernel IPs or are somehow not in interrupt - * context, provide the next best thing, the user IP. + * The counter is active, we're good! */ - if ((counter->hw_event.exclude_kernel || !regs) && - !counter->hw_event.exclude_user) - regs = task_pt_regs(current); - - if (regs) { - if (perf_counter_overflow(counter, 0, regs, 0)) - ret = HRTIMER_NORESTART; - } - - period = max_t(u64, 10000, counter->hw.irq_period); - hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + if (counter->state == PERF_COUNTER_STATE_ACTIVE) + return 1; - return ret; -} + /* + * The counter is off/error, not counting. + */ + if (counter->state != PERF_COUNTER_STATE_INACTIVE) + return 0; -static void perf_swcounter_overflow(struct perf_counter *counter, - int nmi, struct pt_regs *regs, u64 addr) -{ - perf_swcounter_update(counter); - perf_swcounter_set_period(counter); - if (perf_counter_overflow(counter, nmi, regs, addr)) - /* soft-disable the counter */ - ; + /* + * The counter is inactive, if the context is active + * we're part of a group that didn't make it on the 'pmu', + * not counting. + */ + if (counter->ctx->is_active) + return 0; + /* + * We're inactive and the context is too, this means the + * task is scheduled out, we're counting events that happen + * to us, like migration events. + */ + return 1; } static int perf_swcounter_match(struct perf_counter *counter, - enum perf_event_types type, + enum perf_type_id type, u32 event, struct pt_regs *regs) { - if (counter->state != PERF_COUNTER_STATE_ACTIVE) - return 0; - - if (perf_event_raw(&counter->hw_event)) + if (!perf_swcounter_is_counting(counter)) return 0; - if (perf_event_type(&counter->hw_event) != type) + if (counter->attr.type != type) return 0; - - if (perf_event_id(&counter->hw_event) != event) + if (counter->attr.config != event) return 0; - if (counter->hw_event.exclude_user && user_mode(regs)) - return 0; + if (regs) { + if (counter->attr.exclude_user && user_mode(regs)) + return 0; - if (counter->hw_event.exclude_kernel && !user_mode(regs)) - return 0; + if (counter->attr.exclude_kernel && !user_mode(regs)) + return 0; + } return 1; } -static void perf_swcounter_add(struct perf_counter *counter, u64 nr, - int nmi, struct pt_regs *regs, u64 addr) -{ - int neg = atomic64_add_negative(nr, &counter->hw.count); - if (counter->hw.irq_period && !neg) - perf_swcounter_overflow(counter, nmi, regs, addr); -} - static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, - enum perf_event_types type, u32 event, - u64 nr, int nmi, struct pt_regs *regs, - u64 addr) + enum perf_type_id type, + u32 event, u64 nr, int nmi, + struct perf_sample_data *data) { struct perf_counter *counter; @@ -2755,8 +3641,8 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_swcounter_match(counter, type, event, regs)) - perf_swcounter_add(counter, nr, nmi, regs, addr); + if (perf_swcounter_match(counter, type, event, data->regs)) + perf_swcounter_add(counter, nr, nmi, data); } rcu_read_unlock(); } @@ -2775,12 +3661,13 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) return &cpuctx->recursion[0]; } -static void __perf_swcounter_event(enum perf_event_types type, u32 event, - u64 nr, int nmi, struct pt_regs *regs, - u64 addr) +static void do_perf_swcounter_event(enum perf_type_id type, u32 event, + u64 nr, int nmi, + struct perf_sample_data *data) { struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); int *recursion = perf_swcounter_recursion_context(cpuctx); + struct perf_counter_context *ctx; if (*recursion) goto out; @@ -2789,11 +3676,16 @@ static void __perf_swcounter_event(enum perf_event_types type, u32 event, barrier(); perf_swcounter_ctx_event(&cpuctx->ctx, type, event, - nr, nmi, regs, addr); - if (cpuctx->task_ctx) { - perf_swcounter_ctx_event(cpuctx->task_ctx, type, event, - nr, nmi, regs, addr); - } + nr, nmi, data); + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_counter_ctxp); + if (ctx) + perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data); + rcu_read_unlock(); barrier(); (*recursion)--; @@ -2802,35 +3694,79 @@ out: put_cpu_var(perf_cpu_context); } -void -perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) +void __perf_swcounter_event(u32 event, u64 nr, int nmi, + struct pt_regs *regs, u64 addr) { - __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr); + struct perf_sample_data data = { + .regs = regs, + .addr = addr, + }; + + do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data); } static void perf_swcounter_read(struct perf_counter *counter) { - perf_swcounter_update(counter); } static int perf_swcounter_enable(struct perf_counter *counter) { - perf_swcounter_set_period(counter); + struct hw_perf_counter *hwc = &counter->hw; + + if (hwc->sample_period) { + hwc->last_period = hwc->sample_period; + perf_swcounter_set_period(counter); + } return 0; } static void perf_swcounter_disable(struct perf_counter *counter) { - perf_swcounter_update(counter); } static const struct pmu perf_ops_generic = { .enable = perf_swcounter_enable, .disable = perf_swcounter_disable, .read = perf_swcounter_read, + .unthrottle = perf_swcounter_unthrottle, }; /* + * hrtimer based swcounter callback + */ + +static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) +{ + enum hrtimer_restart ret = HRTIMER_RESTART; + struct perf_sample_data data; + struct perf_counter *counter; + u64 period; + + counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); + counter->pmu->read(counter); + + data.addr = 0; + data.regs = get_irq_regs(); + /* + * In case we exclude kernel IPs or are somehow not in interrupt + * context, provide the next best thing, the user IP. + */ + if ((counter->attr.exclude_kernel || !data.regs) && + !counter->attr.exclude_user) + data.regs = task_pt_regs(current); + + if (data.regs) { + if (perf_counter_overflow(counter, 0, &data)) + ret = HRTIMER_NORESTART; + } + + period = max_t(u64, 10000, counter->hw.sample_period); + hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + + return ret; +} + +/* * Software counter: cpu wall time clock */ @@ -2854,8 +3790,8 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter) atomic64_set(&hwc->prev_count, cpu_clock(cpu)); hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swcounter_hrtimer; - if (hwc->irq_period) { - u64 period = max_t(u64, 10000, hwc->irq_period); + if (hwc->sample_period) { + u64 period = max_t(u64, 10000, hwc->sample_period); __hrtimer_start_range_ns(&hwc->hrtimer, ns_to_ktime(period), 0, HRTIMER_MODE_REL, 0); @@ -2866,7 +3802,7 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter) static void cpu_clock_perf_counter_disable(struct perf_counter *counter) { - if (counter->hw.irq_period) + if (counter->hw.sample_period) hrtimer_cancel(&counter->hw.hrtimer); cpu_clock_perf_counter_update(counter); } @@ -2906,8 +3842,8 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) atomic64_set(&hwc->prev_count, now); hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swcounter_hrtimer; - if (hwc->irq_period) { - u64 period = max_t(u64, 10000, hwc->irq_period); + if (hwc->sample_period) { + u64 period = max_t(u64, 10000, hwc->sample_period); __hrtimer_start_range_ns(&hwc->hrtimer, ns_to_ktime(period), 0, HRTIMER_MODE_REL, 0); @@ -2918,7 +3854,7 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) static void task_clock_perf_counter_disable(struct perf_counter *counter) { - if (counter->hw.irq_period) + if (counter->hw.sample_period) hrtimer_cancel(&counter->hw.hrtimer); task_clock_perf_counter_update(counter, counter->ctx->time); @@ -2946,67 +3882,25 @@ static const struct pmu perf_ops_task_clock = { .read = task_clock_perf_counter_read, }; -/* - * Software counter: cpu migrations - */ - -static inline u64 get_cpu_migrations(struct perf_counter *counter) -{ - struct task_struct *curr = counter->ctx->task; - - if (curr) - return curr->se.nr_migrations; - return cpu_nr_migrations(smp_processor_id()); -} - -static void cpu_migrations_perf_counter_update(struct perf_counter *counter) -{ - u64 prev, now; - s64 delta; - - prev = atomic64_read(&counter->hw.prev_count); - now = get_cpu_migrations(counter); - - atomic64_set(&counter->hw.prev_count, now); - - delta = now - prev; - - atomic64_add(delta, &counter->count); -} - -static void cpu_migrations_perf_counter_read(struct perf_counter *counter) -{ - cpu_migrations_perf_counter_update(counter); -} - -static int cpu_migrations_perf_counter_enable(struct perf_counter *counter) -{ - if (counter->prev_state <= PERF_COUNTER_STATE_OFF) - atomic64_set(&counter->hw.prev_count, - get_cpu_migrations(counter)); - return 0; -} - -static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) -{ - cpu_migrations_perf_counter_update(counter); -} - -static const struct pmu perf_ops_cpu_migrations = { - .enable = cpu_migrations_perf_counter_enable, - .disable = cpu_migrations_perf_counter_disable, - .read = cpu_migrations_perf_counter_read, -}; - #ifdef CONFIG_EVENT_PROFILE -void perf_tpcounter_event(int event_id) +void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, + int entry_size) { - struct pt_regs *regs = get_irq_regs(); + struct perf_raw_record raw = { + .size = entry_size, + .data = record, + }; + + struct perf_sample_data data = { + .regs = get_irq_regs(), + .addr = addr, + .raw = &raw, + }; - if (!regs) - regs = task_pt_regs(current); + if (!data.regs) + data.regs = task_pt_regs(current); - __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0); + do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data); } EXPORT_SYMBOL_GPL(perf_tpcounter_event); @@ -3015,20 +3909,23 @@ extern void ftrace_profile_disable(int); static void tp_perf_counter_destroy(struct perf_counter *counter) { - ftrace_profile_disable(perf_event_id(&counter->hw_event)); + ftrace_profile_disable(counter->attr.config); } static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) { - int event_id = perf_event_id(&counter->hw_event); - int ret; + /* + * Raw tracepoint data is a severe data leak, only allow root to + * have these. + */ + if ((counter->attr.sample_type & PERF_SAMPLE_RAW) && + !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); - ret = ftrace_profile_enable(event_id); - if (ret) + if (ftrace_profile_enable(counter->attr.config)) return NULL; counter->destroy = tp_perf_counter_destroy; - counter->hw.irq_period = counter->hw_event.irq_period; return &perf_ops_generic; } @@ -3039,9 +3936,21 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) } #endif +atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX]; + +static void sw_perf_counter_destroy(struct perf_counter *counter) +{ + u64 event = counter->attr.config; + + WARN_ON(counter->parent); + + atomic_dec(&perf_swcounter_enabled[event]); +} + static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) { const struct pmu *pmu = NULL; + u64 event = counter->attr.config; /* * Software counters (currently) can't in general distinguish @@ -3050,12 +3959,12 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) * to be kernel events, and page faults are never hypervisor * events. */ - switch (perf_event_id(&counter->hw_event)) { - case PERF_COUNT_CPU_CLOCK: + switch (event) { + case PERF_COUNT_SW_CPU_CLOCK: pmu = &perf_ops_cpu_clock; break; - case PERF_COUNT_TASK_CLOCK: + case PERF_COUNT_SW_TASK_CLOCK: /* * If the user instantiates this as a per-cpu counter, * use the cpu_clock counter instead. @@ -3066,16 +3975,17 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) pmu = &perf_ops_cpu_clock; break; - case PERF_COUNT_PAGE_FAULTS: - case PERF_COUNT_PAGE_FAULTS_MIN: - case PERF_COUNT_PAGE_FAULTS_MAJ: - case PERF_COUNT_CONTEXT_SWITCHES: + case PERF_COUNT_SW_PAGE_FAULTS: + case PERF_COUNT_SW_PAGE_FAULTS_MIN: + case PERF_COUNT_SW_PAGE_FAULTS_MAJ: + case PERF_COUNT_SW_CONTEXT_SWITCHES: + case PERF_COUNT_SW_CPU_MIGRATIONS: + if (!counter->parent) { + atomic_inc(&perf_swcounter_enabled[event]); + counter->destroy = sw_perf_counter_destroy; + } pmu = &perf_ops_generic; break; - case PERF_COUNT_CPU_MIGRATIONS: - if (!counter->hw_event.exclude_kernel) - pmu = &perf_ops_cpu_migrations; - break; } return pmu; @@ -3085,10 +3995,11 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) * Allocate and initialize a counter structure */ static struct perf_counter * -perf_counter_alloc(struct perf_counter_hw_event *hw_event, +perf_counter_alloc(struct perf_counter_attr *attr, int cpu, struct perf_counter_context *ctx, struct perf_counter *group_leader, + struct perf_counter *parent_counter, gfp_t gfpflags) { const struct pmu *pmu; @@ -3117,40 +4028,42 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, mutex_init(&counter->mmap_mutex); - counter->cpu = cpu; - counter->hw_event = *hw_event; - counter->group_leader = group_leader; - counter->pmu = NULL; - counter->ctx = ctx; - counter->oncpu = -1; + counter->cpu = cpu; + counter->attr = *attr; + counter->group_leader = group_leader; + counter->pmu = NULL; + counter->ctx = ctx; + counter->oncpu = -1; - get_ctx(ctx); + counter->parent = parent_counter; - counter->state = PERF_COUNTER_STATE_INACTIVE; - if (hw_event->disabled) + counter->ns = get_pid_ns(current->nsproxy->pid_ns); + counter->id = atomic64_inc_return(&perf_counter_id); + + counter->state = PERF_COUNTER_STATE_INACTIVE; + + if (attr->disabled) counter->state = PERF_COUNTER_STATE_OFF; pmu = NULL; hwc = &counter->hw; - if (hw_event->freq && hw_event->irq_freq) - hwc->irq_period = div64_u64(TICK_NSEC, hw_event->irq_freq); - else - hwc->irq_period = hw_event->irq_period; + hwc->sample_period = attr->sample_period; + if (attr->freq && attr->sample_freq) + hwc->sample_period = 1; + + atomic64_set(&hwc->period_left, hwc->sample_period); /* - * we currently do not support PERF_RECORD_GROUP on inherited counters + * we currently do not support PERF_FORMAT_GROUP on inherited counters */ - if (hw_event->inherit && (hw_event->record_type & PERF_RECORD_GROUP)) - goto done; - - if (perf_event_raw(hw_event)) { - pmu = hw_perf_counter_init(counter); + if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) goto done; - } - switch (perf_event_type(hw_event)) { + switch (attr->type) { + case PERF_TYPE_RAW: case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: pmu = hw_perf_counter_init(counter); break; @@ -3161,6 +4074,9 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, case PERF_TYPE_TRACEPOINT: pmu = tp_perf_counter_init(counter); break; + + default: + break; } done: err = 0; @@ -3170,37 +4086,120 @@ done: err = PTR_ERR(pmu); if (err) { + if (counter->ns) + put_pid_ns(counter->ns); kfree(counter); return ERR_PTR(err); } counter->pmu = pmu; - atomic_inc(&nr_counters); - if (counter->hw_event.mmap) - atomic_inc(&nr_mmap_tracking); - if (counter->hw_event.munmap) - atomic_inc(&nr_munmap_tracking); - if (counter->hw_event.comm) - atomic_inc(&nr_comm_tracking); + if (!counter->parent) { + atomic_inc(&nr_counters); + if (counter->attr.mmap) + atomic_inc(&nr_mmap_counters); + if (counter->attr.comm) + atomic_inc(&nr_comm_counters); + if (counter->attr.task) + atomic_inc(&nr_task_counters); + } return counter; } +static int perf_copy_attr(struct perf_counter_attr __user *uattr, + struct perf_counter_attr *attr) +{ + int ret; + u32 size; + + if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) + return -EFAULT; + + /* + * zero the full structure, so that a short copy will be nice. + */ + memset(attr, 0, sizeof(*attr)); + + ret = get_user(size, &uattr->size); + if (ret) + return ret; + + if (size > PAGE_SIZE) /* silly large */ + goto err_size; + + if (!size) /* abi compat */ + size = PERF_ATTR_SIZE_VER0; + + if (size < PERF_ATTR_SIZE_VER0) + goto err_size; + + /* + * If we're handed a bigger struct than we know of, + * ensure all the unknown bits are 0. + */ + if (size > sizeof(*attr)) { + unsigned long val; + unsigned long __user *addr; + unsigned long __user *end; + + addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr), + sizeof(unsigned long)); + end = PTR_ALIGN((void __user *)uattr + size, + sizeof(unsigned long)); + + for (; addr < end; addr += sizeof(unsigned long)) { + ret = get_user(val, addr); + if (ret) + return ret; + if (val) + goto err_size; + } + } + + ret = copy_from_user(attr, uattr, size); + if (ret) + return -EFAULT; + + /* + * If the type exists, the corresponding creation will verify + * the attr->config. + */ + if (attr->type >= PERF_TYPE_MAX) + return -EINVAL; + + if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) + return -EINVAL; + + if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) + return -EINVAL; + + if (attr->read_format & ~(PERF_FORMAT_MAX-1)) + return -EINVAL; + +out: + return ret; + +err_size: + put_user(sizeof(*attr), &uattr->size); + ret = -E2BIG; + goto out; +} + /** * sys_perf_counter_open - open a performance counter, associate it to a task/cpu * - * @hw_event_uptr: event type attributes for monitoring/sampling + * @attr_uptr: event type attributes for monitoring/sampling * @pid: target pid * @cpu: target cpu * @group_fd: group leader counter fd */ SYSCALL_DEFINE5(perf_counter_open, - const struct perf_counter_hw_event __user *, hw_event_uptr, + struct perf_counter_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { struct perf_counter *counter, *group_leader; - struct perf_counter_hw_event hw_event; + struct perf_counter_attr attr; struct perf_counter_context *ctx; struct file *counter_file = NULL; struct file *group_file = NULL; @@ -3212,8 +4211,19 @@ SYSCALL_DEFINE5(perf_counter_open, if (flags) return -EINVAL; - if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) - return -EFAULT; + ret = perf_copy_attr(attr_uptr, &attr); + if (ret) + return ret; + + if (!attr.exclude_kernel) { + if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + } + + if (attr.freq) { + if (attr.sample_freq > sysctl_perf_counter_sample_rate) + return -EINVAL; + } /* * Get the target context (task or percpu): @@ -3250,12 +4260,12 @@ SYSCALL_DEFINE5(perf_counter_open, /* * Only a group leader can be exclusive or pinned */ - if (hw_event.exclusive || hw_event.pinned) + if (attr.exclusive || attr.pinned) goto err_put_context; } - counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader, - GFP_KERNEL); + counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, + NULL, GFP_KERNEL); ret = PTR_ERR(counter); if (IS_ERR(counter)) goto err_put_context; @@ -3269,8 +4279,10 @@ SYSCALL_DEFINE5(perf_counter_open, goto err_free_put_context; counter->filp = counter_file; + WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); perf_install_in_context(ctx, counter, cpu); + ++ctx->generation; mutex_unlock(&ctx->mutex); counter->owner = current; @@ -3290,7 +4302,7 @@ err_free_put_context: kfree(counter); err_put_context: - put_context(ctx); + put_ctx(ctx); goto out_fput; } @@ -3317,33 +4329,32 @@ inherit_counter(struct perf_counter *parent_counter, if (parent_counter->parent) parent_counter = parent_counter->parent; - child_counter = perf_counter_alloc(&parent_counter->hw_event, + child_counter = perf_counter_alloc(&parent_counter->attr, parent_counter->cpu, child_ctx, - group_leader, GFP_KERNEL); + group_leader, parent_counter, + GFP_KERNEL); if (IS_ERR(child_counter)) return child_counter; + get_ctx(child_ctx); /* * Make the child state follow the state of the parent counter, - * not its hw_event.disabled bit. We hold the parent's mutex, - * so we won't race with perf_counter_{en,dis}able_family. + * not its attr.disabled bit. We hold the parent's mutex, + * so we won't race with perf_counter_{en, dis}able_family. */ if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) child_counter->state = PERF_COUNTER_STATE_INACTIVE; else child_counter->state = PERF_COUNTER_STATE_OFF; + if (parent_counter->attr.freq) + child_counter->hw.sample_period = parent_counter->hw.sample_period; + /* * Link it up in the child's context: */ add_counter_to_ctx(child_counter, child_ctx); - child_counter->parent = parent_counter; - /* - * inherit into child's child as well: - */ - child_counter->hw_event.inherit = 1; - /* * Get a reference to the parent filp - we will fput it * when the child counter exits. This is safe to do because @@ -3355,6 +4366,7 @@ inherit_counter(struct perf_counter *parent_counter, /* * Link this into the parent counter's child list */ + WARN_ON_ONCE(parent_counter->ctx->parent_ctx); mutex_lock(&parent_counter->child_mutex); list_add_tail(&child_counter->child_list, &parent_counter->child_list); mutex_unlock(&parent_counter->child_mutex); @@ -3386,10 +4398,14 @@ static int inherit_group(struct perf_counter *parent_counter, } static void sync_child_counter(struct perf_counter *child_counter, - struct perf_counter *parent_counter) + struct task_struct *child) { + struct perf_counter *parent_counter = child_counter->parent; u64 child_val; + if (child_counter->attr.inherit_stat) + perf_counter_read_event(child_counter, child); + child_val = atomic64_read(&child_counter->count); /* @@ -3404,6 +4420,7 @@ static void sync_child_counter(struct perf_counter *child_counter, /* * Remove this counter from the parent's list */ + WARN_ON_ONCE(parent_counter->ctx->parent_ctx); mutex_lock(&parent_counter->child_mutex); list_del_init(&child_counter->child_list); mutex_unlock(&parent_counter->child_mutex); @@ -3416,9 +4433,9 @@ static void sync_child_counter(struct perf_counter *child_counter, } static void -__perf_counter_exit_task(struct task_struct *child, - struct perf_counter *child_counter, - struct perf_counter_context *child_ctx) +__perf_counter_exit_task(struct perf_counter *child_counter, + struct perf_counter_context *child_ctx, + struct task_struct *child) { struct perf_counter *parent_counter; @@ -3432,18 +4449,13 @@ __perf_counter_exit_task(struct task_struct *child, * counters need to be zapped - but otherwise linger. */ if (parent_counter) { - sync_child_counter(child_counter, parent_counter); + sync_child_counter(child_counter, child); free_counter(child_counter); } } /* * When a child task exits, feed back counter values to parent counters. - * - * Note: we may be running in child context, but the PID is not hashed - * anymore so new counters will not be added. - * (XXX not sure that is true when we get called from flush_old_exec. - * -- paulus) */ void perf_counter_exit_task(struct task_struct *child) { @@ -3451,22 +4463,60 @@ void perf_counter_exit_task(struct task_struct *child) struct perf_counter_context *child_ctx; unsigned long flags; - child_ctx = child->perf_counter_ctxp; - - if (likely(!child_ctx)) + if (likely(!child->perf_counter_ctxp)) { + perf_counter_task(child, NULL, 0); return; + } local_irq_save(flags); + /* + * We can't reschedule here because interrupts are disabled, + * and either child is current or it is a task that can't be + * scheduled, so we are now safe from rescheduling changing + * our context. + */ + child_ctx = child->perf_counter_ctxp; __perf_counter_task_sched_out(child_ctx); + + /* + * Take the context lock here so that if find_get_context is + * reading child->perf_counter_ctxp, we wait until it has + * incremented the context's refcount before we do put_ctx below. + */ + spin_lock(&child_ctx->lock); child->perf_counter_ctxp = NULL; - local_irq_restore(flags); + /* + * If this context is a clone; unclone it so it can't get + * swapped to another process while we're removing all + * the counters from it. + */ + unclone_ctx(child_ctx); + spin_unlock_irqrestore(&child_ctx->lock, flags); + + /* + * Report the task dead after unscheduling the counters so that we + * won't get any samples after PERF_EVENT_EXIT. We can however still + * get a few PERF_EVENT_READ events. + */ + perf_counter_task(child, child_ctx, 0); - mutex_lock(&child_ctx->mutex); + /* + * We can recurse on the same lock type through: + * + * __perf_counter_exit_task() + * sync_child_counter() + * fput(parent_counter->filp) + * perf_release() + * mutex_lock(&ctx->mutex) + * + * But since its the parent context it won't be the same instance. + */ + mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); again: list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, list_entry) - __perf_counter_exit_task(child, child_counter, child_ctx); + __perf_counter_exit_task(child_counter, child_ctx, child); /* * If the last counter was a group counter, it will have appended all @@ -3482,11 +4532,50 @@ again: } /* + * free an unexposed, unused context as created by inheritance by + * init_task below, used by fork() in case of fail. + */ +void perf_counter_free_task(struct task_struct *task) +{ + struct perf_counter_context *ctx = task->perf_counter_ctxp; + struct perf_counter *counter, *tmp; + + if (!ctx) + return; + + mutex_lock(&ctx->mutex); +again: + list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) { + struct perf_counter *parent = counter->parent; + + if (WARN_ON_ONCE(!parent)) + continue; + + mutex_lock(&parent->child_mutex); + list_del_init(&counter->child_list); + mutex_unlock(&parent->child_mutex); + + fput(parent->filp); + + list_del_counter(counter, ctx); + free_counter(counter); + } + + if (!list_empty(&ctx->counter_list)) + goto again; + + mutex_unlock(&ctx->mutex); + + put_ctx(ctx); +} + +/* * Initialize the perf_counter context in task_struct */ int perf_counter_init_task(struct task_struct *child) { struct perf_counter_context *child_ctx, *parent_ctx; + struct perf_counter_context *cloned_ctx; struct perf_counter *counter; struct task_struct *parent = current; int inherited_all = 1; @@ -3497,8 +4586,7 @@ int perf_counter_init_task(struct task_struct *child) mutex_init(&child->perf_counter_mutex); INIT_LIST_HEAD(&child->perf_counter_list); - parent_ctx = parent->perf_counter_ctxp; - if (likely(!parent_ctx || !parent_ctx->nr_counters)) + if (likely(!parent->perf_counter_ctxp)) return 0; /* @@ -3513,6 +4601,20 @@ int perf_counter_init_task(struct task_struct *child) __perf_counter_init_context(child_ctx, child); child->perf_counter_ctxp = child_ctx; + get_task_struct(child); + + /* + * If the parent's context is a clone, pin it so it won't get + * swapped under us. + */ + parent_ctx = perf_pin_task_context(parent); + + /* + * No need to check if parent_ctx != NULL here; since we saw + * it non-NULL earlier, the only reason for it to become NULL + * is if we exit, and since we're currently in the middle of + * a fork we can't be exiting at the same time. + */ /* * Lock the parent list. No need to lock the child - not PID @@ -3528,7 +4630,7 @@ int perf_counter_init_task(struct task_struct *child) if (counter != counter->group_leader) continue; - if (!counter->hw_event.inherit) { + if (!counter->attr.inherit) { inherited_all = 0; continue; } @@ -3545,9 +4647,14 @@ int perf_counter_init_task(struct task_struct *child) /* * Mark the child context as a clone of the parent * context, or of whatever the parent is a clone of. + * Note that if the parent is a clone, it could get + * uncloned at any point, but that doesn't matter + * because the list of counters and the generation + * count can't have changed since we took the mutex. */ - if (parent_ctx->parent_ctx) { - child_ctx->parent_ctx = parent_ctx->parent_ctx; + cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); + if (cloned_ctx) { + child_ctx->parent_ctx = cloned_ctx; child_ctx->parent_gen = parent_ctx->parent_gen; } else { child_ctx->parent_ctx = parent_ctx; @@ -3558,6 +4665,8 @@ int perf_counter_init_task(struct task_struct *child) mutex_unlock(&parent_ctx->mutex); + perf_unpin_context(parent_ctx); + return ret; } @@ -3610,6 +4719,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) perf_counter_init_cpu(cpu); break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + hw_perf_counter_setup_online(cpu); + break; + case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: perf_counter_exit_cpu(cpu); @@ -3622,14 +4736,20 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) return NOTIFY_OK; } +/* + * This has to have a higher priority than migration_notifier in sched.c. + */ static struct notifier_block __cpuinitdata perf_cpu_nb = { .notifier_call = perf_cpu_notify, + .priority = 20, }; void __init perf_counter_init(void) { perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); + perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, + (void *)(long)smp_processor_id()); register_cpu_notifier(&perf_cpu_nb); }