X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=kernel%2Fperf_event.c;h=2a060be3b07fb98075cf995ce0c292af2c924b83;hb=dfacc4d6c98b89609250269f518c1f54c30454ef;hp=d8108465397d90ee32f44e94e901aeca28a961a3;hpb=32975a4f114be52286f9a5bf6c230dbb8c0e1903;p=safe%2Fjmp%2Flinux-2.6 diff --git a/kernel/perf_event.c b/kernel/perf_event.c index d810846..2a060be 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include #include #include @@ -81,14 +83,6 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) void __weak hw_perf_disable(void) { barrier(); } void __weak hw_perf_enable(void) { barrier(); } -int __weak -hw_perf_group_sched_in(struct perf_event *group_leader, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - return 0; -} - void __weak perf_event_print_debug(void) { } static DEFINE_PER_CPU(int, perf_disable_count); @@ -261,6 +255,18 @@ static void update_event_times(struct perf_event *event) event->total_time_running = run_end - event->tstamp_running; } +/* + * Update total_time_enabled and total_time_running for all events in a group. + */ +static void update_group_times(struct perf_event *leader) +{ + struct perf_event *event; + + update_event_times(leader); + list_for_each_entry(event, &leader->sibling_list, group_entry) + update_event_times(event); +} + static struct list_head * ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) { @@ -314,8 +320,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { - struct perf_event *sibling, *tmp; - if (list_empty(&event->group_entry)) return; ctx->nr_events--; @@ -328,7 +332,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (event->group_leader != event) event->group_leader->nr_siblings--; - update_event_times(event); + update_group_times(event); /* * If event was in error state, then keep it @@ -339,6 +343,12 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) */ if (event->state > PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_OFF; +} + +static void +perf_destroy_group(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_event *sibling, *tmp; /* * If this was a group event with sibling events then @@ -504,18 +514,6 @@ retry: } /* - * Update total_time_enabled and total_time_running for all events in a group. - */ -static void update_group_times(struct perf_event *leader) -{ - struct perf_event *event; - - update_event_times(leader); - list_for_each_entry(event, &leader->sibling_list, group_entry) - update_event_times(event); -} - -/* * Cross CPU call to disable a performance event */ static void __perf_event_disable(void *info) @@ -639,15 +637,20 @@ group_sched_in(struct perf_event *group_event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - struct perf_event *event, *partial_group; + struct perf_event *event, *partial_group = NULL; + const struct pmu *pmu = group_event->pmu; + bool txn = false; int ret; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; - ret = hw_perf_group_sched_in(group_event, cpuctx, ctx); - if (ret) - return ret < 0 ? ret : 0; + /* Check if group transaction availabe */ + if (pmu->start_txn) + txn = true; + + if (txn) + pmu->start_txn(pmu); if (event_sched_in(group_event, cpuctx, ctx)) return -EAGAIN; @@ -662,9 +665,19 @@ group_sched_in(struct perf_event *group_event, } } - return 0; + if (!txn) + return 0; + + ret = pmu->commit_txn(pmu); + if (!ret) { + pmu->cancel_txn(pmu); + return 0; + } group_error: + if (txn) + pmu->cancel_txn(pmu); + /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: @@ -1164,11 +1177,9 @@ void perf_event_task_sched_out(struct task_struct *task, struct perf_event_context *ctx = task->perf_event_ctxp; struct perf_event_context *next_ctx; struct perf_event_context *parent; - struct pt_regs *regs; int do_switch = 1; - regs = task_pt_regs(task); - perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); + perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); if (likely(!ctx || !cpuctx->task_ctx)) return; @@ -1368,6 +1379,8 @@ void perf_event_task_sched_in(struct task_struct *task) if (cpuctx->task_ctx == ctx) return; + perf_disable(); + /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -1380,6 +1393,8 @@ void perf_event_task_sched_in(struct task_struct *task) ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); cpuctx->task_ctx = ctx; + + perf_enable(); } #define MAX_INTERRUPTS (~0ULL) @@ -1524,12 +1539,15 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) */ if (interrupts == MAX_INTERRUPTS) { perf_log_throttle(event, 1); + perf_disable(); event->pmu->unthrottle(event); + perf_enable(); } if (!event->attr.freq || !event->attr.sample_freq) continue; + perf_disable(); event->pmu->read(event); now = atomic64_read(&event->count); delta = now - hwc->freq_count_stamp; @@ -1537,6 +1555,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) if (delta > 0) perf_adjust_period(event, TICK_NSEC, delta); + perf_enable(); } raw_spin_unlock(&ctx->lock); } @@ -1546,9 +1565,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) */ static void rotate_ctx(struct perf_event_context *ctx) { - if (!ctx->nr_events) - return; - raw_spin_lock(&ctx->lock); /* Rotate the first entry last of non-pinned groups */ @@ -1561,19 +1577,28 @@ void perf_event_task_tick(struct task_struct *curr) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; + int rotate = 0; if (!atomic_read(&nr_events)) return; cpuctx = &__get_cpu_var(perf_cpu_context); - ctx = curr->perf_event_ctxp; + if (cpuctx->ctx.nr_events && + cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) + rotate = 1; - perf_disable(); + ctx = curr->perf_event_ctxp; + if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active) + rotate = 1; perf_ctx_adjust_freq(&cpuctx->ctx); if (ctx) perf_ctx_adjust_freq(ctx); + if (!rotate) + return; + + perf_disable(); cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); if (ctx) task_ctx_sched_out(ctx, EVENT_FLEXIBLE); @@ -1585,7 +1610,6 @@ void perf_event_task_tick(struct task_struct *curr) cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); if (ctx) task_ctx_sched_in(curr, EVENT_FLEXIBLE); - perf_enable(); } @@ -1848,9 +1872,30 @@ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; + /* + * Remove from the PMU, can't get re-enabled since we got + * here because the last ref went. + */ + perf_event_disable(event); + WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - perf_event_remove_from_context(event); + /* + * There are two ways this annotation is useful: + * + * 1) there is a lock recursion from perf_event_exit_task + * see the comment there. + * + * 2) there is a lock-inversion with mmap_sem through + * perf_event_read_group(), which takes faults while + * holding ctx->mutex, however this is called after + * the last filedesc died, so there is no possibility + * to trigger the AB-BA case. + */ + mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); + raw_spin_lock_irq(&ctx->lock); + list_del_event(event, ctx); + perf_destroy_group(event, ctx); + raw_spin_unlock_irq(&ctx->lock); mutex_unlock(&ctx->mutex); mutex_lock(&event->owner->perf_event_mutex); @@ -2275,6 +2320,19 @@ perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) return virt_to_page(data->data_pages[pgoff - 1]); } +static void *perf_mmap_alloc_page(int cpu) +{ + struct page *page; + int node; + + node = (cpu == -1) ? cpu : cpu_to_node(cpu); + page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + if (!page) + return NULL; + + return page_address(page); +} + static struct perf_mmap_data * perf_mmap_data_alloc(struct perf_event *event, int nr_pages) { @@ -2291,12 +2349,12 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages) if (!data) goto fail; - data->user_page = (void *)get_zeroed_page(GFP_KERNEL); + data->user_page = perf_mmap_alloc_page(event->cpu); if (!data->user_page) goto fail_user_page; for (i = 0; i < nr_pages; i++) { - data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); + data->data_pages[i] = perf_mmap_alloc_page(event->cpu); if (!data->data_pages[i]) goto fail_data_pages; } @@ -2461,8 +2519,6 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data) { long max_size = perf_data_size(data); - atomic_set(&data->lock, -1); - if (event->attr.watermark) { data->watermark = min_t(long, max_size, event->attr.wakeup_watermark); @@ -2535,6 +2591,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) long user_extra, extra; int ret = 0; + /* + * Don't allow mmap() of inherited per-task counters. This would + * create a performance issue due to all children writing to the + * same buffer. + */ + if (event->cpu == -1 && event->attr.inherit) + return -EINVAL; + if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; @@ -2634,6 +2698,7 @@ static int perf_fasync(int fd, struct file *filp, int on) } static const struct file_operations perf_fops = { + .llseek = no_llseek, .release = perf_release, .read = perf_read, .poll = perf_poll, @@ -2777,6 +2842,33 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return NULL; } +__weak +void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) +{ +} + + +/* + * We assume there is only KVM supporting the callbacks. + * Later on, we might change it to a list if there is + * another virtualization implementation supporting the callbacks. + */ +struct perf_guest_info_callbacks *perf_guest_cbs; + +int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +{ + perf_guest_cbs = cbs; + return 0; +} +EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); + +int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +{ + perf_guest_cbs = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); + /* * Output */ @@ -2812,82 +2904,57 @@ static void perf_output_wakeup(struct perf_output_handle *handle) } /* - * Curious locking construct. - * * We need to ensure a later event_id doesn't publish a head when a former - * event_id isn't done writing. However since we need to deal with NMIs we + * event isn't done writing. However since we need to deal with NMIs we * cannot fully serialize things. * - * What we do is serialize between CPUs so we only have to deal with NMI - * nesting on a single CPU. - * * We only publish the head (and generate a wakeup) when the outer-most - * event_id completes. + * event completes. */ -static void perf_output_lock(struct perf_output_handle *handle) +static void perf_output_get_handle(struct perf_output_handle *handle) { struct perf_mmap_data *data = handle->data; - int cur, cpu = get_cpu(); - - handle->locked = 0; - for (;;) { - cur = atomic_cmpxchg(&data->lock, -1, cpu); - if (cur == -1) { - handle->locked = 1; - break; - } - if (cur == cpu) - break; - - cpu_relax(); - } + preempt_disable(); + local_inc(&data->nest); + handle->wakeup = local_read(&data->wakeup); } -static void perf_output_unlock(struct perf_output_handle *handle) +static void perf_output_put_handle(struct perf_output_handle *handle) { struct perf_mmap_data *data = handle->data; unsigned long head; - int cpu; - - data->done_head = data->head; - - if (!handle->locked) - goto out; again: - /* - * The xchg implies a full barrier that ensures all writes are done - * before we publish the new head, matched by a rmb() in userspace when - * reading this position. - */ - while ((head = atomic_long_xchg(&data->done_head, 0))) - data->user_page->data_head = head; + head = local_read(&data->head); /* - * NMI can happen here, which means we can miss a done_head update. + * IRQ/NMI can happen here, which means we can miss a head update. */ - cpu = atomic_xchg(&data->lock, -1); - WARN_ON_ONCE(cpu != smp_processor_id()); + if (!local_dec_and_test(&data->nest)) + return; /* - * Therefore we have to validate we did not indeed do so. + * Publish the known good head. Rely on the full barrier implied + * by atomic_dec_and_test() order the data->head read and this + * write. */ - if (unlikely(atomic_long_read(&data->done_head))) { - /* - * Since we had it locked, we can lock it again. - */ - while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) - cpu_relax(); + data->user_page->data_head = head; + /* + * Now check if we missed an update, rely on the (compiler) + * barrier in atomic_dec_and_test() to re-read data->head. + */ + if (unlikely(head != local_read(&data->head))) { + local_inc(&data->nest); goto again; } - if (atomic_xchg(&data->wakeup, 0)) + if (handle->wakeup != local_read(&data->wakeup)) perf_output_wakeup(handle); -out: - put_cpu(); + + preempt_enable(); } void perf_output_copy(struct perf_output_handle *handle, @@ -2963,13 +3030,13 @@ int perf_output_begin(struct perf_output_handle *handle, handle->sample = sample; if (!data->nr_pages) - goto fail; + goto out; - have_lost = atomic_read(&data->lost); + have_lost = local_read(&data->lost); if (have_lost) size += sizeof(lost_event); - perf_output_lock(handle); + perf_output_get_handle(handle); do { /* @@ -2979,24 +3046,24 @@ int perf_output_begin(struct perf_output_handle *handle, */ tail = ACCESS_ONCE(data->user_page->data_tail); smp_rmb(); - offset = head = atomic_long_read(&data->head); + offset = head = local_read(&data->head); head += size; if (unlikely(!perf_output_space(data, tail, offset, head))) goto fail; - } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); + } while (local_cmpxchg(&data->head, offset, head) != offset); handle->offset = offset; handle->head = head; if (head - tail > data->watermark) - atomic_set(&data->wakeup, 1); + local_inc(&data->wakeup); if (have_lost) { lost_event.header.type = PERF_RECORD_LOST; lost_event.header.misc = 0; lost_event.header.size = sizeof(lost_event); lost_event.id = event->id; - lost_event.lost = atomic_xchg(&data->lost, 0); + lost_event.lost = local_xchg(&data->lost, 0); perf_output_put(handle, lost_event); } @@ -3004,8 +3071,8 @@ int perf_output_begin(struct perf_output_handle *handle, return 0; fail: - atomic_inc(&data->lost); - perf_output_unlock(handle); + local_inc(&data->lost); + perf_output_put_handle(handle); out: rcu_read_unlock(); @@ -3020,14 +3087,14 @@ void perf_output_end(struct perf_output_handle *handle) int wakeup_events = event->attr.wakeup_events; if (handle->sample && wakeup_events) { - int events = atomic_inc_return(&data->events); + int events = local_inc_return(&data->events); if (events >= wakeup_events) { - atomic_sub(wakeup_events, &data->events); - atomic_set(&data->wakeup, 1); + local_sub(wakeup_events, &data->events); + local_inc(&data->wakeup); } } - perf_output_unlock(handle); + perf_output_put_handle(handle); rcu_read_unlock(); } @@ -3362,9 +3429,8 @@ static void perf_event_task_output(struct perf_event *event, struct perf_task_event *task_event) { struct perf_output_handle handle; - int size; struct task_struct *task = task_event->task; - int ret; + int size, ret; size = task_event->event_id.header.size; ret = perf_output_begin(&handle, event, size, 0, 0); @@ -3720,7 +3786,7 @@ void __perf_event_mmap(struct vm_area_struct *vma) .event_id = { .header = { .type = PERF_RECORD_MMAP, - .misc = 0, + .misc = PERF_RECORD_MISC_USER, /* .size */ }, /* .pid */ @@ -3938,36 +4004,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, perf_swevent_overflow(event, 0, nmi, data, regs); } -static int perf_swevent_is_counting(struct perf_event *event) -{ - /* - * The event is active, we're good! - */ - if (event->state == PERF_EVENT_STATE_ACTIVE) - return 1; - - /* - * The event is off/error, not counting. - */ - if (event->state != PERF_EVENT_STATE_INACTIVE) - return 0; - - /* - * The event is inactive, if the context is active - * we're part of a group that didn't make it on the 'pmu', - * not counting. - */ - if (event->ctx->is_active) - return 0; - - /* - * We're inactive and the context is too, this means the - * task is scheduled out, we're counting events that happen - * to us, like migration events. - */ - return 1; -} - static int perf_tp_event_match(struct perf_event *event, struct perf_sample_data *data); @@ -3991,12 +4027,6 @@ static int perf_swevent_match(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { - if (event->cpu != -1 && event->cpu != smp_processor_id()) - return 0; - - if (!perf_swevent_is_counting(event)) - return 0; - if (event->attr.type != type) return 0; @@ -4013,18 +4043,80 @@ static int perf_swevent_match(struct perf_event *event, return 1; } -static void perf_swevent_ctx_event(struct perf_event_context *ctx, - enum perf_type_id type, - u32 event_id, u64 nr, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) +static inline u64 swevent_hash(u64 type, u32 event_id) +{ + u64 val = event_id | (type << 32); + + return hash_64(val, SWEVENT_HLIST_BITS); +} + +static inline struct hlist_head * +__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) +{ + u64 hash = swevent_hash(type, event_id); + + return &hlist->heads[hash]; +} + +/* For the read side: events when they trigger */ +static inline struct hlist_head * +find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) +{ + struct swevent_hlist *hlist; + + hlist = rcu_dereference(ctx->swevent_hlist); + if (!hlist) + return NULL; + + return __find_swevent_head(hlist, type, event_id); +} + +/* For the event head insertion and removal in the hlist */ +static inline struct hlist_head * +find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) +{ + struct swevent_hlist *hlist; + u32 event_id = event->attr.config; + u64 type = event->attr.type; + + /* + * Event scheduling is always serialized against hlist allocation + * and release. Which makes the protected version suitable here. + * The context lock guarantees that. + */ + hlist = rcu_dereference_protected(ctx->swevent_hlist, + lockdep_is_held(&event->ctx->lock)); + if (!hlist) + return NULL; + + return __find_swevent_head(hlist, type, event_id); +} + +static void do_perf_sw_event(enum perf_type_id type, u32 event_id, + u64 nr, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) { + struct perf_cpu_context *cpuctx; struct perf_event *event; + struct hlist_node *node; + struct hlist_head *head; - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + cpuctx = &__get_cpu_var(perf_cpu_context); + + rcu_read_lock(); + + head = find_swevent_head_rcu(cpuctx, type, event_id); + + if (!head) + goto end; + + hlist_for_each_entry_rcu(event, node, head, hlist_entry) { if (perf_swevent_match(event, type, event_id, data, regs)) perf_swevent_add(event, nr, nmi, data, regs); } +end: + rcu_read_unlock(); } int perf_swevent_get_recursion_context(void) @@ -4062,27 +4154,6 @@ void perf_swevent_put_recursion_context(int rctx) } EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); -static void do_perf_sw_event(enum perf_type_id type, u32 event_id, - u64 nr, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; - - cpuctx = &__get_cpu_var(perf_cpu_context); - rcu_read_lock(); - perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, - nr, nmi, data, regs); - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ - ctx = rcu_dereference(current->perf_event_ctxp); - if (ctx) - perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); - rcu_read_unlock(); -} void __perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) @@ -4108,16 +4179,28 @@ static void perf_swevent_read(struct perf_event *event) static int perf_swevent_enable(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; + struct perf_cpu_context *cpuctx; + struct hlist_head *head; + + cpuctx = &__get_cpu_var(perf_cpu_context); if (hwc->sample_period) { hwc->last_period = hwc->sample_period; perf_swevent_set_period(event); } + + head = find_swevent_head(cpuctx, event); + if (WARN_ON_ONCE(!head)) + return -EINVAL; + + hlist_add_head_rcu(&event->hlist_entry, head); + return 0; } static void perf_swevent_disable(struct perf_event *event) { + hlist_del_rcu(&event->hlist_entry); } static const struct pmu perf_ops_generic = { @@ -4145,15 +4228,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) perf_sample_data_init(&data, 0); data.period = event->hw.last_period; regs = get_irq_regs(); - /* - * In case we exclude kernel IPs or are somehow not in interrupt - * context, provide the next best thing, the user IP. - */ - if ((event->attr.exclude_kernel || !regs) && - !event->attr.exclude_user) - regs = task_pt_regs(current); - if (regs) { + if (regs && !perf_exclude_event(event, regs)) { if (!(event->attr.exclude_idle && current->pid == 0)) if (perf_event_overflow(event, 0, &data, regs)) ret = HRTIMER_NORESTART; @@ -4301,12 +4377,118 @@ static const struct pmu perf_ops_task_clock = { .read = task_clock_perf_event_read, }; +/* Deref the hlist from the update side */ +static inline struct swevent_hlist * +swevent_hlist_deref(struct perf_cpu_context *cpuctx) +{ + return rcu_dereference_protected(cpuctx->swevent_hlist, + lockdep_is_held(&cpuctx->hlist_mutex)); +} + +static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) +{ + struct swevent_hlist *hlist; + + hlist = container_of(rcu_head, struct swevent_hlist, rcu_head); + kfree(hlist); +} + +static void swevent_hlist_release(struct perf_cpu_context *cpuctx) +{ + struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx); + + if (!hlist) + return; + + rcu_assign_pointer(cpuctx->swevent_hlist, NULL); + call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); +} + +static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + + mutex_lock(&cpuctx->hlist_mutex); + + if (!--cpuctx->hlist_refcount) + swevent_hlist_release(cpuctx); + + mutex_unlock(&cpuctx->hlist_mutex); +} + +static void swevent_hlist_put(struct perf_event *event) +{ + int cpu; + + if (event->cpu != -1) { + swevent_hlist_put_cpu(event, event->cpu); + return; + } + + for_each_possible_cpu(cpu) + swevent_hlist_put_cpu(event, cpu); +} + +static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + int err = 0; + + mutex_lock(&cpuctx->hlist_mutex); + + if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) { + struct swevent_hlist *hlist; + + hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); + if (!hlist) { + err = -ENOMEM; + goto exit; + } + rcu_assign_pointer(cpuctx->swevent_hlist, hlist); + } + cpuctx->hlist_refcount++; + exit: + mutex_unlock(&cpuctx->hlist_mutex); + + return err; +} + +static int swevent_hlist_get(struct perf_event *event) +{ + int err; + int cpu, failed_cpu; + + if (event->cpu != -1) + return swevent_hlist_get_cpu(event, event->cpu); + + get_online_cpus(); + for_each_possible_cpu(cpu) { + err = swevent_hlist_get_cpu(event, cpu); + if (err) { + failed_cpu = cpu; + goto fail; + } + } + put_online_cpus(); + + return 0; + fail: + for_each_possible_cpu(cpu) { + if (cpu == failed_cpu) + break; + swevent_hlist_put_cpu(event, cpu); + } + + put_online_cpus(); + return err; +} + #ifdef CONFIG_EVENT_TRACING void perf_tp_event(int event_id, u64 addr, u64 count, void *record, - int entry_size) + int entry_size, struct pt_regs *regs, void *event) { - struct pt_regs *regs = get_irq_regs(); + const int type = PERF_TYPE_TRACEPOINT; struct perf_sample_data data; struct perf_raw_record raw = { .size = entry_size, @@ -4316,12 +4498,13 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record, perf_sample_data_init(&data, addr); data.raw = &raw; - if (!regs) - regs = task_pt_regs(current); + if (!event) { + do_perf_sw_event(type, event_id, count, 1, &data, regs); + return; + } - /* Trace events already protected against recursion */ - do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, - &data, regs); + if (perf_swevent_match(event, type, event_id, &data, regs)) + perf_swevent_add(event, count, 1, &data, regs); } EXPORT_SYMBOL_GPL(perf_tp_event); @@ -4337,11 +4520,14 @@ static int perf_tp_event_match(struct perf_event *event, static void tp_perf_event_destroy(struct perf_event *event) { - ftrace_profile_disable(event->attr.config); + perf_trace_disable(event->attr.config); + swevent_hlist_put(event); } static const struct pmu *tp_perf_event_init(struct perf_event *event) { + int err; + /* * Raw tracepoint data is a severe data leak, only allow root to * have these. @@ -4351,10 +4537,15 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event) !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - if (ftrace_profile_enable(event->attr.config)) + if (perf_trace_enable(event->attr.config, event)) return NULL; event->destroy = tp_perf_event_destroy; + err = swevent_hlist_get(event); + if (err) { + perf_trace_disable(event->attr.config); + return ERR_PTR(err); + } return &perf_ops_generic; } @@ -4455,6 +4646,7 @@ static void sw_perf_event_destroy(struct perf_event *event) WARN_ON(event->parent); atomic_dec(&perf_swevent_enabled[event_id]); + swevent_hlist_put(event); } static const struct pmu *sw_perf_event_init(struct perf_event *event) @@ -4493,6 +4685,12 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event) case PERF_COUNT_SW_ALIGNMENT_FAULTS: case PERF_COUNT_SW_EMULATION_FAULTS: if (!event->parent) { + int err; + + err = swevent_hlist_get(event); + if (err) + return ERR_PTR(err); + atomic_inc(&perf_swevent_enabled[event_id]); event->destroy = sw_perf_event_destroy; } @@ -4878,7 +5076,7 @@ err_fput_free_put_context: err_free_put_context: if (err < 0) - kfree(event); + free_event(event); err_put_context: if (err < 0) @@ -5157,7 +5355,7 @@ void perf_event_exit_task(struct task_struct *child) * * But since its the parent context it won't be the same instance. */ - mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); + mutex_lock(&child_ctx->mutex); again: list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, @@ -5358,16 +5556,37 @@ int perf_event_init_task(struct task_struct *child) return ret; } +static void __init perf_event_init_all_cpus(void) +{ + int cpu; + struct perf_cpu_context *cpuctx; + + for_each_possible_cpu(cpu) { + cpuctx = &per_cpu(perf_cpu_context, cpu); + mutex_init(&cpuctx->hlist_mutex); + __perf_event_init_context(&cpuctx->ctx, NULL); + } +} + static void __cpuinit perf_event_init_cpu(int cpu) { struct perf_cpu_context *cpuctx; cpuctx = &per_cpu(perf_cpu_context, cpu); - __perf_event_init_context(&cpuctx->ctx, NULL); spin_lock(&perf_resource_lock); cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; spin_unlock(&perf_resource_lock); + + mutex_lock(&cpuctx->hlist_mutex); + if (cpuctx->hlist_refcount > 0) { + struct swevent_hlist *hlist; + + hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); + WARN_ON_ONCE(!hlist); + rcu_assign_pointer(cpuctx->swevent_hlist, hlist); + } + mutex_unlock(&cpuctx->hlist_mutex); } #ifdef CONFIG_HOTPLUG_CPU @@ -5387,6 +5606,10 @@ static void perf_event_exit_cpu(int cpu) struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); struct perf_event_context *ctx = &cpuctx->ctx; + mutex_lock(&cpuctx->hlist_mutex); + swevent_hlist_release(cpuctx); + mutex_unlock(&cpuctx->hlist_mutex); + mutex_lock(&ctx->mutex); smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); mutex_unlock(&ctx->mutex); @@ -5429,6 +5652,7 @@ static struct notifier_block __cpuinitdata perf_cpu_nb = { void __init perf_event_init(void) { + perf_event_init_all_cpus(); perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,