Merge branch 'perf/urgent' of git://git.kernel.org/pub/scm/linux/kernel/git/frederic...
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 511677b..2a060be 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2320,6 +2320,19 @@ perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
        return virt_to_page(data->data_pages[pgoff - 1]);
 }
 
+static void *perf_mmap_alloc_page(int cpu)
+{
+       struct page *page;
+       int node;
+
+       node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+       page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+       if (!page)
+               return NULL;
+
+       return page_address(page);
+}
+
 static struct perf_mmap_data *
 perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
 {
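
The helper above places each ring-buffer page on the NUMA node of the CPU the event is bound to, and falls back to a node-agnostic zeroed allocation for per-task events (cpu == -1). The user-space sketch below only illustrates the same placement idea with libnuma; it is not part of the patch, and alloc_page_for_cpu() is an invented name.

#include <numa.h>
#include <stdlib.h>
#include <string.h>

/* Illustration only: allocate a zeroed buffer near the CPU that will write it. */
static void *alloc_page_for_cpu(int cpu, size_t size)
{
        int node = (cpu == -1 || numa_available() < 0) ? -1 : numa_node_of_cpu(cpu);
        void *buf;

        if (node < 0)
                buf = malloc(size);                     /* no placement hint */
        else
                buf = numa_alloc_onnode(size, node);    /* node-local, as cpu_to_node() above */

        if (buf)
                memset(buf, 0, size);                   /* mirrors __GFP_ZERO */
        return buf;
}

Note that the two paths pair with free() and numa_free() respectively, whereas the kernel helper's pages are all released with free_page().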
@@ -2336,12 +2349,12 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
        if (!data)
                goto fail;
 
-       data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
+       data->user_page = perf_mmap_alloc_page(event->cpu);
        if (!data->user_page)
                goto fail_user_page;
 
        for (i = 0; i < nr_pages; i++) {
-               data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
+               data->data_pages[i] = perf_mmap_alloc_page(event->cpu);
                if (!data->data_pages[i])
                        goto fail_data_pages;
        }
@@ -2506,8 +2519,6 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
 {
        long max_size = perf_data_size(data);
 
-       atomic_set(&data->lock, -1);
-
        if (event->attr.watermark) {
                data->watermark = min_t(long, max_size,
                                        event->attr.wakeup_watermark);
@@ -2580,6 +2591,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
        long user_extra, extra;
        int ret = 0;
 
+       /*
+        * Don't allow mmap() of inherited per-task counters. This would
+        * create a performance issue due to all children writing to the
+        * same buffer.
+        */
+       if (event->cpu == -1 && event->attr.inherit)
+               return -EINVAL;
+
        if (!(vma->vm_flags & VM_SHARED))
                return -EINVAL;
 
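
From user space the new restriction looks roughly like the sketch below: an inherited per-task counter (pid = 0, cpu = -1, attr.inherit = 1) can still be opened and read, but mapping its ring buffer should now fail with EINVAL. This is an illustration, not part of the patch; the raw syscall is the usual way to reach perf_event_open().

#include <linux/perf_event.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        struct perf_event_attr attr;
        long page = sysconf(_SC_PAGESIZE);
        void *rb;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size    = sizeof(attr);
        attr.type    = PERF_TYPE_HARDWARE;
        attr.config  = PERF_COUNT_HW_CPU_CYCLES;
        attr.inherit = 1;                               /* inherited by children */

        /* pid = 0 (this task), cpu = -1 (any CPU), no group, no flags */
        fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        /* one metadata page plus one data page */
        rb = mmap(NULL, 2 * page, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (rb == MAP_FAILED)
                perror("mmap");                         /* expected: Invalid argument */

        close(fd);
        return 0;
}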
@@ -2885,82 +2904,57 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
 }
 
 /*
- * Curious locking construct.
- *
  * We need to ensure a later event_id doesn't publish a head when a former
- * event_id isn't done writing. However since we need to deal with NMIs we
+ * event isn't done writing. However, since we need to deal with NMIs we
  * cannot fully serialize things.
  *
- * What we do is serialize between CPUs so we only have to deal with NMI
- * nesting on a single CPU.
- *
  * We only publish the head (and generate a wakeup) when the outer-most
- * event_id completes.
+ * event completes.
  */
-static void perf_output_lock(struct perf_output_handle *handle)
+static void perf_output_get_handle(struct perf_output_handle *handle)
 {
        struct perf_mmap_data *data = handle->data;
-       int cur, cpu = get_cpu();
-
-       handle->locked = 0;
-
-       for (;;) {
-               cur = atomic_cmpxchg(&data->lock, -1, cpu);
-               if (cur == -1) {
-                       handle->locked = 1;
-                       break;
-               }
-               if (cur == cpu)
-                       break;
 
-               cpu_relax();
-       }
+       preempt_disable();
+       local_inc(&data->nest);
+       handle->wakeup = local_read(&data->wakeup);
 }
 
-static void perf_output_unlock(struct perf_output_handle *handle)
+static void perf_output_put_handle(struct perf_output_handle *handle)
 {
        struct perf_mmap_data *data = handle->data;
        unsigned long head;
-       int cpu;
-
-       data->done_head = data->head;
-
-       if (!handle->locked)
-               goto out;
 
 again:
-       /*
-        * The xchg implies a full barrier that ensures all writes are done
-        * before we publish the new head, matched by a rmb() in userspace when
-        * reading this position.
-        */
-       while ((head = atomic_long_xchg(&data->done_head, 0)))
-               data->user_page->data_head = head;
+       head = local_read(&data->head);
 
        /*
-        * NMI can happen here, which means we can miss a done_head update.
+        * IRQ/NMI can happen here, which means we can miss a head update.
         */
 
-       cpu = atomic_xchg(&data->lock, -1);
-       WARN_ON_ONCE(cpu != smp_processor_id());
+       if (!local_dec_and_test(&data->nest))
+               return;
 
        /*
-        * Therefore we have to validate we did not indeed do so.
+        * Publish the known good head. Rely on the full barrier implied
+        * by local_dec_and_test() to order the data->head read and this
+        * write.
         */
-       if (unlikely(atomic_long_read(&data->done_head))) {
-               /*
-                * Since we had it locked, we can lock it again.
-                */
-               while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
-                       cpu_relax();
+       data->user_page->data_head = head;
 
+       /*
+        * Now check if we missed an update, rely on the (compiler)
+        * barrier in local_dec_and_test() to re-read data->head.
+        */
+       if (unlikely(head != local_read(&data->head))) {
+               local_inc(&data->nest);
                goto again;
        }
 
-       if (atomic_xchg(&data->wakeup, 0))
+       if (handle->wakeup != local_read(&data->wakeup))
                perf_output_wakeup(handle);
-out:
-       put_cpu();
+
+       preempt_enable();
 }
 
 void perf_output_copy(struct perf_output_handle *handle,
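
To see the new nesting scheme in isolation, the sketch below re-creates the get/put pattern with C11 atomics standing in for local_t and for the barriers the comments rely on (seq_cst defaults). It is an illustration, not part of the patch: struct ring, output_get(), output_put() and wake_readers() are invented names, and user space has no preemption or NMI nesting to worry about.

#include <stdatomic.h>

struct ring {
        atomic_long head;               /* kernel: data->head   */
        atomic_long nest;               /* kernel: data->nest   */
        atomic_long wakeup;             /* kernel: data->wakeup */
        long data_head;                 /* kernel: data->user_page->data_head */
};

static void wake_readers(struct ring *r)
{
        (void)r;                        /* stand-in for perf_output_wakeup() */
}

/* Enter: bump the nesting count and snapshot the wakeup counter. */
static long output_get(struct ring *r)
{
        atomic_fetch_add(&r->nest, 1);
        return atomic_load(&r->wakeup);
}

/* Leave: only the outer-most writer publishes the head and wakes readers. */
static void output_put(struct ring *r, long wakeup_snapshot)
{
        long head;

again:
        head = atomic_load(&r->head);

        /* We are nested inside another writer; the outer-most one publishes. */
        if (atomic_fetch_sub(&r->nest, 1) != 1)
                return;

        r->data_head = head;            /* publish the known-good head */

        /* An interrupting writer may have moved head after we read it. */
        if (head != atomic_load(&r->head)) {
                atomic_fetch_add(&r->nest, 1);
                goto again;
        }

        if (wakeup_snapshot != atomic_load(&r->wakeup))
                wake_readers(r);
}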
@@ -3036,13 +3030,13 @@ int perf_output_begin(struct perf_output_handle *handle,
        handle->sample  = sample;
 
        if (!data->nr_pages)
-               goto fail;
+               goto out;
 
-       have_lost = atomic_read(&data->lost);
+       have_lost = local_read(&data->lost);
        if (have_lost)
                size += sizeof(lost_event);
 
-       perf_output_lock(handle);
+       perf_output_get_handle(handle);
 
        do {
                /*
@@ -3052,24 +3046,24 @@ int perf_output_begin(struct perf_output_handle *handle,
                 */
                tail = ACCESS_ONCE(data->user_page->data_tail);
                smp_rmb();
-               offset = head = atomic_long_read(&data->head);
+               offset = head = local_read(&data->head);
                head += size;
                if (unlikely(!perf_output_space(data, tail, offset, head)))
                        goto fail;
-       } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
+       } while (local_cmpxchg(&data->head, offset, head) != offset);
 
        handle->offset  = offset;
        handle->head    = head;
 
        if (head - tail > data->watermark)
-               atomic_set(&data->wakeup, 1);
+               local_inc(&data->wakeup);
 
        if (have_lost) {
                lost_event.header.type = PERF_RECORD_LOST;
                lost_event.header.misc = 0;
                lost_event.header.size = sizeof(lost_event);
                lost_event.id          = event->id;
-               lost_event.lost        = atomic_xchg(&data->lost, 0);
+               lost_event.lost        = local_xchg(&data->lost, 0);
 
                perf_output_put(handle, lost_event);
        }
@@ -3077,8 +3071,8 @@ int perf_output_begin(struct perf_output_handle *handle,
        return 0;
 
 fail:
-       atomic_inc(&data->lost);
-       perf_output_unlock(handle);
+       local_inc(&data->lost);
+       perf_output_put_handle(handle);
 out:
        rcu_read_unlock();
 
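
The reservation step above can be modelled the same way: writers race on head with a compare-and-swap, while the tail is only ever advanced by the reading side, so a stale tail merely makes the fullness check conservative. A simplified sketch with C11 atomics, not part of the patch (struct rb and rb_reserve() are invented names, and the real perf_output_space() check is more involved):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct rb {
        atomic_ulong head;              /* kernel: data->head (local_t)       */
        atomic_ulong tail;              /* kernel: data->user_page->data_tail */
        size_t size;                    /* usable data size                   */
};

/* Reserve len bytes; on success *offset is where the record may be written. */
static bool rb_reserve(struct rb *rb, size_t len, unsigned long *offset)
{
        unsigned long head, old, tail;

        do {
                tail = atomic_load(&rb->tail);          /* kernel: ACCESS_ONCE + smp_rmb() */
                old = head = atomic_load(&rb->head);
                head += len;
                if (head - tail > rb->size)             /* simplified perf_output_space()   */
                        return false;                   /* caller would bump the lost count */
        } while (!atomic_compare_exchange_weak(&rb->head, &old, head));

        *offset = old;
        return true;
}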
@@ -3093,14 +3087,14 @@ void perf_output_end(struct perf_output_handle *handle)
        int wakeup_events = event->attr.wakeup_events;
 
        if (handle->sample && wakeup_events) {
-               int events = atomic_inc_return(&data->events);
+               int events = local_inc_return(&data->events);
                if (events >= wakeup_events) {
-                       atomic_sub(wakeup_events, &data->events);
-                       atomic_set(&data->wakeup, 1);
+                       local_sub(wakeup_events, &data->events);
+                       local_inc(&data->wakeup);
                }
        }
 
-       perf_output_unlock(handle);
+       perf_output_put_handle(handle);
        rcu_read_unlock();
 }
 
@@ -3436,22 +3430,13 @@ static void perf_event_task_output(struct perf_event *event,
 {
        struct perf_output_handle handle;
        struct task_struct *task = task_event->task;
-       unsigned long flags;
        int size, ret;
 
-       /*
-        * If this CPU attempts to acquire an rq lock held by a CPU spinning
-        * in perf_output_lock() from interrupt context, it's game over.
-        */
-       local_irq_save(flags);
-
        size  = task_event->event_id.header.size;
        ret = perf_output_begin(&handle, event, size, 0, 0);
 
-       if (ret) {
-               local_irq_restore(flags);
+       if (ret)
                return;
-       }
 
        task_event->event_id.pid = perf_event_pid(event, task);
        task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3462,7 +3447,6 @@ static void perf_event_task_output(struct perf_event *event,
        perf_output_put(&handle, task_event->event_id);
 
        perf_output_end(&handle);
-       local_irq_restore(flags);
 }
 
 static int perf_event_task_match(struct perf_event *event)
@@ -4502,8 +4486,9 @@ static int swevent_hlist_get(struct perf_event *event)
 #ifdef CONFIG_EVENT_TRACING
 
 void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
-                  int entry_size, struct pt_regs *regs)
+                  int entry_size, struct pt_regs *regs, void *event)
 {
+       const int type = PERF_TYPE_TRACEPOINT;
        struct perf_sample_data data;
        struct perf_raw_record raw = {
                .size = entry_size,
@@ -4513,9 +4498,13 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
        perf_sample_data_init(&data, addr);
        data.raw = &raw;
 
-       /* Trace events already protected against recursion */
-       do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
-                        &data, regs);
+       if (!event) {
+               do_perf_sw_event(type, event_id, count, 1, &data, regs);
+               return;
+       }
+
+       if (perf_swevent_match(event, type, event_id, &data, regs))
+               perf_swevent_add(event, count, 1, &data, regs);
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
 
@@ -4548,7 +4537,7 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
                        !capable(CAP_SYS_ADMIN))
                return ERR_PTR(-EPERM);
 
-       if (perf_trace_enable(event->attr.config))
+       if (perf_trace_enable(event->attr.config, event))
                return NULL;
 
        event->destroy = tp_perf_event_destroy;