xfs: remove nr_to_write writeback windup.
[safe/jmp/linux-2.6] / kernel / perf_event.c
index ff5d430..31d6afe 100644 (file)
@@ -283,14 +283,15 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 static void
 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 {
-       struct perf_event *group_leader = event->group_leader;
+       WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
+       event->attach_state |= PERF_ATTACH_CONTEXT;
 
        /*
-        * Depending on whether it is a standalone or sibling event,
-        * add it straight to the context's event list, or to the group
-        * leader's sibling list:
+        * If we're a stand alone event or group leader, we go to the context
+        * list, group events are kept attached to the group so that
+        * perf_group_detach can, at all times, locate all siblings.
         */
-       if (group_leader == event) {
+       if (event->group_leader == event) {
                struct list_head *list;
 
                if (is_software_event(event))
@@ -298,13 +299,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 
                list = ctx_group_list(event, ctx);
                list_add_tail(&event->group_entry, list);
-       } else {
-               if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
-                   !is_software_event(event))
-                       group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
-
-               list_add_tail(&event->group_entry, &group_leader->sibling_list);
-               group_leader->nr_siblings++;
        }
 
        list_add_rcu(&event->event_entry, &ctx->event_list);
@@ -313,6 +307,24 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
                ctx->nr_stat++;
 }
 
+static void perf_group_attach(struct perf_event *event)
+{
+       struct perf_event *group_leader = event->group_leader;
+
+       WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP);
+       event->attach_state |= PERF_ATTACH_GROUP;
+
+       if (group_leader == event)
+               return;
+
+       if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
+                       !is_software_event(event))
+               group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
+
+       list_add_tail(&event->group_entry, &group_leader->sibling_list);
+       group_leader->nr_siblings++;
+}
+
 /*
  * Remove a event from the lists for its context.
  * Must be called with ctx->mutex and ctx->lock held.
@@ -320,17 +332,22 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 static void
 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 {
-       if (list_empty(&event->group_entry))
+       /*
+        * We can have double detach due to exit/hot-unplug + close.
+        */
+       if (!(event->attach_state & PERF_ATTACH_CONTEXT))
                return;
+
+       event->attach_state &= ~PERF_ATTACH_CONTEXT;
+
        ctx->nr_events--;
        if (event->attr.inherit_stat)
                ctx->nr_stat--;
 
-       list_del_init(&event->group_entry);
        list_del_rcu(&event->event_entry);
 
-       if (event->group_leader != event)
-               event->group_leader->nr_siblings--;
+       if (event->group_leader == event)
+               list_del_init(&event->group_entry);
 
        update_group_times(event);
 
@@ -345,21 +362,39 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
                event->state = PERF_EVENT_STATE_OFF;
 }
 
-static void
-perf_destroy_group(struct perf_event *event, struct perf_event_context *ctx)
+static void perf_group_detach(struct perf_event *event)
 {
        struct perf_event *sibling, *tmp;
+       struct list_head *list = NULL;
+
+       /*
+        * We can have double detach due to exit/hot-unplug + close.
+        */
+       if (!(event->attach_state & PERF_ATTACH_GROUP))
+               return;
+
+       event->attach_state &= ~PERF_ATTACH_GROUP;
+
+       /*
+        * If this is a sibling, remove it from its group.
+        */
+       if (event->group_leader != event) {
+               list_del_init(&event->group_entry);
+               event->group_leader->nr_siblings--;
+               return;
+       }
+
+       if (!list_empty(&event->group_entry))
+               list = &event->group_entry;
 
        /*
         * If this was a group event with sibling events then
         * upgrade the siblings to singleton events by adding them
-        * to the context list directly:
+        * to whatever list we are on.
         */
        list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
-               struct list_head *list;
-
-               list = ctx_group_list(event, ctx);
-               list_move_tail(&sibling->group_entry, list);
+               if (list)
+                       list_move_tail(&sibling->group_entry, list);
                sibling->group_leader = sibling;
 
                /* Inherit group flags from the previous leader */
@@ -652,8 +687,11 @@ group_sched_in(struct perf_event *group_event,
        if (txn)
                pmu->start_txn(pmu);
 
-       if (event_sched_in(group_event, cpuctx, ctx))
+       if (event_sched_in(group_event, cpuctx, ctx)) {
+               if (txn)
+                       pmu->cancel_txn(pmu);
                return -EAGAIN;
+       }
 
        /*
         * Schedule in siblings as one group (if any):
@@ -675,9 +713,6 @@ group_sched_in(struct perf_event *group_event,
        }
 
 group_error:
-       if (txn)
-               pmu->cancel_txn(pmu);
-
        /*
         * Groups can be scheduled in as one unit only, so undo any
         * partial group before returning:
@@ -689,6 +724,9 @@ group_error:
        }
        event_sched_out(group_event, cpuctx, ctx);
 
+       if (txn)
+               pmu->cancel_txn(pmu);
+
        return -EAGAIN;
 }
 
@@ -727,6 +765,7 @@ static void add_event_to_ctx(struct perf_event *event,
                               struct perf_event_context *ctx)
 {
        list_add_event(event, ctx);
+       perf_group_attach(event);
        event->tstamp_enabled = ctx->time;
        event->tstamp_running = ctx->time;
        event->tstamp_stopped = ctx->time;
@@ -1841,6 +1880,7 @@ static void free_event_rcu(struct rcu_head *head)
 }
 
 static void perf_pending_sync(struct perf_event *event);
+static void perf_mmap_data_put(struct perf_mmap_data *data);
 
 static void free_event(struct perf_event *event)
 {
@@ -1856,9 +1896,9 @@ static void free_event(struct perf_event *event)
                        atomic_dec(&nr_task_events);
        }
 
-       if (event->output) {
-               fput(event->output->filp);
-               event->output = NULL;
+       if (event->data) {
+               perf_mmap_data_put(event->data);
+               event->data = NULL;
        }
 
        if (event->destroy)
@@ -1893,8 +1933,8 @@ int perf_event_release_kernel(struct perf_event *event)
         */
        mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
        raw_spin_lock_irq(&ctx->lock);
+       perf_group_detach(event);
        list_del_event(event, ctx);
-       perf_destroy_group(event, ctx);
        raw_spin_unlock_irq(&ctx->lock);
        mutex_unlock(&ctx->mutex);
 
@@ -2175,7 +2215,27 @@ unlock:
        return ret;
 }
 
-static int perf_event_set_output(struct perf_event *event, int output_fd);
+static const struct file_operations perf_fops;
+
+static struct perf_event *perf_fget_light(int fd, int *fput_needed)
+{
+       struct file *file;
+
+       file = fget_light(fd, fput_needed);
+       if (!file)
+               return ERR_PTR(-EBADF);
+
+       if (file->f_op != &perf_fops) {
+               fput_light(file, *fput_needed);
+               *fput_needed = 0;
+               return ERR_PTR(-EBADF);
+       }
+
+       return file->private_data;
+}
+
+static int perf_event_set_output(struct perf_event *event,
+                                struct perf_event *output_event);
 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
 
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -2202,7 +2262,23 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
                return perf_event_period(event, (u64 __user *)arg);
 
        case PERF_EVENT_IOC_SET_OUTPUT:
-               return perf_event_set_output(event, arg);
+       {
+               struct perf_event *output_event = NULL;
+               int fput_needed = 0;
+               int ret;
+
+               if (arg != -1) {
+                       output_event = perf_fget_light(arg, &fput_needed);
+                       if (IS_ERR(output_event))
+                               return PTR_ERR(output_event);
+               }
+
+               ret = perf_event_set_output(event, output_event);
+               if (output_event)
+                       fput_light(output_event->filp, fput_needed);
+
+               return ret;
+       }
 
        case PERF_EVENT_IOC_SET_FILTER:
                return perf_event_set_filter(event, (void __user *)arg);
@@ -2297,11 +2373,6 @@ unlock:
        rcu_read_unlock();
 }
 
-static unsigned long perf_data_size(struct perf_mmap_data *data)
-{
-       return data->nr_pages << (PAGE_SHIFT + data->data_order);
-}
-
 #ifndef CONFIG_PERF_USE_VMALLOC
 
 /*
@@ -2340,8 +2411,6 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
        unsigned long size;
        int i;
 
-       WARN_ON(atomic_read(&event->mmap_count));
-
        size = sizeof(struct perf_mmap_data);
        size += nr_pages * sizeof(void *);
 
@@ -2359,7 +2428,6 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
                        goto fail_data_pages;
        }
 
-       data->data_order = 0;
        data->nr_pages = nr_pages;
 
        return data;
@@ -2395,6 +2463,11 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
        kfree(data);
 }
 
+static inline int page_order(struct perf_mmap_data *data)
+{
+       return 0;
+}
+
 #else
 
 /*
@@ -2403,10 +2476,15 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
  * Required for architectures that have d-cache aliasing issues.
  */
 
+static inline int page_order(struct perf_mmap_data *data)
+{
+       return data->page_order;
+}
+
 static struct page *
 perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
 {
-       if (pgoff > (1UL << data->data_order))
+       if (pgoff > (1UL << page_order(data)))
                return NULL;
 
        return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
@@ -2426,7 +2504,7 @@ static void perf_mmap_data_free_work(struct work_struct *work)
        int i, nr;
 
        data = container_of(work, struct perf_mmap_data, work);
-       nr = 1 << data->data_order;
+       nr = 1 << page_order(data);
 
        base = data->user_page;
        for (i = 0; i < nr + 1; i++)
@@ -2448,8 +2526,6 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
        unsigned long size;
        void *all_buf;
 
-       WARN_ON(atomic_read(&event->mmap_count));
-
        size = sizeof(struct perf_mmap_data);
        size += sizeof(void *);
 
@@ -2465,7 +2541,7 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
 
        data->user_page = all_buf;
        data->data_pages[0] = all_buf + PAGE_SIZE;
-       data->data_order = ilog2(nr_pages);
+       data->page_order = ilog2(nr_pages);
        data->nr_pages = 1;
 
        return data;
@@ -2479,6 +2555,11 @@ fail:
 
 #endif
 
+static unsigned long perf_data_size(struct perf_mmap_data *data)
+{
+       return data->nr_pages << (PAGE_SHIFT + page_order(data));
+}
+
 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
        struct perf_event *event = vma->vm_file->private_data;
@@ -2519,8 +2600,6 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
 {
        long max_size = perf_data_size(data);
 
-       atomic_set(&data->lock, -1);
-
        if (event->attr.watermark) {
                data->watermark = min_t(long, max_size,
                                        event->attr.wakeup_watermark);
@@ -2529,7 +2608,7 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
        if (!data->watermark)
                data->watermark = max_size / 2;
 
-
+       atomic_set(&data->refcount, 1);
        rcu_assign_pointer(event->data, data);
 }
 
@@ -2541,13 +2620,26 @@ static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
        perf_mmap_data_free(data);
 }
 
-static void perf_mmap_data_release(struct perf_event *event)
+static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event)
 {
-       struct perf_mmap_data *data = event->data;
+       struct perf_mmap_data *data;
+
+       rcu_read_lock();
+       data = rcu_dereference(event->data);
+       if (data) {
+               if (!atomic_inc_not_zero(&data->refcount))
+                       data = NULL;
+       }
+       rcu_read_unlock();
 
-       WARN_ON(atomic_read(&event->mmap_count));
+       return data;
+}
+
+static void perf_mmap_data_put(struct perf_mmap_data *data)
+{
+       if (!atomic_dec_and_test(&data->refcount))
+               return;
 
-       rcu_assign_pointer(event->data, NULL);
        call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
 }
 
@@ -2562,15 +2654,18 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 {
        struct perf_event *event = vma->vm_file->private_data;
 
-       WARN_ON_ONCE(event->ctx->parent_ctx);
        if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
                unsigned long size = perf_data_size(event->data);
-               struct user_struct *user = current_user();
+               struct user_struct *user = event->mmap_user;
+               struct perf_mmap_data *data = event->data;
 
                atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
-               vma->vm_mm->locked_vm -= event->data->nr_locked;
-               perf_mmap_data_release(event);
+               vma->vm_mm->locked_vm -= event->mmap_locked;
+               rcu_assign_pointer(event->data, NULL);
                mutex_unlock(&event->mmap_mutex);
+
+               perf_mmap_data_put(data);
+               free_uid(user);
        }
 }
 
@@ -2622,13 +2717,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 
        WARN_ON_ONCE(event->ctx->parent_ctx);
        mutex_lock(&event->mmap_mutex);
-       if (event->output) {
-               ret = -EINVAL;
-               goto unlock;
-       }
-
-       if (atomic_inc_not_zero(&event->mmap_count)) {
-               if (nr_pages != event->data->nr_pages)
+       if (event->data) {
+               if (event->data->nr_pages == nr_pages)
+                       atomic_inc(&event->data->refcount);
+               else
                        ret = -EINVAL;
                goto unlock;
        }
@@ -2660,21 +2752,23 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
        WARN_ON(event->data);
 
        data = perf_mmap_data_alloc(event, nr_pages);
-       ret = -ENOMEM;
-       if (!data)
+       if (!data) {
+               ret = -ENOMEM;
                goto unlock;
+       }
 
-       ret = 0;
        perf_mmap_data_init(event, data);
-
-       atomic_set(&event->mmap_count, 1);
-       atomic_long_add(user_extra, &user->locked_vm);
-       vma->vm_mm->locked_vm += extra;
-       event->data->nr_locked = extra;
        if (vma->vm_flags & VM_WRITE)
                event->data->writable = 1;
 
+       atomic_long_add(user_extra, &user->locked_vm);
+       event->mmap_locked = extra;
+       event->mmap_user = get_current_user();
+       vma->vm_mm->locked_vm += event->mmap_locked;
+
 unlock:
+       if (!ret)
+               atomic_inc(&event->mmap_count);
        mutex_unlock(&event->mmap_mutex);
 
        vma->vm_flags |= VM_RESERVED;
@@ -2906,127 +3000,87 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
 }
 
 /*
- * Curious locking construct.
- *
  * We need to ensure a later event_id doesn't publish a head when a former
- * event_id isn't done writing. However since we need to deal with NMIs we
+ * event isn't done writing. However since we need to deal with NMIs we
  * cannot fully serialize things.
  *
- * What we do is serialize between CPUs so we only have to deal with NMI
- * nesting on a single CPU.
- *
  * We only publish the head (and generate a wakeup) when the outer-most
- * event_id completes.
+ * event completes.
  */
-static void perf_output_lock(struct perf_output_handle *handle)
+static void perf_output_get_handle(struct perf_output_handle *handle)
 {
        struct perf_mmap_data *data = handle->data;
-       int cur, cpu = get_cpu();
-
-       handle->locked = 0;
-
-       for (;;) {
-               cur = atomic_cmpxchg(&data->lock, -1, cpu);
-               if (cur == -1) {
-                       handle->locked = 1;
-                       break;
-               }
-               if (cur == cpu)
-                       break;
 
-               cpu_relax();
-       }
+       preempt_disable();
+       local_inc(&data->nest);
+       handle->wakeup = local_read(&data->wakeup);
 }
 
-static void perf_output_unlock(struct perf_output_handle *handle)
+static void perf_output_put_handle(struct perf_output_handle *handle)
 {
        struct perf_mmap_data *data = handle->data;
        unsigned long head;
-       int cpu;
-
-       data->done_head = data->head;
-
-       if (!handle->locked)
-               goto out;
 
 again:
-       /*
-        * The xchg implies a full barrier that ensures all writes are done
-        * before we publish the new head, matched by a rmb() in userspace when
-        * reading this position.
-        */
-       while ((head = atomic_long_xchg(&data->done_head, 0)))
-               data->user_page->data_head = head;
+       head = local_read(&data->head);
 
        /*
-        * NMI can happen here, which means we can miss a done_head update.
+        * IRQ/NMI can happen here, which means we can miss a head update.
         */
 
-       cpu = atomic_xchg(&data->lock, -1);
-       WARN_ON_ONCE(cpu != smp_processor_id());
+       if (!local_dec_and_test(&data->nest))
+               goto out;
 
        /*
-        * Therefore we have to validate we did not indeed do so.
+        * Publish the known good head. Rely on the full barrier implied
+        * by atomic_dec_and_test() order the data->head read and this
+        * write.
         */
-       if (unlikely(atomic_long_read(&data->done_head))) {
-               /*
-                * Since we had it locked, we can lock it again.
-                */
-               while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
-                       cpu_relax();
+       data->user_page->data_head = head;
 
+       /*
+        * Now check if we missed an update, rely on the (compiler)
+        * barrier in atomic_dec_and_test() to re-read data->head.
+        */
+       if (unlikely(head != local_read(&data->head))) {
+               local_inc(&data->nest);
                goto again;
        }
 
-       if (atomic_xchg(&data->wakeup, 0))
+       if (handle->wakeup != local_read(&data->wakeup))
                perf_output_wakeup(handle);
-out:
-       put_cpu();
+
+ out:
+       preempt_enable();
 }
 
-void perf_output_copy(struct perf_output_handle *handle,
+__always_inline void perf_output_copy(struct perf_output_handle *handle,
                      const void *buf, unsigned int len)
 {
-       unsigned int pages_mask;
-       unsigned long offset;
-       unsigned int size;
-       void **pages;
-
-       offset          = handle->offset;
-       pages_mask      = handle->data->nr_pages - 1;
-       pages           = handle->data->data_pages;
-
        do {
-               unsigned long page_offset;
-               unsigned long page_size;
-               int nr;
+               unsigned long size = min_t(unsigned long, handle->size, len);
 
-               nr          = (offset >> PAGE_SHIFT) & pages_mask;
-               page_size   = 1UL << (handle->data->data_order + PAGE_SHIFT);
-               page_offset = offset & (page_size - 1);
-               size        = min_t(unsigned int, page_size - page_offset, len);
+               memcpy(handle->addr, buf, size);
 
-               memcpy(pages[nr] + page_offset, buf, size);
+               len -= size;
+               handle->addr += size;
+               buf += size;
+               handle->size -= size;
+               if (!handle->size) {
+                       struct perf_mmap_data *data = handle->data;
 
-               len         -= size;
-               buf         += size;
-               offset      += size;
+                       handle->page++;
+                       handle->page &= data->nr_pages - 1;
+                       handle->addr = data->data_pages[handle->page];
+                       handle->size = PAGE_SIZE << page_order(data);
+               }
        } while (len);
-
-       handle->offset = offset;
-
-       /*
-        * Check we didn't copy past our reservation window, taking the
-        * possible unsigned int wrap into account.
-        */
-       WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
 }
 
 int perf_output_begin(struct perf_output_handle *handle,
                      struct perf_event *event, unsigned int size,
                      int nmi, int sample)
 {
-       struct perf_event *output_event;
        struct perf_mmap_data *data;
        unsigned long tail, offset, head;
        int have_lost;
@@ -3043,10 +3097,6 @@ int perf_output_begin(struct perf_output_handle *handle,
        if (event->parent)
                event = event->parent;
 
-       output_event = rcu_dereference(event->output);
-       if (output_event)
-               event = output_event;
-
        data = rcu_dereference(event->data);
        if (!data)
                goto out;
@@ -3059,11 +3109,11 @@ int perf_output_begin(struct perf_output_handle *handle,
        if (!data->nr_pages)
                goto out;
 
-       have_lost = atomic_read(&data->lost);
+       have_lost = local_read(&data->lost);
        if (have_lost)
                size += sizeof(lost_event);
 
-       perf_output_lock(handle);
+       perf_output_get_handle(handle);
 
        do {
                /*
@@ -3073,24 +3123,28 @@ int perf_output_begin(struct perf_output_handle *handle,
                 */
                tail = ACCESS_ONCE(data->user_page->data_tail);
                smp_rmb();
-               offset = head = atomic_long_read(&data->head);
+               offset = head = local_read(&data->head);
                head += size;
                if (unlikely(!perf_output_space(data, tail, offset, head)))
                        goto fail;
-       } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
+       } while (local_cmpxchg(&data->head, offset, head) != offset);
 
-       handle->offset  = offset;
-       handle->head    = head;
+       if (head - local_read(&data->wakeup) > data->watermark)
+               local_add(data->watermark, &data->wakeup);
 
-       if (head - tail > data->watermark)
-               atomic_set(&data->wakeup, 1);
+       handle->page = offset >> (PAGE_SHIFT + page_order(data));
+       handle->page &= data->nr_pages - 1;
+       handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1);
+       handle->addr = data->data_pages[handle->page];
+       handle->addr += handle->size;
+       handle->size = (PAGE_SIZE << page_order(data)) - handle->size;
 
        if (have_lost) {
                lost_event.header.type = PERF_RECORD_LOST;
                lost_event.header.misc = 0;
                lost_event.header.size = sizeof(lost_event);
                lost_event.id          = event->id;
-               lost_event.lost        = atomic_xchg(&data->lost, 0);
+               lost_event.lost        = local_xchg(&data->lost, 0);
 
                perf_output_put(handle, lost_event);
        }
@@ -3098,8 +3152,8 @@ int perf_output_begin(struct perf_output_handle *handle,
        return 0;
 
 fail:
-       atomic_inc(&data->lost);
-       perf_output_unlock(handle);
+       local_inc(&data->lost);
+       perf_output_put_handle(handle);
 out:
        rcu_read_unlock();
 
@@ -3114,14 +3168,14 @@ void perf_output_end(struct perf_output_handle *handle)
        int wakeup_events = event->attr.wakeup_events;
 
        if (handle->sample && wakeup_events) {
-               int events = atomic_inc_return(&data->events);
+               int events = local_inc_return(&data->events);
                if (events >= wakeup_events) {
-                       atomic_sub(wakeup_events, &data->events);
-                       atomic_set(&data->wakeup, 1);
+                       local_sub(wakeup_events, &data->events);
+                       local_inc(&data->wakeup);
                }
        }
 
-       perf_output_unlock(handle);
+       perf_output_put_handle(handle);
        rcu_read_unlock();
 }
 
@@ -3457,22 +3511,13 @@ static void perf_event_task_output(struct perf_event *event,
 {
        struct perf_output_handle handle;
        struct task_struct *task = task_event->task;
-       unsigned long flags;
        int size, ret;
 
-       /*
-        * If this CPU attempts to acquire an rq lock held by a CPU spinning
-        * in perf_output_lock() from interrupt context, it's game over.
-        */
-       local_irq_save(flags);
-
        size  = task_event->event_id.header.size;
        ret = perf_output_begin(&handle, event, size, 0, 0);
 
-       if (ret) {
-               local_irq_restore(flags);
+       if (ret)
                return;
-       }
 
        task_event->event_id.pid = perf_event_pid(event, task);
        task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3483,7 +3528,6 @@ static void perf_event_task_output(struct perf_event *event,
        perf_output_put(&handle, task_event->event_id);
 
        perf_output_end(&handle);
-       local_irq_restore(flags);
 }
 
 static int perf_event_task_match(struct perf_event *event)
@@ -4011,13 +4055,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
        }
 }
 
-static void perf_swevent_unthrottle(struct perf_event *event)
-{
-       /*
-        * Nothing to do, we already reset hwc->interrupts.
-        */
-}
-
 static void perf_swevent_add(struct perf_event *event, u64 nr,
                               int nmi, struct perf_sample_data *data,
                               struct pt_regs *regs)
@@ -4041,9 +4078,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
        perf_swevent_overflow(event, 0, nmi, data, regs);
 }
 
-static int perf_tp_event_match(struct perf_event *event,
-                               struct perf_sample_data *data);
-
 static int perf_exclude_event(struct perf_event *event,
                              struct pt_regs *regs)
 {
@@ -4073,10 +4107,6 @@ static int perf_swevent_match(struct perf_event *event,
        if (perf_exclude_event(event, regs))
                return 0;
 
-       if (event->attr.type == PERF_TYPE_TRACEPOINT &&
-           !perf_tp_event_match(event, data))
-               return 0;
-
        return 1;
 }
 
@@ -4087,19 +4117,46 @@ static inline u64 swevent_hash(u64 type, u32 event_id)
        return hash_64(val, SWEVENT_HLIST_BITS);
 }
 
-static struct hlist_head *
-find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+static inline struct hlist_head *
+__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
 {
-       u64 hash;
-       struct swevent_hlist *hlist;
+       u64 hash = swevent_hash(type, event_id);
 
-       hash = swevent_hash(type, event_id);
+       return &hlist->heads[hash];
+}
+
+/* For the read side: events when they trigger */
+static inline struct hlist_head *
+find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+{
+       struct swevent_hlist *hlist;
 
        hlist = rcu_dereference(ctx->swevent_hlist);
        if (!hlist)
                return NULL;
 
-       return &hlist->heads[hash];
+       return __find_swevent_head(hlist, type, event_id);
+}
+
+/* For the event head insertion and removal in the hlist */
+static inline struct hlist_head *
+find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
+{
+       struct swevent_hlist *hlist;
+       u32 event_id = event->attr.config;
+       u64 type = event->attr.type;
+
+       /*
+        * Event scheduling is always serialized against hlist allocation
+        * and release. Which makes the protected version suitable here.
+        * The context lock guarantees that.
+        */
+       hlist = rcu_dereference_protected(ctx->swevent_hlist,
+                                         lockdep_is_held(&event->ctx->lock));
+       if (!hlist)
+               return NULL;
+
+       return __find_swevent_head(hlist, type, event_id);
 }
 
 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
@@ -4116,7 +4173,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
 
        rcu_read_lock();
 
-       head = find_swevent_head(cpuctx, type, event_id);
+       head = find_swevent_head_rcu(cpuctx, type, event_id);
 
        if (!head)
                goto end;
@@ -4131,7 +4188,7 @@ end:
 
 int perf_swevent_get_recursion_context(void)
 {
-       struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        int rctx;
 
        if (in_nmi())
@@ -4143,10 +4200,8 @@ int perf_swevent_get_recursion_context(void)
        else
                rctx = 0;
 
-       if (cpuctx->recursion[rctx]) {
-               put_cpu_var(perf_cpu_context);
+       if (cpuctx->recursion[rctx])
                return -1;
-       }
 
        cpuctx->recursion[rctx]++;
        barrier();
@@ -4160,7 +4215,6 @@ void perf_swevent_put_recursion_context(int rctx)
        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        barrier();
        cpuctx->recursion[rctx]--;
-       put_cpu_var(perf_cpu_context);
 }
 EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
 
@@ -4171,6 +4225,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
        struct perf_sample_data data;
        int rctx;
 
+       preempt_disable_notrace();
        rctx = perf_swevent_get_recursion_context();
        if (rctx < 0)
                return;
@@ -4180,6 +4235,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
        do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
 
        perf_swevent_put_recursion_context(rctx);
+       preempt_enable_notrace();
 }
 
 static void perf_swevent_read(struct perf_event *event)
@@ -4199,7 +4255,7 @@ static int perf_swevent_enable(struct perf_event *event)
                perf_swevent_set_period(event);
        }
 
-       head = find_swevent_head(cpuctx, event->attr.type, event->attr.config);
+       head = find_swevent_head(cpuctx, event);
        if (WARN_ON_ONCE(!head))
                return -EINVAL;
 
@@ -4213,11 +4269,22 @@ static void perf_swevent_disable(struct perf_event *event)
        hlist_del_rcu(&event->hlist_entry);
 }
 
+static void perf_swevent_void(struct perf_event *event)
+{
+}
+
+static int perf_swevent_int(struct perf_event *event)
+{
+       return 0;
+}
+
 static const struct pmu perf_ops_generic = {
        .enable         = perf_swevent_enable,
        .disable        = perf_swevent_disable,
+       .start          = perf_swevent_int,
+       .stop           = perf_swevent_void,
        .read           = perf_swevent_read,
-       .unthrottle     = perf_swevent_unthrottle,
+       .unthrottle     = perf_swevent_void, /* hwc->interrupts already reset */
 };
 
 /*
@@ -4387,6 +4454,14 @@ static const struct pmu perf_ops_task_clock = {
        .read           = task_clock_perf_event_read,
 };
 
+/* Deref the hlist from the update side */
+static inline struct swevent_hlist *
+swevent_hlist_deref(struct perf_cpu_context *cpuctx)
+{
+       return rcu_dereference_protected(cpuctx->swevent_hlist,
+                                        lockdep_is_held(&cpuctx->hlist_mutex));
+}
+
 static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
 {
        struct swevent_hlist *hlist;
@@ -4397,12 +4472,11 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
 
 static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
 {
-       struct swevent_hlist *hlist;
+       struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx);
 
-       if (!cpuctx->swevent_hlist)
+       if (!hlist)
                return;
 
-       hlist = cpuctx->swevent_hlist;
        rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
        call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
 }
@@ -4439,7 +4513,7 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
 
        mutex_lock(&cpuctx->hlist_mutex);
 
-       if (!cpuctx->swevent_hlist && cpu_online(cpu)) {
+       if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) {
                struct swevent_hlist *hlist;
 
                hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@ -4488,11 +4562,48 @@ static int swevent_hlist_get(struct perf_event *event)
 
 #ifdef CONFIG_EVENT_TRACING
 
-void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
-                  int entry_size, struct pt_regs *regs, void *event)
+static const struct pmu perf_ops_tracepoint = {
+       .enable         = perf_trace_enable,
+       .disable        = perf_trace_disable,
+       .start          = perf_swevent_int,
+       .stop           = perf_swevent_void,
+       .read           = perf_swevent_read,
+       .unthrottle     = perf_swevent_void,
+};
+
+static int perf_tp_filter_match(struct perf_event *event,
+                               struct perf_sample_data *data)
+{
+       void *record = data->raw->data;
+
+       if (likely(!event->filter) || filter_match_preds(event->filter, record))
+               return 1;
+       return 0;
+}
+
+static int perf_tp_event_match(struct perf_event *event,
+                               struct perf_sample_data *data,
+                               struct pt_regs *regs)
+{
+       /*
+        * All tracepoints are from kernel-space.
+        */
+       if (event->attr.exclude_kernel)
+               return 0;
+
+       if (!perf_tp_filter_match(event, data))
+               return 0;
+
+       return 1;
+}
+
+void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
+                  struct pt_regs *regs, struct hlist_head *head)
 {
-       const int type = PERF_TYPE_TRACEPOINT;
        struct perf_sample_data data;
+       struct perf_event *event;
+       struct hlist_node *node;
+
        struct perf_raw_record raw = {
                .size = entry_size,
                .data = record,
@@ -4501,30 +4612,18 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
        perf_sample_data_init(&data, addr);
        data.raw = &raw;
 
-       if (!event) {
-               do_perf_sw_event(type, event_id, count, 1, &data, regs);
-               return;
+       rcu_read_lock();
+       hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
+               if (perf_tp_event_match(event, &data, regs))
+                       perf_swevent_add(event, count, 1, &data, regs);
        }
-
-       if (perf_swevent_match(event, type, event_id, &data, regs))
-               perf_swevent_add(event, count, 1, &data, regs);
+       rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
 
-static int perf_tp_event_match(struct perf_event *event,
-                               struct perf_sample_data *data)
-{
-       void *record = data->raw->data;
-
-       if (likely(!event->filter) || filter_match_preds(event->filter, record))
-               return 1;
-       return 0;
-}
-
 static void tp_perf_event_destroy(struct perf_event *event)
 {
-       perf_trace_disable(event->attr.config);
-       swevent_hlist_put(event);
+       perf_trace_destroy(event);
 }
 
 static const struct pmu *tp_perf_event_init(struct perf_event *event)
@@ -4540,17 +4639,13 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
                        !capable(CAP_SYS_ADMIN))
                return ERR_PTR(-EPERM);
 
-       if (perf_trace_enable(event->attr.config, event))
+       err = perf_trace_init(event);
+       if (err)
                return NULL;
 
        event->destroy = tp_perf_event_destroy;
-       err = swevent_hlist_get(event);
-       if (err) {
-               perf_trace_disable(event->attr.config);
-               return ERR_PTR(err);
-       }
 
-       return &perf_ops_generic;
+       return &perf_ops_tracepoint;
 }
 
 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4578,12 +4673,6 @@ static void perf_event_free_filter(struct perf_event *event)
 
 #else
 
-static int perf_tp_event_match(struct perf_event *event,
-                               struct perf_sample_data *data)
-{
-       return 1;
-}
-
 static const struct pmu *tp_perf_event_init(struct perf_event *event)
 {
        return NULL;
@@ -4912,54 +5001,53 @@ err_size:
        goto out;
 }
 
-static int perf_event_set_output(struct perf_event *event, int output_fd)
+static int
+perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 {
-       struct perf_event *output_event = NULL;
-       struct file *output_file = NULL;
-       struct perf_event *old_output;
-       int fput_needed = 0;
+       struct perf_mmap_data *data = NULL, *old_data = NULL;
        int ret = -EINVAL;
 
-       if (!output_fd)
+       if (!output_event)
                goto set;
 
-       output_file = fget_light(output_fd, &fput_needed);
-       if (!output_file)
-               return -EBADF;
-
-       if (output_file->f_op != &perf_fops)
+       /* don't allow circular references */
+       if (event == output_event)
                goto out;
 
-       output_event = output_file->private_data;
-
-       /* Don't chain output fds */
-       if (output_event->output)
+       /*
+        * Don't allow cross-cpu buffers
+        */
+       if (output_event->cpu != event->cpu)
                goto out;
 
-       /* Don't set an output fd when we already have an output channel */
-       if (event->data)
+       /*
+        * If its not a per-cpu buffer, it must be the same task.
+        */
+       if (output_event->cpu == -1 && output_event->ctx != event->ctx)
                goto out;
 
-       atomic_long_inc(&output_file->f_count);
-
 set:
        mutex_lock(&event->mmap_mutex);
-       old_output = event->output;
-       rcu_assign_pointer(event->output, output_event);
-       mutex_unlock(&event->mmap_mutex);
+       /* Can't redirect output if we've got an active mmap() */
+       if (atomic_read(&event->mmap_count))
+               goto unlock;
 
-       if (old_output) {
-               /*
-                * we need to make sure no existing perf_output_*()
-                * is still referencing this event.
-                */
-               synchronize_rcu();
-               fput(old_output->filp);
+       if (output_event) {
+               /* get the buffer we want to redirect to */
+               data = perf_mmap_data_get(output_event);
+               if (!data)
+                       goto unlock;
        }
 
+       old_data = event->data;
+       rcu_assign_pointer(event->data, data);
        ret = 0;
+unlock:
+       mutex_unlock(&event->mmap_mutex);
+
+       if (old_data)
+               perf_mmap_data_put(old_data);
 out:
-       fput_light(output_file, fput_needed);
        return ret;
 }
 
@@ -4975,13 +5063,13 @@ SYSCALL_DEFINE5(perf_event_open,
                struct perf_event_attr __user *, attr_uptr,
                pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
 {
-       struct perf_event *event, *group_leader;
+       struct perf_event *event, *group_leader = NULL, *output_event = NULL;
        struct perf_event_attr attr;
        struct perf_event_context *ctx;
        struct file *event_file = NULL;
        struct file *group_file = NULL;
+       int event_fd;
        int fput_needed = 0;
-       int fput_needed2 = 0;
        int err;
 
        /* for future expandability... */
@@ -5002,26 +5090,38 @@ SYSCALL_DEFINE5(perf_event_open,
                        return -EINVAL;
        }
 
+       event_fd = get_unused_fd_flags(O_RDWR);
+       if (event_fd < 0)
+               return event_fd;
+
        /*
         * Get the target context (task or percpu):
         */
        ctx = find_get_context(pid, cpu);
-       if (IS_ERR(ctx))
-               return PTR_ERR(ctx);
+       if (IS_ERR(ctx)) {
+               err = PTR_ERR(ctx);
+               goto err_fd;
+       }
+
+       if (group_fd != -1) {
+               group_leader = perf_fget_light(group_fd, &fput_needed);
+               if (IS_ERR(group_leader)) {
+                       err = PTR_ERR(group_leader);
+                       goto err_put_context;
+               }
+               group_file = group_leader->filp;
+               if (flags & PERF_FLAG_FD_OUTPUT)
+                       output_event = group_leader;
+               if (flags & PERF_FLAG_FD_NO_GROUP)
+                       group_leader = NULL;
+       }
 
        /*
         * Look up the group leader (we will attach this event to it):
         */
-       group_leader = NULL;
-       if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
+       if (group_leader) {
                err = -EINVAL;
-               group_file = fget_light(group_fd, &fput_needed);
-               if (!group_file)
-                       goto err_put_context;
-               if (group_file->f_op != &perf_fops)
-                       goto err_put_context;
 
-               group_leader = group_file->private_data;
                /*
                 * Do not allow a recursive hierarchy (this new sibling
                 * becoming part of another group-sibling):
@@ -5043,22 +5143,21 @@ SYSCALL_DEFINE5(perf_event_open,
 
        event = perf_event_alloc(&attr, cpu, ctx, group_leader,
                                     NULL, NULL, GFP_KERNEL);
-       err = PTR_ERR(event);
-       if (IS_ERR(event))
+       if (IS_ERR(event)) {
+               err = PTR_ERR(event);
                goto err_put_context;
+       }
 
-       err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR);
-       if (err < 0)
-               goto err_free_put_context;
+       if (output_event) {
+               err = perf_event_set_output(event, output_event);
+               if (err)
+                       goto err_free_put_context;
+       }
 
-       event_file = fget_light(err, &fput_needed2);
-       if (!event_file)
+       event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
+       if (IS_ERR(event_file)) {
+               err = PTR_ERR(event_file);
                goto err_free_put_context;
-
-       if (flags & PERF_FLAG_FD_OUTPUT) {
-               err = perf_event_set_output(event, group_fd);
-               if (err)
-                       goto err_fput_free_put_context;
        }
 
        event->filp = event_file;
@@ -5074,19 +5173,23 @@ SYSCALL_DEFINE5(perf_event_open,
        list_add_tail(&event->owner_entry, &current->perf_event_list);
        mutex_unlock(&current->perf_event_mutex);
 
-err_fput_free_put_context:
-       fput_light(event_file, fput_needed2);
+       /*
+        * Drop the reference on the group_event after placing the
+        * new event on the sibling_list. This ensures destruction
+        * of the group leader will find the pointer to itself in
+        * perf_group_detach().
+        */
+       fput_light(group_file, fput_needed);
+       fd_install(event_fd, event_file);
+       return event_fd;
 
 err_free_put_context:
-       if (err < 0)
-               free_event(event);
-
+       free_event(event);
 err_put_context:
-       if (err < 0)
-               put_ctx(ctx);
-
        fput_light(group_file, fput_needed);
-
+       put_ctx(ctx);
+err_fd:
+       put_unused_fd(event_fd);
        return err;
 }
 
@@ -5397,6 +5500,7 @@ static void perf_free_event(struct perf_event *event,
 
        fput(parent->filp);
 
+       perf_group_detach(event);
        list_del_event(event, ctx);
        free_event(event);
 }