perf_counter: Provide functions for locking and pinning the context for a task
kernel/perf_counter.c
index ec9c400..da8dfef 100644
@@ -103,13 +103,86 @@ static void get_ctx(struct perf_counter_context *ctx)
        atomic_inc(&ctx->refcount);
 }
 
+static void free_ctx(struct rcu_head *head)
+{
+       struct perf_counter_context *ctx;
+
+       ctx = container_of(head, struct perf_counter_context, rcu_head);
+       kfree(ctx);
+}
+
 static void put_ctx(struct perf_counter_context *ctx)
 {
        if (atomic_dec_and_test(&ctx->refcount)) {
                if (ctx->parent_ctx)
                        put_ctx(ctx->parent_ctx);
-               kfree(ctx);
+               if (ctx->task)
+                       put_task_struct(ctx->task);
+               call_rcu(&ctx->rcu_head, free_ctx);
+       }
+}
+
+/*
+ * Get the perf_counter_context for a task and lock it.
+ * This has to cope with the fact that until it is locked,
+ * the context could get moved to another task.
+ */
+static struct perf_counter_context *perf_lock_task_context(
+                               struct task_struct *task, unsigned long *flags)
+{
+       struct perf_counter_context *ctx;
+
+       rcu_read_lock();
+ retry:
+       ctx = rcu_dereference(task->perf_counter_ctxp);
+       if (ctx) {
+               /*
+                * If this context is a clone of another, it might
+                * get swapped for another underneath us by
+                * perf_counter_task_sched_out, though the
+                * rcu_read_lock() protects us from any context
+                * getting freed.  Lock the context and check if it
+                * got swapped before we could get the lock, and retry
+                * if so.  If we locked the right context, then it
+                * can't get swapped on us any more.
+                */
+               spin_lock_irqsave(&ctx->lock, *flags);
+               if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
+                       spin_unlock_irqrestore(&ctx->lock, *flags);
+                       goto retry;
+               }
+       }
+       rcu_read_unlock();
+       return ctx;
+}
+
+/*
+ * Get the context for a task and increment its pin_count so it
+ * can't get swapped to another task.  This also increments its
+ * reference count so that the context can't get freed.
+ */
+static struct perf_counter_context *perf_pin_task_context(struct task_struct *task)
+{
+       struct perf_counter_context *ctx;
+       unsigned long flags;
+
+       ctx = perf_lock_task_context(task, &flags);
+       if (ctx) {
+               ++ctx->pin_count;
+               get_ctx(ctx);
+               spin_unlock_irqrestore(&ctx->lock, flags);
        }
+       return ctx;
+}
+
+static void perf_unpin_context(struct perf_counter_context *ctx)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&ctx->lock, flags);
+       --ctx->pin_count;
+       spin_unlock_irqrestore(&ctx->lock, flags);
+       put_ctx(ctx);
 }
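
For reference, a minimal usage sketch of the new pin API (not part of the patch; the function name example_inspect_task_counters is made up): pin the task's context so it can neither be freed nor swapped to another task, walk it under its mutex, then unpin.

static void example_inspect_task_counters(struct task_struct *task)
{
        struct perf_counter_context *ctx;
        struct perf_counter *counter;

        ctx = perf_pin_task_context(task);      /* takes a ref, bumps pin_count */
        if (!ctx)
                return;

        mutex_lock(&ctx->mutex);
        list_for_each_entry(counter, &ctx->counter_list, list_entry) {
                /* counter can be examined here; context_equiv() refuses
                 * to swap a context whose pin_count is non-zero */
        }
        mutex_unlock(&ctx->mutex);

        perf_unpin_context(ctx);                /* drops pin_count and the ref */
}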
 
 /*
@@ -212,22 +285,6 @@ group_sched_out(struct perf_counter *group_counter,
 }
 
 /*
- * Mark this context as not being a clone of another.
- * Called when counters are added to or removed from this context.
- * We also increment our generation number so that anything that
- * was cloned from this context before this will not match anything
- * cloned from this context after this.
- */
-static void unclone_ctx(struct perf_counter_context *ctx)
-{
-       ++ctx->generation;
-       if (!ctx->parent_ctx)
-               return;
-       put_ctx(ctx->parent_ctx);
-       ctx->parent_ctx = NULL;
-}
-
-/*
  * Cross CPU call to remove a performance counter
  *
  * We disable the counter on the hardware level first. After that we
@@ -238,7 +295,6 @@ static void __perf_counter_remove_from_context(void *info)
        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        struct perf_counter *counter = info;
        struct perf_counter_context *ctx = counter->ctx;
-       unsigned long flags;
 
        /*
         * If this is a task context, we need to check whether it is
@@ -248,7 +304,7 @@ static void __perf_counter_remove_from_context(void *info)
        if (ctx->task && cpuctx->task_ctx != ctx)
                return;
 
-       spin_lock_irqsave(&ctx->lock, flags);
+       spin_lock(&ctx->lock);
        /*
         * Protect the list operation against NMI by disabling the
         * counters on a global level.
@@ -270,7 +326,7 @@ static void __perf_counter_remove_from_context(void *info)
        }
 
        perf_enable();
-       spin_unlock_irqrestore(&ctx->lock, flags);
+       spin_unlock(&ctx->lock);
 }
 
 
@@ -281,13 +337,19 @@ static void __perf_counter_remove_from_context(void *info)
  *
  * CPU counters are removed with a smp call. For task counters we only
  * call when the task is on a CPU.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid.  This is OK when called from perf_release since
+ * that only calls us on the top-level context, which can't be a clone.
+ * When called from perf_counter_exit_task, it's OK because the
+ * context has been detached from its task.
  */
 static void perf_counter_remove_from_context(struct perf_counter *counter)
 {
        struct perf_counter_context *ctx = counter->ctx;
        struct task_struct *task = ctx->task;
 
-       unclone_ctx(ctx);
        if (!task) {
                /*
                 * Per cpu counters are removed via an smp call and
@@ -380,7 +442,6 @@ static void __perf_counter_disable(void *info)
        struct perf_counter *counter = info;
        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        struct perf_counter_context *ctx = counter->ctx;
-       unsigned long flags;
 
        /*
         * If this is a per-task counter, need to check whether this
@@ -389,7 +450,7 @@ static void __perf_counter_disable(void *info)
        if (ctx->task && cpuctx->task_ctx != ctx)
                return;
 
-       spin_lock_irqsave(&ctx->lock, flags);
+       spin_lock(&ctx->lock);
 
        /*
         * If the counter is on, turn it off.
@@ -405,11 +466,21 @@ static void __perf_counter_disable(void *info)
                counter->state = PERF_COUNTER_STATE_OFF;
        }
 
-       spin_unlock_irqrestore(&ctx->lock, flags);
+       spin_unlock(&ctx->lock);
 }
 
 /*
  * Disable a counter.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid.  This condition is satisfied when called through
+ * perf_counter_for_each_child or perf_counter_for_each because they
+ * hold the top-level counter's child_mutex, so any descendant that
+ * goes to exit will block in sync_child_counter.
+ * When called from perf_pending_counter it's OK because counter->ctx
+ * is the current context on this CPU and preemption is disabled,
+ * hence we can't get into perf_counter_task_sched_out for this context.
  */
 static void perf_counter_disable(struct perf_counter *counter)
 {
@@ -602,7 +673,6 @@ static void __perf_install_in_context(void *info)
        struct perf_counter_context *ctx = counter->ctx;
        struct perf_counter *leader = counter->group_leader;
        int cpu = smp_processor_id();
-       unsigned long flags;
        int err;
 
        /*
@@ -618,7 +688,7 @@ static void __perf_install_in_context(void *info)
                cpuctx->task_ctx = ctx;
        }
 
-       spin_lock_irqsave(&ctx->lock, flags);
+       spin_lock(&ctx->lock);
        ctx->is_active = 1;
        update_context_time(ctx);
 
@@ -668,7 +738,7 @@ static void __perf_install_in_context(void *info)
  unlock:
        perf_enable();
 
-       spin_unlock_irqrestore(&ctx->lock, flags);
+       spin_unlock(&ctx->lock);
 }
 
 /*
@@ -732,7 +802,6 @@ static void __perf_counter_enable(void *info)
        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        struct perf_counter_context *ctx = counter->ctx;
        struct perf_counter *leader = counter->group_leader;
-       unsigned long flags;
        int err;
 
        /*
@@ -745,7 +814,7 @@ static void __perf_counter_enable(void *info)
                cpuctx->task_ctx = ctx;
        }
 
-       spin_lock_irqsave(&ctx->lock, flags);
+       spin_lock(&ctx->lock);
        ctx->is_active = 1;
        update_context_time(ctx);
 
@@ -789,11 +858,17 @@ static void __perf_counter_enable(void *info)
        }
 
  unlock:
-       spin_unlock_irqrestore(&ctx->lock, flags);
+       spin_unlock(&ctx->lock);
 }
 
 /*
  * Enable a counter.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid.  This condition is satisfied when called through
+ * perf_counter_for_each_child or perf_counter_for_each as described
+ * for perf_counter_disable.
  */
 static void perf_counter_enable(struct perf_counter *counter)
 {
@@ -903,7 +978,8 @@ static int context_equiv(struct perf_counter_context *ctx1,
                         struct perf_counter_context *ctx2)
 {
        return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
-               && ctx1->parent_gen == ctx2->parent_gen;
+               && ctx1->parent_gen == ctx2->parent_gen
+               && !ctx1->pin_count && !ctx2->pin_count;
 }
 
 /*
@@ -923,7 +999,9 @@ void perf_counter_task_sched_out(struct task_struct *task,
        struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
        struct perf_counter_context *ctx = task->perf_counter_ctxp;
        struct perf_counter_context *next_ctx;
+       struct perf_counter_context *parent;
        struct pt_regs *regs;
+       int do_switch = 1;
 
        regs = task_pt_regs(task);
        perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
@@ -932,30 +1010,65 @@ void perf_counter_task_sched_out(struct task_struct *task,
                return;
 
        update_context_time(ctx);
+
+       rcu_read_lock();
+       parent = rcu_dereference(ctx->parent_ctx);
        next_ctx = next->perf_counter_ctxp;
-       if (next_ctx && context_equiv(ctx, next_ctx)) {
-               task->perf_counter_ctxp = next_ctx;
-               next->perf_counter_ctxp = ctx;
-               ctx->task = next;
-               next_ctx->task = task;
-               return;
+       if (parent && next_ctx &&
+           rcu_dereference(next_ctx->parent_ctx) == parent) {
+               /*
+                * Looks like the two contexts are clones, so we might be
+                * able to optimize the context switch.  We lock both
+                * contexts and check that they are clones under the
+                * lock (including re-checking that neither has been
+                * uncloned in the meantime).  It doesn't matter which
+                * order we take the locks because no other cpu could
+                * be trying to lock both of these tasks' contexts.
+                */
+               spin_lock(&ctx->lock);
+               spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
+               if (context_equiv(ctx, next_ctx)) {
+                       /*
+                        * XXX do we need a memory barrier of sorts
+                        * wrt rcu_dereference() of perf_counter_ctxp?
+                        */
+                       task->perf_counter_ctxp = next_ctx;
+                       next->perf_counter_ctxp = ctx;
+                       ctx->task = next;
+                       next_ctx->task = task;
+                       do_switch = 0;
+               }
+               spin_unlock(&next_ctx->lock);
+               spin_unlock(&ctx->lock);
        }
+       rcu_read_unlock();
 
-       __perf_counter_sched_out(ctx, cpuctx);
-
-       cpuctx->task_ctx = NULL;
+       if (do_switch) {
+               __perf_counter_sched_out(ctx, cpuctx);
+               cpuctx->task_ctx = NULL;
+       }
 }
 
+/*
+ * Called with IRQs disabled
+ */
 static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
 {
        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 
        if (!cpuctx->task_ctx)
                return;
+
+       if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
+               return;
+
        __perf_counter_sched_out(ctx, cpuctx);
        cpuctx->task_ctx = NULL;
 }
 
+/*
+ * Called with IRQs disabled
+ */
 static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
 {
        __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
@@ -1215,18 +1328,14 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
        ctx->task = task;
 }
 
-static void put_context(struct perf_counter_context *ctx)
-{
-       if (ctx->task)
-               put_task_struct(ctx->task);
-}
-
 static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 {
        struct perf_cpu_context *cpuctx;
        struct perf_counter_context *ctx;
-       struct perf_counter_context *tctx;
+       struct perf_counter_context *parent_ctx;
        struct task_struct *task;
+       unsigned long flags;
+       int err;
 
        /*
         * If cpu is not a wildcard then this is a percpu counter:
@@ -1249,6 +1358,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 
                cpuctx = &per_cpu(perf_cpu_context, cpu);
                ctx = &cpuctx->ctx;
+               get_ctx(ctx);
 
                return ctx;
        }
@@ -1265,37 +1375,58 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
        if (!task)
                return ERR_PTR(-ESRCH);
 
+       /*
+        * Can't attach counters to a dying task.
+        */
+       err = -ESRCH;
+       if (task->flags & PF_EXITING)
+               goto errout;
+
        /* Reuse ptrace permission checks for now. */
-       if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
-               put_task_struct(task);
-               return ERR_PTR(-EACCES);
+       err = -EACCES;
+       if (!ptrace_may_access(task, PTRACE_MODE_READ))
+               goto errout;
+
+ retry:
+       ctx = perf_lock_task_context(task, &flags);
+       if (ctx) {
+               parent_ctx = ctx->parent_ctx;
+               if (parent_ctx) {
+                       put_ctx(parent_ctx);
+                       ctx->parent_ctx = NULL;         /* no longer a clone */
+               }
+               /*
+                * Get an extra reference before dropping the lock so that
+                * this context won't get freed if the task exits.
+                */
+               get_ctx(ctx);
+               spin_unlock_irqrestore(&ctx->lock, flags);
        }
 
-       ctx = task->perf_counter_ctxp;
        if (!ctx) {
                ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
-               if (!ctx) {
-                       put_task_struct(task);
-                       return ERR_PTR(-ENOMEM);
-               }
+               err = -ENOMEM;
+               if (!ctx)
+                       goto errout;
                __perf_counter_init_context(ctx, task);
-               /*
-                * Make sure other cpus see correct values for *ctx
-                * once task->perf_counter_ctxp is visible to them.
-                */
-               smp_wmb();
-               tctx = cmpxchg(&task->perf_counter_ctxp, NULL, ctx);
-               if (tctx) {
+               get_ctx(ctx);
+               if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) {
                        /*
                         * We raced with some other task; use
                         * the context they set.
                         */
                        kfree(ctx);
-                       ctx = tctx;
+                       goto retry;
                }
+               get_task_struct(task);
        }
 
+       put_task_struct(task);
        return ctx;
+
+ errout:
+       put_task_struct(task);
+       return ERR_PTR(err);
 }
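
A hedged sketch of the reference rule find_get_context() now implies for its caller (the real caller is sys_perf_counter_open further down; example_open_counter and example_create_counter are hypothetical names used only for illustration): the reference taken here is either handed over to the new counter via counter->ctx and dropped later by free_counter(), or dropped explicitly with put_ctx() when counter creation fails.

/* hypothetical helper, standing in for the real counter setup */
static struct perf_counter *example_create_counter(struct perf_counter_context *ctx);

static long example_open_counter(pid_t pid, int cpu)
{
        struct perf_counter_context *ctx;
        struct perf_counter *counter;

        ctx = find_get_context(pid, cpu);       /* takes a reference on ctx */
        if (IS_ERR(ctx))
                return PTR_ERR(ctx);

        counter = example_create_counter(ctx);  /* hypothetical helper */
        if (IS_ERR(counter)) {
                put_ctx(ctx);           /* nothing owns the reference yet */
                return PTR_ERR(counter);
        }

        /*
         * counter->ctx now owns the reference taken above; it is
         * dropped by free_counter() when the counter is released.
         */
        return 0;
}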
 
 static void free_counter_rcu(struct rcu_head *head)
@@ -1303,7 +1434,6 @@ static void free_counter_rcu(struct rcu_head *head)
        struct perf_counter *counter;
 
        counter = container_of(head, struct perf_counter, rcu_head);
-       put_ctx(counter->ctx);
        kfree(counter);
 }
 
@@ -1324,6 +1454,7 @@ static void free_counter(struct perf_counter *counter)
        if (counter->destroy)
                counter->destroy(counter);
 
+       put_ctx(counter->ctx);
        call_rcu(&counter->rcu_head, free_counter_rcu);
 }
 
@@ -1337,6 +1468,7 @@ static int perf_release(struct inode *inode, struct file *file)
 
        file->private_data = NULL;
 
+       WARN_ON_ONCE(ctx->parent_ctx);
        mutex_lock(&ctx->mutex);
        perf_counter_remove_from_context(counter);
        mutex_unlock(&ctx->mutex);
@@ -1347,7 +1479,6 @@ static int perf_release(struct inode *inode, struct file *file)
        put_task_struct(counter->owner);
 
        free_counter(counter);
-       put_context(ctx);
 
        return 0;
 }
@@ -1369,6 +1500,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
        if (counter->state == PERF_COUNTER_STATE_ERROR)
                return 0;
 
+       WARN_ON_ONCE(counter->ctx->parent_ctx);
        mutex_lock(&counter->child_mutex);
        values[0] = perf_counter_read(counter);
        n = 1;
@@ -1428,6 +1560,7 @@ static void perf_counter_for_each_sibling(struct perf_counter *counter,
        struct perf_counter_context *ctx = counter->ctx;
        struct perf_counter *sibling;
 
+       WARN_ON_ONCE(ctx->parent_ctx);
        mutex_lock(&ctx->mutex);
        counter = counter->group_leader;
 
@@ -1437,11 +1570,18 @@ static void perf_counter_for_each_sibling(struct perf_counter *counter,
        mutex_unlock(&ctx->mutex);
 }
 
+/*
+ * Holding the top-level counter's child_mutex means that any
+ * descendant process that has inherited this counter will block
+ * in sync_child_counter if it goes to exit, thus satisfying the
+ * task existence requirements of perf_counter_enable/disable.
+ */
 static void perf_counter_for_each_child(struct perf_counter *counter,
                                        void (*func)(struct perf_counter *))
 {
        struct perf_counter *child;
 
+       WARN_ON_ONCE(counter->ctx->parent_ctx);
        mutex_lock(&counter->child_mutex);
        func(counter);
        list_for_each_entry(child, &counter->child_list, child_list)
@@ -1454,6 +1594,7 @@ static void perf_counter_for_each(struct perf_counter *counter,
 {
        struct perf_counter *child;
 
+       WARN_ON_ONCE(counter->ctx->parent_ctx);
        mutex_lock(&counter->child_mutex);
        perf_counter_for_each_sibling(counter, func);
        list_for_each_entry(child, &counter->child_list, child_list)
@@ -1659,6 +1800,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 {
        struct perf_counter *counter = vma->vm_file->private_data;
 
+       WARN_ON_ONCE(counter->ctx->parent_ctx);
        if (atomic_dec_and_mutex_lock(&counter->mmap_count,
                                      &counter->mmap_mutex)) {
                struct user_struct *user = current_user();
@@ -1706,6 +1848,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
        if (vma->vm_pgoff != 0)
                return -EINVAL;
 
+       WARN_ON_ONCE(counter->ctx->parent_ctx);
        mutex_lock(&counter->mmap_mutex);
        if (atomic_inc_not_zero(&counter->mmap_count)) {
                if (nr_pages != counter->data->nr_pages)
@@ -2326,6 +2469,7 @@ static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
 static void perf_counter_comm_event(struct perf_comm_event *comm_event)
 {
        struct perf_cpu_context *cpuctx;
+       struct perf_counter_context *ctx;
        unsigned int size;
        char *comm = comm_event->task->comm;
 
@@ -2340,7 +2484,15 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event)
        perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
        put_cpu_var(perf_cpu_context);
 
-       perf_counter_comm_ctx(current->perf_counter_ctxp, comm_event);
+       rcu_read_lock();
+       /*
+        * doesn't really matter which of the child contexts the
+        * event ends up in.
+        */
+       ctx = rcu_dereference(current->perf_counter_ctxp);
+       if (ctx)
+               perf_counter_comm_ctx(ctx, comm_event);
+       rcu_read_unlock();
 }
 
 void perf_counter_comm(struct task_struct *task)
@@ -2349,8 +2501,6 @@ void perf_counter_comm(struct task_struct *task)
 
        if (!atomic_read(&nr_comm_tracking))
                return;
-       if (!current->perf_counter_ctxp)
-               return;
 
        comm_event = (struct perf_comm_event){
                .task   = task,
@@ -2433,6 +2583,7 @@ static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
 static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
 {
        struct perf_cpu_context *cpuctx;
+       struct perf_counter_context *ctx;
        struct file *file = mmap_event->file;
        unsigned int size;
        char tmp[16];
@@ -2467,7 +2618,15 @@ got_name:
        perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
        put_cpu_var(perf_cpu_context);
 
-       perf_counter_mmap_ctx(current->perf_counter_ctxp, mmap_event);
+       rcu_read_lock();
+       /*
+        * doesn't really matter which of the child contexts the
+        * event ends up in.
+        */
+       ctx = rcu_dereference(current->perf_counter_ctxp);
+       if (ctx)
+               perf_counter_mmap_ctx(ctx, mmap_event);
+       rcu_read_unlock();
 
        kfree(buf);
 }
@@ -2479,8 +2638,6 @@ void perf_counter_mmap(unsigned long addr, unsigned long len,
 
        if (!atomic_read(&nr_mmap_tracking))
                return;
-       if (!current->perf_counter_ctxp)
-               return;
 
        mmap_event = (struct perf_mmap_event){
                .file   = file,
@@ -2576,7 +2733,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
                .time = sched_clock(),
        };
 
-       ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 0, 0);
+       ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
        if (ret)
                return;
 
@@ -2781,6 +2938,7 @@ static void __perf_swcounter_event(enum perf_event_types type, u32 event,
 {
        struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
        int *recursion = perf_swcounter_recursion_context(cpuctx);
+       struct perf_counter_context *ctx;
 
        if (*recursion)
                goto out;
@@ -2790,10 +2948,15 @@ static void __perf_swcounter_event(enum perf_event_types type, u32 event,
 
        perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
                                 nr, nmi, regs, addr);
-       if (cpuctx->task_ctx) {
-               perf_swcounter_ctx_event(cpuctx->task_ctx, type, event,
-                                        nr, nmi, regs, addr);
-       }
+       rcu_read_lock();
+       /*
+        * doesn't really matter which of the child contexts the
+        * event ends up in.
+        */
+       ctx = rcu_dereference(current->perf_counter_ctxp);
+       if (ctx)
+               perf_swcounter_ctx_event(ctx, type, event, nr, nmi, regs, addr);
+       rcu_read_unlock();
 
        barrier();
        (*recursion)--;
@@ -3122,7 +3285,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
        counter->group_leader           = group_leader;
        counter->pmu                    = NULL;
        counter->ctx                    = ctx;
-       get_ctx(ctx);
+       counter->oncpu                  = -1;
 
        counter->state = PERF_COUNTER_STATE_INACTIVE;
        if (hw_event->disabled)
@@ -3267,8 +3430,10 @@ SYSCALL_DEFINE5(perf_counter_open,
                goto err_free_put_context;
 
        counter->filp = counter_file;
+       WARN_ON_ONCE(ctx->parent_ctx);
        mutex_lock(&ctx->mutex);
        perf_install_in_context(ctx, counter, cpu);
+       ++ctx->generation;
        mutex_unlock(&ctx->mutex);
 
        counter->owner = current;
@@ -3288,7 +3453,7 @@ err_free_put_context:
        kfree(counter);
 
 err_put_context:
-       put_context(ctx);
+       put_ctx(ctx);
 
        goto out_fput;
 }
@@ -3320,6 +3485,7 @@ inherit_counter(struct perf_counter *parent_counter,
                                           group_leader, GFP_KERNEL);
        if (IS_ERR(child_counter))
                return child_counter;
+       get_ctx(child_ctx);
 
        /*
         * Make the child state follow the state of the parent counter,
@@ -3353,6 +3519,7 @@ inherit_counter(struct perf_counter *parent_counter,
        /*
         * Link this into the parent counter's child list
         */
+       WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
        mutex_lock(&parent_counter->child_mutex);
        list_add_tail(&child_counter->child_list, &parent_counter->child_list);
        mutex_unlock(&parent_counter->child_mutex);
@@ -3402,6 +3569,7 @@ static void sync_child_counter(struct perf_counter *child_counter,
        /*
         * Remove this counter from the parent's list
         */
+       WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
        mutex_lock(&parent_counter->child_mutex);
        list_del_init(&child_counter->child_list);
        mutex_unlock(&parent_counter->child_mutex);
@@ -3414,8 +3582,7 @@ static void sync_child_counter(struct perf_counter *child_counter,
 }
 
 static void
-__perf_counter_exit_task(struct task_struct *child,
-                        struct perf_counter *child_counter,
+__perf_counter_exit_task(struct perf_counter *child_counter,
                         struct perf_counter_context *child_ctx)
 {
        struct perf_counter *parent_counter;
@@ -3437,11 +3604,6 @@ __perf_counter_exit_task(struct task_struct *child,
 
 /*
  * When a child task exits, feed back counter values to parent counters.
- *
- * Note: we may be running in child context, but the PID is not hashed
- * anymore so new counters will not be added.
- * (XXX not sure that is true when we get called from flush_old_exec.
- *  -- paulus)
  */
 void perf_counter_exit_task(struct task_struct *child)
 {
@@ -3449,16 +3611,36 @@ void perf_counter_exit_task(struct task_struct *child)
        struct perf_counter_context *child_ctx;
        unsigned long flags;
 
-       WARN_ON_ONCE(child != current);
-
-       child_ctx = child->perf_counter_ctxp;
-
-       if (likely(!child_ctx))
+       if (likely(!child->perf_counter_ctxp))
                return;
 
        local_irq_save(flags);
+       /*
+        * We can't reschedule here because interrupts are disabled,
+        * and either child is current or it is a task that can't be
+        * scheduled, so we are now safe from rescheduling changing
+        * our context.
+        */
+       child_ctx = child->perf_counter_ctxp;
        __perf_counter_task_sched_out(child_ctx);
+
+       /*
+        * Take the context lock here so that if find_get_context is
+        * reading child->perf_counter_ctxp, we wait until it has
+        * incremented the context's refcount before we do put_ctx below.
+        */
+       spin_lock(&child_ctx->lock);
        child->perf_counter_ctxp = NULL;
+       if (child_ctx->parent_ctx) {
+               /*
+                * This context is a clone; unclone it so it can't get
+                * swapped to another process while we're removing all
+                * the counters from it.
+                */
+               put_ctx(child_ctx->parent_ctx);
+               child_ctx->parent_ctx = NULL;
+       }
+       spin_unlock(&child_ctx->lock);
        local_irq_restore(flags);
 
        mutex_lock(&child_ctx->mutex);
@@ -3466,7 +3648,7 @@ void perf_counter_exit_task(struct task_struct *child)
 again:
        list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
                                 list_entry)
-               __perf_counter_exit_task(child, child_counter, child_ctx);
+               __perf_counter_exit_task(child_counter, child_ctx);
 
        /*
         * If the last counter was a group counter, it will have appended all
@@ -3482,11 +3664,50 @@ again:
 }
 
 /*
+ * free an unexposed, unused context as created by inheritance in
+ * perf_counter_init_task() below; used by fork() in case of failure.
+ */
+void perf_counter_free_task(struct task_struct *task)
+{
+       struct perf_counter_context *ctx = task->perf_counter_ctxp;
+       struct perf_counter *counter, *tmp;
+
+       if (!ctx)
+               return;
+
+       mutex_lock(&ctx->mutex);
+again:
+       list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) {
+               struct perf_counter *parent = counter->parent;
+
+               if (WARN_ON_ONCE(!parent))
+                       continue;
+
+               mutex_lock(&parent->child_mutex);
+               list_del_init(&counter->child_list);
+               mutex_unlock(&parent->child_mutex);
+
+               fput(parent->filp);
+
+               list_del_counter(counter, ctx);
+               free_counter(counter);
+       }
+
+       if (!list_empty(&ctx->counter_list))
+               goto again;
+
+       mutex_unlock(&ctx->mutex);
+
+       put_ctx(ctx);
+}
+
+/*
  * Initialize the perf_counter context in task_struct
  */
 int perf_counter_init_task(struct task_struct *child)
 {
        struct perf_counter_context *child_ctx, *parent_ctx;
+       struct perf_counter_context *cloned_ctx;
        struct perf_counter *counter;
        struct task_struct *parent = current;
        int inherited_all = 1;
@@ -3497,8 +3718,7 @@ int perf_counter_init_task(struct task_struct *child)
        mutex_init(&child->perf_counter_mutex);
        INIT_LIST_HEAD(&child->perf_counter_list);
 
-       parent_ctx = parent->perf_counter_ctxp;
-       if (likely(!parent_ctx || !parent_ctx->nr_counters))
+       if (likely(!parent->perf_counter_ctxp))
                return 0;
 
        /*
@@ -3513,6 +3733,20 @@ int perf_counter_init_task(struct task_struct *child)
 
        __perf_counter_init_context(child_ctx, child);
        child->perf_counter_ctxp = child_ctx;
+       get_task_struct(child);
+
+       /*
+        * If the parent's context is a clone, pin it so it won't get
+        * swapped under us.
+        */
+       parent_ctx = perf_pin_task_context(parent);
+
+       /*
+        * No need to check if parent_ctx != NULL here; since we saw
+        * it non-NULL earlier, the only reason for it to become NULL
+        * is if we exit, and since we're currently in the middle of
+        * a fork we can't be exiting at the same time.
+        */
 
        /*
         * Lock the parent list. No need to lock the child - not PID
@@ -3545,9 +3779,14 @@ int perf_counter_init_task(struct task_struct *child)
                /*
                 * Mark the child context as a clone of the parent
                 * context, or of whatever the parent is a clone of.
+                * Note that if the parent is a clone, it could get
+                * uncloned at any point, but that doesn't matter
+                * because the list of counters and the generation
+                * count can't have changed since we took the mutex.
                 */
-               if (parent_ctx->parent_ctx) {
-                       child_ctx->parent_ctx = parent_ctx->parent_ctx;
+               cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
+               if (cloned_ctx) {
+                       child_ctx->parent_ctx = cloned_ctx;
                        child_ctx->parent_gen = parent_ctx->parent_gen;
                } else {
                        child_ctx->parent_ctx = parent_ctx;
@@ -3558,6 +3797,8 @@ int perf_counter_init_task(struct task_struct *child)
 
        mutex_unlock(&parent_ctx->mutex);
 
+       perf_unpin_context(parent_ctx);
+
        return ret;
 }