tunnels: fix netns vs proto registration ordering
[safe/jmp/linux-2.6] / kernel / hw_breakpoint.c
index 08f6d01..50dbd59 100644 (file)
  * Copyright (C) 2007 Alan Stern
  * Copyright (C) IBM Corporation, 2009
  * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ * Thanks to Ingo Molnar for his many suggestions.
+ *
+ * Authors: Alan Stern <stern@rowland.harvard.edu>
+ *          K.Prasad <prasad@linux.vnet.ibm.com>
+ *          Frederic Weisbecker <fweisbec@gmail.com>
  */
 
 /*
 #include <linux/percpu.h>
 #include <linux/sched.h>
 #include <linux/init.h>
+#include <linux/cpu.h>
 #include <linux/smp.h>
 
 #include <linux/hw_breakpoint.h>
 
-#include <asm/processor.h>
+/*
+ * Constraints data
+ */
+
+/* Number of pinned cpu breakpoints in a cpu */
+static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned);
 
-#ifdef CONFIG_X86
-#include <asm/debugreg.h>
-#endif
+/* Number of pinned task breakpoints in a cpu */
+static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]);
 
-static atomic_t bp_slot;
+/* Number of non-pinned cpu/task breakpoints in a cpu */
+static DEFINE_PER_CPU(unsigned int, nr_bp_flexible);
 
-int reserve_bp_slot(struct perf_event *bp)
+/* Gather the number of total pinned and un-pinned bp in a cpuset */
+struct bp_busy_slots {
+       unsigned int pinned;
+       unsigned int flexible;
+};
+
+/* Serialize accesses to the above constraints */
+static DEFINE_MUTEX(nr_bp_mutex);
+
+/*
+ * Report the maximum number of pinned breakpoints a task
+ * has in this cpu
+ */
+static unsigned int max_task_bp_pinned(int cpu)
 {
-       if (atomic_inc_return(&bp_slot) == HBP_NUM) {
-               atomic_dec(&bp_slot);
+       int i;
+       unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
 
-               return -ENOSPC;
+       for (i = HBP_NUM -1; i >= 0; i--) {
+               if (tsk_pinned[i] > 0)
+                       return i + 1;
        }
 
        return 0;
 }
 
-void release_bp_slot(struct perf_event *bp)
+static int task_bp_pinned(struct task_struct *tsk)
 {
-       atomic_dec(&bp_slot);
+       struct perf_event_context *ctx = tsk->perf_event_ctxp;
+       struct list_head *list;
+       struct perf_event *bp;
+       unsigned long flags;
+       int count = 0;
+
+       if (WARN_ONCE(!ctx, "No perf context for this task"))
+               return 0;
+
+       list = &ctx->event_list;
+
+       raw_spin_lock_irqsave(&ctx->lock, flags);
+
+       /*
+        * The current breakpoint counter is not included in the list
+        * at the open() callback time
+        */
+       list_for_each_entry(bp, list, event_entry) {
+               if (bp->attr.type == PERF_TYPE_BREAKPOINT)
+                       count++;
+       }
+
+       raw_spin_unlock_irqrestore(&ctx->lock, flags);
+
+       return count;
 }
 
-int __register_perf_hw_breakpoint(struct perf_event *bp)
+/*
+ * Report the number of pinned/un-pinned breakpoints we have in
+ * a given cpu (cpu > -1) or in all of them (cpu = -1).
+ */
+static void
+fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
 {
-       int ret;
+       int cpu = bp->cpu;
+       struct task_struct *tsk = bp->ctx->task;
 
-       ret = reserve_bp_slot(bp);
-       if (ret)
-               return ret;
+       if (cpu >= 0) {
+               slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu);
+               if (!tsk)
+                       slots->pinned += max_task_bp_pinned(cpu);
+               else
+                       slots->pinned += task_bp_pinned(tsk);
+               slots->flexible = per_cpu(nr_bp_flexible, cpu);
 
-       if (!bp->attr.disabled)
-               ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
+               return;
+       }
 
-       return ret;
+       for_each_online_cpu(cpu) {
+               unsigned int nr;
+
+               nr = per_cpu(nr_cpu_bp_pinned, cpu);
+               if (!tsk)
+                       nr += max_task_bp_pinned(cpu);
+               else
+                       nr += task_bp_pinned(tsk);
+
+               if (nr > slots->pinned)
+                       slots->pinned = nr;
+
+               nr = per_cpu(nr_bp_flexible, cpu);
+
+               if (nr > slots->flexible)
+                       slots->flexible = nr;
+       }
 }
 
-int register_perf_hw_breakpoint(struct perf_event *bp)
+/*
+ * Add/remove a pinned breakpoint for the given task in our constraint table
+ */
+static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
+{
+       unsigned int *tsk_pinned;
+       int count = 0;
+
+       count = task_bp_pinned(tsk);
+
+       tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
+       if (enable) {
+               tsk_pinned[count]++;
+               if (count > 0)
+                       tsk_pinned[count-1]--;
+       } else {
+               tsk_pinned[count]--;
+               if (count > 0)
+                       tsk_pinned[count-1]++;
+       }
+}
+
+/*
+ * Add/remove the given breakpoint in our constraint table
+ */
+static void toggle_bp_slot(struct perf_event *bp, bool enable)
 {
-       bp->callback = perf_bp_event;
+       int cpu = bp->cpu;
+       struct task_struct *tsk = bp->ctx->task;
+
+       /* Pinned counter task profiling */
+       if (tsk) {
+               if (cpu >= 0) {
+                       toggle_bp_task_slot(tsk, cpu, enable);
+                       return;
+               }
+
+               for_each_online_cpu(cpu)
+                       toggle_bp_task_slot(tsk, cpu, enable);
+               return;
+       }
 
-       return __register_perf_hw_breakpoint(bp);
+       /* Pinned counter cpu profiling */
+       if (enable)
+               per_cpu(nr_cpu_bp_pinned, bp->cpu)++;
+       else
+               per_cpu(nr_cpu_bp_pinned, bp->cpu)--;
 }
 
 /*
- * Register a breakpoint bound to a task and a given cpu.
- * If cpu is -1, the breakpoint is active for the task in every cpu
- * If the task is -1, the breakpoint is active for every tasks in the given
- * cpu.
+ * Constraints to check before allowing this new breakpoint counter:
+ *
+ *  == Non-pinned counter == (Considered as pinned for now)
+ *
+ *   - If attached to a single cpu, check:
+ *
+ *       (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu)
+ *           + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM
+ *
+ *       -> If there are already non-pinned counters in this cpu, it means
+ *          there is already a free slot for them.
+ *          Otherwise, we check that the maximum number of per task
+ *          breakpoints (for this cpu) plus the number of per cpu breakpoint
+ *          (for this cpu) doesn't cover every register.
+ *
+ *   - If attached to every cpu, check:
+ *
+ *       (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *))
+ *           + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM
+ *
+ *       -> This is roughly the same, except we check the number of per cpu
+ *          bp for every cpu and we keep the max one. Same for the per tasks
+ *          breakpoints.
+ *
+ *
+ * == Pinned counter ==
+ *
+ *   - If attached to a single cpu, check:
+ *
+ *       ((per_cpu(nr_bp_flexible, cpu) > 0) + per_cpu(nr_cpu_bp_pinned, cpu)
+ *            + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM
+ *
+ *       -> Same checks as before. But now the nr_bp_flexible, if any, must keep
+ *          one register at least (or they will never be fed).
+ *
+ *   - If attached to every cpu, check:
+ *
+ *       ((per_cpu(nr_bp_flexible, *) > 0) + max(per_cpu(nr_cpu_bp_pinned, *))
+ *            + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM
  */
-static struct perf_event *
-register_user_hw_breakpoint_cpu(unsigned long addr,
-                               int len,
-                               int type,
-                               perf_callback_t triggered,
-                               pid_t pid,
-                               int cpu,
-                               bool active)
+int reserve_bp_slot(struct perf_event *bp)
 {
-       struct perf_event_attr *attr;
-       struct perf_event *bp;
+       struct bp_busy_slots slots = {0};
+       int ret = 0;
 
-       attr = kzalloc(sizeof(*attr), GFP_KERNEL);
-       if (!attr)
-               return ERR_PTR(-ENOMEM);
+       mutex_lock(&nr_bp_mutex);
 
-       attr->type = PERF_TYPE_BREAKPOINT;
-       attr->size = sizeof(*attr);
-       attr->bp_addr = addr;
-       attr->bp_len = len;
-       attr->bp_type = type;
-       /*
-        * Such breakpoints are used by debuggers to trigger signals when
-        * we hit the excepted memory op. We can't miss such events, they
-        * must be pinned.
-        */
-       attr->pinned = 1;
+       fetch_bp_busy_slots(&slots, bp);
 
-       if (!active)
-               attr->disabled = 1;
+       /* Flexible counters need to keep at least one slot */
+       if (slots.pinned + (!!slots.flexible) == HBP_NUM) {
+               ret = -ENOSPC;
+               goto end;
+       }
+
+       toggle_bp_slot(bp, true);
 
-       bp = perf_event_create_kernel_counter(attr, cpu, pid, triggered);
-       kfree(attr);
+end:
+       mutex_unlock(&nr_bp_mutex);
 
-       return bp;
+       return ret;
+}
+
+void release_bp_slot(struct perf_event *bp)
+{
+       mutex_lock(&nr_bp_mutex);
+
+       toggle_bp_slot(bp, false);
+
+       mutex_unlock(&nr_bp_mutex);
+}
+
+
+int register_perf_hw_breakpoint(struct perf_event *bp)
+{
+       int ret;
+
+       ret = reserve_bp_slot(bp);
+       if (ret)
+               return ret;
+
+       /*
+        * Ptrace breakpoints can be temporary perf events only
+        * meant to reserve a slot. In this case, it is created disabled and
+        * we don't want to check the params right now (as we put a null addr)
+        * But perf tools create events as disabled and we want to check
+        * the params for them.
+        * This is a quick hack that will be removed soon, once we remove
+        * the tmp breakpoints from ptrace
+        */
+       if (!bp->attr.disabled || !bp->overflow_handler)
+               ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
+
+       return ret;
 }
 
 /**
  * register_user_hw_breakpoint - register a hardware breakpoint for user space
- * @addr: is the memory address that triggers the breakpoint
- * @len: the length of the access to the memory (1 byte, 2 bytes etc...)
- * @type: the type of the access to the memory (read/write/exec)
+ * @attr: breakpoint attributes
  * @triggered: callback to trigger when we hit the breakpoint
  * @tsk: pointer to 'task_struct' of the process to which the address belongs
- * @active: should we activate it while registering it
- *
  */
 struct perf_event *
-register_user_hw_breakpoint(unsigned long addr,
-                           int len,
-                           int type,
-                           perf_callback_t triggered,
-                           struct task_struct *tsk,
-                           bool active)
+register_user_hw_breakpoint(struct perf_event_attr *attr,
+                           perf_overflow_handler_t triggered,
+                           struct task_struct *tsk)
 {
-       return register_user_hw_breakpoint_cpu(addr, len, type, triggered,
-                                              tsk->pid, -1, active);
+       return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
 }
 EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
 
 /**
  * modify_user_hw_breakpoint - modify a user-space hardware breakpoint
  * @bp: the breakpoint structure to modify
- * @addr: is the memory address that triggers the breakpoint
- * @len: the length of the access to the memory (1 byte, 2 bytes etc...)
- * @type: the type of the access to the memory (read/write/exec)
+ * @attr: new breakpoint attributes
  * @triggered: callback to trigger when we hit the breakpoint
  * @tsk: pointer to 'task_struct' of the process to which the address belongs
- * @active: should we activate it while registering it
  */
-struct perf_event *
-modify_user_hw_breakpoint(struct perf_event *bp,
-                         unsigned long addr,
-                         int len,
-                         int type,
-                         perf_callback_t triggered,
-                         struct task_struct *tsk,
-                         bool active)
+int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
 {
-       /*
-        * FIXME: do it without unregistering
-        * - We don't want to lose our slot
-        * - If the new bp is incorrect, don't lose the older one
-        */
-       unregister_hw_breakpoint(bp);
+       u64 old_addr = bp->attr.bp_addr;
+       int old_type = bp->attr.bp_type;
+       int old_len = bp->attr.bp_len;
+       int err = 0;
+
+       perf_event_disable(bp);
+
+       bp->attr.bp_addr = attr->bp_addr;
+       bp->attr.bp_type = attr->bp_type;
+       bp->attr.bp_len = attr->bp_len;
 
-       return register_user_hw_breakpoint(addr, len, type, triggered,
-                                          tsk, active);
+       if (attr->disabled)
+               goto end;
+
+       err = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
+       if (!err)
+               perf_event_enable(bp);
+
+       if (err) {
+               bp->attr.bp_addr = old_addr;
+               bp->attr.bp_type = old_type;
+               bp->attr.bp_len = old_len;
+               if (!bp->attr.disabled)
+                       perf_event_enable(bp);
+
+               return err;
+       }
+
+end:
+       bp->attr.disabled = attr->disabled;
+
+       return 0;
 }
 EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
 
@@ -192,34 +370,16 @@ void unregister_hw_breakpoint(struct perf_event *bp)
 }
 EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
 
-static struct perf_event *
-register_kernel_hw_breakpoint_cpu(unsigned long addr,
-                                 int len,
-                                 int type,
-                                 perf_callback_t triggered,
-                                 int cpu,
-                                 bool active)
-{
-       return register_user_hw_breakpoint_cpu(addr, len, type, triggered,
-                                              -1, cpu, active);
-}
-
 /**
  * register_wide_hw_breakpoint - register a wide breakpoint in the kernel
- * @addr: is the memory address that triggers the breakpoint
- * @len: the length of the access to the memory (1 byte, 2 bytes etc...)
- * @type: the type of the access to the memory (read/write/exec)
+ * @attr: breakpoint attributes
  * @triggered: callback to trigger when we hit the breakpoint
- * @active: should we activate it while registering it
  *
  * @return a set of per_cpu pointers to perf events
  */
 struct perf_event **
-register_wide_hw_breakpoint(unsigned long addr,
-                           int len,
-                           int type,
-                           perf_callback_t triggered,
-                           bool active)
+register_wide_hw_breakpoint(struct perf_event_attr *attr,
+                           perf_overflow_handler_t triggered)
 {
        struct perf_event **cpu_events, **pevent, *bp;
        long err;
@@ -229,32 +389,35 @@ register_wide_hw_breakpoint(unsigned long addr,
        if (!cpu_events)
                return ERR_PTR(-ENOMEM);
 
-       for_each_possible_cpu(cpu) {
+       get_online_cpus();
+       for_each_online_cpu(cpu) {
                pevent = per_cpu_ptr(cpu_events, cpu);
-               bp = register_kernel_hw_breakpoint_cpu(addr, len, type,
-                                       triggered, cpu, active);
+               bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered);
 
                *pevent = bp;
 
-               if (IS_ERR(bp) || !bp) {
+               if (IS_ERR(bp)) {
                        err = PTR_ERR(bp);
                        goto fail;
                }
        }
+       put_online_cpus();
 
        return cpu_events;
 
 fail:
-       for_each_possible_cpu(cpu) {
+       for_each_online_cpu(cpu) {
                pevent = per_cpu_ptr(cpu_events, cpu);
-               if (IS_ERR(*pevent) || !*pevent)
+               if (IS_ERR(*pevent))
                        break;
                unregister_hw_breakpoint(*pevent);
        }
+       put_online_cpus();
+
        free_percpu(cpu_events);
-       /* return the error if any */
        return ERR_PTR(err);
 }
+EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
 
 /**
  * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
@@ -271,7 +434,7 @@ void unregister_wide_hw_breakpoint(struct perf_event **cpu_events)
        }
        free_percpu(cpu_events);
 }
-
+EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
 
 static struct notifier_block hw_breakpoint_exceptions_nb = {
        .notifier_call = hw_breakpoint_exceptions_notify,