perf counters: restructure the API
author: Ingo Molnar <mingo@elte.hu>
Wed, 10 Dec 2008 11:33:23 +0000 (12:33 +0100)
committer: Ingo Molnar <mingo@elte.hu>
Thu, 11 Dec 2008 14:45:48 +0000 (15:45 +0100)
Impact: clean up new API

Thorough cleanup of the new perf counters API. We now get a clean separation
of the various concepts:

 - introduce perf_counter_hw_event to separate out the event source details

 - move special type flags into separate attributes: PERF_COUNT_NMI,
   PERF_COUNT_RAW

 - extend the type to u64 and reserve it fully for the architecture in the
   raw type case.

And make use of all these changes in the core and x86 perfcounters code.

Also change the syscall signature to:

  asmlinkage int sys_perf_counter_open(

        struct perf_counter_hw_event    *hw_event_uptr          __user,
        pid_t                           pid,
        int                             cpu,
        int                             group_fd);

( Note that group_fd is unused for now - it's reserved for the counter
  groups abstraction. )

Signed-off-by: Ingo Molnar <mingo@elte.hu>
arch/x86/kernel/cpu/perf_counter.c
include/linux/perf_counter.h
include/linux/syscalls.h
kernel/perf_counter.c

index 30e7ebf..ef1936a 100644 (file)
@@ -58,8 +58,8 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
  */
 int hw_perf_counter_init(struct perf_counter *counter)
 {
+       struct perf_counter_hw_event *hw_event = &counter->hw_event;
        struct hw_perf_counter *hwc = &counter->hw;
-       u32 hw_event_type = counter->event.hw_event_type;
 
        if (unlikely(!perf_counters_initialized))
                return -EINVAL;
@@ -77,14 +77,14 @@ int hw_perf_counter_init(struct perf_counter *counter)
        hwc->nmi = 0;
        if (capable(CAP_SYS_ADMIN)) {
                hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
-               if (hw_event_type & PERF_COUNT_NMI)
+               if (hw_event->nmi)
                        hwc->nmi = 1;
        }
 
-       hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0;
-       hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0;
+       hwc->config_base        = MSR_ARCH_PERFMON_EVENTSEL0;
+       hwc->counter_base       = MSR_ARCH_PERFMON_PERFCTR0;
 
-       hwc->irq_period = counter->event.hw_event_period;
+       hwc->irq_period         = hw_event->irq_period;
        /*
         * Intel PMCs cannot be accessed sanely above 32 bit width,
         * so we install an artificial 1<<31 period regardless of
@@ -93,21 +93,20 @@ int hw_perf_counter_init(struct perf_counter *counter)
        if (!hwc->irq_period)
                hwc->irq_period = 0x7FFFFFFF;
 
-       hwc->next_count = -((s32) hwc->irq_period);
+       hwc->next_count = -(s32)hwc->irq_period;
 
        /*
         * Raw event type provide the config in the event structure
         */
-       hw_event_type &= ~PERF_COUNT_NMI;
-       if (hw_event_type == PERF_COUNT_RAW) {
-               hwc->config |= counter->event.hw_raw_ctrl;
+       if (hw_event->raw) {
+               hwc->config |= hw_event->type;
        } else {
-               if (hw_event_type >= max_intel_perfmon_events)
+               if (hw_event->type >= max_intel_perfmon_events)
                        return -EINVAL;
                /*
                 * The generic map:
                 */
-               hwc->config |= intel_perfmon_event_map[hw_event_type];
+               hwc->config |= intel_perfmon_event_map[hw_event->type];
        }
        counter->wakeup_pending = 0;
 
@@ -354,7 +353,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
        int bit;
 
        list_for_each_entry(counter, &ctx->counters, list) {
-               if (counter->record_type != PERF_RECORD_SIMPLE ||
+               if (counter->hw_event.record_type != PERF_RECORD_SIMPLE ||
                    counter == leader)
                        continue;
 
@@ -372,7 +371,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
                                perf_save_and_restart(counter);
                        }
                }
-               perf_store_irq_data(leader, counter->event.hw_event_type);
+               perf_store_irq_data(leader, counter->hw_event.type);
                perf_store_irq_data(leader, atomic64_counter_read(counter));
        }
 }
@@ -410,7 +409,7 @@ again:
 
                perf_save_and_restart(counter);
 
-               switch (counter->record_type) {
+               switch (counter->hw_event.record_type) {
                case PERF_RECORD_SIMPLE:
                        continue;
                case PERF_RECORD_IRQ:
@@ -418,7 +417,7 @@ again:
                        break;
                case PERF_RECORD_GROUP:
                        perf_store_irq_data(counter,
-                                           counter->event.hw_event_type);
+                                           counter->hw_event.type);
                        perf_store_irq_data(counter,
                                            atomic64_counter_read(counter));
                        perf_handle_group(counter, &status, &ack);
index 1f00176..a2b4852 100644 (file)
 struct task_struct;
 
 /*
- * Generalized hardware event types, used by the hw_event_type parameter
- * of the sys_perf_counter_open() syscall:
+ * User-space ABI bits:
+ */
+
+/*
+ * Generalized performance counter event types, used by the hw_event.type
+ * parameter of the sys_perf_counter_open() syscall:
  */
 enum hw_event_types {
-       PERF_COUNT_CYCLES,
-       PERF_COUNT_INSTRUCTIONS,
-       PERF_COUNT_CACHE_REFERENCES,
-       PERF_COUNT_CACHE_MISSES,
-       PERF_COUNT_BRANCH_INSTRUCTIONS,
-       PERF_COUNT_BRANCH_MISSES,
        /*
-        * If this bit is set in the type, then trigger NMI sampling:
+        * Common hardware events, generalized by the kernel:
         */
-       PERF_COUNT_NMI                  = (1 << 30),
-       PERF_COUNT_RAW                  = (1 << 31),
+       PERF_COUNT_CYCLES               =  0,
+       PERF_COUNT_INSTRUCTIONS         =  1,
+       PERF_COUNT_CACHE_REFERENCES     =  2,
+       PERF_COUNT_CACHE_MISSES         =  3,
+       PERF_COUNT_BRANCH_INSTRUCTIONS  =  4,
+       PERF_COUNT_BRANCH_MISSES        =  5,
+
+       /*
+        * Special "software" counters provided by the kernel, even if
+        * the hardware does not support performance counters. These
+        * counters measure various physical and sw events of the
+        * kernel (and allow the profiling of them as well):
+        */
+       PERF_COUNT_CPU_CLOCK            = -1,
+       PERF_COUNT_TASK_CLOCK           = -2,
+       PERF_COUNT_PAGE_FAULTS          = -3,
+       PERF_COUNT_CONTEXT_SWITCHES     = -4,
 };
 
 /*
  * IRQ-notification data record type:
  */
-enum perf_record_type {
-       PERF_RECORD_SIMPLE,
-       PERF_RECORD_IRQ,
-       PERF_RECORD_GROUP,
+enum perf_counter_record_type {
+       PERF_RECORD_SIMPLE              =  0,
+       PERF_RECORD_IRQ                 =  1,
+       PERF_RECORD_GROUP               =  2,
 };
 
-struct perf_counter_event {
-       u32                     hw_event_type;
-       u32                     hw_event_period;
-       u64                     hw_raw_ctrl;
+/*
+ * Hardware event to monitor via a performance monitoring counter:
+ */
+struct perf_counter_hw_event {
+       u64                     type;
+
+       u64                     irq_period;
+       u32                     record_type;
+
+       u32                     disabled     :  1, /* off by default */
+                               nmi          :  1, /* NMI sampling   */
+                               raw          :  1, /* raw event type */
+                               __reserved_1 : 29;
+
+       u64                     __reserved_2;
 };
 
+/*
+ * Kernel-internal data types:
+ */
+
 /**
- * struct hw_perf_counter - performance counter hardware details
+ * struct hw_perf_counter - performance counter hardware details:
  */
 struct hw_perf_counter {
-       u64                     config;
-       unsigned long           config_base;
-       unsigned long           counter_base;
-       int                     nmi;
-       unsigned int            idx;
-       u64                     prev_count;
-       s32                     next_count;
-       u64                     irq_period;
+       u64                             config;
+       unsigned long                   config_base;
+       unsigned long                   counter_base;
+       int                             nmi;
+       unsigned int                    idx;
+       u64                             prev_count;
+       u64                             irq_period;
+       s32                             next_count;
 };
 
 /*
  * Hardcoded buffer length limit for now, for IRQ-fed events:
  */
-#define PERF_DATA_BUFLEN       2048
+#define PERF_DATA_BUFLEN               2048
 
 /**
  * struct perf_data - performance counter IRQ data sampling ...
  */
 struct perf_data {
-       int                     len;
-       int                     rd_idx;
-       int                     overrun;
-       u8                      data[PERF_DATA_BUFLEN];
+       int                             len;
+       int                             rd_idx;
+       int                             overrun;
+       u8                              data[PERF_DATA_BUFLEN];
 };
 
 /**
@@ -96,7 +124,7 @@ struct perf_counter {
 #else
        atomic_t                        count32[2];
 #endif
-       struct perf_counter_event       event;
+       struct perf_counter_hw_event    hw_event;
        struct hw_perf_counter          hw;
 
        struct perf_counter_context     *ctx;
@@ -110,8 +138,6 @@ struct perf_counter {
        int                             oncpu;
        int                             cpu;
 
-       enum perf_record_type           record_type;
-
        /* read() / irq related data */
        wait_queue_head_t               waitq;
        /* optional: for NMIs */
index 3ecd73d..a549678 100644 (file)
@@ -54,7 +54,7 @@ struct compat_stat;
 struct compat_timeval;
 struct robust_list_head;
 struct getcpu_cache;
-struct perf_counter_event;
+struct perf_counter_hw_event;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -625,7 +625,11 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
 
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
-asmlinkage int
-sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type,
-                     pid_t pid, int cpu, int masterfd);
+
+asmlinkage int sys_perf_counter_open(
+
+       struct perf_counter_hw_event    *hw_event_uptr          __user,
+       pid_t                           pid,
+       int                             cpu,
+       int                             group_fd);
 #endif
index 2557c67..0d323ce 100644 (file)
@@ -669,7 +669,7 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 {
        struct perf_counter *counter = file->private_data;
 
-       switch (counter->record_type) {
+       switch (counter->hw_event.record_type) {
        case PERF_RECORD_SIMPLE:
                return perf_read_hw(counter, buf, count);
 
@@ -707,7 +707,7 @@ static const struct file_operations perf_fops = {
  * Allocate and initialize a counter structure
  */
 static struct perf_counter *
-perf_counter_alloc(struct perf_counter_event *event, int cpu, u32 record_type)
+perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu)
 {
        struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL);
 
@@ -718,31 +718,37 @@ perf_counter_alloc(struct perf_counter_event *event, int cpu, u32 record_type)
        INIT_LIST_HEAD(&counter->list);
        init_waitqueue_head(&counter->waitq);
 
-       counter->irqdata        = &counter->data[0];
-       counter->usrdata        = &counter->data[1];
-       counter->cpu            = cpu;
-       counter->record_type    = record_type;
-       counter->event          = *event;
-       counter->wakeup_pending = 0;
+       counter->irqdata                = &counter->data[0];
+       counter->usrdata                = &counter->data[1];
+       counter->cpu                    = cpu;
+       counter->hw_event               = *hw_event;
+       counter->wakeup_pending         = 0;
 
        return counter;
 }
 
 /**
- * sys_perf_task_open - open a performance counter associate it to a task
- * @hw_event_type:     event type for monitoring/sampling...
+ * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
+ *
+ * @hw_event_uptr:     event type attributes for monitoring/sampling
  * @pid:               target pid
+ * @cpu:               target cpu
+ * @group_fd:          group leader counter fd
  */
-asmlinkage int
-sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type,
-                     pid_t pid, int cpu, int masterfd)
+asmlinkage int sys_perf_counter_open(
+
+       struct perf_counter_hw_event    *hw_event_uptr          __user,
+       pid_t                           pid,
+       int                             cpu,
+       int                             group_fd)
+
 {
        struct perf_counter_context *ctx;
-       struct perf_counter_event event;
+       struct perf_counter_hw_event hw_event;
        struct perf_counter *counter;
        int ret;
 
-       if (copy_from_user(&event, uevent, sizeof(event)) != 0)
+       if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
                return -EFAULT;
 
        ctx = find_get_context(pid, cpu);
@@ -750,7 +756,7 @@ sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type,
                return PTR_ERR(ctx);
 
        ret = -ENOMEM;
-       counter = perf_counter_alloc(&event, cpu, record_type);
+       counter = perf_counter_alloc(&hw_event, cpu);
        if (!counter)
                goto err_put_context;