/*
 *  Performance counters:
 *
 *   Copyright(C) 2008, Thomas Gleixner <tglx@linutronix.de>
 *   Copyright(C) 2008, Red Hat, Inc., Ingo Molnar
 *
 *  Data type definitions, declarations, prototypes.
 *
 *  Started by: Thomas Gleixner and Ingo Molnar
 *
 *  For licensing details see kernel-base/COPYING
 */
#ifndef _LINUX_PERF_COUNTER_H
#define _LINUX_PERF_COUNTER_H

#include <linux/types.h>
#include <linux/ioctl.h>
#include <asm/byteorder.h>

/*
 * User-space ABI bits:
 */

/*
 * attr.type
 */
enum perf_type_id {
        PERF_TYPE_HARDWARE              = 0,
        PERF_TYPE_SOFTWARE              = 1,
        PERF_TYPE_TRACEPOINT            = 2,
        PERF_TYPE_HW_CACHE              = 3,
        PERF_TYPE_RAW                   = 4,

        PERF_TYPE_MAX,                  /* non ABI */
};

/*
 * Generalized performance counter event types, used by the attr.config
 * parameter of the sys_perf_counter_open() syscall:
 */
enum perf_hw_id {
        /*
         * Common hardware events, generalized by the kernel:
         */
        PERF_COUNT_HW_CPU_CYCLES                = 0,
        PERF_COUNT_HW_INSTRUCTIONS              = 1,
        PERF_COUNT_HW_CACHE_REFERENCES          = 2,
        PERF_COUNT_HW_CACHE_MISSES              = 3,
        PERF_COUNT_HW_BRANCH_INSTRUCTIONS       = 4,
        PERF_COUNT_HW_BRANCH_MISSES             = 5,
        PERF_COUNT_HW_BUS_CYCLES                = 6,

        PERF_COUNT_HW_MAX,              /* non ABI */
};

/*
 * Generalized hardware cache counters:
 *
 *       { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x
 *       { read, write, prefetch } x
 *       { accesses, misses }
 */
enum perf_hw_cache_id {
        PERF_COUNT_HW_CACHE_L1D         = 0,
        PERF_COUNT_HW_CACHE_L1I         = 1,
        PERF_COUNT_HW_CACHE_LL          = 2,
        PERF_COUNT_HW_CACHE_DTLB        = 3,
        PERF_COUNT_HW_CACHE_ITLB        = 4,
        PERF_COUNT_HW_CACHE_BPU         = 5,

        PERF_COUNT_HW_CACHE_MAX,        /* non ABI */
};

enum perf_hw_cache_op_id {
        PERF_COUNT_HW_CACHE_OP_READ     = 0,
        PERF_COUNT_HW_CACHE_OP_WRITE    = 1,
        PERF_COUNT_HW_CACHE_OP_PREFETCH = 2,

        PERF_COUNT_HW_CACHE_OP_MAX,     /* non ABI */
};

enum perf_hw_cache_op_result_id {
        PERF_COUNT_HW_CACHE_RESULT_ACCESS       = 0,
        PERF_COUNT_HW_CACHE_RESULT_MISS         = 1,

        PERF_COUNT_HW_CACHE_RESULT_MAX,         /* non ABI */
};

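/*
 * The three cache enums above select a generalized cache event when
 * attr.type == PERF_TYPE_HW_CACHE.  A minimal sketch of composing attr.config
 * from them -- the (id) | (op << 8) | (result << 16) packing is an assumption
 * about the implementation, not something this header declares, so verify it
 * against the kernel side:
 *
 *      static inline __u64 hw_cache_config(enum perf_hw_cache_id id,
 *                                          enum perf_hw_cache_op_id op,
 *                                          enum perf_hw_cache_op_result_id res)
 *      {
 *              return (__u64)id | ((__u64)op << 8) | ((__u64)res << 16);
 *      }
 *
 * For example, L1-D read misses would be:
 *
 *      hw_cache_config(PERF_COUNT_HW_CACHE_L1D,
 *                      PERF_COUNT_HW_CACHE_OP_READ,
 *                      PERF_COUNT_HW_CACHE_RESULT_MISS);
 */
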
/*
 * Special "software" counters provided by the kernel, even if the hardware
 * does not support performance counters. These counters measure various
 * physical and software events of the kernel (and allow the profiling of
 * them as well):
 */
enum perf_sw_ids {
        PERF_COUNT_SW_CPU_CLOCK         = 0,
        PERF_COUNT_SW_TASK_CLOCK        = 1,
        PERF_COUNT_SW_PAGE_FAULTS       = 2,
        PERF_COUNT_SW_CONTEXT_SWITCHES  = 3,
        PERF_COUNT_SW_CPU_MIGRATIONS    = 4,
        PERF_COUNT_SW_PAGE_FAULTS_MIN   = 5,
        PERF_COUNT_SW_PAGE_FAULTS_MAJ   = 6,

        PERF_COUNT_SW_MAX,              /* non ABI */
};

/*
 * Bits that can be set in attr.sample_type to request information
 * in the overflow packets.
 */
enum perf_counter_sample_format {
        PERF_SAMPLE_IP                  = 1U << 0,
        PERF_SAMPLE_TID                 = 1U << 1,
        PERF_SAMPLE_TIME                = 1U << 2,
        PERF_SAMPLE_ADDR                = 1U << 3,
        PERF_SAMPLE_GROUP               = 1U << 4,
        PERF_SAMPLE_CALLCHAIN           = 1U << 5,
        PERF_SAMPLE_ID                  = 1U << 6,
        PERF_SAMPLE_CPU                 = 1U << 7,
        PERF_SAMPLE_PERIOD              = 1U << 8,
};

/*
 * Bits that can be set in attr.read_format to request that
 * reads on the counter should return the indicated quantities,
 * in increasing order of bit value, after the counter value.
 */
enum perf_counter_read_format {
        PERF_FORMAT_TOTAL_TIME_ENABLED  =  1U << 0,
        PERF_FORMAT_TOTAL_TIME_RUNNING  =  1U << 1,
        PERF_FORMAT_ID                  =  1U << 2,
};

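/*
 * Illustrative layout of a read() on the counter fd, derived from the
 * comment above (spelled out for clarity, not a struct declared by this
 * header):
 *
 *      struct {
 *              u64     value;
 *              { u64   time_enabled;   } && PERF_FORMAT_TOTAL_TIME_ENABLED
 *              { u64   time_running;   } && PERF_FORMAT_TOTAL_TIME_RUNNING
 *              { u64   id;             } && PERF_FORMAT_ID
 *      };
 */
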
/*
 * Hardware event to monitor via a performance monitoring counter:
 */
struct perf_counter_attr {
        /*
         * Major type: hardware/software/tracepoint/etc.
         */
        __u32                   type;
        __u32                   __reserved_1;

        /*
         * Type specific configuration information.
         */
        __u64                   config;

        union {
                __u64           sample_period;
                __u64           sample_freq;
        };

        __u64                   sample_type;
        __u64                   read_format;

        __u64                   disabled       :  1, /* off by default        */
                                inherit        :  1, /* children inherit it   */
                                pinned         :  1, /* must always be on PMU */
                                exclusive      :  1, /* only group on PMU     */
                                exclude_user   :  1, /* don't count user      */
                                exclude_kernel :  1, /* ditto kernel          */
                                exclude_hv     :  1, /* ditto hypervisor      */
                                exclude_idle   :  1, /* don't count when idle */
                                mmap           :  1, /* include mmap data     */
                                comm           :  1, /* include comm data     */
                                freq           :  1, /* use freq, not period  */

                                __reserved_2   : 53;

        __u32                   wakeup_events;  /* wakeup every n events */
        __u32                   __reserved_3;

        __u64                   __reserved_4;
};

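/*
 * A minimal user-space sketch of filling in perf_counter_attr and opening a
 * counter (illustrative only): it counts CPU cycles for the calling task on
 * any CPU, starting disabled so it can be enabled via the ioctls below.
 * __NR_perf_counter_open is architecture specific, and the argument order
 * (attr, pid, cpu, group_fd, flags) is assumed here rather than declared by
 * this header.
 *
 *      struct perf_counter_attr attr;
 *      int fd;
 *
 *      memset(&attr, 0, sizeof(attr));
 *      attr.type        = PERF_TYPE_HARDWARE;
 *      attr.config      = PERF_COUNT_HW_CPU_CYCLES;
 *      attr.disabled    = 1;
 *      attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
 *                         PERF_FORMAT_TOTAL_TIME_RUNNING;
 *
 *      fd = syscall(__NR_perf_counter_open, &attr, 0, -1, -1, 0);
 */
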
/*
 * Ioctls that can be done on a perf counter fd:
 */
#define PERF_COUNTER_IOC_ENABLE         _IO ('$', 0)
#define PERF_COUNTER_IOC_DISABLE        _IO ('$', 1)
#define PERF_COUNTER_IOC_REFRESH        _IO ('$', 2)
#define PERF_COUNTER_IOC_RESET          _IO ('$', 3)
#define PERF_COUNTER_IOC_PERIOD         _IOW('$', 4, __u64)

enum perf_counter_ioc_flags {
        PERF_IOC_FLAG_GROUP             = 1U << 0,
};

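/*
 * Illustrative use of the ioctls above on a counter fd (a sketch): reset and
 * enable the counter, run the code of interest, disable it and read the
 * value back.  Passing PERF_IOC_FLAG_GROUP as the ioctl argument is meant to
 * apply the operation to the whole counter group rather than a single
 * counter.
 *
 *      unsigned long long value;
 *
 *      ioctl(fd, PERF_COUNTER_IOC_RESET,  0);
 *      ioctl(fd, PERF_COUNTER_IOC_ENABLE, 0);
 *
 *      ...     code being measured
 *
 *      ioctl(fd, PERF_COUNTER_IOC_DISABLE, 0);
 *      read(fd, &value, sizeof(value));
 */
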
/*
 * Structure of the page that can be mapped via mmap
 */
struct perf_counter_mmap_page {
        __u32   version;                /* version number of this structure */
        __u32   compat_version;         /* lowest version this is compat with */

        /*
         * Bits needed to read the hw counters in user-space.
         *
         *   u32 seq;
         *   s64 count;
         *
         *   do {
         *     seq = pc->lock;
         *
         *     barrier();
         *     if (pc->index) {
         *       count = pmc_read(pc->index - 1);
         *       count += pc->offset;
         *     } else
         *       goto regular_read;
         *
         *     barrier();
         *   } while (pc->lock != seq);
         *
         * NOTE: for obvious reasons this only works on self-monitoring
         *       processes.
         */
        __u32   lock;                   /* seqlock for synchronization */
        __u32   index;                  /* hardware counter identifier */
        __s64   offset;                 /* add to hardware counter value */

        /*
         * Control data for the mmap() data buffer.
         *
         * User-space should issue an rmb(), on SMP-capable platforms, after
         * reading this value -- see perf_counter_wakeup().
         */
        __u64   data_head;              /* head in the data section */
};

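/*
 * A self-contained user-space sketch of the self-monitoring read sequence
 * documented above.  pmc_read() stands for an architecture specific way of
 * reading hardware counter 'index - 1' (e.g. RDPMC on x86) and barrier()
 * for a compiler barrier; both are assumptions of this sketch, as is the
 * read() fallback used when no hardware counter is currently assigned:
 *
 *      static __s64 read_self_counter(volatile struct perf_counter_mmap_page *pc,
 *                                     int fd)
 *      {
 *              __u32 seq;
 *              __s64 count;
 *
 *              do {
 *                      seq = pc->lock;
 *
 *                      barrier();
 *                      if (pc->index) {
 *                              count  = pmc_read(pc->index - 1);
 *                              count += pc->offset;
 *                      } else {
 *                              read(fd, &count, sizeof(count));
 *                              break;
 *                      }
 *
 *                      barrier();
 *              } while (pc->lock != seq);
 *
 *              return count;
 *      }
 */
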
#define PERF_EVENT_MISC_CPUMODE_MASK    (3 << 0)
#define PERF_EVENT_MISC_CPUMODE_UNKNOWN (0 << 0)
#define PERF_EVENT_MISC_KERNEL          (1 << 0)
#define PERF_EVENT_MISC_USER            (2 << 0)
#define PERF_EVENT_MISC_HYPERVISOR      (3 << 0)
#define PERF_EVENT_MISC_OVERFLOW        (1 << 2)

struct perf_event_header {
        __u32   type;
        __u16   misc;
        __u16   size;
};

enum perf_event_type {

        /*
         * The MMAP events record the PROT_EXEC mappings so that we can
         * correlate userspace IPs to code. They have the following structure:
         *
         * struct {
         *      struct perf_event_header        header;
         *
         *      u32                             pid, tid;
         *      u64                             addr;
         *      u64                             len;
         *      u64                             pgoff;
         *      char                            filename[];
         * };
         */
        PERF_EVENT_MMAP                 = 1,

        /*
         * struct {
         *      struct perf_event_header        header;
         *
         *      u32                             pid, tid;
         *      char                            comm[];
         * };
         */
        PERF_EVENT_COMM                 = 3,

        /*
         * struct {
         *      struct perf_event_header        header;
         *      u64                             time;
         *      u64                             id;
         *      u64                             sample_period;
         * };
         */
        PERF_EVENT_PERIOD               = 4,

        /*
         * struct {
         *      struct perf_event_header        header;
         *      u64                             time;
         * };
         */
        PERF_EVENT_THROTTLE             = 5,
        PERF_EVENT_UNTHROTTLE           = 6,

        /*
         * struct {
         *      struct perf_event_header        header;
         *      u32                             pid, ppid;
         * };
         */
        PERF_EVENT_FORK                 = 7,

        /*
         * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
         * will be PERF_RECORD_*
         *
         * struct {
         *      struct perf_event_header        header;
         *
         *      { u64                   ip;       } && PERF_RECORD_IP
         *      { u32                   pid, tid; } && PERF_RECORD_TID
         *      { u64                   time;     } && PERF_RECORD_TIME
         *      { u64                   addr;     } && PERF_RECORD_ADDR
         *      { u64                   config;   } && PERF_RECORD_CONFIG
         *      { u32                   cpu, res; } && PERF_RECORD_CPU
         *
         *      { u64                   nr;
         *        { u64 id, val; }      cnt[nr];  } && PERF_RECORD_GROUP
         *
         *      { u16                   nr,
         *                              hv,
         *                              kernel,
         *                              user;
         *        u64                   ips[nr];  } && PERF_RECORD_CALLCHAIN
         * };
         */
};

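/*
 * Illustrative sketch of walking the records in the mmap()ed data area.  It
 * assumes the data pages hold a stream of records, each beginning with a
 * struct perf_event_header whose 'size' covers the whole record, that 'base'
 * points at the first data page, and that 'data_head' was read from the
 * perf_counter_mmap_page (after the rmb() noted there).  Buffer wrap-around
 * is ignored for brevity:
 *
 *      __u64 tail = 0;
 *
 *      while (tail < data_head) {
 *              struct perf_event_header *hdr = (void *)((char *)base + tail);
 *
 *              if (hdr->misc & PERF_EVENT_MISC_OVERFLOW) {
 *                      ...     parse PERF_RECORD_* fields per attr.sample_type
 *              } else {
 *                      switch (hdr->type) {
 *                      case PERF_EVENT_MMAP:
 *                      case PERF_EVENT_COMM:
 *                      case PERF_EVENT_FORK:
 *                              ...
 *                      }
 *              }
 *
 *              tail += hdr->size;
 *      }
 */
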
#ifdef __KERNEL__
/*
 * Kernel-internal data types and definitions:
 */

#ifdef CONFIG_PERF_COUNTERS
# include <asm/perf_counter.h>
#endif

#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/hrtimer.h>
#include <linux/fs.h>
#include <linux/pid_namespace.h>
#include <asm/atomic.h>

struct task_struct;

/**
 * struct hw_perf_counter - performance counter hardware details:
 */
struct hw_perf_counter {
#ifdef CONFIG_PERF_COUNTERS
        union {
                struct { /* hardware */
                        u64                             config;
                        unsigned long                   config_base;
                        unsigned long                   counter_base;
                        int                             idx;
                };
                union { /* software */
                        atomic64_t                      count;
                        struct hrtimer                  hrtimer;
                };
        };
        atomic64_t                      prev_count;
        u64                             sample_period;
        u64                             last_period;
        atomic64_t                      period_left;
        u64                             interrupts;

        u64                             freq_count;
        u64                             freq_interrupts;
        u64                             freq_stamp;
#endif
};

struct perf_counter;

/**
 * struct pmu - generic performance monitoring unit
 */
struct pmu {
        int (*enable)                   (struct perf_counter *counter);
        void (*disable)                 (struct perf_counter *counter);
        void (*read)                    (struct perf_counter *counter);
        void (*unthrottle)              (struct perf_counter *counter);
};

/**
 * enum perf_counter_active_state - the states of a counter
 */
enum perf_counter_active_state {
        PERF_COUNTER_STATE_ERROR        = -2,
        PERF_COUNTER_STATE_OFF          = -1,
        PERF_COUNTER_STATE_INACTIVE     =  0,
        PERF_COUNTER_STATE_ACTIVE       =  1,
};

struct file;

struct perf_mmap_data {
        struct rcu_head                 rcu_head;
        int                             nr_pages;       /* nr of data pages  */
        int                             nr_locked;      /* nr pages mlocked  */

        atomic_t                        poll;           /* POLL_ for wakeups */
        atomic_t                        events;         /* event limit       */

        atomic_long_t                   head;           /* write position    */
        atomic_long_t                   done_head;      /* completed head    */

        atomic_t                        lock;           /* concurrent writes */

        atomic_t                        wakeup;         /* needs a wakeup    */

        struct perf_counter_mmap_page   *user_page;
        void                            *data_pages[0];
};

struct perf_pending_entry {
        struct perf_pending_entry *next;
        void (*func)(struct perf_pending_entry *);
};

/**
 * struct perf_counter - performance counter kernel representation:
 */
struct perf_counter {
#ifdef CONFIG_PERF_COUNTERS
        struct list_head                list_entry;
        struct list_head                event_entry;
        struct list_head                sibling_list;
        int                             nr_siblings;
        struct perf_counter             *group_leader;
        const struct pmu                *pmu;

        enum perf_counter_active_state  state;
        atomic64_t                      count;

        /*
         * These are the total time in nanoseconds that the counter
         * has been enabled (i.e. eligible to run, and the task has
         * been scheduled in, if this is a per-task counter)
         * and running (scheduled onto the CPU), respectively.
         *
         * They are computed from tstamp_enabled, tstamp_running and
         * tstamp_stopped when the counter is in INACTIVE or ACTIVE state.
         */
        u64                             total_time_enabled;
        u64                             total_time_running;

        /*
         * These are timestamps used for computing total_time_enabled
         * and total_time_running when the counter is in INACTIVE or
         * ACTIVE state, measured in nanoseconds from an arbitrary point
         * in time.
         * tstamp_enabled: the notional time when the counter was enabled
         * tstamp_running: the notional time when the counter was scheduled on
         * tstamp_stopped: in INACTIVE state, the notional time when the
         *      counter was scheduled off.
         */
        u64                             tstamp_enabled;
        u64                             tstamp_running;
        u64                             tstamp_stopped;

        struct perf_counter_attr        attr;
        struct hw_perf_counter          hw;

        struct perf_counter_context     *ctx;
        struct file                     *filp;

        /*
         * These accumulate total time (in nanoseconds) that children
         * counters have been enabled and running, respectively.
         */
        atomic64_t                      child_total_time_enabled;
        atomic64_t                      child_total_time_running;

        /*
         * Protect attach/detach and child_list:
         */
        struct mutex                    child_mutex;
        struct list_head                child_list;
        struct perf_counter             *parent;

        int                             oncpu;
        int                             cpu;

        struct list_head                owner_entry;
        struct task_struct              *owner;

        /* mmap bits */
        struct mutex                    mmap_mutex;
        atomic_t                        mmap_count;
        struct perf_mmap_data           *data;

        /* poll related */
        wait_queue_head_t               waitq;
        struct fasync_struct            *fasync;

        /* delayed work for NMIs and such */
        int                             pending_wakeup;
        int                             pending_kill;
        int                             pending_disable;
        struct perf_pending_entry       pending;

        atomic_t                        event_limit;

        void (*destroy)(struct perf_counter *);
        struct rcu_head                 rcu_head;

        struct pid_namespace            *ns;
        u64                             id;
#endif
};

/**
 * struct perf_counter_context - counter context structure
 *
 * Used as a container for task counters and CPU counters as well:
 */
struct perf_counter_context {
        /*
         * Protect the states of the counters in the list,
         * nr_active, and the list:
         */
        spinlock_t              lock;
        /*
         * Protect the list of counters.  Locking either mutex or lock
         * is sufficient to ensure the list doesn't change; to change
         * the list you need to lock both the mutex and the spinlock.
         */
        struct mutex            mutex;

        struct list_head        counter_list;
        struct list_head        event_list;
        int                     nr_counters;
        int                     nr_active;
        int                     is_active;
        atomic_t                refcount;
        struct task_struct      *task;

        /*
         * Context clock, runs when context enabled.
         */
        u64                     time;
        u64                     timestamp;

        /*
         * These fields let us detect when two contexts have both
         * been cloned (inherited) from a common ancestor.
         */
        struct perf_counter_context *parent_ctx;
        u64                     parent_gen;
        u64                     generation;
        int                     pin_count;
        struct rcu_head         rcu_head;
};

/**
 * struct perf_cpu_context - per-CPU counter context structure
 */
struct perf_cpu_context {
        struct perf_counter_context     ctx;
        struct perf_counter_context     *task_ctx;
        int                             active_oncpu;
        int                             max_pertask;
        int                             exclusive;

        /*
         * Recursion avoidance:
         *
         * task, softirq, irq, nmi context
         */
        int                             recursion[4];
};

#ifdef CONFIG_PERF_COUNTERS

/*
 * Set by architecture code:
 */
extern int perf_max_counters;

extern const struct pmu *hw_perf_counter_init(struct perf_counter *counter);

extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
extern void perf_counter_task_sched_out(struct task_struct *task,
                                        struct task_struct *next, int cpu);
extern void perf_counter_task_tick(struct task_struct *task, int cpu);
extern int perf_counter_init_task(struct task_struct *child);
extern void perf_counter_exit_task(struct task_struct *child);
extern void perf_counter_free_task(struct task_struct *task);
extern void perf_counter_do_pending(void);
extern void perf_counter_print_debug(void);
extern void __perf_disable(void);
extern bool __perf_enable(void);
extern void perf_disable(void);
extern void perf_enable(void);
extern int perf_counter_task_disable(void);
extern int perf_counter_task_enable(void);
extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
               struct perf_cpu_context *cpuctx,
               struct perf_counter_context *ctx, int cpu);
extern void perf_counter_update_userpage(struct perf_counter *counter);

struct perf_sample_data {
        struct pt_regs          *regs;
        u64                     addr;
        u64                     period;
};

extern int perf_counter_overflow(struct perf_counter *counter, int nmi,
                                 struct perf_sample_data *data);

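/*
 * A hedged sketch of how architecture PMU interrupt handling might use
 * perf_counter_overflow() (illustrative; the details live in the per-arch
 * code, not in this header): fill in a perf_sample_data from the interrupt
 * registers and let a non-zero return value request that the counter be
 * stopped or throttled.
 *
 *      struct perf_sample_data data;
 *
 *      data.regs   = regs;
 *      data.addr   = 0;
 *      data.period = counter->hw.last_period;
 *
 *      if (perf_counter_overflow(counter, nmi, &data))
 *              ...     stop or throttle the counter
 */
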
/*
 * Return 1 for a software counter, 0 for a hardware counter
 */
static inline int is_software_counter(struct perf_counter *counter)
{
        return (counter->attr.type != PERF_TYPE_RAW) &&
                (counter->attr.type != PERF_TYPE_HARDWARE);
}

extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64);

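/*
 * perf_swcounter_event(event, nr, nmi, regs, addr) is the hook the rest of
 * the kernel calls to feed software events into the counters.  A sketch of a
 * call site (the page fault path is an illustration; the exact call sites
 * live elsewhere in the kernel):
 *
 *      perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
 */
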
extern void __perf_counter_mmap(struct vm_area_struct *vma);

static inline void perf_counter_mmap(struct vm_area_struct *vma)
{
        if (vma->vm_flags & VM_EXEC)
                __perf_counter_mmap(vma);
}

extern void perf_counter_comm(struct task_struct *tsk);
extern void perf_counter_fork(struct task_struct *tsk);

extern void perf_counter_task_migration(struct task_struct *task, int cpu);

#define MAX_STACK_DEPTH         255

struct perf_callchain_entry {
        u16     nr, hv, kernel, user;
        u64     ip[MAX_STACK_DEPTH];
};

extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);

extern int sysctl_perf_counter_paranoid;
extern int sysctl_perf_counter_mlock;
extern int sysctl_perf_counter_sample_rate;

extern void perf_counter_init(void);

#ifndef perf_misc_flags
#define perf_misc_flags(regs)   (user_mode(regs) ? PERF_EVENT_MISC_USER : \
                                 PERF_EVENT_MISC_KERNEL)
#define perf_instruction_pointer(regs)  instruction_pointer(regs)
#endif

#else
static inline void
perf_counter_task_sched_in(struct task_struct *task, int cpu)           { }
static inline void
perf_counter_task_sched_out(struct task_struct *task,
                            struct task_struct *next, int cpu)          { }
static inline void
perf_counter_task_tick(struct task_struct *task, int cpu)               { }
static inline int perf_counter_init_task(struct task_struct *child)     { return 0; }
static inline void perf_counter_exit_task(struct task_struct *child)    { }
static inline void perf_counter_free_task(struct task_struct *task)     { }
static inline void perf_counter_do_pending(void)                        { }
static inline void perf_counter_print_debug(void)                       { }
static inline void perf_disable(void)                                   { }
static inline void perf_enable(void)                                    { }
static inline int perf_counter_task_disable(void)       { return -EINVAL; }
static inline int perf_counter_task_enable(void)        { return -EINVAL; }

static inline void
perf_swcounter_event(u32 event, u64 nr, int nmi,
                     struct pt_regs *regs, u64 addr)                    { }

static inline void perf_counter_mmap(struct vm_area_struct *vma)        { }
static inline void perf_counter_comm(struct task_struct *tsk)           { }
static inline void perf_counter_fork(struct task_struct *tsk)           { }
static inline void perf_counter_init(void)                              { }
static inline void perf_counter_task_migration(struct task_struct *task,
                                               int cpu)                 { }
#endif

#endif /* __KERNEL__ */
#endif /* _LINUX_PERF_COUNTER_H */