x86: perf_counter cleanup
arch/x86/kernel/cpu/perf_counter.c
/*
 * Performance counter x86 architecture code
 *
 *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
 *  Copyright(C) 2009 Jaswinder Singh Rajput
 *
 *  For licencing details see kernel-base/COPYING
 */

#include <linux/perf_counter.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/sched.h>

#include <asm/perf_counter.h>
#include <asm/apic.h>

static bool perf_counters_initialized __read_mostly;

/*
 * Number of (generic) HW counters:
 */
static int nr_counters_generic __read_mostly;
static u64 perf_counter_mask __read_mostly;
static u64 counter_value_mask __read_mostly;
static int counter_value_bits __read_mostly;

static int nr_counters_fixed __read_mostly;

struct cpu_hw_counters {
        struct perf_counter     *counters[X86_PMC_IDX_MAX];
        unsigned long           used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
        unsigned long           interrupts;
        u64                     throttle_ctrl;
        unsigned long           active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
        int                     enabled;
};

/*
 * struct pmc_x86_ops - performance counter x86 ops
 */
struct pmc_x86_ops {
        u64             (*save_disable_all)(void);
        void            (*restore_all)(u64);
        u64             (*get_status)(u64);
        void            (*ack_status)(u64);
        void            (*enable)(int, u64);
        void            (*disable)(int, u64);
        unsigned        eventsel;
        unsigned        perfctr;
        u64             (*event_map)(int);
        u64             (*raw_event)(u64);
        int             max_events;
};

static struct pmc_x86_ops *pmc_ops;

static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
        .enabled = 1,
};

/*
 * Intel PerfMon v3. Used on Core2 and later.
 */
static const u64 intel_perfmon_event_map[] =
{
  [PERF_COUNT_CPU_CYCLES]               = 0x003c,
  [PERF_COUNT_INSTRUCTIONS]             = 0x00c0,
  [PERF_COUNT_CACHE_REFERENCES]         = 0x4f2e,
  [PERF_COUNT_CACHE_MISSES]             = 0x412e,
  [PERF_COUNT_BRANCH_INSTRUCTIONS]      = 0x00c4,
  [PERF_COUNT_BRANCH_MISSES]            = 0x00c5,
  [PERF_COUNT_BUS_CYCLES]               = 0x013c,
};

static u64 pmc_intel_event_map(int event)
{
        return intel_perfmon_event_map[event];
}

static u64 pmc_intel_raw_event(u64 event)
{
#define CORE_EVNTSEL_EVENT_MASK         0x000000FF
#define CORE_EVNTSEL_UNIT_MASK          0x0000FF00
#define CORE_EVNTSEL_COUNTER_MASK       0xFF000000

#define CORE_EVNTSEL_MASK               \
        (CORE_EVNTSEL_EVENT_MASK |      \
         CORE_EVNTSEL_UNIT_MASK  |      \
         CORE_EVNTSEL_COUNTER_MASK)

        return event & CORE_EVNTSEL_MASK;
}

/*
 * AMD Performance Monitor K7 and later.
 */
static const u64 amd_perfmon_event_map[] =
{
  [PERF_COUNT_CPU_CYCLES]               = 0x0076,
  [PERF_COUNT_INSTRUCTIONS]             = 0x00c0,
  [PERF_COUNT_CACHE_REFERENCES]         = 0x0080,
  [PERF_COUNT_CACHE_MISSES]             = 0x0081,
  [PERF_COUNT_BRANCH_INSTRUCTIONS]      = 0x00c4,
  [PERF_COUNT_BRANCH_MISSES]            = 0x00c5,
};

static u64 pmc_amd_event_map(int event)
{
        return amd_perfmon_event_map[event];
}

static u64 pmc_amd_raw_event(u64 event)
{
#define K7_EVNTSEL_EVENT_MASK   0x7000000FF
#define K7_EVNTSEL_UNIT_MASK    0x00000FF00
#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000

#define K7_EVNTSEL_MASK                 \
        (K7_EVNTSEL_EVENT_MASK |        \
         K7_EVNTSEL_UNIT_MASK  |        \
         K7_EVNTSEL_COUNTER_MASK)

        return event & K7_EVNTSEL_MASK;
}

/*
 * Propagate counter elapsed time into the generic counter.
 * Can only be executed on the CPU where the counter is active.
 */
static void
x86_perf_counter_update(struct perf_counter *counter,
                        struct hw_perf_counter *hwc, int idx)
{
        u64 prev_raw_count, new_raw_count, delta;

        /*
         * Careful: an NMI might modify the previous counter value.
         *
         * Our tactic to handle this is to first atomically read and
         * exchange a new raw count - then add that new-prev delta
         * count to the generic counter atomically:
         */
again:
        prev_raw_count = atomic64_read(&hwc->prev_count);
        rdmsrl(hwc->counter_base + idx, new_raw_count);

        if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
                                        new_raw_count) != prev_raw_count)
                goto again;

        /*
         * Now we have the new raw value and have updated the prev
         * timestamp already. We can now calculate the elapsed delta
         * (counter-)time and add that to the generic counter.
         *
         * Careful, not all hw sign-extends above the physical width
         * of the count, so we do that by clipping the delta to 32 bits:
         */
        delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count);

        atomic64_add(delta, &counter->count);
        atomic64_sub(delta, &hwc->period_left);
}

/*
 * Setup the hardware configuration for a given hw_event_type
 */
static int __hw_perf_counter_init(struct perf_counter *counter)
{
        struct perf_counter_hw_event *hw_event = &counter->hw_event;
        struct hw_perf_counter *hwc = &counter->hw;

        if (unlikely(!perf_counters_initialized))
                return -EINVAL;

        /*
         * Generate PMC IRQs:
         * (keep 'enabled' bit clear for now)
         */
        hwc->config = ARCH_PERFMON_EVENTSEL_INT;

        /*
         * Count user and OS events unless requested not to.
         */
        if (!hw_event->exclude_user)
                hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
        if (!hw_event->exclude_kernel)
                hwc->config |= ARCH_PERFMON_EVENTSEL_OS;

        /*
         * If privileged enough, allow NMI events:
         */
        hwc->nmi = 0;
        if (capable(CAP_SYS_ADMIN) && hw_event->nmi)
                hwc->nmi = 1;

        hwc->irq_period         = hw_event->irq_period;
        /*
         * Intel PMCs cannot be accessed sanely above 32 bit width,
         * so we install an artificial 1<<31 period regardless of
         * the generic counter period:
         */
        if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
                if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
                        hwc->irq_period = 0x7FFFFFFF;

        atomic64_set(&hwc->period_left, hwc->irq_period);

        /*
         * Raw event types provide the config in the event structure
         */
        if (hw_event->raw) {
                hwc->config |= pmc_ops->raw_event(hw_event->type);
        } else {
                if (hw_event->type >= pmc_ops->max_events)
                        return -EINVAL;
                /*
                 * The generic map:
                 */
                hwc->config |= pmc_ops->event_map(hw_event->type);
        }
        counter->wakeup_pending = 0;

        return 0;
}

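/*
 * Globally disable all counters on Intel CPUs by clearing
 * MSR_CORE_PERF_GLOBAL_CTRL, returning its previous value so that
 * hw_perf_restore() can re-enable the same set later:
 */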
static u64 pmc_intel_save_disable_all(void)
{
        u64 ctrl;

        rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
        wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);

        return ctrl;
}

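/*
 * AMD has no global control MSR, so disable the PMU by clearing the
 * enable bit of each EVNTSEL individually; the previous per-CPU
 * 'enabled' state is handed back for hw_perf_restore():
 */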
static u64 pmc_amd_save_disable_all(void)
{
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
        int enabled, idx;

        enabled = cpuc->enabled;
        cpuc->enabled = 0;
        barrier();

        for (idx = 0; idx < nr_counters_generic; idx++) {
                u64 val;

                rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
                if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) {
                        val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
                        wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
                }
        }

        return enabled;
}

u64 hw_perf_save_disable(void)
{
        if (unlikely(!perf_counters_initialized))
                return 0;

        return pmc_ops->save_disable_all();
}
/*
 * Exported because of ACPI idle
 */
EXPORT_SYMBOL_GPL(hw_perf_save_disable);

static void pmc_intel_restore_all(u64 ctrl)
{
        wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
}

static void pmc_amd_restore_all(u64 ctrl)
{
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
        int idx;

        cpuc->enabled = ctrl;
        barrier();
        if (!ctrl)
                return;

        for (idx = 0; idx < nr_counters_generic; idx++) {
                if (test_bit(idx, cpuc->active_mask)) {
                        u64 val;

                        rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
                        val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
                        wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
                }
        }
}

void hw_perf_restore(u64 ctrl)
{
        if (unlikely(!perf_counters_initialized))
                return;

        pmc_ops->restore_all(ctrl);
}
/*
 * Exported because of ACPI idle
 */
EXPORT_SYMBOL_GPL(hw_perf_restore);

static u64 pmc_intel_get_status(u64 mask)
{
        u64 status;

        rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);

        return status;
}

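/*
 * AMD has no global overflow status MSR either: each counter is armed
 * with bit 47 set, so report it as overflowed once that bit reads back
 * as clear after the counter has wrapped:
 */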
static u64 pmc_amd_get_status(u64 mask)
{
        u64 status = 0;
        int idx;

        for (idx = 0; idx < nr_counters_generic; idx++) {
                s64 val;

                if (!(mask & (1 << idx)))
                        continue;

                rdmsrl(MSR_K7_PERFCTR0 + idx, val);
                val <<= (64 - counter_value_bits);
                if (val >= 0)
                        status |= (1 << idx);
        }

        return status;
}

static u64 hw_perf_get_status(u64 mask)
{
        if (unlikely(!perf_counters_initialized))
                return 0;

        return pmc_ops->get_status(mask);
}

static void pmc_intel_ack_status(u64 ack)
{
        wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
}

static void pmc_amd_ack_status(u64 ack)
{
}

static void hw_perf_ack_status(u64 ack)
{
        if (unlikely(!perf_counters_initialized))
                return;

        pmc_ops->ack_status(ack);
}

static void pmc_intel_enable(int idx, u64 config)
{
        wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx,
                        config | ARCH_PERFMON_EVENTSEL0_ENABLE);
}

static void pmc_amd_enable(int idx, u64 config)
{
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);

        set_bit(idx, cpuc->active_mask);
        if (cpuc->enabled)
                config |= ARCH_PERFMON_EVENTSEL0_ENABLE;

        wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
}

static void hw_perf_enable(int idx, u64 config)
{
        if (unlikely(!perf_counters_initialized))
                return;

        pmc_ops->enable(idx, config);
}

static void pmc_intel_disable(int idx, u64 config)
{
        wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config);
}

static void pmc_amd_disable(int idx, u64 config)
{
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);

        clear_bit(idx, cpuc->active_mask);
        wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
}

static void hw_perf_disable(int idx, u64 config)
{
        if (unlikely(!perf_counters_initialized))
                return;

        pmc_ops->disable(idx, config);
}

static inline void
__pmc_fixed_disable(struct perf_counter *counter,
                    struct hw_perf_counter *hwc, unsigned int __idx)
{
        int idx = __idx - X86_PMC_IDX_FIXED;
        u64 ctrl_val, mask;
        int err;

        mask = 0xfULL << (idx * 4);

        rdmsrl(hwc->config_base, ctrl_val);
        ctrl_val &= ~mask;
        err = checking_wrmsrl(hwc->config_base, ctrl_val);
}

static inline void
__pmc_generic_disable(struct perf_counter *counter,
                           struct hw_perf_counter *hwc, unsigned int idx)
{
        if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
                __pmc_fixed_disable(counter, hwc, idx);
        else
                hw_perf_disable(idx, hwc->config);
}

static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);

/*
 * Set the next IRQ period, based on the hwc->period_left value.
 * To be called with the counter disabled in hw:
 */
static void
__hw_perf_counter_set_period(struct perf_counter *counter,
                             struct hw_perf_counter *hwc, int idx)
{
        s64 left = atomic64_read(&hwc->period_left);
        s32 period = hwc->irq_period;
        int err;

        /*
         * If we are way outside a reasonable range then just skip forward:
         */
        if (unlikely(left <= -period)) {
                left = period;
                atomic64_set(&hwc->period_left, left);
        }

        if (unlikely(left <= 0)) {
                left += period;
                atomic64_set(&hwc->period_left, left);
        }

        per_cpu(prev_left[idx], smp_processor_id()) = left;

        /*
         * The hw counter starts counting from this counter offset,
         * mark it to be able to extract future deltas:
         */
        atomic64_set(&hwc->prev_count, (u64)-left);

        err = checking_wrmsrl(hwc->counter_base + idx,
                             (u64)(-left) & counter_value_mask);
}

static inline void
__pmc_fixed_enable(struct perf_counter *counter,
                   struct hw_perf_counter *hwc, unsigned int __idx)
{
        int idx = __idx - X86_PMC_IDX_FIXED;
        u64 ctrl_val, bits, mask;
        int err;

        /*
         * Enable IRQ generation (0x8),
         * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
         * if requested:
         */
        bits = 0x8ULL;
        if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
                bits |= 0x2;
        if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
                bits |= 0x1;
        bits <<= (idx * 4);
        mask = 0xfULL << (idx * 4);

        rdmsrl(hwc->config_base, ctrl_val);
        ctrl_val &= ~mask;
        ctrl_val |= bits;
        err = checking_wrmsrl(hwc->config_base, ctrl_val);
}

static void
__pmc_generic_enable(struct perf_counter *counter,
                          struct hw_perf_counter *hwc, int idx)
{
        if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
                __pmc_fixed_enable(counter, hwc, idx);
        else
                hw_perf_enable(idx, hwc->config);
}

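/*
 * Map a generic hardware event onto one of the Intel fixed-purpose
 * counters, or return -1 if it has to live in a generic counter
 * (AMD and NMI counters always take the generic path):
 */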
static int
fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
{
        unsigned int event;

        if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
                return -1;

        if (unlikely(hwc->nmi))
                return -1;

        event = hwc->config & ARCH_PERFMON_EVENT_MASK;

        if (unlikely(event == pmc_ops->event_map(PERF_COUNT_INSTRUCTIONS)))
                return X86_PMC_IDX_FIXED_INSTRUCTIONS;
        if (unlikely(event == pmc_ops->event_map(PERF_COUNT_CPU_CYCLES)))
                return X86_PMC_IDX_FIXED_CPU_CYCLES;
        if (unlikely(event == pmc_ops->event_map(PERF_COUNT_BUS_CYCLES)))
                return X86_PMC_IDX_FIXED_BUS_CYCLES;

        return -1;
}

/*
 * Find a PMC slot for the freshly enabled / scheduled in counter:
 */
static int pmc_generic_enable(struct perf_counter *counter)
{
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
        struct hw_perf_counter *hwc = &counter->hw;
        int idx;

        idx = fixed_mode_idx(counter, hwc);
        if (idx >= 0) {
                /*
                 * Try to get the fixed counter, if that is already taken
                 * then try to get a generic counter:
                 */
                if (test_and_set_bit(idx, cpuc->used))
                        goto try_generic;

                hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
                /*
                 * We set it so that counter_base + idx in wrmsr/rdmsr maps to
                 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
                 */
                hwc->counter_base =
                        MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
                hwc->idx = idx;
        } else {
                idx = hwc->idx;
                /* Try to get the previous generic counter again */
                if (test_and_set_bit(idx, cpuc->used)) {
try_generic:
                        idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
                        if (idx == nr_counters_generic)
                                return -EAGAIN;

                        set_bit(idx, cpuc->used);
                        hwc->idx = idx;
                }
                hwc->config_base  = pmc_ops->eventsel;
                hwc->counter_base = pmc_ops->perfctr;
        }

        perf_counters_lapic_init(hwc->nmi);

        __pmc_generic_disable(counter, hwc, idx);

        cpuc->counters[idx] = counter;
        /*
         * Make it visible before enabling the hw:
         */
        smp_wmb();

        __hw_perf_counter_set_period(counter, hwc, idx);
        __pmc_generic_enable(counter, hwc, idx);

        return 0;
}

void perf_counter_print_debug(void)
{
        u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
        struct cpu_hw_counters *cpuc;
        int cpu, idx;

        if (!nr_counters_generic)
                return;

        local_irq_disable();

        cpu = smp_processor_id();
        cpuc = &per_cpu(cpu_hw_counters, cpu);

        if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
                rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
                rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
                rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
                rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);

                pr_info("\n");
                pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
                pr_info("CPU#%d: status:     %016llx\n", cpu, status);
                pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
                pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
        }
        pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);

        for (idx = 0; idx < nr_counters_generic; idx++) {
                rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl);
                rdmsrl(pmc_ops->perfctr  + idx, pmc_count);

                prev_left = per_cpu(prev_left[idx], cpu);

                pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
                        cpu, idx, pmc_ctrl);
                pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
                        cpu, idx, pmc_count);
                pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
                        cpu, idx, prev_left);
        }
        for (idx = 0; idx < nr_counters_fixed; idx++) {
                rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);

                pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
                        cpu, idx, pmc_count);
        }
        local_irq_enable();
}

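/*
 * Free the PMC slot of a counter that is being scheduled out and fold
 * its remaining delta into the generic count:
 */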
static void pmc_generic_disable(struct perf_counter *counter)
{
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
        struct hw_perf_counter *hwc = &counter->hw;
        unsigned int idx = hwc->idx;

        __pmc_generic_disable(counter, hwc, idx);

        clear_bit(idx, cpuc->used);
        cpuc->counters[idx] = NULL;
        /*
         * Make sure the cleared pointer becomes visible before we
         * (potentially) free the counter:
         */
        smp_wmb();

        /*
         * Drain the remaining delta count out of a counter
         * that we are disabling:
         */
        x86_perf_counter_update(counter, hwc, idx);
}

static void perf_store_irq_data(struct perf_counter *counter, u64 data)
{
        struct perf_data *irqdata = counter->irqdata;

        if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
                irqdata->overrun++;
        } else {
                u64 *p = (u64 *) &irqdata->data[irqdata->len];

                *p = data;
                irqdata->len += sizeof(u64);
        }
}

/*
 * Save and restart an expired counter. Called by NMI contexts,
 * so it has to be careful about preempting normal counter ops:
 */
static void perf_save_and_restart(struct perf_counter *counter)
{
        struct hw_perf_counter *hwc = &counter->hw;
        int idx = hwc->idx;

        x86_perf_counter_update(counter, hwc, idx);
        __hw_perf_counter_set_period(counter, hwc, idx);

        if (counter->state == PERF_COUNTER_STATE_ACTIVE)
                __pmc_generic_enable(counter, hwc, idx);
}

static void
perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
{
        struct perf_counter *counter, *group_leader = sibling->group_leader;

        /*
         * Store sibling timestamps (if any):
         */
        list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {

                x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
                perf_store_irq_data(sibling, counter->hw_event.type);
                perf_store_irq_data(sibling, atomic64_read(&counter->count));
        }
}

/*
 * Maximum interrupt frequency of 100KHz per CPU
 */
#define PERFMON_MAX_INTERRUPTS (100000/HZ)

/*
 * This handler is triggered by the local APIC, so the APIC IRQ handling
 * rules apply:
 */
static int __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
{
        int bit, cpu = smp_processor_id();
        u64 ack, status;
        struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
        int ret = 0;

        cpuc->throttle_ctrl = hw_perf_save_disable();

        status = hw_perf_get_status(cpuc->throttle_ctrl);
        if (!status)
                goto out;

        ret = 1;
again:
        inc_irq_stat(apic_perf_irqs);
        ack = status;
        for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
                struct perf_counter *counter = cpuc->counters[bit];

                clear_bit(bit, (unsigned long *) &status);
                if (!counter)
                        continue;

                perf_save_and_restart(counter);

                switch (counter->hw_event.record_type) {
                case PERF_RECORD_SIMPLE:
                        continue;
                case PERF_RECORD_IRQ:
                        perf_store_irq_data(counter, instruction_pointer(regs));
                        break;
                case PERF_RECORD_GROUP:
                        perf_handle_group(counter, &status, &ack);
                        break;
                }
                /*
                 * From NMI context we cannot call into the scheduler to
                 * do a task wakeup - but we mark the counter as
                 * wakeup_pending and initiate a wakeup callback:
                 */
                if (nmi) {
                        counter->wakeup_pending = 1;
                        set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
                } else {
                        wake_up(&counter->waitq);
                }
        }

        hw_perf_ack_status(ack);

        /*
         * Repeat if there is more work to be done:
         */
        status = hw_perf_get_status(cpuc->throttle_ctrl);
        if (status)
                goto again;
out:
        /*
         * Restore - do not reenable when global enable is off or throttled:
         */
        if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS)
                hw_perf_restore(cpuc->throttle_ctrl);

        return ret;
}

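/*
 * Reset the per-CPU interrupt budget and undo the throttling that
 * __smp_perf_counter_interrupt() applied once the budget was exceeded:
 */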
void perf_counter_unthrottle(void)
{
        struct cpu_hw_counters *cpuc;

        if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
                return;

        if (unlikely(!perf_counters_initialized))
                return;

        cpuc = &__get_cpu_var(cpu_hw_counters);
        if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
                if (printk_ratelimit())
                        printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n");
                hw_perf_restore(cpuc->throttle_ctrl);
        }
        cpuc->interrupts = 0;
}

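/*
 * IRQ-context entry point for the local APIC performance counter
 * vector: re-arm the LVT entry, ack the APIC and run the common handler:
 */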
void smp_perf_counter_interrupt(struct pt_regs *regs)
{
        irq_enter();
        apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
        ack_APIC_irq();
        __smp_perf_counter_interrupt(regs, 0);
        irq_exit();
}

/*
 * Handle the wakeups that NMI contexts had to defer via TIF_PERF_COUNTERS:
 */
void perf_counter_notify(struct pt_regs *regs)
{
        struct cpu_hw_counters *cpuc;
        unsigned long flags;
        int bit, cpu;

        local_irq_save(flags);
        cpu = smp_processor_id();
        cpuc = &per_cpu(cpu_hw_counters, cpu);

        for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) {
                struct perf_counter *counter = cpuc->counters[bit];

                if (!counter)
                        continue;

                if (counter->wakeup_pending) {
                        counter->wakeup_pending = 0;
                        wake_up(&counter->waitq);
                }
        }

        local_irq_restore(flags);
}

void perf_counters_lapic_init(int nmi)
{
        u32 apic_val;

        if (!perf_counters_initialized)
                return;
        /*
         * Enable the performance counter vector in the APIC LVT:
         */
        apic_val = apic_read(APIC_LVTERR);

        apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED);
        if (nmi)
                apic_write(APIC_LVTPC, APIC_DM_NMI);
        else
                apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
        apic_write(APIC_LVTERR, apic_val);
}

static int __kprobes
perf_counter_nmi_handler(struct notifier_block *self,
                         unsigned long cmd, void *__args)
{
        struct die_args *args = __args;
        struct pt_regs *regs;
        int ret;

        switch (cmd) {
        case DIE_NMI:
        case DIE_NMI_IPI:
                break;

        default:
                return NOTIFY_DONE;
        }

        regs = args->regs;

        apic_write(APIC_LVTPC, APIC_DM_NMI);
        ret = __smp_perf_counter_interrupt(regs, 1);

        return ret ? NOTIFY_STOP : NOTIFY_OK;
}

static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
        .notifier_call          = perf_counter_nmi_handler,
        .next                   = NULL,
        .priority               = 1
};

static struct pmc_x86_ops pmc_intel_ops = {
        .save_disable_all       = pmc_intel_save_disable_all,
        .restore_all            = pmc_intel_restore_all,
        .get_status             = pmc_intel_get_status,
        .ack_status             = pmc_intel_ack_status,
        .enable                 = pmc_intel_enable,
        .disable                = pmc_intel_disable,
        .eventsel               = MSR_ARCH_PERFMON_EVENTSEL0,
        .perfctr                = MSR_ARCH_PERFMON_PERFCTR0,
        .event_map              = pmc_intel_event_map,
        .raw_event              = pmc_intel_raw_event,
        .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
};

static struct pmc_x86_ops pmc_amd_ops = {
        .save_disable_all       = pmc_amd_save_disable_all,
        .restore_all            = pmc_amd_restore_all,
        .get_status             = pmc_amd_get_status,
        .ack_status             = pmc_amd_ack_status,
        .enable                 = pmc_amd_enable,
        .disable                = pmc_amd_disable,
        .eventsel               = MSR_K7_EVNTSEL0,
        .perfctr                = MSR_K7_PERFCTR0,
        .event_map              = pmc_amd_event_map,
        .raw_event              = pmc_amd_raw_event,
        .max_events             = ARRAY_SIZE(amd_perfmon_event_map),
};

static struct pmc_x86_ops *pmc_intel_init(void)
{
        union cpuid10_eax eax;
        unsigned int ebx;
        unsigned int unused;
        union cpuid10_edx edx;

        /*
         * Check whether the Architectural PerfMon supports
         * Branch Misses Retired Event or not.
         */
        cpuid(10, &eax.full, &ebx, &unused, &edx.full);
        if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
                return NULL;

        pr_info("Intel Performance Monitoring support detected.\n");
        pr_info("... version:         %d\n", eax.split.version_id);
        pr_info("... bit width:       %d\n", eax.split.bit_width);
        pr_info("... mask length:     %d\n", eax.split.mask_length);

        nr_counters_generic = eax.split.num_counters;
        nr_counters_fixed = edx.split.num_counters_fixed;
        counter_value_mask = (1ULL << eax.split.bit_width) - 1;

        return &pmc_intel_ops;
}

static struct pmc_x86_ops *pmc_amd_init(void)
{
        nr_counters_generic = 4;
        nr_counters_fixed = 0;
        counter_value_mask = 0x0000FFFFFFFFFFFFULL;
        counter_value_bits = 48;

        pr_info("AMD Performance Monitoring support detected.\n");

        return &pmc_amd_ops;
}

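/*
 * Probe the CPU's performance monitoring unit at boot, select the
 * vendor-specific pmc_x86_ops and report the counter geometry:
 */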
void __init init_hw_perf_counters(void)
{
        if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
                return;

        switch (boot_cpu_data.x86_vendor) {
        case X86_VENDOR_INTEL:
                pmc_ops = pmc_intel_init();
                break;
        case X86_VENDOR_AMD:
                pmc_ops = pmc_amd_init();
                break;
        }
        if (!pmc_ops)
                return;

        pr_info("... num counters:    %d\n", nr_counters_generic);
        if (nr_counters_generic > X86_PMC_MAX_GENERIC) {
                nr_counters_generic = X86_PMC_MAX_GENERIC;
                WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
                        nr_counters_generic, X86_PMC_MAX_GENERIC);
        }
        perf_counter_mask = (1 << nr_counters_generic) - 1;
        perf_max_counters = nr_counters_generic;

        pr_info("... value mask:      %016Lx\n", counter_value_mask);

        if (nr_counters_fixed > X86_PMC_MAX_FIXED) {
                nr_counters_fixed = X86_PMC_MAX_FIXED;
                WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
                        nr_counters_fixed, X86_PMC_MAX_FIXED);
        }
        pr_info("... fixed counters:  %d\n", nr_counters_fixed);

        perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED;

        pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
        perf_counters_initialized = true;

        perf_counters_lapic_init(0);
        register_die_notifier(&perf_counter_nmi_notifier);
}

static void pmc_generic_read(struct perf_counter *counter)
{
        x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
}

static const struct hw_perf_counter_ops x86_perf_counter_ops = {
        .enable         = pmc_generic_enable,
        .disable        = pmc_generic_disable,
        .read           = pmc_generic_read,
};

const struct hw_perf_counter_ops *
hw_perf_counter_init(struct perf_counter *counter)
{
        int err;

        err = __hw_perf_counter_init(counter);
        if (err)
                return NULL;

        return &x86_perf_counter_ops;
}