/*
 * Performance counter x86 architecture code
 *
 *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
 *  Copyright(C) 2009 Jaswinder Singh Rajput
 *
 *  For licencing details see kernel-base/COPYING
 */

#include <linux/perf_counter.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/sched.h>

#include <asm/apic.h>

static bool perf_counters_initialized __read_mostly;

/*
 * Number of (generic) HW counters:
 */
static int nr_counters_generic __read_mostly;
static u64 perf_counter_mask __read_mostly;
static u64 counter_value_mask __read_mostly;
static int counter_value_bits __read_mostly;

static int nr_counters_fixed __read_mostly;

struct cpu_hw_counters {
        struct perf_counter     *counters[X86_PMC_IDX_MAX];
        unsigned long           used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
        unsigned long           interrupts;
        u64                     throttle_ctrl;
        unsigned long           active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
        int                     enabled;
};

/*
 * struct pmc_x86_ops - performance counter x86 ops
 */
struct pmc_x86_ops {
        u64             (*save_disable_all)(void);
        void            (*restore_all)(u64);
        u64             (*get_status)(u64);
        void            (*ack_status)(u64);
        void            (*enable)(int, u64);
        void            (*disable)(int, u64);
        unsigned        eventsel;
        unsigned        perfctr;
        u64             (*event_map)(int);
        u64             (*raw_event)(u64);
        int             max_events;
};

static struct pmc_x86_ops *pmc_ops __read_mostly;

static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
        .enabled = 1,
};

static __read_mostly int intel_perfmon_version;

/*
 * Intel PerfMon v3. Used on Core2 and later.
 */
static const u64 intel_perfmon_event_map[] =
{
  [PERF_COUNT_CPU_CYCLES]               = 0x003c,
  [PERF_COUNT_INSTRUCTIONS]             = 0x00c0,
  [PERF_COUNT_CACHE_REFERENCES]         = 0x4f2e,
  [PERF_COUNT_CACHE_MISSES]             = 0x412e,
  [PERF_COUNT_BRANCH_INSTRUCTIONS]      = 0x00c4,
  [PERF_COUNT_BRANCH_MISSES]            = 0x00c5,
  [PERF_COUNT_BUS_CYCLES]               = 0x013c,
};

static u64 pmc_intel_event_map(int event)
{
        return intel_perfmon_event_map[event];
}

static u64 pmc_intel_raw_event(u64 event)
{
#define CORE_EVNTSEL_EVENT_MASK         0x000000FFULL
#define CORE_EVNTSEL_UNIT_MASK          0x0000FF00ULL
#define CORE_EVNTSEL_COUNTER_MASK       0xFF000000ULL

#define CORE_EVNTSEL_MASK               \
        (CORE_EVNTSEL_EVENT_MASK |      \
         CORE_EVNTSEL_UNIT_MASK  |      \
         CORE_EVNTSEL_COUNTER_MASK)

        return event & CORE_EVNTSEL_MASK;
}

/*
 * AMD Performance Monitor K7 and later.
 */
static const u64 amd_perfmon_event_map[] =
{
  [PERF_COUNT_CPU_CYCLES]               = 0x0076,
  [PERF_COUNT_INSTRUCTIONS]             = 0x00c0,
  [PERF_COUNT_CACHE_REFERENCES]         = 0x0080,
  [PERF_COUNT_CACHE_MISSES]             = 0x0081,
  [PERF_COUNT_BRANCH_INSTRUCTIONS]      = 0x00c4,
  [PERF_COUNT_BRANCH_MISSES]            = 0x00c5,
};

static u64 pmc_amd_event_map(int event)
{
        return amd_perfmon_event_map[event];
}

static u64 pmc_amd_raw_event(u64 event)
{
#define K7_EVNTSEL_EVENT_MASK   0x7000000FFULL
#define K7_EVNTSEL_UNIT_MASK    0x00000FF00ULL
#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL

#define K7_EVNTSEL_MASK                 \
        (K7_EVNTSEL_EVENT_MASK |        \
         K7_EVNTSEL_UNIT_MASK  |        \
         K7_EVNTSEL_COUNTER_MASK)

        return event & K7_EVNTSEL_MASK;
}

/*
 * Propagate counter elapsed time into the generic counter.
 * Can only be executed on the CPU where the counter is active.
 * Updates counter->count and hwc->period_left with the elapsed delta.
 */
static void
x86_perf_counter_update(struct perf_counter *counter,
                        struct hw_perf_counter *hwc, int idx)
{
        u64 prev_raw_count, new_raw_count, delta;

        /*
         * Careful: an NMI might modify the previous counter value.
         *
         * Our tactic to handle this is to first atomically read and
         * exchange a new raw count - then add that new-prev delta
         * count to the generic counter atomically:
         */
again:
        prev_raw_count = atomic64_read(&hwc->prev_count);
        rdmsrl(hwc->counter_base + idx, new_raw_count);

        if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
                                        new_raw_count) != prev_raw_count)
                goto again;

        /*
         * Now we have the new raw value and have updated the prev
         * timestamp already. We can now calculate the elapsed delta
         * (counter-)time and add that to the generic counter.
         *
         * Careful, not all hw sign-extends above the physical width
         * of the count, so we do that by clipping the delta to 32 bits:
         */
        delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count);

        atomic64_add(delta, &counter->count);
        atomic64_sub(delta, &hwc->period_left);
}

/*
 * Setup the hardware configuration for a given hw_event_type
 */
static int __hw_perf_counter_init(struct perf_counter *counter)
{
        struct perf_counter_hw_event *hw_event = &counter->hw_event;
        struct hw_perf_counter *hwc = &counter->hw;

        if (unlikely(!perf_counters_initialized))
                return -EINVAL;

        /*
         * Generate PMC IRQs:
         * (keep 'enabled' bit clear for now)
         */
        hwc->config = ARCH_PERFMON_EVENTSEL_INT;

        /*
         * Count user and OS events unless requested not to.
         */
        if (!hw_event->exclude_user)
                hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
        if (!hw_event->exclude_kernel)
                hwc->config |= ARCH_PERFMON_EVENTSEL_OS;

        /*
         * If privileged enough, allow NMI events:
         */
        hwc->nmi = 0;
        if (capable(CAP_SYS_ADMIN) && hw_event->nmi)
                hwc->nmi = 1;

        hwc->irq_period         = hw_event->irq_period;
        /*
         * Intel PMCs cannot be accessed sanely above 32 bit width,
         * so we install an artificial 1<<31 period regardless of
         * the generic counter period:
         */
        if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
                if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
                        hwc->irq_period = 0x7FFFFFFF;

        atomic64_set(&hwc->period_left, hwc->irq_period);

        /*
         * Raw event types provide the config in the event structure:
         */
        if (hw_event->raw) {
                hwc->config |= pmc_ops->raw_event(hw_event->type);
        } else {
                if (hw_event->type >= pmc_ops->max_events)
                        return -EINVAL;
                /*
                 * The generic map:
                 */
                hwc->config |= pmc_ops->event_map(hw_event->type);
        }
        counter->wakeup_pending = 0;

        return 0;
}

static u64 pmc_intel_save_disable_all(void)
{
        u64 ctrl;

        rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
        wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);

        return ctrl;
}

static u64 pmc_amd_save_disable_all(void)
{
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
        int enabled, idx;

        enabled = cpuc->enabled;
        cpuc->enabled = 0;
        /*
         * Ensure we write the disable before we start disabling the
         * counters proper, so that pmc_amd_enable() does the right thing.
         */
        barrier();

        for (idx = 0; idx < nr_counters_generic; idx++) {
                u64 val;

                rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
                if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) {
                        val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
                        wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
                }
        }

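        /*
         * AMD has no global control MSR here; hand back the previous
         * software 'enabled' state instead, which pmc_amd_restore_all()
         * re-installs.
         */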
        return enabled;
}

u64 hw_perf_save_disable(void)
{
        if (unlikely(!perf_counters_initialized))
                return 0;

        return pmc_ops->save_disable_all();
}
/*
 * Exported because of ACPI idle
 */
EXPORT_SYMBOL_GPL(hw_perf_save_disable);

static void pmc_intel_restore_all(u64 ctrl)
{
        wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
}

static void pmc_amd_restore_all(u64 ctrl)
{
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
        int idx;

        cpuc->enabled = ctrl;
        barrier();
        if (!ctrl)
                return;

        for (idx = 0; idx < nr_counters_generic; idx++) {
                if (test_bit(idx, cpuc->active_mask)) {
                        u64 val;

                        rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
                        val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
                        wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
                }
        }
}

void hw_perf_restore(u64 ctrl)
{
        if (unlikely(!perf_counters_initialized))
                return;

        pmc_ops->restore_all(ctrl);
}
/*
 * Exported because of ACPI idle
 */
EXPORT_SYMBOL_GPL(hw_perf_restore);

static u64 pmc_intel_get_status(u64 mask)
{
        u64 status;

        rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);

        return status;
}

static u64 pmc_amd_get_status(u64 mask)
{
        u64 status = 0;
        int idx;

        for (idx = 0; idx < nr_counters_generic; idx++) {
                s64 val;

                if (!(mask & (1 << idx)))
                        continue;

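                /*
                 * There is no overflow status MSR, so derive the status from
                 * the counter value itself: the counter was programmed with
                 * a negative offset, so once it counts past zero its top bit
                 * is clear. Shift that bit into the sign bit and treat a
                 * non-negative value as "overflowed".
                 */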
                rdmsrl(MSR_K7_PERFCTR0 + idx, val);
                val <<= (64 - counter_value_bits);
                if (val >= 0)
                        status |= (1 << idx);
        }

        return status;
}

static u64 hw_perf_get_status(u64 mask)
{
        if (unlikely(!perf_counters_initialized))
                return 0;

        return pmc_ops->get_status(mask);
}

static void pmc_intel_ack_status(u64 ack)
{
        wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
}

static void pmc_amd_ack_status(u64 ack)
{
}

static void hw_perf_ack_status(u64 ack)
{
        if (unlikely(!perf_counters_initialized))
                return;

        pmc_ops->ack_status(ack);
}

static void pmc_intel_enable(int idx, u64 config)
{
        wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx,
                        config | ARCH_PERFMON_EVENTSEL0_ENABLE);
}

static void pmc_amd_enable(int idx, u64 config)
{
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);

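        /*
         * Remember that this counter is in use so pmc_amd_restore_all()
         * can re-enable it; only set the enable bit while counters are
         * globally enabled on this CPU:
         */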
        set_bit(idx, cpuc->active_mask);
        if (cpuc->enabled)
                config |= ARCH_PERFMON_EVENTSEL0_ENABLE;

        wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
}

static void hw_perf_enable(int idx, u64 config)
{
        if (unlikely(!perf_counters_initialized))
                return;

        pmc_ops->enable(idx, config);
}

static void pmc_intel_disable(int idx, u64 config)
{
        wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config);
}

static void pmc_amd_disable(int idx, u64 config)
{
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);

        clear_bit(idx, cpuc->active_mask);
        wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
}

static void hw_perf_disable(int idx, u64 config)
{
        if (unlikely(!perf_counters_initialized))
                return;

        pmc_ops->disable(idx, config);
}

static inline void
__pmc_fixed_disable(struct perf_counter *counter,
                    struct hw_perf_counter *hwc, unsigned int __idx)
{
        int idx = __idx - X86_PMC_IDX_FIXED;
        u64 ctrl_val, mask;
        int err;

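        /*
         * Each fixed counter owns a 4-bit control field in
         * MSR_ARCH_PERFMON_FIXED_CTR_CTRL; clearing it stops the counter:
         */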
        mask = 0xfULL << (idx * 4);

        rdmsrl(hwc->config_base, ctrl_val);
        ctrl_val &= ~mask;
        err = checking_wrmsrl(hwc->config_base, ctrl_val);
}

static inline void
__pmc_generic_disable(struct perf_counter *counter,
                           struct hw_perf_counter *hwc, unsigned int idx)
{
        if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
                __pmc_fixed_disable(counter, hwc, idx);
        else
                hw_perf_disable(idx, hwc->config);
}

static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);

/*
 * Set the next IRQ period, based on the hwc->period_left value.
 * To be called with the counter disabled in hw:
 */
static void
__hw_perf_counter_set_period(struct perf_counter *counter,
                             struct hw_perf_counter *hwc, int idx)
{
        s64 left = atomic64_read(&hwc->period_left);
        s64 period = hwc->irq_period;
        int err;

        /*
         * If we are way outside a reasonable range then just skip forward:
         */
        if (unlikely(left <= -period)) {
                left = period;
                atomic64_set(&hwc->period_left, left);
        }

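        /*
         * If the remaining period has already elapsed, fold what is left
         * into the next period:
         */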
        if (unlikely(left <= 0)) {
                left += period;
                atomic64_set(&hwc->period_left, left);
        }

        per_cpu(prev_left[idx], smp_processor_id()) = left;

        /*
         * The hw counter starts counting from this counter offset,
         * mark it to be able to extract future deltas:
         */
        atomic64_set(&hwc->prev_count, (u64)-left);

        err = checking_wrmsrl(hwc->counter_base + idx,
                             (u64)(-left) & counter_value_mask);
}

static inline void
__pmc_fixed_enable(struct perf_counter *counter,
                   struct hw_perf_counter *hwc, unsigned int __idx)
{
        int idx = __idx - X86_PMC_IDX_FIXED;
        u64 ctrl_val, bits, mask;
        int err;

        /*
         * Enable IRQ generation (0x8),
         * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
         * if requested:
         */
        bits = 0x8ULL;
        if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
                bits |= 0x2;
        if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
                bits |= 0x1;
        bits <<= (idx * 4);
        mask = 0xfULL << (idx * 4);

        rdmsrl(hwc->config_base, ctrl_val);
        ctrl_val &= ~mask;
        ctrl_val |= bits;
        err = checking_wrmsrl(hwc->config_base, ctrl_val);
}

static void
__pmc_generic_enable(struct perf_counter *counter,
                          struct hw_perf_counter *hwc, int idx)
{
        if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
                __pmc_fixed_enable(counter, hwc, idx);
        else
                hw_perf_enable(idx, hwc->config);
}

static int
fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
{
        unsigned int event;

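        /* AMD CPUs have no fixed-purpose counters: */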
        if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
                return -1;

        if (unlikely(hwc->nmi))
                return -1;

        event = hwc->config & ARCH_PERFMON_EVENT_MASK;

        if (unlikely(event == pmc_ops->event_map(PERF_COUNT_INSTRUCTIONS)))
                return X86_PMC_IDX_FIXED_INSTRUCTIONS;
        if (unlikely(event == pmc_ops->event_map(PERF_COUNT_CPU_CYCLES)))
                return X86_PMC_IDX_FIXED_CPU_CYCLES;
        if (unlikely(event == pmc_ops->event_map(PERF_COUNT_BUS_CYCLES)))
                return X86_PMC_IDX_FIXED_BUS_CYCLES;

        return -1;
}

/*
 * Find a PMC slot for the freshly enabled / scheduled in counter:
 */
static int pmc_generic_enable(struct perf_counter *counter)
{
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
        struct hw_perf_counter *hwc = &counter->hw;
        int idx;

        idx = fixed_mode_idx(counter, hwc);
        if (idx >= 0) {
                /*
                 * Try to get the fixed counter, if that is already taken
                 * then try to get a generic counter:
                 */
                if (test_and_set_bit(idx, cpuc->used))
                        goto try_generic;

                hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
                /*
                 * We set it so that counter_base + idx in wrmsr/rdmsr maps to
                 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
                 */
                hwc->counter_base =
                        MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
                hwc->idx = idx;
        } else {
                idx = hwc->idx;
                /* Try to get the previous generic counter again */
                if (test_and_set_bit(idx, cpuc->used)) {
try_generic:
                        idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
                        if (idx == nr_counters_generic)
                                return -EAGAIN;

                        set_bit(idx, cpuc->used);
                        hwc->idx = idx;
                }
                hwc->config_base  = pmc_ops->eventsel;
                hwc->counter_base = pmc_ops->perfctr;
        }

        perf_counters_lapic_init(hwc->nmi);

        __pmc_generic_disable(counter, hwc, idx);

        cpuc->counters[idx] = counter;
        /*
         * Make it visible before enabling the hw:
         */
        smp_wmb();

        __hw_perf_counter_set_period(counter, hwc, idx);
        __pmc_generic_enable(counter, hwc, idx);

        return 0;
}

void perf_counter_print_debug(void)
{
        u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
        struct cpu_hw_counters *cpuc;
        int cpu, idx;

        if (!nr_counters_generic)
                return;

        local_irq_disable();

        cpu = smp_processor_id();
        cpuc = &per_cpu(cpu_hw_counters, cpu);

        if (intel_perfmon_version >= 2) {
                rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
                rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
                rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
                rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);

                pr_info("\n");
                pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
                pr_info("CPU#%d: status:     %016llx\n", cpu, status);
                pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
                pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
        }
        pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);

        for (idx = 0; idx < nr_counters_generic; idx++) {
                rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl);
                rdmsrl(pmc_ops->perfctr  + idx, pmc_count);

                prev_left = per_cpu(prev_left[idx], cpu);

                pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
                        cpu, idx, pmc_ctrl);
                pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
                        cpu, idx, pmc_count);
                pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
                        cpu, idx, prev_left);
        }
        for (idx = 0; idx < nr_counters_fixed; idx++) {
                rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);

                pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
                        cpu, idx, pmc_count);
        }
        local_irq_enable();
}

static void pmc_generic_disable(struct perf_counter *counter)
{
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
        struct hw_perf_counter *hwc = &counter->hw;
        unsigned int idx = hwc->idx;

        __pmc_generic_disable(counter, hwc, idx);

        clear_bit(idx, cpuc->used);
        cpuc->counters[idx] = NULL;
        /*
         * Make sure the cleared pointer becomes visible before we
         * (potentially) free the counter:
         */
        smp_wmb();

        /*
         * Drain the remaining delta count out of a counter
         * that we are disabling:
         */
        x86_perf_counter_update(counter, hwc, idx);
}

static void perf_store_irq_data(struct perf_counter *counter, u64 data)
{
        struct perf_data *irqdata = counter->irqdata;

        if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
                irqdata->overrun++;
        } else {
                u64 *p = (u64 *) &irqdata->data[irqdata->len];

                *p = data;
                irqdata->len += sizeof(u64);
        }
}

/*
 * Save and restart an expired counter. Called by NMI contexts,
 * so it has to be careful about preempting normal counter ops:
 */
static void perf_save_and_restart(struct perf_counter *counter)
{
        struct hw_perf_counter *hwc = &counter->hw;
        int idx = hwc->idx;

        x86_perf_counter_update(counter, hwc, idx);
        __hw_perf_counter_set_period(counter, hwc, idx);

        if (counter->state == PERF_COUNTER_STATE_ACTIVE)
                __pmc_generic_enable(counter, hwc, idx);
}

static void
perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
{
        struct perf_counter *counter, *group_leader = sibling->group_leader;

        /*
         * Store sibling timestamps (if any):
         */
        list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {

                x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
                perf_store_irq_data(sibling, counter->hw_event.type);
                perf_store_irq_data(sibling, atomic64_read(&counter->count));
        }
}

/*
 * Maximum interrupt frequency of 100KHz per CPU
 */
#define PERFMON_MAX_INTERRUPTS (100000/HZ)
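/*
 * cpuc->interrupts is compared against this limit in the interrupt handler
 * and cleared again in perf_counter_unthrottle().
 */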

/*
 * This handler is triggered by the local APIC, so the APIC IRQ handling
 * rules apply:
 */
static int __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
{
        int bit, cpu = smp_processor_id();
        u64 ack, status;
        struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
        int ret = 0;

        cpuc->throttle_ctrl = hw_perf_save_disable();

        status = hw_perf_get_status(cpuc->throttle_ctrl);
        if (!status)
                goto out;

        ret = 1;
again:
        inc_irq_stat(apic_perf_irqs);
        ack = status;
        for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
                struct perf_counter *counter = cpuc->counters[bit];

                clear_bit(bit, (unsigned long *) &status);
                if (!counter)
                        continue;

                perf_save_and_restart(counter);

                switch (counter->hw_event.record_type) {
                case PERF_RECORD_SIMPLE:
                        continue;
                case PERF_RECORD_IRQ:
                        perf_store_irq_data(counter, instruction_pointer(regs));
                        break;
                case PERF_RECORD_GROUP:
                        perf_handle_group(counter, &status, &ack);
                        break;
                }
                /*
                 * From NMI context we cannot call into the scheduler to
                 * do a task wakeup - but we mark the counter as
                 * wakeup_pending and initiate a wakeup callback:
                 */
                if (nmi) {
                        counter->wakeup_pending = 1;
                        set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
                } else {
                        wake_up(&counter->waitq);
                }
        }

        hw_perf_ack_status(ack);

        /*
         * Repeat if there is more work to be done:
         */
        status = hw_perf_get_status(cpuc->throttle_ctrl);
        if (status)
                goto again;
out:
        /*
         * Restore - do not reenable when global enable is off or throttled:
         */
        if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS)
                hw_perf_restore(cpuc->throttle_ctrl);

        return ret;
}

void perf_counter_unthrottle(void)
{
        struct cpu_hw_counters *cpuc;

        if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
                return;

        if (unlikely(!perf_counters_initialized))
                return;

        cpuc = &__get_cpu_var(cpu_hw_counters);
        if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
                if (printk_ratelimit())
                        printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n");
                hw_perf_restore(cpuc->throttle_ctrl);
        }
        cpuc->interrupts = 0;
}

void smp_perf_counter_interrupt(struct pt_regs *regs)
{
        irq_enter();
        apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
        ack_APIC_irq();
        __smp_perf_counter_interrupt(regs, 0);
        irq_exit();
}

/*
 * This handler is triggered by NMI contexts:
 */
void perf_counter_notify(struct pt_regs *regs)
{
        struct cpu_hw_counters *cpuc;
        unsigned long flags;
        int bit, cpu;

        local_irq_save(flags);
        cpu = smp_processor_id();
        cpuc = &per_cpu(cpu_hw_counters, cpu);

        for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) {
                struct perf_counter *counter = cpuc->counters[bit];

                if (!counter)
                        continue;

                if (counter->wakeup_pending) {
                        counter->wakeup_pending = 0;
                        wake_up(&counter->waitq);
                }
        }

        local_irq_restore(flags);
}

void perf_counters_lapic_init(int nmi)
{
        u32 apic_val;

        if (!perf_counters_initialized)
                return;
        /*
         * Enable the performance counter vector in the APIC LVT:
         */
        apic_val = apic_read(APIC_LVTERR);

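        /* Mask LVTERR while reprogramming LVTPC, then restore it: */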
        apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED);
        if (nmi)
                apic_write(APIC_LVTPC, APIC_DM_NMI);
        else
                apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
        apic_write(APIC_LVTERR, apic_val);
}

static int __kprobes
perf_counter_nmi_handler(struct notifier_block *self,
                         unsigned long cmd, void *__args)
{
        struct die_args *args = __args;
        struct pt_regs *regs;
        int ret;

        switch (cmd) {
        case DIE_NMI:
        case DIE_NMI_IPI:
                break;

        default:
                return NOTIFY_DONE;
        }

        regs = args->regs;

        apic_write(APIC_LVTPC, APIC_DM_NMI);
        ret = __smp_perf_counter_interrupt(regs, 1);

        return ret ? NOTIFY_STOP : NOTIFY_OK;
}

static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
        .notifier_call          = perf_counter_nmi_handler,
        .next                   = NULL,
        .priority               = 1
};

static struct pmc_x86_ops pmc_intel_ops = {
        .save_disable_all       = pmc_intel_save_disable_all,
        .restore_all            = pmc_intel_restore_all,
        .get_status             = pmc_intel_get_status,
        .ack_status             = pmc_intel_ack_status,
        .enable                 = pmc_intel_enable,
        .disable                = pmc_intel_disable,
        .eventsel               = MSR_ARCH_PERFMON_EVENTSEL0,
        .perfctr                = MSR_ARCH_PERFMON_PERFCTR0,
        .event_map              = pmc_intel_event_map,
        .raw_event              = pmc_intel_raw_event,
        .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
};

static struct pmc_x86_ops pmc_amd_ops = {
        .save_disable_all       = pmc_amd_save_disable_all,
        .restore_all            = pmc_amd_restore_all,
        .get_status             = pmc_amd_get_status,
        .ack_status             = pmc_amd_ack_status,
        .enable                 = pmc_amd_enable,
        .disable                = pmc_amd_disable,
        .eventsel               = MSR_K7_EVNTSEL0,
        .perfctr                = MSR_K7_PERFCTR0,
        .event_map              = pmc_amd_event_map,
        .raw_event              = pmc_amd_raw_event,
        .max_events             = ARRAY_SIZE(amd_perfmon_event_map),
};

static struct pmc_x86_ops *pmc_intel_init(void)
{
        union cpuid10_edx edx;
        union cpuid10_eax eax;
        unsigned int unused;
        unsigned int ebx;

        /*
         * Check whether the Architectural PerfMon supports
         * Branch Misses Retired Event or not.
         */
        cpuid(10, &eax.full, &ebx, &unused, &edx.full);
        if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
                return NULL;

        intel_perfmon_version = eax.split.version_id;
        if (intel_perfmon_version < 2)
                return NULL;

        pr_info("Intel Performance Monitoring support detected.\n");
        pr_info("... version:         %d\n", intel_perfmon_version);
        pr_info("... bit width:       %d\n", eax.split.bit_width);
        pr_info("... mask length:     %d\n", eax.split.mask_length);

        nr_counters_generic = eax.split.num_counters;
        nr_counters_fixed = edx.split.num_counters_fixed;
        counter_value_mask = (1ULL << eax.split.bit_width) - 1;

        return &pmc_intel_ops;
}

static struct pmc_x86_ops *pmc_amd_init(void)
{
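        /*
         * AMD K7 and later CPUs provide four generic 48-bit counters;
         * there is no enumeration leaf for them, so hard-code the values:
         */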
        nr_counters_generic = 4;
        nr_counters_fixed = 0;
        counter_value_mask = 0x0000FFFFFFFFFFFFULL;
        counter_value_bits = 48;

        pr_info("AMD Performance Monitoring support detected.\n");

        return &pmc_amd_ops;
}

void __init init_hw_perf_counters(void)
{
        if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
                return;

        switch (boot_cpu_data.x86_vendor) {
        case X86_VENDOR_INTEL:
                pmc_ops = pmc_intel_init();
                break;
        case X86_VENDOR_AMD:
                pmc_ops = pmc_amd_init();
                break;
        }
        if (!pmc_ops)
                return;

        pr_info("... num counters:    %d\n", nr_counters_generic);
        if (nr_counters_generic > X86_PMC_MAX_GENERIC) {
                nr_counters_generic = X86_PMC_MAX_GENERIC;
                WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
                        nr_counters_generic, X86_PMC_MAX_GENERIC);
        }
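        /*
         * Build the bitmask of usable counters: generic counters occupy the
         * low bits, fixed counters are ORed in at X86_PMC_IDX_FIXED below.
         */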
        perf_counter_mask = (1 << nr_counters_generic) - 1;
        perf_max_counters = nr_counters_generic;

        pr_info("... value mask:      %016Lx\n", counter_value_mask);

        if (nr_counters_fixed > X86_PMC_MAX_FIXED) {
                nr_counters_fixed = X86_PMC_MAX_FIXED;
                WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
                        nr_counters_fixed, X86_PMC_MAX_FIXED);
        }
        pr_info("... fixed counters:  %d\n", nr_counters_fixed);

        perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED;

        pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
        perf_counters_initialized = true;

        perf_counters_lapic_init(0);
        register_die_notifier(&perf_counter_nmi_notifier);
}

static void pmc_generic_read(struct perf_counter *counter)
{
        x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
}

static const struct hw_perf_counter_ops x86_perf_counter_ops = {
        .enable         = pmc_generic_enable,
        .disable        = pmc_generic_disable,
        .read           = pmc_generic_read,
};

const struct hw_perf_counter_ops *
hw_perf_counter_init(struct perf_counter *counter)
{
        int err;

        err = __hw_perf_counter_init(counter);
        if (err)
                return NULL;

        return &x86_perf_counter_ops;
}