2 * local apic based NMI watchdog for various CPUs.
4 * This file also handles reservation of performance counters for coordination
5 * with other users (like oprofile).
7 * Note that these events normally don't tick when the CPU idles. This means
8 * the frequency varies with CPU load.
10 * Original code for K7/P6 written by Keith Owens
14 #include <linux/percpu.h>
15 #include <linux/module.h>
16 #include <linux/kernel.h>
17 #include <linux/bitops.h>
18 #include <linux/smp.h>
19 #include <linux/nmi.h>
21 #include <asm/intel_arch_perfmon.h>
/*
 * Record of the MSR numbers the watchdog programmed on one CPU. An instance
 * is declared per CPU further down; the setup routines fill it in and the
 * stop/rearm/event paths read it back.
 * NOTE(review): the closing brace of this struct is not visible in this
 * listing.
 */
23 struct nmi_watchdog_ctlblk {
/* written by the P4 setup path; the K7/P6/arch-perfmon paths store 0 (unused) */
24 unsigned int cccr_msr;
25 unsigned int perfctr_msr; /* the MSR to reset in NMI handler */
26 unsigned int evntsel_msr; /* the MSR to select the events to handle */
29 /* Interface defining a CPU specific perfctr watchdog */
/*
 * NOTE(review): only some members are visible in this listing. The
 * initializers further down also set .reserve, .stop, .perfctr, .evntsel and
 * .checkbit, so those members presumably sit on lines that are not shown.
 */
/* releases the registers claimed by the matching reserve hook */
32 void (*unreserve)(void);
/* programs the counter on the current CPU; lapic_watchdog_init() treats 0 as failure */
33 int (*setup)(unsigned nmi_hz);
/* reloads the counter; invoked from lapic_wd_event() */
34 void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz);
/*
 * Currently selected watchdog backend. Assigned by probe_nmi_watchdog();
 * lapic_watchdog_ok() reports whether it is non-NULL.
 */
41 static const struct wd_ops *wd_ops;
44 * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and its
45 * offset from MSR_P4_BSU_ESCR0.
47 * It will be the max for all platforms (for now)
/* Number of bits in each of the two reservation bitmaps declared below. */
49 #define NMI_MAX_COUNTER_BITS 66
52 * perfctr_nmi_owner tracks the ownership of the perfctr registers:
53 * evtsel_nmi_owner tracks the ownership of the event selection
54 * - different performance counters/ event selection may be reserved for
55 * different subsystems this reservation system just tries to coordinate
/* One bit per counter MSR; the bit index comes from nmi_perfctr_msr_to_bit(). */
58 static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS);
/* One bit per event-select MSR; the bit index comes from nmi_evntsel_msr_to_bit(). */
59 static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS);
/* Per-CPU record of the MSR numbers written by the setup routines. */
61 static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
63 /* converts an msr to an appropriate reservation bit */
/*
 * Maps a performance-counter MSR number to its bit index in
 * perfctr_nmi_owner by subtracting the first counter MSR of the running CPU
 * type (K7, Intel architectural perfmon, P6 or P4, selected from
 * boot_cpu_data).
 * The result is unsigned, so an MSR below the chosen base wraps to a large
 * value; callers compare the result against NMI_MAX_COUNTER_BITS.
 * NOTE(review): the case labels, braces and the fall-back return of this
 * function are not visible in this listing.
 */
64 static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
66 /* returns the bit offset of the performance counter register */
67 switch (boot_cpu_data.x86_vendor) {
69 return (msr - MSR_K7_PERFCTR0);
70 case X86_VENDOR_INTEL:
/* architectural perfmon takes precedence over the family-based choice below */
71 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
72 return (msr - MSR_ARCH_PERFMON_PERFCTR0);
74 switch (boot_cpu_data.x86) {
76 return (msr - MSR_P6_PERFCTR0);
78 return (msr - MSR_P4_BPU_PERFCTR0);
85 * converts an msr to an appropriate reservation bit
86 * returns the bit offset of the event selection register
/*
 * Maps an event-select MSR number to its bit index in evntsel_nmi_owner by
 * subtracting the first event-select MSR of the running CPU type (K7, Intel
 * architectural perfmon, P6, or MSR_P4_BSU_ESCR0 for P4).
 * The result is unsigned, so an MSR below the chosen base wraps to a large
 * value; callers compare the result against NMI_MAX_COUNTER_BITS.
 * NOTE(review): the case labels, braces and the fall-back return of this
 * function are not visible in this listing.
 */
88 static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
90 /* returns the bit offset of the event selection register */
91 switch (boot_cpu_data.x86_vendor) {
93 return (msr - MSR_K7_EVNTSEL0);
94 case X86_VENDOR_INTEL:
/* architectural perfmon takes precedence over the family-based choice below */
95 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
96 return (msr - MSR_ARCH_PERFMON_EVENTSEL0);
98 switch (boot_cpu_data.x86) {
100 return (msr - MSR_P6_EVNTSEL0);
102 return (msr - MSR_P4_BSU_ESCR0);
109 /* checks for a bit availability (hack for oprofile) */
110 int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
112 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
114 return (!test_bit(counter, perfctr_nmi_owner));
117 /* checks the an msr for availability */
118 int avail_to_resrv_perfctr_nmi(unsigned int msr)
120 unsigned int counter;
122 counter = nmi_perfctr_msr_to_bit(msr);
123 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
125 return (!test_bit(counter, perfctr_nmi_owner));
127 EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
129 int reserve_perfctr_nmi(unsigned int msr)
131 unsigned int counter;
133 counter = nmi_perfctr_msr_to_bit(msr);
134 /* register not managed by the allocator? */
135 if (counter > NMI_MAX_COUNTER_BITS)
138 if (!test_and_set_bit(counter, perfctr_nmi_owner))
142 EXPORT_SYMBOL(reserve_perfctr_nmi);
144 void release_perfctr_nmi(unsigned int msr)
146 unsigned int counter;
148 counter = nmi_perfctr_msr_to_bit(msr);
149 /* register not managed by the allocator? */
150 if (counter > NMI_MAX_COUNTER_BITS)
153 clear_bit(counter, perfctr_nmi_owner);
155 EXPORT_SYMBOL(release_perfctr_nmi);
157 int reserve_evntsel_nmi(unsigned int msr)
159 unsigned int counter;
161 counter = nmi_evntsel_msr_to_bit(msr);
162 /* register not managed by the allocator? */
163 if (counter > NMI_MAX_COUNTER_BITS)
166 if (!test_and_set_bit(counter, evntsel_nmi_owner))
170 EXPORT_SYMBOL(reserve_evntsel_nmi);
172 void release_evntsel_nmi(unsigned int msr)
174 unsigned int counter;
176 counter = nmi_evntsel_msr_to_bit(msr);
177 /* register not managed by the allocator? */
178 if (counter > NMI_MAX_COUNTER_BITS)
181 clear_bit(counter, evntsel_nmi_owner);
183 EXPORT_SYMBOL(release_evntsel_nmi);
/*
 * Stops the local-APIC watchdog on every CPU by running
 * stop_apic_nmi_watchdog through on_each_cpu().
 * May only be called when nmi_watchdog == NMI_LOCAL_APIC, and nmi_active is
 * required to be 0 once the function is done (both enforced with BUG_ON).
 * NOTE(review): the statement guarded by the nmi_active <= 0 test and the
 * lines between on_each_cpu() and the final BUG_ON are not visible in this
 * listing.
 */
185 void disable_lapic_nmi_watchdog(void)
187 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
189 if (atomic_read(&nmi_active) <= 0)
192 on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
197 BUG_ON(atomic_read(&nmi_active) != 0);
/*
 * Starts the local-APIC watchdog: reserves the counters through
 * wd_ops->reserve(), runs setup_apic_nmi_watchdog on every CPU and then
 * touches the NMI watchdog.
 * May only be called when nmi_watchdog == NMI_LOCAL_APIC (BUG_ON otherwise).
 * NOTE(review): the statements guarded by the "already enabled" and "lapic
 * aware" checks, and the exit after the failed-reserve message, are not
 * visible in this listing.
 */
200 void enable_lapic_nmi_watchdog(void)
202 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
204 /* are we already enabled */
205 if (atomic_read(&nmi_active) != 0)
208 /* are we lapic aware */
/* a zero result from reserve() is reported as failure */
211 if (!wd_ops->reserve()) {
212 printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n");
216 on_each_cpu(setup_apic_nmi_watchdog, NULL, 1);
217 touch_nmi_watchdog();
221 * Activate the NMI watchdog via the local APIC.
/*
 * Adjusts a requested NMI rate for backends whose counter only accepts
 * 31-bit reload values.
 * Computes the cycles per NMI as cpu_khz * 1000 / hz; when that exceeds
 * 0x7fffffff it computes cpu_khz * 1000 / 0x7fffffff instead.
 * NOTE(review): the declaration of counter_val, the statement that turns
 * `count` into the result and the return statement are not visible in this
 * listing; presumably the rate is raised so the reload value fits — confirm.
 */
224 static unsigned int adjust_for_32bit_ctr(unsigned int hz)
227 unsigned int retval = hz;
230 * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
231 * are writable, with higher bits sign extending from bit 31.
232 * So, we can only program the counter with 31 bit values and
233 * 32nd bit should be 1, for 33.. to be 1.
234 * Find the appropriate nmi_hz
236 counter_val = (u64)cpu_khz * 1000;
237 do_div(counter_val, retval);
238 if (counter_val > 0x7fffffffULL) {
239 u64 count = (u64)cpu_khz * 1000;
240 do_div(count, 0x7fffffffUL);
/*
 * Loads a counter MSR with the negated number of CPU cycles in 1/nmi_hz
 * seconds (cpu_khz * 1000 / nmi_hz), using a full 64-bit write.
 *
 * perfctr_msr: counter MSR to write.
 * descr:       label used only in the debug message; the rearm paths in this
 *              file pass NULL.
 * nmi_hz:      requested NMI rate; used as a divisor.
 */
246 static void write_watchdog_counter(unsigned int perfctr_msr,
247 const char *descr, unsigned nmi_hz)
249 u64 count = (u64)cpu_khz * 1000;
251 do_div(count, nmi_hz);
253 pr_debug("setting %s to -0x%08Lx\n", descr, count);
254 wrmsrl(perfctr_msr, 0 - count);
/*
 * Variant of write_watchdog_counter() for counters written through a 32-bit
 * value: the negated cycle count is truncated to u32 and written as the low
 * word with a high word of 0.
 *
 * perfctr_msr: counter MSR to write.
 * descr:       label used only in the debug message; the rearm path passes
 *              NULL.
 * nmi_hz:      requested NMI rate; used as a divisor.
 */
257 static void write_watchdog_counter32(unsigned int perfctr_msr,
258 const char *descr, unsigned nmi_hz)
260 u64 count = (u64)cpu_khz * 1000;
262 do_div(count, nmi_hz);
264 pr_debug("setting %s to -0x%08Lx\n", descr, count);
265 wrmsr(perfctr_msr, (u32)(-count), 0);
269 * AMD K7/K8/Family10h/Family11h support.
270 * AMD keeps this interface nicely stable so there is not much variety
/*
 * Flag bits OR'ed into the event-select value by setup_k7_watchdog(), and
 * the event code used as the watchdog's NMI source.
 */
272 #define K7_EVNTSEL_ENABLE (1 << 22)
273 #define K7_EVNTSEL_INT (1 << 20)
274 #define K7_EVNTSEL_OS (1 << 17)
275 #define K7_EVNTSEL_USR (1 << 16)
276 #define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
277 #define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
/*
 * Programs the K7-style counter named by wd_ops as NMI source on the current
 * CPU.
 * Visible sequence: clear the counter, write the event select without the
 * enable bit, load the counter for nmi_hz, route LVTPC as NMI, then write the
 * event select again with K7_EVNTSEL_ENABLE set and record the MSR numbers
 * in this CPU's nmi_watchdog_ctlblk.
 * NOTE(review): the continuation lines of the evntsel expression and the
 * return statement are not visible in this listing.
 */
279 static int setup_k7_watchdog(unsigned nmi_hz)
281 unsigned int perfctr_msr, evntsel_msr;
282 unsigned int evntsel;
283 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
285 perfctr_msr = wd_ops->perfctr;
286 evntsel_msr = wd_ops->evntsel;
288 wrmsrl(perfctr_msr, 0UL);
290 evntsel = K7_EVNTSEL_INT
295 /* setup the timer */
296 wrmsr(evntsel_msr, evntsel, 0);
297 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz);
298 apic_write(APIC_LVTPC, APIC_DM_NMI);
/* the enable bit is set only after the counter and LVTPC are programmed */
299 evntsel |= K7_EVNTSEL_ENABLE;
300 wrmsr(evntsel_msr, evntsel, 0);
302 wd->perfctr_msr = perfctr_msr;
303 wd->evntsel_msr = evntsel_msr;
304 wd->cccr_msr = 0; /* unused */
/*
 * Stop hook for the K7, P6 and arch-perfmon backends: writes 0 to the
 * event-select MSR recorded for the current CPU.
 */
308 static void single_msr_stop_watchdog(void)
310 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
312 wrmsr(wd->evntsel_msr, 0, 0);
/*
 * Reserve hook for backends with one counter and one event-select MSR:
 * reserves wd_ops->perfctr, then wd_ops->evntsel, and releases the counter
 * again if the event-select reservation fails.
 * NOTE(review): the return statements are not visible in this listing;
 * callers treat a zero result as failure.
 */
315 static int single_msr_reserve(void)
317 if (!reserve_perfctr_nmi(wd_ops->perfctr))
320 if (!reserve_evntsel_nmi(wd_ops->evntsel)) {
321 release_perfctr_nmi(wd_ops->perfctr);
/*
 * Unreserve hook matching single_msr_reserve(): releases the event-select
 * MSR and then the counter MSR named by wd_ops.
 */
327 static void single_msr_unreserve(void)
329 release_evntsel_nmi(wd_ops->evntsel);
330 release_perfctr_nmi(wd_ops->perfctr);
/*
 * Rearm hook (used by k7_wd_ops): reloads the recorded counter MSR for the
 * next period with a 64-bit write.
 */
333 static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
335 /* start the cycle over again */
336 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
/*
 * Backend description for the K7-style counters: generic single-MSR hooks
 * plus the K7 setup routine, using counter/event-select pair 0.
 * NOTE(review): the closing brace is not visible in this listing.
 */
339 static const struct wd_ops k7_wd_ops = {
340 .reserve = single_msr_reserve,
341 .unreserve = single_msr_unreserve,
342 .setup = setup_k7_watchdog,
343 .rearm = single_msr_rearm,
344 .stop = single_msr_stop_watchdog,
345 .perfctr = MSR_K7_PERFCTR0,
346 .evntsel = MSR_K7_EVNTSEL0,
/* counter bit tested by lapic_wd_event() to decide whether the counter is still running */
347 .checkbit = 1ULL << 47,
351 * Intel Model 6 (PPro+,P2,P3,P-M,Core1)
/*
 * Flag bits OR'ed into the event-select value by setup_p6_watchdog(), and
 * the event code used as the watchdog's NMI source.
 */
353 #define P6_EVNTSEL0_ENABLE (1 << 22)
354 #define P6_EVNTSEL_INT (1 << 20)
355 #define P6_EVNTSEL_OS (1 << 17)
356 #define P6_EVNTSEL_USR (1 << 16)
357 #define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
358 #define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
/*
 * Programs the P6 counter named by wd_ops as NMI source on the current CPU.
 * Visible sequence: probe the counter MSR with a fault-tolerant write, write
 * the event select without the enable bit, clamp nmi_hz for the 32-bit
 * counter write, load the counter, route LVTPC as NMI, then write the event
 * select again with P6_EVNTSEL0_ENABLE set and record the MSR numbers in
 * this CPU's nmi_watchdog_ctlblk.
 * NOTE(review): the statement taken when the probe write fails, the
 * continuation lines of the evntsel expression and the return statement are
 * not visible in this listing.
 */
360 static int setup_p6_watchdog(unsigned nmi_hz)
362 unsigned int perfctr_msr, evntsel_msr;
363 unsigned int evntsel;
364 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
366 perfctr_msr = wd_ops->perfctr;
367 evntsel_msr = wd_ops->evntsel;
369 /* KVM doesn't implement this MSR */
370 if (wrmsr_safe(perfctr_msr, 0, 0) < 0)
373 evntsel = P6_EVNTSEL_INT
378 /* setup the timer */
379 wrmsr(evntsel_msr, evntsel, 0);
380 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
381 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz);
382 apic_write(APIC_LVTPC, APIC_DM_NMI);
/* the enable bit is set only after the counter and LVTPC are programmed */
383 evntsel |= P6_EVNTSEL0_ENABLE;
384 wrmsr(evntsel_msr, evntsel, 0);
386 wd->perfctr_msr = perfctr_msr;
387 wd->evntsel_msr = evntsel_msr;
388 wd->cccr_msr = 0; /* unused */
/*
 * Rearm hook for 32-bit-write counters: rewrites LVTPC as NMI and then
 * reloads the recorded counter MSR through the 32-bit write helper.
 */
392 static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
395 * P6 based Pentium M need to re-unmask
396 * the apic vector but it doesn't hurt
398 * ArchPerfom/Core Duo also needs this
400 apic_write(APIC_LVTPC, APIC_DM_NMI);
402 /* P6/ARCH_PERFMON has 32 bit counter write */
403 write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz);
/*
 * Backend description for P6 counters: generic single-MSR reserve/unreserve/
 * stop hooks plus the P6 setup routine, using counter/event-select pair 0.
 * NOTE(review): the .rearm initializer and the closing brace are not visible
 * in this listing.
 */
406 static const struct wd_ops p6_wd_ops = {
407 .reserve = single_msr_reserve,
408 .unreserve = single_msr_unreserve,
409 .setup = setup_p6_watchdog,
411 .stop = single_msr_stop_watchdog,
412 .perfctr = MSR_P6_PERFCTR0,
413 .evntsel = MSR_P6_EVNTSEL0,
/* counter bit tested by lapic_wd_event() to decide whether the counter is still running */
414 .checkbit = 1ULL << 39,
418 * Intel P4 performance counters.
419 * By far the most complicated of all.
/*
 * Bit definitions used by the P4 backend: the availability flag tested in
 * MSR_IA32_MISC_ENABLE, ESCR (event select) fields and CCCR (counter
 * configuration) fields.
 */
421 #define MSR_P4_MISC_ENABLE_PERF_AVAIL (1 << 7)
422 #define P4_ESCR_EVENT_SELECT(N) ((N) << 25)
423 #define P4_ESCR_OS (1 << 3)
424 #define P4_ESCR_USR (1 << 2)
425 #define P4_CCCR_OVF_PMI0 (1 << 26)
426 #define P4_CCCR_OVF_PMI1 (1 << 27)
427 #define P4_CCCR_THRESHOLD(N) ((N) << 20)
428 #define P4_CCCR_COMPLEMENT (1 << 19)
429 #define P4_CCCR_COMPARE (1 << 18)
430 #define P4_CCCR_REQUIRED (3 << 16)
431 #define P4_CCCR_ESCR_SELECT(N) ((N) << 13)
432 #define P4_CCCR_ENABLE (1 << 12)
/* overflow flag; cleared again by p4_rearm() */
433 #define P4_CCCR_OVF (1 << 31)
436 * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
437 * CRU_ESCR0 (with any non-null event selector) through a complemented
438 * max threshold. [IA32-Vol3, Section 14.9.9]
/*
 * Programs the P4 counter used as NMI source on the current CPU.
 * Visible sequence: test MSR_P4_MISC_ENABLE_PERF_AVAIL in
 * MSR_IA32_MISC_ENABLE, choose the IQ_PERFCTR0/IQ_CCCR0 or
 * IQ_PERFCTR1/IQ_CCCR1 register set (both use MSR_P4_CRU_ESCR0), write the
 * ESCR and the CCCR with the counter still disabled, load the counter, route
 * LVTPC as NMI, then set P4_CCCR_ENABLE and record the MSR numbers in this
 * CPU's nmi_watchdog_ctlblk.
 * NOTE(review): the statements deriving the hyperthread number, the
 * condition choosing between the two register sets, the continuation lines
 * of the evntsel/cccr_val expressions and the return statements are not
 * visible in this listing.
 */
440 static int setup_p4_watchdog(unsigned nmi_hz)
442 unsigned int perfctr_msr, evntsel_msr, cccr_msr;
443 unsigned int evntsel, cccr_val;
444 unsigned int misc_enable, dummy;
446 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
448 rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
449 if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
453 /* detect which hyperthread we are on */
454 if (smp_num_siblings == 2) {
455 unsigned int ebx, apicid;
458 apicid = (ebx >> 24) & 0xff;
465 * performance counters are shared resources
466 * assign each hyperthread its own set
467 * (re-use the ESCR0 register, seems safe
468 * and keeps the cccr_val the same)
472 perfctr_msr = MSR_P4_IQ_PERFCTR0;
473 evntsel_msr = MSR_P4_CRU_ESCR0;
474 cccr_msr = MSR_P4_IQ_CCCR0;
475 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
478 perfctr_msr = MSR_P4_IQ_PERFCTR1;
479 evntsel_msr = MSR_P4_CRU_ESCR0;
480 cccr_msr = MSR_P4_IQ_CCCR1;
481 cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
484 evntsel = P4_ESCR_EVENT_SELECT(0x3F)
488 cccr_val |= P4_CCCR_THRESHOLD(15)
493 wrmsr(evntsel_msr, evntsel, 0);
494 wrmsr(cccr_msr, cccr_val, 0);
495 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
496 apic_write(APIC_LVTPC, APIC_DM_NMI);
/* the enable bit is set only after the counter and LVTPC are programmed */
497 cccr_val |= P4_CCCR_ENABLE;
498 wrmsr(cccr_msr, cccr_val, 0);
499 wd->perfctr_msr = perfctr_msr;
500 wd->evntsel_msr = evntsel_msr;
501 wd->cccr_msr = cccr_msr;
/*
 * Stop hook for the P4 backend: writes 0 to the CCCR and then to the ESCR
 * recorded for the current CPU.
 */
505 static void stop_p4_watchdog(void)
507 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
508 wrmsr(wd->cccr_msr, 0, 0);
509 wrmsr(wd->evntsel_msr, 0, 0);
/*
 * Reserve hook for the P4 backend: reserves MSR_P4_IQ_PERFCTR0, additionally
 * MSR_P4_IQ_PERFCTR1 when smp_num_siblings > 1, and MSR_P4_CRU_ESCR0. The
 * trailing release calls undo the counter reservations.
 * NOTE(review): the return statements and the jump targets that lead to the
 * release calls are not visible in this listing; callers treat a zero
 * result as failure.
 */
512 static int p4_reserve(void)
514 if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0))
517 if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1))
520 if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0))
522 /* RED-PEN why is ESCR1 not reserved here? */
526 if (smp_num_siblings > 1)
527 release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
530 release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
/*
 * Unreserve hook matching p4_reserve(): releases MSR_P4_IQ_PERFCTR1 when
 * smp_num_siblings > 1, then MSR_P4_CRU_ESCR0 and MSR_P4_IQ_PERFCTR0.
 */
534 static void p4_unreserve(void)
537 if (smp_num_siblings > 1)
538 release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
540 release_evntsel_nmi(MSR_P4_CRU_ESCR0);
541 release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
/*
 * Rearm hook for the P4 backend: clears P4_CCCR_OVF in the recorded CCCR,
 * rewrites LVTPC as NMI and reloads the counter for the next period.
 * NOTE(review): the declaration of `dummy` is not visible in this listing.
 */
544 static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
549 * - An overflown perfctr will assert its interrupt
550 * until the OVF flag in its CCCR is cleared.
551 * - LVTPC is masked on interrupt and must be
552 * unmasked by the LVTPC handler.
554 rdmsrl(wd->cccr_msr, dummy);
555 dummy &= ~P4_CCCR_OVF;
556 wrmsrl(wd->cccr_msr, dummy);
557 apic_write(APIC_LVTPC, APIC_DM_NMI);
558 /* start the cycle over again */
559 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
/*
 * Backend description for P4 counters, using the P4-specific reserve,
 * unreserve, setup and stop hooks.
 * NOTE(review): the .rearm initializer and the closing brace are not visible
 * in this listing. The .perfctr/.evntsel values here differ from the IQ/CRU
 * registers that setup_p4_watchdog() and p4_reserve() actually use; the
 * existing RED-PEN remark flags this area.
 */
562 static const struct wd_ops p4_wd_ops = {
563 .reserve = p4_reserve,
564 .unreserve = p4_unreserve,
565 .setup = setup_p4_watchdog,
567 .stop = stop_p4_watchdog,
568 /* RED-PEN this is wrong for the other sibling */
569 .perfctr = MSR_P4_BPU_PERFCTR0,
570 .evntsel = MSR_P4_BSU_ESCR0,
/* counter bit tested by lapic_wd_event() to decide whether the counter is still running */
571 .checkbit = 1ULL << 39,
575 * Watchdog using the Intel architected PerfMon.
576 * Used for Core2 and hopefully all future Intel CPUs.
/* Event select and unit mask used as the NMI source by setup_intel_arch_watchdog(). */
578 #define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
579 #define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
/*
 * Forward declaration: the setup routine below writes .checkbit of this
 * object before its initializer appears, so it is not const.
 */
581 static struct wd_ops intel_arch_wd_ops;
/*
 * Programs the architectural-perfmon counter named by wd_ops as NMI source
 * on the current CPU.
 * Visible sequence: query CPUID leaf 10 and check that the unhalted core
 * cycles event is supported, clear the counter, write the event select
 * without the enable bit, clamp nmi_hz for the 32-bit counter write, load
 * the counter, route LVTPC as NMI, then write the event select again with
 * the enable bit and record the MSR numbers in this CPU's
 * nmi_watchdog_ctlblk. Finally intel_arch_wd_ops.checkbit is set to the top
 * bit of the counter width reported by CPUID.
 * NOTE(review): the declarations of ebx/unused, the statement taken when the
 * event is unsupported and the return statement are not visible in this
 * listing.
 */
583 static int setup_intel_arch_watchdog(unsigned nmi_hz)
586 union cpuid10_eax eax;
588 unsigned int perfctr_msr, evntsel_msr;
589 unsigned int evntsel;
590 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
593 * Check whether the Architectural PerfMon supports
594 * Unhalted Core Cycles Event or not.
595 * NOTE: Corresponding bit = 0 in ebx indicates event present.
597 cpuid(10, &(eax.full), &ebx, &unused, &unused);
598 if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
599 (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
602 perfctr_msr = wd_ops->perfctr;
603 evntsel_msr = wd_ops->evntsel;
605 wrmsrl(perfctr_msr, 0UL);
607 evntsel = ARCH_PERFMON_EVENTSEL_INT
608 | ARCH_PERFMON_EVENTSEL_OS
609 | ARCH_PERFMON_EVENTSEL_USR
610 | ARCH_PERFMON_NMI_EVENT_SEL
611 | ARCH_PERFMON_NMI_EVENT_UMASK;
613 /* setup the timer */
614 wrmsr(evntsel_msr, evntsel, 0);
615 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
616 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
617 apic_write(APIC_LVTPC, APIC_DM_NMI);
/* the enable bit is set only after the counter and LVTPC are programmed */
618 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
619 wrmsr(evntsel_msr, evntsel, 0);
621 wd->perfctr_msr = perfctr_msr;
622 wd->evntsel_msr = evntsel_msr;
623 wd->cccr_msr = 0; /* unused */
/* NOTE(review): assumes CPUID reports bit_width >= 1; a value of 0 would make the shift count negative */
624 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
/*
 * Backend description for architectural perfmon. It is writable because
 * setup_intel_arch_watchdog() fills in .checkbit and probe_nmi_watchdog()
 * may redirect .perfctr/.evntsel to counter 0.
 * NOTE(review): the .rearm initializer and the closing brace are not visible
 * in this listing.
 */
628 static struct wd_ops intel_arch_wd_ops __read_mostly = {
629 .reserve = single_msr_reserve,
630 .unreserve = single_msr_unreserve,
631 .setup = setup_intel_arch_watchdog,
633 .stop = single_msr_stop_watchdog,
/* defaults to counter/event-select pair 1 */
634 .perfctr = MSR_ARCH_PERFMON_PERFCTR1,
635 .evntsel = MSR_ARCH_PERFMON_EVENTSEL1,
/*
 * Selects the watchdog backend for the boot CPU from its vendor, family and
 * model, storing the choice in wd_ops.
 * Visible logic: one vendor branch accepts only families 6, 15 and 16; the
 * Intel branch first redirects intel_arch_wd_ops to counter 0 on family 6
 * model 14, then picks intel_arch_wd_ops when X86_FEATURE_ARCH_PERFMON is
 * set, and otherwise switches on the family.
 * NOTE(review): the first vendor case label, the backend assignments for the
 * non-arch-perfmon paths and the statements guarded by the family/model
 * checks are not visible in this listing.
 */
638 static void probe_nmi_watchdog(void)
640 switch (boot_cpu_data.x86_vendor) {
642 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
643 boot_cpu_data.x86 != 16)
647 case X86_VENDOR_INTEL:
649 * Work around Core Duo (Yonah) errata AE49 where perfctr1
650 * doesn't have a working enable bit.
652 if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) {
653 intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0;
654 intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0;
656 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
657 wd_ops = &intel_arch_wd_ops;
660 switch (boot_cpu_data.x86) {
662 if (boot_cpu_data.x86_model > 13)
677 /* Interface to nmi.c */
/*
 * Initializes the perfctr watchdog on the calling CPU at rate nmi_hz:
 * probes for a backend, reserves its counters and runs its setup hook,
 * logging a message for each failure.
 * NOTE(review): the conditions around the probe call and the "CPU not
 * supported" message, and all return statements, are not visible in this
 * listing.
 */
679 int lapic_watchdog_init(unsigned nmi_hz)
682 probe_nmi_watchdog();
684 printk(KERN_INFO "NMI watchdog: CPU not supported\n");
/* a zero result from reserve() is reported as failure */
688 if (!wd_ops->reserve()) {
690 "NMI watchdog: cannot reserve perfctrs\n");
/* a zero result from setup() is reported as failure */
695 if (!(wd_ops->setup(nmi_hz))) {
696 printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n",
697 raw_smp_processor_id());
/*
 * Stop entry point of the watchdog interface.
 * NOTE(review): the body is not visible in this listing; presumably it
 * delegates to wd_ops->stop() — confirm against the full file.
 */
704 void lapic_watchdog_stop(void)
/*
 * Clamps a requested NMI rate through adjust_for_32bit_ctr() when the
 * current CPU's recorded counter MSR is MSR_P6_PERFCTR0 or
 * MSR_ARCH_PERFMON_PERFCTR1, i.e. one of the counters loaded with the 32-bit
 * write helper.
 * NOTE(review): probe_nmi_watchdog() can switch the arch-perfmon backend to
 * MSR_ARCH_PERFMON_PERFCTR0, which is not named here; confirm it has the
 * same MSR number as MSR_P6_PERFCTR0. The return statement is not visible
 * in this listing.
 */
710 unsigned lapic_adjust_nmi_hz(unsigned hz)
712 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
713 if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
714 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1)
715 hz = adjust_for_32bit_ctr(hz);
/*
 * Called on an NMI to decide whether the watchdog counter caused it: reads
 * the current CPU's counter MSR and tests the backend's checkbit. If the bit
 * is still set the counter is taken to be still running; otherwise the
 * backend's rearm hook reloads it.
 * NOTE(review): the declaration of `ctr` and both return statements are not
 * visible in this listing.
 */
719 int lapic_wd_event(unsigned nmi_hz)
721 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
724 rdmsrl(wd->perfctr_msr, ctr);
725 if (ctr & wd_ops->checkbit) /* perfctr still running? */
728 wd_ops->rearm(wd, nmi_hz);
/* Returns non-zero when a watchdog backend has been selected (wd_ops is set). */
732 int lapic_watchdog_ok(void)
734 return wd_ops != NULL;