/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 */
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/poll.h>
#include <linux/thread_info.h>
#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/kdebug.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/smp.h>
#include <asm/uaccess.h>
33 #define MISC_MCELOG_MINOR 227
38 static int mce_dont_init;
40 /* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
41 3: never panic or exit (for testing only) */
42 static int tolerant = 1;
44 static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
45 static unsigned long notify_user;
47 static int mce_bootlog = 1;
48 static atomic_t mce_events;
50 static char trigger[128];
51 static char *trigger_argv[2] = { trigger, NULL };
53 static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
56 * Lockless MCE logging infrastructure.
57 * This avoids deadlocks on printk locks without having to break locks. Also
58 * separate MCEs from kernel messages to avoid bogus bug reports.
61 struct mce_log mcelog = {
66 void mce_log(struct mce *mce)
69 atomic_inc(&mce_events);
73 entry = rcu_dereference(mcelog.next);
74 /* The rmb forces the compiler to reload next in each
78 /* When the buffer fills up discard new entries. Assume
79 that the earlier errors are the more interesting. */
80 if (entry >= MCE_LOG_LEN) {
81 set_bit(MCE_OVERFLOW, &mcelog.flags);
84 /* Old left over entry. Skip. */
85 if (mcelog.entry[entry].finished) {
93 if (cmpxchg(&mcelog.next, entry, next) == entry)
96 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
98 mcelog.entry[entry].finished = 1;
101 set_bit(0, ¬ify_user);
104 static void print_mce(struct mce *m)
106 printk(KERN_EMERG "\n"
107 KERN_EMERG "HARDWARE ERROR\n"
109 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
110 m->cpu, m->mcgstatus, m->bank, m->status);
113 "RIP%s %02x:<%016Lx> ",
114 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
116 if (m->cs == __KERNEL_CS)
117 print_symbol("{%s}", m->rip);
120 printk(KERN_EMERG "TSC %Lx ", m->tsc);
122 printk("ADDR %Lx ", m->addr);
124 printk("MISC %Lx ", m->misc);
126 printk(KERN_EMERG "This is not a software problem!\n");
128 "Run through mcelog --ascii to decode and contact your hardware vendor\n");
131 static void mce_panic(char *msg, struct mce *backup, unsigned long start)
139 for (i = 0; i < MCE_LOG_LEN; i++) {
140 unsigned long tsc = mcelog.entry[i].tsc;
141 if (time_before(tsc, start))
143 print_mce(&mcelog.entry[i]);
144 if (backup && mcelog.entry[i].tsc == backup->tsc)
152 static int mce_available(struct cpuinfo_x86 *c)
154 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
157 static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
159 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
167 /* Assume the RIP in the MSR is exact. Is this true? */
168 m->mcgstatus |= MCG_STATUS_EIPV;
169 rdmsrl(rip_msr, m->rip);
175 * The actual machine check handler
178 void do_machine_check(struct pt_regs * regs, long error_code)
180 struct mce m, panicm;
181 int nowayout = (tolerant < 1);
185 int panicm_found = 0;
187 atomic_inc(&mce_entry);
190 notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
194 memset(&m, 0, sizeof(struct mce));
195 m.cpu = smp_processor_id();
196 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
197 if (!(m.mcgstatus & MCG_STATUS_RIPV))
203 for (i = 0; i < banks; i++) {
212 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
213 if ((m.status & MCI_STATUS_VAL) == 0)
216 if (m.status & MCI_STATUS_EN) {
217 /* In theory _OVER could be a nowayout too, but
218 assume any overflowed errors were no fatal. */
219 nowayout |= !!(m.status & MCI_STATUS_PCC);
220 kill_it |= !!(m.status & MCI_STATUS_UC);
223 if (m.status & MCI_STATUS_MISCV)
224 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
225 if (m.status & MCI_STATUS_ADDRV)
226 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
228 mce_get_rip(&m, regs);
231 wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
232 if (error_code != -2)
235 /* Did this bank cause the exception? */
236 /* Assume that the bank with uncorrectable errors did it,
237 and that there is only a single one. */
238 if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
243 add_taint(TAINT_MACHINE_CHECK);
246 /* Never do anything final in the polling timer */
250 /* If we didn't find an uncorrectable error, pick
251 the last one (shouldn't happen, just being safe). */
255 mce_panic("Machine check", &panicm, mcestart);
259 if (m.mcgstatus & MCG_STATUS_RIPV)
260 user_space = panicm.rip && (panicm.cs & 3);
262 /* When the machine was in user space and the CPU didn't get
263 confused it's normally not necessary to panic, unless you
264 are paranoid (tolerant == 0)
266 RED-PEN could be more tolerant for MCEs in idle,
267 but most likely they occur at boot anyways, where
268 it is best to just halt the machine. */
269 if ((!user_space && (panic_on_oops || tolerant < 2)) ||
270 (unsigned)current->pid <= 1)
271 mce_panic("Uncorrected machine check", &panicm, mcestart);
273 /* do_exit takes an awful lot of locks and has as
274 slight risk of deadlocking. If you don't want that
275 don't set tolerant >= 2 */
280 /* notify userspace ASAP */
281 set_thread_flag(TIF_MCE_NOTIFY);
284 /* Last thing done in the machine check exception to clear state. */
285 wrmsrl(MSR_IA32_MCG_STATUS, 0);
287 atomic_dec(&mce_entry);
#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @cpu: The CPU on which the event occurred.
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
{
	struct mce m;

	memset(&m, 0, sizeof(m));
	m.cpu = cpu;
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	rdtscll(m.tsc);
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */
/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */

static int check_interval = 5 * 60; /* 5 minutes */
static int next_interval; /* in jiffies */
static void mcheck_timer(struct work_struct *work);
static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
328 static void mcheck_check_cpu(void *info)
330 if (mce_available(¤t_cpu_data))
331 do_machine_check(NULL, 0);
334 static void mcheck_timer(struct work_struct *work)
336 on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
339 * Alert userspace if needed. If we logged an MCE, reduce the
340 * polling interval, otherwise increase the polling interval.
342 if (mce_notify_user()) {
343 next_interval = max(next_interval/2, HZ/100);
345 next_interval = min(next_interval*2, check_interval*HZ);
348 schedule_delayed_work(&mcheck_work, next_interval);
352 * This is only called from process context. This is where we do
353 * anything we need to alert userspace about new MCEs. This is called
354 * directly from the poller and also from entry.S and idle, thanks to
357 int mce_notify_user(void)
359 clear_thread_flag(TIF_MCE_NOTIFY);
360 if (test_and_clear_bit(0, ¬ify_user)) {
361 static unsigned long last_print;
362 unsigned long now = jiffies;
364 wake_up_interruptible(&mce_wait);
366 call_usermodehelper(trigger, trigger_argv, NULL,
369 if (time_after_eq(now, last_print + (check_interval*HZ))) {
371 printk(KERN_INFO "Machine check events logged\n");
379 /* see if the idle task needs to notify userspace */
381 mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
383 /* IDLE_END should be safe - interrupts are back on */
384 if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
390 static struct notifier_block mce_idle_notifier = {
391 .notifier_call = mce_idle_callback,
394 static __init int periodic_mcheck_init(void)
396 next_interval = check_interval * HZ;
398 schedule_delayed_work(&mcheck_work, next_interval);
399 idle_notifier_register(&mce_idle_notifier);
402 __initcall(periodic_mcheck_init);
406 * Initialize Machine Checks for a CPU.
408 static void mce_init(void *dummy)
413 rdmsrl(MSR_IA32_MCG_CAP, cap);
415 if (banks > NR_BANKS) {
416 printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
419 /* Use accurate RIP reporting if available. */
420 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
421 rip_msr = MSR_IA32_MCG_EIP;
423 /* Log the machine checks left over from the previous reset.
424 This also clears all registers */
425 do_machine_check(NULL, mce_bootlog ? -1 : -2);
427 set_in_cr4(X86_CR4_MCE);
430 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
432 for (i = 0; i < banks; i++) {
433 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
434 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
438 /* Add per CPU specific workarounds here */
439 static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
441 /* This should be disabled by the BIOS, but isn't always */
442 if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
443 /* disable GART TBL walk error reporting, which trips off
444 incorrectly with the IOMMU & 3ware & Cerberus. */
445 clear_bit(10, &bank[4]);
446 /* Lots of broken BIOS around that don't clear them
447 by default and leave crap in there. Don't log. */
453 static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
455 switch (c->x86_vendor) {
456 case X86_VENDOR_INTEL:
457 mce_intel_feature_init(c);
460 mce_amd_feature_init(c);
468 * Called for each booted CPU to set up machine checks.
469 * Must be called with preempt off.
471 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
473 static cpumask_t mce_cpus = CPU_MASK_NONE;
478 cpu_test_and_set(smp_processor_id(), mce_cpus) ||
487 * Character device to read and clear the MCE log.
490 static DEFINE_SPINLOCK(mce_state_lock);
491 static int open_count; /* #times opened */
492 static int open_exclu; /* already open exclusive? */
494 static int mce_open(struct inode *inode, struct file *file)
496 spin_lock(&mce_state_lock);
498 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
499 spin_unlock(&mce_state_lock);
503 if (file->f_flags & O_EXCL)
507 spin_unlock(&mce_state_lock);
512 static int mce_release(struct inode *inode, struct file *file)
514 spin_lock(&mce_state_lock);
519 spin_unlock(&mce_state_lock);
/* Record the current TSC of the executing CPU into the shared array. */
static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}
530 static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
532 unsigned long *cpu_tsc;
533 static DECLARE_MUTEX(mce_read_sem);
535 char __user *buf = ubuf;
538 cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
543 next = rcu_dereference(mcelog.next);
545 /* Only supports full reads right now */
546 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
553 for (i = 0; i < next; i++) {
554 unsigned long start = jiffies;
555 while (!mcelog.entry[i].finished) {
556 if (time_after_eq(jiffies, start + 2)) {
557 memset(mcelog.entry + i,0, sizeof(struct mce));
563 err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
564 buf += sizeof(struct mce);
569 memset(mcelog.entry, 0, next * sizeof(struct mce));
574 /* Collect entries that were still getting written before the synchronize. */
576 on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
577 for (i = next; i < MCE_LOG_LEN; i++) {
578 if (mcelog.entry[i].finished &&
579 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
580 err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
582 buf += sizeof(struct mce);
583 memset(&mcelog.entry[i], 0, sizeof(struct mce));
588 return err ? -EFAULT : buf - ubuf;
591 static unsigned int mce_poll(struct file *file, poll_table *wait)
593 poll_wait(file, &mce_wait, wait);
594 if (rcu_dereference(mcelog.next))
595 return POLLIN | POLLRDNORM;
599 static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
601 int __user *p = (int __user *)arg;
602 if (!capable(CAP_SYS_ADMIN))
605 case MCE_GET_RECORD_LEN:
606 return put_user(sizeof(struct mce), p);
607 case MCE_GET_LOG_LEN:
608 return put_user(MCE_LOG_LEN, p);
609 case MCE_GETCLEAR_FLAGS: {
612 flags = mcelog.flags;
613 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
614 return put_user(flags, p);
621 static const struct file_operations mce_chrdev_ops = {
623 .release = mce_release,
629 static struct miscdevice mce_log_device = {
636 * Old style boot options parsing. Only for compatibility.
639 static int __init mcheck_disable(char *str)
645 /* mce=off disables machine check. Note you can reenable it later
647 mce=TOLERANCELEVEL (number, see above)
648 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
649 mce=nobootlog Don't log MCEs from before booting. */
650 static int __init mcheck_enable(char *str)
654 if (!strcmp(str, "off"))
656 else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
657 mce_bootlog = str[0] == 'b';
658 else if (isdigit(str[0]))
659 get_option(&str, &tolerant);
661 printk("mce= argument %s ignored. Please use /sys", str);
665 __setup("nomce", mcheck_disable);
666 __setup("mce", mcheck_enable);
672 /* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
673 Only one CPU is active at this time, the others get readded later using
675 static int mce_resume(struct sys_device *dev)
681 /* Reinit MCEs after user configuration changes */
682 static void mce_restart(void)
685 cancel_delayed_work(&mcheck_work);
686 /* Timer race is harmless here */
687 on_each_cpu(mce_init, NULL, 1, 1);
688 next_interval = check_interval * HZ;
690 schedule_delayed_work(&mcheck_work, next_interval);
693 static struct sysdev_class mce_sysclass = {
694 .resume = mce_resume,
695 set_kset_name("machinecheck"),
698 DEFINE_PER_CPU(struct sys_device, device_mce);
700 /* Why are there no generic functions for this? */
701 #define ACCESSOR(name, var, start) \
702 static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
703 return sprintf(buf, "%lx\n", (unsigned long)var); \
705 static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
707 unsigned long new = simple_strtoul(buf, &end, 0); \
708 if (end == buf) return -EINVAL; \
713 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
715 /* TBD should generate these dynamically based on number of available banks */
716 ACCESSOR(bank0ctl,bank[0],mce_restart())
717 ACCESSOR(bank1ctl,bank[1],mce_restart())
718 ACCESSOR(bank2ctl,bank[2],mce_restart())
719 ACCESSOR(bank3ctl,bank[3],mce_restart())
720 ACCESSOR(bank4ctl,bank[4],mce_restart())
721 ACCESSOR(bank5ctl,bank[5],mce_restart())
723 static ssize_t show_trigger(struct sys_device *s, char *buf)
725 strcpy(buf, trigger);
727 return strlen(trigger) + 1;
730 static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
734 strncpy(trigger, buf, sizeof(trigger));
735 trigger[sizeof(trigger)-1] = 0;
736 len = strlen(trigger);
737 p = strchr(trigger, '\n');
742 static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
743 ACCESSOR(tolerant,tolerant,)
744 ACCESSOR(check_interval,check_interval,mce_restart())
745 static struct sysdev_attribute *mce_attributes[] = {
746 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
747 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
748 &attr_tolerant, &attr_check_interval, &attr_trigger,
752 /* Per cpu sysdev init. All of the cpus still share the same ctl bank */
753 static __cpuinit int mce_create_device(unsigned int cpu)
757 if (!mce_available(&cpu_data[cpu]))
760 per_cpu(device_mce,cpu).id = cpu;
761 per_cpu(device_mce,cpu).cls = &mce_sysclass;
763 err = sysdev_register(&per_cpu(device_mce,cpu));
766 for (i = 0; mce_attributes[i]; i++)
767 sysdev_create_file(&per_cpu(device_mce,cpu),
773 static void mce_remove_device(unsigned int cpu)
777 for (i = 0; mce_attributes[i]; i++)
778 sysdev_remove_file(&per_cpu(device_mce,cpu),
780 sysdev_unregister(&per_cpu(device_mce,cpu));
781 memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
784 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
786 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
788 unsigned int cpu = (unsigned long)hcpu;
792 case CPU_ONLINE_FROZEN:
793 mce_create_device(cpu);
796 case CPU_DEAD_FROZEN:
797 mce_remove_device(cpu);
803 static struct notifier_block mce_cpu_notifier = {
804 .notifier_call = mce_cpu_callback,
807 static __init int mce_init_device(void)
812 if (!mce_available(&boot_cpu_data))
814 err = sysdev_class_register(&mce_sysclass);
816 for_each_online_cpu(i) {
817 mce_create_device(i);
820 register_hotcpu_notifier(&mce_cpu_notifier);
821 misc_register(&mce_log_device);
825 device_initcall(mce_init_device);