/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 */
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/poll.h>
#include <linux/thread_info.h>
#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/kdebug.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/smp.h>
#include <asm/uaccess.h>
33 #define MISC_MCELOG_MINOR 227
38 static int mce_dont_init;
40 /* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
41 3: never panic or exit (for testing only) */
42 static int tolerant = 1;
44 static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
45 static unsigned long notify_user;
47 static int mce_bootlog = 1;
48 static atomic_t mce_events;
50 static char trigger[128];
51 static char *trigger_argv[2] = { trigger, NULL };
53 static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
56 * Lockless MCE logging infrastructure.
57 * This avoids deadlocks on printk locks without having to break locks. Also
58 * separate MCEs from kernel messages to avoid bogus bug reports.
61 struct mce_log mcelog = {
66 void mce_log(struct mce *mce)
69 atomic_inc(&mce_events);
73 entry = rcu_dereference(mcelog.next);
74 /* The rmb forces the compiler to reload next in each
78 /* When the buffer fills up discard new entries. Assume
79 that the earlier errors are the more interesting. */
80 if (entry >= MCE_LOG_LEN) {
81 set_bit(MCE_OVERFLOW, &mcelog.flags);
84 /* Old left over entry. Skip. */
85 if (mcelog.entry[entry].finished) {
93 if (cmpxchg(&mcelog.next, entry, next) == entry)
96 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
98 mcelog.entry[entry].finished = 1;
101 set_bit(0, ¬ify_user);
104 static void print_mce(struct mce *m)
106 printk(KERN_EMERG "\n"
107 KERN_EMERG "HARDWARE ERROR\n"
109 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
110 m->cpu, m->mcgstatus, m->bank, m->status);
113 "RIP%s %02x:<%016Lx> ",
114 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
116 if (m->cs == __KERNEL_CS)
117 print_symbol("{%s}", m->rip);
120 printk(KERN_EMERG "TSC %Lx ", m->tsc);
122 printk("ADDR %Lx ", m->addr);
124 printk("MISC %Lx ", m->misc);
126 printk(KERN_EMERG "This is not a software problem!\n");
128 "Run through mcelog --ascii to decode and contact your hardware vendor\n");
131 static void mce_panic(char *msg, struct mce *backup, unsigned long start)
139 for (i = 0; i < MCE_LOG_LEN; i++) {
140 unsigned long tsc = mcelog.entry[i].tsc;
141 if (time_before(tsc, start))
143 print_mce(&mcelog.entry[i]);
144 if (backup && mcelog.entry[i].tsc == backup->tsc)
152 static int mce_available(struct cpuinfo_x86 *c)
154 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
157 static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
159 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
167 /* Assume the RIP in the MSR is exact. Is this true? */
168 m->mcgstatus |= MCG_STATUS_EIPV;
169 rdmsrl(rip_msr, m->rip);
175 * The actual machine check handler
178 void do_machine_check(struct pt_regs * regs, long error_code)
180 struct mce m, panicm;
181 int nowayout = (tolerant < 1);
185 int panicm_found = 0;
187 atomic_inc(&mce_entry);
190 notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
194 memset(&m, 0, sizeof(struct mce));
195 m.cpu = smp_processor_id();
196 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
197 if (!(m.mcgstatus & MCG_STATUS_RIPV))
203 for (i = 0; i < banks; i++) {
212 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
213 if ((m.status & MCI_STATUS_VAL) == 0)
216 if (m.status & MCI_STATUS_EN) {
217 /* In theory _OVER could be a nowayout too, but
218 assume any overflowed errors were no fatal. */
219 nowayout |= !!(m.status & MCI_STATUS_PCC);
220 kill_it |= !!(m.status & MCI_STATUS_UC);
223 if (m.status & MCI_STATUS_MISCV)
224 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
225 if (m.status & MCI_STATUS_ADDRV)
226 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
228 mce_get_rip(&m, regs);
231 wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
232 if (error_code != -2)
235 /* Did this bank cause the exception? */
236 /* Assume that the bank with uncorrectable errors did it,
237 and that there is only a single one. */
238 if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
243 add_taint(TAINT_MACHINE_CHECK);
246 /* Never do anything final in the polling timer */
250 /* If we didn't find an uncorrectable error, pick
251 the last one (shouldn't happen, just being safe). */
255 mce_panic("Machine check", &panicm, mcestart);
259 if (m.mcgstatus & MCG_STATUS_RIPV)
260 user_space = panicm.rip && (panicm.cs & 3);
262 /* When the machine was in user space and the CPU didn't get
263 confused it's normally not necessary to panic, unless you
264 are paranoid (tolerant == 0)
266 RED-PEN could be more tolerant for MCEs in idle,
267 but most likely they occur at boot anyways, where
268 it is best to just halt the machine. */
269 if ((!user_space && (panic_on_oops || tolerant < 2)) ||
270 (unsigned)current->pid <= 1)
271 mce_panic("Uncorrected machine check", &panicm, mcestart);
273 /* do_exit takes an awful lot of locks and has as
274 slight risk of deadlocking. If you don't want that
275 don't set tolerant >= 2 */
280 /* notify userspace ASAP */
281 set_thread_flag(TIF_MCE_NOTIFY);
284 /* Last thing done in the machine check exception to clear state. */
285 wrmsrl(MSR_IA32_MCG_STATUS, 0);
287 atomic_dec(&mce_entry);
#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @cpu: The CPU on which the event occurred.
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
{
	struct mce m;

	memset(&m, 0, sizeof(m));
	m.cpu = cpu;
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	rdtscll(m.tsc);
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */
/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */

static int check_interval = 5 * 60; /* 5 minutes */
static int next_interval; /* in jiffies */
static void mcheck_timer(struct work_struct *work);
static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
328 static void mcheck_check_cpu(void *info)
330 if (mce_available(¤t_cpu_data))
331 do_machine_check(NULL, 0);
334 static void mcheck_timer(struct work_struct *work)
336 on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
339 * Alert userspace if needed. If we logged an MCE, reduce the
340 * polling interval, otherwise increase the polling interval.
342 if (mce_notify_user()) {
343 next_interval = max(next_interval/2, HZ/100);
345 next_interval = min(next_interval*2, check_interval*HZ);
348 schedule_delayed_work(&mcheck_work, next_interval);
352 * This is only called from process context. This is where we do
353 * anything we need to alert userspace about new MCEs. This is called
354 * directly from the poller and also from entry.S and idle, thanks to
357 int mce_notify_user(void)
359 clear_thread_flag(TIF_MCE_NOTIFY);
360 if (test_and_clear_bit(0, ¬ify_user)) {
361 static unsigned long last_print;
362 unsigned long now = jiffies;
364 wake_up_interruptible(&mce_wait);
366 call_usermodehelper(trigger, trigger_argv, NULL,
369 if (time_after_eq(now, last_print + (check_interval*HZ))) {
371 printk(KERN_INFO "Machine check events logged\n");
379 /* see if the idle task needs to notify userspace */
381 mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
383 /* IDLE_END should be safe - interrupts are back on */
384 if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
390 static struct notifier_block mce_idle_notifier = {
391 .notifier_call = mce_idle_callback,
394 static __init int periodic_mcheck_init(void)
396 next_interval = check_interval * HZ;
398 schedule_delayed_work(&mcheck_work, next_interval);
399 idle_notifier_register(&mce_idle_notifier);
402 __initcall(periodic_mcheck_init);
406 * Initialize Machine Checks for a CPU.
408 static void mce_init(void *dummy)
413 rdmsrl(MSR_IA32_MCG_CAP, cap);
415 if (banks > NR_BANKS) {
416 printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
419 /* Use accurate RIP reporting if available. */
420 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
421 rip_msr = MSR_IA32_MCG_EIP;
423 /* Log the machine checks left over from the previous reset.
424 This also clears all registers */
425 do_machine_check(NULL, mce_bootlog ? -1 : -2);
427 set_in_cr4(X86_CR4_MCE);
430 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
432 for (i = 0; i < banks; i++) {
433 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
434 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
438 /* Add per CPU specific workarounds here */
439 static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
441 /* This should be disabled by the BIOS, but isn't always */
442 if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
443 /* disable GART TBL walk error reporting, which trips off
444 incorrectly with the IOMMU & 3ware & Cerberus. */
445 clear_bit(10, &bank[4]);
446 /* Lots of broken BIOS around that don't clear them
447 by default and leave crap in there. Don't log. */
453 static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
455 switch (c->x86_vendor) {
456 case X86_VENDOR_INTEL:
457 mce_intel_feature_init(c);
460 mce_amd_feature_init(c);
468 * Called for each booted CPU to set up machine checks.
469 * Must be called with preempt off.
471 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
473 static cpumask_t mce_cpus = CPU_MASK_NONE;
478 cpu_test_and_set(smp_processor_id(), mce_cpus) ||
487 * Character device to read and clear the MCE log.
490 static DEFINE_SPINLOCK(mce_state_lock);
491 static int open_count; /* #times opened */
492 static int open_exclu; /* already open exclusive? */
494 static int mce_open(struct inode *inode, struct file *file)
496 spin_lock(&mce_state_lock);
498 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
499 spin_unlock(&mce_state_lock);
503 if (file->f_flags & O_EXCL)
507 spin_unlock(&mce_state_lock);
512 static int mce_release(struct inode *inode, struct file *file)
514 spin_lock(&mce_state_lock);
519 spin_unlock(&mce_state_lock);
/* Record the current TSC of the executing CPU into the shared array. */
static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}
530 static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
532 unsigned long *cpu_tsc;
533 static DECLARE_MUTEX(mce_read_sem);
535 char __user *buf = ubuf;
538 cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
543 next = rcu_dereference(mcelog.next);
545 /* Only supports full reads right now */
546 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
553 for (i = 0; i < next; i++) {
554 unsigned long start = jiffies;
555 while (!mcelog.entry[i].finished) {
556 if (time_after_eq(jiffies, start + 2)) {
557 memset(mcelog.entry + i,0, sizeof(struct mce));
563 err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
564 buf += sizeof(struct mce);
569 memset(mcelog.entry, 0, next * sizeof(struct mce));
574 /* Collect entries that were still getting written before the synchronize. */
576 on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
577 for (i = next; i < MCE_LOG_LEN; i++) {
578 if (mcelog.entry[i].finished &&
579 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
580 err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
582 buf += sizeof(struct mce);
583 memset(&mcelog.entry[i], 0, sizeof(struct mce));
588 return err ? -EFAULT : buf - ubuf;
591 static unsigned int mce_poll(struct file *file, poll_table *wait)
593 poll_wait(file, &mce_wait, wait);
594 if (rcu_dereference(mcelog.next))
595 return POLLIN | POLLRDNORM;
599 static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
601 int __user *p = (int __user *)arg;
602 if (!capable(CAP_SYS_ADMIN))
605 case MCE_GET_RECORD_LEN:
606 return put_user(sizeof(struct mce), p);
607 case MCE_GET_LOG_LEN:
608 return put_user(MCE_LOG_LEN, p);
609 case MCE_GETCLEAR_FLAGS: {
612 flags = mcelog.flags;
613 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
614 return put_user(flags, p);
621 static const struct file_operations mce_chrdev_ops = {
623 .release = mce_release,
629 static struct miscdevice mce_log_device = {
636 * Old style boot options parsing. Only for compatibility.
639 static int __init mcheck_disable(char *str)
645 /* mce=off disables machine check. Note you can reenable it later
647 mce=TOLERANCELEVEL (number, see above)
648 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
649 mce=nobootlog Don't log MCEs from before booting. */
650 static int __init mcheck_enable(char *str)
654 if (!strcmp(str, "off"))
656 else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
657 mce_bootlog = str[0] == 'b';
658 else if (isdigit(str[0]))
659 get_option(&str, &tolerant);
661 printk("mce= argument %s ignored. Please use /sys", str);
665 __setup("nomce", mcheck_disable);
666 __setup("mce", mcheck_enable);
672 /* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
673 Only one CPU is active at this time, the others get readded later using
675 static int mce_resume(struct sys_device *dev)
681 /* Reinit MCEs after user configuration changes */
682 static void mce_restart(void)
685 cancel_delayed_work(&mcheck_work);
686 /* Timer race is harmless here */
687 on_each_cpu(mce_init, NULL, 1, 1);
688 next_interval = check_interval * HZ;
690 schedule_delayed_work(&mcheck_work, next_interval);
693 static struct sysdev_class mce_sysclass = {
694 .resume = mce_resume,
695 set_kset_name("machinecheck"),
698 DEFINE_PER_CPU(struct sys_device, device_mce);
700 /* Why are there no generic functions for this? */
701 #define ACCESSOR(name, var, start) \
702 static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
703 return sprintf(buf, "%lx\n", (unsigned long)var); \
705 static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
707 unsigned long new = simple_strtoul(buf, &end, 0); \
708 if (end == buf) return -EINVAL; \
713 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
715 /* TBD should generate these dynamically based on number of available banks */
716 ACCESSOR(bank0ctl,bank[0],mce_restart())
717 ACCESSOR(bank1ctl,bank[1],mce_restart())
718 ACCESSOR(bank2ctl,bank[2],mce_restart())
719 ACCESSOR(bank3ctl,bank[3],mce_restart())
720 ACCESSOR(bank4ctl,bank[4],mce_restart())
721 ACCESSOR(bank5ctl,bank[5],mce_restart())
723 static ssize_t show_trigger(struct sys_device *s, char *buf)
725 strcpy(buf, trigger);
727 return strlen(trigger) + 1;
730 static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
734 strncpy(trigger, buf, sizeof(trigger));
735 trigger[sizeof(trigger)-1] = 0;
736 len = strlen(trigger);
737 p = strchr(trigger, '\n');
742 static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
743 ACCESSOR(tolerant,tolerant,)
744 ACCESSOR(check_interval,check_interval,mce_restart())
745 static struct sysdev_attribute *mce_attributes[] = {
746 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
747 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
748 &attr_tolerant, &attr_check_interval, &attr_trigger,
752 /* Per cpu sysdev init. All of the cpus still share the same ctl bank */
753 static __cpuinit int mce_create_device(unsigned int cpu)
757 if (!mce_available(&cpu_data[cpu]))
760 per_cpu(device_mce,cpu).id = cpu;
761 per_cpu(device_mce,cpu).cls = &mce_sysclass;
763 err = sysdev_register(&per_cpu(device_mce,cpu));
766 for (i = 0; mce_attributes[i]; i++)
767 sysdev_create_file(&per_cpu(device_mce,cpu),
773 static void mce_remove_device(unsigned int cpu)
777 for (i = 0; mce_attributes[i]; i++)
778 sysdev_remove_file(&per_cpu(device_mce,cpu),
780 sysdev_unregister(&per_cpu(device_mce,cpu));
781 memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
784 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
786 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
788 unsigned int cpu = (unsigned long)hcpu;
792 case CPU_ONLINE_FROZEN:
793 mce_create_device(cpu);
796 case CPU_DEAD_FROZEN:
797 mce_remove_device(cpu);
803 static struct notifier_block mce_cpu_notifier = {
804 .notifier_call = mce_cpu_callback,
807 static __init int mce_init_device(void)
812 if (!mce_available(&boot_cpu_data))
814 err = sysdev_class_register(&mce_sysclass);
816 for_each_online_cpu(i) {
817 mce_create_device(i);
820 register_hotcpu_notifier(&mce_cpu_notifier);
821 misc_register(&mce_log_device);
825 device_initcall(mce_init_device);