x86, mce: switch x86 machine check handler to Monarch election.

author Andi Kleen <andi@firstfloor.org>

Wed, 27 May 2009 19:56:55 +0000 (21:56 +0200)

committer H. Peter Anvin <hpa@zytor.com>

Wed, 3 Jun 2009 21:45:12 +0000 (14:45 -0700)
author Andi Kleen <andi@firstfloor.org>
Wed, 27 May 2009 19:56:55 +0000 (21:56 +0200)
committer H. Peter Anvin <hpa@zytor.com>
Wed, 3 Jun 2009 21:45:12 +0000 (14:45 -0700)
diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt

index 63fca71..0ee5e3b 100644 (file)
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -15,13 +15,17 @@ Machine check
                 in a reboot. On Intel systems it is enabled by default.
     mce=nobootlog
                 Disable boot machine check logging.
-   mce=tolerancelevel (number)
+   mce=tolerancelevel[,monarchtimeout] (number,number)
+               tolerance levels:
                 0: always panic on uncorrected errors, log corrected errors
                 1: panic or SIGBUS on uncorrected errors, log corrected errors
                 2: SIGBUS or log uncorrected errors, log corrected errors
                 3: never panic or SIGBUS, log all errors (for testing only)
                 Default is 1
                 Can be also set using sysfs which is preferable.
+               monarchtimeout:
+               Sets the time in microseconds to wait for other CPUs on
+               machine checks. 0 disables waiting.
  
     nomce (for compatibility with i386): same as mce=off
  
diff --git a/Documentation/x86/x86_64/machinecheck b/Documentation/x86/x86_64/machinecheck

index a4fdb25..b1fb302 100644 (file)
--- a/Documentation/x86/x86_64/machinecheck
+++ b/Documentation/x86/x86_64/machinecheck
@@ -69,6 +69,10 @@ trigger
         Program to run when a machine check event is detected.
         This is an alternative to running mcelog regularly from cron
         and allows to detect events faster.
+monarch_timeout
+       How long to wait for the other CPUs to raise a machine check too
+       on an exception. 0 to disable waiting for other CPUs.
+       Unit: us
  
  TBD document entries for AMD threshold interrupt configuration
  
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c

index 421020f..ba43189 100644 (file)
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -21,6 +21,7 @@
  #include <linux/percpu.h>
  #include <linux/string.h>
  #include <linux/sysdev.h>
+#include <linux/delay.h>
  #include <linux/ctype.h>
  #include <linux/sched.h>
  #include <linux/sysfs.h>
@@ -28,6 +29,7 @@
  #include <linux/init.h>
  #include <linux/kmod.h>
  #include <linux/poll.h>
+#include <linux/nmi.h>
  #include <linux/cpu.h>
  #include <linux/smp.h>
  #include <linux/fs.h>
@@ -60,6 +62,8 @@ int                           mce_disabled;
  
  #define MISC_MCELOG_MINOR      227
  
+#define SPINUNIT 100   /* 100ns */
+
  atomic_t mce_entry;
  
  DEFINE_PER_CPU(unsigned, mce_exception_count);
@@ -77,6 +81,7 @@ static u64                    *bank;
  static unsigned long           notify_user;
  static int                     rip_msr;
  static int                     mce_bootlog = -1;
+static int                     monarch_timeout = -1;
  
  static char                    trigger[128];
  static char                    *trigger_argv[2] = { trigger, NULL };
@@ -84,6 +89,9 @@ static char                   *trigger_argv[2] = { trigger, NULL };
  static unsigned long           dont_init_banks;
  
  static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
+static DEFINE_PER_CPU(struct mce, mces_seen);
+static int                     cpu_missing;
+
  
  /* MCA banks polled by the period polling timer for corrected events */
  DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
@@ -241,6 +249,8 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
         }
         if (final)
                 print_mce(final);
+       if (cpu_missing)
+               printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
         if (exp)
                 printk(KERN_EMERG "Machine check: %s\n", exp);
         panic(msg);
@@ -451,18 +461,287 @@ static int mce_no_way_out(struct mce *m, char **msg)
  }
  
  /*
+ * Variable to establish order between CPUs while scanning.
+ * Each CPU spins initially until mce_executing reaches its number.
+ */
+static atomic_t mce_executing;
+
+/*
+ * Defines order of CPUs on entry. First CPU becomes Monarch.
+ */
+static atomic_t mce_callin;
+
+/*
+ * Check if a timeout waiting for other CPUs happened.
+ */
+static int mce_timed_out(u64 *t)
+{
+       /*
+        * The others already did panic for some reason.
+        * Bail out like in a timeout.
+        * rmb() to tell the compiler that mce_paniced
+        * might have been modified by someone else.
+        */
+       rmb();
+       if (atomic_read(&mce_paniced))
+               wait_for_panic();
+       if (!monarch_timeout)
+               goto out;
+       if ((s64)*t < SPINUNIT) {
+               /* CHECKME: Make panic default for 1 too? */
+               if (tolerant < 1)
+                       mce_panic("Timeout synchronizing machine check over CPUs",
+                                 NULL, NULL);
+               cpu_missing = 1;
+               return 1;
+       }
+       *t -= SPINUNIT;
+out:
+       touch_nmi_watchdog();
+       return 0;
+}
+
+/*
+ * The Monarch's reign.  The Monarch is the CPU who entered
+ * the machine check handler first. It waits for the others to
+ * raise the exception too and then grades them. When any
+ * error is fatal, panic. Only then let the others continue.
+ *
+ * The other CPUs entering the MCE handler will be controlled by the
+ * Monarch. They are called Subjects.
+ *
+ * This way we prevent any potential data corruption in an unrecoverable case
+ * and also make sure that all CPUs' errors are always examined.
+ *
+ * Also this detects the case of a machine check event coming from outer
+ * space (not detected by any CPU). In this case some external agent wants
+ * us to shut down, so panic too.
+ *
+ * The other CPUs might still decide to panic if the handler happens
+ * in an unrecoverable place, but in this case the system is in a semi-stable
+ * state and won't corrupt anything by itself. It's ok to let the others
+ * continue for a bit first.
+ *
+ * All the spin loops have timeouts; when a timeout happens a CPU
+ * typically elects itself to be Monarch.
+ */
+static void mce_reign(void)
+{
+       int cpu;
+       struct mce *m = NULL;
+       int global_worst = 0;
+       char *msg = NULL;
+       char *nmsg = NULL;
+
+       /*
+        * This CPU is the Monarch and the other CPUs have run
+        * through their handlers.
+        * Grade the severity of the errors of all the CPUs.
+        */
+       for_each_possible_cpu(cpu) {
+               int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
+                                           &nmsg);
+               if (severity > global_worst) {
+                       msg = nmsg;
+                       global_worst = severity;
+                       m = &per_cpu(mces_seen, cpu);
+               }
+       }
+
+       /*
+        * Cannot recover? Panic here then.
+        * This dumps all the mces in the log buffer and stops the
+        * other CPUs.
+        */
+       if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
+               mce_panic("Fatal machine check", m, msg);
+
+       /*
+        * For UC somewhere we let the CPU that detects it handle it.
+        * Also must let the others continue, otherwise the handling
+        * CPU could deadlock on a lock.
+        */
+
+       /*
+        * No machine check event found. Must be some external
+        * source or one CPU is hung. Panic.
+        */
+       if (!m && tolerant < 3)
+               mce_panic("Machine check from unknown source", NULL, NULL);
+
+       /*
+        * Now clear all the mces_seen so that they don't reappear on
+        * the next mce.
+        */
+       for_each_possible_cpu(cpu)
+               memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
+}
+
+static atomic_t global_nwo;
+
+/*
+ * Start of Monarch synchronization. This waits until all CPUs have
+ * entered the exception handler and then determines if any of them
+ * saw a fatal event that requires panic. Then it executes them
+ * in the entry order.
+ * TBD double check parallel CPU hotunplug
+ */
+static int mce_start(int no_way_out, int *order)
+{
+       int nwo;
+       int cpus = num_online_cpus();
+       u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
+
+       if (!timeout) {
+               *order = -1;
+               return no_way_out;
+       }
+
+       atomic_add(no_way_out, &global_nwo);
+
+       /*
+        * Wait for everyone.
+        */
+       while (atomic_read(&mce_callin) != cpus) {
+               if (mce_timed_out(&timeout)) {
+                       atomic_set(&global_nwo, 0);
+                       *order = -1;
+                       return no_way_out;
+               }
+               ndelay(SPINUNIT);
+       }
+
+       /*
+        * Cache the global no_way_out state.
+        */
+       nwo = atomic_read(&global_nwo);
+
+       /*
+        * Monarch starts executing now, the others wait.
+        */
+       if (*order == 1) {
+               atomic_set(&mce_executing, 1);
+               return nwo;
+       }
+
+       /*
+        * Now start the scanning loop one by one
+        * in the original callin order.
+        * This way when there are any shared banks it will
+        * be only seen by one CPU before cleared, avoiding duplicates.
+        */
+       while (atomic_read(&mce_executing) < *order) {
+               if (mce_timed_out(&timeout)) {
+                       atomic_set(&global_nwo, 0);
+                       *order = -1;
+                       return no_way_out;
+               }
+               ndelay(SPINUNIT);
+       }
+       return nwo;
+}
+
+/*
+ * Synchronize between CPUs after main scanning loop.
+ * This invokes the bulk of the Monarch processing.
+ */
+static int mce_end(int order)
+{
+       int ret = -1;
+       u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
+
+       if (!timeout)
+               goto reset;
+       if (order < 0)
+               goto reset;
+
+       /*
+        * Allow others to run.
+        */
+       atomic_inc(&mce_executing);
+
+       if (order == 1) {
+               /* CHECKME: Can this race with a parallel hotplug? */
+               int cpus = num_online_cpus();
+
+               /*
+                * Monarch: Wait for everyone to go through their scanning
+                * loops.
+                */
+               while (atomic_read(&mce_executing) <= cpus) {
+                       if (mce_timed_out(&timeout))
+                               goto reset;
+                       ndelay(SPINUNIT);
+               }
+
+               mce_reign();
+               barrier();
+               ret = 0;
+       } else {
+               /*
+                * Subject: Wait for Monarch to finish.
+                */
+               while (atomic_read(&mce_executing) != 0) {
+                       if (mce_timed_out(&timeout))
+                               goto reset;
+                       ndelay(SPINUNIT);
+               }
+
+               /*
+                * Don't reset anything. That's done by the Monarch.
+                */
+               return 0;
+       }
+
+       /*
+        * Reset all global state.
+        */
+reset:
+       atomic_set(&global_nwo, 0);
+       atomic_set(&mce_callin, 0);
+       barrier();
+
+       /*
+        * Let others run again.
+        */
+       atomic_set(&mce_executing, 0);
+       return ret;
+}
+
+static void mce_clear_state(unsigned long *toclear)
+{
+       int i;
+
+       for (i = 0; i < banks; i++) {
+               if (test_bit(i, toclear))
+                       mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+       }
+}
+
+/*
   * The actual machine check handler. This only handles real
   * exceptions when something got corrupted coming in through int 18.
   *
   * This is executed in NMI context not subject to normal locking rules. This
   * implies that most kernel services cannot be safely used. Don't even
   * think about putting a printk in there!
+ *
+ * On Intel systems this is entered on all CPUs in parallel through
+ * MCE broadcast. However some CPUs might be broken beyond repair,
+ * so always be careful when synchronizing with others.
   */
  void do_machine_check(struct pt_regs *regs, long error_code)
  {
-       struct mce m, panicm;
-       int panicm_found = 0;
+       struct mce m, *final;
         int i;
+       int worst = 0;
+       int severity;
+       /*
+        * Establish sequential order between the CPUs entering the machine
+        * check handler.
+        */
+       int order;
+
         /*
          * If no_way_out gets set, there is no safe way to recover from this
          * MCE.  If tolerant is cranked up, we'll try anyway.
@@ -486,13 +765,23 @@ void do_machine_check(struct pt_regs *regs, long error_code)
         if (!banks)
                 goto out;
  
+       order = atomic_add_return(1, &mce_callin);
         mce_setup(&m);
  
         m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
         no_way_out = mce_no_way_out(&m, &msg);
  
+       final = &__get_cpu_var(mces_seen);
+       *final = m;
+
         barrier();
  
+       /*
+        * Go through all the banks in exclusion of the other CPUs.
+        * This way we don't report duplicated events on shared banks
+        * because the first one to see it will clear it.
+        */
+       no_way_out = mce_start(no_way_out, &order);
         for (i = 0; i < banks; i++) {
                 __clear_bit(i, toclear);
                 if (!bank[i])
@@ -544,32 +833,32 @@ void do_machine_check(struct pt_regs *regs, long error_code)
                 mce_get_rip(&m, regs);
                 mce_log(&m);
  
-               /*
-                * Did this bank cause the exception?
-                *
-                * Assume that the bank with uncorrectable errors did it,
-                * and that there is only a single one:
-                */
-               if ((m.status & MCI_STATUS_UC) &&
-                                       (m.status & MCI_STATUS_EN)) {
-                       panicm = m;
-                       panicm_found = 1;
+               severity = mce_severity(&m, tolerant, NULL);
+               if (severity > worst) {
+                       *final = m;
+                       worst = severity;
                 }
         }
  
+       if (!no_way_out)
+               mce_clear_state(toclear);
+
         /*
-        * If we didn't find an uncorrectable error, pick
-        * the last one (shouldn't happen, just being safe).
+        * Do most of the synchronization with other CPUs.
+        * When there's any problem use only local no_way_out state.
          */
-       if (!panicm_found)
-               panicm = m;
+       if (mce_end(order) < 0)
+               no_way_out = worst >= MCE_PANIC_SEVERITY;
  
         /*
          * If we have decided that we just CAN'T continue, and the user
          * has not set tolerant to an insane level, give up and die.
+        *
+        * This is mainly used in the case when the system doesn't
+        * support MCE broadcasting or it has been disabled.
          */
         if (no_way_out && tolerant < 3)
-               mce_panic("Machine check", &panicm, msg);
+               mce_panic("Machine check", final, msg);
  
         /*
          * If the error seems to be unrecoverable, something should be
@@ -585,7 +874,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
                  * instruction which caused the MCE.
                  */
                 if (m.mcgstatus & MCG_STATUS_EIPV)
-                       user_space = panicm.ip && (panicm.cs & 3);
+                       user_space = final->ip && (final->cs & 3);
  
                 /*
                  * If we know that the error was in user space, send a
@@ -597,20 +886,15 @@ void do_machine_check(struct pt_regs *regs, long error_code)
                 if (user_space) {
                         force_sig(SIGBUS, current);
                 } else if (panic_on_oops || tolerant < 2) {
-                       mce_panic("Uncorrected machine check", &panicm, msg);
+                       mce_panic("Uncorrected machine check", final, msg);
                 }
         }
  
         /* notify userspace ASAP */
         set_thread_flag(TIF_MCE_NOTIFY);
  
-       mce_report_event(regs);
-
-       /* the last thing we do is clear state */
-       for (i = 0; i < banks; i++) {
-               if (test_bit(i, toclear))
-                       mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
-       }
+       if (worst > 0)
+               mce_report_event(regs);
         mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
  out:
         atomic_dec(&mce_entry);
@@ -821,7 +1105,17 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c)
  
                 if (c->x86 == 6 && c->x86_model < 0x1A)
                         __set_bit(0, &dont_init_banks);
+
+               /*
+                * All newer Intel systems support MCE broadcasting. Enable
+                * synchronization with a one second timeout.
+                */
+               if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
+                       monarch_timeout < 0)
+                       monarch_timeout = USEC_PER_SEC;
         }
+       if (monarch_timeout < 0)
+               monarch_timeout = 0;
  }
  
  static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
@@ -1068,7 +1362,9 @@ static struct miscdevice mce_log_device = {
  
  /*
   * mce=off disables machine check
- * mce=TOLERANCELEVEL (number, see above)
+ * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
+ *     monarchtimeout is how long to wait (in us) for other CPUs on
+ *     machine check, or 0 to not wait
   * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
   * mce=nobootlog Don't log MCEs from before booting.
   */
@@ -1082,9 +1378,13 @@ static int __init mcheck_enable(char *str)
                 mce_disabled = 1;
         else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
                 mce_bootlog = (str[0] == 'b');
-       else if (isdigit(str[0]))
+       else if (isdigit(str[0])) {
                 get_option(&str, &tolerant);
-       else {
+               if (*str == ',') {
+                       ++str;
+                       get_option(&str, &monarch_timeout);
+               }
+       } else {
                 printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
                        str);
                 return 0;
@@ -1221,6 +1521,7 @@ static ssize_t store_int_with_restart(struct sys_device *s,
  
  static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
  static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
+static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
  
  static struct sysdev_ext_attribute attr_check_interval = {
         _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
@@ -1230,6 +1531,7 @@ static struct sysdev_ext_attribute attr_check_interval = {
  
  static struct sysdev_attribute *mce_attrs[] = {
         &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
+       &attr_monarch_timeout.attr,
         NULL
  };
author	Andi Kleen <andi@firstfloor.org>
	Wed, 27 May 2009 19:56:55 +0000 (21:56 +0200)
committer	H. Peter Anvin <hpa@zytor.com>
	Wed, 3 Jun 2009 21:45:12 +0000 (14:45 -0700)
Documentation/x86/x86_64/boot-options.txt		patch \| blob \| history
Documentation/x86/x86_64/machinecheck		patch \| blob \| history
arch/x86/kernel/cpu/mcheck/mce.c		patch \| blob \| history