x86, UV: Fix NMI handler for UV platforms
author Jack Steiner <steiner@sgi.com>
Mon, 9 May 2011 16:35:19 +0000 (11:35 -0500)
committer Ingo Molnar <mingo@elte.hu>
Tue, 10 May 2011 07:26:55 +0000 (09:26 +0200)
This fixes problems seen on UV systems when handling NMIs from the
node controller.

I traced the "dazed..." messages that I saw earlier to a bug in
the BMC on our platform: it was sending NMIs without properly setting
the register that indicates the source of the NMI.

So rather than _assuming_ any unhandled NMI came from the UV system
maintenance console (SMC), add a check to verify that the SMC actually
sent the NMI.
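
In outline, the new check reads a scratch MMR that the BMC sets
before raising the NMI, and acks it through the MMR's clear alias.
A minimal sketch of the logic in the x2apic_uv_x.c hunk below
(uv_read_local_mmr()/uv_write_local_mmr() are the existing UV MMR
accessors; the locking and per-blade bookkeeping are omitted):

	/* Bit 63 of UVH_SCRATCH5 is set by the BMC before it sends an NMI */
	if (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK) {
		/* ack it: the alias at UVH_NMI_MMR_CLEAR is write-1-to-clear */
		uv_write_local_mmr(UVH_NMI_MMR_CLEAR, UV_NMI_PENDING_MASK);
		/* ... this really was an SMC/BMC NMI, handle it ... */
	}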

Signed-off-by: Jack Steiner <steiner@sgi.com>
Cc: gorcunov@gmail.com
Cc: dzickus@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
arch/x86/include/asm/uv/uv_hub.h
arch/x86/include/asm/uv/uv_mmrs.h
arch/x86/kernel/apic/x2apic_uv_x.c

arch/x86/include/asm/uv/uv_hub.h
index a501741c2335a854fa43d14f73761d8bf8f9c82e..4298002d0c83a9c09c19c725c22a33e43921d128 100644
@@ -398,6 +398,8 @@ struct uv_blade_info {
        unsigned short  nr_online_cpus;
        unsigned short  pnode;
        short           memory_nid;
+       spinlock_t      nmi_lock;
+       unsigned long   nmi_count;
 };
 extern struct uv_blade_info *uv_blade_info;
 extern short *uv_node_to_blade;
arch/x86/include/asm/uv/uv_mmrs.h
index 20cafeac7455594d73a3bc6837da0a34fcdee3eb..f5bb64a823d77a1d3ebf97d40e80e913c83e0d31 100644
@@ -5,7 +5,7 @@
  *
  * SGI UV MMR definitions
  *
- * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2007-2011 Silicon Graphics, Inc. All rights reserved.
  */
 
 #ifndef _ASM_X86_UV_UV_MMRS_H
@@ -1099,5 +1099,19 @@ union uvh_rtc1_int_config_u {
     } s;
 };
 
+/* ========================================================================= */
+/*                               UVH_SCRATCH5                                */
+/* ========================================================================= */
+#define UVH_SCRATCH5 0x2d0200UL
+#define UVH_SCRATCH5_32 0x00778
+
+#define UVH_SCRATCH5_SCRATCH5_SHFT 0
+#define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL
+union uvh_scratch5_u {
+    unsigned long      v;
+    struct uvh_scratch5_s {
+       unsigned long   scratch5 : 64;  /* RW, W1CS */
+    } s;
+};
 
 #endif /* _ASM_X86_UV_UV_MMRS_H */
arch/x86/kernel/apic/x2apic_uv_x.c
index 33b10a0fc095b9d458f851b20e4c5dda37cfc9ed..7acd2d2ac965f9932f39986ab1fe23361c6de90e 100644
 #include <asm/smp.h>
 #include <asm/x86_init.h>
 #include <asm/emergency-restart.h>
+#include <asm/nmi.h>
+
+/* The BMC sets a bit in this MMR before it sends an NMI */
+#define UVH_NMI_MMR                            UVH_SCRATCH5
+#define UVH_NMI_MMR_CLEAR                      (UVH_NMI_MMR + 8)
+#define UV_NMI_PENDING_MASK                    (1UL << 63)
+DEFINE_PER_CPU(unsigned long, cpu_last_nmi_count);
 
 DEFINE_PER_CPU(int, x2apic_extra_bits);
 
@@ -642,18 +649,46 @@ void __cpuinit uv_cpu_init(void)
  */
 int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
 {
+       unsigned long real_uv_nmi;
+       int bid;
+
        if (reason != DIE_NMIUNKNOWN)
                return NOTIFY_OK;
 
        if (in_crash_kexec)
                /* do nothing if entering the crash kernel */
                return NOTIFY_OK;
+
        /*
-        * Use a lock so only one cpu prints at a time
-        * to prevent intermixed output.
+        * Each blade has an MMR that indicates when an NMI has been sent
+        * to cpus on the blade. If an NMI is detected, atomically
+        * clear the MMR and update a per-blade NMI count used to
+        * cause each cpu on the blade to notice a new NMI.
+        */
+       bid = uv_numa_blade_id();
+       real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK);
+
+       if (unlikely(real_uv_nmi)) {
+               spin_lock(&uv_blade_info[bid].nmi_lock);
+               real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK);
+               if (real_uv_nmi) {
+                       uv_blade_info[bid].nmi_count++;
+                       uv_write_local_mmr(UVH_NMI_MMR_CLEAR, UV_NMI_PENDING_MASK);
+               }
+               spin_unlock(&uv_blade_info[bid].nmi_lock);
+       }
+
+       if (likely(__get_cpu_var(cpu_last_nmi_count) == uv_blade_info[bid].nmi_count))
+               return NOTIFY_DONE;
+
+       __get_cpu_var(cpu_last_nmi_count) = uv_blade_info[bid].nmi_count;
+
+       /*
+        * Use a lock so only one cpu prints at a time.
+        * This prevents intermixed output.
         */
        spin_lock(&uv_nmi_lock);
-       pr_info("NMI stack dump cpu %u:\n", smp_processor_id());
+       pr_info("UV NMI stack dump cpu %u:\n", smp_processor_id());
        dump_stack();
        spin_unlock(&uv_nmi_lock);
 
@@ -661,7 +696,8 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
 }
 
 static struct notifier_block uv_dump_stack_nmi_nb = {
-       .notifier_call  = uv_handle_nmi
+       .notifier_call  = uv_handle_nmi,
+       .priority = NMI_LOCAL_LOW_PRIOR - 1,
 };
 
 void uv_register_nmi_notifier(void)
@@ -720,8 +756,9 @@ void __init uv_system_init(void)
        printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades());
 
        bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
-       uv_blade_info = kmalloc(bytes, GFP_KERNEL);
+       uv_blade_info = kzalloc(bytes, GFP_KERNEL);
        BUG_ON(!uv_blade_info);
+
        for (blade = 0; blade < uv_num_possible_blades(); blade++)
                uv_blade_info[blade].memory_nid = -1;
 
@@ -747,6 +784,7 @@ void __init uv_system_init(void)
                        uv_blade_info[blade].pnode = pnode;
                        uv_blade_info[blade].nr_possible_cpus = 0;
                        uv_blade_info[blade].nr_online_cpus = 0;
+                       spin_lock_init(&uv_blade_info[blade].nmi_lock);
                        max_pnode = max(pnode, max_pnode);
                        blade++;
                }