From d0a9964e98731c708500a2e712f28f9d39183647 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 12 Sep 2015 21:51:10 -0500 Subject: [PATCH] x86/platform/uv: Implement simple dump failover if kdump fails The ability to trigger a kdump using the system NMI command was added by commit 12ba6c990fab ("x86/UV: Add kdump to UV NMI handler") Author: Mike Travis Date: Mon Sep 23 16:25:03 2013 -0500 This is useful because when kdump is working the information gathered is more informative than the original per-CPU stack traces or "dump" option. However, a number of things can go wrong with kdump and then the stack traces are more useful than nothing. The two most common reasons for kdump to not be available are: 1) if a problem occurs during boot before the kdump service is started, or 2) the kdump daemon failed to start. In either case the call to crash_kexec() returns unexpectedly. When this happens uv_nmi_kdump() also sets the uv_nmi_kexec_failed flag which causes the slave CPUs to also return to the NMI handler. Upon this unexpected return to the NMI handler, the NMI handler will revert to the "dump" action which uses show_regs() to obtain a process trace dump for all the CPUs. Other minor changes: The "dump" action now generates both the show_regs() stack trace and show instruction pointer information, whereas the "ips" action only shows instruction pointers for non-idle CPUs. This is more like an abbreviated "ps" display. Change printk(KERN_DEFAULT...) 
--> pr_info() Signed-off-by: Mike Travis Signed-off-by: George Beshers Cc: Alex Thorlton Cc: Dimitri Sivanich Cc: Hedi Berriche Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Russ Anderson Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/platform/uv/uv_nmi.c | 53 ++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index 5c9f63fa6abf..327f21c3bde1 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -376,38 +376,42 @@ static void uv_nmi_wait(int master) atomic_read(&uv_nmi_cpus_in_nmi), num_online_cpus()); } +/* Dump Instruction Pointer header */ static void uv_nmi_dump_cpu_ip_hdr(void) { - printk(KERN_DEFAULT - "\nUV: %4s %6s %-32s %s (Note: PID 0 not listed)\n", + pr_info("\nUV: %4s %6s %-32s %s (Note: PID 0 not listed)\n", "CPU", "PID", "COMMAND", "IP"); } +/* Dump Instruction Pointer info */ static void uv_nmi_dump_cpu_ip(int cpu, struct pt_regs *regs) { - printk(KERN_DEFAULT "UV: %4d %6d %-32.32s ", - cpu, current->pid, current->comm); - + pr_info("UV: %4d %6d %-32.32s ", cpu, current->pid, current->comm); printk_address(regs->ip); } -/* Dump this cpu's state */ +/* + * Dump this CPU's state. If action was set to "kdump" and the crash_kexec + * failed, then we provide "dump" as an alternate action. Action "dump" now + * also includes the show "ips" (instruction pointers) action whereas the + * action "ips" only displays instruction pointers for the non-idle CPU's. + * This is an abbreviated form of the "ps" command. + */ static void uv_nmi_dump_state_cpu(int cpu, struct pt_regs *regs) { const char *dots = " ................................. 
"; - if (uv_nmi_action_is("ips")) { - if (cpu == 0) - uv_nmi_dump_cpu_ip_hdr(); + if (cpu == 0) + uv_nmi_dump_cpu_ip_hdr(); - if (current->pid != 0) - uv_nmi_dump_cpu_ip(cpu, regs); + if (current->pid != 0 || !uv_nmi_action_is("ips")) + uv_nmi_dump_cpu_ip(cpu, regs); - } else if (uv_nmi_action_is("dump")) { - printk(KERN_DEFAULT - "UV:%sNMI process trace for CPU %d\n", dots, cpu); + if (uv_nmi_action_is("dump")) { + pr_info("UV:%sNMI process trace for CPU %d\n", dots, cpu); show_regs(regs); } + this_cpu_write(uv_cpu_nmi.state, UV_NMI_STATE_DUMP_DONE); } @@ -469,8 +473,7 @@ static void uv_nmi_dump_state(int cpu, struct pt_regs *regs, int master) uv_nmi_trigger_dump(tcpu); } if (ignored) - printk(KERN_DEFAULT "UV: %d CPUs ignored NMI\n", - ignored); + pr_alert("UV: %d CPUs ignored NMI\n", ignored); console_loglevel = saved_console_loglevel; pr_alert("UV: process trace complete\n"); @@ -492,8 +495,9 @@ static void uv_nmi_touch_watchdogs(void) touch_nmi_watchdog(); } -#if defined(CONFIG_KEXEC_CORE) static atomic_t uv_nmi_kexec_failed; + +#if defined(CONFIG_KEXEC_CORE) static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) { /* Call crash to dump system state */ @@ -502,10 +506,9 @@ static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) crash_kexec(regs); pr_emerg("UV: crash_kexec unexpectedly returned, "); + atomic_set(&uv_nmi_kexec_failed, 1); if (!kexec_crash_image) { pr_cont("crash kernel not loaded\n"); - atomic_set(&uv_nmi_kexec_failed, 1); - uv_nmi_sync_exit(1); return; } pr_cont("kexec busy, stalling cpus while waiting\n"); @@ -514,9 +517,6 @@ static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) /* If crash exec fails the slaves should return, otherwise stall */ while (atomic_read(&uv_nmi_kexec_failed) == 0) mdelay(10); - - /* Crash kernel most likely not loaded, return in an orderly fashion */ - uv_nmi_sync_exit(0); } #else /* !CONFIG_KEXEC_CORE */ @@ -524,6 +524,7 @@ static inline void uv_nmi_kdump(int cpu, int master, 
struct pt_regs *regs) { if (master) pr_err("UV: NMI kdump: KEXEC not supported in this kernel\n"); + atomic_set(&uv_nmi_kexec_failed, 1); } #endif /* !CONFIG_KEXEC_CORE */ @@ -613,9 +614,14 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) master = (atomic_read(&uv_nmi_cpu) == cpu); /* If NMI action is "kdump", then attempt to do it */ - if (uv_nmi_action_is("kdump")) + if (uv_nmi_action_is("kdump")) { uv_nmi_kdump(cpu, master, regs); + /* Unexpected return, revert action to "dump" */ + if (master) + strncpy(uv_nmi_action, "dump", strlen(uv_nmi_action)); + } + /* Pause as all cpus enter the NMI handler */ uv_nmi_wait(master); @@ -640,6 +646,7 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) atomic_set(&uv_nmi_cpus_in_nmi, -1); atomic_set(&uv_nmi_cpu, -1); atomic_set(&uv_in_nmi, 0); + atomic_set(&uv_nmi_kexec_failed, 0); } uv_nmi_touch_watchdogs(); -- 2.20.1