mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-12 16:58:53 +00:00
x86/mce: Handle broadcasted MCE gracefully with kexec
When we are about to kexec a crash kernel and a broadcasted MCE fires at just that moment, while we're still in the first kernel and the other CPUs remain in a holding pattern, the #MC handler of the first kernel will time out and then panic because MCE synchronization never completes. Handle this in the same way as a broadcasted MCE that arrives while CPUs are offline. [ Boris: rewrote commit message and comments. ] Suggested-by: Borislav Petkov <bp@alien8.de> Signed-off-by: Xunlei Pang <xlpang@redhat.com> Signed-off-by: Borislav Petkov <bp@suse.de> Acked-by: Tony Luck <tony.luck@intel.com> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: kexec@lists.infradead.org Cc: linux-edac <linux-edac@vger.kernel.org> Link: http://lkml.kernel.org/r/1487857012-9059-1-git-send-email-xlpang@redhat.com Link: http://lkml.kernel.org/r/20170313095019.19351-1-bp@alien8.de Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
This commit is contained in:
parent
4495c08e84
commit
5bc329503e
@ -15,6 +15,7 @@ struct machine_ops {
|
||||
};
|
||||
|
||||
extern struct machine_ops machine_ops;
|
||||
extern int crashing_cpu;
|
||||
|
||||
void native_machine_crash_shutdown(struct pt_regs *regs);
|
||||
void native_machine_shutdown(void);
|
||||
|
@ -49,6 +49,7 @@
|
||||
#include <asm/tlbflush.h>
|
||||
#include <asm/mce.h>
|
||||
#include <asm/msr.h>
|
||||
#include <asm/reboot.h>
|
||||
|
||||
#include "mce-internal.h"
|
||||
|
||||
@ -1127,9 +1128,22 @@ void do_machine_check(struct pt_regs *regs, long error_code)
|
||||
* on Intel.
|
||||
*/
|
||||
int lmce = 1;
|
||||
int cpu = smp_processor_id();
|
||||
|
||||
/* If this CPU is offline, just bail out. */
|
||||
if (cpu_is_offline(smp_processor_id())) {
|
||||
/*
|
||||
* Cases where we avoid rendezvous handler timeout:
|
||||
* 1) If this CPU is offline.
|
||||
*
|
||||
* 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
|
||||
* skip those CPUs which remain looping in the 1st kernel - see
|
||||
* crash_nmi_callback().
|
||||
*
|
||||
* Note: there still is a small window between kexec-ing and the new,
|
||||
* kdump kernel establishing a new #MC handler where a broadcasted MCE
|
||||
* might not get handled properly.
|
||||
*/
|
||||
if (cpu_is_offline(cpu) ||
|
||||
(crashing_cpu != -1 && crashing_cpu != cpu)) {
|
||||
u64 mcgstatus;
|
||||
|
||||
mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
|
||||
|
@ -765,10 +765,11 @@ void machine_crash_shutdown(struct pt_regs *regs)
|
||||
#endif
|
||||
|
||||
|
||||
/* This is the CPU performing the emergency shutdown work. */
|
||||
int crashing_cpu = -1;
|
||||
|
||||
#if defined(CONFIG_SMP)
|
||||
|
||||
/* This keeps a track of which one is crashing cpu. */
|
||||
static int crashing_cpu;
|
||||
static nmi_shootdown_cb shootdown_callback;
|
||||
|
||||
static atomic_t waiting_for_crash_ipi;
|
||||
|
Loading…
x
Reference in New Issue
Block a user