From 1a3ea611fc105759cf1023e922d00cf44569b8bb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 16 Dec 2022 15:57:51 -0800 Subject: [PATCH] x86/nmi: Accumulate NMI-progress evidence in exc_nmi() CPUs ignoring NMIs is often a sign of those CPUs going bad, but there are quite a few other reasons why a CPU might ignore NMIs. Therefore, accumulate evidence within exc_nmi() as to what might be preventing a given CPU from responding to an NMI. [ paulmck: Apply Peter Zijlstra feedback. ] Signed-off-by: Paul E. McKenney Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Reviewed-by: Ingo Molnar --- arch/x86/kernel/nmi.c | 35 ++++++++++++++++++++++++++++++++++- lib/Kconfig.debug | 11 +++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index cec0bfa3bc04..701ff27a77ec 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -69,6 +69,15 @@ struct nmi_stats { unsigned int unknown; unsigned int external; unsigned int swallow; + unsigned long recv_jiffies; + unsigned long idt_seq; + unsigned long idt_nmi_seq; + unsigned long idt_ignored; + atomic_long_t idt_calls; + unsigned long idt_seq_snap; + unsigned long idt_nmi_seq_snap; + unsigned long idt_ignored_snap; + long idt_calls_snap; }; static DEFINE_PER_CPU(struct nmi_stats, nmi_stats); @@ -479,12 +488,15 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7); DEFINE_IDTENTRY_RAW(exc_nmi) { irqentry_state_t irq_state; + struct nmi_stats *nsp = this_cpu_ptr(&nmi_stats); /* * Re-enable NMIs right here when running as an SEV-ES guest. This might * cause nested NMIs, but those can be handled safely. */ sev_es_nmi_complete(); + if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) + arch_atomic_long_inc(&nsp->idt_calls); if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id())) return; @@ -495,6 +507,11 @@ DEFINE_IDTENTRY_RAW(exc_nmi) } this_cpu_write(nmi_state, NMI_EXECUTING); this_cpu_write(nmi_cr2, read_cr2()); + if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) { + WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1); + WARN_ON_ONCE(!(nsp->idt_seq & 0x1)); + WRITE_ONCE(nsp->recv_jiffies, jiffies); + } nmi_restart: /* @@ -509,8 +526,19 @@ DEFINE_IDTENTRY_RAW(exc_nmi) inc_irq_stat(__nmi_count); - if (!ignore_nmis) + if (IS_ENABLED(CONFIG_NMI_CHECK_CPU) && ignore_nmis) { + WRITE_ONCE(nsp->idt_ignored, nsp->idt_ignored + 1); + } else if (!ignore_nmis) { + if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) { + WRITE_ONCE(nsp->idt_nmi_seq, nsp->idt_nmi_seq + 1); + WARN_ON_ONCE(!(nsp->idt_nmi_seq & 0x1)); + } default_do_nmi(regs); + if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) { + WRITE_ONCE(nsp->idt_nmi_seq, nsp->idt_nmi_seq + 1); + WARN_ON_ONCE(nsp->idt_nmi_seq & 0x1); + } + } irqentry_nmi_exit(regs, irq_state); @@ -525,6 +553,11 @@ DEFINE_IDTENTRY_RAW(exc_nmi) if (user_mode(regs)) mds_user_clear_cpu_buffers(); + if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) { + WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1); + WARN_ON_ONCE(nsp->idt_seq & 0x1); + WRITE_ONCE(nsp->recv_jiffies, jiffies); + } } #if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 881c3f84e88a..dad99197d269 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1552,6 +1552,17 @@ config TRACE_IRQFLAGS_NMI depends on TRACE_IRQFLAGS depends on TRACE_IRQFLAGS_NMI_SUPPORT +config NMI_CHECK_CPU + bool "Debugging for CPUs failing to respond to backtrace requests" + depends on DEBUG_KERNEL + depends on X86 + default n + help + Enables debug prints when a CPU fails to respond to a given + backtrace NMI. These prints provide some reasons why a CPU + might legitimately be failing to respond, for example, if it + is offline of if ignore_nmis is set. + config DEBUG_IRQFLAGS bool "Debug IRQ flag manipulation" help