EDAC/mce_amd: Add support for FRU text in MCA

A new "FRU Text in MCA" feature is defined where the Field Replaceable
Unit (FRU) Text for a device is represented by a string in the new
MCA_SYND1 and MCA_SYND2 registers. This feature is supported per MCA
bank, and it is advertised by the McaFruTextInMca bit (MCA_CONFIG[9]).

The FRU Text is populated dynamically for each individual error state
(MCA_STATUS, MCA_ADDR, et al.). This handles the case where an MCA bank
covers multiple devices, for example, a Unified Memory Controller (UMC)
bank that manages two DIMMs.

  [ Yazen: Add Avadhut as co-developer for wrapper changes. ]
  [ bp: Do not expose MCA_CONFIG to userspace yet. ]

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Co-developed-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20241022194158.110073-6-avadhut.naik@amd.com
This commit is contained in:
Yazen Ghannam 2024-10-22 19:36:31 +00:00 committed by Borislav Petkov (AMD)
parent e9876dafa2
commit 612c2addff
2 changed files with 13 additions and 6 deletions

View File

@ -61,6 +61,7 @@
* - TCC bit is present in MCx_STATUS. * - TCC bit is present in MCx_STATUS.
*/ */
#define MCI_CONFIG_MCAX 0x1 #define MCI_CONFIG_MCAX 0x1
#define MCI_CONFIG_FRUTEXT BIT_ULL(9)
#define MCI_IPID_MCATYPE 0xFFFF0000 #define MCI_IPID_MCATYPE 0xFFFF0000
#define MCI_IPID_HWID 0xFFF #define MCI_IPID_HWID 0xFFF

View File

@ -795,6 +795,7 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
struct mce *m = (struct mce *)data; struct mce *m = (struct mce *)data;
struct mce_hw_err *err = to_mce_hw_err(m); struct mce_hw_err *err = to_mce_hw_err(m);
unsigned int fam = x86_family(m->cpuid); unsigned int fam = x86_family(m->cpuid);
u32 mca_config_lo = 0, dummy;
int ecc; int ecc;
if (m->kflags & MCE_HANDLED_CEC) if (m->kflags & MCE_HANDLED_CEC)
@ -814,11 +815,9 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
((m->status & MCI_STATUS_PCC) ? "PCC" : "-")); ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"));
if (boot_cpu_has(X86_FEATURE_SMCA)) { if (boot_cpu_has(X86_FEATURE_SMCA)) {
u32 low, high; rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(m->bank), &mca_config_lo, &dummy);
u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
if (!rdmsr_safe(addr, &low, &high) && if (mca_config_lo & MCI_CONFIG_MCAX)
(low & MCI_CONFIG_MCAX))
pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-")); pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-")); pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
@ -853,8 +852,15 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
if (m->status & MCI_STATUS_SYNDV) { if (m->status & MCI_STATUS_SYNDV) {
pr_cont(", Syndrome: 0x%016llx\n", m->synd); pr_cont(", Syndrome: 0x%016llx\n", m->synd);
pr_emerg(HW_ERR "Syndrome1: 0x%016llx, Syndrome2: 0x%016llx", if (mca_config_lo & MCI_CONFIG_FRUTEXT) {
err->vendor.amd.synd1, err->vendor.amd.synd2); char frutext[17];
frutext[16] = '\0';
memcpy(&frutext[0], &err->vendor.amd.synd1, 8);
memcpy(&frutext[8], &err->vendor.amd.synd2, 8);
pr_emerg(HW_ERR "FRU Text: %s", frutext);
}
} }
pr_cont("\n"); pr_cont("\n");