mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2024-12-28 16:56:26 +00:00
- Log and handle two new AMD-specific MCA registers: SYND1 and SYND2 and
report the Field Replaceable Unit text info reported through them - Add support for handling variable-sized SMCA BERT records - Add the capability for reporting vendor-specific RAS error info without adding vendor-specific fields to struct mce - Cleanups -----BEGIN PGP SIGNATURE----- iQIzBAABCgAdFiEEzv7L6UO9uDPlPSfHEsHwGGHeVUoFAmc7OlEACgkQEsHwGGHe VUpXihAAgVdZExo/1Rmbh6s/259BH38GP6fL+ePaT1SlUzNi770TY2b7I4OYlms4 xa9t8LAIVMrrIMIg6w6q8JN4YHAQoVdcbRBvHQYB1a24xtoyxaEJxLKQNLA1soUQ Jc9asWMHBuXnLfR/4S8Y2vWrzByOSwxqDBzQCu0Ryqvbg7vdRicNt+Hk9oHHIAYy cquZpoDGL3W6BA8sXONbEW/6rcQ33JsEQ+Ub4qr1q2g+kNwXrrFuXZlojmz2MxIs xgqeYKyrxK6heX0l8dSiipCATA+sOXXWWzbZtdPjFtDGzwIlV3p4yXN3fucrmHm1 4Fg1gW5a1V82Qosn0FbGiZPojsahhOE2k1bz+yEMDM3Sg2qeRWcK+V3jiS5zKzPd WWqUbRtcaxayoEsAXnWrxrp3vxhlUUf1Ivtgk8mlMjhHPLijV5iranrRj+XHEikR H0D3Vm0T1LHCPf9AUsbmo0GAfAOeO9DTAB9LJdKv+OJ4ESVgSPJW/9NKWLXKq41p hhs7seJTYNw8sp67cL23TnkSp3S+9kd2U7Od3T1kubtd4fVxVnlowu8Fc6kjqd8v n+GbdLxhX7GbOgnT0z2OG5Xmc1pNW1JtRbuxSK59NFNia7r6ZkR7BE/OCtL82Rfm u7i76z1O0lV91y93GMCyP9DYn8K1ceU7gVCveY6mx/AHgzc87d8= =djpG -----END PGP SIGNATURE----- Merge tag 'ras_core_for_v6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip Pull RAS updates from Borislav Petkov: - Log and handle twp new AMD-specific MCA registers: SYND1 and SYND2 and report the Field Replaceable Unit text info reported through them - Add support for handling variable-sized SMCA BERT records - Add the capability for reporting vendor-specific RAS error info without adding vendor-specific fields to struct mce - Cleanups * tag 'ras_core_for_v6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: EDAC/mce_amd: Add support for FRU text in MCA x86/mce/apei: Handle variable SMCA BERT record size x86/MCE/AMD: Add support for new MCA_SYND{1,2} registers tracing: Add __print_dynamic_array() helper x86/mce: Add wrapper for struct mce to export vendor specific info x86/mce/intel: Use MCG_BANKCNT_MASK instead of 0xff x86/mce/mcelog: Use xchg() to get and clear the flags
This commit is contained in:
commit
c1f2ffe207
@ -61,6 +61,7 @@
|
||||
* - TCC bit is present in MCx_STATUS.
|
||||
*/
|
||||
#define MCI_CONFIG_MCAX 0x1
|
||||
#define MCI_CONFIG_FRUTEXT BIT_ULL(9)
|
||||
#define MCI_IPID_MCATYPE 0xFFFF0000
|
||||
#define MCI_IPID_HWID 0xFFF
|
||||
|
||||
@ -122,6 +123,9 @@
|
||||
#define MSR_AMD64_SMCA_MC0_DESTAT 0xc0002008
|
||||
#define MSR_AMD64_SMCA_MC0_DEADDR 0xc0002009
|
||||
#define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a
|
||||
/* Registers MISC2 to MISC4 are at offsets B to D. */
|
||||
#define MSR_AMD64_SMCA_MC0_SYND1 0xc000200e
|
||||
#define MSR_AMD64_SMCA_MC0_SYND2 0xc000200f
|
||||
#define MSR_AMD64_SMCA_MCx_CTL(x) (MSR_AMD64_SMCA_MC0_CTL + 0x10*(x))
|
||||
#define MSR_AMD64_SMCA_MCx_STATUS(x) (MSR_AMD64_SMCA_MC0_STATUS + 0x10*(x))
|
||||
#define MSR_AMD64_SMCA_MCx_ADDR(x) (MSR_AMD64_SMCA_MC0_ADDR + 0x10*(x))
|
||||
@ -132,6 +136,8 @@
|
||||
#define MSR_AMD64_SMCA_MCx_DESTAT(x) (MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x))
|
||||
#define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x))
|
||||
#define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x)))
|
||||
#define MSR_AMD64_SMCA_MCx_SYND1(x) (MSR_AMD64_SMCA_MC0_SYND1 + 0x10*(x))
|
||||
#define MSR_AMD64_SMCA_MCx_SYND2(x) (MSR_AMD64_SMCA_MC0_SYND2 + 0x10*(x))
|
||||
|
||||
#define XEC(x, mask) (((x) >> 16) & mask)
|
||||
|
||||
@ -187,6 +193,32 @@ enum mce_notifier_prios {
|
||||
MCE_PRIO_HIGHEST = MCE_PRIO_CEC
|
||||
};
|
||||
|
||||
/**
|
||||
* struct mce_hw_err - Hardware Error Record.
|
||||
* @m: Machine Check record.
|
||||
* @vendor: Vendor-specific error information.
|
||||
*
|
||||
* Vendor-specific fields should not be added to struct mce. Instead, vendors
|
||||
* should export their vendor-specific data through their structure in the
|
||||
* vendor union below.
|
||||
*
|
||||
* AMD's vendor data is parsed by error decoding tools for supplemental error
|
||||
* information. Thus, current offsets of existing fields must be maintained.
|
||||
* Only add new fields at the end of AMD's vendor structure.
|
||||
*/
|
||||
struct mce_hw_err {
|
||||
struct mce m;
|
||||
|
||||
union vendor_info {
|
||||
struct {
|
||||
u64 synd1; /* MCA_SYND1 MSR */
|
||||
u64 synd2; /* MCA_SYND2 MSR */
|
||||
} amd;
|
||||
} vendor;
|
||||
};
|
||||
|
||||
#define to_mce_hw_err(mce) container_of(mce, struct mce_hw_err, m)
|
||||
|
||||
struct notifier_block;
|
||||
extern void mce_register_decode_chain(struct notifier_block *nb);
|
||||
extern void mce_unregister_decode_chain(struct notifier_block *nb);
|
||||
@ -221,8 +253,8 @@ static inline int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info,
|
||||
u64 lapic_id) { return -EINVAL; }
|
||||
#endif
|
||||
|
||||
void mce_prep_record(struct mce *m);
|
||||
void mce_log(struct mce *m);
|
||||
void mce_prep_record(struct mce_hw_err *err);
|
||||
void mce_log(struct mce_hw_err *err);
|
||||
DECLARE_PER_CPU(struct device *, mce_device);
|
||||
|
||||
/* Maximum number of MCA banks per CPU. */
|
||||
|
@ -8,7 +8,8 @@
|
||||
/*
|
||||
* Fields are zero when not available. Also, this struct is shared with
|
||||
* userspace mcelog and thus must keep existing fields at current offsets.
|
||||
* Only add new fields to the end of the structure
|
||||
* Only add new, shared fields to the end of the structure.
|
||||
* Do not add vendor-specific fields.
|
||||
*/
|
||||
struct mce {
|
||||
__u64 status; /* Bank's MCi_STATUS MSR */
|
||||
|
@ -778,29 +778,33 @@ bool amd_mce_usable_address(struct mce *m)
|
||||
|
||||
static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
|
||||
{
|
||||
struct mce m;
|
||||
struct mce_hw_err err;
|
||||
struct mce *m = &err.m;
|
||||
|
||||
mce_prep_record(&m);
|
||||
mce_prep_record(&err);
|
||||
|
||||
m.status = status;
|
||||
m.misc = misc;
|
||||
m.bank = bank;
|
||||
m.tsc = rdtsc();
|
||||
m->status = status;
|
||||
m->misc = misc;
|
||||
m->bank = bank;
|
||||
m->tsc = rdtsc();
|
||||
|
||||
if (m.status & MCI_STATUS_ADDRV) {
|
||||
m.addr = addr;
|
||||
if (m->status & MCI_STATUS_ADDRV) {
|
||||
m->addr = addr;
|
||||
|
||||
smca_extract_err_addr(&m);
|
||||
smca_extract_err_addr(m);
|
||||
}
|
||||
|
||||
if (mce_flags.smca) {
|
||||
rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid);
|
||||
rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid);
|
||||
|
||||
if (m.status & MCI_STATUS_SYNDV)
|
||||
rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd);
|
||||
if (m->status & MCI_STATUS_SYNDV) {
|
||||
rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd);
|
||||
rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1);
|
||||
rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2);
|
||||
}
|
||||
}
|
||||
|
||||
mce_log(&m);
|
||||
mce_log(&err);
|
||||
}
|
||||
|
||||
DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error)
|
||||
|
@ -28,7 +28,8 @@
|
||||
|
||||
void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
|
||||
{
|
||||
struct mce m;
|
||||
struct mce_hw_err err;
|
||||
struct mce *m;
|
||||
int lsb;
|
||||
|
||||
if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
|
||||
@ -44,31 +45,33 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
|
||||
else
|
||||
lsb = PAGE_SHIFT;
|
||||
|
||||
mce_prep_record(&m);
|
||||
m.bank = -1;
|
||||
mce_prep_record(&err);
|
||||
m = &err.m;
|
||||
m->bank = -1;
|
||||
/* Fake a memory read error with unknown channel */
|
||||
m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f;
|
||||
m.misc = (MCI_MISC_ADDR_PHYS << 6) | lsb;
|
||||
m->status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f;
|
||||
m->misc = (MCI_MISC_ADDR_PHYS << 6) | lsb;
|
||||
|
||||
if (severity >= GHES_SEV_RECOVERABLE)
|
||||
m.status |= MCI_STATUS_UC;
|
||||
m->status |= MCI_STATUS_UC;
|
||||
|
||||
if (severity >= GHES_SEV_PANIC) {
|
||||
m.status |= MCI_STATUS_PCC;
|
||||
m.tsc = rdtsc();
|
||||
m->status |= MCI_STATUS_PCC;
|
||||
m->tsc = rdtsc();
|
||||
}
|
||||
|
||||
m.addr = mem_err->physical_addr;
|
||||
mce_log(&m);
|
||||
m->addr = mem_err->physical_addr;
|
||||
mce_log(&err);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
|
||||
|
||||
int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id)
|
||||
{
|
||||
const u64 *i_mce = ((const u64 *) (ctx_info + 1));
|
||||
unsigned int cpu, num_regs;
|
||||
bool apicid_found = false;
|
||||
unsigned int cpu;
|
||||
struct mce m;
|
||||
struct mce_hw_err err;
|
||||
struct mce *m;
|
||||
|
||||
if (!boot_cpu_has(X86_FEATURE_SMCA))
|
||||
return -EINVAL;
|
||||
@ -86,16 +89,12 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id)
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
* The register array size must be large enough to include all the
|
||||
* SMCA registers which need to be extracted.
|
||||
*
|
||||
* The number of registers in the register array is determined by
|
||||
* Register Array Size/8 as defined in UEFI spec v2.8, sec N.2.4.2.2.
|
||||
* The register layout is fixed and currently the raw data in the
|
||||
* register array includes 6 SMCA registers which the kernel can
|
||||
* extract.
|
||||
* Sanity-check registers array size.
|
||||
*/
|
||||
if (ctx_info->reg_arr_size < 48)
|
||||
num_regs = ctx_info->reg_arr_size >> 3;
|
||||
if (!num_regs)
|
||||
return -EINVAL;
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
@ -108,18 +107,68 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id)
|
||||
if (!apicid_found)
|
||||
return -EINVAL;
|
||||
|
||||
mce_prep_record_common(&m);
|
||||
mce_prep_record_per_cpu(cpu, &m);
|
||||
m = &err.m;
|
||||
memset(&err, 0, sizeof(struct mce_hw_err));
|
||||
mce_prep_record_common(m);
|
||||
mce_prep_record_per_cpu(cpu, m);
|
||||
|
||||
m.bank = (ctx_info->msr_addr >> 4) & 0xFF;
|
||||
m.status = *i_mce;
|
||||
m.addr = *(i_mce + 1);
|
||||
m.misc = *(i_mce + 2);
|
||||
/* Skipping MCA_CONFIG */
|
||||
m.ipid = *(i_mce + 4);
|
||||
m.synd = *(i_mce + 5);
|
||||
m->bank = (ctx_info->msr_addr >> 4) & 0xFF;
|
||||
|
||||
mce_log(&m);
|
||||
/*
|
||||
* The SMCA register layout is fixed and includes 16 registers.
|
||||
* The end of the array may be variable, but the beginning is known.
|
||||
* Cap the number of registers to expected max (15).
|
||||
*/
|
||||
if (num_regs > 15)
|
||||
num_regs = 15;
|
||||
|
||||
switch (num_regs) {
|
||||
/* MCA_SYND2 */
|
||||
case 15:
|
||||
err.vendor.amd.synd2 = *(i_mce + 14);
|
||||
fallthrough;
|
||||
/* MCA_SYND1 */
|
||||
case 14:
|
||||
err.vendor.amd.synd1 = *(i_mce + 13);
|
||||
fallthrough;
|
||||
/* MCA_MISC4 */
|
||||
case 13:
|
||||
/* MCA_MISC3 */
|
||||
case 12:
|
||||
/* MCA_MISC2 */
|
||||
case 11:
|
||||
/* MCA_MISC1 */
|
||||
case 10:
|
||||
/* MCA_DEADDR */
|
||||
case 9:
|
||||
/* MCA_DESTAT */
|
||||
case 8:
|
||||
/* reserved */
|
||||
case 7:
|
||||
/* MCA_SYND */
|
||||
case 6:
|
||||
m->synd = *(i_mce + 5);
|
||||
fallthrough;
|
||||
/* MCA_IPID */
|
||||
case 5:
|
||||
m->ipid = *(i_mce + 4);
|
||||
fallthrough;
|
||||
/* MCA_CONFIG */
|
||||
case 4:
|
||||
/* MCA_MISC0 */
|
||||
case 3:
|
||||
m->misc = *(i_mce + 2);
|
||||
fallthrough;
|
||||
/* MCA_ADDR */
|
||||
case 2:
|
||||
m->addr = *(i_mce + 1);
|
||||
fallthrough;
|
||||
/* MCA_STATUS */
|
||||
case 1:
|
||||
m->status = *i_mce;
|
||||
}
|
||||
|
||||
mce_log(&err);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -88,7 +88,7 @@ struct mca_config mca_cfg __read_mostly = {
|
||||
.monarch_timeout = -1
|
||||
};
|
||||
|
||||
static DEFINE_PER_CPU(struct mce, mces_seen);
|
||||
static DEFINE_PER_CPU(struct mce_hw_err, hw_errs_seen);
|
||||
static unsigned long mce_need_notify;
|
||||
|
||||
/*
|
||||
@ -119,8 +119,6 @@ BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
|
||||
|
||||
void mce_prep_record_common(struct mce *m)
|
||||
{
|
||||
memset(m, 0, sizeof(struct mce));
|
||||
|
||||
m->cpuid = cpuid_eax(1);
|
||||
m->cpuvendor = boot_cpu_data.x86_vendor;
|
||||
m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP);
|
||||
@ -138,9 +136,12 @@ void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m)
|
||||
m->socketid = topology_physical_package_id(cpu);
|
||||
}
|
||||
|
||||
/* Do initial initialization of a struct mce */
|
||||
void mce_prep_record(struct mce *m)
|
||||
/* Do initial initialization of struct mce_hw_err */
|
||||
void mce_prep_record(struct mce_hw_err *err)
|
||||
{
|
||||
struct mce *m = &err->m;
|
||||
|
||||
memset(err, 0, sizeof(struct mce_hw_err));
|
||||
mce_prep_record_common(m);
|
||||
mce_prep_record_per_cpu(smp_processor_id(), m);
|
||||
}
|
||||
@ -148,9 +149,9 @@ void mce_prep_record(struct mce *m)
|
||||
DEFINE_PER_CPU(struct mce, injectm);
|
||||
EXPORT_PER_CPU_SYMBOL_GPL(injectm);
|
||||
|
||||
void mce_log(struct mce *m)
|
||||
void mce_log(struct mce_hw_err *err)
|
||||
{
|
||||
if (!mce_gen_pool_add(m))
|
||||
if (!mce_gen_pool_add(err))
|
||||
irq_work_queue(&mce_irq_work);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(mce_log);
|
||||
@ -171,8 +172,10 @@ void mce_unregister_decode_chain(struct notifier_block *nb)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
|
||||
|
||||
static void __print_mce(struct mce *m)
|
||||
static void __print_mce(struct mce_hw_err *err)
|
||||
{
|
||||
struct mce *m = &err->m;
|
||||
|
||||
pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
|
||||
m->extcpu,
|
||||
(m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
|
||||
@ -199,6 +202,10 @@ static void __print_mce(struct mce *m)
|
||||
if (mce_flags.smca) {
|
||||
if (m->synd)
|
||||
pr_cont("SYND %llx ", m->synd);
|
||||
if (err->vendor.amd.synd1)
|
||||
pr_cont("SYND1 %llx ", err->vendor.amd.synd1);
|
||||
if (err->vendor.amd.synd2)
|
||||
pr_cont("SYND2 %llx ", err->vendor.amd.synd2);
|
||||
if (m->ipid)
|
||||
pr_cont("IPID %llx ", m->ipid);
|
||||
}
|
||||
@ -214,9 +221,11 @@ static void __print_mce(struct mce *m)
|
||||
m->microcode);
|
||||
}
|
||||
|
||||
static void print_mce(struct mce *m)
|
||||
static void print_mce(struct mce_hw_err *err)
|
||||
{
|
||||
__print_mce(m);
|
||||
struct mce *m = &err->m;
|
||||
|
||||
__print_mce(err);
|
||||
|
||||
if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON)
|
||||
pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
|
||||
@ -251,7 +260,7 @@ static const char *mce_dump_aux_info(struct mce *m)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
|
||||
static noinstr void mce_panic(const char *msg, struct mce_hw_err *final, char *exp)
|
||||
{
|
||||
struct llist_node *pending;
|
||||
struct mce_evt_llist *l;
|
||||
@ -282,20 +291,22 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
|
||||
pending = mce_gen_pool_prepare_records();
|
||||
/* First print corrected ones that are still unlogged */
|
||||
llist_for_each_entry(l, pending, llnode) {
|
||||
struct mce *m = &l->mce;
|
||||
struct mce_hw_err *err = &l->err;
|
||||
struct mce *m = &err->m;
|
||||
if (!(m->status & MCI_STATUS_UC)) {
|
||||
print_mce(m);
|
||||
print_mce(err);
|
||||
if (!apei_err)
|
||||
apei_err = apei_write_mce(m);
|
||||
}
|
||||
}
|
||||
/* Now print uncorrected but with the final one last */
|
||||
llist_for_each_entry(l, pending, llnode) {
|
||||
struct mce *m = &l->mce;
|
||||
struct mce_hw_err *err = &l->err;
|
||||
struct mce *m = &err->m;
|
||||
if (!(m->status & MCI_STATUS_UC))
|
||||
continue;
|
||||
if (!final || mce_cmp(m, final)) {
|
||||
print_mce(m);
|
||||
if (!final || mce_cmp(m, &final->m)) {
|
||||
print_mce(err);
|
||||
if (!apei_err)
|
||||
apei_err = apei_write_mce(m);
|
||||
}
|
||||
@ -303,12 +314,12 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
|
||||
if (final) {
|
||||
print_mce(final);
|
||||
if (!apei_err)
|
||||
apei_err = apei_write_mce(final);
|
||||
apei_err = apei_write_mce(&final->m);
|
||||
}
|
||||
if (exp)
|
||||
pr_emerg(HW_ERR "Machine check: %s\n", exp);
|
||||
|
||||
memmsg = mce_dump_aux_info(final);
|
||||
memmsg = mce_dump_aux_info(&final->m);
|
||||
if (memmsg)
|
||||
pr_emerg(HW_ERR "Machine check: %s\n", memmsg);
|
||||
|
||||
@ -323,9 +334,9 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
|
||||
* panic.
|
||||
*/
|
||||
if (kexec_crash_loaded()) {
|
||||
if (final && (final->status & MCI_STATUS_ADDRV)) {
|
||||
if (final && (final->m.status & MCI_STATUS_ADDRV)) {
|
||||
struct page *p;
|
||||
p = pfn_to_online_page(final->addr >> PAGE_SHIFT);
|
||||
p = pfn_to_online_page(final->m.addr >> PAGE_SHIFT);
|
||||
if (p)
|
||||
SetPageHWPoison(p);
|
||||
}
|
||||
@ -445,16 +456,18 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v)
|
||||
* check into our "mce" struct so that we can use it later to assess
|
||||
* the severity of the problem as we read per-bank specific details.
|
||||
*/
|
||||
static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs)
|
||||
static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs)
|
||||
{
|
||||
struct mce *m;
|
||||
/*
|
||||
* Enable instrumentation around mce_prep_record() which calls external
|
||||
* facilities.
|
||||
*/
|
||||
instrumentation_begin();
|
||||
mce_prep_record(m);
|
||||
mce_prep_record(err);
|
||||
instrumentation_end();
|
||||
|
||||
m = &err->m;
|
||||
m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
|
||||
if (regs) {
|
||||
/*
|
||||
@ -574,13 +587,13 @@ EXPORT_SYMBOL_GPL(mce_is_correctable);
|
||||
static int mce_early_notifier(struct notifier_block *nb, unsigned long val,
|
||||
void *data)
|
||||
{
|
||||
struct mce *m = (struct mce *)data;
|
||||
struct mce_hw_err *err = to_mce_hw_err(data);
|
||||
|
||||
if (!m)
|
||||
if (!err)
|
||||
return NOTIFY_DONE;
|
||||
|
||||
/* Emit the trace record: */
|
||||
trace_mce_record(m);
|
||||
trace_mce_record(err);
|
||||
|
||||
set_bit(0, &mce_need_notify);
|
||||
|
||||
@ -624,13 +637,13 @@ static struct notifier_block mce_uc_nb = {
|
||||
static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
|
||||
void *data)
|
||||
{
|
||||
struct mce *m = (struct mce *)data;
|
||||
struct mce_hw_err *err = to_mce_hw_err(data);
|
||||
|
||||
if (!m)
|
||||
if (!err)
|
||||
return NOTIFY_DONE;
|
||||
|
||||
if (mca_cfg.print_all || !m->kflags)
|
||||
__print_mce(m);
|
||||
if (mca_cfg.print_all || !(err->m.kflags))
|
||||
__print_mce(err);
|
||||
|
||||
return NOTIFY_DONE;
|
||||
}
|
||||
@ -644,8 +657,10 @@ static struct notifier_block mce_default_nb = {
|
||||
/*
|
||||
* Read ADDR and MISC registers.
|
||||
*/
|
||||
static noinstr void mce_read_aux(struct mce *m, int i)
|
||||
static noinstr void mce_read_aux(struct mce_hw_err *err, int i)
|
||||
{
|
||||
struct mce *m = &err->m;
|
||||
|
||||
if (m->status & MCI_STATUS_MISCV)
|
||||
m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC));
|
||||
|
||||
@ -667,8 +682,11 @@ static noinstr void mce_read_aux(struct mce *m, int i)
|
||||
if (mce_flags.smca) {
|
||||
m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
|
||||
|
||||
if (m->status & MCI_STATUS_SYNDV)
|
||||
if (m->status & MCI_STATUS_SYNDV) {
|
||||
m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
|
||||
err->vendor.amd.synd1 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(i));
|
||||
err->vendor.amd.synd2 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -692,26 +710,28 @@ DEFINE_PER_CPU(unsigned, mce_poll_count);
|
||||
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
|
||||
{
|
||||
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
|
||||
struct mce m;
|
||||
struct mce_hw_err err;
|
||||
struct mce *m;
|
||||
int i;
|
||||
|
||||
this_cpu_inc(mce_poll_count);
|
||||
|
||||
mce_gather_info(&m, NULL);
|
||||
mce_gather_info(&err, NULL);
|
||||
m = &err.m;
|
||||
|
||||
if (flags & MCP_TIMESTAMP)
|
||||
m.tsc = rdtsc();
|
||||
m->tsc = rdtsc();
|
||||
|
||||
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
|
||||
if (!mce_banks[i].ctl || !test_bit(i, *b))
|
||||
continue;
|
||||
|
||||
m.misc = 0;
|
||||
m.addr = 0;
|
||||
m.bank = i;
|
||||
m->misc = 0;
|
||||
m->addr = 0;
|
||||
m->bank = i;
|
||||
|
||||
barrier();
|
||||
m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
|
||||
m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
|
||||
|
||||
/*
|
||||
* Update storm tracking here, before checking for the
|
||||
@ -721,17 +741,17 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
|
||||
* storm status.
|
||||
*/
|
||||
if (!mca_cfg.cmci_disabled)
|
||||
mce_track_storm(&m);
|
||||
mce_track_storm(m);
|
||||
|
||||
/* If this entry is not valid, ignore it */
|
||||
if (!(m.status & MCI_STATUS_VAL))
|
||||
if (!(m->status & MCI_STATUS_VAL))
|
||||
continue;
|
||||
|
||||
/*
|
||||
* If we are logging everything (at CPU online) or this
|
||||
* is a corrected error, then we must log it.
|
||||
*/
|
||||
if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC))
|
||||
if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC))
|
||||
goto log_it;
|
||||
|
||||
/*
|
||||
@ -741,20 +761,20 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
|
||||
* everything else.
|
||||
*/
|
||||
if (!mca_cfg.ser) {
|
||||
if (m.status & MCI_STATUS_UC)
|
||||
if (m->status & MCI_STATUS_UC)
|
||||
continue;
|
||||
goto log_it;
|
||||
}
|
||||
|
||||
/* Log "not enabled" (speculative) errors */
|
||||
if (!(m.status & MCI_STATUS_EN))
|
||||
if (!(m->status & MCI_STATUS_EN))
|
||||
goto log_it;
|
||||
|
||||
/*
|
||||
* Log UCNA (SDM: 15.6.3 "UCR Error Classification")
|
||||
* UC == 1 && PCC == 0 && S == 0
|
||||
*/
|
||||
if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S))
|
||||
if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S))
|
||||
goto log_it;
|
||||
|
||||
/*
|
||||
@ -768,20 +788,20 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
|
||||
if (flags & MCP_DONTLOG)
|
||||
goto clear_it;
|
||||
|
||||
mce_read_aux(&m, i);
|
||||
m.severity = mce_severity(&m, NULL, NULL, false);
|
||||
mce_read_aux(&err, i);
|
||||
m->severity = mce_severity(m, NULL, NULL, false);
|
||||
/*
|
||||
* Don't get the IP here because it's unlikely to
|
||||
* have anything to do with the actual error location.
|
||||
*/
|
||||
|
||||
if (mca_cfg.dont_log_ce && !mce_usable_address(&m))
|
||||
if (mca_cfg.dont_log_ce && !mce_usable_address(m))
|
||||
goto clear_it;
|
||||
|
||||
if (flags & MCP_QUEUE_LOG)
|
||||
mce_gen_pool_add(&m);
|
||||
mce_gen_pool_add(&err);
|
||||
else
|
||||
mce_log(&m);
|
||||
mce_log(&err);
|
||||
|
||||
clear_it:
|
||||
/*
|
||||
@ -905,9 +925,10 @@ static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_reg
|
||||
* Do a quick check if any of the events requires a panic.
|
||||
* This decides if we keep the events around or clear them.
|
||||
*/
|
||||
static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
|
||||
static __always_inline int mce_no_way_out(struct mce_hw_err *err, char **msg, unsigned long *validp,
|
||||
struct pt_regs *regs)
|
||||
{
|
||||
struct mce *m = &err->m;
|
||||
char *tmp = *msg;
|
||||
int i;
|
||||
|
||||
@ -925,7 +946,7 @@ static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned lo
|
||||
|
||||
m->bank = i;
|
||||
if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) {
|
||||
mce_read_aux(m, i);
|
||||
mce_read_aux(err, i);
|
||||
*msg = tmp;
|
||||
return 1;
|
||||
}
|
||||
@ -1016,10 +1037,11 @@ static noinstr int mce_timed_out(u64 *t, const char *msg)
|
||||
*/
|
||||
static void mce_reign(void)
|
||||
{
|
||||
int cpu;
|
||||
struct mce_hw_err *err = NULL;
|
||||
struct mce *m = NULL;
|
||||
int global_worst = 0;
|
||||
char *msg = NULL;
|
||||
int cpu;
|
||||
|
||||
/*
|
||||
* This CPU is the Monarch and the other CPUs have run
|
||||
@ -1027,11 +1049,13 @@ static void mce_reign(void)
|
||||
* Grade the severity of the errors of all the CPUs.
|
||||
*/
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct mce *mtmp = &per_cpu(mces_seen, cpu);
|
||||
struct mce_hw_err *etmp = &per_cpu(hw_errs_seen, cpu);
|
||||
struct mce *mtmp = &etmp->m;
|
||||
|
||||
if (mtmp->severity > global_worst) {
|
||||
global_worst = mtmp->severity;
|
||||
m = &per_cpu(mces_seen, cpu);
|
||||
err = &per_cpu(hw_errs_seen, cpu);
|
||||
m = &err->m;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1043,7 +1067,7 @@ static void mce_reign(void)
|
||||
if (m && global_worst >= MCE_PANIC_SEVERITY) {
|
||||
/* call mce_severity() to get "msg" for panic */
|
||||
mce_severity(m, NULL, &msg, true);
|
||||
mce_panic("Fatal machine check", m, msg);
|
||||
mce_panic("Fatal machine check", err, msg);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1060,11 +1084,11 @@ static void mce_reign(void)
|
||||
mce_panic("Fatal machine check from unknown source", NULL, NULL);
|
||||
|
||||
/*
|
||||
* Now clear all the mces_seen so that they don't reappear on
|
||||
* Now clear all the hw_errs_seen so that they don't reappear on
|
||||
* the next mce.
|
||||
*/
|
||||
for_each_possible_cpu(cpu)
|
||||
memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
|
||||
memset(&per_cpu(hw_errs_seen, cpu), 0, sizeof(struct mce_hw_err));
|
||||
}
|
||||
|
||||
static atomic_t global_nwo;
|
||||
@ -1268,13 +1292,14 @@ static noinstr bool mce_check_crashing_cpu(void)
|
||||
}
|
||||
|
||||
static __always_inline int
|
||||
__mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
|
||||
unsigned long *toclear, unsigned long *valid_banks, int no_way_out,
|
||||
int *worst)
|
||||
__mc_scan_banks(struct mce_hw_err *err, struct pt_regs *regs,
|
||||
struct mce_hw_err *final, unsigned long *toclear,
|
||||
unsigned long *valid_banks, int no_way_out, int *worst)
|
||||
{
|
||||
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
|
||||
struct mca_config *cfg = &mca_cfg;
|
||||
int severity, i, taint = 0;
|
||||
struct mce *m = &err->m;
|
||||
|
||||
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
|
||||
arch___clear_bit(i, toclear);
|
||||
@ -1319,7 +1344,7 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
|
||||
if (severity == MCE_NO_SEVERITY)
|
||||
continue;
|
||||
|
||||
mce_read_aux(m, i);
|
||||
mce_read_aux(err, i);
|
||||
|
||||
/* assuming valid severity level != 0 */
|
||||
m->severity = severity;
|
||||
@ -1329,17 +1354,17 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
|
||||
* done in #MC context, where instrumentation is disabled.
|
||||
*/
|
||||
instrumentation_begin();
|
||||
mce_log(m);
|
||||
mce_log(err);
|
||||
instrumentation_end();
|
||||
|
||||
if (severity > *worst) {
|
||||
*final = *m;
|
||||
*final = *err;
|
||||
*worst = severity;
|
||||
}
|
||||
}
|
||||
|
||||
/* mce_clear_state will clear *final, save locally for use later */
|
||||
*m = *final;
|
||||
*err = *final;
|
||||
|
||||
return taint;
|
||||
}
|
||||
@ -1399,9 +1424,10 @@ static void kill_me_never(struct callback_head *cb)
|
||||
set_mce_nospec(pfn);
|
||||
}
|
||||
|
||||
static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *))
|
||||
static void queue_task_work(struct mce_hw_err *err, char *msg, void (*func)(struct callback_head *))
|
||||
{
|
||||
int count = ++current->mce_count;
|
||||
struct mce *m = &err->m;
|
||||
|
||||
/* First call, save all the details */
|
||||
if (count == 1) {
|
||||
@ -1414,11 +1440,12 @@ static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callba
|
||||
|
||||
/* Ten is likely overkill. Don't expect more than two faults before task_work() */
|
||||
if (count > 10)
|
||||
mce_panic("Too many consecutive machine checks while accessing user data", m, msg);
|
||||
mce_panic("Too many consecutive machine checks while accessing user data",
|
||||
err, msg);
|
||||
|
||||
/* Second or later call, make sure page address matches the one from first call */
|
||||
if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT))
|
||||
mce_panic("Consecutive machine checks to different user pages", m, msg);
|
||||
mce_panic("Consecutive machine checks to different user pages", err, msg);
|
||||
|
||||
/* Do not call task_work_add() more than once */
|
||||
if (count > 1)
|
||||
@ -1467,8 +1494,10 @@ noinstr void do_machine_check(struct pt_regs *regs)
|
||||
int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0;
|
||||
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 };
|
||||
DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 };
|
||||
struct mce m, *final;
|
||||
struct mce_hw_err *final;
|
||||
struct mce_hw_err err;
|
||||
char *msg = NULL;
|
||||
struct mce *m;
|
||||
|
||||
if (unlikely(mce_flags.p5))
|
||||
return pentium_machine_check(regs);
|
||||
@ -1506,13 +1535,14 @@ noinstr void do_machine_check(struct pt_regs *regs)
|
||||
|
||||
this_cpu_inc(mce_exception_count);
|
||||
|
||||
mce_gather_info(&m, regs);
|
||||
m.tsc = rdtsc();
|
||||
mce_gather_info(&err, regs);
|
||||
m = &err.m;
|
||||
m->tsc = rdtsc();
|
||||
|
||||
final = this_cpu_ptr(&mces_seen);
|
||||
*final = m;
|
||||
final = this_cpu_ptr(&hw_errs_seen);
|
||||
*final = err;
|
||||
|
||||
no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
|
||||
no_way_out = mce_no_way_out(&err, &msg, valid_banks, regs);
|
||||
|
||||
barrier();
|
||||
|
||||
@ -1521,15 +1551,15 @@ noinstr void do_machine_check(struct pt_regs *regs)
|
||||
* Assume the worst for now, but if we find the
|
||||
* severity is MCE_AR_SEVERITY we have other options.
|
||||
*/
|
||||
if (!(m.mcgstatus & MCG_STATUS_RIPV))
|
||||
if (!(m->mcgstatus & MCG_STATUS_RIPV))
|
||||
kill_current_task = 1;
|
||||
/*
|
||||
* Check if this MCE is signaled to only this logical processor,
|
||||
* on Intel, Zhaoxin only.
|
||||
*/
|
||||
if (m.cpuvendor == X86_VENDOR_INTEL ||
|
||||
m.cpuvendor == X86_VENDOR_ZHAOXIN)
|
||||
lmce = m.mcgstatus & MCG_STATUS_LMCES;
|
||||
if (m->cpuvendor == X86_VENDOR_INTEL ||
|
||||
m->cpuvendor == X86_VENDOR_ZHAOXIN)
|
||||
lmce = m->mcgstatus & MCG_STATUS_LMCES;
|
||||
|
||||
/*
|
||||
* Local machine check may already know that we have to panic.
|
||||
@ -1540,12 +1570,12 @@ noinstr void do_machine_check(struct pt_regs *regs)
|
||||
*/
|
||||
if (lmce) {
|
||||
if (no_way_out)
|
||||
mce_panic("Fatal local machine check", &m, msg);
|
||||
mce_panic("Fatal local machine check", &err, msg);
|
||||
} else {
|
||||
order = mce_start(&no_way_out);
|
||||
}
|
||||
|
||||
taint = __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst);
|
||||
taint = __mc_scan_banks(&err, regs, final, toclear, valid_banks, no_way_out, &worst);
|
||||
|
||||
if (!no_way_out)
|
||||
mce_clear_state(toclear);
|
||||
@ -1560,7 +1590,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
|
||||
no_way_out = worst >= MCE_PANIC_SEVERITY;
|
||||
|
||||
if (no_way_out)
|
||||
mce_panic("Fatal machine check on current CPU", &m, msg);
|
||||
mce_panic("Fatal machine check on current CPU", &err, msg);
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
@ -1572,8 +1602,8 @@ noinstr void do_machine_check(struct pt_regs *regs)
|
||||
* make sure we have the right "msg".
|
||||
*/
|
||||
if (worst >= MCE_PANIC_SEVERITY) {
|
||||
mce_severity(&m, regs, &msg, true);
|
||||
mce_panic("Local fatal machine check!", &m, msg);
|
||||
mce_severity(m, regs, &msg, true);
|
||||
mce_panic("Local fatal machine check!", &err, msg);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1591,16 +1621,16 @@ noinstr void do_machine_check(struct pt_regs *regs)
|
||||
goto out;
|
||||
|
||||
/* Fault was in user mode and we need to take some action */
|
||||
if ((m.cs & 3) == 3) {
|
||||
if ((m->cs & 3) == 3) {
|
||||
/* If this triggers there is no way to recover. Die hard. */
|
||||
BUG_ON(!on_thread_stack() || !user_mode(regs));
|
||||
|
||||
if (!mce_usable_address(&m))
|
||||
queue_task_work(&m, msg, kill_me_now);
|
||||
if (!mce_usable_address(m))
|
||||
queue_task_work(&err, msg, kill_me_now);
|
||||
else
|
||||
queue_task_work(&m, msg, kill_me_maybe);
|
||||
queue_task_work(&err, msg, kill_me_maybe);
|
||||
|
||||
} else if (m.mcgstatus & MCG_STATUS_SEAM_NR) {
|
||||
} else if (m->mcgstatus & MCG_STATUS_SEAM_NR) {
|
||||
/*
|
||||
* Saved RIP on stack makes it look like the machine check
|
||||
* was taken in the kernel on the instruction following
|
||||
@ -1612,8 +1642,8 @@ noinstr void do_machine_check(struct pt_regs *regs)
|
||||
* not occur there. Mark the page as poisoned so it won't
|
||||
* be added to free list when the guest is terminated.
|
||||
*/
|
||||
if (mce_usable_address(&m)) {
|
||||
struct page *p = pfn_to_online_page(m.addr >> PAGE_SHIFT);
|
||||
if (mce_usable_address(m)) {
|
||||
struct page *p = pfn_to_online_page(m->addr >> PAGE_SHIFT);
|
||||
|
||||
if (p)
|
||||
SetPageHWPoison(p);
|
||||
@ -1628,13 +1658,13 @@ noinstr void do_machine_check(struct pt_regs *regs)
|
||||
* corresponding exception handler which would do that is the
|
||||
* proper one.
|
||||
*/
|
||||
if (m.kflags & MCE_IN_KERNEL_RECOV) {
|
||||
if (m->kflags & MCE_IN_KERNEL_RECOV) {
|
||||
if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
|
||||
mce_panic("Failed kernel mode recovery", &m, msg);
|
||||
mce_panic("Failed kernel mode recovery", &err, msg);
|
||||
}
|
||||
|
||||
if (m.kflags & MCE_IN_KERNEL_COPYIN)
|
||||
queue_task_work(&m, msg, kill_me_never);
|
||||
if (m->kflags & MCE_IN_KERNEL_COPYIN)
|
||||
queue_task_work(&err, msg, kill_me_never);
|
||||
}
|
||||
|
||||
out:
|
||||
|
@ -264,15 +264,8 @@ static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
|
||||
return put_user(sizeof(struct mce), p);
|
||||
case MCE_GET_LOG_LEN:
|
||||
return put_user(mcelog->len, p);
|
||||
case MCE_GETCLEAR_FLAGS: {
|
||||
unsigned flags;
|
||||
|
||||
do {
|
||||
flags = mcelog->flags;
|
||||
} while (cmpxchg(&mcelog->flags, flags, 0) != flags);
|
||||
|
||||
return put_user(flags, p);
|
||||
}
|
||||
case MCE_GETCLEAR_FLAGS:
|
||||
return put_user(xchg(&mcelog->flags, 0), p);
|
||||
default:
|
||||
return -ENOTTY;
|
||||
}
|
||||
|
@ -31,15 +31,15 @@ static LLIST_HEAD(mce_event_llist);
|
||||
*/
|
||||
static bool is_duplicate_mce_record(struct mce_evt_llist *t, struct mce_evt_llist *l)
|
||||
{
|
||||
struct mce_hw_err *err1, *err2;
|
||||
struct mce_evt_llist *node;
|
||||
struct mce *m1, *m2;
|
||||
|
||||
m1 = &t->mce;
|
||||
err1 = &t->err;
|
||||
|
||||
llist_for_each_entry(node, &l->llnode, llnode) {
|
||||
m2 = &node->mce;
|
||||
err2 = &node->err;
|
||||
|
||||
if (!mce_cmp(m1, m2))
|
||||
if (!mce_cmp(&err1->m, &err2->m))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
@ -73,8 +73,8 @@ struct llist_node *mce_gen_pool_prepare_records(void)
|
||||
|
||||
void mce_gen_pool_process(struct work_struct *__unused)
|
||||
{
|
||||
struct llist_node *head;
|
||||
struct mce_evt_llist *node, *tmp;
|
||||
struct llist_node *head;
|
||||
struct mce *mce;
|
||||
|
||||
head = llist_del_all(&mce_event_llist);
|
||||
@ -83,7 +83,7 @@ void mce_gen_pool_process(struct work_struct *__unused)
|
||||
|
||||
head = llist_reverse_order(head);
|
||||
llist_for_each_entry_safe(node, tmp, head, llnode) {
|
||||
mce = &node->mce;
|
||||
mce = &node->err.m;
|
||||
blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
|
||||
gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node));
|
||||
}
|
||||
@ -94,11 +94,11 @@ bool mce_gen_pool_empty(void)
|
||||
return llist_empty(&mce_event_llist);
|
||||
}
|
||||
|
||||
int mce_gen_pool_add(struct mce *mce)
|
||||
int mce_gen_pool_add(struct mce_hw_err *err)
|
||||
{
|
||||
struct mce_evt_llist *node;
|
||||
|
||||
if (filter_mce(mce))
|
||||
if (filter_mce(&err->m))
|
||||
return -EINVAL;
|
||||
|
||||
if (!mce_evt_pool)
|
||||
@ -110,7 +110,7 @@ int mce_gen_pool_add(struct mce *mce)
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
memcpy(&node->mce, mce, sizeof(*mce));
|
||||
memcpy(&node->err, err, sizeof(*err));
|
||||
llist_add(&node->llnode, &mce_event_llist);
|
||||
|
||||
return 0;
|
||||
|
@ -502,8 +502,9 @@ static void prepare_msrs(void *info)
|
||||
|
||||
static void do_inject(void)
|
||||
{
|
||||
u64 mcg_status = 0;
|
||||
unsigned int cpu = i_mce.extcpu;
|
||||
struct mce_hw_err err;
|
||||
u64 mcg_status = 0;
|
||||
u8 b = i_mce.bank;
|
||||
|
||||
i_mce.tsc = rdtsc_ordered();
|
||||
@ -517,7 +518,8 @@ static void do_inject(void)
|
||||
i_mce.status |= MCI_STATUS_SYNDV;
|
||||
|
||||
if (inj_type == SW_INJ) {
|
||||
mce_log(&i_mce);
|
||||
err.m = i_mce;
|
||||
mce_log(&err);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -94,7 +94,7 @@ static int cmci_supported(int *banks)
|
||||
if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6)
|
||||
return 0;
|
||||
rdmsrl(MSR_IA32_MCG_CAP, cap);
|
||||
*banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
|
||||
*banks = min_t(unsigned, MAX_NR_BANKS, cap & MCG_BANKCNT_MASK);
|
||||
return !!(cap & MCG_CMCI_P);
|
||||
}
|
||||
|
||||
|
@ -26,12 +26,12 @@ extern struct blocking_notifier_head x86_mce_decoder_chain;
|
||||
|
||||
struct mce_evt_llist {
|
||||
struct llist_node llnode;
|
||||
struct mce mce;
|
||||
struct mce_hw_err err;
|
||||
};
|
||||
|
||||
void mce_gen_pool_process(struct work_struct *__unused);
|
||||
bool mce_gen_pool_empty(void);
|
||||
int mce_gen_pool_add(struct mce *mce);
|
||||
int mce_gen_pool_add(struct mce_hw_err *err);
|
||||
int mce_gen_pool_init(void);
|
||||
struct llist_node *mce_gen_pool_prepare_records(void);
|
||||
|
||||
|
@ -793,7 +793,9 @@ static int
|
||||
amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
|
||||
{
|
||||
struct mce *m = (struct mce *)data;
|
||||
struct mce_hw_err *err = to_mce_hw_err(m);
|
||||
unsigned int fam = x86_family(m->cpuid);
|
||||
u32 mca_config_lo = 0, dummy;
|
||||
int ecc;
|
||||
|
||||
if (m->kflags & MCE_HANDLED_CEC)
|
||||
@ -813,11 +815,9 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
|
||||
((m->status & MCI_STATUS_PCC) ? "PCC" : "-"));
|
||||
|
||||
if (boot_cpu_has(X86_FEATURE_SMCA)) {
|
||||
u32 low, high;
|
||||
u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
|
||||
rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(m->bank), &mca_config_lo, &dummy);
|
||||
|
||||
if (!rdmsr_safe(addr, &low, &high) &&
|
||||
(low & MCI_CONFIG_MCAX))
|
||||
if (mca_config_lo & MCI_CONFIG_MCAX)
|
||||
pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
|
||||
|
||||
pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
|
||||
@ -850,8 +850,18 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
|
||||
if (boot_cpu_has(X86_FEATURE_SMCA)) {
|
||||
pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);
|
||||
|
||||
if (m->status & MCI_STATUS_SYNDV)
|
||||
pr_cont(", Syndrome: 0x%016llx", m->synd);
|
||||
if (m->status & MCI_STATUS_SYNDV) {
|
||||
pr_cont(", Syndrome: 0x%016llx\n", m->synd);
|
||||
if (mca_config_lo & MCI_CONFIG_FRUTEXT) {
|
||||
char frutext[17];
|
||||
|
||||
frutext[16] = '\0';
|
||||
memcpy(&frutext[0], &err->vendor.amd.synd1, 8);
|
||||
memcpy(&frutext[8], &err->vendor.amd.synd2, 8);
|
||||
|
||||
pr_emerg(HW_ERR "FRU Text: %s", frutext);
|
||||
}
|
||||
}
|
||||
|
||||
pr_cont("\n");
|
||||
|
||||
|
@ -19,9 +19,9 @@
|
||||
|
||||
TRACE_EVENT(mce_record,
|
||||
|
||||
TP_PROTO(struct mce *m),
|
||||
TP_PROTO(struct mce_hw_err *err),
|
||||
|
||||
TP_ARGS(m),
|
||||
TP_ARGS(err),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field( u64, mcgcap )
|
||||
@ -43,31 +43,33 @@ TRACE_EVENT(mce_record,
|
||||
__field( u8, bank )
|
||||
__field( u8, cpuvendor )
|
||||
__field( u32, microcode )
|
||||
__dynamic_array(u8, v_data, sizeof(err->vendor))
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->mcgcap = m->mcgcap;
|
||||
__entry->mcgstatus = m->mcgstatus;
|
||||
__entry->status = m->status;
|
||||
__entry->addr = m->addr;
|
||||
__entry->misc = m->misc;
|
||||
__entry->synd = m->synd;
|
||||
__entry->ipid = m->ipid;
|
||||
__entry->ip = m->ip;
|
||||
__entry->tsc = m->tsc;
|
||||
__entry->ppin = m->ppin;
|
||||
__entry->walltime = m->time;
|
||||
__entry->cpu = m->extcpu;
|
||||
__entry->cpuid = m->cpuid;
|
||||
__entry->apicid = m->apicid;
|
||||
__entry->socketid = m->socketid;
|
||||
__entry->cs = m->cs;
|
||||
__entry->bank = m->bank;
|
||||
__entry->cpuvendor = m->cpuvendor;
|
||||
__entry->microcode = m->microcode;
|
||||
__entry->mcgcap = err->m.mcgcap;
|
||||
__entry->mcgstatus = err->m.mcgstatus;
|
||||
__entry->status = err->m.status;
|
||||
__entry->addr = err->m.addr;
|
||||
__entry->misc = err->m.misc;
|
||||
__entry->synd = err->m.synd;
|
||||
__entry->ipid = err->m.ipid;
|
||||
__entry->ip = err->m.ip;
|
||||
__entry->tsc = err->m.tsc;
|
||||
__entry->ppin = err->m.ppin;
|
||||
__entry->walltime = err->m.time;
|
||||
__entry->cpu = err->m.extcpu;
|
||||
__entry->cpuid = err->m.cpuid;
|
||||
__entry->apicid = err->m.apicid;
|
||||
__entry->socketid = err->m.socketid;
|
||||
__entry->cs = err->m.cs;
|
||||
__entry->bank = err->m.bank;
|
||||
__entry->cpuvendor = err->m.cpuvendor;
|
||||
__entry->microcode = err->m.microcode;
|
||||
memcpy(__get_dynamic_array(v_data), &err->vendor, sizeof(err->vendor));
|
||||
),
|
||||
|
||||
TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, IPID: %016Lx, ADDR: %016Lx, MISC: %016Lx, SYND: %016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PPIN: %llx, vendor: %u, CPUID: %x, time: %llu, socket: %u, APIC: %x, microcode: %x",
|
||||
TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016llx, IPID: %016llx, ADDR: %016llx, MISC: %016llx, SYND: %016llx, RIP: %02x:<%016llx>, TSC: %llx, PPIN: %llx, vendor: %u, CPUID: %x, time: %llu, socket: %u, APIC: %x, microcode: %x, vendor data: %s",
|
||||
__entry->cpu,
|
||||
__entry->mcgcap, __entry->mcgstatus,
|
||||
__entry->bank, __entry->status,
|
||||
@ -83,7 +85,8 @@ TRACE_EVENT(mce_record,
|
||||
__entry->walltime,
|
||||
__entry->socketid,
|
||||
__entry->apicid,
|
||||
__entry->microcode)
|
||||
__entry->microcode,
|
||||
__print_dynamic_array(v_data, sizeof(u8)))
|
||||
);
|
||||
|
||||
#endif /* _TRACE_MCE_H */
|
||||
|
@ -119,6 +119,14 @@
|
||||
trace_print_array_seq(p, array, count, el_size); \
|
||||
})
|
||||
|
||||
#undef __print_dynamic_array
|
||||
#define __print_dynamic_array(array, el_size) \
|
||||
({ \
|
||||
__print_array(__get_dynamic_array(array), \
|
||||
__get_dynamic_array_len(array) / (el_size), \
|
||||
(el_size)); \
|
||||
})
|
||||
|
||||
#undef __print_hex_dump
|
||||
#define __print_hex_dump(prefix_str, prefix_type, \
|
||||
rowsize, groupsize, buf, len, ascii) \
|
||||
|
@ -22,6 +22,7 @@
|
||||
#undef __get_rel_cpumask
|
||||
#undef __get_rel_sockaddr
|
||||
#undef __print_array
|
||||
#undef __print_dynamic_array
|
||||
#undef __print_hex_dump
|
||||
#undef __get_buf
|
||||
|
||||
|
@ -319,7 +319,7 @@ TRACE_EVENT(foo_bar,
|
||||
__assign_cpumask(cpum, cpumask_bits(mask));
|
||||
),
|
||||
|
||||
TP_printk("foo %s %d %s %s %s %s %s (%s) (%s) %s", __entry->foo, __entry->bar,
|
||||
TP_printk("foo %s %d %s %s %s %s %s %s (%s) (%s) %s", __entry->foo, __entry->bar,
|
||||
|
||||
/*
|
||||
* Notice here the use of some helper functions. This includes:
|
||||
@ -363,6 +363,11 @@ TRACE_EVENT(foo_bar,
|
||||
__print_array(__get_dynamic_array(list),
|
||||
__get_dynamic_array_len(list) / sizeof(int),
|
||||
sizeof(int)),
|
||||
|
||||
/* A shortcut is to use __print_dynamic_array for dynamic arrays */
|
||||
|
||||
__print_dynamic_array(list, sizeof(int)),
|
||||
|
||||
__get_str(str), __get_str(lstr),
|
||||
__get_bitmask(cpus), __get_cpumask(cpum),
|
||||
__get_str(vstr))
|
||||
|
Loading…
Reference in New Issue
Block a user