From d798bc6f3c174c61837862cb9778d73cccd92a8e Mon Sep 17 00:00:00 2001 From: James Clark Date: Fri, 22 Nov 2024 16:46:35 +0000 Subject: [PATCH 1/6] arm64: Fix usage of new shifted MDCR_EL2 values Since the linked fixes commit, these masks are already shifted so remove the shifts. One issue that this fixes is SPE and TRBE not being available anymore: arm_spe_pmu arm,spe-v1: profiling buffer owned by higher exception level Fixes: 641630313e9c ("arm64: sysreg: Migrate MDCR_EL2 definition to table") Signed-off-by: James Clark Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20241122164636.2944180-1-james.clark@linaro.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/el2_setup.h | 4 ++-- arch/arm64/kernel/hyp-stub.S | 4 ++-- arch/arm64/kvm/hyp/nvhe/pkvm.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 4cd41464be3f..f134907d3c08 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -79,7 +79,7 @@ 1 << PMSCR_EL2_PA_SHIFT) msr_s SYS_PMSCR_EL2, x0 // addresses and physical counter .Lskip_spe_el2_\@: - mov x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT) + mov x0, #MDCR_EL2_E2PB_MASK orr x2, x2, x0 // If we don't have VHE, then // use EL1&0 translation. @@ -92,7 +92,7 @@ and x0, x0, TRBIDR_EL1_P cbnz x0, .Lskip_trace_\@ // If TRBE is available at EL2 - mov x0, #(MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT) + mov x0, #MDCR_EL2_E2TB_MASK orr x2, x2, x0 // allow the EL1&0 translation // to own it. diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 65f76064c86b..ae990da1eae5 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -114,8 +114,8 @@ SYM_CODE_START_LOCAL(__finalise_el2) // Use EL2 translations for SPE & TRBE and disable access from EL1 mrs x0, mdcr_el2 - bic x0, x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT) - bic x0, x0, #(MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT) + bic x0, x0, #MDCR_EL2_E2PB_MASK + bic x0, x0, #MDCR_EL2_E2TB_MASK msr mdcr_el2, x0 // Transfer the MM state from EL1 to EL2 diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 01616c39a810..071993c16de8 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -126,7 +126,7 @@ static void pvm_init_traps_aa64dfr0(struct kvm_vcpu *vcpu) /* Trap SPE */ if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer), feature_ids)) { mdcr_set |= MDCR_EL2_TPMS; - mdcr_clear |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT; + mdcr_clear |= MDCR_EL2_E2PB_MASK; } /* Trap Trace Filter */ @@ -143,7 +143,7 @@ static void pvm_init_traps_aa64dfr0(struct kvm_vcpu *vcpu) /* Trap External Trace */ if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_ExtTrcBuff), feature_ids)) - mdcr_clear |= MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT; + mdcr_clear |= MDCR_EL2_E2TB_MASK; vcpu->arch.mdcr_el2 |= mdcr_set; vcpu->arch.mdcr_el2 &= ~mdcr_clear; From 6fc3a49f23856fdf155ab35f2244295f7870bf83 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 25 Nov 2024 09:47:56 +0000 Subject: [PATCH 2/6] KVM: arm64: Fix S1/S2 combination when FWB==1 and S2 has Device memory type The G.a revision of the ARM ARM had it pretty clear that HCR_EL2.FWB had no influence on "The way that stage 1 memory types and attributes are combined with stage 2 Device type and attributes." (D5.5.5). However, this wording was lost in further revisions of the architecture. Restore the intended behaviour, which is to take the strongest memory type of S1 and S2 in this case, as if FWB was 0. The specification is being fixed accordingly. Fixes: be04cebf3e788 ("KVM: arm64: nv: Add emulation of AT S12E{0,1}{R,W}") Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241125094756.609590-1-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/at.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 8c5d7990e5b3..3d7eb395e33d 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -739,8 +739,15 @@ static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par, final_attr = s1_parattr; break; default: - /* MemAttr[2]=0, Device from S2 */ - final_attr = s2_memattr & GENMASK(1,0) << 2; + /* + * MemAttr[2]=0, Device from S2. + * + * FWB does not influence the way that stage 1 + * memory types and attributes are combined + * with stage 2 Device type and attributes. + */ + final_attr = min(s2_memattr_to_attr(s2_memattr), + s1_parattr); } } else { /* Combination of R_HMNDG, R_TNHFM and R_GQFSF */ From 03c7527e97f73081633d773f9f8c2373f9854b25 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 3 Dec 2024 19:02:36 +0000 Subject: [PATCH 3/6] KVM: arm64: Do not allow ID_AA64MMFR0_EL1.ASIDbits to be overridden Catalin reports that a hypervisor lying to a guest about the size of the ASID field may result in unexpected issues: - if the underlying HW does only supports 8 bit ASIDs, the ASID field in a TLBI VAE1* operation is only 8 bits, and the HW will ignore the other 8 bits - if on the contrary the HW is 16 bit capable, the ASID field in the same TLBI operation is always 16 bits, irrespective of the value of TCR_ELx.AS. This could lead to missed invalidations if the guest was lead to assume that the HW had 8 bit ASIDs while they really are 16 bit wide. In order to avoid any potential disaster that would be hard to debug, prenent the migration between a host with 8 bit ASIDs to one with wider ASIDs (the converse was obviously always forbidden). This is also consistent with what we already do for VMIDs. If it becomes absolutely mandatory to support such a migration path in the future, we will have to trap and emulate all TLBIs, something that nobody should look forward to. Fixes: d5a32b60dc18 ("KVM: arm64: Allow userspace to change ID_AA64MMFR{0-2}_EL1") Reported-by: Catalin Marinas Signed-off-by: Marc Zyngier Cc: stable@vger.kernel.org Cc: Will Deacon Cc: Mark Rutland Cc: Marc Zyngier Cc: James Morse Cc: Oliver Upton Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20241203190236.505759-1-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/sys_regs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 83c6b4a07ef5..e2a5c2918d9e 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2618,7 +2618,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_WRITABLE(ID_AA64MMFR0_EL1, ~(ID_AA64MMFR0_EL1_RES0 | ID_AA64MMFR0_EL1_TGRAN4_2 | ID_AA64MMFR0_EL1_TGRAN64_2 | - ID_AA64MMFR0_EL1_TGRAN16_2)), + ID_AA64MMFR0_EL1_TGRAN16_2 | + ID_AA64MMFR0_EL1_ASIDBITS)), ID_WRITABLE(ID_AA64MMFR1_EL1, ~(ID_AA64MMFR1_EL1_RES0 | ID_AA64MMFR1_EL1_HCX | ID_AA64MMFR1_EL1_TWED | From be7e611274224b23776469d7f7ce50e25ac53142 Mon Sep 17 00:00:00 2001 From: Keisuke Nishimura Date: Sat, 30 Nov 2024 15:49:53 +0100 Subject: [PATCH 4/6] KVM: arm64: vgic-its: Add error handling in vgic_its_cache_translation The return value of xa_store() needs to be checked. This fix adds an error handling path that resolves the kref inconsistency on failure. As suggested by Oliver Upton, this function does not return the error code intentionally because the translation cache is best effort. Fixes: 8201d1028caa ("KVM: arm64: vgic-its: Maintain a translation cache per ITS") Signed-off-by: Keisuke Nishimura Suggested-by: Oliver Upton Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20241130144952.23729-1-keisuke.nishimura@inria.fr Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-its.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index f4c4494645c3..fb96802799c6 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -608,12 +608,22 @@ static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its, lockdep_assert_held(&its->its_lock); vgic_get_irq_kref(irq); + old = xa_store(&its->translation_cache, cache_key, irq, GFP_KERNEL_ACCOUNT); + + /* + * Put the reference taken on @irq if the store fails. Intentionally do + * not return the error as the translation cache is best effort. + */ + if (xa_is_err(old)) { + vgic_put_irq(kvm, irq); + return; + } + /* * We could have raced with another CPU caching the same * translation behind our back, ensure we don't leak a * reference if that is the case. */ - old = xa_store(&its->translation_cache, cache_key, irq, GFP_KERNEL_ACCOUNT); if (old) vgic_put_irq(kvm, old); } From ea6398a5af81e3e7fb3da5d261694d479a321fd9 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Wed, 27 Nov 2024 04:18:40 +0000 Subject: [PATCH 5/6] RISC-V: KVM: Fix csr_write -> csr_set for HVIEN PMU overflow bit This doesn't cause a problem currently as HVIEN isn't used elsewhere yet. Found by inspection. Signed-off-by: Michael Neuling Fixes: 16b0bde9a37c ("RISC-V: KVM: Add perf sampling support for guests") Reviewed-by: Atish Patra Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20241127041840.419940-1-michaelneuling@tenstorrent.com Signed-off-by: Anup Patel --- arch/riscv/kvm/aia.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kvm/aia.c b/arch/riscv/kvm/aia.c index dcced4db7fe8..19afd1f23537 100644 --- a/arch/riscv/kvm/aia.c +++ b/arch/riscv/kvm/aia.c @@ -590,7 +590,7 @@ void kvm_riscv_aia_enable(void) csr_set(CSR_HIE, BIT(IRQ_S_GEXT)); /* Enable IRQ filtering for overflow interrupt only if sscofpmf is present */ if (__riscv_isa_extension_available(NULL, RISCV_ISA_EXT_SSCOFPMF)) - csr_write(CSR_HVIEN, BIT(IRQ_PMU_OVF)); + csr_set(CSR_HVIEN, BIT(IRQ_PMU_OVF)); } void kvm_riscv_aia_disable(void) From 1201f226c863b7da739f7420ddba818cedf372fc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 10 Dec 2024 17:32:58 -0800 Subject: [PATCH 6/6] KVM: x86: Cache CPUID.0xD XSTATE offsets+sizes during module init Snapshot the output of CPUID.0xD.[1..n] during kvm.ko initiliaization to avoid the overead of CPUID during runtime. The offset, size, and metadata for CPUID.0xD.[1..n] sub-leaves does not depend on XCR0 or XSS values, i.e. is constant for a given CPU, and thus can be cached during module load. On Intel's Emerald Rapids, CPUID is *wildly* expensive, to the point where recomputing XSAVE offsets and sizes results in a 4x increase in latency of nested VM-Enter and VM-Exit (nested transitions can trigger xstate_required_size() multiple times per transition), relative to using cached values. The issue is easily visible by running `perf top` while triggering nested transitions: kvm_update_cpuid_runtime() shows up at a whopping 50%. As measured via RDTSC from L2 (using KVM-Unit-Test's CPUID VM-Exit test and a slightly modified L1 KVM to handle CPUID in the fastpath), a nested roundtrip to emulate CPUID on Skylake (SKX), Icelake (ICX), and Emerald Rapids (EMR) takes: SKX 11650 ICX 22350 EMR 28850 Using cached values, the latency drops to: SKX 6850 ICX 9000 EMR 7900 The underlying issue is that CPUID itself is slow on ICX, and comically slow on EMR. The problem is exacerbated on CPUs which support XSAVES and/or XSAVEC, as KVM invokes xstate_required_size() twice on each runtime CPUID update, and because there are more supported XSAVE features (CPUID for supported XSAVE feature sub-leafs is significantly slower). SKX: CPUID.0xD.2 = 348 cycles CPUID.0xD.3 = 400 cycles CPUID.0xD.4 = 276 cycles CPUID.0xD.5 = 236 cycles EMR: CPUID.0xD.2 = 1138 cycles CPUID.0xD.3 = 1362 cycles CPUID.0xD.4 = 1068 cycles CPUID.0xD.5 = 910 cycles CPUID.0xD.6 = 914 cycles CPUID.0xD.7 = 1350 cycles CPUID.0xD.8 = 734 cycles CPUID.0xD.9 = 766 cycles CPUID.0xD.10 = 732 cycles CPUID.0xD.11 = 718 cycles CPUID.0xD.12 = 734 cycles CPUID.0xD.13 = 1700 cycles CPUID.0xD.14 = 1126 cycles CPUID.0xD.15 = 898 cycles CPUID.0xD.16 = 716 cycles CPUID.0xD.17 = 748 cycles CPUID.0xD.18 = 776 cycles Note, updating runtime CPUID information multiple times per nested transition is itself a flaw, especially since CPUID is a mandotory intercept on both Intel and AMD. E.g. KVM doesn't need to ensure emulated CPUID state is up-to-date while running L2. That flaw will be fixed in a future patch, as deferring runtime CPUID updates is more subtle than it appears at first glance, the benefits aren't super critical to have once the XSAVE issue is resolved, and caching CPUID output is desirable even if KVM's updates are deferred. Cc: Jim Mattson Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-ID: <20241211013302.1347853-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 31 ++++++++++++++++++++++++++----- arch/x86/kvm/cpuid.h | 1 + arch/x86/kvm/x86.c | 2 ++ 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 097bdc022d0f..ae0b438a2c99 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -36,6 +36,26 @@ u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; EXPORT_SYMBOL_GPL(kvm_cpu_caps); +struct cpuid_xstate_sizes { + u32 eax; + u32 ebx; + u32 ecx; +}; + +static struct cpuid_xstate_sizes xstate_sizes[XFEATURE_MAX] __ro_after_init; + +void __init kvm_init_xstate_sizes(void) +{ + u32 ign; + int i; + + for (i = XFEATURE_YMM; i < ARRAY_SIZE(xstate_sizes); i++) { + struct cpuid_xstate_sizes *xs = &xstate_sizes[i]; + + cpuid_count(0xD, i, &xs->eax, &xs->ebx, &xs->ecx, &ign); + } +} + u32 xstate_required_size(u64 xstate_bv, bool compacted) { int feature_bit = 0; @@ -44,14 +64,15 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted) xstate_bv &= XFEATURE_MASK_EXTEND; while (xstate_bv) { if (xstate_bv & 0x1) { - u32 eax, ebx, ecx, edx, offset; - cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx); + struct cpuid_xstate_sizes *xs = &xstate_sizes[feature_bit]; + u32 offset; + /* ECX[1]: 64B alignment in compacted form */ if (compacted) - offset = (ecx & 0x2) ? ALIGN(ret, 64) : ret; + offset = (xs->ecx & 0x2) ? ALIGN(ret, 64) : ret; else - offset = ebx; - ret = max(ret, offset + eax); + offset = xs->ebx; + ret = max(ret, offset + xs->eax); } xstate_bv >>= 1; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index c8dc66eddefd..f16a7b2c2adc 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -31,6 +31,7 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool exact_only); +void __init kvm_init_xstate_sizes(void); u32 xstate_required_size(u64 xstate_bv, bool compacted); int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2e713480933a..c8160baf3838 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13997,6 +13997,8 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_rmp_fault); static int __init kvm_x86_init(void) { + kvm_init_xstate_sizes(); + kvm_mmu_x86_module_init(); mitigate_smt_rsb &= boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible(); return 0;