From ad361ed4771da6aebb3ca6184a81ae4b8ad9f0b6 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 22 Oct 2024 15:40:15 +0100 Subject: [PATCH 01/83] KVM: arm64: Just advertise SEIS as 0 when emulating ICC_CTLR_EL1 ICC_CTLR_EL1 accesses from a guest are trapped and emulated on systems with broken SEIS support and without FEAT_GICv3_TDIR. On such systems, we mask SEIS support in 'kvm_vgic_global_state.ich_vtr_el2' and so the value of ICC_CTLR_EL1.SEIS visible to the guest is always zero. Simplify the ICC_CTLR_EL1 read emulation to return 0 for the SEIS field, rather than reading an always-zero value from the global state. Cc: Marc Zyngier Cc: Oliver Upton Signed-off-by: Will Deacon Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241022144016.27350-2-will@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/vgic-v3-sr.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 18d4677002b1..3f9741e51d41 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -1012,9 +1012,6 @@ static void __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT; /* IDbits */ val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT; - /* SEIS */ - if (kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) - val |= BIT(ICC_CTLR_EL1_SEIS_SHIFT); /* A3V */ val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT; /* EOImode */ From 8aaf3f7dce74605a2e9f31bd421825a996ac7de3 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 22 Oct 2024 15:40:16 +0100 Subject: [PATCH 02/83] KVM: arm64: Don't map 'kvm_vgic_global_state' at EL2 with pKVM Now that 'kvm_vgic_global_state' is no longer needed for ICC_CTLR_EL1 emulation on machines with a broken SEIS implementation, drop the pKVM hypervisor mapping of the page. Note that kvm_vgic_global_state is still mapped in non-protected hypervisor configurations (i.e. {n,h}VHE) through the rodata section mapping. Cc: Marc Zyngier Cc: Oliver Upton Signed-off-by: Will Deacon Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241022144016.27350-3-will@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/nvhe/setup.c | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 174007f3fadd..8fec099c2775 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -95,7 +95,6 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, { void *start, *end, *virt = hyp_phys_to_virt(phys); unsigned long pgt_size = hyp_s1_pgtable_pages() << PAGE_SHIFT; - enum kvm_pgtable_prot prot; int ret, i; /* Recreate the hyp page-table using the early page allocator */ @@ -148,22 +147,6 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, } pkvm_create_host_sve_mappings(); - - /* - * Map the host sections RO in the hypervisor, but transfer the - * ownership from the host to the hypervisor itself to make sure they - * can't be donated or shared with another entity. - * - * The ownership transition requires matching changes in the host - * stage-2. This will be done later (see finalize_host_mappings()) once - * the hyp_vmemmap is addressable. - */ - prot = pkvm_mkstate(PAGE_HYP_RO, PKVM_PAGE_SHARED_OWNED); - ret = pkvm_create_mappings(&kvm_vgic_global_state, - &kvm_vgic_global_state + 1, prot); - if (ret) - return ret; - return 0; } From 2f2d46959808e9b039ecb241ff13d50be2d6e231 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 19 Oct 2024 18:15:42 +0100 Subject: [PATCH 03/83] firmware/psci: Add definitions for PSCI v1.3 specification The v1.3 PSCI spec (https://developer.arm.com/documentation/den0022) adds the SYSTEM_OFF2 function. Add definitions for it and its hibernation type parameter. Signed-off-by: David Woodhouse Reviewed-by: Miguel Luis Link: https://lore.kernel.org/r/20241019172459.2241939-2-dwmw2@infradead.org Signed-off-by: Oliver Upton --- include/uapi/linux/psci.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/uapi/linux/psci.h b/include/uapi/linux/psci.h index 42a40ad3fb62..81759ff385e6 100644 --- a/include/uapi/linux/psci.h +++ b/include/uapi/linux/psci.h @@ -59,6 +59,7 @@ #define PSCI_1_1_FN_SYSTEM_RESET2 PSCI_0_2_FN(18) #define PSCI_1_1_FN_MEM_PROTECT PSCI_0_2_FN(19) #define PSCI_1_1_FN_MEM_PROTECT_CHECK_RANGE PSCI_0_2_FN(20) +#define PSCI_1_3_FN_SYSTEM_OFF2 PSCI_0_2_FN(21) #define PSCI_1_0_FN64_CPU_DEFAULT_SUSPEND PSCI_0_2_FN64(12) #define PSCI_1_0_FN64_NODE_HW_STATE PSCI_0_2_FN64(13) @@ -68,6 +69,7 @@ #define PSCI_1_1_FN64_SYSTEM_RESET2 PSCI_0_2_FN64(18) #define PSCI_1_1_FN64_MEM_PROTECT_CHECK_RANGE PSCI_0_2_FN64(20) +#define PSCI_1_3_FN64_SYSTEM_OFF2 PSCI_0_2_FN64(21) /* PSCI v0.2 power state encoding for CPU_SUSPEND function */ #define PSCI_0_2_POWER_STATE_ID_MASK 0xffff @@ -100,6 +102,9 @@ #define PSCI_1_1_RESET_TYPE_SYSTEM_WARM_RESET 0 #define PSCI_1_1_RESET_TYPE_VENDOR_START 0x80000000U +/* PSCI v1.3 hibernate type for SYSTEM_OFF2 */ +#define PSCI_1_3_OFF_TYPE_HIBERNATE_OFF BIT(0) + /* PSCI version decoding (independent of PSCI version) */ #define PSCI_VERSION_MAJOR_SHIFT 16 #define PSCI_VERSION_MINOR_MASK \ From 97413cea1c48cc05d33db442d1c41d71c56c730e Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 19 Oct 2024 18:15:43 +0100 Subject: [PATCH 04/83] KVM: arm64: Add PSCI v1.3 SYSTEM_OFF2 function for hibernation The PSCI v1.3 specification adds support for a SYSTEM_OFF2 function which is analogous to ACPI S4 state. This will allow hosting environments to determine that a guest is hibernated rather than just powered off, and ensure that they preserve the virtual environment appropriately to allow the guest to resume safely (or bump the hardware_signature in the FACS to trigger a clean reboot instead). This feature is safe to enable unconditionally (in a subsequent commit) because it is exposed to userspace through the existing KVM_SYSTEM_EVENT_SHUTDOWN event, just with an additional flag which userspace can use to know that the instance intended hibernation instead of a plain power-off. As with SYSTEM_RESET2, there is only one type available (in this case HIBERNATE_OFF), and it is not explicitly reported to userspace through the event; userspace can get it from the registers if it cares). Signed-off-by: David Woodhouse Reviewed-by: Miguel Luis Link: https://lore.kernel.org/r/20241019172459.2241939-3-dwmw2@infradead.org [oliver: slight cleanup of comments] Signed-off-by: Oliver Upton --- Documentation/virt/kvm/api.rst | 10 ++++++++ arch/arm64/include/uapi/asm/kvm.h | 6 +++++ arch/arm64/kvm/psci.c | 38 +++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index e32471977d0a..206cd0bc4ef4 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6855,6 +6855,10 @@ the first `ndata` items (possibly zero) of the data array are valid. the guest issued a SYSTEM_RESET2 call according to v1.1 of the PSCI specification. + - for arm64, data[0] is set to KVM_SYSTEM_EVENT_SHUTDOWN_FLAG_PSCI_OFF2 + if the guest issued a SYSTEM_OFF2 call according to v1.3 of the PSCI + specification. + - for RISC-V, data[0] is set to the value of the second argument of the ``sbi_system_reset`` call. @@ -6888,6 +6892,12 @@ either: - Deny the guest request to suspend the VM. See ARM DEN0022D.b 5.19.2 "Caller responsibilities" for possible return values. +Hibernation using the PSCI SYSTEM_OFF2 call is enabled when PSCI v1.3 +is enabled. If a guest invokes the PSCI SYSTEM_OFF2 function, KVM will +exit to userspace with the KVM_SYSTEM_EVENT_SHUTDOWN event type and with +data[0] set to KVM_SYSTEM_EVENT_SHUTDOWN_FLAG_PSCI_OFF2. The only +supported hibernate type for the SYSTEM_OFF2 function is HIBERNATE_OFF. + :: /* KVM_EXIT_IOAPIC_EOI */ diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 964df31da975..66736ff04011 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -484,6 +484,12 @@ enum { */ #define KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2 (1ULL << 0) +/* + * Shutdown caused by a PSCI v1.3 SYSTEM_OFF2 call. + * Valid only when the system event has a type of KVM_SYSTEM_EVENT_SHUTDOWN. + */ +#define KVM_SYSTEM_EVENT_SHUTDOWN_FLAG_PSCI_OFF2 (1ULL << 0) + /* run->fail_entry.hardware_entry_failure_reason codes. */ #define KVM_EXIT_FAIL_ENTRY_CPU_UNSUPPORTED (1ULL << 0) diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index 1f69b667332b..5d5fb3743b1c 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -194,6 +194,12 @@ static void kvm_psci_system_off(struct kvm_vcpu *vcpu) kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN, 0); } +static void kvm_psci_system_off2(struct kvm_vcpu *vcpu) +{ + kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN, + KVM_SYSTEM_EVENT_SHUTDOWN_FLAG_PSCI_OFF2); +} + static void kvm_psci_system_reset(struct kvm_vcpu *vcpu) { kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET, 0); @@ -358,6 +364,11 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor) if (minor >= 1) val = 0; break; + case PSCI_1_3_FN_SYSTEM_OFF2: + case PSCI_1_3_FN64_SYSTEM_OFF2: + if (minor >= 3) + val = PSCI_1_3_OFF_TYPE_HIBERNATE_OFF; + break; } break; case PSCI_1_0_FN_SYSTEM_SUSPEND: @@ -392,6 +403,33 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor) break; } break; + case PSCI_1_3_FN_SYSTEM_OFF2: + kvm_psci_narrow_to_32bit(vcpu); + fallthrough; + case PSCI_1_3_FN64_SYSTEM_OFF2: + if (minor < 3) + break; + + arg = smccc_get_arg1(vcpu); + /* + * SYSTEM_OFF2 defaults to HIBERNATE_OFF if arg1 is zero. arg2 + * must be zero. + */ + if ((arg && arg != PSCI_1_3_OFF_TYPE_HIBERNATE_OFF) || + smccc_get_arg2(vcpu) != 0) { + val = PSCI_RET_INVALID_PARAMS; + break; + } + kvm_psci_system_off2(vcpu); + /* + * We shouldn't be going back to the guest after receiving a + * SYSTEM_OFF2 request. Preload a return value of + * INTERNAL_FAILURE should userspace ignore the exit and resume + * the vCPU. + */ + val = PSCI_RET_INTERNAL_FAILURE; + ret = 0; + break; default: return kvm_psci_0_2_call(vcpu); } From 8be82d536a9f8d9974cf746d235c1f1c24dd31ed Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 19 Oct 2024 18:15:44 +0100 Subject: [PATCH 05/83] KVM: arm64: Add support for PSCI v1.2 and v1.3 As with PSCI v1.1 in commit 512865d83fd9 ("KVM: arm64: Bump guest PSCI version to 1.1"), expose v1.3 to the guest by default. The SYSTEM_OFF2 call which is exposed by doing so is compatible for userspace because it's just a new flag in the event that KVM raises, in precisely the same way that SYSTEM_RESET2 was compatible when v1.1 was enabled by default. Signed-off-by: David Woodhouse Reviewed-by: Miguel Luis Link: https://lore.kernel.org/r/20241019172459.2241939-4-dwmw2@infradead.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hypercalls.c | 2 ++ arch/arm64/kvm/psci.c | 6 +++++- include/kvm/arm_psci.h | 4 +++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index 5763d979d8ca..9c6267ca2b82 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -575,6 +575,8 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) case KVM_ARM_PSCI_0_2: case KVM_ARM_PSCI_1_0: case KVM_ARM_PSCI_1_1: + case KVM_ARM_PSCI_1_2: + case KVM_ARM_PSCI_1_3: if (!wants_02) return -EINVAL; vcpu->kvm->arch.psci_version = val; diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index 5d5fb3743b1c..3b5dbe9a0a0e 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -328,7 +328,7 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor) switch(psci_fn) { case PSCI_0_2_FN_PSCI_VERSION: - val = minor == 0 ? KVM_ARM_PSCI_1_0 : KVM_ARM_PSCI_1_1; + val = PSCI_VERSION(1, minor); break; case PSCI_1_0_FN_PSCI_FEATURES: arg = smccc_get_arg1(vcpu); @@ -487,6 +487,10 @@ int kvm_psci_call(struct kvm_vcpu *vcpu) } switch (version) { + case KVM_ARM_PSCI_1_3: + return kvm_psci_1_x_call(vcpu, 3); + case KVM_ARM_PSCI_1_2: + return kvm_psci_1_x_call(vcpu, 2); case KVM_ARM_PSCI_1_1: return kvm_psci_1_x_call(vcpu, 1); case KVM_ARM_PSCI_1_0: diff --git a/include/kvm/arm_psci.h b/include/kvm/arm_psci.h index e8fb624013d1..cbaec804eb83 100644 --- a/include/kvm/arm_psci.h +++ b/include/kvm/arm_psci.h @@ -14,8 +14,10 @@ #define KVM_ARM_PSCI_0_2 PSCI_VERSION(0, 2) #define KVM_ARM_PSCI_1_0 PSCI_VERSION(1, 0) #define KVM_ARM_PSCI_1_1 PSCI_VERSION(1, 1) +#define KVM_ARM_PSCI_1_2 PSCI_VERSION(1, 2) +#define KVM_ARM_PSCI_1_3 PSCI_VERSION(1, 3) -#define KVM_ARM_PSCI_LATEST KVM_ARM_PSCI_1_1 +#define KVM_ARM_PSCI_LATEST KVM_ARM_PSCI_1_3 static inline int kvm_psci_version(struct kvm_vcpu *vcpu) { From 72be5aa6be4af29fa2d77737e634b9a4c0e02d69 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 19 Oct 2024 18:15:45 +0100 Subject: [PATCH 06/83] KVM: selftests: Add test for PSCI SYSTEM_OFF2 Signed-off-by: David Woodhouse Link: https://lore.kernel.org/r/20241019172459.2241939-5-dwmw2@infradead.org Signed-off-by: Oliver Upton --- .../testing/selftests/kvm/aarch64/psci_test.c | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c index 61731a950def..eaa7655fefc1 100644 --- a/tools/testing/selftests/kvm/aarch64/psci_test.c +++ b/tools/testing/selftests/kvm/aarch64/psci_test.c @@ -54,6 +54,15 @@ static uint64_t psci_system_suspend(uint64_t entry_addr, uint64_t context_id) return res.a0; } +static uint64_t psci_system_off2(uint64_t type, uint64_t cookie) +{ + struct arm_smccc_res res; + + smccc_hvc(PSCI_1_3_FN64_SYSTEM_OFF2, type, cookie, 0, 0, 0, 0, 0, &res); + + return res.a0; +} + static uint64_t psci_features(uint32_t func_id) { struct arm_smccc_res res; @@ -188,11 +197,94 @@ static void host_test_system_suspend(void) kvm_vm_free(vm); } +static void guest_test_system_off2(void) +{ + uint64_t ret; + + /* assert that SYSTEM_OFF2 is discoverable */ + GUEST_ASSERT(psci_features(PSCI_1_3_FN_SYSTEM_OFF2) & + PSCI_1_3_OFF_TYPE_HIBERNATE_OFF); + GUEST_ASSERT(psci_features(PSCI_1_3_FN64_SYSTEM_OFF2) & + PSCI_1_3_OFF_TYPE_HIBERNATE_OFF); + + /* With non-zero 'cookie' field, it should fail */ + ret = psci_system_off2(PSCI_1_3_OFF_TYPE_HIBERNATE_OFF, 1); + GUEST_ASSERT(ret == PSCI_RET_INVALID_PARAMS); + + /* + * This would normally never return, so KVM sets the return value + * to PSCI_RET_INTERNAL_FAILURE. The test case *does* return, so + * that it can test both values for HIBERNATE_OFF. + */ + ret = psci_system_off2(PSCI_1_3_OFF_TYPE_HIBERNATE_OFF, 0); + GUEST_ASSERT(ret == PSCI_RET_INTERNAL_FAILURE); + + /* + * Revision F.b of the PSCI v1.3 specification documents zero as an + * alias for HIBERNATE_OFF, since that's the value used in earlier + * revisions of the spec and some implementations in the field. + */ + ret = psci_system_off2(0, 1); + GUEST_ASSERT(ret == PSCI_RET_INVALID_PARAMS); + + ret = psci_system_off2(0, 0); + GUEST_ASSERT(ret == PSCI_RET_INTERNAL_FAILURE); + + GUEST_DONE(); +} + +static void host_test_system_off2(void) +{ + struct kvm_vcpu *source, *target; + struct kvm_mp_state mps; + uint64_t psci_version = 0; + int nr_shutdowns = 0; + struct kvm_run *run; + struct ucall uc; + + setup_vm(guest_test_system_off2, &source, &target); + + vcpu_get_reg(target, KVM_REG_ARM_PSCI_VERSION, &psci_version); + + TEST_ASSERT(psci_version >= PSCI_VERSION(1, 3), + "Unexpected PSCI version %lu.%lu", + PSCI_VERSION_MAJOR(psci_version), + PSCI_VERSION_MINOR(psci_version)); + + vcpu_power_off(target); + run = source->run; + + enter_guest(source); + while (run->exit_reason == KVM_EXIT_SYSTEM_EVENT) { + TEST_ASSERT(run->system_event.type == KVM_SYSTEM_EVENT_SHUTDOWN, + "Unhandled system event: %u (expected: %u)", + run->system_event.type, KVM_SYSTEM_EVENT_SHUTDOWN); + TEST_ASSERT(run->system_event.ndata >= 1, + "Unexpected amount of system event data: %u (expected, >= 1)", + run->system_event.ndata); + TEST_ASSERT(run->system_event.data[0] & KVM_SYSTEM_EVENT_SHUTDOWN_FLAG_PSCI_OFF2, + "PSCI_OFF2 flag not set. Flags %llu (expected %llu)", + run->system_event.data[0], KVM_SYSTEM_EVENT_SHUTDOWN_FLAG_PSCI_OFF2); + + nr_shutdowns++; + + /* Restart the vCPU */ + mps.mp_state = KVM_MP_STATE_RUNNABLE; + vcpu_mp_state_set(source, &mps); + + enter_guest(source); + } + + TEST_ASSERT(get_ucall(source, &uc) == UCALL_DONE, "Guest did not exit cleanly"); + TEST_ASSERT(nr_shutdowns == 2, "Two shutdown events were expected, but saw %d", nr_shutdowns); +} + int main(void) { TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_SYSTEM_SUSPEND)); host_test_cpu_on(); host_test_system_suspend(); + host_test_system_off2(); return 0; } From 94f985c39a1e3c7df8c1db79749074f0e5ac1e10 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 19 Oct 2024 18:15:46 +0100 Subject: [PATCH 07/83] KVM: arm64: nvhe: Pass through PSCI v1.3 SYSTEM_OFF2 call Pass through the SYSTEM_OFF2 function for hibernation, just like SYSTEM_OFF. Signed-off-by: David Woodhouse Reviewed-by: Miguel Luis Link: https://lore.kernel.org/r/20241019172459.2241939-6-dwmw2@infradead.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/nvhe/psci-relay.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c index dfe8fe0f7eaf..9c2ce1e0e99a 100644 --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c @@ -265,6 +265,8 @@ static unsigned long psci_1_0_handler(u64 func_id, struct kvm_cpu_context *host_ case PSCI_1_0_FN_PSCI_FEATURES: case PSCI_1_0_FN_SET_SUSPEND_MODE: case PSCI_1_1_FN64_SYSTEM_RESET2: + case PSCI_1_3_FN_SYSTEM_OFF2: + case PSCI_1_3_FN64_SYSTEM_OFF2: return psci_forward(host_ctxt); case PSCI_1_0_FN64_SYSTEM_SUSPEND: return psci_system_suspend(func_id, host_ctxt); From e735a5da64420a86be370b216c269b5dd8e830e2 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 25 Oct 2024 20:31:03 +0000 Subject: [PATCH 08/83] KVM: arm64: Don't retire aborted MMIO instruction Returning an abort to the guest for an unsupported MMIO access is a documented feature of the KVM UAPI. Nevertheless, it's clear that this plumbing has seen limited testing, since userspace can trivially cause a WARN in the MMIO return: WARNING: CPU: 0 PID: 30558 at arch/arm64/include/asm/kvm_emulate.h:536 kvm_handle_mmio_return+0x46c/0x5c4 arch/arm64/include/asm/kvm_emulate.h:536 Call trace: kvm_handle_mmio_return+0x46c/0x5c4 arch/arm64/include/asm/kvm_emulate.h:536 kvm_arch_vcpu_ioctl_run+0x98/0x15b4 arch/arm64/kvm/arm.c:1133 kvm_vcpu_ioctl+0x75c/0xa78 virt/kvm/kvm_main.c:4487 __do_sys_ioctl fs/ioctl.c:51 [inline] __se_sys_ioctl fs/ioctl.c:893 [inline] __arm64_sys_ioctl+0x14c/0x1c8 fs/ioctl.c:893 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline] invoke_syscall+0x98/0x2b8 arch/arm64/kernel/syscall.c:49 el0_svc_common+0x1e0/0x23c arch/arm64/kernel/syscall.c:132 do_el0_svc+0x48/0x58 arch/arm64/kernel/syscall.c:151 el0_svc+0x38/0x68 arch/arm64/kernel/entry-common.c:712 el0t_64_sync_handler+0x90/0xfc arch/arm64/kernel/entry-common.c:730 el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:598 The splat is complaining that KVM is advancing PC while an exception is pending, i.e. that KVM is retiring the MMIO instruction despite a pending synchronous external abort. Womp womp. Fix the glaring UAPI bug by skipping over all the MMIO emulation in case there is a pending synchronous exception. Note that while userspace is capable of pending an asynchronous exception (SError, IRQ, or FIQ), it is still safe to retire the MMIO instruction in this case as (1) they are by definition asynchronous, and (2) KVM relies on hardware support for pending/delivering these exceptions instead of the software state machine for advancing PC. Cc: stable@vger.kernel.org Fixes: da345174ceca ("KVM: arm/arm64: Allow user injection of external data aborts") Reported-by: Alexander Potapenko Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241025203106.3529261-2-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/mmio.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c index cd6b7b83e2c3..ab365e839874 100644 --- a/arch/arm64/kvm/mmio.c +++ b/arch/arm64/kvm/mmio.c @@ -72,6 +72,31 @@ unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len) return data; } +static bool kvm_pending_sync_exception(struct kvm_vcpu *vcpu) +{ + if (!vcpu_get_flag(vcpu, PENDING_EXCEPTION)) + return false; + + if (vcpu_el1_is_32bit(vcpu)) { + switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) { + case unpack_vcpu_flag(EXCEPT_AA32_UND): + case unpack_vcpu_flag(EXCEPT_AA32_IABT): + case unpack_vcpu_flag(EXCEPT_AA32_DABT): + return true; + default: + return false; + } + } else { + switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) { + case unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC): + case unpack_vcpu_flag(EXCEPT_AA64_EL2_SYNC): + return true; + default: + return false; + } + } +} + /** * kvm_handle_mmio_return -- Handle MMIO loads after user space emulation * or in-kernel IO emulation @@ -84,8 +109,11 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu) unsigned int len; int mask; - /* Detect an already handled MMIO return */ - if (unlikely(!vcpu->mmio_needed)) + /* + * Detect if the MMIO return was already handled or if userspace aborted + * the MMIO access. + */ + if (unlikely(!vcpu->mmio_needed || kvm_pending_sync_exception(vcpu))) return 1; vcpu->mmio_needed = 0; From 9fb8e9178b25f5cf84d9992ac9f8a5d714058b07 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 25 Oct 2024 20:31:04 +0000 Subject: [PATCH 09/83] tools: arm64: Grab a copy of esr.h from kernel Grab esr.h and brk-imm.h for subsequent use in KVM selftests. Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241025203106.3529261-3-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- tools/arch/arm64/include/asm/brk-imm.h | 42 +++ tools/arch/arm64/include/asm/esr.h | 455 +++++++++++++++++++++++++ 2 files changed, 497 insertions(+) create mode 100644 tools/arch/arm64/include/asm/brk-imm.h create mode 100644 tools/arch/arm64/include/asm/esr.h diff --git a/tools/arch/arm64/include/asm/brk-imm.h b/tools/arch/arm64/include/asm/brk-imm.h new file mode 100644 index 000000000000..beb42c62b6ac --- /dev/null +++ b/tools/arch/arm64/include/asm/brk-imm.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2012 ARM Ltd. + */ + +#ifndef __ASM_BRK_IMM_H +#define __ASM_BRK_IMM_H + +/* + * #imm16 values used for BRK instruction generation + * 0x004: for installing kprobes + * 0x005: for installing uprobes + * 0x006: for kprobe software single-step + * 0x007: for kretprobe return + * Allowed values for kgdb are 0x400 - 0x7ff + * 0x100: for triggering a fault on purpose (reserved) + * 0x400: for dynamic BRK instruction + * 0x401: for compile time BRK instruction + * 0x800: kernel-mode BUG() and WARN() traps + * 0x9xx: tag-based KASAN trap (allowed values 0x900 - 0x9ff) + * 0x55xx: Undefined Behavior Sanitizer traps ('U' << 8) + * 0x8xxx: Control-Flow Integrity traps + */ +#define KPROBES_BRK_IMM 0x004 +#define UPROBES_BRK_IMM 0x005 +#define KPROBES_BRK_SS_IMM 0x006 +#define KRETPROBES_BRK_IMM 0x007 +#define FAULT_BRK_IMM 0x100 +#define KGDB_DYN_DBG_BRK_IMM 0x400 +#define KGDB_COMPILED_DBG_BRK_IMM 0x401 +#define BUG_BRK_IMM 0x800 +#define KASAN_BRK_IMM 0x900 +#define KASAN_BRK_MASK 0x0ff +#define UBSAN_BRK_IMM 0x5500 +#define UBSAN_BRK_MASK 0x00ff + +#define CFI_BRK_IMM_TARGET GENMASK(4, 0) +#define CFI_BRK_IMM_TYPE GENMASK(9, 5) +#define CFI_BRK_IMM_BASE 0x8000 +#define CFI_BRK_IMM_MASK (CFI_BRK_IMM_TARGET | CFI_BRK_IMM_TYPE) + +#endif diff --git a/tools/arch/arm64/include/asm/esr.h b/tools/arch/arm64/include/asm/esr.h new file mode 100644 index 000000000000..bd592ca81571 --- /dev/null +++ b/tools/arch/arm64/include/asm/esr.h @@ -0,0 +1,455 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2013 - ARM Ltd + * Author: Marc Zyngier + */ + +#ifndef __ASM_ESR_H +#define __ASM_ESR_H + +#include + +#define ESR_ELx_EC_UNKNOWN UL(0x00) +#define ESR_ELx_EC_WFx UL(0x01) +/* Unallocated EC: 0x02 */ +#define ESR_ELx_EC_CP15_32 UL(0x03) +#define ESR_ELx_EC_CP15_64 UL(0x04) +#define ESR_ELx_EC_CP14_MR UL(0x05) +#define ESR_ELx_EC_CP14_LS UL(0x06) +#define ESR_ELx_EC_FP_ASIMD UL(0x07) +#define ESR_ELx_EC_CP10_ID UL(0x08) /* EL2 only */ +#define ESR_ELx_EC_PAC UL(0x09) /* EL2 and above */ +/* Unallocated EC: 0x0A - 0x0B */ +#define ESR_ELx_EC_CP14_64 UL(0x0C) +#define ESR_ELx_EC_BTI UL(0x0D) +#define ESR_ELx_EC_ILL UL(0x0E) +/* Unallocated EC: 0x0F - 0x10 */ +#define ESR_ELx_EC_SVC32 UL(0x11) +#define ESR_ELx_EC_HVC32 UL(0x12) /* EL2 only */ +#define ESR_ELx_EC_SMC32 UL(0x13) /* EL2 and above */ +/* Unallocated EC: 0x14 */ +#define ESR_ELx_EC_SVC64 UL(0x15) +#define ESR_ELx_EC_HVC64 UL(0x16) /* EL2 and above */ +#define ESR_ELx_EC_SMC64 UL(0x17) /* EL2 and above */ +#define ESR_ELx_EC_SYS64 UL(0x18) +#define ESR_ELx_EC_SVE UL(0x19) +#define ESR_ELx_EC_ERET UL(0x1a) /* EL2 only */ +/* Unallocated EC: 0x1B */ +#define ESR_ELx_EC_FPAC UL(0x1C) /* EL1 and above */ +#define ESR_ELx_EC_SME UL(0x1D) +/* Unallocated EC: 0x1E */ +#define ESR_ELx_EC_IMP_DEF UL(0x1f) /* EL3 only */ +#define ESR_ELx_EC_IABT_LOW UL(0x20) +#define ESR_ELx_EC_IABT_CUR UL(0x21) +#define ESR_ELx_EC_PC_ALIGN UL(0x22) +/* Unallocated EC: 0x23 */ +#define ESR_ELx_EC_DABT_LOW UL(0x24) +#define ESR_ELx_EC_DABT_CUR UL(0x25) +#define ESR_ELx_EC_SP_ALIGN UL(0x26) +#define ESR_ELx_EC_MOPS UL(0x27) +#define ESR_ELx_EC_FP_EXC32 UL(0x28) +/* Unallocated EC: 0x29 - 0x2B */ +#define ESR_ELx_EC_FP_EXC64 UL(0x2C) +/* Unallocated EC: 0x2D - 0x2E */ +#define ESR_ELx_EC_SERROR UL(0x2F) +#define ESR_ELx_EC_BREAKPT_LOW UL(0x30) +#define ESR_ELx_EC_BREAKPT_CUR UL(0x31) +#define ESR_ELx_EC_SOFTSTP_LOW UL(0x32) +#define ESR_ELx_EC_SOFTSTP_CUR UL(0x33) +#define ESR_ELx_EC_WATCHPT_LOW UL(0x34) +#define ESR_ELx_EC_WATCHPT_CUR UL(0x35) +/* Unallocated EC: 0x36 - 0x37 */ +#define ESR_ELx_EC_BKPT32 UL(0x38) +/* Unallocated EC: 0x39 */ +#define ESR_ELx_EC_VECTOR32 UL(0x3A) /* EL2 only */ +/* Unallocated EC: 0x3B */ +#define ESR_ELx_EC_BRK64 UL(0x3C) +/* Unallocated EC: 0x3D - 0x3F */ +#define ESR_ELx_EC_MAX UL(0x3F) + +#define ESR_ELx_EC_SHIFT (26) +#define ESR_ELx_EC_WIDTH (6) +#define ESR_ELx_EC_MASK (UL(0x3F) << ESR_ELx_EC_SHIFT) +#define ESR_ELx_EC(esr) (((esr) & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT) + +#define ESR_ELx_IL_SHIFT (25) +#define ESR_ELx_IL (UL(1) << ESR_ELx_IL_SHIFT) +#define ESR_ELx_ISS_MASK (GENMASK(24, 0)) +#define ESR_ELx_ISS(esr) ((esr) & ESR_ELx_ISS_MASK) +#define ESR_ELx_ISS2_SHIFT (32) +#define ESR_ELx_ISS2_MASK (GENMASK_ULL(55, 32)) +#define ESR_ELx_ISS2(esr) (((esr) & ESR_ELx_ISS2_MASK) >> ESR_ELx_ISS2_SHIFT) + +/* ISS field definitions shared by different classes */ +#define ESR_ELx_WNR_SHIFT (6) +#define ESR_ELx_WNR (UL(1) << ESR_ELx_WNR_SHIFT) + +/* Asynchronous Error Type */ +#define ESR_ELx_IDS_SHIFT (24) +#define ESR_ELx_IDS (UL(1) << ESR_ELx_IDS_SHIFT) +#define ESR_ELx_AET_SHIFT (10) +#define ESR_ELx_AET (UL(0x7) << ESR_ELx_AET_SHIFT) + +#define ESR_ELx_AET_UC (UL(0) << ESR_ELx_AET_SHIFT) +#define ESR_ELx_AET_UEU (UL(1) << ESR_ELx_AET_SHIFT) +#define ESR_ELx_AET_UEO (UL(2) << ESR_ELx_AET_SHIFT) +#define ESR_ELx_AET_UER (UL(3) << ESR_ELx_AET_SHIFT) +#define ESR_ELx_AET_CE (UL(6) << ESR_ELx_AET_SHIFT) + +/* Shared ISS field definitions for Data/Instruction aborts */ +#define ESR_ELx_SET_SHIFT (11) +#define ESR_ELx_SET_MASK (UL(3) << ESR_ELx_SET_SHIFT) +#define ESR_ELx_FnV_SHIFT (10) +#define ESR_ELx_FnV (UL(1) << ESR_ELx_FnV_SHIFT) +#define ESR_ELx_EA_SHIFT (9) +#define ESR_ELx_EA (UL(1) << ESR_ELx_EA_SHIFT) +#define ESR_ELx_S1PTW_SHIFT (7) +#define ESR_ELx_S1PTW (UL(1) << ESR_ELx_S1PTW_SHIFT) + +/* Shared ISS fault status code(IFSC/DFSC) for Data/Instruction aborts */ +#define ESR_ELx_FSC (0x3F) +#define ESR_ELx_FSC_TYPE (0x3C) +#define ESR_ELx_FSC_LEVEL (0x03) +#define ESR_ELx_FSC_EXTABT (0x10) +#define ESR_ELx_FSC_MTE (0x11) +#define ESR_ELx_FSC_SERROR (0x11) +#define ESR_ELx_FSC_ACCESS (0x08) +#define ESR_ELx_FSC_FAULT (0x04) +#define ESR_ELx_FSC_PERM (0x0C) +#define ESR_ELx_FSC_SEA_TTW(n) (0x14 + (n)) +#define ESR_ELx_FSC_SECC (0x18) +#define ESR_ELx_FSC_SECC_TTW(n) (0x1c + (n)) + +/* Status codes for individual page table levels */ +#define ESR_ELx_FSC_ACCESS_L(n) (ESR_ELx_FSC_ACCESS + (n)) +#define ESR_ELx_FSC_PERM_L(n) (ESR_ELx_FSC_PERM + (n)) + +#define ESR_ELx_FSC_FAULT_nL (0x2C) +#define ESR_ELx_FSC_FAULT_L(n) (((n) < 0 ? ESR_ELx_FSC_FAULT_nL : \ + ESR_ELx_FSC_FAULT) + (n)) + +/* ISS field definitions for Data Aborts */ +#define ESR_ELx_ISV_SHIFT (24) +#define ESR_ELx_ISV (UL(1) << ESR_ELx_ISV_SHIFT) +#define ESR_ELx_SAS_SHIFT (22) +#define ESR_ELx_SAS (UL(3) << ESR_ELx_SAS_SHIFT) +#define ESR_ELx_SSE_SHIFT (21) +#define ESR_ELx_SSE (UL(1) << ESR_ELx_SSE_SHIFT) +#define ESR_ELx_SRT_SHIFT (16) +#define ESR_ELx_SRT_MASK (UL(0x1F) << ESR_ELx_SRT_SHIFT) +#define ESR_ELx_SF_SHIFT (15) +#define ESR_ELx_SF (UL(1) << ESR_ELx_SF_SHIFT) +#define ESR_ELx_AR_SHIFT (14) +#define ESR_ELx_AR (UL(1) << ESR_ELx_AR_SHIFT) +#define ESR_ELx_CM_SHIFT (8) +#define ESR_ELx_CM (UL(1) << ESR_ELx_CM_SHIFT) + +/* ISS2 field definitions for Data Aborts */ +#define ESR_ELx_TnD_SHIFT (10) +#define ESR_ELx_TnD (UL(1) << ESR_ELx_TnD_SHIFT) +#define ESR_ELx_TagAccess_SHIFT (9) +#define ESR_ELx_TagAccess (UL(1) << ESR_ELx_TagAccess_SHIFT) +#define ESR_ELx_GCS_SHIFT (8) +#define ESR_ELx_GCS (UL(1) << ESR_ELx_GCS_SHIFT) +#define ESR_ELx_Overlay_SHIFT (6) +#define ESR_ELx_Overlay (UL(1) << ESR_ELx_Overlay_SHIFT) +#define ESR_ELx_DirtyBit_SHIFT (5) +#define ESR_ELx_DirtyBit (UL(1) << ESR_ELx_DirtyBit_SHIFT) +#define ESR_ELx_Xs_SHIFT (0) +#define ESR_ELx_Xs_MASK (GENMASK_ULL(4, 0)) + +/* ISS field definitions for exceptions taken in to Hyp */ +#define ESR_ELx_FSC_ADDRSZ (0x00) +#define ESR_ELx_FSC_ADDRSZ_L(n) (ESR_ELx_FSC_ADDRSZ + (n)) +#define ESR_ELx_CV (UL(1) << 24) +#define ESR_ELx_COND_SHIFT (20) +#define ESR_ELx_COND_MASK (UL(0xF) << ESR_ELx_COND_SHIFT) +#define ESR_ELx_WFx_ISS_RN (UL(0x1F) << 5) +#define ESR_ELx_WFx_ISS_RV (UL(1) << 2) +#define ESR_ELx_WFx_ISS_TI (UL(3) << 0) +#define ESR_ELx_WFx_ISS_WFxT (UL(2) << 0) +#define ESR_ELx_WFx_ISS_WFI (UL(0) << 0) +#define ESR_ELx_WFx_ISS_WFE (UL(1) << 0) +#define ESR_ELx_xVC_IMM_MASK ((UL(1) << 16) - 1) + +#define DISR_EL1_IDS (UL(1) << 24) +/* + * DISR_EL1 and ESR_ELx share the bottom 13 bits, but the RES0 bits may mean + * different things in the future... + */ +#define DISR_EL1_ESR_MASK (ESR_ELx_AET | ESR_ELx_EA | ESR_ELx_FSC) + +/* ESR value templates for specific events */ +#define ESR_ELx_WFx_MASK (ESR_ELx_EC_MASK | \ + (ESR_ELx_WFx_ISS_TI & ~ESR_ELx_WFx_ISS_WFxT)) +#define ESR_ELx_WFx_WFI_VAL ((ESR_ELx_EC_WFx << ESR_ELx_EC_SHIFT) | \ + ESR_ELx_WFx_ISS_WFI) + +/* BRK instruction trap from AArch64 state */ +#define ESR_ELx_BRK64_ISS_COMMENT_MASK 0xffff + +/* ISS field definitions for System instruction traps */ +#define ESR_ELx_SYS64_ISS_RES0_SHIFT 22 +#define ESR_ELx_SYS64_ISS_RES0_MASK (UL(0x7) << ESR_ELx_SYS64_ISS_RES0_SHIFT) +#define ESR_ELx_SYS64_ISS_DIR_MASK 0x1 +#define ESR_ELx_SYS64_ISS_DIR_READ 0x1 +#define ESR_ELx_SYS64_ISS_DIR_WRITE 0x0 + +#define ESR_ELx_SYS64_ISS_RT_SHIFT 5 +#define ESR_ELx_SYS64_ISS_RT_MASK (UL(0x1f) << ESR_ELx_SYS64_ISS_RT_SHIFT) +#define ESR_ELx_SYS64_ISS_CRM_SHIFT 1 +#define ESR_ELx_SYS64_ISS_CRM_MASK (UL(0xf) << ESR_ELx_SYS64_ISS_CRM_SHIFT) +#define ESR_ELx_SYS64_ISS_CRN_SHIFT 10 +#define ESR_ELx_SYS64_ISS_CRN_MASK (UL(0xf) << ESR_ELx_SYS64_ISS_CRN_SHIFT) +#define ESR_ELx_SYS64_ISS_OP1_SHIFT 14 +#define ESR_ELx_SYS64_ISS_OP1_MASK (UL(0x7) << ESR_ELx_SYS64_ISS_OP1_SHIFT) +#define ESR_ELx_SYS64_ISS_OP2_SHIFT 17 +#define ESR_ELx_SYS64_ISS_OP2_MASK (UL(0x7) << ESR_ELx_SYS64_ISS_OP2_SHIFT) +#define ESR_ELx_SYS64_ISS_OP0_SHIFT 20 +#define ESR_ELx_SYS64_ISS_OP0_MASK (UL(0x3) << ESR_ELx_SYS64_ISS_OP0_SHIFT) +#define ESR_ELx_SYS64_ISS_SYS_MASK (ESR_ELx_SYS64_ISS_OP0_MASK | \ + ESR_ELx_SYS64_ISS_OP1_MASK | \ + ESR_ELx_SYS64_ISS_OP2_MASK | \ + ESR_ELx_SYS64_ISS_CRN_MASK | \ + ESR_ELx_SYS64_ISS_CRM_MASK) +#define ESR_ELx_SYS64_ISS_SYS_VAL(op0, op1, op2, crn, crm) \ + (((op0) << ESR_ELx_SYS64_ISS_OP0_SHIFT) | \ + ((op1) << ESR_ELx_SYS64_ISS_OP1_SHIFT) | \ + ((op2) << ESR_ELx_SYS64_ISS_OP2_SHIFT) | \ + ((crn) << ESR_ELx_SYS64_ISS_CRN_SHIFT) | \ + ((crm) << ESR_ELx_SYS64_ISS_CRM_SHIFT)) + +#define ESR_ELx_SYS64_ISS_SYS_OP_MASK (ESR_ELx_SYS64_ISS_SYS_MASK | \ + ESR_ELx_SYS64_ISS_DIR_MASK) +#define ESR_ELx_SYS64_ISS_RT(esr) \ + (((esr) & ESR_ELx_SYS64_ISS_RT_MASK) >> ESR_ELx_SYS64_ISS_RT_SHIFT) +/* + * User space cache operations have the following sysreg encoding + * in System instructions. + * op0=1, op1=3, op2=1, crn=7, crm={ 5, 10, 11, 12, 13, 14 }, WRITE (L=0) + */ +#define ESR_ELx_SYS64_ISS_CRM_DC_CIVAC 14 +#define ESR_ELx_SYS64_ISS_CRM_DC_CVADP 13 +#define ESR_ELx_SYS64_ISS_CRM_DC_CVAP 12 +#define ESR_ELx_SYS64_ISS_CRM_DC_CVAU 11 +#define ESR_ELx_SYS64_ISS_CRM_DC_CVAC 10 +#define ESR_ELx_SYS64_ISS_CRM_IC_IVAU 5 + +#define ESR_ELx_SYS64_ISS_EL0_CACHE_OP_MASK (ESR_ELx_SYS64_ISS_OP0_MASK | \ + ESR_ELx_SYS64_ISS_OP1_MASK | \ + ESR_ELx_SYS64_ISS_OP2_MASK | \ + ESR_ELx_SYS64_ISS_CRN_MASK | \ + ESR_ELx_SYS64_ISS_DIR_MASK) +#define ESR_ELx_SYS64_ISS_EL0_CACHE_OP_VAL \ + (ESR_ELx_SYS64_ISS_SYS_VAL(1, 3, 1, 7, 0) | \ + ESR_ELx_SYS64_ISS_DIR_WRITE) +/* + * User space MRS operations which are supported for emulation + * have the following sysreg encoding in System instructions. + * op0 = 3, op1= 0, crn = 0, {crm = 0, 4-7}, READ (L = 1) + */ +#define ESR_ELx_SYS64_ISS_SYS_MRS_OP_MASK (ESR_ELx_SYS64_ISS_OP0_MASK | \ + ESR_ELx_SYS64_ISS_OP1_MASK | \ + ESR_ELx_SYS64_ISS_CRN_MASK | \ + ESR_ELx_SYS64_ISS_DIR_MASK) +#define ESR_ELx_SYS64_ISS_SYS_MRS_OP_VAL \ + (ESR_ELx_SYS64_ISS_SYS_VAL(3, 0, 0, 0, 0) | \ + ESR_ELx_SYS64_ISS_DIR_READ) + +#define ESR_ELx_SYS64_ISS_SYS_CTR ESR_ELx_SYS64_ISS_SYS_VAL(3, 3, 1, 0, 0) +#define ESR_ELx_SYS64_ISS_SYS_CTR_READ (ESR_ELx_SYS64_ISS_SYS_CTR | \ + ESR_ELx_SYS64_ISS_DIR_READ) + +#define ESR_ELx_SYS64_ISS_SYS_CNTVCT (ESR_ELx_SYS64_ISS_SYS_VAL(3, 3, 2, 14, 0) | \ + ESR_ELx_SYS64_ISS_DIR_READ) + +#define ESR_ELx_SYS64_ISS_SYS_CNTVCTSS (ESR_ELx_SYS64_ISS_SYS_VAL(3, 3, 6, 14, 0) | \ + ESR_ELx_SYS64_ISS_DIR_READ) + +#define ESR_ELx_SYS64_ISS_SYS_CNTFRQ (ESR_ELx_SYS64_ISS_SYS_VAL(3, 3, 0, 14, 0) | \ + ESR_ELx_SYS64_ISS_DIR_READ) + +#define esr_sys64_to_sysreg(e) \ + sys_reg((((e) & ESR_ELx_SYS64_ISS_OP0_MASK) >> \ + ESR_ELx_SYS64_ISS_OP0_SHIFT), \ + (((e) & ESR_ELx_SYS64_ISS_OP1_MASK) >> \ + ESR_ELx_SYS64_ISS_OP1_SHIFT), \ + (((e) & ESR_ELx_SYS64_ISS_CRN_MASK) >> \ + ESR_ELx_SYS64_ISS_CRN_SHIFT), \ + (((e) & ESR_ELx_SYS64_ISS_CRM_MASK) >> \ + ESR_ELx_SYS64_ISS_CRM_SHIFT), \ + (((e) & ESR_ELx_SYS64_ISS_OP2_MASK) >> \ + ESR_ELx_SYS64_ISS_OP2_SHIFT)) + +#define esr_cp15_to_sysreg(e) \ + sys_reg(3, \ + (((e) & ESR_ELx_SYS64_ISS_OP1_MASK) >> \ + ESR_ELx_SYS64_ISS_OP1_SHIFT), \ + (((e) & ESR_ELx_SYS64_ISS_CRN_MASK) >> \ + ESR_ELx_SYS64_ISS_CRN_SHIFT), \ + (((e) & ESR_ELx_SYS64_ISS_CRM_MASK) >> \ + ESR_ELx_SYS64_ISS_CRM_SHIFT), \ + (((e) & ESR_ELx_SYS64_ISS_OP2_MASK) >> \ + ESR_ELx_SYS64_ISS_OP2_SHIFT)) + +/* ISS field definitions for ERET/ERETAA/ERETAB trapping */ +#define ESR_ELx_ERET_ISS_ERET 0x2 +#define ESR_ELx_ERET_ISS_ERETA 0x1 + +/* + * ISS field definitions for floating-point exception traps + * (FP_EXC_32/FP_EXC_64). + * + * (The FPEXC_* constants are used instead for common bits.) + */ + +#define ESR_ELx_FP_EXC_TFV (UL(1) << 23) + +/* + * ISS field definitions for CP15 accesses + */ +#define ESR_ELx_CP15_32_ISS_DIR_MASK 0x1 +#define ESR_ELx_CP15_32_ISS_DIR_READ 0x1 +#define ESR_ELx_CP15_32_ISS_DIR_WRITE 0x0 + +#define ESR_ELx_CP15_32_ISS_RT_SHIFT 5 +#define ESR_ELx_CP15_32_ISS_RT_MASK (UL(0x1f) << ESR_ELx_CP15_32_ISS_RT_SHIFT) +#define ESR_ELx_CP15_32_ISS_CRM_SHIFT 1 +#define ESR_ELx_CP15_32_ISS_CRM_MASK (UL(0xf) << ESR_ELx_CP15_32_ISS_CRM_SHIFT) +#define ESR_ELx_CP15_32_ISS_CRN_SHIFT 10 +#define ESR_ELx_CP15_32_ISS_CRN_MASK (UL(0xf) << ESR_ELx_CP15_32_ISS_CRN_SHIFT) +#define ESR_ELx_CP15_32_ISS_OP1_SHIFT 14 +#define ESR_ELx_CP15_32_ISS_OP1_MASK (UL(0x7) << ESR_ELx_CP15_32_ISS_OP1_SHIFT) +#define ESR_ELx_CP15_32_ISS_OP2_SHIFT 17 +#define ESR_ELx_CP15_32_ISS_OP2_MASK (UL(0x7) << ESR_ELx_CP15_32_ISS_OP2_SHIFT) + +#define ESR_ELx_CP15_32_ISS_SYS_MASK (ESR_ELx_CP15_32_ISS_OP1_MASK | \ + ESR_ELx_CP15_32_ISS_OP2_MASK | \ + ESR_ELx_CP15_32_ISS_CRN_MASK | \ + ESR_ELx_CP15_32_ISS_CRM_MASK | \ + ESR_ELx_CP15_32_ISS_DIR_MASK) +#define ESR_ELx_CP15_32_ISS_SYS_VAL(op1, op2, crn, crm) \ + (((op1) << ESR_ELx_CP15_32_ISS_OP1_SHIFT) | \ + ((op2) << ESR_ELx_CP15_32_ISS_OP2_SHIFT) | \ + ((crn) << ESR_ELx_CP15_32_ISS_CRN_SHIFT) | \ + ((crm) << ESR_ELx_CP15_32_ISS_CRM_SHIFT)) + +#define ESR_ELx_CP15_64_ISS_DIR_MASK 0x1 +#define ESR_ELx_CP15_64_ISS_DIR_READ 0x1 +#define ESR_ELx_CP15_64_ISS_DIR_WRITE 0x0 + +#define ESR_ELx_CP15_64_ISS_RT_SHIFT 5 +#define ESR_ELx_CP15_64_ISS_RT_MASK (UL(0x1f) << ESR_ELx_CP15_64_ISS_RT_SHIFT) + +#define ESR_ELx_CP15_64_ISS_RT2_SHIFT 10 +#define ESR_ELx_CP15_64_ISS_RT2_MASK (UL(0x1f) << ESR_ELx_CP15_64_ISS_RT2_SHIFT) + +#define ESR_ELx_CP15_64_ISS_OP1_SHIFT 16 +#define ESR_ELx_CP15_64_ISS_OP1_MASK (UL(0xf) << ESR_ELx_CP15_64_ISS_OP1_SHIFT) +#define ESR_ELx_CP15_64_ISS_CRM_SHIFT 1 +#define ESR_ELx_CP15_64_ISS_CRM_MASK (UL(0xf) << ESR_ELx_CP15_64_ISS_CRM_SHIFT) + +#define ESR_ELx_CP15_64_ISS_SYS_VAL(op1, crm) \ + (((op1) << ESR_ELx_CP15_64_ISS_OP1_SHIFT) | \ + ((crm) << ESR_ELx_CP15_64_ISS_CRM_SHIFT)) + +#define ESR_ELx_CP15_64_ISS_SYS_MASK (ESR_ELx_CP15_64_ISS_OP1_MASK | \ + ESR_ELx_CP15_64_ISS_CRM_MASK | \ + ESR_ELx_CP15_64_ISS_DIR_MASK) + +#define ESR_ELx_CP15_64_ISS_SYS_CNTVCT (ESR_ELx_CP15_64_ISS_SYS_VAL(1, 14) | \ + ESR_ELx_CP15_64_ISS_DIR_READ) + +#define ESR_ELx_CP15_64_ISS_SYS_CNTVCTSS (ESR_ELx_CP15_64_ISS_SYS_VAL(9, 14) | \ + ESR_ELx_CP15_64_ISS_DIR_READ) + +#define ESR_ELx_CP15_32_ISS_SYS_CNTFRQ (ESR_ELx_CP15_32_ISS_SYS_VAL(0, 0, 14, 0) |\ + ESR_ELx_CP15_32_ISS_DIR_READ) + +/* + * ISS values for SME traps + */ + +#define ESR_ELx_SME_ISS_SME_DISABLED 0 +#define ESR_ELx_SME_ISS_ILL 1 +#define ESR_ELx_SME_ISS_SM_DISABLED 2 +#define ESR_ELx_SME_ISS_ZA_DISABLED 3 +#define ESR_ELx_SME_ISS_ZT_DISABLED 4 + +/* ISS field definitions for MOPS exceptions */ +#define ESR_ELx_MOPS_ISS_MEM_INST (UL(1) << 24) +#define ESR_ELx_MOPS_ISS_FROM_EPILOGUE (UL(1) << 18) +#define ESR_ELx_MOPS_ISS_WRONG_OPTION (UL(1) << 17) +#define ESR_ELx_MOPS_ISS_OPTION_A (UL(1) << 16) +#define ESR_ELx_MOPS_ISS_DESTREG(esr) (((esr) & (UL(0x1f) << 10)) >> 10) +#define ESR_ELx_MOPS_ISS_SRCREG(esr) (((esr) & (UL(0x1f) << 5)) >> 5) +#define ESR_ELx_MOPS_ISS_SIZEREG(esr) (((esr) & (UL(0x1f) << 0)) >> 0) + +#ifndef __ASSEMBLY__ +#include + +static inline unsigned long esr_brk_comment(unsigned long esr) +{ + return esr & ESR_ELx_BRK64_ISS_COMMENT_MASK; +} + +static inline bool esr_is_data_abort(unsigned long esr) +{ + const unsigned long ec = ESR_ELx_EC(esr); + + return ec == ESR_ELx_EC_DABT_LOW || ec == ESR_ELx_EC_DABT_CUR; +} + +static inline bool esr_is_cfi_brk(unsigned long esr) +{ + return ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 && + (esr_brk_comment(esr) & ~CFI_BRK_IMM_MASK) == CFI_BRK_IMM_BASE; +} + +static inline bool esr_fsc_is_translation_fault(unsigned long esr) +{ + esr = esr & ESR_ELx_FSC; + + return (esr == ESR_ELx_FSC_FAULT_L(3)) || + (esr == ESR_ELx_FSC_FAULT_L(2)) || + (esr == ESR_ELx_FSC_FAULT_L(1)) || + (esr == ESR_ELx_FSC_FAULT_L(0)) || + (esr == ESR_ELx_FSC_FAULT_L(-1)); +} + +static inline bool esr_fsc_is_permission_fault(unsigned long esr) +{ + esr = esr & ESR_ELx_FSC; + + return (esr == ESR_ELx_FSC_PERM_L(3)) || + (esr == ESR_ELx_FSC_PERM_L(2)) || + (esr == ESR_ELx_FSC_PERM_L(1)) || + (esr == ESR_ELx_FSC_PERM_L(0)); +} + +static inline bool esr_fsc_is_access_flag_fault(unsigned long esr) +{ + esr = esr & ESR_ELx_FSC; + + return (esr == ESR_ELx_FSC_ACCESS_L(3)) || + (esr == ESR_ELx_FSC_ACCESS_L(2)) || + (esr == ESR_ELx_FSC_ACCESS_L(1)) || + (esr == ESR_ELx_FSC_ACCESS_L(0)); +} + +/* Indicate whether ESR.EC==0x1A is for an ERETAx instruction */ +static inline bool esr_iss_is_eretax(unsigned long esr) +{ + return esr & ESR_ELx_ERET_ISS_ERET; +} + +/* Indicate which key is used for ERETAx (false: A-Key, true: B-Key) */ +static inline bool esr_iss_is_eretab(unsigned long esr) +{ + return esr & ESR_ELx_ERET_ISS_ERETA; +} + +const char *esr_get_class_string(unsigned long esr); +#endif /* __ASSEMBLY */ + +#endif /* __ASM_ESR_H */ From c660d334b3a54f22836955ca5255edd946771614 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 25 Oct 2024 20:31:05 +0000 Subject: [PATCH 10/83] KVM: arm64: selftests: Convert to kernel's ESR terminology Drop the KVM selftests specific flavoring of ESR in favor of the kernel header. Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241025203106.3529261-4-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- .../selftests/kvm/aarch64/debug-exceptions.c | 10 +++++----- tools/testing/selftests/kvm/aarch64/no-vgic-v3.c | 2 +- .../selftests/kvm/aarch64/page_fault_test.c | 4 ++-- .../selftests/kvm/aarch64/vpmu_counter_access.c | 12 ++++++------ .../selftests/kvm/include/aarch64/processor.h | 15 ++------------- .../testing/selftests/kvm/lib/aarch64/processor.c | 6 +++--- 6 files changed, 19 insertions(+), 30 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c index 2582c49e525a..ff7a949fc96a 100644 --- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -433,15 +433,15 @@ static void test_guest_debug_exceptions(uint8_t bpn, uint8_t wpn, uint8_t ctx_bp vcpu_init_descriptor_tables(vcpu); vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, - ESR_EC_BRK_INS, guest_sw_bp_handler); + ESR_ELx_EC_BRK64, guest_sw_bp_handler); vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, - ESR_EC_HW_BP_CURRENT, guest_hw_bp_handler); + ESR_ELx_EC_BREAKPT_CUR, guest_hw_bp_handler); vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, - ESR_EC_WP_CURRENT, guest_wp_handler); + ESR_ELx_EC_WATCHPT_CUR, guest_wp_handler); vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, - ESR_EC_SSTEP_CURRENT, guest_ss_handler); + ESR_ELx_EC_SOFTSTP_CUR, guest_ss_handler); vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, - ESR_EC_SVC64, guest_svc_handler); + ESR_ELx_EC_SVC64, guest_svc_handler); /* Specify bpn/wpn/ctx_bpn to be tested */ vcpu_args_set(vcpu, 3, bpn, wpn, ctx_bpn); diff --git a/tools/testing/selftests/kvm/aarch64/no-vgic-v3.c b/tools/testing/selftests/kvm/aarch64/no-vgic-v3.c index 943d65fc6b0b..58304bbc2036 100644 --- a/tools/testing/selftests/kvm/aarch64/no-vgic-v3.c +++ b/tools/testing/selftests/kvm/aarch64/no-vgic-v3.c @@ -150,7 +150,7 @@ static void test_guest_no_gicv3(void) vcpu_init_descriptor_tables(vcpu); vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, - ESR_EC_UNKNOWN, guest_undef_handler); + ESR_ELx_EC_UNKNOWN, guest_undef_handler); test_run_vcpu(vcpu); diff --git a/tools/testing/selftests/kvm/aarch64/page_fault_test.c b/tools/testing/selftests/kvm/aarch64/page_fault_test.c index d29b08198b42..ec33a8f9c908 100644 --- a/tools/testing/selftests/kvm/aarch64/page_fault_test.c +++ b/tools/testing/selftests/kvm/aarch64/page_fault_test.c @@ -544,9 +544,9 @@ static void setup_abort_handlers(struct kvm_vm *vm, struct kvm_vcpu *vcpu, vcpu_init_descriptor_tables(vcpu); vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, - ESR_EC_DABT, no_dabt_handler); + ESR_ELx_EC_DABT_CUR, no_dabt_handler); vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, - ESR_EC_IABT, no_iabt_handler); + ESR_ELx_EC_IABT_CUR, no_iabt_handler); } static void setup_gva_maps(struct kvm_vm *vm) diff --git a/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c b/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c index d31b9f64ba14..f9c0c86d7e85 100644 --- a/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c +++ b/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c @@ -300,7 +300,7 @@ static void guest_sync_handler(struct ex_regs *regs) uint64_t esr, ec; esr = read_sysreg(esr_el1); - ec = (esr >> ESR_EC_SHIFT) & ESR_EC_MASK; + ec = ESR_ELx_EC(esr); __GUEST_ASSERT(expected_ec == ec, "PC: 0x%lx; ESR: 0x%lx; EC: 0x%lx; EC expected: 0x%lx", @@ -338,10 +338,10 @@ static void test_access_invalid_pmc_regs(struct pmc_accessor *acc, int pmc_idx) * Reading/writing the event count/type registers should cause * an UNDEFINED exception. */ - TEST_EXCEPTION(ESR_EC_UNKNOWN, acc->read_cntr(pmc_idx)); - TEST_EXCEPTION(ESR_EC_UNKNOWN, acc->write_cntr(pmc_idx, 0)); - TEST_EXCEPTION(ESR_EC_UNKNOWN, acc->read_typer(pmc_idx)); - TEST_EXCEPTION(ESR_EC_UNKNOWN, acc->write_typer(pmc_idx, 0)); + TEST_EXCEPTION(ESR_ELx_EC_UNKNOWN, acc->read_cntr(pmc_idx)); + TEST_EXCEPTION(ESR_ELx_EC_UNKNOWN, acc->write_cntr(pmc_idx, 0)); + TEST_EXCEPTION(ESR_ELx_EC_UNKNOWN, acc->read_typer(pmc_idx)); + TEST_EXCEPTION(ESR_ELx_EC_UNKNOWN, acc->write_typer(pmc_idx, 0)); /* * The bit corresponding to the (unimplemented) counter in * {PMCNTEN,PMINTEN,PMOVS}{SET,CLR} registers should be RAZ. @@ -425,7 +425,7 @@ static void create_vpmu_vm(void *guest_code) vpmu_vm.vm = vm_create(1); vm_init_descriptor_tables(vpmu_vm.vm); - for (ec = 0; ec < ESR_EC_NUM; ec++) { + for (ec = 0; ec < ESR_ELx_EC_MAX + 1; ec++) { vm_install_sync_handler(vpmu_vm.vm, VECTOR_SYNC_CURRENT, ec, guest_sync_handler); } diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h index de977d131082..1e8d0d531fbd 100644 --- a/tools/testing/selftests/kvm/include/aarch64/processor.h +++ b/tools/testing/selftests/kvm/include/aarch64/processor.h @@ -12,6 +12,8 @@ #include #include +#include +#include #include @@ -100,19 +102,6 @@ enum { (v) == VECTOR_SYNC_LOWER_64 || \ (v) == VECTOR_SYNC_LOWER_32) -#define ESR_EC_NUM 64 -#define ESR_EC_SHIFT 26 -#define ESR_EC_MASK (ESR_EC_NUM - 1) - -#define ESR_EC_UNKNOWN 0x0 -#define ESR_EC_SVC64 0x15 -#define ESR_EC_IABT 0x21 -#define ESR_EC_DABT 0x25 -#define ESR_EC_HW_BP_CURRENT 0x31 -#define ESR_EC_SSTEP_CURRENT 0x33 -#define ESR_EC_WP_CURRENT 0x35 -#define ESR_EC_BRK_INS 0x3c - /* Access flag */ #define PTE_AF (1ULL << 10) diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index fe4dc3693112..698e34f39241 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -450,7 +450,7 @@ void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) } struct handlers { - handler_fn exception_handlers[VECTOR_NUM][ESR_EC_NUM]; + handler_fn exception_handlers[VECTOR_NUM][ESR_ELx_EC_MAX + 1]; }; void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu) @@ -469,7 +469,7 @@ void route_exception(struct ex_regs *regs, int vector) switch (vector) { case VECTOR_SYNC_CURRENT: case VECTOR_SYNC_LOWER_64: - ec = (read_sysreg(esr_el1) >> ESR_EC_SHIFT) & ESR_EC_MASK; + ec = ESR_ELx_EC(read_sysreg(esr_el1)); valid_ec = true; break; case VECTOR_IRQ_CURRENT: @@ -508,7 +508,7 @@ void vm_install_sync_handler(struct kvm_vm *vm, int vector, int ec, assert(VECTOR_IS_SYNC(vector)); assert(vector < VECTOR_NUM); - assert(ec < ESR_EC_NUM); + assert(ec <= ESR_ELx_EC_MAX); handlers->exception_handlers[vector][ec] = handler; } From 3eb09a3e028e26fd26bc132c7aa0577f00de2d05 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 25 Oct 2024 20:31:06 +0000 Subject: [PATCH 11/83] KVM: arm64: selftests: Add tests for MMIO external abort injection Test that the plumbing exposed to userspace for injecting aborts in response to unexpected MMIO works as intended in two different flavors: - A 'normal' MMIO instruction (i.e. ESR_ELx.ISV=1) - An ISV=0 MMIO instruction with/without KVM_CAP_ARM_NISV_TO_USER enabled Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241025203106.3529261-5-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/Makefile | 1 + .../selftests/kvm/aarch64/mmio_abort.c | 159 ++++++++++++++++++ 2 files changed, 160 insertions(+) create mode 100644 tools/testing/selftests/kvm/aarch64/mmio_abort.c diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 960cf6a77198..98957a99ead6 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -156,6 +156,7 @@ TEST_GEN_PROGS_aarch64 += aarch64/aarch32_id_regs TEST_GEN_PROGS_aarch64 += aarch64/arch_timer_edge_cases TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions TEST_GEN_PROGS_aarch64 += aarch64/hypercalls +TEST_GEN_PROGS_aarch64 += aarch64/mmio_abort TEST_GEN_PROGS_aarch64 += aarch64/page_fault_test TEST_GEN_PROGS_aarch64 += aarch64/psci_test TEST_GEN_PROGS_aarch64 += aarch64/set_id_regs diff --git a/tools/testing/selftests/kvm/aarch64/mmio_abort.c b/tools/testing/selftests/kvm/aarch64/mmio_abort.c new file mode 100644 index 000000000000..8b7a80a51b1c --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/mmio_abort.c @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * mmio_abort - Tests for userspace MMIO abort injection + * + * Copyright (c) 2024 Google LLC + */ +#include "processor.h" +#include "test_util.h" + +#define MMIO_ADDR 0x8000000ULL + +static u64 expected_abort_pc; + +static void expect_sea_handler(struct ex_regs *regs) +{ + u64 esr = read_sysreg(esr_el1); + + GUEST_ASSERT_EQ(regs->pc, expected_abort_pc); + GUEST_ASSERT_EQ(ESR_ELx_EC(esr), ESR_ELx_EC_DABT_CUR); + GUEST_ASSERT_EQ(esr & ESR_ELx_FSC_TYPE, ESR_ELx_FSC_EXTABT); + + GUEST_DONE(); +} + +static void unexpected_dabt_handler(struct ex_regs *regs) +{ + GUEST_FAIL("Unexpected data abort at PC: %lx\n", regs->pc); +} + +static struct kvm_vm *vm_create_with_dabt_handler(struct kvm_vcpu **vcpu, void *guest_code, + handler_fn dabt_handler) +{ + struct kvm_vm *vm = vm_create_with_one_vcpu(vcpu, guest_code); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(*vcpu); + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, ESR_ELx_EC_DABT_CUR, dabt_handler); + + virt_map(vm, MMIO_ADDR, MMIO_ADDR, 1); + + return vm; +} + +static void vcpu_inject_extabt(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_events events = {}; + + events.exception.ext_dabt_pending = true; + vcpu_events_set(vcpu, &events); +} + +static void vcpu_run_expect_done(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + vcpu_run(vcpu); + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unexpected ucall: %lu", uc.cmd); + } +} + +extern char test_mmio_abort_insn; + +static void test_mmio_abort_guest(void) +{ + WRITE_ONCE(expected_abort_pc, (u64)&test_mmio_abort_insn); + + asm volatile("test_mmio_abort_insn:\n\t" + "ldr x0, [%0]\n\t" + : : "r" (MMIO_ADDR) : "x0", "memory"); + + GUEST_FAIL("MMIO instruction should not retire"); +} + +/* + * Test that KVM doesn't complete MMIO emulation when userspace has made an + * external abort pending for the instruction. + */ +static void test_mmio_abort(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm = vm_create_with_dabt_handler(&vcpu, test_mmio_abort_guest, + expect_sea_handler); + struct kvm_run *run = vcpu->run; + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_MMIO); + TEST_ASSERT_EQ(run->mmio.phys_addr, MMIO_ADDR); + TEST_ASSERT_EQ(run->mmio.len, sizeof(unsigned long)); + TEST_ASSERT(!run->mmio.is_write, "Expected MMIO read"); + + vcpu_inject_extabt(vcpu); + vcpu_run_expect_done(vcpu); + kvm_vm_free(vm); +} + +extern char test_mmio_nisv_insn; + +static void test_mmio_nisv_guest(void) +{ + WRITE_ONCE(expected_abort_pc, (u64)&test_mmio_nisv_insn); + + asm volatile("test_mmio_nisv_insn:\n\t" + "ldr x0, [%0], #8\n\t" + : : "r" (MMIO_ADDR) : "x0", "memory"); + + GUEST_FAIL("MMIO instruction should not retire"); +} + +/* + * Test that the KVM_RUN ioctl fails for ESR_EL2.ISV=0 MMIO aborts if userspace + * hasn't enabled KVM_CAP_ARM_NISV_TO_USER. + */ +static void test_mmio_nisv(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm = vm_create_with_dabt_handler(&vcpu, test_mmio_nisv_guest, + unexpected_dabt_handler); + + TEST_ASSERT(_vcpu_run(vcpu), "Expected nonzero return code from KVM_RUN"); + TEST_ASSERT_EQ(errno, ENOSYS); + + kvm_vm_free(vm); +} + +/* + * Test that ESR_EL2.ISV=0 MMIO aborts reach userspace and that an injected SEA + * reaches the guest. + */ +static void test_mmio_nisv_abort(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm = vm_create_with_dabt_handler(&vcpu, test_mmio_nisv_guest, + expect_sea_handler); + struct kvm_run *run = vcpu->run; + + vm_enable_cap(vm, KVM_CAP_ARM_NISV_TO_USER, 1); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_ARM_NISV); + TEST_ASSERT_EQ(run->arm_nisv.fault_ipa, MMIO_ADDR); + + vcpu_inject_extabt(vcpu); + vcpu_run_expect_done(vcpu); + kvm_vm_free(vm); +} + +int main(void) +{ + test_mmio_abort(); + test_mmio_nisv(); + test_mmio_nisv_abort(); +} From d41571c7097a21a5e188b401a3976b563be3fbe4 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:09 +0100 Subject: [PATCH 12/83] arm64: Drop SKL0/SKL1 from TCR2_EL2 Despite what the documentation says, TCR2_EL2.{SKL0,SKL1} do not exist, and the corresponding information is in the respective TTBRx_EL2. This is a leftover from a development version of the architecture. This change makes TCR2_EL2 similar to TCR2_EL1 in that respect. Reviewed-by: Joey Gouly Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241023145345.1613824-2-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/tools/sysreg | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 8d637ac4b7c6..ee3adec6a7c8 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2819,8 +2819,7 @@ Field 13 AMEC1 Field 12 AMEC0 Field 11 HAFT Field 10 PTTWI -Field 9:8 SKL1 -Field 7:6 SKL0 +Res0 9:6 Field 5 D128 Field 4 AIE Field 3 POE From 5792349d0cce03fa37243a3bbcca78d2338d1f53 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:10 +0100 Subject: [PATCH 13/83] arm64: Remove VNCR definition for PIRE0_EL2 As of the ARM ARM Known Issues document 102105_K.a_04_en, D22677 fixes a problem with the PIRE0_EL2 register, resulting in its removal from the VNCR page (it had no purpose being there the first place). Follow the architecture update by removing this offset. Reviewed-by: Joey Gouly Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241023145345.1613824-3-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/vncr_mapping.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/include/asm/vncr_mapping.h b/arch/arm64/include/asm/vncr_mapping.h index 06f8ec0906a6..4f9bbd4d6c26 100644 --- a/arch/arm64/include/asm/vncr_mapping.h +++ b/arch/arm64/include/asm/vncr_mapping.h @@ -50,7 +50,6 @@ #define VNCR_VBAR_EL1 0x250 #define VNCR_TCR2_EL1 0x270 #define VNCR_PIRE0_EL1 0x290 -#define VNCR_PIRE0_EL2 0x298 #define VNCR_PIR_EL1 0x2A0 #define VNCR_POR_EL1 0x2A8 #define VNCR_ICH_LR0_EL2 0x400 From 4ecda4c67961234e634295334fe69aea574c8e61 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:11 +0100 Subject: [PATCH 14/83] arm64: Add encoding for PIRE0_EL2 PIRE0_EL2 is the equivalent of PIRE0_EL1 for the EL2&0 translation regime, and it is sorely missing from the sysreg file. Add the sucker. Reviewed-by: Joey Gouly Signed-off-by: Marc Zyngier Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20241023145345.1613824-4-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/tools/sysreg | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index ee3adec6a7c8..3c812fd28eca 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2882,6 +2882,10 @@ Sysreg PIRE0_EL12 3 5 10 2 2 Fields PIRx_ELx EndSysreg +Sysreg PIRE0_EL2 3 4 10 2 2 +Fields PIRx_ELx +EndSysreg + Sysreg PIR_EL1 3 0 10 2 3 Fields PIRx_ELx EndSysreg From a5c870d0939b7a878edacb70831b8b32cbfed593 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:12 +0100 Subject: [PATCH 15/83] KVM: arm64: Drop useless struct s2_mmu in __kvm_at_s1e2() __kvm_at_s1e2() contains the definition of an s2_mmu for the current context, but doesn't make any use of it. Drop it. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241023145345.1613824-5-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/at.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 39f0e87a340e..f04677127fbc 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -992,12 +992,9 @@ void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) * switching context behind everybody's back, disable interrupts... */ scoped_guard(write_lock_irqsave, &vcpu->kvm->mmu_lock) { - struct kvm_s2_mmu *mmu; u64 val, hcr; bool fail; - mmu = &vcpu->kvm->arch.mmu; - val = hcr = read_sysreg(hcr_el2); val &= ~HCR_TGE; val |= HCR_VM; From dfeb91686992f5a973d09b66ddedf458de1acf08 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:13 +0100 Subject: [PATCH 16/83] KVM: arm64: nv: Add missing EL2->EL1 mappings in get_el2_to_el1_mapping() As KVM has grown a bunch of new system register for NV, it appears that we are missing them in the get_el2_to_el1_mapping() list. Most of them are not crucial as they don't tend to be accessed via vcpu_read_sys_reg() and vcpu_write_sys_reg(). Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241023145345.1613824-6-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/sys_regs.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index ff8c4e1b847e..ae0ecd8ae671 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -110,6 +110,14 @@ static bool get_el2_to_el1_mapping(unsigned int reg, PURE_EL2_SYSREG( RVBAR_EL2 ); PURE_EL2_SYSREG( TPIDR_EL2 ); PURE_EL2_SYSREG( HPFAR_EL2 ); + PURE_EL2_SYSREG( HCRX_EL2 ); + PURE_EL2_SYSREG( HFGRTR_EL2 ); + PURE_EL2_SYSREG( HFGWTR_EL2 ); + PURE_EL2_SYSREG( HFGITR_EL2 ); + PURE_EL2_SYSREG( HDFGRTR_EL2 ); + PURE_EL2_SYSREG( HDFGWTR_EL2 ); + PURE_EL2_SYSREG( HAFGRTR_EL2 ); + PURE_EL2_SYSREG( CNTVOFF_EL2 ); PURE_EL2_SYSREG( CNTHCTL_EL2 ); MAPPED_EL2_SYSREG(SCTLR_EL2, SCTLR_EL1, translate_sctlr_el2_to_sctlr_el1 ); @@ -130,6 +138,7 @@ static bool get_el2_to_el1_mapping(unsigned int reg, MAPPED_EL2_SYSREG(ELR_EL2, ELR_EL1, NULL ); MAPPED_EL2_SYSREG(SPSR_EL2, SPSR_EL1, NULL ); MAPPED_EL2_SYSREG(ZCR_EL2, ZCR_EL1, NULL ); + MAPPED_EL2_SYSREG(CONTEXTIDR_EL2, CONTEXTIDR_EL1, NULL ); default: return false; } From 164b5e20cdf6038f1b38867d2f6252ec6f10c356 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:14 +0100 Subject: [PATCH 17/83] KVM: arm64: nv: Handle CNTHCTL_EL2 specially Accessing CNTHCTL_EL2 is fraught with danger if running with HCR_EL2.E2H=1: half of the bits are held in CNTKCTL_EL1, and thus can be changed behind our back, while the rest lives in the CNTHCTL_EL2 shadow copy that is memory-based. Yes, this is a lot of fun! Make sure that we merge the two on read access, while we can write to CNTKCTL_EL1 in a more straightforward manner. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241023145345.1613824-7-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/sys_regs.c | 28 ++++++++++++++++++++++++++++ include/kvm/arm_arch_timer.h | 3 +++ 2 files changed, 31 insertions(+) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index ae0ecd8ae671..9fb2847f5abc 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -157,6 +157,21 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) if (!is_hyp_ctxt(vcpu)) goto memory_read; + /* + * CNTHCTL_EL2 requires some special treatment to + * account for the bits that can be set via CNTKCTL_EL1. + */ + switch (reg) { + case CNTHCTL_EL2: + if (vcpu_el2_e2h_is_set(vcpu)) { + val = read_sysreg_el1(SYS_CNTKCTL); + val &= CNTKCTL_VALID_BITS; + val |= __vcpu_sys_reg(vcpu, reg) & ~CNTKCTL_VALID_BITS; + return val; + } + break; + } + /* * If this register does not have an EL1 counterpart, * then read the stored EL2 version. @@ -207,6 +222,19 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) */ __vcpu_sys_reg(vcpu, reg) = val; + switch (reg) { + case CNTHCTL_EL2: + /* + * If E2H=0, CNHTCTL_EL2 is a pure shadow register. + * Otherwise, some of the bits are backed by + * CNTKCTL_EL1, while the rest is kept in memory. + * Yes, this is fun stuff. + */ + if (vcpu_el2_e2h_is_set(vcpu)) + write_sysreg_el1(val, SYS_CNTKCTL); + return; + } + /* No EL1 counterpart? We're done here.? */ if (reg == el1r) return; diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index c819c5d16613..fd650a8789b9 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -147,6 +147,9 @@ u64 timer_get_cval(struct arch_timer_context *ctxt); void kvm_timer_cpu_up(void); void kvm_timer_cpu_down(void); +/* CNTKCTL_EL1 valid bits as of DDI0487J.a */ +#define CNTKCTL_VALID_BITS (BIT(17) | GENMASK_ULL(9, 0)) + static inline bool has_cntpoff(void) { return (has_vhe() && cpus_have_final_cap(ARM64_HAS_ECV_CNTPOFF)); From 9ae424d2a1ae144c83ffd7c9e2f408ae5acfb634 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:18 +0100 Subject: [PATCH 18/83] arm64: Define ID_AA64MMFR1_EL1.HAFDBS advertising FEAT_HAFT This definition is missing, and we are going to need it to sanitise TCR2_ELx. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241023145345.1613824-11-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/tools/sysreg | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 3c812fd28eca..8db4431093b2 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1688,6 +1688,7 @@ UnsignedEnum 3:0 HAFDBS 0b0000 NI 0b0001 AF 0b0010 DBM + 0b0011 HAFT EndEnum EndSysreg From 69c19e047dfee63f3e5b06b4ad288bbad32fe8f0 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:19 +0100 Subject: [PATCH 19/83] KVM: arm64: Add TCR2_EL2 to the sysreg arrays Add the TCR2_EL2 register to the per-vcpu sysreg register array, the sysreg descriptor array, and advertise it as mapped to TCR2_EL1 for NV purposes. Access to this register is conditional based on ID_AA64MMFR3_EL1.TCRX being advertised. Reviewed-by: Joey Gouly Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241023145345.1613824-12-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/sys_regs.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index c753ab31dcea..b41de132fa4e 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -499,6 +499,7 @@ enum vcpu_sysreg { /* Anything from this can be RES0/RES1 sanitised */ MARKER(__SANITISED_REG_START__), + TCR2_EL2, /* Extended Translation Control Register (EL2) */ /* Any VNCR-capable reg goes after this point */ MARKER(__VNCR_START__), diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 9115a75b2c61..a435b78606c0 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -134,6 +134,7 @@ static bool get_el2_to_el1_mapping(unsigned int reg, MAPPED_EL2_SYSREG(ESR_EL2, ESR_EL1, NULL ); MAPPED_EL2_SYSREG(FAR_EL2, FAR_EL1, NULL ); MAPPED_EL2_SYSREG(MAIR_EL2, MAIR_EL1, NULL ); + MAPPED_EL2_SYSREG(TCR2_EL2, TCR2_EL1, NULL ); MAPPED_EL2_SYSREG(AMAIR_EL2, AMAIR_EL1, NULL ); MAPPED_EL2_SYSREG(ELR_EL2, ELR_EL1, NULL ); MAPPED_EL2_SYSREG(SPSR_EL2, SPSR_EL1, NULL ); @@ -452,6 +453,18 @@ static bool access_vm_reg(struct kvm_vcpu *vcpu, return true; } +static bool access_tcr2_el2(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, TCRX, IMP)) { + kvm_inject_undefined(vcpu); + return false; + } + + return access_rw(vcpu, p, r); +} + static bool access_actlr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) @@ -2866,6 +2879,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG(TTBR0_EL2, access_rw, reset_val, 0), EL2_REG(TTBR1_EL2, access_rw, reset_val, 0), EL2_REG(TCR_EL2, access_rw, reset_val, TCR_EL2_RES1), + EL2_REG(TCR2_EL2, access_tcr2_el2, reset_val, TCR2_EL2_RES1), EL2_REG_VNCR(VTTBR_EL2, reset_val, 0), EL2_REG_VNCR(VTCR_EL2, reset_val, 0), From b9527b38c66730061c245e353dab42ef7dda33c6 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:15 +0100 Subject: [PATCH 20/83] KVM: arm64: nv: Save/Restore vEL2 sysregs Whenever we need to restore the guest's system registers to the CPU, we now need to take care of the EL2 system registers as well. Most of them are accessed via traps only, but some have an immediate effect and also a guest running in VHE mode would expect them to be accessible via their EL1 encoding, which we do not trap. For vEL2 we write the virtual EL2 registers with an identical format directly into their EL1 counterpart, and translate the few registers that have a different format for the same effect on the execution when running a non-VHE guest guest hypervisor. Based on an initial patch from Andre Przywara, rewritten many times since. Reviewed-by: Alexandru Elisei Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241023145345.1613824-8-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h | 5 +- arch/arm64/kvm/hyp/nvhe/sysreg-sr.c | 2 +- arch/arm64/kvm/hyp/vhe/sysreg-sr.c | 136 ++++++++++++++++++++- 3 files changed, 138 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index 1579a3c08a36..d67628d01bf5 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -152,9 +152,10 @@ static inline void __sysreg_restore_user_state(struct kvm_cpu_context *ctxt) write_sysreg(ctxt_sys_reg(ctxt, TPIDRRO_EL0), tpidrro_el0); } -static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) +static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt, + u64 mpidr) { - write_sysreg(ctxt_sys_reg(ctxt, MPIDR_EL1), vmpidr_el2); + write_sysreg(mpidr, vmpidr_el2); if (has_vhe() || !cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { diff --git a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c index 29305022bc04..dba101565de3 100644 --- a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c @@ -28,7 +28,7 @@ void __sysreg_save_state_nvhe(struct kvm_cpu_context *ctxt) void __sysreg_restore_state_nvhe(struct kvm_cpu_context *ctxt) { - __sysreg_restore_el1_state(ctxt); + __sysreg_restore_el1_state(ctxt, ctxt_sys_reg(ctxt, MPIDR_EL1)); __sysreg_restore_common_state(ctxt); __sysreg_restore_user_state(ctxt); __sysreg_restore_el2_return_state(ctxt); diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index e12bd7d6d2dc..922aac39b021 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -15,6 +15,107 @@ #include #include +static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu) +{ + /* These registers are common with EL1 */ + __vcpu_sys_reg(vcpu, PAR_EL1) = read_sysreg(par_el1); + __vcpu_sys_reg(vcpu, TPIDR_EL1) = read_sysreg(tpidr_el1); + + __vcpu_sys_reg(vcpu, ESR_EL2) = read_sysreg_el1(SYS_ESR); + __vcpu_sys_reg(vcpu, AFSR0_EL2) = read_sysreg_el1(SYS_AFSR0); + __vcpu_sys_reg(vcpu, AFSR1_EL2) = read_sysreg_el1(SYS_AFSR1); + __vcpu_sys_reg(vcpu, FAR_EL2) = read_sysreg_el1(SYS_FAR); + __vcpu_sys_reg(vcpu, MAIR_EL2) = read_sysreg_el1(SYS_MAIR); + __vcpu_sys_reg(vcpu, VBAR_EL2) = read_sysreg_el1(SYS_VBAR); + __vcpu_sys_reg(vcpu, CONTEXTIDR_EL2) = read_sysreg_el1(SYS_CONTEXTIDR); + __vcpu_sys_reg(vcpu, AMAIR_EL2) = read_sysreg_el1(SYS_AMAIR); + + /* + * In VHE mode those registers are compatible between EL1 and EL2, + * and the guest uses the _EL1 versions on the CPU naturally. + * So we save them into their _EL2 versions here. + * For nVHE mode we trap accesses to those registers, so our + * _EL2 copy in sys_regs[] is always up-to-date and we don't need + * to save anything here. + */ + if (vcpu_el2_e2h_is_set(vcpu)) { + u64 val; + + /* + * We don't save CPTR_EL2, as accesses to CPACR_EL1 + * are always trapped, ensuring that the in-memory + * copy is always up-to-date. A small blessing... + */ + __vcpu_sys_reg(vcpu, SCTLR_EL2) = read_sysreg_el1(SYS_SCTLR); + __vcpu_sys_reg(vcpu, TTBR0_EL2) = read_sysreg_el1(SYS_TTBR0); + __vcpu_sys_reg(vcpu, TTBR1_EL2) = read_sysreg_el1(SYS_TTBR1); + __vcpu_sys_reg(vcpu, TCR_EL2) = read_sysreg_el1(SYS_TCR); + + /* + * The EL1 view of CNTKCTL_EL1 has a bunch of RES0 bits where + * the interesting CNTHCTL_EL2 bits live. So preserve these + * bits when reading back the guest-visible value. + */ + val = read_sysreg_el1(SYS_CNTKCTL); + val &= CNTKCTL_VALID_BITS; + __vcpu_sys_reg(vcpu, CNTHCTL_EL2) &= ~CNTKCTL_VALID_BITS; + __vcpu_sys_reg(vcpu, CNTHCTL_EL2) |= val; + } + + __vcpu_sys_reg(vcpu, SP_EL2) = read_sysreg(sp_el1); + __vcpu_sys_reg(vcpu, ELR_EL2) = read_sysreg_el1(SYS_ELR); + __vcpu_sys_reg(vcpu, SPSR_EL2) = read_sysreg_el1(SYS_SPSR); +} + +static void __sysreg_restore_vel2_state(struct kvm_vcpu *vcpu) +{ + u64 val; + + /* These registers are common with EL1 */ + write_sysreg(__vcpu_sys_reg(vcpu, PAR_EL1), par_el1); + write_sysreg(__vcpu_sys_reg(vcpu, TPIDR_EL1), tpidr_el1); + + write_sysreg(__vcpu_sys_reg(vcpu, MPIDR_EL1), vmpidr_el2); + write_sysreg_el1(__vcpu_sys_reg(vcpu, MAIR_EL2), SYS_MAIR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, VBAR_EL2), SYS_VBAR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, CONTEXTIDR_EL2), SYS_CONTEXTIDR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, AMAIR_EL2), SYS_AMAIR); + + if (vcpu_el2_e2h_is_set(vcpu)) { + /* + * In VHE mode those registers are compatible between + * EL1 and EL2. + */ + write_sysreg_el1(__vcpu_sys_reg(vcpu, SCTLR_EL2), SYS_SCTLR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, CPTR_EL2), SYS_CPACR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, TTBR0_EL2), SYS_TTBR0); + write_sysreg_el1(__vcpu_sys_reg(vcpu, TTBR1_EL2), SYS_TTBR1); + write_sysreg_el1(__vcpu_sys_reg(vcpu, TCR_EL2), SYS_TCR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, CNTHCTL_EL2), SYS_CNTKCTL); + } else { + /* + * CNTHCTL_EL2 only affects EL1 when running nVHE, so + * no need to restore it. + */ + val = translate_sctlr_el2_to_sctlr_el1(__vcpu_sys_reg(vcpu, SCTLR_EL2)); + write_sysreg_el1(val, SYS_SCTLR); + val = translate_cptr_el2_to_cpacr_el1(__vcpu_sys_reg(vcpu, CPTR_EL2)); + write_sysreg_el1(val, SYS_CPACR); + val = translate_ttbr0_el2_to_ttbr0_el1(__vcpu_sys_reg(vcpu, TTBR0_EL2)); + write_sysreg_el1(val, SYS_TTBR0); + val = translate_tcr_el2_to_tcr_el1(__vcpu_sys_reg(vcpu, TCR_EL2)); + write_sysreg_el1(val, SYS_TCR); + } + + write_sysreg_el1(__vcpu_sys_reg(vcpu, ESR_EL2), SYS_ESR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, AFSR0_EL2), SYS_AFSR0); + write_sysreg_el1(__vcpu_sys_reg(vcpu, AFSR1_EL2), SYS_AFSR1); + write_sysreg_el1(__vcpu_sys_reg(vcpu, FAR_EL2), SYS_FAR); + write_sysreg(__vcpu_sys_reg(vcpu, SP_EL2), sp_el1); + write_sysreg_el1(__vcpu_sys_reg(vcpu, ELR_EL2), SYS_ELR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, SPSR_EL2), SYS_SPSR); +} + /* * VHE: Host and guest must save mdscr_el1 and sp_el0 (and the PC and * pstate, which are handled as part of the el2 return state) on every @@ -66,6 +167,7 @@ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt; struct kvm_cpu_context *host_ctxt; + u64 mpidr; host_ctxt = host_data_ptr(host_ctxt); __sysreg_save_user_state(host_ctxt); @@ -89,7 +191,29 @@ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu) */ __sysreg32_restore_state(vcpu); __sysreg_restore_user_state(guest_ctxt); - __sysreg_restore_el1_state(guest_ctxt); + + if (unlikely(__is_hyp_ctxt(guest_ctxt))) { + __sysreg_restore_vel2_state(vcpu); + } else { + if (vcpu_has_nv(vcpu)) { + /* + * Use the guest hypervisor's VPIDR_EL2 when in a + * nested state. The hardware value of MIDR_EL1 gets + * restored on put. + */ + write_sysreg(ctxt_sys_reg(guest_ctxt, VPIDR_EL2), vpidr_el2); + + /* + * As we're restoring a nested guest, set the value + * provided by the guest hypervisor. + */ + mpidr = ctxt_sys_reg(guest_ctxt, VMPIDR_EL2); + } else { + mpidr = ctxt_sys_reg(guest_ctxt, MPIDR_EL1); + } + + __sysreg_restore_el1_state(guest_ctxt, mpidr); + } vcpu_set_flag(vcpu, SYSREGS_ON_CPU); } @@ -112,12 +236,20 @@ void __vcpu_put_switch_sysregs(struct kvm_vcpu *vcpu) host_ctxt = host_data_ptr(host_ctxt); - __sysreg_save_el1_state(guest_ctxt); + if (unlikely(__is_hyp_ctxt(guest_ctxt))) + __sysreg_save_vel2_state(vcpu); + else + __sysreg_save_el1_state(guest_ctxt); + __sysreg_save_user_state(guest_ctxt); __sysreg32_save_state(vcpu); /* Restore host user state */ __sysreg_restore_user_state(host_ctxt); + /* If leaving a nesting guest, restore MIDR_EL1 default view */ + if (vcpu_has_nv(vcpu)) + write_sysreg(read_cpuid_id(), vpidr_el2); + vcpu_clear_flag(vcpu, SYSREGS_ON_CPU); } From ad4f6ef0fa19d0418e4087fd6783679c3fdfa888 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:20 +0100 Subject: [PATCH 21/83] KVM: arm64: Sanitise TCR2_EL2 TCR2_EL2 is a bag of control bits, all of which are only valid if certain features are present, and RES0 otherwise. Describe these constraints and register them with the masking infrastructure. Signed-off-by: Marc Zyngier Reviewed-by: Joey Gouly Link: https://lore.kernel.org/r/20241023145345.1613824-13-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/nested.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 26103d6514bd..1c8a6aa907df 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1204,6 +1204,28 @@ int kvm_init_nv_sysregs(struct kvm *kvm) res0 |= ~(res0 | res1); set_sysreg_masks(kvm, HAFGRTR_EL2, res0, res1); + /* TCR2_EL2 */ + res0 = TCR2_EL2_RES0; + res1 = TCR2_EL2_RES1; + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, D128, IMP)) + res0 |= (TCR2_EL2_DisCH0 | TCR2_EL2_DisCH1 | TCR2_EL2_D128); + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, MEC, IMP)) + res0 |= TCR2_EL2_AMEC1 | TCR2_EL2_AMEC0; + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, HAFDBS, HAFT)) + res0 |= TCR2_EL2_HAFT; + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, THE, IMP)) + res0 |= TCR2_EL2_PTTWI | TCR2_EL2_PnCH; + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, AIE, IMP)) + res0 |= TCR2_EL2_AIE; + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1POE, IMP)) + res0 |= TCR2_EL2_POE | TCR2_EL2_E0POE; + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1PIE, IMP)) + res0 |= TCR2_EL2_PIE; + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, VH, IMP)) + res0 |= (TCR2_EL2_E0POE | TCR2_EL2_D128 | + TCR2_EL2_AMEC1 | TCR2_EL2_DisCH0 | TCR2_EL2_DisCH1); + set_sysreg_masks(kvm, TCR2_EL2, res0, res1); + /* SCTLR_EL1 */ res0 = SCTLR_EL1_RES0; res1 = SCTLR_EL1_RES1; From 14ca930d828b6bd0538a0c7e101b52319ae7ad35 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:16 +0100 Subject: [PATCH 22/83] KVM: arm64: Correctly access TCR2_EL1, PIR_EL1, PIRE0_EL1 with VHE For code that accesses any of the guest registers for emulation purposes, it is crucial to know where the most up-to-date data is. While this is pretty clear for nVHE (memory is the sole repository), things are a lot muddier for VHE, as depending on the SYSREGS_ON_CPU flag, registers can either be loaded on the HW or be in memory. Even worse with NV, where the loaded state is by definition partial. For these reasons, KVM offers the vcpu_read_sys_reg() and vcpu_write_sys_reg() primitives that always do the right thing. However, these primitive must know what register to access, and this is the role of the __vcpu_read_sys_reg_from_cpu() and __vcpu_write_sys_reg_to_cpu() helpers. As it turns out, TCR2_EL1, PIR_EL1, PIRE0_EL1 and not described in the latter helpers, meaning that the AT code cannot use them to emulate S1PIE. Add the three registers to the (long) list. Fixes: 86f9de9db178 ("KVM: arm64: Save/restore PIE registers") Signed-off-by: Marc Zyngier Cc: Joey Gouly Reviewed-by: Joey Gouly Link: https://lore.kernel.org/r/20241023145345.1613824-9-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_host.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index bf64fed9820e..b89f19b4195d 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1037,6 +1037,9 @@ static inline bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val) case TTBR0_EL1: *val = read_sysreg_s(SYS_TTBR0_EL12); break; case TTBR1_EL1: *val = read_sysreg_s(SYS_TTBR1_EL12); break; case TCR_EL1: *val = read_sysreg_s(SYS_TCR_EL12); break; + case TCR2_EL1: *val = read_sysreg_s(SYS_TCR2_EL12); break; + case PIR_EL1: *val = read_sysreg_s(SYS_PIR_EL12); break; + case PIRE0_EL1: *val = read_sysreg_s(SYS_PIRE0_EL12); break; case ESR_EL1: *val = read_sysreg_s(SYS_ESR_EL12); break; case AFSR0_EL1: *val = read_sysreg_s(SYS_AFSR0_EL12); break; case AFSR1_EL1: *val = read_sysreg_s(SYS_AFSR1_EL12); break; @@ -1083,6 +1086,9 @@ static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg) case TTBR0_EL1: write_sysreg_s(val, SYS_TTBR0_EL12); break; case TTBR1_EL1: write_sysreg_s(val, SYS_TTBR1_EL12); break; case TCR_EL1: write_sysreg_s(val, SYS_TCR_EL12); break; + case TCR2_EL1: write_sysreg_s(val, SYS_TCR2_EL12); break; + case PIR_EL1: write_sysreg_s(val, SYS_PIR_EL12); break; + case PIRE0_EL1: write_sysreg_s(val, SYS_PIRE0_EL12); break; case ESR_EL1: write_sysreg_s(val, SYS_ESR_EL12); break; case AFSR0_EL1: write_sysreg_s(val, SYS_AFSR0_EL12); break; case AFSR1_EL1: write_sysreg_s(val, SYS_AFSR1_EL12); break; From 5055938452ede673baf13bda4fe84d8fac2bee76 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:21 +0100 Subject: [PATCH 23/83] KVM: arm64: Add save/restore for TCR2_EL2 Like its EL1 equivalent, TCR2_EL2 gets context-switched. This is made conditional on FEAT_TCRX being adversised. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241023145345.1613824-14-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/vhe/sysreg-sr.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index 922aac39b021..cdbf52bfc483 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -51,6 +51,9 @@ static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu) __vcpu_sys_reg(vcpu, TTBR1_EL2) = read_sysreg_el1(SYS_TTBR1); __vcpu_sys_reg(vcpu, TCR_EL2) = read_sysreg_el1(SYS_TCR); + if (ctxt_has_tcrx(&vcpu->arch.ctxt)) + __vcpu_sys_reg(vcpu, TCR2_EL2) = read_sysreg_el1(SYS_TCR2); + /* * The EL1 view of CNTKCTL_EL1 has a bunch of RES0 bits where * the interesting CNTHCTL_EL2 bits live. So preserve these @@ -107,6 +110,10 @@ static void __sysreg_restore_vel2_state(struct kvm_vcpu *vcpu) write_sysreg_el1(val, SYS_TCR); } + if (ctxt_has_tcrx(&vcpu->arch.ctxt)) + write_sysreg_el1(__vcpu_sys_reg(vcpu, TCR2_EL2), SYS_TCR2); + + write_sysreg_el1(__vcpu_sys_reg(vcpu, ESR_EL2), SYS_ESR); write_sysreg_el1(__vcpu_sys_reg(vcpu, AFSR0_EL2), SYS_AFSR0); write_sysreg_el1(__vcpu_sys_reg(vcpu, AFSR1_EL2), SYS_AFSR1); From a0162020095e2a34a59fc64c07183cc039e759f6 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:17 +0100 Subject: [PATCH 24/83] KVM: arm64: Extend masking facility to arbitrary registers We currently only use the masking (RES0/RES1) facility for VNCR registers, as they are memory-based and thus easy to sanitise. But we could apply the same thing to other registers if we: - split the sanitisation from __VNCR_START__ - apply the sanitisation when reading from a HW register This involves a new "marker" in the vcpu_sysreg enum, which defines the point at which the sanitisation applies (the VNCR registers being of course after this marker). Whle we are at it, rename kvm_vcpu_sanitise_vncr_reg() to kvm_vcpu_apply_reg_masks(), which is vaguely more explicit, and harden set_sysreg_masks() against setting masks for random registers... Signed-off-by: Marc Zyngier Reviewed-by: Joey Gouly Link: https://lore.kernel.org/r/20241023145345.1613824-10-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_host.h | 19 +++++++++++++------ arch/arm64/kvm/nested.c | 12 ++++++++---- arch/arm64/kvm/sys_regs.c | 3 +++ 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index b89f19b4195d..c753ab31dcea 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -374,7 +374,7 @@ struct kvm_arch { u64 ctr_el0; - /* Masks for VNCR-baked sysregs */ + /* Masks for VNCR-backed and general EL2 sysregs */ struct kvm_sysreg_masks *sysreg_masks; /* @@ -408,6 +408,9 @@ struct kvm_vcpu_fault_info { r = __VNCR_START__ + ((VNCR_ ## r) / 8), \ __after_##r = __MAX__(__before_##r - 1, r) +#define MARKER(m) \ + m, __after_##m = m - 1 + enum vcpu_sysreg { __INVALID_SYSREG__, /* 0 is reserved as an invalid value */ MPIDR_EL1, /* MultiProcessor Affinity Register */ @@ -494,7 +497,11 @@ enum vcpu_sysreg { CNTHV_CTL_EL2, CNTHV_CVAL_EL2, - __VNCR_START__, /* Any VNCR-capable reg goes after this point */ + /* Anything from this can be RES0/RES1 sanitised */ + MARKER(__SANITISED_REG_START__), + + /* Any VNCR-capable reg goes after this point */ + MARKER(__VNCR_START__), VNCR(SCTLR_EL1),/* System Control Register */ VNCR(ACTLR_EL1),/* Auxiliary Control Register */ @@ -554,7 +561,7 @@ struct kvm_sysreg_masks { struct { u64 res0; u64 res1; - } mask[NR_SYS_REGS - __VNCR_START__]; + } mask[NR_SYS_REGS - __SANITISED_REG_START__]; }; struct kvm_cpu_context { @@ -1002,13 +1009,13 @@ static inline u64 *___ctxt_sys_reg(const struct kvm_cpu_context *ctxt, int r) #define ctxt_sys_reg(c,r) (*__ctxt_sys_reg(c,r)) -u64 kvm_vcpu_sanitise_vncr_reg(const struct kvm_vcpu *, enum vcpu_sysreg); +u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *, enum vcpu_sysreg, u64); #define __vcpu_sys_reg(v,r) \ (*({ \ const struct kvm_cpu_context *ctxt = &(v)->arch.ctxt; \ u64 *__r = __ctxt_sys_reg(ctxt, (r)); \ - if (vcpu_has_nv((v)) && (r) >= __VNCR_START__) \ - *__r = kvm_vcpu_sanitise_vncr_reg((v), (r)); \ + if (vcpu_has_nv((v)) && (r) >= __SANITISED_REG_START__) \ + *__r = kvm_vcpu_apply_reg_masks((v), (r), *__r);\ __r; \ })) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index c4b17d90fc49..26103d6514bd 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -933,15 +933,15 @@ static void limit_nv_id_regs(struct kvm *kvm) kvm_set_vm_id_reg(kvm, SYS_ID_AA64DFR0_EL1, val); } -u64 kvm_vcpu_sanitise_vncr_reg(const struct kvm_vcpu *vcpu, enum vcpu_sysreg sr) +u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *vcpu, + enum vcpu_sysreg sr, u64 v) { - u64 v = ctxt_sys_reg(&vcpu->arch.ctxt, sr); struct kvm_sysreg_masks *masks; masks = vcpu->kvm->arch.sysreg_masks; if (masks) { - sr -= __VNCR_START__; + sr -= __SANITISED_REG_START__; v &= ~masks->mask[sr].res0; v |= masks->mask[sr].res1; @@ -952,7 +952,11 @@ u64 kvm_vcpu_sanitise_vncr_reg(const struct kvm_vcpu *vcpu, enum vcpu_sysreg sr) static void set_sysreg_masks(struct kvm *kvm, int sr, u64 res0, u64 res1) { - int i = sr - __VNCR_START__; + int i = sr - __SANITISED_REG_START__; + + BUILD_BUG_ON(!__builtin_constant_p(sr)); + BUILD_BUG_ON(sr < __SANITISED_REG_START__); + BUILD_BUG_ON(sr >= NR_SYS_REGS); kvm->arch.sysreg_masks->mask[i].res0 = res0; kvm->arch.sysreg_masks->mask[i].res1 = res1; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 9fb2847f5abc..9115a75b2c61 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -189,6 +189,9 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) /* Get the current version of the EL1 counterpart. */ WARN_ON(!__vcpu_read_sys_reg_from_cpu(el1r, &val)); + if (reg >= __SANITISED_REG_START__) + val = kvm_vcpu_apply_reg_masks(vcpu, reg, val); + return val; } From 5f8d5a15ef5a6ebf2c568101c20d4880a970a874 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:22 +0100 Subject: [PATCH 25/83] KVM: arm64: Add PIR{,E0}_EL2 to the sysreg arrays Add the FEAT_S1PIE EL2 registers to the per-vcpu sysreg register array. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241023145345.1613824-15-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_host.h | 2 ++ arch/arm64/kvm/sys_regs.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index b41de132fa4e..553fb07f2153 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -478,6 +478,8 @@ enum vcpu_sysreg { TTBR0_EL2, /* Translation Table Base Register 0 (EL2) */ TTBR1_EL2, /* Translation Table Base Register 1 (EL2) */ TCR_EL2, /* Translation Control Register (EL2) */ + PIRE0_EL2, /* Permission Indirection Register 0 (EL2) */ + PIR_EL2, /* Permission Indirection Register 1 (EL2) */ SPSR_EL2, /* EL2 saved program status register */ ELR_EL2, /* EL2 exception link register */ AFSR0_EL2, /* Auxiliary Fault Status Register 0 (EL2) */ diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index a435b78606c0..9cbcd69d45fc 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -135,6 +135,8 @@ static bool get_el2_to_el1_mapping(unsigned int reg, MAPPED_EL2_SYSREG(FAR_EL2, FAR_EL1, NULL ); MAPPED_EL2_SYSREG(MAIR_EL2, MAIR_EL1, NULL ); MAPPED_EL2_SYSREG(TCR2_EL2, TCR2_EL1, NULL ); + MAPPED_EL2_SYSREG(PIR_EL2, PIR_EL1, NULL ); + MAPPED_EL2_SYSREG(PIRE0_EL2, PIRE0_EL1, NULL ); MAPPED_EL2_SYSREG(AMAIR_EL2, AMAIR_EL1, NULL ); MAPPED_EL2_SYSREG(ELR_EL2, ELR_EL1, NULL ); MAPPED_EL2_SYSREG(SPSR_EL2, SPSR_EL1, NULL ); From b3ad940a088761fd183dccd65c6ee20b360e8c4b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:23 +0100 Subject: [PATCH 26/83] KVM: arm64: Add save/restore for PIR{,E0}_EL2 Like their EL1 equivalent, the EL2-specific FEAT_S1PIE registers are context-switched. This is made conditional on both FEAT_TCRX and FEAT_S1PIE being adversised. Note that this change only makes sense if read together with the issue D22677 contained in 102105_K.a_04_en. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241023145345.1613824-16-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/vhe/sysreg-sr.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index cdbf52bfc483..a603966726f6 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -51,9 +51,15 @@ static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu) __vcpu_sys_reg(vcpu, TTBR1_EL2) = read_sysreg_el1(SYS_TTBR1); __vcpu_sys_reg(vcpu, TCR_EL2) = read_sysreg_el1(SYS_TCR); - if (ctxt_has_tcrx(&vcpu->arch.ctxt)) + if (ctxt_has_tcrx(&vcpu->arch.ctxt)) { __vcpu_sys_reg(vcpu, TCR2_EL2) = read_sysreg_el1(SYS_TCR2); + if (ctxt_has_s1pie(&vcpu->arch.ctxt)) { + __vcpu_sys_reg(vcpu, PIRE0_EL2) = read_sysreg_el1(SYS_PIRE0); + __vcpu_sys_reg(vcpu, PIR_EL2) = read_sysreg_el1(SYS_PIR); + } + } + /* * The EL1 view of CNTKCTL_EL1 has a bunch of RES0 bits where * the interesting CNTHCTL_EL2 bits live. So preserve these @@ -110,9 +116,14 @@ static void __sysreg_restore_vel2_state(struct kvm_vcpu *vcpu) write_sysreg_el1(val, SYS_TCR); } - if (ctxt_has_tcrx(&vcpu->arch.ctxt)) + if (ctxt_has_tcrx(&vcpu->arch.ctxt)) { write_sysreg_el1(__vcpu_sys_reg(vcpu, TCR2_EL2), SYS_TCR2); + if (ctxt_has_s1pie(&vcpu->arch.ctxt)) { + write_sysreg_el1(__vcpu_sys_reg(vcpu, PIR_EL2), SYS_PIR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, PIRE0_EL2), SYS_PIRE0); + } + } write_sysreg_el1(__vcpu_sys_reg(vcpu, ESR_EL2), SYS_ESR); write_sysreg_el1(__vcpu_sys_reg(vcpu, AFSR0_EL2), SYS_AFSR0); From 874ae1d48e607d41ae08fa72a9ed76cb62651085 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:24 +0100 Subject: [PATCH 27/83] KVM: arm64: Handle PIR{,E0}_EL2 traps Add the FEAT_S1PIE EL2 registers the sysreg descriptor array so that they can be handled as a trap. Access to these registers is conditional based on ID_AA64MMFR3_EL1.S1PIE being advertised. Similarly to other other changes, PIRE0_EL2 is guaranteed to trap thanks to the D22677 update to the architecture. Reviewed-by: Joey Gouly Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241023145345.1613824-17-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/sys_regs.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 9cbcd69d45fc..2d30266a4c65 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -369,6 +369,18 @@ static bool access_rw(struct kvm_vcpu *vcpu, return true; } +static bool check_s1pie_access_rw(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S1PIE, IMP)) { + kvm_inject_undefined(vcpu); + return false; + } + + return access_rw(vcpu, p, r); +} + /* * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized). */ @@ -2909,6 +2921,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG(HPFAR_EL2, access_rw, reset_val, 0), EL2_REG(MAIR_EL2, access_rw, reset_val, 0), + EL2_REG(PIRE0_EL2, check_s1pie_access_rw, reset_val, 0), + EL2_REG(PIR_EL2, check_s1pie_access_rw, reset_val, 0), EL2_REG(AMAIR_EL2, access_rw, reset_val, 0), EL2_REG(VBAR_EL2, access_rw, reset_val, 0), From 23e7a34c8397d1ecff430b3500ad5c8bdea10a5c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:26 +0100 Subject: [PATCH 28/83] KVM: arm64: Add AT fast-path support for S1PIE Emulating AT using AT instructions requires that the live state matches the translation regime the AT instruction targets. If targeting the EL1&0 translation regime and that S1PIE is supported, we also need to restore that state (covering TCR2_EL1, PIR_EL1, and PIRE0_EL1). Add the required system register switcheroo. Signed-off-by: Marc Zyngier Reviewed-by: Joey Gouly Link: https://lore.kernel.org/r/20241023145345.1613824-19-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/at.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index f04677127fbc..b9d0992e9197 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -412,6 +412,9 @@ struct mmu_config { u64 ttbr1; u64 tcr; u64 mair; + u64 tcr2; + u64 pir; + u64 pire0; u64 sctlr; u64 vttbr; u64 vtcr; @@ -424,6 +427,13 @@ static void __mmu_config_save(struct mmu_config *config) config->ttbr1 = read_sysreg_el1(SYS_TTBR1); config->tcr = read_sysreg_el1(SYS_TCR); config->mair = read_sysreg_el1(SYS_MAIR); + if (cpus_have_final_cap(ARM64_HAS_TCR2)) { + config->tcr2 = read_sysreg_el1(SYS_TCR2); + if (cpus_have_final_cap(ARM64_HAS_S1PIE)) { + config->pir = read_sysreg_el1(SYS_PIR); + config->pire0 = read_sysreg_el1(SYS_PIRE0); + } + } config->sctlr = read_sysreg_el1(SYS_SCTLR); config->vttbr = read_sysreg(vttbr_el2); config->vtcr = read_sysreg(vtcr_el2); @@ -444,6 +454,13 @@ static void __mmu_config_restore(struct mmu_config *config) write_sysreg_el1(config->ttbr1, SYS_TTBR1); write_sysreg_el1(config->tcr, SYS_TCR); write_sysreg_el1(config->mair, SYS_MAIR); + if (cpus_have_final_cap(ARM64_HAS_TCR2)) { + write_sysreg_el1(config->tcr2, SYS_TCR2); + if (cpus_have_final_cap(ARM64_HAS_S1PIE)) { + write_sysreg_el1(config->pir, SYS_PIR); + write_sysreg_el1(config->pire0, SYS_PIRE0); + } + } write_sysreg_el1(config->sctlr, SYS_SCTLR); write_sysreg(config->vttbr, vttbr_el2); write_sysreg(config->vtcr, vtcr_el2); @@ -914,6 +931,13 @@ static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1), SYS_TTBR1); write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1), SYS_TCR); write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1), SYS_MAIR); + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, TCRX, IMP)) { + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR2_EL1), SYS_TCR2); + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S1PIE, IMP)) { + write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIR_EL1), SYS_PIR); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIRE0_EL1), SYS_PIRE0); + } + } write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1), SYS_SCTLR); __load_stage2(mmu, mmu->arch); From 4967b87a9ff7cb19bd85dd985616e08d0f08b07b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:27 +0100 Subject: [PATCH 29/83] KVM: arm64: Split S1 permission evaluation into direct and hierarchical parts The AArch64.S1DirectBasePermissions() pseudocode deals with both direct and hierarchical S1 permission evaluation. While this is probably convenient in the pseudocode, we would like a bit more flexibility to slot things like indirect permissions. To that effect, split the two permission check parts out of handle_at_slow() and into their own functions. The permissions are passed around as part of the walk_result structure. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241023145345.1613824-20-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/at.c | 174 ++++++++++++++++++++++++++------------------ 1 file changed, 104 insertions(+), 70 deletions(-) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index b9d0992e9197..adcfce3f67f0 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -37,6 +37,12 @@ struct s1_walk_result { u8 APTable; bool UXNTable; bool PXNTable; + bool ur; + bool uw; + bool ux; + bool pr; + bool pw; + bool px; }; struct { u8 fst; @@ -764,11 +770,103 @@ static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime) return sctlr & SCTLR_EL1_EPAN; } +static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu, + struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + /* Non-hierarchical part of AArch64.S1DirectBasePermissions() */ + if (wi->regime != TR_EL2) { + switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr->desc)) { + case 0b00: + wr->pr = wr->pw = true; + wr->ur = wr->uw = false; + break; + case 0b01: + wr->pr = wr->pw = wr->ur = wr->uw = true; + break; + case 0b10: + wr->pr = true; + wr->pw = wr->ur = wr->uw = false; + break; + case 0b11: + wr->pr = wr->ur = true; + wr->pw = wr->uw = false; + break; + } + + /* We don't use px for anything yet, but hey... */ + wr->px = !((wr->desc & PTE_PXN) || wr->uw); + wr->ux = !(wr->desc & PTE_UXN); + } else { + wr->ur = wr->uw = wr->ux = false; + + if (!(wr->desc & PTE_RDONLY)) { + wr->pr = wr->pw = true; + } else { + wr->pr = true; + wr->pw = false; + } + + /* XN maps to UXN */ + wr->px = !(wr->desc & PTE_UXN); + } +} + +static void compute_s1_hierarchical_permissions(struct kvm_vcpu *vcpu, + struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + /* Hierarchical part of AArch64.S1DirectBasePermissions() */ + if (wi->regime != TR_EL2) { + switch (wr->APTable) { + case 0b00: + break; + case 0b01: + wr->ur = wr->uw = false; + break; + case 0b10: + wr->pw = wr->uw = false; + break; + case 0b11: + wr->pw = wr->ur = wr->uw = false; + break; + } + + wr->px &= !wr->PXNTable; + wr->ux &= !wr->UXNTable; + } else { + if (wr->APTable & BIT(1)) + wr->pw = false; + + /* XN maps to UXN */ + wr->px &= !wr->UXNTable; + } +} + +static void compute_s1_permissions(struct kvm_vcpu *vcpu, u32 op, + struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + compute_s1_direct_permissions(vcpu, wi, wr); + + if (!wi->hpd) + compute_s1_hierarchical_permissions(vcpu, wi, wr); + + if (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) { + bool pan; + + pan = *vcpu_cpsr(vcpu) & PSR_PAN_BIT; + pan &= wr->ur || wr->uw || (pan3_enabled(vcpu, wi->regime) && wr->ux); + wr->pw &= !pan; + wr->pr &= !pan; + } +} + static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) { - bool perm_fail, ur, uw, ux, pr, pw, px; struct s1_walk_result wr = {}; struct s1_walk_info wi = {}; + bool perm_fail = false; int ret, idx; ret = setup_s1_walk(vcpu, op, &wi, &wr, vaddr); @@ -787,88 +885,24 @@ static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) if (ret) goto compute_par; - /* FIXME: revisit when adding indirect permission support */ - /* AArch64.S1DirectBasePermissions() */ - if (wi.regime != TR_EL2) { - switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr.desc)) { - case 0b00: - pr = pw = true; - ur = uw = false; - break; - case 0b01: - pr = pw = ur = uw = true; - break; - case 0b10: - pr = true; - pw = ur = uw = false; - break; - case 0b11: - pr = ur = true; - pw = uw = false; - break; - } - - switch (wr.APTable) { - case 0b00: - break; - case 0b01: - ur = uw = false; - break; - case 0b10: - pw = uw = false; - break; - case 0b11: - pw = ur = uw = false; - break; - } - - /* We don't use px for anything yet, but hey... */ - px = !((wr.desc & PTE_PXN) || wr.PXNTable || uw); - ux = !((wr.desc & PTE_UXN) || wr.UXNTable); - - if (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) { - bool pan; - - pan = *vcpu_cpsr(vcpu) & PSR_PAN_BIT; - pan &= ur || uw || (pan3_enabled(vcpu, wi.regime) && ux); - pw &= !pan; - pr &= !pan; - } - } else { - ur = uw = ux = false; - - if (!(wr.desc & PTE_RDONLY)) { - pr = pw = true; - } else { - pr = true; - pw = false; - } - - if (wr.APTable & BIT(1)) - pw = false; - - /* XN maps to UXN */ - px = !((wr.desc & PTE_UXN) || wr.UXNTable); - } - - perm_fail = false; + compute_s1_permissions(vcpu, op, &wi, &wr); switch (op) { case OP_AT_S1E1RP: case OP_AT_S1E1R: case OP_AT_S1E2R: - perm_fail = !pr; + perm_fail = !wr.pr; break; case OP_AT_S1E1WP: case OP_AT_S1E1W: case OP_AT_S1E2W: - perm_fail = !pw; + perm_fail = !wr.pw; break; case OP_AT_S1E0R: - perm_fail = !ur; + perm_fail = !wr.ur; break; case OP_AT_S1E0W: - perm_fail = !uw; + perm_fail = !wr.uw; break; case OP_AT_S1E1A: case OP_AT_S1E2A: From 5e21b297872237a96a23b637e670548987a09bb9 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:28 +0100 Subject: [PATCH 30/83] KVM: arm64: Disable hierarchical permissions when S1PIE is enabled S1PIE implicitly disables hierarchical permissions, as specified in R_JHSVW, by making TCR_ELx.HPDn RES1. Add a predicate for S1PIE being enabled for a given translation regime, and emulate this behaviour by forcing the hpd field to true if S1PIE is enabled for that translation regime. Signed-off-by: Marc Zyngier Reviewed-by: Joey Gouly Link: https://lore.kernel.org/r/20241023145345.1613824-21-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/at.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index adcfce3f67f0..f5bd750288ff 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -93,6 +93,23 @@ static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 o } } +static bool s1pie_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime) +{ + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S1PIE, IMP)) + return false; + + switch (regime) { + case TR_EL2: + case TR_EL20: + return vcpu_read_sys_reg(vcpu, TCR2_EL2) & TCR2_EL2_PIE; + case TR_EL10: + return (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En) && + (__vcpu_sys_reg(vcpu, TCR2_EL1) & TCR2_EL1x_PIE); + default: + BUG(); + } +} + static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, struct s1_walk_result *wr, u64 va) { @@ -186,6 +203,8 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, (va55 ? FIELD_GET(TCR_HPD1, tcr) : FIELD_GET(TCR_HPD0, tcr))); + /* R_JHSVW */ + wi->hpd |= s1pie_enabled(vcpu, wi->regime); /* Someone was silly enough to encode TG0/TG1 differently */ if (va55) { From 364c081029a68b47e0ecb475a0cf337a89c9f960 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:29 +0100 Subject: [PATCH 31/83] KVM: arm64: Implement AT S1PIE support It doesn't take much effort to implement S1PIE support in AT. It is only a matter of using the AArch64.S1IndirectBasePermissions() encodings for the permission, ignoring GCS which has no impact on AT, and enforce FEAT_PAN3 being enabled as this is a requirement of FEAT_S1PIE. Signed-off-by: Marc Zyngier Reviewed-by: Joey Gouly Link: https://lore.kernel.org/r/20241023145345.1613824-22-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/at.c | 117 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 116 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index f5bd750288ff..3d93ed179560 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -781,6 +781,9 @@ static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime) if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3)) return false; + if (s1pie_enabled(vcpu, regime)) + return true; + if (regime == TR_EL10) sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); else @@ -862,11 +865,123 @@ static void compute_s1_hierarchical_permissions(struct kvm_vcpu *vcpu, } } +#define perm_idx(v, r, i) ((vcpu_read_sys_reg((v), (r)) >> ((i) * 4)) & 0xf) + +#define set_priv_perms(wr, r, w, x) \ + do { \ + (wr)->pr = (r); \ + (wr)->pw = (w); \ + (wr)->px = (x); \ + } while (0) + +#define set_unpriv_perms(wr, r, w, x) \ + do { \ + (wr)->ur = (r); \ + (wr)->uw = (w); \ + (wr)->ux = (x); \ + } while (0) + +/* Similar to AArch64.S1IndirectBasePermissions(), without GCS */ +#define set_perms(w, wr, ip) \ + do { \ + /* R_LLZDZ */ \ + switch ((ip)) { \ + case 0b0000: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + case 0b0001: \ + set_ ## w ## _perms((wr), true , false, false); \ + break; \ + case 0b0010: \ + set_ ## w ## _perms((wr), false, false, true ); \ + break; \ + case 0b0011: \ + set_ ## w ## _perms((wr), true , false, true ); \ + break; \ + case 0b0100: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + case 0b0101: \ + set_ ## w ## _perms((wr), true , true , false); \ + break; \ + case 0b0110: \ + set_ ## w ## _perms((wr), true , true , true ); \ + break; \ + case 0b0111: \ + set_ ## w ## _perms((wr), true , true , true ); \ + break; \ + case 0b1000: \ + set_ ## w ## _perms((wr), true , false, false); \ + break; \ + case 0b1001: \ + set_ ## w ## _perms((wr), true , false, false); \ + break; \ + case 0b1010: \ + set_ ## w ## _perms((wr), true , false, true ); \ + break; \ + case 0b1011: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + case 0b1100: \ + set_ ## w ## _perms((wr), true , true , false); \ + break; \ + case 0b1101: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + case 0b1110: \ + set_ ## w ## _perms((wr), true , true , true ); \ + break; \ + case 0b1111: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + } \ + } while (0) + +static void compute_s1_indirect_permissions(struct kvm_vcpu *vcpu, + struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + u8 up, pp, idx; + + idx = pte_pi_index(wr->desc); + + switch (wi->regime) { + case TR_EL10: + pp = perm_idx(vcpu, PIR_EL1, idx); + up = perm_idx(vcpu, PIRE0_EL1, idx); + break; + case TR_EL20: + pp = perm_idx(vcpu, PIR_EL2, idx); + up = perm_idx(vcpu, PIRE0_EL2, idx); + break; + case TR_EL2: + pp = perm_idx(vcpu, PIR_EL2, idx); + up = 0; + break; + } + + set_perms(priv, wr, pp); + + if (wi->regime != TR_EL2) + set_perms(unpriv, wr, up); + else + set_unpriv_perms(wr, false, false, false); + + /* R_VFPJF */ + if (wr->px && wr->uw) { + set_priv_perms(wr, false, false, false); + set_unpriv_perms(wr, false, false, false); + } +} + static void compute_s1_permissions(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, struct s1_walk_result *wr) { - compute_s1_direct_permissions(vcpu, wi, wr); + if (!s1pie_enabled(vcpu, wi->regime)) + compute_s1_direct_permissions(vcpu, wi, wr); + else + compute_s1_indirect_permissions(vcpu, wi, wr); if (!wi->hpd) compute_s1_hierarchical_permissions(vcpu, wi, wr); From ee3a9a0643c58f61d2227ba819e13dbb552aff11 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:30 +0100 Subject: [PATCH 32/83] KVM: arm64: Add a composite EL2 visibility helper We are starting to have a bunch of visibility helpers checking for EL2 + something else, and we are going to add more. Simplify things somehow by introducing a helper that implement extractly that by taking a visibility helper as a parameter, and convert the existing ones to that. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241023145345.1613824-23-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/sys_regs.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 2d30266a4c65..9938c768c5ac 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2303,16 +2303,18 @@ static u64 reset_hcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) return __vcpu_sys_reg(vcpu, r->reg) = val; } +static unsigned int __el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + unsigned int (*fn)(const struct kvm_vcpu *, + const struct sys_reg_desc *)) +{ + return el2_visibility(vcpu, rd) ?: fn(vcpu, rd); +} + static unsigned int sve_el2_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { - unsigned int r; - - r = el2_visibility(vcpu, rd); - if (r) - return r; - - return sve_visibility(vcpu, rd); + return __el2_visibility(vcpu, rd, sve_visibility); } static bool access_zcr_el2(struct kvm_vcpu *vcpu, From 997eeecafebaef668b76264800c185544a432839 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 23 Oct 2024 15:53:31 +0100 Subject: [PATCH 33/83] KVM: arm64: Define helper for EL2 registers with custom visibility In preparation for adding more visibility filtering for EL2 registers add a helper macro like EL2_REG() which allows specification of a custom visibility operation. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20240822-kvm-arm64-hide-pie-regs-v2-1-376624fa829c@kernel.org Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241023145345.1613824-24-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/sys_regs.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 9938c768c5ac..a2648a66b88d 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2171,6 +2171,15 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu, .val = v, \ } +#define EL2_REG_FILTERED(name, acc, rst, v, filter) { \ + SYS_DESC(SYS_##name), \ + .access = acc, \ + .reset = rst, \ + .reg = name, \ + .visibility = filter, \ + .val = v, \ +} + #define EL2_REG_VNCR(name, rst, v) EL2_REG(name, bad_vncr_trap, rst, v) #define EL2_REG_REDIR(name, rst, v) EL2_REG(name, bad_redir_trap, rst, v) @@ -2887,8 +2896,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG_VNCR(HFGITR_EL2, reset_val, 0), EL2_REG_VNCR(HACR_EL2, reset_val, 0), - { SYS_DESC(SYS_ZCR_EL2), .access = access_zcr_el2, .reset = reset_val, - .visibility = sve_el2_visibility, .reg = ZCR_EL2 }, + EL2_REG_FILTERED(ZCR_EL2, access_zcr_el2, reset_val, 0, + sve_el2_visibility), EL2_REG_VNCR(HCRX_EL2, reset_val, 0), From 0fcb4eea5345531118ca6f46391e88b12fbc35e4 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 23 Oct 2024 15:53:32 +0100 Subject: [PATCH 34/83] KVM: arm64: Hide TCR2_EL1 from userspace when disabled for guests When the guest does not support FEAT_TCR2 we should not allow any access to it in order to ensure that we do not create spurious issues with guest migration. Add a visibility operation for it. Fixes: fbff56068232 ("KVM: arm64: Save/restore TCR2_EL1") Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20240822-kvm-arm64-hide-pie-regs-v2-2-376624fa829c@kernel.org [maz: simplify by using __el2_visibility(), kvm_has_tcr2() throughout] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241023145345.1613824-25-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_host.h | 3 +++ arch/arm64/kvm/at.c | 2 +- arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h | 2 +- arch/arm64/kvm/nested.c | 2 +- arch/arm64/kvm/sys_regs.c | 27 ++++++++++++++++++---- 5 files changed, 28 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 553fb07f2153..fb5dd6a287eb 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1519,4 +1519,7 @@ void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val); (system_supports_fpmr() && \ kvm_has_feat((k), ID_AA64PFR2_EL1, FPMR, IMP)) +#define kvm_has_tcr2(k) \ + (kvm_has_feat((k), ID_AA64MMFR3_EL1, TCRX, IMP)) + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 3d93ed179560..a9f665d5ceb0 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -1099,7 +1099,7 @@ static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1), SYS_TTBR1); write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1), SYS_TCR); write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1), SYS_MAIR); - if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, TCRX, IMP)) { + if (kvm_has_tcr2(vcpu->kvm)) { write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR2_EL1), SYS_TCR2); if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S1PIE, IMP)) { write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIR_EL1), SYS_PIR); diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index d67628d01bf5..c92c2c0b86aa 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -69,7 +69,7 @@ static inline bool ctxt_has_tcrx(struct kvm_cpu_context *ctxt) return false; vcpu = ctxt_to_vcpu(ctxt); - return kvm_has_feat(kern_hyp_va(vcpu->kvm), ID_AA64MMFR3_EL1, TCRX, IMP); + return kvm_has_tcr2(kern_hyp_va(vcpu->kvm)); } static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 1c8a6aa907df..a4e085dab3ba 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1054,7 +1054,7 @@ int kvm_init_nv_sysregs(struct kvm *kvm) res0 |= HCRX_EL2_PTTWI; if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, SCTLRX, IMP)) res0 |= HCRX_EL2_SCTLR2En; - if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, TCRX, IMP)) + if (!kvm_has_tcr2(kvm)) res0 |= HCRX_EL2_TCR2En; if (!kvm_has_feat(kvm, ID_AA64ISAR2_EL1, MOPS, IMP)) res0 |= (HCRX_EL2_MSCEn | HCRX_EL2_MCE2); diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index a2648a66b88d..d4b5faacf066 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -446,7 +446,7 @@ static bool access_vm_reg(struct kvm_vcpu *vcpu, u64 val, mask, shift; if (reg_to_encoding(r) == SYS_TCR2_EL1 && - !kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, TCRX, IMP)) + !kvm_has_tcr2(vcpu->kvm)) return undef_access(vcpu, p, r); BUG_ON(!p->is_write); @@ -471,7 +471,7 @@ static bool access_tcr2_el2(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, TCRX, IMP)) { + if (!kvm_has_tcr2(vcpu->kvm)) { kvm_inject_undefined(vcpu); return false; } @@ -2357,6 +2357,21 @@ static unsigned int s1poe_visibility(const struct kvm_vcpu *vcpu, return REG_HIDDEN; } +static unsigned int tcr2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (kvm_has_tcr2(vcpu->kvm)) + return 0; + + return REG_HIDDEN; +} + +static unsigned int tcr2_el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + return __el2_visibility(vcpu, rd, tcr2_visibility); +} + /* * Architected system registers. * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2 @@ -2567,7 +2582,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_TTBR0_EL1), access_vm_reg, reset_unknown, TTBR0_EL1 }, { SYS_DESC(SYS_TTBR1_EL1), access_vm_reg, reset_unknown, TTBR1_EL1 }, { SYS_DESC(SYS_TCR_EL1), access_vm_reg, reset_val, TCR_EL1, 0 }, - { SYS_DESC(SYS_TCR2_EL1), access_vm_reg, reset_val, TCR2_EL1, 0 }, + { SYS_DESC(SYS_TCR2_EL1), access_vm_reg, reset_val, TCR2_EL1, 0, + .visibility = tcr2_visibility }, PTRAUTH_KEY(APIA), PTRAUTH_KEY(APIB), @@ -2904,7 +2920,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG(TTBR0_EL2, access_rw, reset_val, 0), EL2_REG(TTBR1_EL2, access_rw, reset_val, 0), EL2_REG(TCR_EL2, access_rw, reset_val, TCR_EL2_RES1), - EL2_REG(TCR2_EL2, access_tcr2_el2, reset_val, TCR2_EL2_RES1), + EL2_REG_FILTERED(TCR2_EL2, access_tcr2_el2, reset_val, TCR2_EL2_RES1, + tcr2_el2_visibility), EL2_REG_VNCR(VTTBR_EL2, reset_val, 0), EL2_REG_VNCR(VTCR_EL2, reset_val, 0), @@ -4800,7 +4817,7 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu) if (kvm_has_feat(kvm, ID_AA64ISAR2_EL1, MOPS, IMP)) vcpu->arch.hcrx_el2 |= (HCRX_EL2_MSCEn | HCRX_EL2_MCE2); - if (kvm_has_feat(kvm, ID_AA64MMFR3_EL1, TCRX, IMP)) + if (kvm_has_tcr2(kvm)) vcpu->arch.hcrx_el2 |= HCRX_EL2_TCR2En; if (kvm_has_fpmr(kvm)) From a68cddbe47efdac10855e5f154cbd6c87b792ba2 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 23 Oct 2024 15:53:33 +0100 Subject: [PATCH 35/83] KVM: arm64: Hide S1PIE registers from userspace when disabled for guests When the guest does not support S1PIE we should not allow any access to the system registers it adds in order to ensure that we do not create spurious issues with guest migration. Add a visibility operation for these registers. Fixes: 86f9de9db178 ("KVM: arm64: Save/restore PIE registers") Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20240822-kvm-arm64-hide-pie-regs-v2-3-376624fa829c@kernel.org [maz: simplify by using __el2_visibility(), kvm_has_s1pie() throughout] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241023145345.1613824-26-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_host.h | 3 +++ arch/arm64/kvm/at.c | 4 +-- arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h | 2 +- arch/arm64/kvm/nested.c | 4 +-- arch/arm64/kvm/sys_regs.c | 31 +++++++++++++++++----- 5 files changed, 33 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index fb5dd6a287eb..063463d05644 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1522,4 +1522,7 @@ void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val); #define kvm_has_tcr2(k) \ (kvm_has_feat((k), ID_AA64MMFR3_EL1, TCRX, IMP)) +#define kvm_has_s1pie(k) \ + (kvm_has_feat((k), ID_AA64MMFR3_EL1, S1PIE, IMP)) + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index a9f665d5ceb0..de7109111e40 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -95,7 +95,7 @@ static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 o static bool s1pie_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime) { - if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S1PIE, IMP)) + if (!kvm_has_s1pie(vcpu->kvm)) return false; switch (regime) { @@ -1101,7 +1101,7 @@ static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1), SYS_MAIR); if (kvm_has_tcr2(vcpu->kvm)) { write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR2_EL1), SYS_TCR2); - if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S1PIE, IMP)) { + if (kvm_has_s1pie(vcpu->kvm)) { write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIR_EL1), SYS_PIR); write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIRE0_EL1), SYS_PIRE0); } diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index c92c2c0b86aa..a306ea70502c 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -58,7 +58,7 @@ static inline bool ctxt_has_s1pie(struct kvm_cpu_context *ctxt) return false; vcpu = ctxt_to_vcpu(ctxt); - return kvm_has_feat(kern_hyp_va(vcpu->kvm), ID_AA64MMFR3_EL1, S1PIE, IMP); + return kvm_has_s1pie(kern_hyp_va(vcpu->kvm)); } static inline bool ctxt_has_tcrx(struct kvm_cpu_context *ctxt) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index a4e085dab3ba..9838b92dde5f 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1105,7 +1105,7 @@ int kvm_init_nv_sysregs(struct kvm *kvm) res0 |= (HFGxTR_EL2_nSMPRI_EL1 | HFGxTR_EL2_nTPIDR2_EL0); if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, THE, IMP)) res0 |= HFGxTR_EL2_nRCWMASK_EL1; - if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1PIE, IMP)) + if (!kvm_has_s1pie(kvm)) res0 |= (HFGxTR_EL2_nPIRE0_EL1 | HFGxTR_EL2_nPIR_EL1); if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1POE, IMP)) res0 |= (HFGxTR_EL2_nPOR_EL0 | HFGxTR_EL2_nPOR_EL1); @@ -1219,7 +1219,7 @@ int kvm_init_nv_sysregs(struct kvm *kvm) res0 |= TCR2_EL2_AIE; if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1POE, IMP)) res0 |= TCR2_EL2_POE | TCR2_EL2_E0POE; - if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1PIE, IMP)) + if (!kvm_has_s1pie(kvm)) res0 |= TCR2_EL2_PIE; if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, VH, IMP)) res0 |= (TCR2_EL2_E0POE | TCR2_EL2_D128 | diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index d4b5faacf066..46aa3d54b528 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -373,7 +373,7 @@ static bool check_s1pie_access_rw(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S1PIE, IMP)) { + if (!kvm_has_s1pie(vcpu->kvm)) { kvm_inject_undefined(vcpu); return false; } @@ -2372,6 +2372,21 @@ static unsigned int tcr2_el2_visibility(const struct kvm_vcpu *vcpu, return __el2_visibility(vcpu, rd, tcr2_visibility); } +static unsigned int s1pie_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (kvm_has_s1pie(vcpu->kvm)) + return 0; + + return REG_HIDDEN; +} + +static unsigned int s1pie_el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + return __el2_visibility(vcpu, rd, s1pie_visibility); +} + /* * Architected system registers. * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2 @@ -2637,8 +2652,10 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_PMMIR_EL1), trap_raz_wi }, { SYS_DESC(SYS_MAIR_EL1), access_vm_reg, reset_unknown, MAIR_EL1 }, - { SYS_DESC(SYS_PIRE0_EL1), NULL, reset_unknown, PIRE0_EL1 }, - { SYS_DESC(SYS_PIR_EL1), NULL, reset_unknown, PIR_EL1 }, + { SYS_DESC(SYS_PIRE0_EL1), NULL, reset_unknown, PIRE0_EL1, + .visibility = s1pie_visibility }, + { SYS_DESC(SYS_PIR_EL1), NULL, reset_unknown, PIR_EL1, + .visibility = s1pie_visibility }, { SYS_DESC(SYS_POR_EL1), NULL, reset_unknown, POR_EL1, .visibility = s1poe_visibility }, { SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 }, @@ -2949,8 +2966,10 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG(HPFAR_EL2, access_rw, reset_val, 0), EL2_REG(MAIR_EL2, access_rw, reset_val, 0), - EL2_REG(PIRE0_EL2, check_s1pie_access_rw, reset_val, 0), - EL2_REG(PIR_EL2, check_s1pie_access_rw, reset_val, 0), + EL2_REG_FILTERED(PIRE0_EL2, check_s1pie_access_rw, reset_val, 0, + s1pie_el2_visibility), + EL2_REG_FILTERED(PIR_EL2, check_s1pie_access_rw, reset_val, 0, + s1pie_el2_visibility), EL2_REG(AMAIR_EL2, access_rw, reset_val, 0), EL2_REG(VBAR_EL2, access_rw, reset_val, 0), @@ -4867,7 +4886,7 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu) kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_ATS1E1RP | HFGITR_EL2_ATS1E1WP); - if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1PIE, IMP)) + if (!kvm_has_s1pie(kvm)) kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPIRE0_EL1 | HFGxTR_EL2_nPIR_EL1); From b4824120303f18dfa2b4b6bf303240172117925b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:34 +0100 Subject: [PATCH 36/83] KVM: arm64: Rely on visibility to let PIR*_ELx/TCR2_ELx UNDEF With a visibility defined for these registers, there is no need to check again for S1PIE or TCRX being implemented as perform_access() already handles it. Reviewed-by: Mark Brown Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241023145345.1613824-27-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/sys_regs.c | 34 +++------------------------------- 1 file changed, 3 insertions(+), 31 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 46aa3d54b528..e0fbe71d5890 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -369,18 +369,6 @@ static bool access_rw(struct kvm_vcpu *vcpu, return true; } -static bool check_s1pie_access_rw(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *r) -{ - if (!kvm_has_s1pie(vcpu->kvm)) { - kvm_inject_undefined(vcpu); - return false; - } - - return access_rw(vcpu, p, r); -} - /* * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized). */ @@ -445,10 +433,6 @@ static bool access_vm_reg(struct kvm_vcpu *vcpu, bool was_enabled = vcpu_has_cache_enabled(vcpu); u64 val, mask, shift; - if (reg_to_encoding(r) == SYS_TCR2_EL1 && - !kvm_has_tcr2(vcpu->kvm)) - return undef_access(vcpu, p, r); - BUG_ON(!p->is_write); get_access_mask(r, &mask, &shift); @@ -467,18 +451,6 @@ static bool access_vm_reg(struct kvm_vcpu *vcpu, return true; } -static bool access_tcr2_el2(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *r) -{ - if (!kvm_has_tcr2(vcpu->kvm)) { - kvm_inject_undefined(vcpu); - return false; - } - - return access_rw(vcpu, p, r); -} - static bool access_actlr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) @@ -2937,7 +2909,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG(TTBR0_EL2, access_rw, reset_val, 0), EL2_REG(TTBR1_EL2, access_rw, reset_val, 0), EL2_REG(TCR_EL2, access_rw, reset_val, TCR_EL2_RES1), - EL2_REG_FILTERED(TCR2_EL2, access_tcr2_el2, reset_val, TCR2_EL2_RES1, + EL2_REG_FILTERED(TCR2_EL2, access_rw, reset_val, TCR2_EL2_RES1, tcr2_el2_visibility), EL2_REG_VNCR(VTTBR_EL2, reset_val, 0), EL2_REG_VNCR(VTCR_EL2, reset_val, 0), @@ -2966,9 +2938,9 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG(HPFAR_EL2, access_rw, reset_val, 0), EL2_REG(MAIR_EL2, access_rw, reset_val, 0), - EL2_REG_FILTERED(PIRE0_EL2, check_s1pie_access_rw, reset_val, 0, + EL2_REG_FILTERED(PIRE0_EL2, access_rw, reset_val, 0, s1pie_el2_visibility), - EL2_REG_FILTERED(PIR_EL2, check_s1pie_access_rw, reset_val, 0, + EL2_REG_FILTERED(PIR_EL2, access_rw, reset_val, 0, s1pie_el2_visibility), EL2_REG(AMAIR_EL2, access_rw, reset_val, 0), From b9ed7e5dfbe9b77639fd4ff042bef94fb232b94d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:35 +0100 Subject: [PATCH 37/83] arm64: Add encoding for POR_EL2 POR_EL2 is the equivalent of POR_EL1 for the EL2&0 translation regime, and it is sorely missing from the sysreg file. Add the sucker. Signed-off-by: Marc Zyngier Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20241023145345.1613824-28-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/tools/sysreg | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 8db4431093b2..a33136243bdf 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2907,6 +2907,10 @@ Sysreg POR_EL1 3 0 10 2 4 Fields PIRx_ELx EndSysreg +Sysreg POR_EL2 3 4 10 2 4 +Fields PIRx_ELx +EndSysreg + Sysreg POR_EL12 3 5 10 2 4 Fields PIRx_ELx EndSysreg From b17d8aa2012617c51c5478f1be69044adb602d53 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:36 +0100 Subject: [PATCH 38/83] KVM: arm64: Drop bogus CPTR_EL2.E0POE trap routing It took me some time to realise it, but CPTR_EL2.E0POE does not apply to a guest, only to EL0 when InHost(). And when InHost(), CPCR_EL2 is mapped to CPACR_EL1, maning that the E0POE bit naturally takes effect without any trap. To sum it up, this trap bit is better left ignored, we will never have to hanedle it. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241023145345.1613824-29-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/emulate-nested.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 05b6435d02a9..ddcbaa983de3 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -79,7 +79,6 @@ enum cgt_group_id { CGT_MDCR_E2TB, CGT_MDCR_TDCC, - CGT_CPACR_E0POE, CGT_CPTR_TAM, CGT_CPTR_TCPAC, @@ -362,12 +361,6 @@ static const struct trap_bits coarse_trap_bits[] = { .mask = MDCR_EL2_TDCC, .behaviour = BEHAVE_FORWARD_ANY, }, - [CGT_CPACR_E0POE] = { - .index = CPTR_EL2, - .value = CPACR_ELx_E0POE, - .mask = CPACR_ELx_E0POE, - .behaviour = BEHAVE_FORWARD_ANY, - }, [CGT_CPTR_TAM] = { .index = CPTR_EL2, .value = CPTR_EL2_TAM, @@ -1141,7 +1134,6 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(SYS_AMEVTYPER1_EL0(13), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(14), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(15), CGT_CPTR_TAM), - SR_TRAP(SYS_POR_EL0, CGT_CPACR_E0POE), /* op0=2, op1=1, and CRn<0b1000 */ SR_RANGE_TRAP(sys_reg(2, 1, 0, 0, 0), sys_reg(2, 1, 7, 15, 7), CGT_CPTR_TTA), From f7575530df436c39d07e79d6eef721c6e7b35bb4 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:37 +0100 Subject: [PATCH 39/83] KVM: arm64: Subject S1PIE/S1POE registers to HCR_EL2.{TVM,TRVM} All the El0/EL1 S1PIE/S1POE system register are caught by the HCR_EL2 TVM and TRVM bits. Reflect this in the coarse grained trap table. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241023145345.1613824-30-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/emulate-nested.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index ddcbaa983de3..0ab090553354 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -704,6 +704,10 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(SYS_MAIR_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_AMAIR_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_CONTEXTIDR_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_PIR_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_PIRE0_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_POR_EL0, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_POR_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_TCR2_EL1, CGT_HCR_TVM_TRVM_HCRX_TCR2En), SR_TRAP(SYS_DC_ZVA, CGT_HCR_TDZ), SR_TRAP(SYS_DC_GVA, CGT_HCR_TDZ), From 26e89dccdf6328b608baa35d84cc36d915769f50 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:38 +0100 Subject: [PATCH 40/83] KVM: arm64: Add kvm_has_s1poe() helper Just like we have kvm_has_s1pie(), add its S1POE counterpart, making the code slightly more readable. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241023145345.1613824-31-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_host.h | 3 +++ arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h | 2 +- arch/arm64/kvm/nested.c | 4 ++-- arch/arm64/kvm/sys_regs.c | 4 ++-- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 063463d05644..9269d893f85b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1525,4 +1525,7 @@ void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val); #define kvm_has_s1pie(k) \ (kvm_has_feat((k), ID_AA64MMFR3_EL1, S1PIE, IMP)) +#define kvm_has_s1poe(k) \ + (kvm_has_feat((k), ID_AA64MMFR3_EL1, S1POE, IMP)) + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index a306ea70502c..a651c43ad679 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -80,7 +80,7 @@ static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt) return false; vcpu = ctxt_to_vcpu(ctxt); - return kvm_has_feat(kern_hyp_va(vcpu->kvm), ID_AA64MMFR3_EL1, S1POE, IMP); + return kvm_has_s1poe(kern_hyp_va(vcpu->kvm)); } static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 9838b92dde5f..be88a36da071 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1107,7 +1107,7 @@ int kvm_init_nv_sysregs(struct kvm *kvm) res0 |= HFGxTR_EL2_nRCWMASK_EL1; if (!kvm_has_s1pie(kvm)) res0 |= (HFGxTR_EL2_nPIRE0_EL1 | HFGxTR_EL2_nPIR_EL1); - if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1POE, IMP)) + if (!kvm_has_s1poe(kvm)) res0 |= (HFGxTR_EL2_nPOR_EL0 | HFGxTR_EL2_nPOR_EL1); if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S2POE, IMP)) res0 |= HFGxTR_EL2_nS2POR_EL1; @@ -1217,7 +1217,7 @@ int kvm_init_nv_sysregs(struct kvm *kvm) res0 |= TCR2_EL2_PTTWI | TCR2_EL2_PnCH; if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, AIE, IMP)) res0 |= TCR2_EL2_AIE; - if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1POE, IMP)) + if (!kvm_has_s1poe(kvm)) res0 |= TCR2_EL2_POE | TCR2_EL2_E0POE; if (!kvm_has_s1pie(kvm)) res0 |= TCR2_EL2_PIE; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index e0fbe71d5890..d46663f27fba 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2323,7 +2323,7 @@ static bool access_zcr_el2(struct kvm_vcpu *vcpu, static unsigned int s1poe_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { - if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S1POE, IMP)) + if (kvm_has_s1poe(vcpu->kvm)) return 0; return REG_HIDDEN; @@ -4862,7 +4862,7 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu) kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPIRE0_EL1 | HFGxTR_EL2_nPIR_EL1); - if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1POE, IMP)) + if (!kvm_has_s1poe(kvm)) kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPOR_EL1 | HFGxTR_EL2_nPOR_EL0); From 5970e9903f03560ce9829297e302463acdc26bc0 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:39 +0100 Subject: [PATCH 41/83] KVM: arm64: Add basic support for POR_EL2 S1POE support implies support for POR_EL2, which we provide by - adding it to the vcpu_sysreg enum - advertising it as mapped to its EL1 counterpart in get_el2_to_el1_mapping - wiring it in the sys_reg_desc table with the correct visibility - handling POR_EL1 in __vcpu_{read,write}_sys_reg_from_cpu() Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241023145345.1613824-32-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_host.h | 3 +++ arch/arm64/kvm/sys_regs.c | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 9269d893f85b..28ea7e72477c 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -480,6 +480,7 @@ enum vcpu_sysreg { TCR_EL2, /* Translation Control Register (EL2) */ PIRE0_EL2, /* Permission Indirection Register 0 (EL2) */ PIR_EL2, /* Permission Indirection Register 1 (EL2) */ + POR_EL2, /* Permission Overlay Register 2 (EL2) */ SPSR_EL2, /* EL2 saved program status register */ ELR_EL2, /* EL2 exception link register */ AFSR0_EL2, /* Auxiliary Fault Status Register 0 (EL2) */ @@ -1050,6 +1051,7 @@ static inline bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val) case TCR2_EL1: *val = read_sysreg_s(SYS_TCR2_EL12); break; case PIR_EL1: *val = read_sysreg_s(SYS_PIR_EL12); break; case PIRE0_EL1: *val = read_sysreg_s(SYS_PIRE0_EL12); break; + case POR_EL1: *val = read_sysreg_s(SYS_POR_EL12); break; case ESR_EL1: *val = read_sysreg_s(SYS_ESR_EL12); break; case AFSR0_EL1: *val = read_sysreg_s(SYS_AFSR0_EL12); break; case AFSR1_EL1: *val = read_sysreg_s(SYS_AFSR1_EL12); break; @@ -1099,6 +1101,7 @@ static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg) case TCR2_EL1: write_sysreg_s(val, SYS_TCR2_EL12); break; case PIR_EL1: write_sysreg_s(val, SYS_PIR_EL12); break; case PIRE0_EL1: write_sysreg_s(val, SYS_PIRE0_EL12); break; + case POR_EL1: write_sysreg_s(val, SYS_POR_EL12); break; case ESR_EL1: write_sysreg_s(val, SYS_ESR_EL12); break; case AFSR0_EL1: write_sysreg_s(val, SYS_AFSR0_EL12); break; case AFSR1_EL1: write_sysreg_s(val, SYS_AFSR1_EL12); break; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index d46663f27fba..4f4fe84fb0e8 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -137,6 +137,7 @@ static bool get_el2_to_el1_mapping(unsigned int reg, MAPPED_EL2_SYSREG(TCR2_EL2, TCR2_EL1, NULL ); MAPPED_EL2_SYSREG(PIR_EL2, PIR_EL1, NULL ); MAPPED_EL2_SYSREG(PIRE0_EL2, PIRE0_EL1, NULL ); + MAPPED_EL2_SYSREG(POR_EL2, POR_EL1, NULL ); MAPPED_EL2_SYSREG(AMAIR_EL2, AMAIR_EL1, NULL ); MAPPED_EL2_SYSREG(ELR_EL2, ELR_EL1, NULL ); MAPPED_EL2_SYSREG(SPSR_EL2, SPSR_EL1, NULL ); @@ -2329,6 +2330,12 @@ static unsigned int s1poe_visibility(const struct kvm_vcpu *vcpu, return REG_HIDDEN; } +static unsigned int s1poe_el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + return __el2_visibility(vcpu, rd, s1poe_visibility); +} + static unsigned int tcr2_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { @@ -2942,6 +2949,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { s1pie_el2_visibility), EL2_REG_FILTERED(PIR_EL2, access_rw, reset_val, 0, s1pie_el2_visibility), + EL2_REG_FILTERED(POR_EL2, access_rw, reset_val, 0, + s1poe_el2_visibility), EL2_REG(AMAIR_EL2, access_rw, reset_val, 0), EL2_REG(VBAR_EL2, access_rw, reset_val, 0), From de5c2827fb44ae3074638e373bcea64ac9107689 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:40 +0100 Subject: [PATCH 42/83] KVM: arm64: Add save/restore support for POR_EL2 POR_EL2 needs saving when the guest is VHE, and restoring in any case. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241023145345.1613824-33-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/vhe/sysreg-sr.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index a603966726f6..5f78a39053a7 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -58,6 +58,9 @@ static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu) __vcpu_sys_reg(vcpu, PIRE0_EL2) = read_sysreg_el1(SYS_PIRE0); __vcpu_sys_reg(vcpu, PIR_EL2) = read_sysreg_el1(SYS_PIR); } + + if (ctxt_has_s1poe(&vcpu->arch.ctxt)) + __vcpu_sys_reg(vcpu, POR_EL2) = read_sysreg_el1(SYS_POR); } /* @@ -123,6 +126,9 @@ static void __sysreg_restore_vel2_state(struct kvm_vcpu *vcpu) write_sysreg_el1(__vcpu_sys_reg(vcpu, PIR_EL2), SYS_PIR); write_sysreg_el1(__vcpu_sys_reg(vcpu, PIRE0_EL2), SYS_PIRE0); } + + if (ctxt_has_s1poe(&vcpu->arch.ctxt)) + write_sysreg_el1(__vcpu_sys_reg(vcpu, POR_EL2), SYS_POR); } write_sysreg_el1(__vcpu_sys_reg(vcpu, ESR_EL2), SYS_ESR); From 846c993df98278ce18b80183d78b5c1ca8b14f5c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:41 +0100 Subject: [PATCH 43/83] KVM: arm64: Add POE save/restore for AT emulation fast-path Just like the other extensions affecting address translation, we must save/restore POE so that an out-of-context translation context can be restored and used with the AT instructions. Signed-off-by: Marc Zyngier Reviewed-by: Joey Gouly Link: https://lore.kernel.org/r/20241023145345.1613824-34-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/at.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index de7109111e40..ef1643faedeb 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -440,6 +440,8 @@ struct mmu_config { u64 tcr2; u64 pir; u64 pire0; + u64 por_el0; + u64 por_el1; u64 sctlr; u64 vttbr; u64 vtcr; @@ -458,6 +460,10 @@ static void __mmu_config_save(struct mmu_config *config) config->pir = read_sysreg_el1(SYS_PIR); config->pire0 = read_sysreg_el1(SYS_PIRE0); } + if (system_supports_poe()) { + config->por_el1 = read_sysreg_el1(SYS_POR); + config->por_el0 = read_sysreg_s(SYS_POR_EL0); + } } config->sctlr = read_sysreg_el1(SYS_SCTLR); config->vttbr = read_sysreg(vttbr_el2); @@ -485,6 +491,10 @@ static void __mmu_config_restore(struct mmu_config *config) write_sysreg_el1(config->pir, SYS_PIR); write_sysreg_el1(config->pire0, SYS_PIRE0); } + if (system_supports_poe()) { + write_sysreg_el1(config->por_el1, SYS_POR); + write_sysreg_s(config->por_el0, SYS_POR_EL0); + } } write_sysreg_el1(config->sctlr, SYS_SCTLR); write_sysreg(config->vttbr, vttbr_el2); @@ -1105,6 +1115,10 @@ static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIR_EL1), SYS_PIR); write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIRE0_EL1), SYS_PIRE0); } + if (kvm_has_s1poe(vcpu->kvm)) { + write_sysreg_el1(vcpu_read_sys_reg(vcpu, POR_EL1), SYS_POR); + write_sysreg_s(vcpu_read_sys_reg(vcpu, POR_EL0), SYS_POR_EL0); + } } write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1), SYS_SCTLR); __load_stage2(mmu, mmu->arch); From 8a9b304d7e2276a0ebb60a23cdcf7d348052752f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:42 +0100 Subject: [PATCH 44/83] KVM: arm64: Disable hierarchical permissions when POE is enabled The hierarchical permissions must be disabled when POE is enabled in the translation regime used for a given table walk. We store the two enable bits in the s1_walk_info structure so that they can be retrieved down the line, as they will be useful. Signed-off-by: Marc Zyngier Reviewed-by: Joey Gouly Link: https://lore.kernel.org/r/20241023145345.1613824-35-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/at.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index ef1643faedeb..8d1dc6327ec5 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -24,6 +24,8 @@ struct s1_walk_info { unsigned int txsz; int sl; bool hpd; + bool e0poe; + bool poe; bool be; bool s2; }; @@ -110,6 +112,34 @@ static bool s1pie_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime) } } +static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi) +{ + u64 val; + + if (!kvm_has_s1poe(vcpu->kvm)) { + wi->poe = wi->e0poe = false; + return; + } + + switch (wi->regime) { + case TR_EL2: + case TR_EL20: + val = vcpu_read_sys_reg(vcpu, TCR2_EL2); + wi->poe = val & TCR2_EL2_POE; + wi->e0poe = (wi->regime == TR_EL20) && (val & TCR2_EL2_E0POE); + break; + case TR_EL10: + if (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En) { + wi->poe = wi->e0poe = false; + return; + } + + val = __vcpu_sys_reg(vcpu, TCR2_EL1); + wi->poe = val & TCR2_EL1x_POE; + wi->e0poe = val & TCR2_EL1x_E0POE; + } +} + static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, struct s1_walk_result *wr, u64 va) { @@ -206,6 +236,12 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, /* R_JHSVW */ wi->hpd |= s1pie_enabled(vcpu, wi->regime); + /* Do we have POE? */ + compute_s1poe(vcpu, wi); + + /* R_BVXDG */ + wi->hpd |= (wi->poe || wi->e0poe); + /* Someone was silly enough to encode TG0/TG1 differently */ if (va55) { wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr); From 7cd5c2796cb039aaa43d3f16d875bb7d60b2c1b0 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:43 +0100 Subject: [PATCH 45/83] KVM: arm64: Make PAN conditions part of the S1 walk context Move the conditions describing PAN as part of the s1_walk_info structure, in an effort to declutter the permission processing. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241023145345.1613824-36-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/at.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 8d1dc6327ec5..2ab2c3578c3a 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -26,6 +26,7 @@ struct s1_walk_info { bool hpd; bool e0poe; bool poe; + bool pan; bool be; bool s2; }; @@ -151,6 +152,8 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, wi->regime = compute_translation_regime(vcpu, op); as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W); + wi->pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) && + (*vcpu_cpsr(vcpu) & PSR_PAN_BIT); va55 = va & BIT(55); @@ -1020,10 +1023,12 @@ static void compute_s1_indirect_permissions(struct kvm_vcpu *vcpu, } } -static void compute_s1_permissions(struct kvm_vcpu *vcpu, u32 op, +static void compute_s1_permissions(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, struct s1_walk_result *wr) { + bool pan; + if (!s1pie_enabled(vcpu, wi->regime)) compute_s1_direct_permissions(vcpu, wi, wr); else @@ -1032,14 +1037,10 @@ static void compute_s1_permissions(struct kvm_vcpu *vcpu, u32 op, if (!wi->hpd) compute_s1_hierarchical_permissions(vcpu, wi, wr); - if (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) { - bool pan; - - pan = *vcpu_cpsr(vcpu) & PSR_PAN_BIT; - pan &= wr->ur || wr->uw || (pan3_enabled(vcpu, wi->regime) && wr->ux); - wr->pw &= !pan; - wr->pr &= !pan; - } + pan = wi->pan && (wr->ur || wr->uw || + (pan3_enabled(vcpu, wi->regime) && wr->ux)); + wr->pw &= !pan; + wr->pr &= !pan; } static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) @@ -1065,7 +1066,7 @@ static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) if (ret) goto compute_par; - compute_s1_permissions(vcpu, op, &wi, &wr); + compute_s1_permissions(vcpu, &wi, &wr); switch (op) { case OP_AT_S1E1RP: From e39ce7033c70df5e402b47dd248137878df6c771 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:44 +0100 Subject: [PATCH 46/83] KVM: arm64: Handle stage-1 permission overlays We now have the intrastructure in place to emulate S1POE: - direct permissions are always overlay-capable - indirect permissions are overlay-capable if the permissions are in the 0b0xxx range - the overlays are strictly substractive Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241023145345.1613824-37-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/at.c | 53 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 2ab2c3578c3a..d300cd1a0d8a 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -40,9 +40,11 @@ struct s1_walk_result { u8 APTable; bool UXNTable; bool PXNTable; + bool uov; bool ur; bool uw; bool ux; + bool pov; bool pr; bool pw; bool px; @@ -881,6 +883,9 @@ static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu, /* XN maps to UXN */ wr->px = !(wr->desc & PTE_UXN); } + + wr->pov = wi->poe; + wr->uov = wi->e0poe; } static void compute_s1_hierarchical_permissions(struct kvm_vcpu *vcpu, @@ -1016,6 +1021,9 @@ static void compute_s1_indirect_permissions(struct kvm_vcpu *vcpu, else set_unpriv_perms(wr, false, false, false); + wr->pov = wi->poe && !(pp & BIT(3)); + wr->uov = wi->e0poe && !(up & BIT(3)); + /* R_VFPJF */ if (wr->px && wr->uw) { set_priv_perms(wr, false, false, false); @@ -1023,6 +1031,48 @@ static void compute_s1_indirect_permissions(struct kvm_vcpu *vcpu, } } +static void compute_s1_overlay_permissions(struct kvm_vcpu *vcpu, + struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + u8 idx, pov_perms, uov_perms; + + idx = FIELD_GET(PTE_PO_IDX_MASK, wr->desc); + + switch (wi->regime) { + case TR_EL10: + pov_perms = perm_idx(vcpu, POR_EL1, idx); + uov_perms = perm_idx(vcpu, POR_EL0, idx); + break; + case TR_EL20: + pov_perms = perm_idx(vcpu, POR_EL2, idx); + uov_perms = perm_idx(vcpu, POR_EL0, idx); + break; + case TR_EL2: + pov_perms = perm_idx(vcpu, POR_EL2, idx); + uov_perms = 0; + break; + } + + if (pov_perms & ~POE_RXW) + pov_perms = POE_NONE; + + if (wi->poe && wr->pov) { + wr->pr &= pov_perms & POE_R; + wr->px &= pov_perms & POE_X; + wr->pw &= pov_perms & POE_W; + } + + if (uov_perms & ~POE_RXW) + uov_perms = POE_NONE; + + if (wi->e0poe && wr->uov) { + wr->ur &= uov_perms & POE_R; + wr->ux &= uov_perms & POE_X; + wr->uw &= uov_perms & POE_W; + } +} + static void compute_s1_permissions(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, struct s1_walk_result *wr) @@ -1037,6 +1087,9 @@ static void compute_s1_permissions(struct kvm_vcpu *vcpu, if (!wi->hpd) compute_s1_hierarchical_permissions(vcpu, wi, wr); + if (wi->poe || wi->e0poe) + compute_s1_overlay_permissions(vcpu, wi, wr); + pan = wi->pan && (wr->ur || wr->uw || (pan3_enabled(vcpu, wi->regime) && wr->ux)); wr->pw &= !pan; From 1c6801d565eca7654732812b0c45ed898e64f238 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Oct 2024 15:53:45 +0100 Subject: [PATCH 47/83] KVM: arm64: Handle WXN attribute Until now, we didn't really care about WXN as it didn't have an effect on the R/W permissions (only the execution could be droppped), and therefore not of interest for AT. However, with S1POE, WXN can revoke the Write permission if an overlay is active and that execution is allowed. This *is* relevant to AT. Add full handling of WXN so that we correctly handle this case. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241023145345.1613824-38-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/at.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index d300cd1a0d8a..8c5d7990e5b3 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -40,10 +40,12 @@ struct s1_walk_result { u8 APTable; bool UXNTable; bool PXNTable; + bool uwxn; bool uov; bool ur; bool uw; bool ux; + bool pwxn; bool pov; bool pr; bool pw; @@ -847,6 +849,8 @@ static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, struct s1_walk_result *wr) { + bool wxn; + /* Non-hierarchical part of AArch64.S1DirectBasePermissions() */ if (wi->regime != TR_EL2) { switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr->desc)) { @@ -884,6 +888,17 @@ static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu, wr->px = !(wr->desc & PTE_UXN); } + switch (wi->regime) { + case TR_EL2: + case TR_EL20: + wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_WXN); + break; + case TR_EL10: + wxn = (__vcpu_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN); + break; + } + + wr->pwxn = wr->uwxn = wxn; wr->pov = wi->poe; wr->uov = wi->e0poe; } @@ -935,6 +950,16 @@ static void compute_s1_hierarchical_permissions(struct kvm_vcpu *vcpu, (wr)->ux = (x); \ } while (0) +#define set_priv_wxn(wr, v) \ + do { \ + (wr)->pwxn = (v); \ + } while (0) + +#define set_unpriv_wxn(wr, v) \ + do { \ + (wr)->uwxn = (v); \ + } while (0) + /* Similar to AArch64.S1IndirectBasePermissions(), without GCS */ #define set_perms(w, wr, ip) \ do { \ @@ -989,6 +1014,10 @@ static void compute_s1_hierarchical_permissions(struct kvm_vcpu *vcpu, set_ ## w ## _perms((wr), false, false, false); \ break; \ } \ + \ + /* R_HJYGR */ \ + set_ ## w ## _wxn((wr), ((ip) == 0b0110)); \ + \ } while (0) static void compute_s1_indirect_permissions(struct kvm_vcpu *vcpu, @@ -1090,6 +1119,22 @@ static void compute_s1_permissions(struct kvm_vcpu *vcpu, if (wi->poe || wi->e0poe) compute_s1_overlay_permissions(vcpu, wi, wr); + /* R_QXXPC */ + if (wr->pwxn) { + if (!wr->pov && wr->pw) + wr->px = false; + if (wr->pov && wr->px) + wr->pw = false; + } + + /* R_NPBXC */ + if (wr->uwxn) { + if (!wr->uov && wr->uw) + wr->ux = false; + if (wr->uov && wr->ux) + wr->uw = false; + } + pan = wi->pan && (wr->ur || wr->uw || (pan3_enabled(vcpu, wi->regime) && wr->ux)); wr->pw &= !pan; From 3e251afaec9a671716c9cc4c184f4e4a09915ec4 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 19 Oct 2024 18:15:47 +0100 Subject: [PATCH 48/83] arm64: Use SYSTEM_OFF2 PSCI call to power off for hibernate The PSCI v1.3 specification adds support for a SYSTEM_OFF2 function which is analogous to ACPI S4 state. This will allow hosting environments to determine that a guest is hibernated rather than just powered off, and handle that state appropriately on subsequent launches. Since commit 60c0d45a7f7a ("efi/arm64: use UEFI for system reset and poweroff") the EFI shutdown method is deliberately preferred over PSCI or other methods. So register a SYS_OFF_MODE_POWER_OFF handler which *only* handles the hibernation, leaving the original PSCI SYSTEM_OFF as a last resort via the legacy pm_power_off function pointer. The hibernation code already exports a system_entering_hibernation() function which is be used by the higher-priority handler to check for hibernation. That existing function just returns the value of a static boolean variable from hibernate.c, which was previously only set in the hibernation_platform_enter() code path. Set the same flag in the simpler code path around the call to kernel_power_off() too. An alternative way to hook SYSTEM_OFF2 into the hibernation code would be to register a platform_hibernation_ops structure with an ->enter() method which makes the new SYSTEM_OFF2 call. But that would have the unwanted side-effect of making hibernation take a completely different code path in hibernation_platform_enter(), invoking a lot of special dpm callbacks. Another option might be to add a new SYS_OFF_MODE_HIBERNATE mode, with fallback to SYS_OFF_MODE_POWER_OFF. Or to use the sys_off_data to indicate whether the power off is for hibernation. But this version works and is relatively simple. Signed-off-by: David Woodhouse Acked-by: Rafael J. Wysocki Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20241019172459.2241939-7-dwmw2@infradead.org Signed-off-by: Oliver Upton --- drivers/firmware/psci/psci.c | 45 ++++++++++++++++++++++++++++++++++++ kernel/power/hibernate.c | 5 +++- 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c index 2328ca58bba6..a1ebbe9b73b1 100644 --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -78,6 +78,7 @@ struct psci_0_1_function_ids get_psci_0_1_function_ids(void) static u32 psci_cpu_suspend_feature; static bool psci_system_reset2_supported; +static bool psci_system_off2_hibernate_supported; static inline bool psci_has_ext_power_state(void) { @@ -333,6 +334,36 @@ static void psci_sys_poweroff(void) invoke_psci_fn(PSCI_0_2_FN_SYSTEM_OFF, 0, 0, 0); } +#ifdef CONFIG_HIBERNATION +static int psci_sys_hibernate(struct sys_off_data *data) +{ + /* + * If no hibernate type is specified SYSTEM_OFF2 defaults to selecting + * HIBERNATE_OFF. + * + * There are hypervisors in the wild that do not align with the spec and + * reject calls that explicitly provide a hibernate type. For + * compatibility with these nonstandard implementations, pass 0 as the + * type. + */ + if (system_entering_hibernation()) + invoke_psci_fn(PSCI_FN_NATIVE(1_3, SYSTEM_OFF2), 0, 0, 0); + return NOTIFY_DONE; +} + +static int __init psci_hibernate_init(void) +{ + if (psci_system_off2_hibernate_supported) { + /* Higher priority than EFI shutdown, but only for hibernate */ + register_sys_off_handler(SYS_OFF_MODE_POWER_OFF, + SYS_OFF_PRIO_FIRMWARE + 2, + psci_sys_hibernate, NULL); + } + return 0; +} +subsys_initcall(psci_hibernate_init); +#endif + static int psci_features(u32 psci_func_id) { return invoke_psci_fn(PSCI_1_0_FN_PSCI_FEATURES, @@ -364,6 +395,7 @@ static const struct { PSCI_ID_NATIVE(1_1, SYSTEM_RESET2), PSCI_ID(1_1, MEM_PROTECT), PSCI_ID_NATIVE(1_1, MEM_PROTECT_CHECK_RANGE), + PSCI_ID_NATIVE(1_3, SYSTEM_OFF2), }; static int psci_debugfs_read(struct seq_file *s, void *data) @@ -525,6 +557,18 @@ static void __init psci_init_system_reset2(void) psci_system_reset2_supported = true; } +static void __init psci_init_system_off2(void) +{ + int ret; + + ret = psci_features(PSCI_FN_NATIVE(1_3, SYSTEM_OFF2)); + if (ret < 0) + return; + + if (ret & PSCI_1_3_OFF_TYPE_HIBERNATE_OFF) + psci_system_off2_hibernate_supported = true; +} + static void __init psci_init_system_suspend(void) { int ret; @@ -655,6 +699,7 @@ static int __init psci_probe(void) psci_init_cpu_suspend(); psci_init_system_suspend(); psci_init_system_reset2(); + psci_init_system_off2(); kvm_init_hyp_services(); } diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index e35829d36039..1f87aa01ba44 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -685,8 +685,11 @@ static void power_down(void) } fallthrough; case HIBERNATION_SHUTDOWN: - if (kernel_can_power_off()) + if (kernel_can_power_off()) { + entering_platform_hibernation = true; kernel_power_off(); + entering_platform_hibernation = false; + } break; } kernel_halt(); From 83732ce6a056c4bb242d64fd25e1fc78f35e6a74 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 30 Oct 2024 16:03:11 +0000 Subject: [PATCH 49/83] arm64/sysreg: Convert existing MPAM sysregs and add the remaining entries Move the existing MPAM system register defines from sysreg.h to tools/sysreg and add the remaining system registers. Signed-off-by: James Morse Signed-off-by: Joey Gouly Reviewed-by: Gavin Shan Tested-by: Shameer Kolothum Acked-by: Catalin Marinas Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241030160317.2528209-2-joey.gouly@arm.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/sysreg.h | 12 --- arch/arm64/tools/sysreg | 161 ++++++++++++++++++++++++++++++++ 2 files changed, 161 insertions(+), 12 deletions(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 9ea97dddefc4..345e81e0d2b3 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -542,18 +542,6 @@ #define SYS_MAIR_EL2 sys_reg(3, 4, 10, 2, 0) #define SYS_AMAIR_EL2 sys_reg(3, 4, 10, 3, 0) -#define SYS_MPAMHCR_EL2 sys_reg(3, 4, 10, 4, 0) -#define SYS_MPAMVPMV_EL2 sys_reg(3, 4, 10, 4, 1) -#define SYS_MPAM2_EL2 sys_reg(3, 4, 10, 5, 0) -#define __SYS__MPAMVPMx_EL2(x) sys_reg(3, 4, 10, 6, x) -#define SYS_MPAMVPM0_EL2 __SYS__MPAMVPMx_EL2(0) -#define SYS_MPAMVPM1_EL2 __SYS__MPAMVPMx_EL2(1) -#define SYS_MPAMVPM2_EL2 __SYS__MPAMVPMx_EL2(2) -#define SYS_MPAMVPM3_EL2 __SYS__MPAMVPMx_EL2(3) -#define SYS_MPAMVPM4_EL2 __SYS__MPAMVPMx_EL2(4) -#define SYS_MPAMVPM5_EL2 __SYS__MPAMVPMx_EL2(5) -#define SYS_MPAMVPM6_EL2 __SYS__MPAMVPMx_EL2(6) -#define SYS_MPAMVPM7_EL2 __SYS__MPAMVPMx_EL2(7) #define SYS_VBAR_EL2 sys_reg(3, 4, 12, 0, 0) #define SYS_RVBAR_EL2 sys_reg(3, 4, 12, 0, 1) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 8d637ac4b7c6..05d04a45db65 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2737,6 +2737,126 @@ Field 1 E2SPE Field 0 E0HSPE EndSysreg +Sysreg MPAMHCR_EL2 3 4 10 4 0 +Res0 63:32 +Field 31 TRAP_MPAMIDR_EL1 +Res0 30:9 +Field 8 GSTAPP_PLK +Res0 7:2 +Field 1 EL1_VPMEN +Field 0 EL0_VPMEN +EndSysreg + +Sysreg MPAMVPMV_EL2 3 4 10 4 1 +Res0 63:32 +Field 31 VPM_V31 +Field 30 VPM_V30 +Field 29 VPM_V29 +Field 28 VPM_V28 +Field 27 VPM_V27 +Field 26 VPM_V26 +Field 25 VPM_V25 +Field 24 VPM_V24 +Field 23 VPM_V23 +Field 22 VPM_V22 +Field 21 VPM_V21 +Field 20 VPM_V20 +Field 19 VPM_V19 +Field 18 VPM_V18 +Field 17 VPM_V17 +Field 16 VPM_V16 +Field 15 VPM_V15 +Field 14 VPM_V14 +Field 13 VPM_V13 +Field 12 VPM_V12 +Field 11 VPM_V11 +Field 10 VPM_V10 +Field 9 VPM_V9 +Field 8 VPM_V8 +Field 7 VPM_V7 +Field 6 VPM_V6 +Field 5 VPM_V5 +Field 4 VPM_V4 +Field 3 VPM_V3 +Field 2 VPM_V2 +Field 1 VPM_V1 +Field 0 VPM_V0 +EndSysreg + +Sysreg MPAM2_EL2 3 4 10 5 0 +Field 63 MPAMEN +Res0 62:59 +Field 58 TIDR +Res0 57 +Field 56 ALTSP_HFC +Field 55 ALTSP_EL2 +Field 54 ALTSP_FRCD +Res0 53:51 +Field 50 EnMPAMSM +Field 49 TRAPMPAM0EL1 +Field 48 TRAPMPAM1EL1 +Field 47:40 PMG_D +Field 39:32 PMG_I +Field 31:16 PARTID_D +Field 15:0 PARTID_I +EndSysreg + +Sysreg MPAMVPM0_EL2 3 4 10 6 0 +Field 63:48 PhyPARTID3 +Field 47:32 PhyPARTID2 +Field 31:16 PhyPARTID1 +Field 15:0 PhyPARTID0 +EndSysreg + +Sysreg MPAMVPM1_EL2 3 4 10 6 1 +Field 63:48 PhyPARTID7 +Field 47:32 PhyPARTID6 +Field 31:16 PhyPARTID5 +Field 15:0 PhyPARTID4 +EndSysreg + +Sysreg MPAMVPM2_EL2 3 4 10 6 2 +Field 63:48 PhyPARTID11 +Field 47:32 PhyPARTID10 +Field 31:16 PhyPARTID9 +Field 15:0 PhyPARTID8 +EndSysreg + +Sysreg MPAMVPM3_EL2 3 4 10 6 3 +Field 63:48 PhyPARTID15 +Field 47:32 PhyPARTID14 +Field 31:16 PhyPARTID13 +Field 15:0 PhyPARTID12 +EndSysreg + +Sysreg MPAMVPM4_EL2 3 4 10 6 4 +Field 63:48 PhyPARTID19 +Field 47:32 PhyPARTID18 +Field 31:16 PhyPARTID17 +Field 15:0 PhyPARTID16 +EndSysreg + +Sysreg MPAMVPM5_EL2 3 4 10 6 5 +Field 63:48 PhyPARTID23 +Field 47:32 PhyPARTID22 +Field 31:16 PhyPARTID21 +Field 15:0 PhyPARTID20 +EndSysreg + +Sysreg MPAMVPM6_EL2 3 4 10 6 6 +Field 63:48 PhyPARTID27 +Field 47:32 PhyPARTID26 +Field 31:16 PhyPARTID25 +Field 15:0 PhyPARTID24 +EndSysreg + +Sysreg MPAMVPM7_EL2 3 4 10 6 7 +Field 63:48 PhyPARTID31 +Field 47:32 PhyPARTID30 +Field 31:16 PhyPARTID29 +Field 15:0 PhyPARTID28 +EndSysreg + Sysreg CONTEXTIDR_EL2 3 4 13 0 1 Fields CONTEXTIDR_ELx EndSysreg @@ -2769,6 +2889,10 @@ Sysreg FAR_EL12 3 5 6 0 0 Field 63:0 ADDR EndSysreg +Sysreg MPAM1_EL12 3 5 10 5 0 +Fields MPAM1_ELx +EndSysreg + Sysreg CONTEXTIDR_EL12 3 5 13 0 1 Fields CONTEXTIDR_ELx EndSysreg @@ -2941,6 +3065,22 @@ Res0 1 Field 0 EN EndSysreg +Sysreg MPAMIDR_EL1 3 0 10 4 4 +Res0 63:62 +Field 61 HAS_SDEFLT +Field 60 HAS_FORCE_NS +Field 59 SP4 +Field 58 HAS_TIDR +Field 57 HAS_ALTSP +Res0 56:40 +Field 39:32 PMG_MAX +Res0 31:21 +Field 20:18 VPMR_MAX +Field 17 HAS_HCR +Res0 16 +Field 15:0 PARTID_MAX +EndSysreg + Sysreg LORID_EL1 3 0 10 4 7 Res0 63:24 Field 23:16 LD @@ -2948,6 +3088,27 @@ Res0 15:8 Field 7:0 LR EndSysreg +Sysreg MPAM1_EL1 3 0 10 5 0 +Field 63 MPAMEN +Res0 62:61 +Field 60 FORCED_NS +Res0 59:55 +Field 54 ALTSP_FRCD +Res0 53:48 +Field 47:40 PMG_D +Field 39:32 PMG_I +Field 31:16 PARTID_D +Field 15:0 PARTID_I +EndSysreg + +Sysreg MPAM0_EL1 3 0 10 5 1 +Res0 63:48 +Field 47:40 PMG_D +Field 39:32 PMG_I +Field 31:16 PARTID_D +Field 15:0 PARTID_I +EndSysreg + Sysreg ISR_EL1 3 0 12 1 0 Res0 63:11 Field 10 IS From 23b33d1e168cfcc96666f025beb3bccfcb58403a Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 30 Oct 2024 16:03:12 +0000 Subject: [PATCH 50/83] arm64: head.S: Initialise MPAM EL2 registers and disable traps Add code to head.S's el2_setup to detect MPAM and disable any EL2 traps. This register resets to an unknown value, setting it to the default parititons/pmg before we enable the MMU is the best thing to do. Kexec/kdump will depend on this if the previous kernel left the CPU configured with a restrictive configuration. If linux is booted at the highest implemented exception level el2_setup will clear the enable bit, disabling MPAM. This code can't be enabled until a subsequent patch adds the Kconfig and cpufeature boiler plate. Signed-off-by: James Morse Signed-off-by: Joey Gouly Reviewed-by: Gavin Shan Tested-by: Shameer Kolothum Acked-by: Catalin Marinas Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241030160317.2528209-3-joey.gouly@arm.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/el2_setup.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index e0ffdf13a18b..4cd41464be3f 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -220,6 +220,19 @@ msr spsr_el2, x0 .endm +.macro __init_el2_mpam + /* Memory Partitioning And Monitoring: disable EL2 traps */ + mrs x1, id_aa64pfr0_el1 + ubfx x0, x1, #ID_AA64PFR0_EL1_MPAM_SHIFT, #4 + cbz x0, .Lskip_mpam_\@ // skip if no MPAM + msr_s SYS_MPAM2_EL2, xzr // use the default partition + // and disable lower traps + mrs_s x0, SYS_MPAMIDR_EL1 + tbz x0, #MPAMIDR_EL1_HAS_HCR_SHIFT, .Lskip_mpam_\@ // skip if no MPAMHCR reg + msr_s SYS_MPAMHCR_EL2, xzr // clear TRAP_MPAMIDR_EL1 -> EL2 +.Lskip_mpam_\@: +.endm + /** * Initialize EL2 registers to sane values. This should be called early on all * cores that were booted in EL2. Note that everything gets initialised as @@ -237,6 +250,7 @@ __init_el2_stage2 __init_el2_gicv3 __init_el2_hstr + __init_el2_mpam __init_el2_nvhe_idregs __init_el2_cptr __init_el2_fgt From 09e6b306f3bad803a9743e40da6a644d66d19928 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 30 Oct 2024 16:03:13 +0000 Subject: [PATCH 51/83] arm64: cpufeature: discover CPU support for MPAM ARMv8.4 adds support for 'Memory Partitioning And Monitoring' (MPAM) which describes an interface to cache and bandwidth controls wherever they appear in the system. Add support to detect MPAM. Like SVE, MPAM has an extra id register that describes some more properties, including the virtualisation support, which is optional. Detect this separately so we can detect mismatched/insane systems, but still use MPAM on the host even if the virtualisation support is missing. MPAM needs enabling at the highest implemented exception level, otherwise the register accesses trap. The 'enabled' flag is accessible to lower exception levels, but its in a register that traps when MPAM isn't enabled. The cpufeature 'matches' hook is extended to test this on one of the CPUs, so that firmware can emulate MPAM as disabled if it is reserved for use by secure world. Secondary CPUs that appear late could trip cpufeature's 'lower safe' behaviour after the MPAM properties have been advertised to user-space. Add a verify call to ensure late secondaries match the existing CPUs. (If you have a boot failure that bisects here its likely your CPUs advertise MPAM in the id registers, but firmware failed to either enable or MPAM, or emulate the trap as if it were disabled) Signed-off-by: James Morse Signed-off-by: Joey Gouly Reviewed-by: Gavin Shan Tested-by: Shameer Kolothum Acked-by: Catalin Marinas Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241030160317.2528209-4-joey.gouly@arm.com Signed-off-by: Oliver Upton --- .../arch/arm64/cpu-feature-registers.rst | 2 + arch/arm64/include/asm/cpu.h | 1 + arch/arm64/include/asm/cpucaps.h | 5 + arch/arm64/include/asm/cpufeature.h | 17 ++++ arch/arm64/kernel/cpufeature.c | 96 +++++++++++++++++++ arch/arm64/kernel/cpuinfo.c | 3 + arch/arm64/tools/cpucaps | 2 + 7 files changed, 126 insertions(+) diff --git a/Documentation/arch/arm64/cpu-feature-registers.rst b/Documentation/arch/arm64/cpu-feature-registers.rst index 44f9bd78539d..253e9743de2f 100644 --- a/Documentation/arch/arm64/cpu-feature-registers.rst +++ b/Documentation/arch/arm64/cpu-feature-registers.rst @@ -152,6 +152,8 @@ infrastructure: +------------------------------+---------+---------+ | DIT | [51-48] | y | +------------------------------+---------+---------+ + | MPAM | [43-40] | n | + +------------------------------+---------+---------+ | SVE | [35-32] | y | +------------------------------+---------+---------+ | GIC | [27-24] | n | diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h index 9b73fd0cd721..81e4157f92b7 100644 --- a/arch/arm64/include/asm/cpu.h +++ b/arch/arm64/include/asm/cpu.h @@ -46,6 +46,7 @@ struct cpuinfo_arm64 { u64 reg_revidr; u64 reg_gmid; u64 reg_smidr; + u64 reg_mpamidr; u64 reg_id_aa64dfr0; u64 reg_id_aa64dfr1; diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index a6e5b07b64fd..f20e6e4212a7 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -60,6 +60,11 @@ cpucap_is_possible(const unsigned int cap) return IS_ENABLED(CONFIG_ARM64_WORKAROUND_REPEAT_TLBI); case ARM64_WORKAROUND_SPECULATIVE_SSBS: return IS_ENABLED(CONFIG_ARM64_ERRATUM_3194386); + case ARM64_MPAM: + /* + * KVM MPAM support doesn't rely on the host kernel supporting MPAM. + */ + return true; } return true; diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 3d261cc123c1..352aa6b93964 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -612,6 +612,13 @@ static inline bool id_aa64pfr1_sme(u64 pfr1) return val > 0; } +static inline bool id_aa64pfr0_mpam(u64 pfr0) +{ + u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_MPAM_SHIFT); + + return val > 0; +} + static inline bool id_aa64pfr1_mte(u64 pfr1) { u32 val = cpuid_feature_extract_unsigned_field(pfr1, ID_AA64PFR1_EL1_MTE_SHIFT); @@ -838,6 +845,16 @@ static inline bool system_supports_poe(void) alternative_has_cap_unlikely(ARM64_HAS_S1POE); } +static inline bool system_supports_mpam(void) +{ + return alternative_has_cap_unlikely(ARM64_MPAM); +} + +static __always_inline bool system_supports_mpam_hcr(void) +{ + return alternative_has_cap_unlikely(ARM64_MPAM_HCR); +} + int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); bool try_emulate_mrs(struct pt_regs *regs, u32 isn); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 718728a85430..33dce6b9c49b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -684,6 +684,14 @@ static const struct arm64_ftr_bits ftr_id_dfr1[] = { ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_mpamidr[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_PMG_MAX_SHIFT, MPAMIDR_EL1_PMG_MAX_WIDTH, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_VPMR_MAX_SHIFT, MPAMIDR_EL1_VPMR_MAX_WIDTH, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_HAS_HCR_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_PARTID_MAX_SHIFT, MPAMIDR_EL1_PARTID_MAX_WIDTH, 0), + ARM64_FTR_END, +}; + /* * Common ftr bits for a 32bit register with all hidden, strict * attributes, with 4bit feature fields and a default safe value of @@ -804,6 +812,9 @@ static const struct __ftr_reg_entry { ARM64_FTR_REG(SYS_ID_AA64MMFR3_EL1, ftr_id_aa64mmfr3), ARM64_FTR_REG(SYS_ID_AA64MMFR4_EL1, ftr_id_aa64mmfr4), + /* Op1 = 0, CRn = 10, CRm = 4 */ + ARM64_FTR_REG(SYS_MPAMIDR_EL1, ftr_mpamidr), + /* Op1 = 1, CRn = 0, CRm = 0 */ ARM64_FTR_REG(SYS_GMID_EL1, ftr_gmid), @@ -1163,6 +1174,9 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) cpacr_restore(cpacr); } + if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) + init_cpu_ftr_reg(SYS_MPAMIDR_EL1, info->reg_mpamidr); + if (id_aa64pfr1_mte(info->reg_id_aa64pfr1)) init_cpu_ftr_reg(SYS_GMID_EL1, info->reg_gmid); } @@ -1419,6 +1433,11 @@ void update_cpu_features(int cpu, cpacr_restore(cpacr); } + if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) { + taint |= check_update_ftr_reg(SYS_MPAMIDR_EL1, cpu, + info->reg_mpamidr, boot->reg_mpamidr); + } + /* * The kernel uses the LDGM/STGM instructions and the number of tags * they read/write depends on the GMID_EL1.BS field. Check that the @@ -2377,6 +2396,36 @@ cpucap_panic_on_conflict(const struct arm64_cpu_capabilities *cap) return !!(cap->type & ARM64_CPUCAP_PANIC_ON_CONFLICT); } +static bool +test_has_mpam(const struct arm64_cpu_capabilities *entry, int scope) +{ + if (!has_cpuid_feature(entry, scope)) + return false; + + /* Check firmware actually enabled MPAM on this cpu. */ + return (read_sysreg_s(SYS_MPAM1_EL1) & MPAM1_EL1_MPAMEN); +} + +static void +cpu_enable_mpam(const struct arm64_cpu_capabilities *entry) +{ + /* + * Access by the kernel (at EL1) should use the reserved PARTID + * which is configured unrestricted. This avoids priority-inversion + * where latency sensitive tasks have to wait for a task that has + * been throttled to release the lock. + */ + write_sysreg_s(0, SYS_MPAM1_EL1); +} + +static bool +test_has_mpam_hcr(const struct arm64_cpu_capabilities *entry, int scope) +{ + u64 idr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1); + + return idr & MPAMIDR_EL1_HAS_HCR; +} + static const struct arm64_cpu_capabilities arm64_features[] = { { .capability = ARM64_ALWAYS_BOOT, @@ -2873,6 +2922,20 @@ static const struct arm64_cpu_capabilities arm64_features[] = { #endif }, #endif + { + .desc = "Memory Partitioning And Monitoring", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_MPAM, + .matches = test_has_mpam, + .cpu_enable = cpu_enable_mpam, + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, MPAM, 1) + }, + { + .desc = "Memory Partitioning And Monitoring Virtualisation", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_MPAM_HCR, + .matches = test_has_mpam_hcr, + }, { .desc = "NV1", .capability = ARM64_HAS_HCR_NV1, @@ -3396,6 +3459,36 @@ static void verify_hyp_capabilities(void) } } +static void verify_mpam_capabilities(void) +{ + u64 cpu_idr = read_cpuid(ID_AA64PFR0_EL1); + u64 sys_idr = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + u16 cpu_partid_max, cpu_pmg_max, sys_partid_max, sys_pmg_max; + + if (FIELD_GET(ID_AA64PFR0_EL1_MPAM_MASK, cpu_idr) != + FIELD_GET(ID_AA64PFR0_EL1_MPAM_MASK, sys_idr)) { + pr_crit("CPU%d: MPAM version mismatch\n", smp_processor_id()); + cpu_die_early(); + } + + cpu_idr = read_cpuid(MPAMIDR_EL1); + sys_idr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1); + if (FIELD_GET(MPAMIDR_EL1_HAS_HCR, cpu_idr) != + FIELD_GET(MPAMIDR_EL1_HAS_HCR, sys_idr)) { + pr_crit("CPU%d: Missing MPAM HCR\n", smp_processor_id()); + cpu_die_early(); + } + + cpu_partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, cpu_idr); + cpu_pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, cpu_idr); + sys_partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, sys_idr); + sys_pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, sys_idr); + if (cpu_partid_max < sys_partid_max || cpu_pmg_max < sys_pmg_max) { + pr_crit("CPU%d: MPAM PARTID/PMG max values are mismatched\n", smp_processor_id()); + cpu_die_early(); + } +} + /* * Run through the enabled system capabilities and enable() it on this CPU. * The capabilities were decided based on the available CPUs at the boot time. @@ -3422,6 +3515,9 @@ static void verify_local_cpu_capabilities(void) if (is_hyp_mode_available()) verify_hyp_capabilities(); + + if (system_supports_mpam()) + verify_mpam_capabilities(); } void check_local_cpu_capabilities(void) diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 44718d0482b3..46ba30d42b9b 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -478,6 +478,9 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) __cpuinfo_store_cpu_32bit(&info->aarch32); + if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) + info->reg_mpamidr = read_cpuid(MPAMIDR_EL1); + cpuinfo_detect_icache_policy(info); } diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index eedb5acc21ed..dfac646430ac 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -60,6 +60,8 @@ HW_DBM KVM_HVHE KVM_PROTECTED_MODE MISMATCHED_CACHE_TYPE +MPAM +MPAM_HCR MTE MTE_ASYMM SME From 31ff96c38ea393d9707f1d95b4bf8d372cf32177 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 30 Oct 2024 16:03:14 +0000 Subject: [PATCH 52/83] KVM: arm64: Fix missing traps of guest accesses to the MPAM registers commit 011e5f5bf529f ("arm64/cpufeature: Add remaining feature bits in ID_AA64PFR0 register") exposed the MPAM field of AA64PFR0_EL1 to guests, but didn't add trap handling. If you are unlucky, this results in an MPAM aware guest being delivered an undef during boot. The host prints: | kvm [97]: Unsupported guest sys_reg access at: ffff800080024c64 [00000005] | { Op0( 3), Op1( 0), CRn(10), CRm( 5), Op2( 0), func_read }, Which results in: | Internal error: Oops - Undefined instruction: 0000000002000000 [#1] PREEMPT SMP | Modules linked in: | CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.6.0-rc7-00559-gd89c186d50b2 #14616 | Hardware name: linux,dummy-virt (DT) | pstate: 00000005 (nzcv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) | pc : test_has_mpam+0x18/0x30 | lr : test_has_mpam+0x10/0x30 | sp : ffff80008000bd90 ... | Call trace: | test_has_mpam+0x18/0x30 | update_cpu_capabilities+0x7c/0x11c | setup_cpu_features+0x14/0xd8 | smp_cpus_done+0x24/0xb8 | smp_init+0x7c/0x8c | kernel_init_freeable+0xf8/0x280 | kernel_init+0x24/0x1e0 | ret_from_fork+0x10/0x20 | Code: 910003fd 97ffffde 72001c00 54000080 (d538a500) | ---[ end trace 0000000000000000 ]--- | Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b | ---[ end Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b ]--- Add the support to enable the traps, and handle the three guest accessible registers by injecting an UNDEF. This stops KVM from spamming the host log, but doesn't yet hide the feature from the id registers. With MPAM v1.0 we can trap the MPAMIDR_EL1 register only if ARM64_HAS_MPAM_HCR, with v1.1 an additional MPAM2_EL2.TIDR bit traps MPAMIDR_EL1 on platforms that don't have MPAMHCR_EL2. Enable one of these if either is supported. If neither is supported, the guest can discover that the CPU has MPAM support, and how many PARTID etc the host has ... but it can't influence anything, so its harmless. Fixes: 011e5f5bf529f ("arm64/cpufeature: Add remaining feature bits in ID_AA64PFR0 register") CC: Anshuman Khandual Link: https://lore.kernel.org/linux-arm-kernel/20200925160102.118858-1-james.morse@arm.com/ Signed-off-by: James Morse Signed-off-by: Joey Gouly Reviewed-by: Gavin Shan Tested-by: Shameer Kolothum Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241030160317.2528209-5-joey.gouly@arm.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/cpufeature.h | 2 +- arch/arm64/include/asm/kvm_arm.h | 1 + arch/arm64/kvm/hyp/include/hyp/switch.h | 31 +++++++++++++++++++++++++ arch/arm64/kvm/sys_regs.c | 14 +++++++++++ 4 files changed, 47 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 352aa6b93964..93fe8e6beb64 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -845,7 +845,7 @@ static inline bool system_supports_poe(void) alternative_has_cap_unlikely(ARM64_HAS_S1POE); } -static inline bool system_supports_mpam(void) +static __always_inline bool system_supports_mpam(void) { return alternative_has_cap_unlikely(ARM64_MPAM); } diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 109a85ee6910..16afb7a79b15 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -103,6 +103,7 @@ #define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H) #define HCRX_HOST_FLAGS (HCRX_EL2_MSCEn | HCRX_EL2_TCR2En | HCRX_EL2_EnFPM) +#define MPAMHCR_HOST_FLAGS 0 /* TCR_EL2 Registers bits */ #define TCR_EL2_DS (1UL << 32) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 5310fe1da616..34f53707892d 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -204,6 +204,35 @@ static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu) __deactivate_fgt(hctxt, vcpu, kvm, HAFGRTR_EL2); } +static inline void __activate_traps_mpam(struct kvm_vcpu *vcpu) +{ + u64 r = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1; + + if (!system_supports_mpam()) + return; + + /* trap guest access to MPAMIDR_EL1 */ + if (system_supports_mpam_hcr()) { + write_sysreg_s(MPAMHCR_EL2_TRAP_MPAMIDR_EL1, SYS_MPAMHCR_EL2); + } else { + /* From v1.1 TIDR can trap MPAMIDR, set it unconditionally */ + r |= MPAM2_EL2_TIDR; + } + + write_sysreg_s(r, SYS_MPAM2_EL2); +} + +static inline void __deactivate_traps_mpam(void) +{ + if (!system_supports_mpam()) + return; + + write_sysreg_s(0, SYS_MPAM2_EL2); + + if (system_supports_mpam_hcr()) + write_sysreg_s(MPAMHCR_HOST_FLAGS, SYS_MPAMHCR_EL2); +} + static inline void __activate_traps_common(struct kvm_vcpu *vcpu) { /* Trap on AArch32 cp15 c15 (impdef sysregs) accesses (EL1 or EL0) */ @@ -244,6 +273,7 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) } __activate_traps_hfgxtr(vcpu); + __activate_traps_mpam(vcpu); } static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) @@ -263,6 +293,7 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) write_sysreg_s(HCRX_HOST_FLAGS, SYS_HCRX_EL2); __deactivate_traps_hfgxtr(vcpu); + __deactivate_traps_mpam(); } static inline void ___activate_traps(struct kvm_vcpu *vcpu, u64 hcr) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index ff8c4e1b847e..66d6af1dbadc 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2553,8 +2553,11 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_LOREA_EL1), trap_loregion }, { SYS_DESC(SYS_LORN_EL1), trap_loregion }, { SYS_DESC(SYS_LORC_EL1), trap_loregion }, + { SYS_DESC(SYS_MPAMIDR_EL1), undef_access }, { SYS_DESC(SYS_LORID_EL1), trap_loregion }, + { SYS_DESC(SYS_MPAM1_EL1), undef_access }, + { SYS_DESC(SYS_MPAM0_EL1), undef_access }, { SYS_DESC(SYS_VBAR_EL1), access_rw, reset_val, VBAR_EL1, 0 }, { SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 }, @@ -2854,6 +2857,17 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG(MAIR_EL2, access_rw, reset_val, 0), EL2_REG(AMAIR_EL2, access_rw, reset_val, 0), + { SYS_DESC(SYS_MPAMHCR_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPMV_EL2), undef_access }, + { SYS_DESC(SYS_MPAM2_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM0_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM1_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM2_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM3_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM4_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM5_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM6_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM7_EL2), undef_access }, EL2_REG(VBAR_EL2, access_rw, reset_val, 0), EL2_REG(RVBAR_EL2, access_rw, reset_val, 0), From 7da540e29dea6016ed55d16450d3133c70761d21 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 30 Oct 2024 16:03:15 +0000 Subject: [PATCH 53/83] KVM: arm64: Add a macro for creating filtered sys_reg_descs entries The sys_reg_descs array holds function pointers and reset value for managing the user-space and guest view of system registers. These are mostly created by a set of macro's as only some combinations of behaviour are needed. If a register needs special treatment, its sys_reg_descs entry is open-coded. This is true of some id registers where the value provided by user-space is validated by some helpers. Before adding another one of these, add a helper that covers the existing special cases. 'ID_FILTERED' expects helpers to set the user-space value, and retrieve the modified reset value. Like ID_WRITABLE() this uses id_visibility(), which should have no functional change for the registers converted to use ID_FILTERED(). read_sanitised_id_aa64dfr0_el1() and read_sanitised_id_aa64pfr0_el1() have been refactored to be called from kvm_read_sanitised_id_reg(), to try be consistent with ID_WRITABLE(). Signed-off-by: James Morse Signed-off-by: Joey Gouly Reviewed-by: Gavin Shan Tested-by: Shameer Kolothum Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241030160317.2528209-6-joey.gouly@arm.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/sys_regs.c | 66 ++++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 66d6af1dbadc..86ce783a54a6 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1509,6 +1509,9 @@ static u8 pmuver_to_perfmon(u8 pmuver) } } +static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val); +static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val); + /* Read a sanitised cpufeature ID register by sys_reg_desc */ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) @@ -1522,6 +1525,12 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, val = read_sanitised_ftr_reg(id); switch (id) { + case SYS_ID_AA64DFR0_EL1: + val = sanitise_id_aa64dfr0_el1(vcpu, val); + break; + case SYS_ID_AA64PFR0_EL1: + val = sanitise_id_aa64pfr0_el1(vcpu, val); + break; case SYS_ID_AA64PFR1_EL1: if (!kvm_has_mte(vcpu->kvm)) val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE); @@ -1692,11 +1701,8 @@ static unsigned int fp8_visibility(const struct kvm_vcpu *vcpu, return REG_HIDDEN; } -static u64 read_sanitised_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) +static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val) { - u64 val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); - if (!vcpu_has_sve(vcpu)) val &= ~ID_AA64PFR0_EL1_SVE_MASK; @@ -1737,11 +1743,8 @@ static u64 read_sanitised_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, (val); \ }) -static u64 read_sanitised_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) +static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val) { - u64 val = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); - val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8); /* @@ -1834,6 +1837,12 @@ static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, return set_id_reg(vcpu, rd, val); } +static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 val) +{ + return set_id_reg(vcpu, rd, val); +} + /* * cpufeature ID register user accessors * @@ -2150,6 +2159,15 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu, .val = mask, \ } +/* sys_reg_desc initialiser for cpufeature ID registers that need filtering */ +#define ID_FILTERED(sysreg, name, mask) { \ + ID_DESC(sysreg), \ + .set_user = set_##name, \ + .visibility = id_visibility, \ + .reset = kvm_read_sanitised_id_reg, \ + .val = (mask), \ +} + /* * sys_reg_desc initialiser for architecturally unallocated cpufeature ID * register with encoding Op0=3, Op1=0, CRn=0, CRm=crm, Op2=op2 @@ -2374,17 +2392,13 @@ static const struct sys_reg_desc sys_reg_descs[] = { /* AArch64 ID registers */ /* CRm=4 */ - { SYS_DESC(SYS_ID_AA64PFR0_EL1), - .access = access_id_reg, - .get_user = get_id_reg, - .set_user = set_id_reg, - .reset = read_sanitised_id_aa64pfr0_el1, - .val = ~(ID_AA64PFR0_EL1_AMU | - ID_AA64PFR0_EL1_MPAM | - ID_AA64PFR0_EL1_SVE | - ID_AA64PFR0_EL1_RAS | - ID_AA64PFR0_EL1_AdvSIMD | - ID_AA64PFR0_EL1_FP), }, + ID_FILTERED(ID_AA64PFR0_EL1, id_aa64pfr0_el1, + ~(ID_AA64PFR0_EL1_AMU | + ID_AA64PFR0_EL1_MPAM | + ID_AA64PFR0_EL1_SVE | + ID_AA64PFR0_EL1_RAS | + ID_AA64PFR0_EL1_AdvSIMD | + ID_AA64PFR0_EL1_FP)), ID_WRITABLE(ID_AA64PFR1_EL1, ~(ID_AA64PFR1_EL1_PFAR | ID_AA64PFR1_EL1_DF2 | ID_AA64PFR1_EL1_MTEX | @@ -2406,11 +2420,6 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_WRITABLE(ID_AA64FPFR0_EL1, ~ID_AA64FPFR0_EL1_RES0), /* CRm=5 */ - { SYS_DESC(SYS_ID_AA64DFR0_EL1), - .access = access_id_reg, - .get_user = get_id_reg, - .set_user = set_id_aa64dfr0_el1, - .reset = read_sanitised_id_aa64dfr0_el1, /* * Prior to FEAT_Debugv8.9, the architecture defines context-aware * breakpoints (CTX_CMPs) as the highest numbered breakpoints (BRPs). @@ -2423,10 +2432,11 @@ static const struct sys_reg_desc sys_reg_descs[] = { * See DDI0487K.a, section D2.8.3 Breakpoint types and linking * of breakpoints for more details. */ - .val = ID_AA64DFR0_EL1_DoubleLock_MASK | - ID_AA64DFR0_EL1_WRPs_MASK | - ID_AA64DFR0_EL1_PMUVer_MASK | - ID_AA64DFR0_EL1_DebugVer_MASK, }, + ID_FILTERED(ID_AA64DFR0_EL1, id_aa64dfr0_el1, + ID_AA64DFR0_EL1_DoubleLock_MASK | + ID_AA64DFR0_EL1_WRPs_MASK | + ID_AA64DFR0_EL1_PMUVer_MASK | + ID_AA64DFR0_EL1_DebugVer_MASK), ID_SANITISED(ID_AA64DFR1_EL1), ID_UNALLOCATED(5,2), ID_UNALLOCATED(5,3), From 6685f5d572c22e1003e7c0d089afe1c64340ab1f Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 30 Oct 2024 16:03:16 +0000 Subject: [PATCH 54/83] KVM: arm64: Disable MPAM visibility by default and ignore VMM writes commit 011e5f5bf529f ("arm64/cpufeature: Add remaining feature bits in ID_AA64PFR0 register") exposed the MPAM field of AA64PFR0_EL1 to guests, but didn't add trap handling. A previous patch supplied the missing trap handling. Existing VMs that have the MPAM field of ID_AA64PFR0_EL1 set need to be migratable, but there is little point enabling the MPAM CPU interface on new VMs until there is something a guest can do with it. Clear the MPAM field from the guest's ID_AA64PFR0_EL1 and on hardware that supports MPAM, politely ignore the VMMs attempts to set this bit. Guests exposed to this bug have the sanitised value of the MPAM field, so only the correct value needs to be ignored. This means the field can continue to be used to block migration to incompatible hardware (between MPAM=1 and MPAM=5), and the VMM can't rely on the field being ignored. Signed-off-by: James Morse Co-developed-by: Joey Gouly Signed-off-by: Joey Gouly Reviewed-by: Gavin Shan Tested-by: Shameer Kolothum Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241030160317.2528209-7-joey.gouly@arm.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/sys_regs.c | 45 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 86ce783a54a6..7dc4a5ce5292 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1544,6 +1544,7 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTEX); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_DF2); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_PFAR); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MPAM_frac); break; case SYS_ID_AA64PFR2_EL1: /* We only expose FPMR */ @@ -1730,6 +1731,13 @@ static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val) val &= ~ID_AA64PFR0_EL1_AMU_MASK; + /* + * MPAM is disabled by default as KVM also needs a set of PARTID to + * program the MPAMVPMx_EL2 PARTID remapping registers with. But some + * older kernels let the guest see the ID bit. + */ + val &= ~ID_AA64PFR0_EL1_MPAM_MASK; + return val; } @@ -1838,9 +1846,39 @@ static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, } static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd, u64 val) + const struct sys_reg_desc *rd, u64 user_val) { - return set_id_reg(vcpu, rd, val); + u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + u64 mpam_mask = ID_AA64PFR0_EL1_MPAM_MASK; + + /* + * Commit 011e5f5bf529f ("arm64/cpufeature: Add remaining feature bits + * in ID_AA64PFR0 register") exposed the MPAM field of AA64PFR0_EL1 to + * guests, but didn't add trap handling. KVM doesn't support MPAM and + * always returns an UNDEF for these registers. The guest must see 0 + * for this field. + * + * But KVM must also accept values from user-space that were provided + * by KVM. On CPUs that support MPAM, permit user-space to write + * the sanitizied value to ID_AA64PFR0_EL1.MPAM, but ignore this field. + */ + if ((hw_val & mpam_mask) == (user_val & mpam_mask)) + user_val &= ~ID_AA64PFR0_EL1_MPAM_MASK; + + return set_id_reg(vcpu, rd, user_val); +} + +static int set_id_aa64pfr1_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 user_val) +{ + u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); + u64 mpam_mask = ID_AA64PFR1_EL1_MPAM_frac_MASK; + + /* See set_id_aa64pfr0_el1 for comment about MPAM */ + if ((hw_val & mpam_mask) == (user_val & mpam_mask)) + user_val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK; + + return set_id_reg(vcpu, rd, user_val); } /* @@ -2399,7 +2437,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_AA64PFR0_EL1_RAS | ID_AA64PFR0_EL1_AdvSIMD | ID_AA64PFR0_EL1_FP)), - ID_WRITABLE(ID_AA64PFR1_EL1, ~(ID_AA64PFR1_EL1_PFAR | + ID_FILTERED(ID_AA64PFR1_EL1, id_aa64pfr1_el1, + ~(ID_AA64PFR1_EL1_PFAR | ID_AA64PFR1_EL1_DF2 | ID_AA64PFR1_EL1_MTEX | ID_AA64PFR1_EL1_THE | From 75cd027cbcb161e277209e20df14f0818c62d9e7 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 30 Oct 2024 16:03:17 +0000 Subject: [PATCH 55/83] KVM: arm64: selftests: Test ID_AA64PFR0.MPAM isn't completely ignored The ID_AA64PFR0.MPAM bit was previously accidentally exposed to guests, and is ignored by KVM. KVM will always present the guest with 0 here, and trap the MPAM system registers to inject an undef. But, this value is still needed to prevent migration when the value is incompatible with the target hardware. Add a kvm unit test to try and write multiple values to ID_AA64PFR0.MPAM. Only the hardware value previously exposed should be ignored, all other values should be rejected. Signed-off-by: James Morse Signed-off-by: Joey Gouly Tested-by: Shameer Kolothum Reviewed-by: Gavin Shan Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241030160317.2528209-8-joey.gouly@arm.com Signed-off-by: Oliver Upton --- .../selftests/kvm/aarch64/set_id_regs.c | 99 ++++++++++++++++++- 1 file changed, 98 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/aarch64/set_id_regs.c b/tools/testing/selftests/kvm/aarch64/set_id_regs.c index b87e53580bfc..a79b7f18452d 100644 --- a/tools/testing/selftests/kvm/aarch64/set_id_regs.c +++ b/tools/testing/selftests/kvm/aarch64/set_id_regs.c @@ -443,6 +443,101 @@ static void test_vm_ftr_id_regs(struct kvm_vcpu *vcpu, bool aarch64_only) } } +#define MPAM_IDREG_TEST 6 +static void test_user_set_mpam_reg(struct kvm_vcpu *vcpu) +{ + uint64_t masks[KVM_ARM_FEATURE_ID_RANGE_SIZE]; + struct reg_mask_range range = { + .addr = (__u64)masks, + }; + uint64_t val; + int idx, err; + + /* + * If ID_AA64PFR0.MPAM is _not_ officially modifiable and is zero, + * check that if it can be set to 1, (i.e. it is supported by the + * hardware), that it can't be set to other values. + */ + + /* Get writable masks for feature ID registers */ + memset(range.reserved, 0, sizeof(range.reserved)); + vm_ioctl(vcpu->vm, KVM_ARM_GET_REG_WRITABLE_MASKS, &range); + + /* Writeable? Nothing to test! */ + idx = encoding_to_range_idx(SYS_ID_AA64PFR0_EL1); + if ((masks[idx] & ID_AA64PFR0_EL1_MPAM_MASK) == ID_AA64PFR0_EL1_MPAM_MASK) { + ksft_test_result_skip("ID_AA64PFR0_EL1.MPAM is officially writable, nothing to test\n"); + return; + } + + /* Get the id register value */ + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), &val); + + /* Try to set MPAM=0. This should always be possible. */ + val &= ~ID_AA64PFR0_EL1_MPAM_MASK; + val |= FIELD_PREP(ID_AA64PFR0_EL1_MPAM_MASK, 0); + err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), val); + if (err) + ksft_test_result_fail("ID_AA64PFR0_EL1.MPAM=0 was not accepted\n"); + else + ksft_test_result_pass("ID_AA64PFR0_EL1.MPAM=0 worked\n"); + + /* Try to set MPAM=1 */ + val &= ~ID_AA64PFR0_EL1_MPAM_MASK; + val |= FIELD_PREP(ID_AA64PFR0_EL1_MPAM_MASK, 1); + err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), val); + if (err) + ksft_test_result_skip("ID_AA64PFR0_EL1.MPAM is not writable, nothing to test\n"); + else + ksft_test_result_pass("ID_AA64PFR0_EL1.MPAM=1 was writable\n"); + + /* Try to set MPAM=2 */ + val &= ~ID_AA64PFR0_EL1_MPAM_MASK; + val |= FIELD_PREP(ID_AA64PFR0_EL1_MPAM_MASK, 2); + err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), val); + if (err) + ksft_test_result_pass("ID_AA64PFR0_EL1.MPAM not arbitrarily modifiable\n"); + else + ksft_test_result_fail("ID_AA64PFR0_EL1.MPAM value should not be ignored\n"); + + /* And again for ID_AA64PFR1_EL1.MPAM_frac */ + idx = encoding_to_range_idx(SYS_ID_AA64PFR1_EL1); + if ((masks[idx] & ID_AA64PFR1_EL1_MPAM_frac_MASK) == ID_AA64PFR1_EL1_MPAM_frac_MASK) { + ksft_test_result_skip("ID_AA64PFR1_EL1.MPAM_frac is officially writable, nothing to test\n"); + return; + } + + /* Get the id register value */ + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1), &val); + + /* Try to set MPAM_frac=0. This should always be possible. */ + val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK; + val |= FIELD_PREP(ID_AA64PFR1_EL1_MPAM_frac_MASK, 0); + err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1), val); + if (err) + ksft_test_result_fail("ID_AA64PFR0_EL1.MPAM_frac=0 was not accepted\n"); + else + ksft_test_result_pass("ID_AA64PFR0_EL1.MPAM_frac=0 worked\n"); + + /* Try to set MPAM_frac=1 */ + val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK; + val |= FIELD_PREP(ID_AA64PFR1_EL1_MPAM_frac_MASK, 1); + err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1), val); + if (err) + ksft_test_result_skip("ID_AA64PFR1_EL1.MPAM_frac is not writable, nothing to test\n"); + else + ksft_test_result_pass("ID_AA64PFR0_EL1.MPAM_frac=1 was writable\n"); + + /* Try to set MPAM_frac=2 */ + val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK; + val |= FIELD_PREP(ID_AA64PFR1_EL1_MPAM_frac_MASK, 2); + err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1), val); + if (err) + ksft_test_result_pass("ID_AA64PFR1_EL1.MPAM_frac not arbitrarily modifiable\n"); + else + ksft_test_result_fail("ID_AA64PFR1_EL1.MPAM_frac value should not be ignored\n"); +} + static void test_guest_reg_read(struct kvm_vcpu *vcpu) { bool done = false; @@ -581,12 +676,14 @@ int main(void) ARRAY_SIZE(ftr_id_aa64isar2_el1) + ARRAY_SIZE(ftr_id_aa64pfr0_el1) + ARRAY_SIZE(ftr_id_aa64pfr1_el1) + ARRAY_SIZE(ftr_id_aa64mmfr0_el1) + ARRAY_SIZE(ftr_id_aa64mmfr1_el1) + ARRAY_SIZE(ftr_id_aa64mmfr2_el1) + - ARRAY_SIZE(ftr_id_aa64zfr0_el1) - ARRAY_SIZE(test_regs) + 2; + ARRAY_SIZE(ftr_id_aa64zfr0_el1) - ARRAY_SIZE(test_regs) + 2 + + MPAM_IDREG_TEST; ksft_set_plan(test_cnt); test_vm_ftr_id_regs(vcpu, aarch64_only); test_vcpu_ftr_id_regs(vcpu); + test_user_set_mpam_reg(vcpu); test_guest_reg_read(vcpu); From 0546d4a925a6ce52b362e528f6d962efd72c84b9 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 18 Oct 2024 08:48:29 +0100 Subject: [PATCH 56/83] KVM: arm64: Move pkvm_vcpu_init_traps() to init_pkvm_hyp_vcpu() Move pkvm_vcpu_init_traps() to the initialization of the hypervisor's vcpu state in init_pkvm_hyp_vcpu(), and remove the associated hypercall. In protected mode, traps need to be initialized whenever a VCPU is initialized anyway, and not only for protected VMs. This also saves an unnecessary hypercall. Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241018074833.2563674-2-tabba@google.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_asm.h | 1 - arch/arm64/kvm/arm.c | 8 -------- arch/arm64/kvm/hyp/include/nvhe/trap_handler.h | 2 -- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 8 -------- arch/arm64/kvm/hyp/nvhe/pkvm.c | 4 +++- 5 files changed, 3 insertions(+), 20 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index b36a3b6cc011..7f7866d2bfc2 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -76,7 +76,6 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff, __KVM_HOST_SMCCC_FUNC___vgic_v3_save_vmcr_aprs, __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_vmcr_aprs, - __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_init_traps, __KVM_HOST_SMCCC_FUNC___pkvm_init_vm, __KVM_HOST_SMCCC_FUNC___pkvm_init_vcpu, __KVM_HOST_SMCCC_FUNC___pkvm_teardown_vm, diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index a0d01c46e408..ece934bb4531 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -856,14 +856,6 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) static_branch_inc(&userspace_irqchip_in_use); } - /* - * Initialize traps for protected VMs. - * NOTE: Move to run in EL2 directly, rather than via a hypercall, once - * the code is in place for first run initialization at EL2. - */ - if (kvm_vm_is_protected(kvm)) - kvm_call_hyp_nvhe(__pkvm_vcpu_init_traps, vcpu); - mutex_lock(&kvm->arch.config_lock); set_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags); mutex_unlock(&kvm->arch.config_lock); diff --git a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h index 45a84f0ade04..1e6d995968a1 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h +++ b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h @@ -15,6 +15,4 @@ #define DECLARE_REG(type, name, ctxt, reg) \ type name = (type)cpu_reg(ctxt, (reg)) -void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu); - #endif /* __ARM64_KVM_NVHE_TRAP_HANDLER_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index fefc89209f9e..1a224d5df207 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -349,13 +349,6 @@ static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize(); } -static void handle___pkvm_vcpu_init_traps(struct kvm_cpu_context *host_ctxt) -{ - DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); - - __pkvm_vcpu_init_traps(kern_hyp_va(vcpu)); -} - static void handle___pkvm_init_vm(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm *, host_kvm, host_ctxt, 1); @@ -411,7 +404,6 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__kvm_timer_set_cntvoff), HANDLE_FUNC(__vgic_v3_save_vmcr_aprs), HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs), - HANDLE_FUNC(__pkvm_vcpu_init_traps), HANDLE_FUNC(__pkvm_init_vm), HANDLE_FUNC(__pkvm_init_vcpu), HANDLE_FUNC(__pkvm_teardown_vm), diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 077d4098548d..869955e551a0 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -204,7 +204,7 @@ static void pvm_init_trap_regs(struct kvm_vcpu *vcpu) /* * Initialize trap register values in protected mode. */ -void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu) +static void pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu) { pvm_init_trap_regs(vcpu); pvm_init_traps_aa64pfr0(vcpu); @@ -335,6 +335,8 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu; hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags); + + pkvm_vcpu_init_traps(&hyp_vcpu->vcpu); done: if (ret) unpin_host_vcpu(host_vcpu); From 3663b258f7231c7f3888c96e5c1eee33547873a6 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 18 Oct 2024 08:48:30 +0100 Subject: [PATCH 57/83] KVM: arm64: Refactor kvm_vcpu_enable_ptrauth() for hyp use Move kvm_vcpu_enable_ptrauth() to a shared header to be used by hypervisor code in protected mode. No functional change intended. Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241018074833.2563674-3-tabba@google.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_emulate.h | 4 ++++ arch/arm64/kvm/reset.c | 5 ----- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index a601a9305b10..9a68befbeecf 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -693,4 +693,8 @@ static inline bool guest_hyp_sve_traps_enabled(const struct kvm_vcpu *vcpu) return __guest_hyp_cptr_xen_trap_enabled(vcpu, ZEN); } +static inline void kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) +{ + vcpu_set_flag(vcpu, GUEST_HAS_PTRAUTH); +} #endif /* __ARM64_KVM_EMULATE_H__ */ diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 0b0ae5ae7bc2..470524b31951 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -167,11 +167,6 @@ static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu) memset(vcpu->arch.sve_state, 0, vcpu_sve_state_size(vcpu)); } -static void kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) -{ - vcpu_set_flag(vcpu, GUEST_HAS_PTRAUTH); -} - /** * kvm_reset_vcpu - sets core registers and sys_regs to reset value * @vcpu: The VCPU pointer From cb0c272acebdf099431c34972963377f83872a08 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 18 Oct 2024 08:48:31 +0100 Subject: [PATCH 58/83] KVM: arm64: Initialize the hypervisor's VM state at EL2 Do not trust the state of the VM as provided by the host when initializing the hypervisor's view of the VM sate. Initialize it instead at EL2 to a known good and safe state, as pKVM already does with hypervisor VCPU states. Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241018074833.2563674-4-tabba@google.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 77 ++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 869955e551a0..954df57a935f 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -6,6 +6,9 @@ #include #include + +#include + #include #include #include @@ -289,6 +292,65 @@ void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) hyp_spin_unlock(&vm_table_lock); } +static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struct kvm *host_kvm) +{ + struct kvm *kvm = &hyp_vm->kvm; + DECLARE_BITMAP(allowed_features, KVM_VCPU_MAX_FEATURES); + + /* No restrictions for non-protected VMs. */ + if (!kvm_vm_is_protected(kvm)) { + bitmap_copy(kvm->arch.vcpu_features, + host_kvm->arch.vcpu_features, + KVM_VCPU_MAX_FEATURES); + return; + } + + bitmap_zero(allowed_features, KVM_VCPU_MAX_FEATURES); + + /* + * For protected VMs, always allow: + * - CPU starting in poweroff state + * - PSCI v0.2 + */ + set_bit(KVM_ARM_VCPU_POWER_OFF, allowed_features); + set_bit(KVM_ARM_VCPU_PSCI_0_2, allowed_features); + + /* + * Check if remaining features are allowed: + * - Performance Monitoring + * - Scalable Vectors + * - Pointer Authentication + */ + if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), PVM_ID_AA64DFR0_ALLOW)) + set_bit(KVM_ARM_VCPU_PMU_V3, allowed_features); + + if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), PVM_ID_AA64PFR0_ALLOW)) + set_bit(KVM_ARM_VCPU_SVE, allowed_features); + + if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API), PVM_ID_AA64ISAR1_RESTRICT_UNSIGNED) && + FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA), PVM_ID_AA64ISAR1_RESTRICT_UNSIGNED)) + set_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, allowed_features); + + if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI), PVM_ID_AA64ISAR1_ALLOW) && + FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA), PVM_ID_AA64ISAR1_ALLOW)) + set_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, allowed_features); + + bitmap_and(kvm->arch.vcpu_features, host_kvm->arch.vcpu_features, + allowed_features, KVM_VCPU_MAX_FEATURES); +} + +static void pkvm_vcpu_init_ptrauth(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + + if (vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_ADDRESS) || + vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_GENERIC)) { + kvm_vcpu_enable_ptrauth(vcpu); + } else { + vcpu_clear_flag(&hyp_vcpu->vcpu, GUEST_HAS_PTRAUTH); + } +} + static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu) { if (host_vcpu) @@ -310,6 +372,18 @@ static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm, hyp_vm->host_kvm = host_kvm; hyp_vm->kvm.created_vcpus = nr_vcpus; hyp_vm->kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr; + hyp_vm->kvm.arch.pkvm.enabled = READ_ONCE(host_kvm->arch.pkvm.enabled); + pkvm_init_features_from_host(hyp_vm, host_kvm); +} + +static void pkvm_vcpu_init_sve(struct pkvm_hyp_vcpu *hyp_vcpu, struct kvm_vcpu *host_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + + if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE)) { + vcpu_clear_flag(vcpu, GUEST_HAS_SVE); + vcpu_clear_flag(vcpu, VCPU_SVE_FINALIZED); + } } static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, @@ -335,7 +409,10 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu; hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags); + hyp_vcpu->vcpu.arch.mp_state.mp_state = KVM_MP_STATE_STOPPED; + pkvm_vcpu_init_sve(hyp_vcpu, host_vcpu); + pkvm_vcpu_init_ptrauth(hyp_vcpu); pkvm_vcpu_init_traps(&hyp_vcpu->vcpu); done: if (ret) From b56680de9c6489f2aed707fad2811e714b52fe4c Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 18 Oct 2024 08:48:32 +0100 Subject: [PATCH 59/83] KVM: arm64: Initialize trap register values in hyp in pKVM Handle the initialization of trap registers at the hypervisor in pKVM, even for non-protected guests. The host is not trusted with the values of the trap registers, regardless of the VM type. Therefore, when switching between the host and the guests, only flush the HCR_EL2 TWI and TWE bits. The host is allowed to configure these for opportunistic scheduling, as neither affects the protection of VMs or the hypervisor. Reported-by: Will Deacon Fixes: 814ad8f96e92 ("KVM: arm64: Drop trapping of PAuth instructions/keys") Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241018074833.2563674-5-tabba@google.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 4 +++- arch/arm64/kvm/hyp/nvhe/pkvm.c | 35 ++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 1a224d5df207..6aa0b13d86e5 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -105,8 +105,10 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) hyp_vcpu->vcpu.arch.hw_mmu = host_vcpu->arch.hw_mmu; - hyp_vcpu->vcpu.arch.hcr_el2 = host_vcpu->arch.hcr_el2; hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2; + hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWI | HCR_TWE); + hyp_vcpu->vcpu.arch.hcr_el2 |= READ_ONCE(host_vcpu->arch.hcr_el2) & + (HCR_TWI | HCR_TWE); hyp_vcpu->vcpu.arch.iflags = host_vcpu->arch.iflags; diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 954df57a935f..01616c39a810 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -204,11 +204,46 @@ static void pvm_init_trap_regs(struct kvm_vcpu *vcpu) } } +static void pkvm_vcpu_reset_hcr(struct kvm_vcpu *vcpu) +{ + vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS; + + if (has_hvhe()) + vcpu->arch.hcr_el2 |= HCR_E2H; + + if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) { + /* route synchronous external abort exceptions to EL2 */ + vcpu->arch.hcr_el2 |= HCR_TEA; + /* trap error record accesses */ + vcpu->arch.hcr_el2 |= HCR_TERR; + } + + if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) + vcpu->arch.hcr_el2 |= HCR_FWB; + + if (cpus_have_final_cap(ARM64_HAS_EVT) && + !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE)) + vcpu->arch.hcr_el2 |= HCR_TID4; + else + vcpu->arch.hcr_el2 |= HCR_TID2; + + if (vcpu_has_ptrauth(vcpu)) + vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK); +} + /* * Initialize trap register values in protected mode. */ static void pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu) { + vcpu->arch.cptr_el2 = kvm_get_reset_cptr_el2(vcpu); + vcpu->arch.mdcr_el2 = 0; + + pkvm_vcpu_reset_hcr(vcpu); + + if ((!vcpu_is_protected(vcpu))) + return; + pvm_init_trap_regs(vcpu); pvm_init_traps_aa64pfr0(vcpu); pvm_init_traps_aa64pfr1(vcpu); From 93d7356e4b3044f5165f35a8beb6ef05b1a40b7a Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 25 Oct 2024 18:23:37 +0000 Subject: [PATCH 60/83] arm64: sysreg: Describe ID_AA64DFR2_EL1 fields Describe the new ID register in line with DDI0601 2024-09. Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241025182354.3364124-3-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/tools/sysreg | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 3c812fd28eca..98e710aaa620 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1287,6 +1287,32 @@ Field 15:8 BRPs Field 7:0 SYSPMUID EndSysreg +Sysreg ID_AA64DFR2_EL1 3 0 0 5 2 +Res0 63:28 +UnsignedEnum 27:24 TRBE_EXC + 0b0000 NI + 0b0001 IMP +EndEnum +UnsignedEnum 23:20 SPE_nVM + 0b0000 NI + 0b0001 IMP +EndEnum +UnsignedEnum 19:16 SPE_EXC + 0b0000 NI + 0b0001 IMP +EndEnum +Res0 15:8 +UnsignedEnum 7:4 BWE + 0b0000 NI + 0b0001 FEAT_BWE + 0b0002 FEAT_BWE2 +EndEnum +UnsignedEnum 3:0 STEP + 0b0000 NI + 0b0001 IMP +EndEnum +EndSysreg + Sysreg ID_AA64AFR0_EL1 3 0 0 5 4 Res0 63:32 Field 31:28 IMPDEF7 From 641630313e9c68b2a889b1aad684f29b9c3e4017 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 25 Oct 2024 18:23:38 +0000 Subject: [PATCH 61/83] arm64: sysreg: Migrate MDCR_EL2 definition to table Migrate MDCR_EL2 over to the sysreg table and align definitions with DDI0601 2024-09. Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241025182354.3364124-4-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_arm.h | 29 -------------------------- arch/arm64/tools/sysreg | 35 ++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 29 deletions(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 109a85ee6910..fb8d15f299a4 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -311,35 +311,6 @@ GENMASK(19, 18) | \ GENMASK(15, 0)) -/* Hyp Debug Configuration Register bits */ -#define MDCR_EL2_E2TB_MASK (UL(0x3)) -#define MDCR_EL2_E2TB_SHIFT (UL(24)) -#define MDCR_EL2_HPMFZS (UL(1) << 36) -#define MDCR_EL2_HPMFZO (UL(1) << 29) -#define MDCR_EL2_MTPME (UL(1) << 28) -#define MDCR_EL2_TDCC (UL(1) << 27) -#define MDCR_EL2_HLP (UL(1) << 26) -#define MDCR_EL2_HCCD (UL(1) << 23) -#define MDCR_EL2_TTRF (UL(1) << 19) -#define MDCR_EL2_HPMD (UL(1) << 17) -#define MDCR_EL2_TPMS (UL(1) << 14) -#define MDCR_EL2_E2PB_MASK (UL(0x3)) -#define MDCR_EL2_E2PB_SHIFT (UL(12)) -#define MDCR_EL2_TDRA (UL(1) << 11) -#define MDCR_EL2_TDOSA (UL(1) << 10) -#define MDCR_EL2_TDA (UL(1) << 9) -#define MDCR_EL2_TDE (UL(1) << 8) -#define MDCR_EL2_HPME (UL(1) << 7) -#define MDCR_EL2_TPM (UL(1) << 6) -#define MDCR_EL2_TPMCR (UL(1) << 5) -#define MDCR_EL2_HPMN_MASK (UL(0x1F)) -#define MDCR_EL2_RES0 (GENMASK(63, 37) | \ - GENMASK(35, 30) | \ - GENMASK(25, 24) | \ - GENMASK(22, 20) | \ - BIT(18) | \ - GENMASK(16, 15)) - /* * FGT register definitions * diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 98e710aaa620..c3117debdc2a 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2414,6 +2414,41 @@ Field 1 AFSR1_EL1 Field 0 AFSR0_EL1 EndSysregFields +Sysreg MDCR_EL2 3 4 1 1 1 +Res0 63:51 +Field 50 EnSTEPOP +Res0 49:44 +Field 43 EBWE +Res0 42 +Field 41:40 PMEE +Res0 39:37 +Field 36 HPMFZS +Res0 35:32 +Field 31:30 PMSSE +Field 29 HPMFZO +Field 28 MTPME +Field 27 TDCC +Field 26 HLP +Field 25:24 E2TB +Field 23 HCCD +Res0 22:20 +Field 19 TTRF +Res0 18 +Field 17 HPMD +Res0 16 +Field 15 EnSPM +Field 14 TPMS +Field 13:12 E2PB +Field 11 TDRA +Field 10 TDOSA +Field 9 TDA +Field 8 TDE +Field 7 HPME +Field 6 TPM +Field 5 TPMCR +Field 4:0 HPMN +EndSysreg + Sysreg HFGRTR_EL2 3 4 1 1 4 Fields HFGxTR_EL2 EndSysreg From 3ecb1fe3842c8783d8cd94a192aec4225f72b652 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 25 Oct 2024 18:23:39 +0000 Subject: [PATCH 62/83] arm64: sysreg: Add new definitions for ID_AA64DFR0_EL1 Align the field definitions w/ DDI0601 2024-09 and opportunistically declare MTPMU as a signed field. Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241025182354.3364124-5-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/tools/sysreg | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index c3117debdc2a..5b28bf157c38 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1200,7 +1200,7 @@ UnsignedEnum 55:52 BRBE 0b0001 IMP 0b0010 BRBE_V1P1 EndEnum -Enum 51:48 MTPMU +SignedEnum 51:48 MTPMU 0b0000 NI_IMPDEF 0b0001 IMP 0b1111 NI @@ -1208,6 +1208,7 @@ EndEnum UnsignedEnum 47:44 TraceBuffer 0b0000 NI 0b0001 IMP + 0b0010 TRBE_V1P1 EndEnum UnsignedEnum 43:40 TraceFilt 0b0000 NI @@ -1224,11 +1225,18 @@ UnsignedEnum 35:32 PMSVer 0b0011 V1P2 0b0100 V1P3 0b0101 V1P4 + 0b0110 V1P5 EndEnum Field 31:28 CTX_CMPs -Res0 27:24 +UnsignedEnum 27:24 SEBEP + 0b0000 NI + 0b0001 IMP +EndEnum Field 23:20 WRPs -Res0 19:16 +UnsignedEnum 19:16 PMSS + 0b0000 NI + 0b0001 IMP +EndEnum Field 15:12 BRPs UnsignedEnum 11:8 PMUVer 0b0000 NI @@ -1238,6 +1246,7 @@ UnsignedEnum 11:8 PMUVer 0b0110 V3P5 0b0111 V3P7 0b1000 V3P8 + 0b1001 V3P9 0b1111 IMP_DEF EndEnum UnsignedEnum 7:4 TraceVer From eb609638da5578c59fd28baf9825f1dd19b61d7a Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 25 Oct 2024 18:23:40 +0000 Subject: [PATCH 63/83] KVM: arm64: Describe RES0/RES1 bits of MDCR_EL2 Add support for sanitising MDCR_EL2 and describe the RES0/RES1 bits according to the feature set exposed to the VM. Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241025182354.3364124-6-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_host.h | 2 +- arch/arm64/kvm/nested.c | 37 +++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index c753ab31dcea..725b5cd18ee0 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -471,7 +471,6 @@ enum vcpu_sysreg { /* EL2 registers */ SCTLR_EL2, /* System Control Register (EL2) */ ACTLR_EL2, /* Auxiliary Control Register (EL2) */ - MDCR_EL2, /* Monitor Debug Configuration Register (EL2) */ CPTR_EL2, /* Architectural Feature Trap Register (EL2) */ HACR_EL2, /* Hypervisor Auxiliary Control Register */ ZCR_EL2, /* SVE Control Register (EL2) */ @@ -499,6 +498,7 @@ enum vcpu_sysreg { /* Anything from this can be RES0/RES1 sanitised */ MARKER(__SANITISED_REG_START__), + MDCR_EL2, /* Monitor Debug Configuration Register (EL2) */ /* Any VNCR-capable reg goes after this point */ MARKER(__VNCR_START__), diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 26103d6514bd..676e2de78fdd 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1211,6 +1211,43 @@ int kvm_init_nv_sysregs(struct kvm *kvm) res0 |= SCTLR_EL1_EPAN; set_sysreg_masks(kvm, SCTLR_EL1, res0, res1); + /* MDCR_EL2 */ + res0 = MDCR_EL2_RES0; + res1 = MDCR_EL2_RES1; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, IMP)) + res0 |= (MDCR_EL2_HPMN | MDCR_EL2_TPMCR | + MDCR_EL2_TPM | MDCR_EL2_HPME); + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, IMP)) + res0 |= MDCR_EL2_E2PB | MDCR_EL2_TPMS; + if (!kvm_has_feat(kvm, ID_AA64DFR1_EL1, SPMU, IMP)) + res0 |= MDCR_EL2_EnSPM; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, V3P1)) + res0 |= MDCR_EL2_HPMD; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceFilt, IMP)) + res0 |= MDCR_EL2_TTRF; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, V3P5)) + res0 |= MDCR_EL2_HCCD | MDCR_EL2_HLP; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceBuffer, IMP)) + res0 |= MDCR_EL2_E2TB; + if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, FGT, IMP)) + res0 |= MDCR_EL2_TDCC; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, MTPMU, IMP) || + kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL3, IMP)) + res0 |= MDCR_EL2_MTPME; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, V3P7)) + res0 |= MDCR_EL2_HPMFZO; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSS, IMP)) + res0 |= MDCR_EL2_PMSSE; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, V1P2)) + res0 |= MDCR_EL2_HPMFZS; + if (!kvm_has_feat(kvm, ID_AA64DFR1_EL1, EBEP, IMP)) + res0 |= MDCR_EL2_PMEE; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DebugVer, V8P9)) + res0 |= MDCR_EL2_EBWE; + if (!kvm_has_feat(kvm, ID_AA64DFR2_EL1, STEP, IMP)) + res0 |= MDCR_EL2_EnSTEPOP; + set_sysreg_masks(kvm, MDCR_EL2, res0, res1); + return 0; } From 18aeeeb57b93f5038ad9b1bac2102640e786b1d1 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 25 Oct 2024 18:23:41 +0000 Subject: [PATCH 64/83] KVM: arm64: nv: Allow coarse-grained trap combos to use complex traps KVM uses a sanity-check to avoid infinite recursion in trap combinations that could potentially depend on itself. Narrow the scope of this sanity check to the exact CGT IDs that correspond w/ trap combos, opening the door to using 'complex' traps as part of a combination. Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241025182354.3364124-7-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/emulate-nested.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 05b6435d02a9..da7ab14e036d 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -2021,7 +2021,8 @@ check_mcb: cgids = coarse_control_combo[id - __MULTIPLE_CONTROL_BITS__]; for (int i = 0; cgids[i] != __RESERVED__; i++) { - if (cgids[i] >= __MULTIPLE_CONTROL_BITS__) { + if (cgids[i] >= __MULTIPLE_CONTROL_BITS__ && + cgids[i] < __COMPLEX_CONDITIONS__) { kvm_err("Recursive MCB %d/%d\n", id, cgids[i]); ret = -EINVAL; } From a4063b5aa0bd7abc31c8044d897cff606cc8b74b Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 25 Oct 2024 18:23:42 +0000 Subject: [PATCH 65/83] KVM: arm64: nv: Rename BEHAVE_FORWARD_ANY BEHAVE_FORWARD_ANY is slightly ambiguous, especially since we're about to cram some more information into the enum. Rephrase it. Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241025182354.3364124-8-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/emulate-nested.c | 93 +++++++++++++++++---------------- 1 file changed, 47 insertions(+), 46 deletions(-) diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index da7ab14e036d..e1a30d1bcd06 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -16,9 +16,10 @@ enum trap_behaviour { BEHAVE_HANDLE_LOCALLY = 0, + BEHAVE_FORWARD_READ = BIT(0), BEHAVE_FORWARD_WRITE = BIT(1), - BEHAVE_FORWARD_ANY = BEHAVE_FORWARD_READ | BEHAVE_FORWARD_WRITE, + BEHAVE_FORWARD_RW = BEHAVE_FORWARD_READ | BEHAVE_FORWARD_WRITE, }; struct trap_bits { @@ -138,7 +139,7 @@ static const struct trap_bits coarse_trap_bits[] = { .index = HCR_EL2, .value = HCR_TID2, .mask = HCR_TID2, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TID3] = { .index = HCR_EL2, @@ -162,37 +163,37 @@ static const struct trap_bits coarse_trap_bits[] = { .index = HCR_EL2, .value = HCR_TIDCP, .mask = HCR_TIDCP, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TACR] = { .index = HCR_EL2, .value = HCR_TACR, .mask = HCR_TACR, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TSW] = { .index = HCR_EL2, .value = HCR_TSW, .mask = HCR_TSW, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TPC] = { /* Also called TCPC when FEAT_DPB is implemented */ .index = HCR_EL2, .value = HCR_TPC, .mask = HCR_TPC, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TPU] = { .index = HCR_EL2, .value = HCR_TPU, .mask = HCR_TPU, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TTLB] = { .index = HCR_EL2, .value = HCR_TTLB, .mask = HCR_TTLB, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TVM] = { .index = HCR_EL2, @@ -204,7 +205,7 @@ static const struct trap_bits coarse_trap_bits[] = { .index = HCR_EL2, .value = HCR_TDZ, .mask = HCR_TDZ, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TRVM] = { .index = HCR_EL2, @@ -216,205 +217,205 @@ static const struct trap_bits coarse_trap_bits[] = { .index = HCR_EL2, .value = HCR_TLOR, .mask = HCR_TLOR, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TERR] = { .index = HCR_EL2, .value = HCR_TERR, .mask = HCR_TERR, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_APK] = { .index = HCR_EL2, .value = 0, .mask = HCR_APK, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_NV] = { .index = HCR_EL2, .value = HCR_NV, .mask = HCR_NV, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_NV_nNV2] = { .index = HCR_EL2, .value = HCR_NV, .mask = HCR_NV | HCR_NV2, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_NV1_nNV2] = { .index = HCR_EL2, .value = HCR_NV | HCR_NV1, .mask = HCR_NV | HCR_NV1 | HCR_NV2, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_AT] = { .index = HCR_EL2, .value = HCR_AT, .mask = HCR_AT, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_nFIEN] = { .index = HCR_EL2, .value = 0, .mask = HCR_FIEN, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TID4] = { .index = HCR_EL2, .value = HCR_TID4, .mask = HCR_TID4, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TICAB] = { .index = HCR_EL2, .value = HCR_TICAB, .mask = HCR_TICAB, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TOCU] = { .index = HCR_EL2, .value = HCR_TOCU, .mask = HCR_TOCU, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_ENSCXT] = { .index = HCR_EL2, .value = 0, .mask = HCR_ENSCXT, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TTLBIS] = { .index = HCR_EL2, .value = HCR_TTLBIS, .mask = HCR_TTLBIS, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TTLBOS] = { .index = HCR_EL2, .value = HCR_TTLBOS, .mask = HCR_TTLBOS, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TPMCR] = { .index = MDCR_EL2, .value = MDCR_EL2_TPMCR, .mask = MDCR_EL2_TPMCR, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TPM] = { .index = MDCR_EL2, .value = MDCR_EL2_TPM, .mask = MDCR_EL2_TPM, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TDE] = { .index = MDCR_EL2, .value = MDCR_EL2_TDE, .mask = MDCR_EL2_TDE, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TDA] = { .index = MDCR_EL2, .value = MDCR_EL2_TDA, .mask = MDCR_EL2_TDA, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TDOSA] = { .index = MDCR_EL2, .value = MDCR_EL2_TDOSA, .mask = MDCR_EL2_TDOSA, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TDRA] = { .index = MDCR_EL2, .value = MDCR_EL2_TDRA, .mask = MDCR_EL2_TDRA, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_E2PB] = { .index = MDCR_EL2, .value = 0, .mask = BIT(MDCR_EL2_E2PB_SHIFT), - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TPMS] = { .index = MDCR_EL2, .value = MDCR_EL2_TPMS, .mask = MDCR_EL2_TPMS, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TTRF] = { .index = MDCR_EL2, .value = MDCR_EL2_TTRF, .mask = MDCR_EL2_TTRF, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_E2TB] = { .index = MDCR_EL2, .value = 0, .mask = BIT(MDCR_EL2_E2TB_SHIFT), - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TDCC] = { .index = MDCR_EL2, .value = MDCR_EL2_TDCC, .mask = MDCR_EL2_TDCC, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_CPACR_E0POE] = { .index = CPTR_EL2, .value = CPACR_ELx_E0POE, .mask = CPACR_ELx_E0POE, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_CPTR_TAM] = { .index = CPTR_EL2, .value = CPTR_EL2_TAM, .mask = CPTR_EL2_TAM, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_CPTR_TCPAC] = { .index = CPTR_EL2, .value = CPTR_EL2_TCPAC, .mask = CPTR_EL2_TCPAC, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCRX_EnFPM] = { .index = HCRX_EL2, .value = 0, .mask = HCRX_EL2_EnFPM, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCRX_TCR2En] = { .index = HCRX_EL2, .value = 0, .mask = HCRX_EL2_TCR2En, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_ICH_HCR_TC] = { .index = ICH_HCR_EL2, .value = ICH_HCR_TC, .mask = ICH_HCR_TC, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_ICH_HCR_TALL0] = { .index = ICH_HCR_EL2, .value = ICH_HCR_TALL0, .mask = ICH_HCR_TALL0, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_ICH_HCR_TALL1] = { .index = ICH_HCR_EL2, .value = ICH_HCR_TALL1, .mask = ICH_HCR_TALL1, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_ICH_HCR_TDIR] = { .index = ICH_HCR_EL2, .value = ICH_HCR_TDIR, .mask = ICH_HCR_TDIR, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, }; @@ -474,7 +475,7 @@ static enum trap_behaviour check_cnthctl_el1pcten(struct kvm_vcpu *vcpu) if (get_sanitized_cnthctl(vcpu) & (CNTHCTL_EL1PCTEN << 10)) return BEHAVE_HANDLE_LOCALLY; - return BEHAVE_FORWARD_ANY; + return BEHAVE_FORWARD_RW; } static enum trap_behaviour check_cnthctl_el1pten(struct kvm_vcpu *vcpu) @@ -482,7 +483,7 @@ static enum trap_behaviour check_cnthctl_el1pten(struct kvm_vcpu *vcpu) if (get_sanitized_cnthctl(vcpu) & (CNTHCTL_EL1PCEN << 10)) return BEHAVE_HANDLE_LOCALLY; - return BEHAVE_FORWARD_ANY; + return BEHAVE_FORWARD_RW; } static enum trap_behaviour check_cptr_tta(struct kvm_vcpu *vcpu) @@ -493,7 +494,7 @@ static enum trap_behaviour check_cptr_tta(struct kvm_vcpu *vcpu) val = translate_cptr_el2_to_cpacr_el1(val); if (val & CPACR_ELx_TTA) - return BEHAVE_FORWARD_ANY; + return BEHAVE_FORWARD_RW; return BEHAVE_HANDLE_LOCALLY; } From d97e66fbcba796bb986e2097c352afae896b7942 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 25 Oct 2024 18:23:43 +0000 Subject: [PATCH 66/83] KVM: arm64: nv: Reinject traps that take effect in Host EL0 Wire up the other end of traps that affect host EL0 by actually injecting them into the guest hypervisor. Skip over FGT entirely, as a cursory glance suggests no FGT is effective in host EL0. Note that kvm_inject_nested() is already equipped for handling exceptions while the VM is already in a host context. Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241025182354.3364124-9-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_emulate.h | 5 +++++ arch/arm64/kvm/emulate-nested.c | 29 ++++++++++++++++++++++++---- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index a601a9305b10..bf0c48403f59 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -225,6 +225,11 @@ static inline bool is_hyp_ctxt(const struct kvm_vcpu *vcpu) return vcpu_has_nv(vcpu) && __is_hyp_ctxt(&vcpu->arch.ctxt); } +static inline bool vcpu_is_host_el0(const struct kvm_vcpu *vcpu) +{ + return is_hyp_ctxt(vcpu) && !vcpu_is_el2(vcpu); +} + /* * The layout of SPSR for an AArch32 state is different when observed from an * AArch64 SPSR_ELx or an AArch32 SPSR_*. This function generates the AArch32 diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index e1a30d1bcd06..b072098ee44e 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -20,6 +20,9 @@ enum trap_behaviour { BEHAVE_FORWARD_READ = BIT(0), BEHAVE_FORWARD_WRITE = BIT(1), BEHAVE_FORWARD_RW = BEHAVE_FORWARD_READ | BEHAVE_FORWARD_WRITE, + + /* Traps that take effect in Host EL0, this is rare! */ + BEHAVE_FORWARD_IN_HOST_EL0 = BIT(2), }; struct trap_bits { @@ -2128,11 +2131,19 @@ static u64 kvm_get_sysreg_res0(struct kvm *kvm, enum vcpu_sysreg sr) return masks->mask[sr - __VNCR_START__].res0; } -static bool check_fgt_bit(struct kvm *kvm, bool is_read, +static bool check_fgt_bit(struct kvm_vcpu *vcpu, bool is_read, u64 val, const union trap_config tc) { + struct kvm *kvm = vcpu->kvm; enum vcpu_sysreg sr; + /* + * KVM doesn't know about any FGTs that apply to the host, and hopefully + * that'll remain the case. + */ + if (is_hyp_ctxt(vcpu)) + return false; + if (tc.pol) return (val & BIT(tc.bit)); @@ -2209,7 +2220,15 @@ bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index) * If we're not nesting, immediately return to the caller, with the * sysreg index, should we have it. */ - if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu)) + if (!vcpu_has_nv(vcpu)) + goto local; + + /* + * There are a few traps that take effect InHost, but are constrained + * to EL0. Don't bother with computing the trap behaviour if the vCPU + * isn't in EL0. + */ + if (is_hyp_ctxt(vcpu) && !vcpu_is_host_el0(vcpu)) goto local; switch ((enum fgt_group_id)tc.fgt) { @@ -2255,12 +2274,14 @@ bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index) goto local; } - if (tc.fgt != __NO_FGT_GROUP__ && check_fgt_bit(vcpu->kvm, is_read, - val, tc)) + if (tc.fgt != __NO_FGT_GROUP__ && check_fgt_bit(vcpu, is_read, val, tc)) goto inject; b = compute_trap_behaviour(vcpu, tc); + if (!(b & BEHAVE_FORWARD_IN_HOST_EL0) && vcpu_is_host_el0(vcpu)) + goto local; + if (((b & BEHAVE_FORWARD_READ) && is_read) || ((b & BEHAVE_FORWARD_WRITE) && !is_read)) goto inject; From 4ee5d5ff4b4dd0e08d0424aeec62f25d1d66bb04 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 25 Oct 2024 18:23:44 +0000 Subject: [PATCH 67/83] KVM: arm64: nv: Honor MDCR_EL2.{TPM, TPMCR} in Host EL0 TPM and TPMCR trap bits also affect Host EL0. How fun. Mark these two trap bits as such and take advantage of the new infrastructure for dealing w/ EL0 traps. Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241025182354.3364124-10-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/emulate-nested.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index b072098ee44e..4f14fa47df8e 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -304,13 +304,15 @@ static const struct trap_bits coarse_trap_bits[] = { .index = MDCR_EL2, .value = MDCR_EL2_TPMCR, .mask = MDCR_EL2_TPMCR, - .behaviour = BEHAVE_FORWARD_RW, + .behaviour = BEHAVE_FORWARD_RW | + BEHAVE_FORWARD_IN_HOST_EL0, }, [CGT_MDCR_TPM] = { .index = MDCR_EL2, .value = MDCR_EL2_TPM, .mask = MDCR_EL2_TPM, - .behaviour = BEHAVE_FORWARD_RW, + .behaviour = BEHAVE_FORWARD_RW | + BEHAVE_FORWARD_IN_HOST_EL0, }, [CGT_MDCR_TDE] = { .index = MDCR_EL2, From 336afe0c832d6eb985d0e9dbc5a70929594e58d9 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 25 Oct 2024 18:23:45 +0000 Subject: [PATCH 68/83] KVM: arm64: nv: Describe trap behaviour of MDCR_EL2.HPMN MDCR_EL2.HPMN splits the PMU event counters into two ranges: the first range is accessible from all ELs, and the second range is accessible only to EL2/3. Supposing the guest hypervisor allows direct access to the PMU counters from the L2, KVM needs to locally handle those accesses. Add a new complex trap configuration for HPMN that checks if the counter index is accessible to the current context. As written, the architecture suggests HPMN only causes PMEVCNTR_EL0 to trap, though intuition (and the pseudocode) suggest that the trap applies to PMEVTYPER_EL0 as well. Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241025182354.3364124-11-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/emulate-nested.c | 160 +++++++++++++++++++------------- arch/arm64/kvm/pmu-emul.c | 18 ++++ include/kvm/arm_pmu.h | 6 ++ 3 files changed, 120 insertions(+), 64 deletions(-) diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 4f14fa47df8e..897ea81ed2b2 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -110,6 +110,7 @@ enum cgt_group_id { CGT_HCR_TPU_TOCU, CGT_HCR_NV1_nNV2_ENSCXT, CGT_MDCR_TPM_TPMCR, + CGT_MDCR_TPM_HPMN, CGT_MDCR_TDE_TDA, CGT_MDCR_TDE_TDOSA, CGT_MDCR_TDE_TDRA, @@ -126,6 +127,7 @@ enum cgt_group_id { CGT_CNTHCTL_EL1PTEN, CGT_CPTR_TTA, + CGT_MDCR_HPMN, /* Must be last */ __NR_CGT_GROUP_IDS__ @@ -441,6 +443,7 @@ static const enum cgt_group_id *coarse_control_combo[] = { MCB(CGT_HCR_TPU_TOCU, CGT_HCR_TPU, CGT_HCR_TOCU), MCB(CGT_HCR_NV1_nNV2_ENSCXT, CGT_HCR_NV1_nNV2, CGT_HCR_ENSCXT), MCB(CGT_MDCR_TPM_TPMCR, CGT_MDCR_TPM, CGT_MDCR_TPMCR), + MCB(CGT_MDCR_TPM_HPMN, CGT_MDCR_TPM, CGT_MDCR_HPMN), MCB(CGT_MDCR_TDE_TDA, CGT_MDCR_TDE, CGT_MDCR_TDA), MCB(CGT_MDCR_TDE_TDOSA, CGT_MDCR_TDE, CGT_MDCR_TDOSA), MCB(CGT_MDCR_TDE_TDRA, CGT_MDCR_TDE, CGT_MDCR_TDRA), @@ -504,6 +507,34 @@ static enum trap_behaviour check_cptr_tta(struct kvm_vcpu *vcpu) return BEHAVE_HANDLE_LOCALLY; } +static enum trap_behaviour check_mdcr_hpmn(struct kvm_vcpu *vcpu) +{ + u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); + unsigned int idx; + + + switch (sysreg) { + case SYS_PMEVTYPERn_EL0(0) ... SYS_PMEVTYPERn_EL0(30): + case SYS_PMEVCNTRn_EL0(0) ... SYS_PMEVCNTRn_EL0(30): + idx = (sys_reg_CRm(sysreg) & 0x3) << 3 | sys_reg_Op2(sysreg); + break; + case SYS_PMXEVTYPER_EL0: + case SYS_PMXEVCNTR_EL0: + idx = SYS_FIELD_GET(PMSELR_EL0, SEL, + __vcpu_sys_reg(vcpu, PMSELR_EL0)); + break; + default: + /* Someone used this trap helper for something else... */ + KVM_BUG_ON(1, vcpu->kvm); + return BEHAVE_HANDLE_LOCALLY; + } + + if (kvm_pmu_counter_is_hyp(vcpu, idx)) + return BEHAVE_FORWARD_RW | BEHAVE_FORWARD_IN_HOST_EL0; + + return BEHAVE_HANDLE_LOCALLY; +} + #define CCC(id, fn) \ [id - __COMPLEX_CONDITIONS__] = fn @@ -511,6 +542,7 @@ static const complex_condition_check ccc[] = { CCC(CGT_CNTHCTL_EL1PCTEN, check_cnthctl_el1pcten), CCC(CGT_CNTHCTL_EL1PTEN, check_cnthctl_el1pten), CCC(CGT_CPTR_TTA, check_cptr_tta), + CCC(CGT_MDCR_HPMN, check_mdcr_hpmn), }; /* @@ -925,77 +957,77 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(SYS_PMOVSCLR_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMCEID0_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMCEID1_EL0, CGT_MDCR_TPM), - SR_TRAP(SYS_PMXEVTYPER_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_PMXEVTYPER_EL0, CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMSWINC_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMSELR_EL0, CGT_MDCR_TPM), - SR_TRAP(SYS_PMXEVCNTR_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_PMXEVCNTR_EL0, CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMCCNTR_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMUSERENR_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMINTENSET_EL1, CGT_MDCR_TPM), SR_TRAP(SYS_PMINTENCLR_EL1, CGT_MDCR_TPM), SR_TRAP(SYS_PMMIR_EL1, CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(0), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(1), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(2), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(3), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(4), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(5), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(6), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(7), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(8), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(9), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(10), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(11), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(12), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(13), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(14), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(15), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(16), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(17), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(18), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(19), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(20), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(21), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(22), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(23), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(24), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(25), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(26), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(27), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(28), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(29), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(30), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(0), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(1), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(2), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(3), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(4), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(5), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(6), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(7), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(8), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(9), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(10), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(11), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(12), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(13), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(14), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(15), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(16), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(17), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(18), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(19), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(20), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(21), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(22), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(23), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(24), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(25), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(26), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(27), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(28), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(29), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(30), CGT_MDCR_TPM), + SR_TRAP(SYS_PMEVCNTRn_EL0(0), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(1), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(2), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(3), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(4), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(5), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(6), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(7), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(8), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(9), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(10), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(11), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(12), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(13), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(14), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(15), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(16), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(17), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(18), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(19), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(20), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(21), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(22), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(23), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(24), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(25), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(26), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(27), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(28), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(29), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(30), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(0), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(1), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(2), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(3), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(4), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(5), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(6), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(7), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(8), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(9), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(10), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(11), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(12), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(13), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(14), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(15), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(16), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(17), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(18), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(19), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(20), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(21), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(22), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(23), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(24), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(25), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(26), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(27), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(28), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(29), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(30), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMCCFILTR_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_MDCCSR_EL0, CGT_MDCR_TDCC_TDE_TDA), SR_TRAP(SYS_MDCCINT_EL1, CGT_MDCR_TDCC_TDE_TDA), diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index ac36c438b8c1..28f938f145ac 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -265,6 +265,24 @@ void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) irq_work_sync(&vcpu->arch.pmu.overflow_work); } +bool kvm_pmu_counter_is_hyp(struct kvm_vcpu *vcpu, unsigned int idx) +{ + unsigned int hpmn; + + if (!vcpu_has_nv(vcpu) || idx == ARMV8_PMU_CYCLE_IDX) + return false; + + /* + * Programming HPMN=0 is CONSTRAINED UNPREDICTABLE if FEAT_HPMN0 isn't + * implemented. Since KVM's ability to emulate HPMN=0 does not directly + * depend on hardware (all PMU registers are trapped), make the + * implementation choice that all counters are included in the second + * range reserved for EL2/EL3. + */ + hpmn = SYS_FIELD_GET(MDCR_EL2, HPMN, __vcpu_sys_reg(vcpu, MDCR_EL2)); + return idx >= hpmn; +} + u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu) { u64 val = FIELD_GET(ARMV8_PMU_PMCR_N, kvm_vcpu_read_pmcr(vcpu)); diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index e08aeec5d936..feb5d1d35f0f 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -96,6 +96,7 @@ int kvm_arm_set_default_pmu(struct kvm *kvm); u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm); u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu); +bool kvm_pmu_counter_is_hyp(struct kvm_vcpu *vcpu, unsigned int idx); #else struct kvm_pmu { }; @@ -187,6 +188,11 @@ static inline u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu) return 0; } +static inline bool kvm_pmu_counter_is_hyp(struct kvm_vcpu *vcpu, unsigned int idx) +{ + return false; +} + #endif #endif From 166b77a2f42341c22fe6cda504a1afa3f672f920 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 25 Oct 2024 18:23:46 +0000 Subject: [PATCH 69/83] KVM: arm64: nv: Advertise support for FEAT_HPMN0 Everything is in place now for KVM to actually handle MDCR_EL2.HPMN. Not only that, the emulation is capable of doing FEAT_HPMN0. Advertise support for the feature in the VM's ID registers. It is possible to emulate FEAT_HPMN0 on hardware that doesn't support it since KVM currently traps all PMU registers. Having said that, let's only advertise the feature on supporting hardware in case KVM ever provides 'direct' PMU support to VMs w/o involving host perf. Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241025182354.3364124-12-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/nested.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 676e2de78fdd..2cbd8ac19d83 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -917,12 +917,13 @@ static void limit_nv_id_regs(struct kvm *kvm) ID_AA64MMFR4_EL1_E2H0_NI_NV1); kvm_set_vm_id_reg(kvm, SYS_ID_AA64MMFR4_EL1, val); - /* Only limited support for PMU, Debug, BPs and WPs */ + /* Only limited support for PMU, Debug, BPs, WPs, and HPMN0 */ val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64DFR0_EL1); val &= (NV_FTR(DFR0, PMUVer) | NV_FTR(DFR0, WRPs) | NV_FTR(DFR0, BRPs) | - NV_FTR(DFR0, DebugVer)); + NV_FTR(DFR0, DebugVer) | + NV_FTR(DFR0, HPMN0)); /* Cap Debug to ARMv8.1 */ tmp = FIELD_GET(NV_FTR(DFR0, DebugVer), val); From a3034dab74fc12d6c0a589e31af9fafc436a4a0e Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 25 Oct 2024 18:23:47 +0000 Subject: [PATCH 70/83] KVM: arm64: Rename kvm_pmu_valid_counter_mask() Nested PMU support requires dynamically changing the visible range of PMU counters based on the exception level and value of MDCR_EL2.HPMN. At the same time, the PMU emulation code needs to know the absolute number of implemented counters, regardless of context. Rename the existing helper to make it obvious that it returns the number of implemented counters and not anything else. Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241025182354.3364124-13-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/pmu-emul.c | 8 ++++---- arch/arm64/kvm/sys_regs.c | 12 ++++++------ include/kvm/arm_pmu.h | 4 ++-- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 28f938f145ac..fd08c4b53be3 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -244,7 +244,7 @@ void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) */ void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) { - unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); + unsigned long mask = kvm_pmu_implemented_counter_mask(vcpu); int i; for_each_set_bit(i, &mask, 32) @@ -283,7 +283,7 @@ bool kvm_pmu_counter_is_hyp(struct kvm_vcpu *vcpu, unsigned int idx) return idx >= hpmn; } -u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu) +u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu) { u64 val = FIELD_GET(ARMV8_PMU_PMCR_N, kvm_vcpu_read_pmcr(vcpu)); @@ -592,7 +592,7 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0); if (val & ARMV8_PMU_PMCR_P) { - unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); + unsigned long mask = kvm_pmu_implemented_counter_mask(vcpu); mask &= ~BIT(ARMV8_PMU_CYCLE_IDX); for_each_set_bit(i, &mask, 32) kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, i), 0, true); @@ -822,7 +822,7 @@ u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) void kvm_vcpu_reload_pmu(struct kvm_vcpu *vcpu) { - u64 mask = kvm_pmu_valid_counter_mask(vcpu); + u64 mask = kvm_pmu_implemented_counter_mask(vcpu); kvm_pmu_handle_pmcr(vcpu, kvm_vcpu_read_pmcr(vcpu)); diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 9115a75b2c61..03a64b72e45d 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1168,7 +1168,7 @@ static int set_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 va { bool set; - val &= kvm_pmu_valid_counter_mask(vcpu); + val &= kvm_pmu_implemented_counter_mask(vcpu); switch (r->reg) { case PMOVSSET_EL0: @@ -1191,7 +1191,7 @@ static int set_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 va static int get_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 *val) { - u64 mask = kvm_pmu_valid_counter_mask(vcpu); + u64 mask = kvm_pmu_implemented_counter_mask(vcpu); *val = __vcpu_sys_reg(vcpu, r->reg) & mask; return 0; @@ -1205,7 +1205,7 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (pmu_access_el0_disabled(vcpu)) return false; - mask = kvm_pmu_valid_counter_mask(vcpu); + mask = kvm_pmu_implemented_counter_mask(vcpu); if (p->is_write) { val = p->regval & mask; if (r->Op2 & 0x1) { @@ -1228,7 +1228,7 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - u64 mask = kvm_pmu_valid_counter_mask(vcpu); + u64 mask = kvm_pmu_implemented_counter_mask(vcpu); if (check_pmu_access_disabled(vcpu, 0)) return false; @@ -1252,7 +1252,7 @@ static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - u64 mask = kvm_pmu_valid_counter_mask(vcpu); + u64 mask = kvm_pmu_implemented_counter_mask(vcpu); if (pmu_access_el0_disabled(vcpu)) return false; @@ -1282,7 +1282,7 @@ static bool access_pmswinc(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (pmu_write_swinc_el0_disabled(vcpu)) return false; - mask = kvm_pmu_valid_counter_mask(vcpu); + mask = kvm_pmu_implemented_counter_mask(vcpu); kvm_pmu_software_increment(vcpu, p->regval & mask); return true; } diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index feb5d1d35f0f..019ace2bb914 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -47,7 +47,7 @@ static __always_inline bool kvm_arm_support_pmu_v3(void) #define kvm_arm_pmu_irq_initialized(v) ((v)->arch.pmu.irq_num >= VGIC_NR_SGIS) u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx); void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val); -u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu); +u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu); u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1); void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu); void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu); @@ -114,7 +114,7 @@ static inline u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, } static inline void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) {} -static inline u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu) +static inline u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu) { return 0; } From 9a1c58cfefb06974a804174f127de3fedc779394 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 25 Oct 2024 18:23:48 +0000 Subject: [PATCH 71/83] KVM: arm64: nv: Adjust range of accessible PMCs according to HPMN The value of MDCR_EL2.HPMN controls the number of event counters made visible to EL0 and EL1. This means it is possible for the guest hypervisor to allow direct access to event counters to the L2. Rework KVM's PMU register emulation to take the effects of HPMN into account when handling a trap. For bitmask-style registers, writes only affect accessible registers. Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241025182354.3364124-14-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/pmu-emul.c | 14 +++++++++++++- arch/arm64/kvm/sys_regs.c | 12 ++++++------ include/kvm/arm_pmu.h | 5 +++++ 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index fd08c4b53be3..0d669fb84485 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -283,6 +283,18 @@ bool kvm_pmu_counter_is_hyp(struct kvm_vcpu *vcpu, unsigned int idx) return idx >= hpmn; } +u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu) +{ + u64 mask = kvm_pmu_implemented_counter_mask(vcpu); + u64 hpmn; + + if (!vcpu_has_nv(vcpu) || vcpu_is_el2(vcpu)) + return mask; + + hpmn = SYS_FIELD_GET(MDCR_EL2, HPMN, __vcpu_sys_reg(vcpu, MDCR_EL2)); + return mask & ~GENMASK(vcpu->kvm->arch.pmcr_n - 1, hpmn); +} + u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu) { u64 val = FIELD_GET(ARMV8_PMU_PMCR_N, kvm_vcpu_read_pmcr(vcpu)); @@ -592,7 +604,7 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0); if (val & ARMV8_PMU_PMCR_P) { - unsigned long mask = kvm_pmu_implemented_counter_mask(vcpu); + unsigned long mask = kvm_pmu_accessible_counter_mask(vcpu); mask &= ~BIT(ARMV8_PMU_CYCLE_IDX); for_each_set_bit(i, &mask, 32) kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, i), 0, true); diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 03a64b72e45d..2bd4c32c3d6d 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1168,7 +1168,7 @@ static int set_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 va { bool set; - val &= kvm_pmu_implemented_counter_mask(vcpu); + val &= kvm_pmu_accessible_counter_mask(vcpu); switch (r->reg) { case PMOVSSET_EL0: @@ -1191,7 +1191,7 @@ static int set_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 va static int get_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 *val) { - u64 mask = kvm_pmu_implemented_counter_mask(vcpu); + u64 mask = kvm_pmu_accessible_counter_mask(vcpu); *val = __vcpu_sys_reg(vcpu, r->reg) & mask; return 0; @@ -1205,7 +1205,7 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (pmu_access_el0_disabled(vcpu)) return false; - mask = kvm_pmu_implemented_counter_mask(vcpu); + mask = kvm_pmu_accessible_counter_mask(vcpu); if (p->is_write) { val = p->regval & mask; if (r->Op2 & 0x1) { @@ -1228,7 +1228,7 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - u64 mask = kvm_pmu_implemented_counter_mask(vcpu); + u64 mask = kvm_pmu_accessible_counter_mask(vcpu); if (check_pmu_access_disabled(vcpu, 0)) return false; @@ -1252,7 +1252,7 @@ static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - u64 mask = kvm_pmu_implemented_counter_mask(vcpu); + u64 mask = kvm_pmu_accessible_counter_mask(vcpu); if (pmu_access_el0_disabled(vcpu)) return false; @@ -1282,7 +1282,7 @@ static bool access_pmswinc(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (pmu_write_swinc_el0_disabled(vcpu)) return false; - mask = kvm_pmu_implemented_counter_mask(vcpu); + mask = kvm_pmu_accessible_counter_mask(vcpu); kvm_pmu_software_increment(vcpu, p->regval & mask); return true; } diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 019ace2bb914..76244f0bd47a 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -48,6 +48,7 @@ static __always_inline bool kvm_arm_support_pmu_v3(void) u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx); void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val); u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu); +u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu); u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1); void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu); void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu); @@ -118,6 +119,10 @@ static inline u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu) { return 0; } +static inline u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu) +{ + return 0; +} static inline void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) {} From 9d15f8290a228ccd74a6ca8c082df350009e9e06 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 25 Oct 2024 18:23:49 +0000 Subject: [PATCH 72/83] KVM: arm64: Add helpers to determine if PMC counts at a given EL Checking the exception level filters for a PMC is a minor annoyance to open code. Add helpers to check if an event counts at EL0 and EL1, which will prove useful in a subsequent change. Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241025182354.3364124-15-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/pmu-emul.c | 40 +++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 0d669fb84485..03cd1ad7a55a 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -111,6 +111,11 @@ static u32 counter_index_to_evtreg(u64 idx) return (idx == ARMV8_PMU_CYCLE_IDX) ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + idx; } +static u64 kvm_pmc_read_evtreg(const struct kvm_pmc *pmc) +{ + return __vcpu_sys_reg(kvm_pmc_to_vcpu(pmc), counter_index_to_evtreg(pmc->idx)); +} + static u64 kvm_pmu_get_pmc_value(struct kvm_pmc *pmc) { struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); @@ -619,6 +624,24 @@ static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc) (__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(pmc->idx)); } +static bool kvm_pmc_counts_at_el0(struct kvm_pmc *pmc) +{ + u64 evtreg = kvm_pmc_read_evtreg(pmc); + bool nsu = evtreg & ARMV8_PMU_EXCLUDE_NS_EL0; + bool u = evtreg & ARMV8_PMU_EXCLUDE_EL0; + + return u == nsu; +} + +static bool kvm_pmc_counts_at_el1(struct kvm_pmc *pmc) +{ + u64 evtreg = kvm_pmc_read_evtreg(pmc); + bool nsk = evtreg & ARMV8_PMU_EXCLUDE_NS_EL1; + bool p = evtreg & ARMV8_PMU_EXCLUDE_EL1; + + return p == nsk; +} + /** * kvm_pmu_create_perf_event - create a perf event for a counter * @pmc: Counter context @@ -629,17 +652,15 @@ static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc) struct arm_pmu *arm_pmu = vcpu->kvm->arch.arm_pmu; struct perf_event *event; struct perf_event_attr attr; - u64 eventsel, reg, data; - bool p, u, nsk, nsu; + u64 eventsel, evtreg; - reg = counter_index_to_evtreg(pmc->idx); - data = __vcpu_sys_reg(vcpu, reg); + evtreg = kvm_pmc_read_evtreg(pmc); kvm_pmu_stop_counter(pmc); if (pmc->idx == ARMV8_PMU_CYCLE_IDX) eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES; else - eventsel = data & kvm_pmu_event_mask(vcpu->kvm); + eventsel = evtreg & kvm_pmu_event_mask(vcpu->kvm); /* * Neither SW increment nor chained events need to be backed @@ -657,18 +678,13 @@ static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc) !test_bit(eventsel, vcpu->kvm->arch.pmu_filter)) return; - p = data & ARMV8_PMU_EXCLUDE_EL1; - u = data & ARMV8_PMU_EXCLUDE_EL0; - nsk = data & ARMV8_PMU_EXCLUDE_NS_EL1; - nsu = data & ARMV8_PMU_EXCLUDE_NS_EL0; - memset(&attr, 0, sizeof(struct perf_event_attr)); attr.type = arm_pmu->pmu.type; attr.size = sizeof(attr); attr.pinned = 1; attr.disabled = !kvm_pmu_counter_is_enabled(pmc); - attr.exclude_user = (u != nsu); - attr.exclude_kernel = (p != nsk); + attr.exclude_user = !kvm_pmc_counts_at_el0(pmc); + attr.exclude_kernel = !kvm_pmc_counts_at_el1(pmc); attr.exclude_hv = 1; /* Don't count EL2 events */ attr.exclude_host = 1; /* Don't count host events */ attr.config = eventsel; From fe827f9166622dbe384605b78092c285d5e92b76 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 25 Oct 2024 18:23:50 +0000 Subject: [PATCH 73/83] KVM: arm64: nv: Honor MDCR_EL2.HPME When the PMU is configured with split counter ranges, HPME becomes the enable bit for the counters reserved for EL2. Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241025182354.3364124-16-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/pmu-emul.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 03cd1ad7a55a..349886f03fd5 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -620,8 +620,15 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc) { struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); - return (kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E) && - (__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(pmc->idx)); + unsigned int mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2); + + if (!(__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(pmc->idx))) + return false; + + if (kvm_pmu_counter_is_hyp(vcpu, pmc->idx)) + return mdcr & MDCR_EL2_HPME; + + return kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E; } static bool kvm_pmc_counts_at_el0(struct kvm_pmc *pmc) From 16535d55e91f4d55134370e78e2b7f217e2ebc19 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 25 Oct 2024 18:23:51 +0000 Subject: [PATCH 74/83] KVM: arm64: nv: Honor MDCR_EL2.HLP Counters that fall in the hypervisor range (i.e. N >= HPMN) have a separate control for enabling 64 bit overflow. Take it into account. Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241025182354.3364124-17-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/pmu-emul.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 349886f03fd5..1e9cdbc235a8 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -89,7 +89,11 @@ static bool kvm_pmc_is_64bit(struct kvm_pmc *pmc) static bool kvm_pmc_has_64bit_overflow(struct kvm_pmc *pmc) { - u64 val = kvm_vcpu_read_pmcr(kvm_pmc_to_vcpu(pmc)); + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); + u64 val = kvm_vcpu_read_pmcr(vcpu); + + if (kvm_pmu_counter_is_hyp(vcpu, pmc->idx)) + return __vcpu_sys_reg(vcpu, MDCR_EL2) & MDCR_EL2_HLP; return (pmc->idx < ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LP)) || (pmc->idx == ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LC)); From 8a34979030f6bcb713476ae485a58f5d8e96ebea Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 25 Oct 2024 18:23:52 +0000 Subject: [PATCH 75/83] KVM: arm64: nv: Apply EL2 event filtering when in hyp context It hopefully comes as no surprise when I say that vEL2 actually runs at EL1. So, the guest hypervisor's EL2 event filter (NSH) needs to actually be applied to EL1 in the perf event. In addition to this, the disable bit for the guest counter range (HPMD) needs to have the effect of stopping the affected counters. Do exactly that by stuffing ::exclude_kernel with the combined effect of these controls. This isn't quite enough yet, as the backing perf events need to be reprogrammed upon nested ERET/exception entry to remap the effective filter onto ::exclude_kernel. Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241025182354.3364124-18-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/pmu-emul.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 1e9cdbc235a8..e2eb2ba903b6 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -653,6 +653,17 @@ static bool kvm_pmc_counts_at_el1(struct kvm_pmc *pmc) return p == nsk; } +static bool kvm_pmc_counts_at_el2(struct kvm_pmc *pmc) +{ + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); + u64 mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2); + + if (!kvm_pmu_counter_is_hyp(vcpu, pmc->idx) && (mdcr & MDCR_EL2_HPMD)) + return false; + + return kvm_pmc_read_evtreg(pmc) & ARMV8_PMU_INCLUDE_EL2; +} + /** * kvm_pmu_create_perf_event - create a perf event for a counter * @pmc: Counter context @@ -695,11 +706,19 @@ static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc) attr.pinned = 1; attr.disabled = !kvm_pmu_counter_is_enabled(pmc); attr.exclude_user = !kvm_pmc_counts_at_el0(pmc); - attr.exclude_kernel = !kvm_pmc_counts_at_el1(pmc); attr.exclude_hv = 1; /* Don't count EL2 events */ attr.exclude_host = 1; /* Don't count host events */ attr.config = eventsel; + /* + * Filter events at EL1 (i.e. vEL2) when in a hyp context based on the + * guest's EL2 filter. + */ + if (unlikely(is_hyp_ctxt(vcpu))) + attr.exclude_kernel = !kvm_pmc_counts_at_el2(pmc); + else + attr.exclude_kernel = !kvm_pmc_counts_at_el1(pmc); + /* * If counting with a 64bit counter, advertise it to the perf * code, carefully dealing with the initial sample period From ae323e035801def145776dddf46c01ca1b90d21d Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 25 Oct 2024 18:25:59 +0000 Subject: [PATCH 76/83] KVM: arm64: nv: Reprogram PMU events affected by nested transition Start reprogramming PMU events at nested boundaries now that everything is in place to handle the EL2 event filter. Only repaint events where the filter differs between EL1 and EL2 as a slight optimization. PMU now 'works' for nested VMs, albeit slow. Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241025182559.3364829-1-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/emulate-nested.c | 4 ++++ arch/arm64/kvm/pmu-emul.c | 29 +++++++++++++++++++++++++++++ include/kvm/arm_pmu.h | 3 +++ 3 files changed, 36 insertions(+) diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 897ea81ed2b2..71c97c214c1c 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -2450,6 +2450,8 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu) kvm_arch_vcpu_load(vcpu, smp_processor_id()); preempt_enable(); + + kvm_pmu_nested_transition(vcpu); } static void kvm_inject_el2_exception(struct kvm_vcpu *vcpu, u64 esr_el2, @@ -2532,6 +2534,8 @@ static int kvm_inject_nested(struct kvm_vcpu *vcpu, u64 esr_el2, kvm_arch_vcpu_load(vcpu, smp_processor_id()); preempt_enable(); + kvm_pmu_nested_transition(vcpu); + return 1; } diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index e2eb2ba903b6..8ad62284fa23 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -1215,3 +1215,32 @@ u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu) return u64_replace_bits(pmcr, vcpu->kvm->arch.pmcr_n, ARMV8_PMU_PMCR_N); } + +void kvm_pmu_nested_transition(struct kvm_vcpu *vcpu) +{ + bool reprogrammed = false; + unsigned long mask; + int i; + + if (!kvm_vcpu_has_pmu(vcpu)) + return; + + mask = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); + for_each_set_bit(i, &mask, 32) { + struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, i); + + /* + * We only need to reconfigure events where the filter is + * different at EL1 vs. EL2, as we're multiplexing the true EL1 + * event filter bit for nested. + */ + if (kvm_pmc_counts_at_el1(pmc) == kvm_pmc_counts_at_el2(pmc)) + continue; + + kvm_pmu_create_perf_event(pmc); + reprogrammed = true; + } + + if (reprogrammed) + kvm_vcpu_pmu_restore_guest(vcpu); +} diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 76244f0bd47a..e61dd7dd2286 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -98,6 +98,7 @@ u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm); u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu); bool kvm_pmu_counter_is_hyp(struct kvm_vcpu *vcpu, unsigned int idx); +void kvm_pmu_nested_transition(struct kvm_vcpu *vcpu); #else struct kvm_pmu { }; @@ -198,6 +199,8 @@ static inline bool kvm_pmu_counter_is_hyp(struct kvm_vcpu *vcpu, unsigned int id return false; } +static inline void kvm_pmu_nested_transition(struct kvm_vcpu *vcpu) {} + #endif #endif From 38d7aacca09230fdb98a34194fec2af597e8e20d Mon Sep 17 00:00:00 2001 From: Raghavendra Rao Ananta Date: Mon, 28 Oct 2024 23:45:33 +0000 Subject: [PATCH 77/83] KVM: arm64: Get rid of userspace_irqchip_in_use Improper use of userspace_irqchip_in_use led to syzbot hitting the following WARN_ON() in kvm_timer_update_irq(): WARNING: CPU: 0 PID: 3281 at arch/arm64/kvm/arch_timer.c:459 kvm_timer_update_irq+0x21c/0x394 Call trace: kvm_timer_update_irq+0x21c/0x394 arch/arm64/kvm/arch_timer.c:459 kvm_timer_vcpu_reset+0x158/0x684 arch/arm64/kvm/arch_timer.c:968 kvm_reset_vcpu+0x3b4/0x560 arch/arm64/kvm/reset.c:264 kvm_vcpu_set_target arch/arm64/kvm/arm.c:1553 [inline] kvm_arch_vcpu_ioctl_vcpu_init arch/arm64/kvm/arm.c:1573 [inline] kvm_arch_vcpu_ioctl+0x112c/0x1b3c arch/arm64/kvm/arm.c:1695 kvm_vcpu_ioctl+0x4ec/0xf74 virt/kvm/kvm_main.c:4658 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:907 [inline] __se_sys_ioctl fs/ioctl.c:893 [inline] __arm64_sys_ioctl+0x108/0x184 fs/ioctl.c:893 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline] invoke_syscall+0x78/0x1b8 arch/arm64/kernel/syscall.c:49 el0_svc_common+0xe8/0x1b0 arch/arm64/kernel/syscall.c:132 do_el0_svc+0x40/0x50 arch/arm64/kernel/syscall.c:151 el0_svc+0x54/0x14c arch/arm64/kernel/entry-common.c:712 el0t_64_sync_handler+0x84/0xfc arch/arm64/kernel/entry-common.c:730 el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:598 The following sequence led to the scenario: - Userspace creates a VM and a vCPU. - The vCPU is initialized with KVM_ARM_VCPU_PMU_V3 during KVM_ARM_VCPU_INIT. - Without any other setup, such as vGIC or vPMU, userspace issues KVM_RUN on the vCPU. Since the vPMU is requested, but not setup, kvm_arm_pmu_v3_enable() fails in kvm_arch_vcpu_run_pid_change(). As a result, KVM_RUN returns after enabling the timer, but before incrementing 'userspace_irqchip_in_use': kvm_arch_vcpu_run_pid_change() ret = kvm_arm_pmu_v3_enable() if (!vcpu->arch.pmu.created) return -EINVAL; if (ret) return ret; [...] if (!irqchip_in_kernel(kvm)) static_branch_inc(&userspace_irqchip_in_use); - Userspace ignores the error and issues KVM_ARM_VCPU_INIT again. Since the timer is already enabled, control moves through the following flow, ultimately hitting the WARN_ON(): kvm_timer_vcpu_reset() if (timer->enabled) kvm_timer_update_irq() if (!userspace_irqchip()) ret = kvm_vgic_inject_irq() ret = vgic_lazy_init() if (unlikely(!vgic_initialized(kvm))) if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2) return -EBUSY; WARN_ON(ret); Theoretically, since userspace_irqchip_in_use's functionality can be simply replaced by '!irqchip_in_kernel()', get rid of the static key to avoid the mismanagement, which also helps with the syzbot issue. Cc: Reported-by: syzbot Suggested-by: Marc Zyngier Signed-off-by: Raghavendra Rao Ananta Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_host.h | 2 -- arch/arm64/kvm/arch_timer.c | 3 +-- arch/arm64/kvm/arm.c | 18 +++--------------- 3 files changed, 4 insertions(+), 19 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 94cff508874b..b439aa6d4594 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -73,8 +73,6 @@ enum kvm_mode kvm_get_mode(void); static inline enum kvm_mode kvm_get_mode(void) { return KVM_MODE_NONE; }; #endif -DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); - extern unsigned int __ro_after_init kvm_sve_max_vl; extern unsigned int __ro_after_init kvm_host_sve_max_vl; int __init kvm_arm_init_sve(void); diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 879982b1cc73..1215df590418 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -206,8 +206,7 @@ void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map) static inline bool userspace_irqchip(struct kvm *kvm) { - return static_branch_unlikely(&userspace_irqchip_in_use) && - unlikely(!irqchip_in_kernel(kvm)); + return unlikely(!irqchip_in_kernel(kvm)); } static void soft_timer_start(struct hrtimer *hrt, u64 ns) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index ece934bb4531..33965a7c2131 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -69,7 +69,6 @@ DECLARE_KVM_NVHE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); static bool vgic_present, kvm_arm_initialised; static DEFINE_PER_CPU(unsigned char, kvm_hyp_initialized); -DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use); bool is_kvm_arm_initialised(void) { @@ -503,9 +502,6 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - if (vcpu_has_run_once(vcpu) && unlikely(!irqchip_in_kernel(vcpu->kvm))) - static_branch_dec(&userspace_irqchip_in_use); - kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); kvm_timer_vcpu_terminate(vcpu); kvm_pmu_vcpu_destroy(vcpu); @@ -848,14 +844,6 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) return ret; } - if (!irqchip_in_kernel(kvm)) { - /* - * Tell the rest of the code that there are userspace irqchip - * VMs in the wild. - */ - static_branch_inc(&userspace_irqchip_in_use); - } - mutex_lock(&kvm->arch.config_lock); set_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags); mutex_unlock(&kvm->arch.config_lock); @@ -1064,7 +1052,7 @@ static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret) * state gets updated in kvm_timer_update_run and * kvm_pmu_update_run below). */ - if (static_branch_unlikely(&userspace_irqchip_in_use)) { + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) { if (kvm_timer_should_notify_user(vcpu) || kvm_pmu_should_notify_user(vcpu)) { *ret = -EINTR; @@ -1186,7 +1174,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; isb(); /* Ensure work in x_flush_hwstate is committed */ kvm_pmu_sync_hwstate(vcpu); - if (static_branch_unlikely(&userspace_irqchip_in_use)) + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) kvm_timer_sync_user(vcpu); kvm_vgic_sync_hwstate(vcpu); local_irq_enable(); @@ -1232,7 +1220,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) * we don't want vtimer interrupts to race with syncing the * timer virtual interrupt state. */ - if (static_branch_unlikely(&userspace_irqchip_in_use)) + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) kvm_timer_sync_user(vcpu); kvm_arch_vcpu_ctxsync_fp(vcpu); From e9b57d7f9740deb31acb02a00bdf6653e60c7e61 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Tue, 22 Oct 2024 08:39:43 +0100 Subject: [PATCH 78/83] KVM: arm64: Make L1Ip feature in CTR_EL0 writable from userspace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only allow userspace to set VIPT(0b10) or PIPT(0b11) for L1Ip based on what hardware reports as both AIVIVT (0b01) and VPIPT (0b00) are documented as reserved. Using a VIPT for Guest where hardware reports PIPT may lead to over invalidation, but is still correct. Hence, we can allow downgrading PIPT to VIPT, but not the other way around. Reviewed-by: Sebastian Ott Reviewed-by: Marc Zyngier Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20241022073943.35764-1-shameerali.kolothum.thodi@huawei.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/sys_regs.c | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 7dc4a5ce5292..ca0001060421 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1881,6 +1881,34 @@ static int set_id_aa64pfr1_el1(struct kvm_vcpu *vcpu, return set_id_reg(vcpu, rd, user_val); } +static int set_ctr_el0(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 user_val) +{ + u8 user_L1Ip = SYS_FIELD_GET(CTR_EL0, L1Ip, user_val); + + /* + * Both AIVIVT (0b01) and VPIPT (0b00) are documented as reserved. + * Hence only allow to set VIPT(0b10) or PIPT(0b11) for L1Ip based + * on what hardware reports. + * + * Using a VIPT software model on PIPT will lead to over invalidation, + * but still correct. Hence, we can allow downgrading PIPT to VIPT, + * but not the other way around. This is handled via arm64_ftr_safe_value() + * as CTR_EL0 ftr_bits has L1Ip field with type FTR_EXACT and safe value + * set as VIPT. + */ + switch (user_L1Ip) { + case CTR_EL0_L1Ip_RESERVED_VPIPT: + case CTR_EL0_L1Ip_RESERVED_AIVIVT: + return -EINVAL; + case CTR_EL0_L1Ip_VIPT: + case CTR_EL0_L1Ip_PIPT: + return set_id_reg(vcpu, rd, user_val); + default: + return -ENOENT; + } +} + /* * cpufeature ID register user accessors * @@ -2651,10 +2679,12 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_CCSIDR2_EL1), undef_access }, { SYS_DESC(SYS_SMIDR_EL1), undef_access }, { SYS_DESC(SYS_CSSELR_EL1), access_csselr, reset_unknown, CSSELR_EL1 }, - ID_WRITABLE(CTR_EL0, CTR_EL0_DIC_MASK | - CTR_EL0_IDC_MASK | - CTR_EL0_DminLine_MASK | - CTR_EL0_IminLine_MASK), + ID_FILTERED(CTR_EL0, ctr_el0, + CTR_EL0_DIC_MASK | + CTR_EL0_IDC_MASK | + CTR_EL0_DminLine_MASK | + CTR_EL0_L1Ip_MASK | + CTR_EL0_IminLine_MASK), { SYS_DESC(SYS_SVCR), undef_access, reset_val, SVCR, 0, .visibility = sme_visibility }, { SYS_DESC(SYS_FPMR), undef_access, reset_val, FPMR, 0, .visibility = fp8_visibility }, From 5afe18dfa47daead88517b095b6e0ce012f031f8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 7 Nov 2024 11:39:59 -0800 Subject: [PATCH 79/83] KVM: selftests: Don't bother deleting memslots in KVM when freeing VMs When freeing a VM, don't call into KVM to manually remove each memslot, simply cleanup and free any userspace assets associated with the memory region. KVM is ultimately responsible for ensuring kernel resources are freed when the VM is destroyed, deleting memslots one-by-one is unnecessarily slow, and unless a test is already leaking the VM fd, the VM will be destroyed when kvm_vm_release() is called. Not deleting KVM's memslot also allows cleaning up dead VMs without having to care whether or not the to-be-freed VM is dead or alive. Reported-by: Eric Auger Reviewed-by: Eric Auger Tested-by: Eric Auger Reported-by: Mark Brown Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/kvmarm/Zy0bcM0m-N18gAZz@google.com/ Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/lib/kvm_util.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index a2b7df5f1d39..480e3a40d197 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -720,9 +720,6 @@ static void __vm_mem_region_delete(struct kvm_vm *vm, rb_erase(®ion->hva_node, &vm->regions.hva_tree); hash_del(®ion->slot_node); - region->region.memory_size = 0; - vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); - sparsebit_free(®ion->unused_phy_pages); sparsebit_free(®ion->protected_phy_pages); ret = munmap(region->mmap_start, region->mmap_size); @@ -1197,7 +1194,12 @@ void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa) */ void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot) { - __vm_mem_region_delete(vm, memslot2region(vm, slot)); + struct userspace_mem_region *region = memslot2region(vm, slot); + + region->region.memory_size = 0; + vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); + + __vm_mem_region_delete(vm, region); } void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t base, uint64_t size, From 7fe28d7e68f92cc3d0668b8f2fbdf5c303ac3022 Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Thu, 7 Nov 2024 13:41:34 -0800 Subject: [PATCH 80/83] KVM: arm64: vgic-its: Add a data length check in vgic_its_save_* In all the vgic_its_save_*() functinos, they do not check whether the data length is 8 bytes before calling vgic_write_guest_lock. This patch adds the check. To prevent the kernel from being blown up when the fault occurs, KVM_BUG_ON() is used. And the other BUG_ON()s are replaced together. Cc: stable@vger.kernel.org Signed-off-by: Kunkun Jiang [Jing: Update with the new entry read/write helpers] Signed-off-by: Jing Zhang Link: https://lore.kernel.org/r/20241107214137.428439-4-jingzhangos@google.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-its.c | 20 ++++++++------------ arch/arm64/kvm/vgic/vgic.h | 23 +++++++++++++++++++++++ 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index ba945ba78cc7..68ba7e2453cd 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -2086,7 +2086,6 @@ static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz, static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev, struct its_ite *ite, gpa_t gpa, int ite_esz) { - struct kvm *kvm = its->dev->kvm; u32 next_offset; u64 val; @@ -2095,7 +2094,8 @@ static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev, ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) | ite->collection->collection_id; val = cpu_to_le64(val); - return vgic_write_guest_lock(kvm, gpa, &val, ite_esz); + + return vgic_its_write_entry_lock(its, gpa, val, ite_esz); } /** @@ -2239,7 +2239,6 @@ static int vgic_its_restore_itt(struct vgic_its *its, struct its_device *dev) static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev, gpa_t ptr, int dte_esz) { - struct kvm *kvm = its->dev->kvm; u64 val, itt_addr_field; u32 next_offset; @@ -2250,7 +2249,8 @@ static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev, (itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) | (dev->num_eventid_bits - 1)); val = cpu_to_le64(val); - return vgic_write_guest_lock(kvm, ptr, &val, dte_esz); + + return vgic_its_write_entry_lock(its, ptr, val, dte_esz); } /** @@ -2437,7 +2437,8 @@ static int vgic_its_save_cte(struct vgic_its *its, ((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) | collection->collection_id); val = cpu_to_le64(val); - return vgic_write_guest_lock(its->dev->kvm, gpa, &val, esz); + + return vgic_its_write_entry_lock(its, gpa, val, esz); } /* @@ -2453,8 +2454,7 @@ static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz) u64 val; int ret; - BUG_ON(esz > sizeof(val)); - ret = kvm_read_guest_lock(kvm, gpa, &val, esz); + ret = vgic_its_read_entry_lock(its, gpa, &val, esz); if (ret) return ret; val = le64_to_cpu(val); @@ -2492,7 +2492,6 @@ static int vgic_its_save_collection_table(struct vgic_its *its) u64 baser = its->baser_coll_table; gpa_t gpa = GITS_BASER_ADDR_48_to_52(baser); struct its_collection *collection; - u64 val; size_t max_size, filled = 0; int ret, cte_esz = abi->cte_esz; @@ -2516,10 +2515,7 @@ static int vgic_its_save_collection_table(struct vgic_its *its) * table is not fully filled, add a last dummy element * with valid bit unset */ - val = 0; - BUG_ON(cte_esz > sizeof(val)); - ret = vgic_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz); - return ret; + return vgic_its_write_entry_lock(its, gpa, 0, cte_esz); } /* diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index f2486b4d9f95..309295f5e1b0 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -146,6 +146,29 @@ static inline int vgic_write_guest_lock(struct kvm *kvm, gpa_t gpa, return ret; } +static inline int vgic_its_read_entry_lock(struct vgic_its *its, gpa_t eaddr, + u64 *eval, unsigned long esize) +{ + struct kvm *kvm = its->dev->kvm; + + if (KVM_BUG_ON(esize != sizeof(*eval), kvm)) + return -EINVAL; + + return kvm_read_guest_lock(kvm, eaddr, eval, esize); + +} + +static inline int vgic_its_write_entry_lock(struct vgic_its *its, gpa_t eaddr, + u64 eval, unsigned long esize) +{ + struct kvm *kvm = its->dev->kvm; + + if (KVM_BUG_ON(esize != sizeof(eval), kvm)) + return -EINVAL; + + return vgic_write_guest_lock(kvm, eaddr, &eval, esize); +} + /* * This struct provides an intermediate representation of the fields contained * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC From e9649129d33dca561305fc590a7c4ba8c3e5675a Mon Sep 17 00:00:00 2001 From: Kunkun Jiang Date: Thu, 7 Nov 2024 13:41:36 -0800 Subject: [PATCH 81/83] KVM: arm64: vgic-its: Clear DTE when MAPD unmaps a device vgic_its_save_device_tables will traverse its->device_list to save DTE for each device. vgic_its_restore_device_tables will traverse each entry of device table and check if it is valid. Restore if valid. But when MAPD unmaps a device, it does not invalidate the corresponding DTE. In the scenario of continuous saves and restores, there may be a situation where a device's DTE is not saved but is restored. This is unreasonable and may cause restore to fail. This patch clears the corresponding DTE when MAPD unmaps a device. Cc: stable@vger.kernel.org Fixes: 57a9a117154c ("KVM: arm64: vgic-its: Device table save/restore") Co-developed-by: Shusen Li Signed-off-by: Shusen Li Signed-off-by: Kunkun Jiang [Jing: Update with entry write helper] Signed-off-by: Jing Zhang Link: https://lore.kernel.org/r/20241107214137.428439-5-jingzhangos@google.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-its.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 68ba7e2453cd..b77fa99eafed 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -1139,9 +1139,11 @@ static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its, bool valid = its_cmd_get_validbit(its_cmd); u8 num_eventid_bits = its_cmd_get_size(its_cmd); gpa_t itt_addr = its_cmd_get_ittaddr(its_cmd); + int dte_esz = vgic_its_get_abi(its)->dte_esz; struct its_device *device; + gpa_t gpa; - if (!vgic_its_check_id(its, its->baser_device_table, device_id, NULL)) + if (!vgic_its_check_id(its, its->baser_device_table, device_id, &gpa)) return E_ITS_MAPD_DEVICE_OOR; if (valid && num_eventid_bits > VITS_TYPER_IDBITS) @@ -1162,7 +1164,7 @@ static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its, * is an error, so we are done in any case. */ if (!valid) - return 0; + return vgic_its_write_entry_lock(its, gpa, 0, dte_esz); device = vgic_its_alloc_device(its, device_id, itt_addr, num_eventid_bits); From 7602ffd1d5e8927fadd5187cb4aed2fdc9c47143 Mon Sep 17 00:00:00 2001 From: Kunkun Jiang Date: Thu, 7 Nov 2024 13:41:37 -0800 Subject: [PATCH 82/83] KVM: arm64: vgic-its: Clear ITE when DISCARD frees an ITE When DISCARD frees an ITE, it does not invalidate the corresponding ITE. In the scenario of continuous saves and restores, there may be a situation where an ITE is not saved but is restored. This is unreasonable and may cause restore to fail. This patch clears the corresponding ITE when DISCARD frees an ITE. Cc: stable@vger.kernel.org Fixes: eff484e0298d ("KVM: arm64: vgic-its: ITT save and restore") Signed-off-by: Kunkun Jiang [Jing: Update with entry write helper] Signed-off-by: Jing Zhang Link: https://lore.kernel.org/r/20241107214137.428439-6-jingzhangos@google.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-its.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index b77fa99eafed..198296933e7e 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -782,6 +782,9 @@ static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its, ite = find_ite(its, device_id, event_id); if (ite && its_is_collection_mapped(ite->collection)) { + struct its_device *device = find_its_device(its, device_id); + int ite_esz = vgic_its_get_abi(its)->ite_esz; + gpa_t gpa = device->itt_addr + ite->event_id * ite_esz; /* * Though the spec talks about removing the pending state, we * don't bother here since we clear the ITTE anyway and the @@ -790,7 +793,8 @@ static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its, vgic_its_invalidate_cache(its); its_free_ite(kvm, ite); - return 0; + + return vgic_its_write_entry_lock(its, gpa, 0, ite_esz); } return E_ITS_DISCARD_UNMAPPED_INTERRUPT; From 60ad25e14ab5a4e56c8bf7f7d6846eacb9cd53df Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 12 Nov 2024 10:56:03 +0000 Subject: [PATCH 83/83] KVM: arm64: Pass on SVE mapping failures This function can fail but its return value isn't passed onto the caller. Presumably this could result in a broken state. Fixes: 66d5b53e20a6 ("KVM: arm64: Allocate memory mapped at hyp for host sve state in pKVM") Signed-off-by: James Clark Reviewed-by: Fuad Tabba Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20241112105604.795809-1-james.clark@linaro.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/nvhe/setup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 8fec099c2775..cbdd18cd3f98 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -146,8 +146,7 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, return ret; } - pkvm_create_host_sve_mappings(); - return 0; + return pkvm_create_host_sve_mappings(); } static void update_nvhe_init_params(void)