linux-stable/arch/arm64/kvm/emulate-nested.c
Marc Zyngier e58ec47bf6 KVM: arm64: nv: Add trap forwarding infrastructure
A significant part of what an NV hypervisor needs to do is to decide
whether a trap from an L2+ guest has to be forwarded to an L1 guest
or handled locally. This is done by checking the trap bits that
the guest hypervisor has set and acting accordingly, as described by
the architecture.

A previous approach was to sprinkle a bunch of checks in all the
system register accessors, but this is pretty error-prone and doesn't
help with getting an overview of what is happening.

Instead, implement a set of global tables that describe a trap bit,
combinations of trap bits, behaviours on trap, and what bits must
be evaluated on a system register trap.

Although this is painful to describe, it allows each and every
control bit to be specified in a static manner. To make it efficient,
the table is inserted in an xarray that is global to the system,
and checked each time we trap a system register while running
an L2 guest.

Add the basic infrastructure for now, while additional patches will
implement configuration registers.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Jing Zhang <jingzhangos@google.com>
Reviewed-by: Miguel Luis <miguel.luis@oracle.com>
Link: https://lore.kernel.org/r/20230815183903.2735724-15-maz@kernel.org
2023-08-17 10:00:27 +01:00

486 lines
12 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2016 - Linaro and Columbia University
* Author: Jintack Lim <jintack.lim@linaro.org>
*/
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_nested.h>
#include "hyp/include/hyp/adjust_pc.h"
#include "trace.h"
/*
 * Outcome of evaluating the trap controls for a trapped access.
 *
 * The values are bit flags and are OR-ed together while the controls
 * are evaluated: bit 0 covers reads, bit 1 covers writes.
 */
enum trap_behaviour {
/* No forwarding bit set */
BEHAVE_HANDLE_LOCALLY = 0,
/* Read accesses are to be forwarded */
BEHAVE_FORWARD_READ = BIT(0),
/* Write accesses are to be forwarded */
BEHAVE_FORWARD_WRITE = BIT(1),
/* Both reads and writes are to be forwarded */
BEHAVE_FORWARD_ANY = BEHAVE_FORWARD_READ | BEHAVE_FORWARD_WRITE,
};
/*
 * Description of a single trap control, as consumed by get_behaviour():
 * the control is active when (vcpu sysreg[index] & mask) == value, in
 * which case 'behaviour' is what it contributes.
 */
struct trap_bits {
/* vcpu system register holding the control */
const enum vcpu_sysreg index;
/* Behaviour contributed when the control matches */
const enum trap_behaviour behaviour;
/* Expected value of the masked register */
const u64 value;
/* Bits of the register that are compared against 'value' */
const u64 mask;
};
/*
 * Coarse Grained Trap definitions.
 *
 * The ID space is split in three consecutive ranges, delimited by the
 * marker values below. __compute_trap_behaviour() uses these markers to
 * decide how an ID must be evaluated.
 */
enum cgt_group_id {
/*
 * Indicates no coarse trap control. Also used as the terminator of
 * the ID lists built by MCB().
 */
__RESERVED__,
/*
 * The first batch of IDs denotes coarse trap controls that are used
 * on their own instead of being part of a combination of
 * trap controls.
 */
/*
 * Anything after this point is a combination of coarse trap
 * controls, which must all be evaluated to decide what to do.
 */
__MULTIPLE_CONTROL_BITS__,
/*
 * Anything after this point requires a callback evaluating a
 * complex trap condition. Hopefully we'll never need this...
 */
__COMPLEX_CONDITIONS__,
/*
 * Must be last. populate_nv_trap_config() checks at build time that
 * this does not exceed BIT(TC_CGT_BITS).
 */
__NR_CGT_GROUP_IDS__
};
/*
 * Descriptors for the individual coarse trap controls.
 * __compute_trap_behaviour() indexes this table directly with the
 * cgt_group_id, for IDs below __MULTIPLE_CONTROL_BITS__.
 * No control is described yet.
 */
static const struct trap_bits coarse_trap_bits[] = {
};
/*
 * MCB() - describe a combination of coarse trap controls.
 *
 * Expands to a designated initializer for coarse_control_combo[]: the
 * slot at (id - __MULTIPLE_CONTROL_BITS__) points to an anonymous array
 * made of the listed IDs followed by a __RESERVED__ terminator.
 *
 * @id is parenthesized so that any expression can safely be used as the
 * index.
 */
#define MCB(id, ...)						\
	[(id) - __MULTIPLE_CONTROL_BITS__]	=		\
		(const enum cgt_group_id[]){			\
			__VA_ARGS__, __RESERVED__		\
		}

/*
 * One __RESERVED__-terminated list of IDs per combination, indexed by
 * (cgt_group_id - __MULTIPLE_CONTROL_BITS__). No combination is described
 * yet.
 */
static const enum cgt_group_id *coarse_control_combo[] = {
};
/* Callback evaluating a complex trap condition for a given vcpu */
typedef enum trap_behaviour (*complex_condition_check)(struct kvm_vcpu *);

/*
 * CCC() - register the callback @fn for the complex condition @id.
 *
 * Expands to a designated initializer for ccc[], at index
 * (id - __COMPLEX_CONDITIONS__). The arguments are parenthesized so that
 * any expression can safely be passed.
 */
#define CCC(id, fn)				\
	[(id) - __COMPLEX_CONDITIONS__] = (fn)

/*
 * Complex condition callbacks, indexed by
 * (cgt_group_id - __COMPLEX_CONDITIONS__). None is registered yet.
 */
static const complex_condition_check ccc[] = {
};
/*
 * Bit assignment for the trap controls. We use a 64bit word with the
 * following layout for each trapped sysreg:
 *
 * [9:0]	enum cgt_group_id (10 bits)
 * [62:10]	Unused (53 bits)
 * [63]		RES0 - Must be zero, as lost on insertion in the xarray
 *
 * The whole word is accessible as 'val', which is what gets stored in
 * (and read back from) sr_forward_xa. populate_nv_trap_config() rejects
 * any entry that has bit 63 set, and checks at build time that the union
 * has the size of a pointer.
 */
#define TC_CGT_BITS 10
union trap_config {
u64 val;
struct {
unsigned long cgt:TC_CGT_BITS; /* Coarse Grained Trap id */
unsigned long unused:53; /* Unused, should be zero */
unsigned long mbz:1; /* Must Be Zero */
};
};
/*
 * Static association between a sysreg encoding (or a range of
 * encodings) and its trap configuration. These entries are inserted in
 * sr_forward_xa by populate_nv_trap_config().
 */
struct encoding_to_trap_config {
/* First encoding covered by this entry */
const u32 encoding;
/* Last encoding covered; equal to 'encoding' for a single sysreg */
const u32 end;
/* Trap configuration stored for these encodings */
const union trap_config tc;
/* Source line of the entry (__LINE__), printed when reporting errors */
const unsigned int line;
};
/*
 * SR_RANGE_TRAP() - initializer for a struct encoding_to_trap_config
 * covering the encodings from sr_start to sr_end, with trap_id as the
 * coarse grained trap id. The source line of the macro invocation is
 * recorded in .line.
 *
 * SR_TRAP() is the single-encoding variant (start and end are equal).
 */
#define SR_RANGE_TRAP(sr_start, sr_end, trap_id) \
{ \
.encoding = sr_start, \
.end = sr_end, \
.tc = { \
.cgt = trap_id, \
}, \
.line = __LINE__, \
}
#define SR_TRAP(sr, trap_id) SR_RANGE_TRAP(sr, sr, trap_id)
/*
 * Map encoding to trap bits for exceptions reported with EC=0x18.
 * These must only be evaluated when running a nested hypervisor, and
 * only when the current context is not a hypervisor context. When the
 * trapped access matches one of the trap controls, the exception is
 * re-injected in the nested hypervisor.
 *
 * The table is __initconst: it is only read by
 * populate_nv_trap_config(), which inserts its content in sr_forward_xa.
 * It is empty for now.
 */
static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = {
};
/*
 * Sysreg encoding -> union trap_config lookup. Filled by
 * populate_nv_trap_config() with xarray value entries, and queried by
 * get_trap_config().
 */
static DEFINE_XARRAY(sr_forward_xa);
/*
 * Look up the trap configuration recorded for @sysreg in sr_forward_xa.
 *
 * The returned configuration has a .val of 0 when nothing is recorded
 * for this encoding.
 */
static union trap_config get_trap_config(u32 sysreg)
{
	union trap_config tc;

	tc.val = xa_to_value(xa_load(&sr_forward_xa, sysreg));

	return tc;
}
static __init void print_nv_trap_error(const struct encoding_to_trap_config *tc,
const char *type, int err)
{
kvm_err("%s line %d encoding range "
"(%d, %d, %d, %d, %d) - (%d, %d, %d, %d, %d) (err=%d)\n",
type, tc->line,
sys_reg_Op0(tc->encoding), sys_reg_Op1(tc->encoding),
sys_reg_CRn(tc->encoding), sys_reg_CRm(tc->encoding),
sys_reg_Op2(tc->encoding),
sys_reg_Op0(tc->end), sys_reg_Op1(tc->end),
sys_reg_CRn(tc->end), sys_reg_CRm(tc->end),
sys_reg_Op2(tc->end),
err);
}
/*
 * populate_nv_trap_config() - build the sysreg to trap configuration lookup.
 *
 * Insert every entry of encoding_to_cgt[] in sr_forward_xa, keyed by its
 * encoding (or encoding range), then check that no combination of trap
 * controls refers to another combination, as that would make
 * __compute_trap_behaviour() recurse more than once.
 *
 * All the problems found are reported rather than stopping at the first
 * one. Note that a duplicate is only detected when inserting a single
 * encoding, not when inserting a range.
 *
 * On failure, the xarray is destroyed so that no partial configuration
 * is left behind.
 *
 * Return: 0 on success, a negative error code otherwise.
 */
int __init populate_nv_trap_config(void)
{
	int ret = 0;

	BUILD_BUG_ON(sizeof(union trap_config) != sizeof(void *));
	BUILD_BUG_ON(__NR_CGT_GROUP_IDS__ > BIT(TC_CGT_BITS));

	for (int i = 0; i < ARRAY_SIZE(encoding_to_cgt); i++) {
		const struct encoding_to_trap_config *cgt = &encoding_to_cgt[i];
		void *prev;

		/* Bit 63 cannot be stored in an xarray value entry */
		if (cgt->tc.val & BIT(63)) {
			kvm_err("CGT[%d] has MBZ bit set\n", i);
			ret = -EINVAL;
		}

		if (cgt->encoding != cgt->end) {
			prev = xa_store_range(&sr_forward_xa,
					      cgt->encoding, cgt->end,
					      xa_mk_value(cgt->tc.val),
					      GFP_KERNEL);
		} else {
			prev = xa_store(&sr_forward_xa, cgt->encoding,
					xa_mk_value(cgt->tc.val), GFP_KERNEL);
			/* A previous entry means the encoding is listed twice */
			if (prev && !xa_is_err(prev)) {
				ret = -EINVAL;
				print_nv_trap_error(cgt, "Duplicate CGT", ret);
			}
		}

		if (xa_is_err(prev)) {
			ret = xa_err(prev);
			print_nv_trap_error(cgt, "Failed CGT insertion", ret);
		}
	}

	/* ARRAY_SIZE() evaluates to a size_t, hence %zu */
	kvm_info("nv: %zu coarse grained trap handlers\n",
		 ARRAY_SIZE(encoding_to_cgt));

	for (int id = __MULTIPLE_CONTROL_BITS__; id < __COMPLEX_CONDITIONS__; id++) {
		const unsigned int idx = id - __MULTIPLE_CONTROL_BITS__;
		const enum cgt_group_id *cgids;

		/*
		 * The ID range can be larger than the table describing it
		 * (it is when the table is empty), and the table can have
		 * holes. Never read past its end nor dereference an empty
		 * slot.
		 */
		if (idx >= ARRAY_SIZE(coarse_control_combo))
			break;

		cgids = coarse_control_combo[idx];
		if (!cgids)
			continue;

		for (int i = 0; cgids[i] != __RESERVED__; i++) {
			if (cgids[i] >= __MULTIPLE_CONTROL_BITS__) {
				kvm_err("Recursive MCB %d/%d\n", id, cgids[i]);
				ret = -EINVAL;
			}
		}
	}

	if (ret)
		xa_destroy(&sr_forward_xa);

	return ret;
}
/*
 * Evaluate a single trap control for @vcpu.
 *
 * Return the behaviour described by @tb when the masked value of the
 * vcpu's copy of the control register matches the expected value, and
 * BEHAVE_HANDLE_LOCALLY otherwise.
 */
static enum trap_behaviour get_behaviour(struct kvm_vcpu *vcpu,
					 const struct trap_bits *tb)
{
	const u64 reg = __vcpu_sys_reg(vcpu, tb->index);

	if ((reg & tb->mask) != tb->value)
		return BEHAVE_HANDLE_LOCALLY;

	return tb->behaviour;
}
static enum trap_behaviour __compute_trap_behaviour(struct kvm_vcpu *vcpu,
const enum cgt_group_id id,
enum trap_behaviour b)
{
switch (id) {
const enum cgt_group_id *cgids;
case __RESERVED__ ... __MULTIPLE_CONTROL_BITS__ - 1:
if (likely(id != __RESERVED__))
b |= get_behaviour(vcpu, &coarse_trap_bits[id]);
break;
case __MULTIPLE_CONTROL_BITS__ ... __COMPLEX_CONDITIONS__ - 1:
/* Yes, this is recursive. Don't do anything stupid. */
cgids = coarse_control_combo[id - __MULTIPLE_CONTROL_BITS__];
for (int i = 0; cgids[i] != __RESERVED__; i++)
b |= __compute_trap_behaviour(vcpu, cgids[i], b);
break;
default:
if (ARRAY_SIZE(ccc))
b |= ccc[id - __COMPLEX_CONDITIONS__](vcpu);
break;
}
return b;
}
/*
 * Compute the behaviour for the coarse grained trap id held in @tc,
 * starting from BEHAVE_HANDLE_LOCALLY.
 */
static enum trap_behaviour compute_trap_behaviour(struct kvm_vcpu *vcpu,
						  const union trap_config tc)
{
	return __compute_trap_behaviour(vcpu, tc.cgt, BEHAVE_HANDLE_LOCALLY);
}
bool __check_nv_sr_forward(struct kvm_vcpu *vcpu)
{
union trap_config tc;
enum trap_behaviour b;
bool is_read;
u32 sysreg;
u64 esr;
if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu))
return false;
esr = kvm_vcpu_get_esr(vcpu);
sysreg = esr_sys64_to_sysreg(esr);
is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ;
tc = get_trap_config(sysreg);
/*
* A value of 0 for the whole entry means that we know nothing
* for this sysreg, and that it cannot be re-injected into the
* nested hypervisor. In this situation, let's cut it short.
*
* Note that ultimately, we could also make use of the xarray
* to store the index of the sysreg in the local descriptor
* array, avoiding another search... Hint, hint...
*/
if (!tc.val)
return false;
b = compute_trap_behaviour(vcpu, tc);
if (((b & BEHAVE_FORWARD_READ) && is_read) ||
((b & BEHAVE_FORWARD_WRITE) && !is_read))
goto inject;
return false;
inject:
trace_kvm_forward_sysreg_trap(vcpu, sysreg, is_read);
kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
return true;
}
/*
 * Sanitise the SPSR value used for an exception return from the
 * virtual EL2.
 *
 * Return @spsr unchanged when it describes a legal return. Otherwise,
 * return a value built from the current PSTATE with the IL bit set.
 */
static u64 kvm_check_illegal_exception_return(struct kvm_vcpu *vcpu, u64 spsr)
{
	/* PSTATE bits preserved on an illegal return: masks, flags, EL, SP */
	const u64 preserved = PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT |
			      PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT | PSR_V_BIT |
			      PSR_MODE_MASK | PSR_MODE32_BIT;
	const u64 mode = spsr & PSR_MODE_MASK;
	bool illegal;

	/*
	 * Possible causes for an Illegal Exception Return from EL2:
	 * - trying to return to EL3
	 * - trying to return to an illegal M value
	 * - trying to return to a 32bit EL
	 * - trying to return to EL1 with HCR_EL2.TGE set
	 */
	illegal = mode == PSR_MODE_EL3t || mode == PSR_MODE_EL3h ||
		  mode == 0b00001 || (mode & BIT(1)) ||
		  (spsr & PSR_MODE32_BIT);

	if (!illegal && vcpu_el2_tge_is_set(vcpu))
		illegal = mode == PSR_MODE_EL1t || mode == PSR_MODE_EL1h;

	if (!illegal)
		return spsr;

	/*
	 * The guest is playing with our nerves. Preserve EL, SP,
	 * masks, flags from the existing PSTATE, and set IL.
	 * The HW will then generate an Illegal State Exception
	 * immediately after ERET.
	 */
	return (*vcpu_cpsr(vcpu) & preserved) | PSR_IL_BIT;
}
/*
 * Emulate an ERET executed by the guest hypervisor: the vcpu's PC and
 * PSTATE are set from the ELR_EL2 and (sanitised) SPSR_EL2 values.
 */
void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu)
{
	u64 spsr, elr, mode;
	bool local_return;

	spsr = kvm_check_illegal_exception_return(vcpu,
						  vcpu_read_sys_reg(vcpu, SPSR_EL2));
	mode = spsr & (PSR_MODE_MASK | PSR_MODE32_BIT);

	/*
	 * A VHE guest hypervisor returning to its own userspace, or a
	 * hypervisor performing a local exception return, does not need
	 * the whole put/load dance: no registers to save/restore, no
	 * S2 MMU to switch.
	 */
	local_return = (mode == PSR_MODE_EL0t &&
			vcpu_el2_e2h_is_set(vcpu) &&
			vcpu_el2_tge_is_set(vcpu)) ||
		       mode == PSR_MODE_EL2h || mode == PSR_MODE_EL2t;

	if (!local_return) {
		preempt_disable();
		kvm_arch_vcpu_put(vcpu);

		elr = __vcpu_sys_reg(vcpu, ELR_EL2);
		trace_kvm_nested_eret(vcpu, elr, spsr);

		/*
		 * The current exception level is always the virtual EL2
		 * here, as HCR_EL2.NV is only set when entering the
		 * virtual EL2.
		 */
		*vcpu_pc(vcpu) = elr;
		*vcpu_cpsr(vcpu) = spsr;

		kvm_arch_vcpu_load(vcpu, smp_processor_id());
		preempt_enable();
		return;
	}

	/* Just do the canonical ERET */
	elr = vcpu_read_sys_reg(vcpu, ELR_EL2);
	*vcpu_pc(vcpu) = elr;
	*vcpu_cpsr(vcpu) = spsr;
	trace_kvm_nested_eret(vcpu, *vcpu_pc(vcpu), spsr);
}
/*
 * Pend an exception of the given @type for the vcpu's EL2.
 *
 * For a synchronous exception, @esr_el2 is also written to ESR_EL2.
 * Any type other than sync or irq only triggers a one-time warning.
 */
static void kvm_inject_el2_exception(struct kvm_vcpu *vcpu, u64 esr_el2,
				     enum exception_type type)
{
	trace_kvm_inject_nested_exception(vcpu, esr_el2, type);

	if (type == except_type_sync) {
		kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SYNC);
		vcpu_write_sys_reg(vcpu, esr_el2, ESR_EL2);
	} else if (type == except_type_irq) {
		kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_IRQ);
	} else {
		WARN_ONCE(1, "Unsupported EL2 exception injection %d\n", type);
	}
}
/*
 * Emulate taking an exception to EL2.
 * See ARM ARM J8.1.2 AArch64.TakeException()
 *
 * @vcpu:    the vcpu to inject the exception into
 * @esr_el2: syndrome passed on to kvm_inject_el2_exception()
 * @type:    kind of exception to inject
 *
 * Return: 1 once the exception has been injected, -EINVAL if the vcpu
 * has no nested virtualization support.
 */
static int kvm_inject_nested(struct kvm_vcpu *vcpu, u64 esr_el2,
enum exception_type type)
{
u64 pstate, mode;
bool direct_inject;
if (!vcpu_has_nv(vcpu)) {
kvm_err("Unexpected call to %s for the non-nesting configuration\n",
__func__);
return -EINVAL;
}
/*
 * As for ERET, we can avoid doing too much on the injection path by
 * checking that we either took the exception from a VHE host
 * userspace or from vEL2. In these cases, there is no change in
 * translation regime (or anything else), so let's do as little as
 * possible.
 */
pstate = *vcpu_cpsr(vcpu);
mode = pstate & (PSR_MODE_MASK | PSR_MODE32_BIT);
direct_inject = (mode == PSR_MODE_EL0t &&
vcpu_el2_e2h_is_set(vcpu) &&
vcpu_el2_tge_is_set(vcpu));
direct_inject |= (mode == PSR_MODE_EL2h || mode == PSR_MODE_EL2t);
if (direct_inject) {
kvm_inject_el2_exception(vcpu, esr_el2, type);
return 1;
}
/* The put/load sequence below runs with preemption disabled */
preempt_disable();
/*
 * We may have an exception or PC update in the EL0/EL1 context.
 * Commit it before entering EL2.
 */
__kvm_adjust_pc(vcpu);
kvm_arch_vcpu_put(vcpu);
kvm_inject_el2_exception(vcpu, esr_el2, type);
/*
 * A hard requirement is that a switch between EL1 and EL2
 * contexts has to happen between a put/load, so that we can
 * pick the correct timer and interrupt configuration, among
 * other things.
 *
 * Make sure the exception actually took place before we load
 * the new context.
 */
__kvm_adjust_pc(vcpu);
kvm_arch_vcpu_load(vcpu, smp_processor_id());
preempt_enable();
return 1;
}
/*
 * Inject a synchronous exception with syndrome @esr_el2 into the guest
 * hypervisor. Returns whatever kvm_inject_nested() returns.
 */
int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2)
{
	const enum exception_type type = except_type_sync;

	return kvm_inject_nested(vcpu, esr_el2, type);
}
int kvm_inject_nested_irq(struct kvm_vcpu *vcpu)
{
/*
* Do not inject an irq if the:
* - Current exception level is EL2, and
* - virtual HCR_EL2.TGE == 0
* - virtual HCR_EL2.IMO == 0
*
* See Table D1-17 "Physical interrupt target and masking when EL3 is
* not implemented and EL2 is implemented" in ARM DDI 0487C.a.
*/
if (vcpu_is_el2(vcpu) && !vcpu_el2_tge_is_set(vcpu) &&
!(__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_IMO))
return 1;
/* esr_el2 value doesn't matter for exits due to irqs. */
return kvm_inject_nested(vcpu, 0, except_type_irq);
}