linux-next/arch/arm64/kvm/at.c
Marc Zyngier 1c6801d565 KVM: arm64: Handle WXN attribute
Until now, we didn't really care about WXN, as it had no effect on
the R/W permissions (only execute could be dropped), and it was
therefore of no interest to AT.

However, with S1POE, WXN can revoke the Write permission if an
overlay is active and execution is allowed. This *is* relevant
to AT.

Add full handling of WXN so that we correctly handle this case.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20241023145345.1613824-38-maz@kernel.org
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
2024-10-31 02:44:23 +00:00
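
The rule described above can be condensed into the following small sketch. It is illustrative only and mirrors the R_QXXPC/R_NPBXC handling in compute_s1_permissions() further down in the file; the helper name apply_wxn() is made up for this example and is not a kernel function.

#include <stdbool.h>

/*
 * Illustrative only: how SCTLR_ELx.WXN combines with a permission
 * overlay. Without an active overlay, WXN strips execute from a
 * writable location; with an overlay active and execute allowed,
 * it strips write instead.
 */
static inline void apply_wxn(bool wxn, bool overlay, bool *w, bool *x)
{
	if (!wxn)
		return;

	if (!overlay && *w)
		*x = false;

	if (overlay && *x)
		*w = false;
}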


// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2017 - Linaro Ltd
* Author: Jintack Lim <jintack.lim@linaro.org>
*/
#include <linux/kvm_host.h>
#include <asm/esr.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
enum trans_regime {
TR_EL10,
TR_EL20,
TR_EL2,
};
struct s1_walk_info {
u64 baddr;
enum trans_regime regime;
unsigned int max_oa_bits;
unsigned int pgshift;
unsigned int txsz;
int sl;
bool hpd;
bool e0poe;
bool poe;
bool pan;
bool be;
bool s2;
};
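/*
 * Result of a stage-1 walk: pr/pw/px and ur/uw/ux are the privileged
 * and unprivileged R/W/X permissions, pov/uov tell whether a permission
 * overlay applies, and pwxn/uwxn whether SCTLR_ELx.WXN applies.
 * APTable, UXNTable and PXNTable accumulate the hierarchical attributes
 * gathered from table descriptors during the walk.
 */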
struct s1_walk_result {
union {
struct {
u64 desc;
u64 pa;
s8 level;
u8 APTable;
bool UXNTable;
bool PXNTable;
bool uwxn;
bool uov;
bool ur;
bool uw;
bool ux;
bool pwxn;
bool pov;
bool pr;
bool pw;
bool px;
};
struct {
u8 fst;
bool ptw;
bool s2;
};
};
bool failed;
};
static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2)
{
wr->fst = fst;
wr->ptw = ptw;
wr->s2 = s2;
wr->failed = true;
}
#define S1_MMU_DISABLED (-127)
static int get_ia_size(struct s1_walk_info *wi)
{
return 64 - wi->txsz;
}
/* Return true if the IPA is out of the OA range */
static bool check_output_size(u64 ipa, struct s1_walk_info *wi)
{
return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits));
}
/* Return the translation regime that applies to an AT instruction */
static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op)
{
/*
* We only get here from guest EL2, so the translation
* regime AT applies to is solely defined by {E2H,TGE}.
*/
switch (op) {
case OP_AT_S1E2R:
case OP_AT_S1E2W:
case OP_AT_S1E2A:
return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2;
default:
return (vcpu_el2_e2h_is_set(vcpu) &&
vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10;
}
}
static bool s1pie_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
{
if (!kvm_has_s1pie(vcpu->kvm))
return false;
switch (regime) {
case TR_EL2:
case TR_EL20:
return vcpu_read_sys_reg(vcpu, TCR2_EL2) & TCR2_EL2_PIE;
case TR_EL10:
return (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En) &&
(__vcpu_sys_reg(vcpu, TCR2_EL1) & TCR2_EL1x_PIE);
default:
BUG();
}
}
static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi)
{
u64 val;
if (!kvm_has_s1poe(vcpu->kvm)) {
wi->poe = wi->e0poe = false;
return;
}
switch (wi->regime) {
case TR_EL2:
case TR_EL20:
val = vcpu_read_sys_reg(vcpu, TCR2_EL2);
wi->poe = val & TCR2_EL2_POE;
wi->e0poe = (wi->regime == TR_EL20) && (val & TCR2_EL2_E0POE);
break;
case TR_EL10:
if (!(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En)) {
wi->poe = wi->e0poe = false;
return;
}
val = __vcpu_sys_reg(vcpu, TCR2_EL1);
wi->poe = val & TCR2_EL1x_POE;
wi->e0poe = val & TCR2_EL1x_E0POE;
}
}
static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
struct s1_walk_result *wr, u64 va)
{
u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr;
unsigned int stride, x;
bool va55, tbi, lva, as_el0;
hcr = __vcpu_sys_reg(vcpu, HCR_EL2);
wi->regime = compute_translation_regime(vcpu, op);
as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W);
wi->pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) &&
(*vcpu_cpsr(vcpu) & PSR_PAN_BIT);
va55 = va & BIT(55);
if (wi->regime == TR_EL2 && va55)
goto addrsz;
wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC));
switch (wi->regime) {
case TR_EL10:
sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
tcr = vcpu_read_sys_reg(vcpu, TCR_EL1);
ttbr = (va55 ?
vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
vcpu_read_sys_reg(vcpu, TTBR0_EL1));
break;
case TR_EL2:
case TR_EL20:
sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
ttbr = (va55 ?
vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
vcpu_read_sys_reg(vcpu, TTBR0_EL2));
break;
default:
BUG();
}
tbi = (wi->regime == TR_EL2 ?
FIELD_GET(TCR_EL2_TBI, tcr) :
(va55 ?
FIELD_GET(TCR_TBI1, tcr) :
FIELD_GET(TCR_TBI0, tcr)));
if (!tbi && (u64)sign_extend64(va, 55) != va)
goto addrsz;
va = (u64)sign_extend64(va, 55);
/* Let's put the MMU disabled case aside immediately */
switch (wi->regime) {
case TR_EL10:
/*
* If dealing with the EL1&0 translation regime, 3 things
* can disable the S1 translation:
*
* - HCR_EL2.DC = 1
* - HCR_EL2.{E2H,TGE} = {0,1}
* - SCTLR_EL1.M = 0
*
* The TGE part is interesting. If we have decided that this
* is EL1&0, then it means that either {E2H,TGE} == {1,0} or
* {0,x}, and we only need to test for TGE == 1.
*/
if (hcr & (HCR_DC | HCR_TGE)) {
wr->level = S1_MMU_DISABLED;
break;
}
fallthrough;
case TR_EL2:
case TR_EL20:
if (!(sctlr & SCTLR_ELx_M))
wr->level = S1_MMU_DISABLED;
break;
}
if (wr->level == S1_MMU_DISABLED) {
if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
goto addrsz;
wr->pa = va;
return 0;
}
wi->be = sctlr & SCTLR_ELx_EE;
wi->hpd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP);
wi->hpd &= (wi->regime == TR_EL2 ?
FIELD_GET(TCR_EL2_HPD, tcr) :
(va55 ?
FIELD_GET(TCR_HPD1, tcr) :
FIELD_GET(TCR_HPD0, tcr)));
/* R_JHSVW */
wi->hpd |= s1pie_enabled(vcpu, wi->regime);
/* Do we have POE? */
compute_s1poe(vcpu, wi);
/* R_BVXDG */
wi->hpd |= (wi->poe || wi->e0poe);
/* Someone was silly enough to encode TG0/TG1 differently */
if (va55) {
wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
tg = FIELD_GET(TCR_TG1_MASK, tcr);
switch (tg << TCR_TG1_SHIFT) {
case TCR_TG1_4K:
wi->pgshift = 12; break;
case TCR_TG1_16K:
wi->pgshift = 14; break;
case TCR_TG1_64K:
default: /* IMPDEF: treat any other value as 64k */
wi->pgshift = 16; break;
}
} else {
wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
tg = FIELD_GET(TCR_TG0_MASK, tcr);
switch (tg << TCR_TG0_SHIFT) {
case TCR_TG0_4K:
wi->pgshift = 12; break;
case TCR_TG0_16K:
wi->pgshift = 14; break;
case TCR_TG0_64K:
default: /* IMPDEF: treat any other value as 64k */
wi->pgshift = 16; break;
}
}
/* R_PLCGL, R_YXNYW */
if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) {
if (wi->txsz > 39)
goto transfault_l0;
} else {
if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47))
goto transfault_l0;
}
/* R_GTJBY, R_SXWGM */
switch (BIT(wi->pgshift)) {
case SZ_4K:
lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT);
lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS);
break;
case SZ_16K:
lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT);
lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS);
break;
case SZ_64K:
lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52);
break;
}
if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16))
goto transfault_l0;
ia_bits = get_ia_size(wi);
/* R_YYVYV, I_THCZK */
if ((!va55 && va > GENMASK(ia_bits - 1, 0)) ||
(va55 && va < GENMASK(63, ia_bits)))
goto transfault_l0;
/* I_ZFSYQ */
if (wi->regime != TR_EL2 &&
(tcr & (va55 ? TCR_EPD1_MASK : TCR_EPD0_MASK)))
goto transfault_l0;
/* R_BNDVG and following statements */
if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) &&
as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0)))
goto transfault_l0;
/* AArch64.S1StartLevel() */
stride = wi->pgshift - 3;
wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride);
ps = (wi->regime == TR_EL2 ?
FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr));
wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps));
/* Compute minimal alignment */
x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift);
wi->baddr = ttbr & TTBRx_EL1_BADDR;
/* R_VPBBF */
if (check_output_size(wi->baddr, wi))
goto addrsz;
wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);
return 0;
addrsz: /* Address Size Fault level 0 */
fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false, false);
return -EFAULT;
transfault_l0: /* Translation Fault level 0 */
fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false, false);
return -EFAULT;
}
static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
struct s1_walk_result *wr, u64 va)
{
u64 va_top, va_bottom, baddr, desc;
int level, stride, ret;
level = wi->sl;
stride = wi->pgshift - 3;
baddr = wi->baddr;
va_top = get_ia_size(wi) - 1;
while (1) {
u64 index, ipa;
va_bottom = (3 - level) * stride + wi->pgshift;
index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3);
ipa = baddr | index;
if (wi->s2) {
struct kvm_s2_trans s2_trans = {};
ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans);
if (ret) {
fail_s1_walk(wr,
(s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level,
true, true);
return ret;
}
if (!kvm_s2_trans_readable(&s2_trans)) {
fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level),
true, true);
return -EPERM;
}
ipa = kvm_s2_trans_output(&s2_trans);
}
ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc));
if (ret) {
fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level),
true, false);
return ret;
}
if (wi->be)
desc = be64_to_cpu((__force __be64)desc);
else
desc = le64_to_cpu((__force __le64)desc);
/* Invalid descriptor */
if (!(desc & BIT(0)))
goto transfault;
/* Block mapping, check validity down the line */
if (!(desc & BIT(1)))
break;
/* Page mapping */
if (level == 3)
break;
/* Table handling */
if (!wi->hpd) {
wr->APTable |= FIELD_GET(S1_TABLE_AP, desc);
wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc);
wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc);
}
baddr = desc & GENMASK_ULL(47, wi->pgshift);
/* Check for out-of-range OA */
if (check_output_size(baddr, wi))
goto addrsz;
/* Prepare for next round */
va_top = va_bottom - 1;
level++;
}
/* Block mapping, check the validity of the level */
if (!(desc & BIT(1))) {
bool valid_block = false;
switch (BIT(wi->pgshift)) {
case SZ_4K:
valid_block = level == 1 || level == 2;
break;
case SZ_16K:
case SZ_64K:
valid_block = level == 2;
break;
}
if (!valid_block)
goto transfault;
}
if (check_output_size(desc & GENMASK(47, va_bottom), wi))
goto addrsz;
va_bottom += contiguous_bit_shift(desc, wi, level);
wr->failed = false;
wr->level = level;
wr->desc = desc;
wr->pa = desc & GENMASK(47, va_bottom);
wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0);
return 0;
addrsz:
fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), true, false);
return -EINVAL;
transfault:
fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), true, false);
return -ENOENT;
}
struct mmu_config {
u64 ttbr0;
u64 ttbr1;
u64 tcr;
u64 mair;
u64 tcr2;
u64 pir;
u64 pire0;
u64 por_el0;
u64 por_el1;
u64 sctlr;
u64 vttbr;
u64 vtcr;
u64 hcr;
};
static void __mmu_config_save(struct mmu_config *config)
{
config->ttbr0 = read_sysreg_el1(SYS_TTBR0);
config->ttbr1 = read_sysreg_el1(SYS_TTBR1);
config->tcr = read_sysreg_el1(SYS_TCR);
config->mair = read_sysreg_el1(SYS_MAIR);
if (cpus_have_final_cap(ARM64_HAS_TCR2)) {
config->tcr2 = read_sysreg_el1(SYS_TCR2);
if (cpus_have_final_cap(ARM64_HAS_S1PIE)) {
config->pir = read_sysreg_el1(SYS_PIR);
config->pire0 = read_sysreg_el1(SYS_PIRE0);
}
if (system_supports_poe()) {
config->por_el1 = read_sysreg_el1(SYS_POR);
config->por_el0 = read_sysreg_s(SYS_POR_EL0);
}
}
config->sctlr = read_sysreg_el1(SYS_SCTLR);
config->vttbr = read_sysreg(vttbr_el2);
config->vtcr = read_sysreg(vtcr_el2);
config->hcr = read_sysreg(hcr_el2);
}
static void __mmu_config_restore(struct mmu_config *config)
{
write_sysreg(config->hcr, hcr_el2);
/*
* ARM errata 1165522 and 1530923 require TGE to be 1 before
* we update the guest state.
*/
asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
write_sysreg_el1(config->ttbr0, SYS_TTBR0);
write_sysreg_el1(config->ttbr1, SYS_TTBR1);
write_sysreg_el1(config->tcr, SYS_TCR);
write_sysreg_el1(config->mair, SYS_MAIR);
if (cpus_have_final_cap(ARM64_HAS_TCR2)) {
write_sysreg_el1(config->tcr2, SYS_TCR2);
if (cpus_have_final_cap(ARM64_HAS_S1PIE)) {
write_sysreg_el1(config->pir, SYS_PIR);
write_sysreg_el1(config->pire0, SYS_PIRE0);
}
if (system_supports_poe()) {
write_sysreg_el1(config->por_el1, SYS_POR);
write_sysreg_s(config->por_el0, SYS_POR_EL0);
}
}
write_sysreg_el1(config->sctlr, SYS_SCTLR);
write_sysreg(config->vttbr, vttbr_el2);
write_sysreg(config->vtcr, vtcr_el2);
}
static bool at_s1e1p_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
u64 host_pan;
bool fail;
host_pan = read_sysreg_s(SYS_PSTATE_PAN);
write_sysreg_s(*vcpu_cpsr(vcpu) & PSTATE_PAN, SYS_PSTATE_PAN);
switch (op) {
case OP_AT_S1E1RP:
fail = __kvm_at(OP_AT_S1E1RP, vaddr);
break;
case OP_AT_S1E1WP:
fail = __kvm_at(OP_AT_S1E1WP, vaddr);
break;
}
write_sysreg_s(host_pan, SYS_PSTATE_PAN);
return fail;
}
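/*
 * MAIR-style memory attribute encoding: outer cacheability in
 * bits [7:4], inner cacheability in bits [3:0]. A zero outer
 * field denotes Device memory.
 */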
#define MEMATTR(ic, oc) (MEMATTR_##oc << 4 | MEMATTR_##ic)
#define MEMATTR_NC 0b0100
#define MEMATTR_Wt 0b1000
#define MEMATTR_Wb 0b1100
#define MEMATTR_WbRaWa 0b1111
#define MEMATTR_IS_DEVICE(m) (((m) & GENMASK(7, 4)) == 0)
static u8 s2_memattr_to_attr(u8 memattr)
{
memattr &= 0b1111;
switch (memattr) {
case 0b0000:
case 0b0001:
case 0b0010:
case 0b0011:
return memattr << 2;
case 0b0100:
return MEMATTR(Wb, Wb);
case 0b0101:
return MEMATTR(NC, NC);
case 0b0110:
return MEMATTR(Wt, NC);
case 0b0111:
return MEMATTR(Wb, NC);
case 0b1000:
/* Reserved, assume NC */
return MEMATTR(NC, NC);
case 0b1001:
return MEMATTR(NC, Wt);
case 0b1010:
return MEMATTR(Wt, Wt);
case 0b1011:
return MEMATTR(Wb, Wt);
case 0b1100:
/* Reserved, assume NC */
return MEMATTR(NC, NC);
case 0b1101:
return MEMATTR(NC, Wb);
case 0b1110:
return MEMATTR(Wt, Wb);
case 0b1111:
return MEMATTR(Wb, Wb);
default:
unreachable();
}
}
static u8 combine_s1_s2_attr(u8 s1, u8 s2)
{
bool transient;
u8 final = 0;
/* Upgrade transient s1 to non-transient to simplify things */
switch (s1) {
case 0b0001 ... 0b0011: /* Normal, Write-Through Transient */
transient = true;
s1 = MEMATTR_Wt | (s1 & GENMASK(1,0));
break;
case 0b0101 ... 0b0111: /* Normal, Write-Back Transient */
transient = true;
s1 = MEMATTR_Wb | (s1 & GENMASK(1,0));
break;
default:
transient = false;
}
/* S2CombineS1AttrHints() */
if ((s1 & GENMASK(3, 2)) == MEMATTR_NC ||
(s2 & GENMASK(3, 2)) == MEMATTR_NC)
final = MEMATTR_NC;
else if ((s1 & GENMASK(3, 2)) == MEMATTR_Wt ||
(s2 & GENMASK(3, 2)) == MEMATTR_Wt)
final = MEMATTR_Wt;
else
final = MEMATTR_Wb;
if (final != MEMATTR_NC) {
/* Inherit RaWa hints from S1 */
if (transient) {
switch (s1 & GENMASK(3, 2)) {
case MEMATTR_Wt:
final = 0;
break;
case MEMATTR_Wb:
final = MEMATTR_NC;
break;
}
}
final |= s1 & GENMASK(1, 0);
}
return final;
}
#define ATTR_NSH 0b00
#define ATTR_RSV 0b01
#define ATTR_OSH 0b10
#define ATTR_ISH 0b11
static u8 compute_sh(u8 attr, u64 desc)
{
u8 sh;
/* Any form of device, as well as NC has SH[1:0]=0b10 */
if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC))
return ATTR_OSH;
sh = FIELD_GET(PTE_SHARED, desc);
if (sh == ATTR_RSV) /* Reserved, mapped to NSH */
sh = ATTR_NSH;
return sh;
}
static u8 combine_sh(u8 s1_sh, u8 s2_sh)
{
if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH)
return ATTR_OSH;
if (s1_sh == ATTR_ISH || s2_sh == ATTR_ISH)
return ATTR_ISH;
return ATTR_NSH;
}
static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
struct kvm_s2_trans *tr)
{
u8 s1_parattr, s2_memattr, final_attr;
u64 par;
/* If S2 has failed to translate, report the damage */
if (tr->esr) {
par = SYS_PAR_EL1_RES1;
par |= SYS_PAR_EL1_F;
par |= SYS_PAR_EL1_S;
par |= FIELD_PREP(SYS_PAR_EL1_FST, tr->esr);
return par;
}
s1_parattr = FIELD_GET(SYS_PAR_EL1_ATTR, s1_par);
s2_memattr = FIELD_GET(GENMASK(5, 2), tr->desc);
if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_FWB) {
if (!kvm_has_feat(vcpu->kvm, ID_AA64PFR2_EL1, MTEPERM, IMP))
s2_memattr &= ~BIT(3);
/* Combination of R_VRJSW and R_RHWZM */
switch (s2_memattr) {
case 0b0101:
if (MEMATTR_IS_DEVICE(s1_parattr))
final_attr = s1_parattr;
else
final_attr = MEMATTR(NC, NC);
break;
case 0b0110:
case 0b1110:
final_attr = MEMATTR(WbRaWa, WbRaWa);
break;
case 0b0111:
case 0b1111:
/* Preserve S1 attribute */
final_attr = s1_parattr;
break;
case 0b0100:
case 0b1100:
case 0b1101:
/* Reserved, do something non-silly */
final_attr = s1_parattr;
break;
default:
/* MemAttr[2]=0, Device from S2 */
final_attr = s2_memattr & GENMASK(1,0) << 2;
}
} else {
/* Combination of R_HMNDG, R_TNHFM and R_GQFSF */
u8 s2_parattr = s2_memattr_to_attr(s2_memattr);
if (MEMATTR_IS_DEVICE(s1_parattr) ||
MEMATTR_IS_DEVICE(s2_parattr)) {
final_attr = min(s1_parattr, s2_parattr);
} else {
/* At this stage, this is memory vs memory */
final_attr = combine_s1_s2_attr(s1_parattr & 0xf,
s2_parattr & 0xf);
final_attr |= combine_s1_s2_attr(s1_parattr >> 4,
s2_parattr >> 4) << 4;
}
}
if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_CD) &&
!MEMATTR_IS_DEVICE(final_attr))
final_attr = MEMATTR(NC, NC);
par = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr);
par |= tr->output & GENMASK(47, 12);
par |= FIELD_PREP(SYS_PAR_EL1_SH,
combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par),
compute_sh(final_attr, tr->desc)));
return par;
}
static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr,
enum trans_regime regime)
{
u64 par;
if (wr->failed) {
par = SYS_PAR_EL1_RES1;
par |= SYS_PAR_EL1_F;
par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst);
par |= wr->ptw ? SYS_PAR_EL1_PTW : 0;
par |= wr->s2 ? SYS_PAR_EL1_S : 0;
} else if (wr->level == S1_MMU_DISABLED) {
/* MMU off or HCR_EL2.DC == 1 */
par = SYS_PAR_EL1_NSE;
par |= wr->pa & GENMASK_ULL(47, 12);
if (regime == TR_EL10 &&
(__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
par |= FIELD_PREP(SYS_PAR_EL1_ATTR,
MEMATTR(WbRaWa, WbRaWa));
par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_NSH);
} else {
par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */
par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_OSH);
}
} else {
u64 mair, sctlr;
u8 sh;
par = SYS_PAR_EL1_NSE;
mair = (regime == TR_EL10 ?
vcpu_read_sys_reg(vcpu, MAIR_EL1) :
vcpu_read_sys_reg(vcpu, MAIR_EL2));
mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8;
mair &= 0xff;
sctlr = (regime == TR_EL10 ?
vcpu_read_sys_reg(vcpu, SCTLR_EL1) :
vcpu_read_sys_reg(vcpu, SCTLR_EL2));
/* Force NC for memory if SCTLR_ELx.C is clear */
if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair))
mair = MEMATTR(NC, NC);
par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair);
par |= wr->pa & GENMASK_ULL(47, 12);
sh = compute_sh(mair, wr->desc);
par |= FIELD_PREP(SYS_PAR_EL1_SH, sh);
}
return par;
}
static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
{
u64 sctlr;
if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3))
return false;
if (s1pie_enabled(vcpu, regime))
return true;
if (regime == TR_EL10)
sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
else
sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
return sctlr & SCTLR_EL1_EPAN;
}
static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu,
struct s1_walk_info *wi,
struct s1_walk_result *wr)
{
bool wxn;
/* Non-hierarchical part of AArch64.S1DirectBasePermissions() */
if (wi->regime != TR_EL2) {
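/*
 * The switch value is {AP[2],AP[1]} = {PTE_RDONLY,PTE_USER}:
 * AP[1] grants unprivileged (EL0) access, AP[2] makes the
 * mapping read-only.
 */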
switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr->desc)) {
case 0b00:
wr->pr = wr->pw = true;
wr->ur = wr->uw = false;
break;
case 0b01:
wr->pr = wr->pw = wr->ur = wr->uw = true;
break;
case 0b10:
wr->pr = true;
wr->pw = wr->ur = wr->uw = false;
break;
case 0b11:
wr->pr = wr->ur = true;
wr->pw = wr->uw = false;
break;
}
/* We don't use px for anything yet, but hey... */
wr->px = !((wr->desc & PTE_PXN) || wr->uw);
wr->ux = !(wr->desc & PTE_UXN);
} else {
wr->ur = wr->uw = wr->ux = false;
if (!(wr->desc & PTE_RDONLY)) {
wr->pr = wr->pw = true;
} else {
wr->pr = true;
wr->pw = false;
}
/* XN maps to UXN */
wr->px = !(wr->desc & PTE_UXN);
}
switch (wi->regime) {
case TR_EL2:
case TR_EL20:
wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_WXN);
break;
case TR_EL10:
wxn = (__vcpu_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN);
break;
}
wr->pwxn = wr->uwxn = wxn;
wr->pov = wi->poe;
wr->uov = wi->e0poe;
}
static void compute_s1_hierarchical_permissions(struct kvm_vcpu *vcpu,
struct s1_walk_info *wi,
struct s1_walk_result *wr)
{
/* Hierarchical part of AArch64.S1DirectBasePermissions() */
if (wi->regime != TR_EL2) {
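/*
 * APTable restrictions accumulated from table descriptors:
 * bit 0 removes unprivileged access, bit 1 removes write access.
 */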
switch (wr->APTable) {
case 0b00:
break;
case 0b01:
wr->ur = wr->uw = false;
break;
case 0b10:
wr->pw = wr->uw = false;
break;
case 0b11:
wr->pw = wr->ur = wr->uw = false;
break;
}
wr->px &= !wr->PXNTable;
wr->ux &= !wr->UXNTable;
} else {
if (wr->APTable & BIT(1))
wr->pw = false;
/* XN maps to UXN */
wr->px &= !wr->UXNTable;
}
}
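/*
 * Extract the 4-bit permission encoding for index 'i' from a
 * PIRx_ELx/PORx_ELx register (16 indices of 4 bits each).
 */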
#define perm_idx(v, r, i) ((vcpu_read_sys_reg((v), (r)) >> ((i) * 4)) & 0xf)
#define set_priv_perms(wr, r, w, x) \
do { \
(wr)->pr = (r); \
(wr)->pw = (w); \
(wr)->px = (x); \
} while (0)
#define set_unpriv_perms(wr, r, w, x) \
do { \
(wr)->ur = (r); \
(wr)->uw = (w); \
(wr)->ux = (x); \
} while (0)
#define set_priv_wxn(wr, v) \
do { \
(wr)->pwxn = (v); \
} while (0)
#define set_unpriv_wxn(wr, v) \
do { \
(wr)->uwxn = (v); \
} while (0)
/* Similar to AArch64.S1IndirectBasePermissions(), without GCS */
#define set_perms(w, wr, ip) \
do { \
/* R_LLZDZ */ \
switch ((ip)) { \
case 0b0000: \
set_ ## w ## _perms((wr), false, false, false); \
break; \
case 0b0001: \
set_ ## w ## _perms((wr), true , false, false); \
break; \
case 0b0010: \
set_ ## w ## _perms((wr), false, false, true ); \
break; \
case 0b0011: \
set_ ## w ## _perms((wr), true , false, true ); \
break; \
case 0b0100: \
set_ ## w ## _perms((wr), false, false, false); \
break; \
case 0b0101: \
set_ ## w ## _perms((wr), true , true , false); \
break; \
case 0b0110: \
set_ ## w ## _perms((wr), true , true , true ); \
break; \
case 0b0111: \
set_ ## w ## _perms((wr), true , true , true ); \
break; \
case 0b1000: \
set_ ## w ## _perms((wr), true , false, false); \
break; \
case 0b1001: \
set_ ## w ## _perms((wr), true , false, false); \
break; \
case 0b1010: \
set_ ## w ## _perms((wr), true , false, true ); \
break; \
case 0b1011: \
set_ ## w ## _perms((wr), false, false, false); \
break; \
case 0b1100: \
set_ ## w ## _perms((wr), true , true , false); \
break; \
case 0b1101: \
set_ ## w ## _perms((wr), false, false, false); \
break; \
case 0b1110: \
set_ ## w ## _perms((wr), true , true , true ); \
break; \
case 0b1111: \
set_ ## w ## _perms((wr), false, false, false); \
break; \
} \
\
/* R_HJYGR */ \
set_ ## w ## _wxn((wr), ((ip) == 0b0110)); \
\
} while (0)
static void compute_s1_indirect_permissions(struct kvm_vcpu *vcpu,
struct s1_walk_info *wi,
struct s1_walk_result *wr)
{
u8 up, pp, idx;
idx = pte_pi_index(wr->desc);
switch (wi->regime) {
case TR_EL10:
pp = perm_idx(vcpu, PIR_EL1, idx);
up = perm_idx(vcpu, PIRE0_EL1, idx);
break;
case TR_EL20:
pp = perm_idx(vcpu, PIR_EL2, idx);
up = perm_idx(vcpu, PIRE0_EL2, idx);
break;
case TR_EL2:
pp = perm_idx(vcpu, PIR_EL2, idx);
up = 0;
break;
}
set_perms(priv, wr, pp);
if (wi->regime != TR_EL2)
set_perms(unpriv, wr, up);
else
set_unpriv_perms(wr, false, false, false);
wr->pov = wi->poe && !(pp & BIT(3));
wr->uov = wi->e0poe && !(up & BIT(3));
/* R_VFPJF */
if (wr->px && wr->uw) {
set_priv_perms(wr, false, false, false);
set_unpriv_perms(wr, false, false, false);
}
}
static void compute_s1_overlay_permissions(struct kvm_vcpu *vcpu,
struct s1_walk_info *wi,
struct s1_walk_result *wr)
{
u8 idx, pov_perms, uov_perms;
idx = FIELD_GET(PTE_PO_IDX_MASK, wr->desc);
switch (wi->regime) {
case TR_EL10:
pov_perms = perm_idx(vcpu, POR_EL1, idx);
uov_perms = perm_idx(vcpu, POR_EL0, idx);
break;
case TR_EL20:
pov_perms = perm_idx(vcpu, POR_EL2, idx);
uov_perms = perm_idx(vcpu, POR_EL0, idx);
break;
case TR_EL2:
pov_perms = perm_idx(vcpu, POR_EL2, idx);
uov_perms = 0;
break;
}
if (pov_perms & ~POE_RXW)
pov_perms = POE_NONE;
if (wi->poe && wr->pov) {
wr->pr &= pov_perms & POE_R;
wr->px &= pov_perms & POE_X;
wr->pw &= pov_perms & POE_W;
}
if (uov_perms & ~POE_RXW)
uov_perms = POE_NONE;
if (wi->e0poe && wr->uov) {
wr->ur &= uov_perms & POE_R;
wr->ux &= uov_perms & POE_X;
wr->uw &= uov_perms & POE_W;
}
}
static void compute_s1_permissions(struct kvm_vcpu *vcpu,
struct s1_walk_info *wi,
struct s1_walk_result *wr)
{
bool pan;
if (!s1pie_enabled(vcpu, wi->regime))
compute_s1_direct_permissions(vcpu, wi, wr);
else
compute_s1_indirect_permissions(vcpu, wi, wr);
if (!wi->hpd)
compute_s1_hierarchical_permissions(vcpu, wi, wr);
if (wi->poe || wi->e0poe)
compute_s1_overlay_permissions(vcpu, wi, wr);
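/*
 * WXN handling: without an active overlay, a writable mapping loses
 * its execute permission. With an overlay active, it is the other
 * way around: an executable mapping loses its write permission.
 */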
/* R_QXXPC */
if (wr->pwxn) {
if (!wr->pov && wr->pw)
wr->px = false;
if (wr->pov && wr->px)
wr->pw = false;
}
/* R_NPBXC */
if (wr->uwxn) {
if (!wr->uov && wr->uw)
wr->ux = false;
if (wr->uov && wr->ux)
wr->uw = false;
}
pan = wi->pan && (wr->ur || wr->uw ||
(pan3_enabled(vcpu, wi->regime) && wr->ux));
wr->pw &= !pan;
wr->pr &= !pan;
}
static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
struct s1_walk_result wr = {};
struct s1_walk_info wi = {};
bool perm_fail = false;
int ret, idx;
ret = setup_s1_walk(vcpu, op, &wi, &wr, vaddr);
if (ret)
goto compute_par;
if (wr.level == S1_MMU_DISABLED)
goto compute_par;
idx = srcu_read_lock(&vcpu->kvm->srcu);
ret = walk_s1(vcpu, &wi, &wr, vaddr);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
if (ret)
goto compute_par;
compute_s1_permissions(vcpu, &wi, &wr);
switch (op) {
case OP_AT_S1E1RP:
case OP_AT_S1E1R:
case OP_AT_S1E2R:
perm_fail = !wr.pr;
break;
case OP_AT_S1E1WP:
case OP_AT_S1E1W:
case OP_AT_S1E2W:
perm_fail = !wr.pw;
break;
case OP_AT_S1E0R:
perm_fail = !wr.ur;
break;
case OP_AT_S1E0W:
perm_fail = !wr.uw;
break;
case OP_AT_S1E1A:
case OP_AT_S1E2A:
break;
default:
BUG();
}
if (perm_fail)
fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false, false);
compute_par:
return compute_par_s1(vcpu, &wr, wi.regime);
}
/*
* Return the PAR_EL1 value as the result of a valid translation.
*
* If the translation is unsuccessful, the value may only contain
* PAR_EL1.F, and cannot be taken at face value. It isn't an
* indication of the translation having failed, only that the fast
* path did not succeed, *unless* it indicates an S1 permission fault.
*/
static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
struct mmu_config config;
struct kvm_s2_mmu *mmu;
bool fail;
u64 par;
par = SYS_PAR_EL1_F;
/*
* We've trapped, so everything is live on the CPU. As we will
* be switching contexts behind everybody's back, disable
* interrupts while holding the mmu lock.
*/
guard(write_lock_irqsave)(&vcpu->kvm->mmu_lock);
/*
* If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already
* the right one (as we trapped from vEL2). If not, save the
* full MMU context.
*/
if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))
goto skip_mmu_switch;
/*
* Obtaining the S2 MMU for an L2 is horribly racy, and we may not
* find it (recycled by another vcpu, for example). When this
* happens, admit defeat immediately and use the SW (slow) path.
*/
mmu = lookup_s2_mmu(vcpu);
if (!mmu)
return par;
__mmu_config_save(&config);
write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1), SYS_TTBR0);
write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1), SYS_TTBR1);
write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1), SYS_TCR);
write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1), SYS_MAIR);
if (kvm_has_tcr2(vcpu->kvm)) {
write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR2_EL1), SYS_TCR2);
if (kvm_has_s1pie(vcpu->kvm)) {
write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIR_EL1), SYS_PIR);
write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIRE0_EL1), SYS_PIRE0);
}
if (kvm_has_s1poe(vcpu->kvm)) {
write_sysreg_el1(vcpu_read_sys_reg(vcpu, POR_EL1), SYS_POR);
write_sysreg_s(vcpu_read_sys_reg(vcpu, POR_EL0), SYS_POR_EL0);
}
}
write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1), SYS_SCTLR);
__load_stage2(mmu, mmu->arch);
skip_mmu_switch:
/* Clear TGE, enable S2 translation, we're rolling */
write_sysreg((config.hcr & ~HCR_TGE) | HCR_VM, hcr_el2);
isb();
switch (op) {
case OP_AT_S1E1RP:
case OP_AT_S1E1WP:
fail = at_s1e1p_fast(vcpu, op, vaddr);
break;
case OP_AT_S1E1R:
fail = __kvm_at(OP_AT_S1E1R, vaddr);
break;
case OP_AT_S1E1W:
fail = __kvm_at(OP_AT_S1E1W, vaddr);
break;
case OP_AT_S1E0R:
fail = __kvm_at(OP_AT_S1E0R, vaddr);
break;
case OP_AT_S1E0W:
fail = __kvm_at(OP_AT_S1E0W, vaddr);
break;
case OP_AT_S1E1A:
fail = __kvm_at(OP_AT_S1E1A, vaddr);
break;
default:
WARN_ON_ONCE(1);
fail = true;
break;
}
if (!fail)
par = read_sysreg_par();
if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)))
__mmu_config_restore(&config);
return par;
}
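/*
 * True if PAR_EL1 reports a permission fault taken at stage 1
 * (PAR_EL1.S clear), i.e. the S1 tables themselves were walkable.
 */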
static bool par_check_s1_perm_fault(u64 par)
{
u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM &&
!(par & SYS_PAR_EL1_S));
}
void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr);
/*
* If PAR_EL1 reports that AT failed on a S1 permission fault, we
* know for sure that the PTW was able to walk the S1 tables and
* there's nothing else to do.
*
* If AT failed for any other reason, then we must walk the guest S1
* to emulate the instruction.
*/
if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
par = handle_at_slow(vcpu, op, vaddr);
vcpu_write_sys_reg(vcpu, par, PAR_EL1);
}
void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
u64 par;
/*
* We've trapped, so everything is live on the CPU. As we will be
* switching context behind everybody's back, disable interrupts...
*/
scoped_guard(write_lock_irqsave, &vcpu->kvm->mmu_lock) {
u64 val, hcr;
bool fail;
val = hcr = read_sysreg(hcr_el2);
val &= ~HCR_TGE;
val |= HCR_VM;
if (!vcpu_el2_e2h_is_set(vcpu))
val |= HCR_NV | HCR_NV1;
write_sysreg(val, hcr_el2);
isb();
par = SYS_PAR_EL1_F;
switch (op) {
case OP_AT_S1E2R:
fail = __kvm_at(OP_AT_S1E1R, vaddr);
break;
case OP_AT_S1E2W:
fail = __kvm_at(OP_AT_S1E1W, vaddr);
break;
case OP_AT_S1E2A:
fail = __kvm_at(OP_AT_S1E1A, vaddr);
break;
default:
WARN_ON_ONCE(1);
fail = true;
}
isb();
if (!fail)
par = read_sysreg_par();
write_sysreg(hcr, hcr_el2);
isb();
}
/* We failed the translation, let's replay it in slow motion */
if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
par = handle_at_slow(vcpu, op, vaddr);
vcpu_write_sys_reg(vcpu, par, PAR_EL1);
}
void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
struct kvm_s2_trans out = {};
u64 ipa, par;
bool write;
int ret;
/* Do the stage-1 translation */
switch (op) {
case OP_AT_S12E1R:
op = OP_AT_S1E1R;
write = false;
break;
case OP_AT_S12E1W:
op = OP_AT_S1E1W;
write = true;
break;
case OP_AT_S12E0R:
op = OP_AT_S1E0R;
write = false;
break;
case OP_AT_S12E0W:
op = OP_AT_S1E0W;
write = true;
break;
default:
WARN_ON_ONCE(1);
return;
}
__kvm_at_s1e01(vcpu, op, vaddr);
par = vcpu_read_sys_reg(vcpu, PAR_EL1);
if (par & SYS_PAR_EL1_F)
return;
/*
* If we only have a single stage of translation (E2H=0 or
* TGE=1), exit early. Same thing if {VM,DC}=={0,0}.
*/
if (!vcpu_el2_e2h_is_set(vcpu) || vcpu_el2_tge_is_set(vcpu) ||
!(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC)))
return;
/* Do the stage-2 translation */
ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0));
out.esr = 0;
ret = kvm_walk_nested_s2(vcpu, ipa, &out);
if (ret < 0)
return;
/* Check the access permission */
if (!out.esr &&
((!write && !out.readable) || (write && !out.writable)))
out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3);
par = compute_par_s12(vcpu, par, &out);
vcpu_write_sys_reg(vcpu, par, PAR_EL1);
}