Two larger x86 series:

* Redo incorrect fix for SEV/SMAP erratum
 
 * Windows 11 Hyper-V workaround
 
 Other x86 changes:
 
 * Various x86 cleanups
 
 * Re-enable access_tracking_perf_test
 
 * Fix for #GP handling on SVM
 
 * Fix for CPUID leaf 0Dh in KVM_GET_SUPPORTED_CPUID
 
 * Fix for ICEBP in interrupt shadow
 
 * Avoid false-positive RCU splat
 
 * Enable Enlightened MSR-Bitmap support for real
 
 ARM:
 
 * Correctly update the shadow register on exception injection when
 running in nVHE mode
 
 * Correctly use the mm_ops indirection when performing cache invalidation
 from the page-table walker
 
 * Restrict the vgic-v3 workaround for SEIS to the two known broken
 implementations
 
 Generic code changes:
 
 * Dead code cleanup
 
 There will be another pull request for ARM fixes next week, but
 those patches need a bit more soak time.
 -----BEGIN PGP SIGNATURE-----
 
 iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmHz5eIUHHBib256aW5p
 QHJlZGhhdC5jb20ACgkQv/vSX3jHroNv4wgAopj0Zlutrrtw3KT4/XnmSdMPgN0j
 jQNzysSLTO5wGQCEogycjYXkGUDFu1Gdi+K91QAyjeKja20pIhPLeS2CBDRJyOc5
 73K7sxqz51JnQiVFzkTuA+qzn+lXaJ9LUXtdg8BnQMSKyt2AJOqE8uT10kcYOD5q
 mW4V3QUA0QpVKN0cYHv/G/zvBwQGGSLZetFbuAzwH2EDTpIi1aio5ZN1r0AoH18L
 2x5kYPpqmnoBvo2cB4b7SNmxv3ZPQ5K+wta0uwZ4pO+UuYiRd84RPr5lErywJC3w
 nci0eC0DoXrC6h+35UItqM8RqAGv6LADbDnr1RGojmfogSD0OtbX8y3hjw==
 =iKnI
 -----END PGP SIGNATURE-----

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
 "Two larger x86 series:

   - Redo incorrect fix for SEV/SMAP erratum

   - Windows 11 Hyper-V workaround

  Other x86 changes:

   - Various x86 cleanups

   - Re-enable access_tracking_perf_test

   - Fix for #GP handling on SVM

   - Fix for CPUID leaf 0Dh in KVM_GET_SUPPORTED_CPUID

   - Fix for ICEBP in interrupt shadow

   - Avoid false-positive RCU splat

   - Enable Enlightened MSR-Bitmap support for real

  ARM:

   - Correctly update the shadow register on exception injection when
     running in nVHE mode

   - Correctly use the mm_ops indirection when performing cache
     invalidation from the page-table walker

   - Restrict the vgic-v3 workaround for SEIS to the two known broken
     implementations

  Generic code changes:

   - Dead code cleanup"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (43 commits)
  KVM: eventfd: Fix false positive RCU usage warning
  KVM: nVMX: Allow VMREAD when Enlightened VMCS is in use
  KVM: nVMX: Implement evmcs_field_offset() suitable for handle_vmread()
  KVM: nVMX: Rename vmcs_to_field_offset{,_table}
  KVM: nVMX: eVMCS: Filter out VM_EXIT_SAVE_VMX_PREEMPTION_TIMER
  KVM: nVMX: Also filter MSR_IA32_VMX_TRUE_PINBASED_CTLS when eVMCS
  selftests: kvm: check dynamic bits against KVM_X86_XCOMP_GUEST_SUPP
  KVM: x86: add system attribute to retrieve full set of supported xsave states
  KVM: x86: Add a helper to retrieve userspace address from kvm_device_attr
  selftests: kvm: move vm_xsave_req_perm call to amx_test
  KVM: x86: Sync the states size with the XCR0/IA32_XSS at, any time
  KVM: x86: Update vCPU's runtime CPUID on write to MSR_IA32_XSS
  KVM: x86: Keep MSR_IA32_XSS unchanged for INIT
  KVM: x86: Free kvm_cpuid_entry2 array on post-KVM_RUN KVM_SET_CPUID{,2}
  KVM: nVMX: WARN on any attempt to allocate shadow VMCS for vmcs02
  KVM: selftests: Don't skip L2's VMCALL in SMM test for SVM guest
  KVM: x86: Check .flags in kvm_cpuid_check_equal() too
  KVM: x86: Forcibly leave nested virt when SMM state is toggled
  KVM: SVM: drop unnecessary code in svm_hv_vmcb_dirty_nested_enlightenments()
  KVM: SVM: hyper-v: Enable Enlightened MSR-Bitmap support for real
  ...
This commit is contained in:
Linus Torvalds 2022-01-28 19:00:26 +02:00
commit 3cd7cd8a62
35 changed files with 492 additions and 230 deletions

View File

@ -3268,6 +3268,7 @@ number.
:Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device,
KVM_CAP_VCPU_ATTRIBUTES for vcpu device
KVM_CAP_SYS_ATTRIBUTES for system (/dev/kvm) device (no set)
:Type: device ioctl, vm ioctl, vcpu ioctl
:Parameters: struct kvm_device_attr
:Returns: 0 on success, -1 on error
@ -3302,7 +3303,8 @@ transferred is defined by the particular attribute.
------------------------
:Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device,
KVM_CAP_VCPU_ATTRIBUTES for vcpu device
KVM_CAP_VCPU_ATTRIBUTES for vcpu device
KVM_CAP_SYS_ATTRIBUTES for system (/dev/kvm) device
:Type: device ioctl, vm ioctl, vcpu ioctl
:Parameters: struct kvm_device_attr
:Returns: 0 on success, -1 on error

View File

@ -38,7 +38,10 @@ static inline void __vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, u64 val)
{
write_sysreg_el1(val, SYS_SPSR);
if (has_vhe())
write_sysreg_el1(val, SYS_SPSR);
else
__vcpu_sys_reg(vcpu, SPSR_EL1) = val;
}
static void __vcpu_write_spsr_abt(struct kvm_vcpu *vcpu, u64 val)

View File

@ -983,13 +983,9 @@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
*/
stage2_put_pte(ptep, mmu, addr, level, mm_ops);
if (need_flush) {
kvm_pte_t *pte_follow = kvm_pte_follow(pte, mm_ops);
dcache_clean_inval_poc((unsigned long)pte_follow,
(unsigned long)pte_follow +
kvm_granule_size(level));
}
if (need_flush && mm_ops->dcache_clean_inval_poc)
mm_ops->dcache_clean_inval_poc(kvm_pte_follow(pte, mm_ops),
kvm_granule_size(level));
if (childp)
mm_ops->put_page(childp);
@ -1151,15 +1147,13 @@ static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
struct kvm_pgtable *pgt = arg;
struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
kvm_pte_t pte = *ptep;
kvm_pte_t *pte_follow;
if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pgt, pte))
return 0;
pte_follow = kvm_pte_follow(pte, mm_ops);
dcache_clean_inval_poc((unsigned long)pte_follow,
(unsigned long)pte_follow +
kvm_granule_size(level));
if (mm_ops->dcache_clean_inval_poc)
mm_ops->dcache_clean_inval_poc(kvm_pte_follow(pte, mm_ops),
kvm_granule_size(level));
return 0;
}

View File

@ -983,6 +983,9 @@ static void __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT;
/* IDbits */
val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT;
/* SEIS */
if (kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK)
val |= BIT(ICC_CTLR_EL1_SEIS_SHIFT);
/* A3V */
val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT;
/* EOImode */

View File

@ -609,6 +609,18 @@ static int __init early_gicv4_enable(char *buf)
}
early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable);
static const struct midr_range broken_seis[] = {
MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM),
MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM),
{},
};
static bool vgic_v3_broken_seis(void)
{
return ((kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) &&
is_midr_in_range_list(read_cpuid_id(), broken_seis));
}
/**
* vgic_v3_probe - probe for a VGICv3 compatible interrupt controller
* @info: pointer to the GIC description
@ -676,9 +688,10 @@ int vgic_v3_probe(const struct gic_kvm_info *info)
group1_trap = true;
}
if (kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) {
kvm_info("GICv3 with locally generated SEI\n");
if (vgic_v3_broken_seis()) {
kvm_info("GICv3 with broken locally generated SEI\n");
kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_SEIS_MASK;
group0_trap = true;
group1_trap = true;
if (ich_vtr_el2 & ICH_VTR_TDS_MASK)

View File

@ -1483,7 +1483,8 @@ struct kvm_x86_ops {
int (*get_msr_feature)(struct kvm_msr_entry *entry);
bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, void *insn, int insn_len);
bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type,
void *insn, int insn_len);
bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
@ -1496,6 +1497,7 @@ struct kvm_x86_ops {
};
struct kvm_x86_nested_ops {
void (*leave_nested)(struct kvm_vcpu *vcpu);
int (*check_events)(struct kvm_vcpu *vcpu);
bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
void (*triple_fault)(struct kvm_vcpu *vcpu);
@ -1861,7 +1863,6 @@ int kvm_cpu_has_extint(struct kvm_vcpu *v);
int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
unsigned long ipi_bitmap_high, u32 min,

View File

@ -452,6 +452,9 @@ struct kvm_sync_regs {
#define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001
/* attributes for system fd (group 0) */
#define KVM_X86_XCOMP_GUEST_SUPP 0
struct kvm_vmx_nested_state_data {
__u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
__u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];

View File

@ -133,6 +133,7 @@ static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2
orig = &vcpu->arch.cpuid_entries[i];
if (e2[i].function != orig->function ||
e2[i].index != orig->index ||
e2[i].flags != orig->flags ||
e2[i].eax != orig->eax || e2[i].ebx != orig->ebx ||
e2[i].ecx != orig->ecx || e2[i].edx != orig->edx)
return -EINVAL;
@ -196,10 +197,26 @@ void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
vcpu->arch.pv_cpuid.features = best->eax;
}
/*
* Calculate guest's supported XCR0 taking into account guest CPUID data and
* supported_xcr0 (comprised of host configuration and KVM_SUPPORTED_XCR0).
*/
static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent)
{
struct kvm_cpuid_entry2 *best;
best = cpuid_entry2_find(entries, nent, 0xd, 0);
if (!best)
return 0;
return (best->eax | ((u64)best->edx << 32)) & supported_xcr0;
}
static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries,
int nent)
{
struct kvm_cpuid_entry2 *best;
u64 guest_supported_xcr0 = cpuid_get_supported_xcr0(entries, nent);
best = cpuid_entry2_find(entries, nent, 1, 0);
if (best) {
@ -238,6 +255,21 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e
vcpu->arch.ia32_misc_enable_msr &
MSR_IA32_MISC_ENABLE_MWAIT);
}
/*
* Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate
* the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's
* requested XCR0 value. The enclave's XFRM must be a subset of XCRO
* at the time of EENTER, thus adjust the allowed XFRM by the guest's
* supported XCR0. Similar to XCR0 handling, FP and SSE are forced to
* '1' even on CPUs that don't support XSAVE.
*/
best = cpuid_entry2_find(entries, nent, 0x12, 0x1);
if (best) {
best->ecx &= guest_supported_xcr0 & 0xffffffff;
best->edx &= guest_supported_xcr0 >> 32;
best->ecx |= XFEATURE_MASK_FPSSE;
}
}
void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
@ -261,27 +293,8 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
kvm_apic_set_version(vcpu);
}
best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
if (!best)
vcpu->arch.guest_supported_xcr0 = 0;
else
vcpu->arch.guest_supported_xcr0 =
(best->eax | ((u64)best->edx << 32)) & supported_xcr0;
/*
* Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate
* the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's
* requested XCR0 value. The enclave's XFRM must be a subset of XCRO
* at the time of EENTER, thus adjust the allowed XFRM by the guest's
* supported XCR0. Similar to XCR0 handling, FP and SSE are forced to
* '1' even on CPUs that don't support XSAVE.
*/
best = kvm_find_cpuid_entry(vcpu, 0x12, 0x1);
if (best) {
best->ecx &= vcpu->arch.guest_supported_xcr0 & 0xffffffff;
best->edx &= vcpu->arch.guest_supported_xcr0 >> 32;
best->ecx |= XFEATURE_MASK_FPSSE;
}
vcpu->arch.guest_supported_xcr0 =
cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
kvm_update_pv_runtime(vcpu);
@ -346,8 +359,14 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
* KVM_SET_CPUID{,2} again. To support this legacy behavior, check
* whether the supplied CPUID data is equal to what's already set.
*/
if (vcpu->arch.last_vmentry_cpu != -1)
return kvm_cpuid_check_equal(vcpu, e2, nent);
if (vcpu->arch.last_vmentry_cpu != -1) {
r = kvm_cpuid_check_equal(vcpu, e2, nent);
if (r)
return r;
kvfree(e2);
return 0;
}
r = kvm_check_cpuid(vcpu, e2, nent);
if (r)
@ -887,13 +906,14 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
}
break;
case 0xd: {
u64 guest_perm = xstate_get_guest_group_perm();
u64 permitted_xcr0 = supported_xcr0 & xstate_get_guest_group_perm();
u64 permitted_xss = supported_xss;
entry->eax &= supported_xcr0 & guest_perm;
entry->ebx = xstate_required_size(supported_xcr0, false);
entry->eax &= permitted_xcr0;
entry->ebx = xstate_required_size(permitted_xcr0, false);
entry->ecx = entry->ebx;
entry->edx &= (supported_xcr0 & guest_perm) >> 32;
if (!supported_xcr0)
entry->edx &= permitted_xcr0 >> 32;
if (!permitted_xcr0)
break;
entry = do_host_cpuid(array, function, 1);
@ -902,20 +922,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
cpuid_entry_override(entry, CPUID_D_1_EAX);
if (entry->eax & (F(XSAVES)|F(XSAVEC)))
entry->ebx = xstate_required_size(supported_xcr0 | supported_xss,
entry->ebx = xstate_required_size(permitted_xcr0 | permitted_xss,
true);
else {
WARN_ON_ONCE(supported_xss != 0);
WARN_ON_ONCE(permitted_xss != 0);
entry->ebx = 0;
}
entry->ecx &= supported_xss;
entry->edx &= supported_xss >> 32;
entry->ecx &= permitted_xss;
entry->edx &= permitted_xss >> 32;
for (i = 2; i < 64; ++i) {
bool s_state;
if (supported_xcr0 & BIT_ULL(i))
if (permitted_xcr0 & BIT_ULL(i))
s_state = false;
else if (supported_xss & BIT_ULL(i))
else if (permitted_xss & BIT_ULL(i))
s_state = true;
else
continue;
@ -929,7 +949,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
* invalid sub-leafs. Only valid sub-leafs should
* reach this point, and they should have a non-zero
* save state size. Furthermore, check whether the
* processor agrees with supported_xcr0/supported_xss
* processor agrees with permitted_xcr0/permitted_xss
* on whether this is an XCR0- or IA32_XSS-managed area.
*/
if (WARN_ON_ONCE(!entry->eax || (entry->ecx & 0x1) != s_state)) {

View File

@ -2629,7 +2629,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
kvm_apic_set_version(vcpu);
apic_update_ppr(apic);
hrtimer_cancel(&apic->lapic_timer.timer);
cancel_apic_timer(apic);
apic->lapic_timer.expired_tscdeadline = 0;
apic_update_lvtt(apic);
apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));

View File

@ -983,9 +983,9 @@ void svm_free_nested(struct vcpu_svm *svm)
/*
* Forcibly leave nested mode in order to be able to reset the VCPU later on.
*/
void svm_leave_nested(struct vcpu_svm *svm)
void svm_leave_nested(struct kvm_vcpu *vcpu)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
struct vcpu_svm *svm = to_svm(vcpu);
if (is_guest_mode(vcpu)) {
svm->nested.nested_run_pending = 0;
@ -1411,7 +1411,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
return -EINVAL;
if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) {
svm_leave_nested(svm);
svm_leave_nested(vcpu);
svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
return 0;
}
@ -1478,7 +1478,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
*/
if (is_guest_mode(vcpu))
svm_leave_nested(svm);
svm_leave_nested(vcpu);
else
svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
@ -1532,6 +1532,7 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
}
struct kvm_x86_nested_ops svm_nested_ops = {
.leave_nested = svm_leave_nested,
.check_events = svm_check_nested_events,
.triple_fault = nested_svm_triple_fault,
.get_nested_state_pages = svm_get_nested_state_pages,

View File

@ -2100,8 +2100,13 @@ void __init sev_hardware_setup(void)
if (!sev_enabled || !npt_enabled)
goto out;
/* Does the CPU support SEV? */
if (!boot_cpu_has(X86_FEATURE_SEV))
/*
* SEV must obviously be supported in hardware. Sanity check that the
* CPU supports decode assists, which is mandatory for SEV guests to
* support instruction emulation.
*/
if (!boot_cpu_has(X86_FEATURE_SEV) ||
WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS)))
goto out;
/* Retrieve SEV CPUID information */

View File

@ -290,7 +290,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
if (!(efer & EFER_SVME)) {
svm_leave_nested(svm);
svm_leave_nested(vcpu);
svm_set_gif(svm, true);
/* #GP intercept is still needed for vmware backdoor */
if (!enable_vmware_backdoor)
@ -312,7 +312,11 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
return ret;
}
if (svm_gp_erratum_intercept)
/*
* Never intercept #GP for SEV guests, KVM can't
* decrypt guest memory to workaround the erratum.
*/
if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
set_exception_intercept(svm, GP_VECTOR);
}
}
@ -1010,9 +1014,10 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
* Guest access to VMware backdoor ports could legitimately
* trigger #GP because of TSS I/O permission bitmap.
* We intercept those #GP and allow access to them anyway
* as VMware does.
* as VMware does. Don't intercept #GP for SEV guests as KVM can't
* decrypt guest memory to decode the faulting instruction.
*/
if (enable_vmware_backdoor)
if (enable_vmware_backdoor && !sev_guest(vcpu->kvm))
set_exception_intercept(svm, GP_VECTOR);
svm_set_intercept(svm, INTERCEPT_INTR);
@ -2091,10 +2096,6 @@ static int gp_interception(struct kvm_vcpu *vcpu)
if (error_code)
goto reinject;
/* All SVM instructions expect page aligned RAX */
if (svm->vmcb->save.rax & ~PAGE_MASK)
goto reinject;
/* Decode the instruction for usage later */
if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
goto reinject;
@ -2112,8 +2113,13 @@ static int gp_interception(struct kvm_vcpu *vcpu)
if (!is_guest_mode(vcpu))
return kvm_emulate_instruction(vcpu,
EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
} else
} else {
/* All SVM instructions expect page aligned RAX */
if (svm->vmcb->save.rax & ~PAGE_MASK)
goto reinject;
return emulate_svm_instr(vcpu, opcode);
}
reinject:
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
@ -4252,79 +4258,140 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
}
}
static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
void *insn, int insn_len)
{
bool smep, smap, is_user;
unsigned long cr4;
u64 error_code;
/* Emulation is always possible when KVM has access to all guest state. */
if (!sev_guest(vcpu->kvm))
return true;
/* #UD and #GP should never be intercepted for SEV guests. */
WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
EMULTYPE_TRAP_UD_FORCED |
EMULTYPE_VMWARE_GP));
/*
* When the guest is an SEV-ES guest, emulation is not possible.
* Emulation is impossible for SEV-ES guests as KVM doesn't have access
* to guest register state.
*/
if (sev_es_guest(vcpu->kvm))
return false;
/*
* Detect and workaround Errata 1096 Fam_17h_00_0Fh.
*
* Errata:
* When CPU raise #NPF on guest data access and vCPU CR4.SMAP=1, it is
* possible that CPU microcode implementing DecodeAssist will fail
* to read bytes of instruction which caused #NPF. In this case,
* GuestIntrBytes field of the VMCB on a VMEXIT will incorrectly
* return 0 instead of the correct guest instruction bytes.
*
* This happens because CPU microcode reading instruction bytes
* uses a special opcode which attempts to read data using CPL=0
* privileges. The microcode reads CS:RIP and if it hits a SMAP
* fault, it gives up and returns no instruction bytes.
*
* Detection:
* We reach here in case CPU supports DecodeAssist, raised #NPF and
* returned 0 in GuestIntrBytes field of the VMCB.
* First, errata can only be triggered in case vCPU CR4.SMAP=1.
* Second, if vCPU CR4.SMEP=1, errata could only be triggered
* in case vCPU CPL==3 (Because otherwise guest would have triggered
* a SMEP fault instead of #NPF).
* Otherwise, vCPU CR4.SMEP=0, errata could be triggered by any vCPU CPL.
* As most guests enable SMAP if they have also enabled SMEP, use above
* logic in order to attempt minimize false-positive of detecting errata
* while still preserving all cases semantic correctness.
*
* Workaround:
* To determine what instruction the guest was executing, the hypervisor
* will have to decode the instruction at the instruction pointer.
*
* In non SEV guest, hypervisor will be able to read the guest
* memory to decode the instruction pointer when insn_len is zero
* so we return true to indicate that decoding is possible.
*
* But in the SEV guest, the guest memory is encrypted with the
* guest specific key and hypervisor will not be able to decode the
* instruction pointer so we will not able to workaround it. Lets
* print the error and request to kill the guest.
* Emulation is possible if the instruction is already decoded, e.g.
* when completing I/O after returning from userspace.
*/
if (likely(!insn || insn_len))
if (emul_type & EMULTYPE_NO_DECODE)
return true;
/*
* If RIP is invalid, go ahead with emulation which will cause an
* internal error exit.
* Emulation is possible for SEV guests if and only if a prefilled
* buffer containing the bytes of the intercepted instruction is
* available. SEV guest memory is encrypted with a guest specific key
* and cannot be decrypted by KVM, i.e. KVM would read cyphertext and
* decode garbage.
*
* Inject #UD if KVM reached this point without an instruction buffer.
* In practice, this path should never be hit by a well-behaved guest,
* e.g. KVM doesn't intercept #UD or #GP for SEV guests, but this path
* is still theoretically reachable, e.g. via unaccelerated fault-like
* AVIC access, and needs to be handled by KVM to avoid putting the
* guest into an infinite loop. Injecting #UD is somewhat arbitrary,
* but its the least awful option given lack of insight into the guest.
*/
if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT))
if (unlikely(!insn)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return false;
}
/*
* Emulate for SEV guests if the insn buffer is not empty. The buffer
* will be empty if the DecodeAssist microcode cannot fetch bytes for
* the faulting instruction because the code fetch itself faulted, e.g.
* the guest attempted to fetch from emulated MMIO or a guest page
* table used to translate CS:RIP resides in emulated MMIO.
*/
if (likely(insn_len))
return true;
/*
* Detect and workaround Errata 1096 Fam_17h_00_0Fh.
*
* Errata:
* When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is
* possible that CPU microcode implementing DecodeAssist will fail to
* read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly
* be '0'. This happens because microcode reads CS:RIP using a _data_
* loap uop with CPL=0 privileges. If the load hits a SMAP #PF, ucode
* gives up and does not fill the instruction bytes buffer.
*
* As above, KVM reaches this point iff the VM is an SEV guest, the CPU
* supports DecodeAssist, a #NPF was raised, KVM's page fault handler
* triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
* GuestIntrBytes field of the VMCB.
*
* This does _not_ mean that the erratum has been encountered, as the
* DecodeAssist will also fail if the load for CS:RIP hits a legitimate
* #PF, e.g. if the guest attempt to execute from emulated MMIO and
* encountered a reserved/not-present #PF.
*
* To hit the erratum, the following conditions must be true:
* 1. CR4.SMAP=1 (obviously).
* 2. CR4.SMEP=0 || CPL=3. If SMEP=1 and CPL<3, the erratum cannot
* have been hit as the guest would have encountered a SMEP
* violation #PF, not a #NPF.
* 3. The #NPF is not due to a code fetch, in which case failure to
* retrieve the instruction bytes is legitimate (see abvoe).
*
* In addition, don't apply the erratum workaround if the #NPF occurred
* while translating guest page tables (see below).
*/
error_code = to_svm(vcpu)->vmcb->control.exit_info_1;
if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
goto resume_guest;
cr4 = kvm_read_cr4(vcpu);
smep = cr4 & X86_CR4_SMEP;
smap = cr4 & X86_CR4_SMAP;
is_user = svm_get_cpl(vcpu) == 3;
if (smap && (!smep || is_user)) {
if (!sev_guest(vcpu->kvm))
return true;
pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
/*
* If the fault occurred in userspace, arbitrarily inject #GP
* to avoid killing the guest and to hopefully avoid confusing
* the guest kernel too much, e.g. injecting #PF would not be
* coherent with respect to the guest's page tables. Request
* triple fault if the fault occurred in the kernel as there's
* no fault that KVM can inject without confusing the guest.
* In practice, the triple fault is moot as no sane SEV kernel
* will execute from user memory while also running with SMAP=1.
*/
if (is_user)
kvm_inject_gp(vcpu, 0);
else
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
}
resume_guest:
/*
* If the erratum was not hit, simply resume the guest and let it fault
* again. While awful, e.g. the vCPU may get stuck in an infinite loop
* if the fault is at CPL=0, it's the lesser of all evils. Exiting to
* userspace will kill the guest, and letting the emulator read garbage
* will yield random behavior and potentially corrupt the guest.
*
* Simply resuming the guest is technically not a violation of the SEV
* architecture. AMD's APM states that all code fetches and page table
* accesses for SEV guest are encrypted, regardless of the C-Bit. The
* APM also states that encrypted accesses to MMIO are "ignored", but
* doesn't explicitly define "ignored", i.e. doing nothing and letting
* the guest spin is technically "ignoring" the access.
*/
return false;
}

View File

@ -304,11 +304,6 @@ static inline void vmcb_mark_all_clean(struct vmcb *vmcb)
& ~VMCB_ALWAYS_DIRTY_MASK;
}
static inline bool vmcb_is_clean(struct vmcb *vmcb, int bit)
{
return (vmcb->control.clean & (1 << bit));
}
static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit)
{
vmcb->control.clean &= ~(1 << bit);
@ -525,7 +520,7 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
int enter_svm_guest_mode(struct kvm_vcpu *vcpu,
u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun);
void svm_leave_nested(struct vcpu_svm *svm);
void svm_leave_nested(struct kvm_vcpu *vcpu);
void svm_free_nested(struct vcpu_svm *svm);
int svm_allocate_nested(struct vcpu_svm *svm);
int nested_svm_vmrun(struct kvm_vcpu *vcpu);

View File

@ -46,6 +46,9 @@ static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
if (npt_enabled &&
ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB)
hve->hv_enlightenments_control.enlightened_npt_tlb = 1;
if (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)
hve->hv_enlightenments_control.msr_bitmap = 1;
}
static inline void svm_hv_hardware_setup(void)
@ -83,14 +86,7 @@ static inline void svm_hv_vmcb_dirty_nested_enlightenments(
struct hv_enlightenments *hve =
(struct hv_enlightenments *)vmcb->control.reserved_sw;
/*
* vmcb can be NULL if called during early vcpu init.
* And its okay not to mark vmcb dirty during vcpu init
* as we mark it dirty unconditionally towards end of vcpu
* init phase.
*/
if (vmcb_is_clean(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS) &&
hve->hv_enlightenments_control.msr_bitmap)
if (hve->hv_enlightenments_control.msr_bitmap)
vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS);
}

View File

@ -54,7 +54,6 @@ struct nested_vmx_msrs {
struct vmcs_config {
int size;
int order;
u32 basic_cap;
u32 revision_id;
u32 pin_based_exec_ctrl;

View File

@ -12,8 +12,6 @@
DEFINE_STATIC_KEY_FALSE(enable_evmcs);
#if IS_ENABLED(CONFIG_HYPERV)
#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x)
#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \
{EVMCS1_OFFSET(name), clean_field}
@ -296,6 +294,7 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = {
};
const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1);
#if IS_ENABLED(CONFIG_HYPERV)
__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
{
vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL;
@ -362,6 +361,7 @@ void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata)
case MSR_IA32_VMX_PROCBASED_CTLS2:
ctl_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
break;
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
case MSR_IA32_VMX_PINBASED_CTLS:
ctl_high &= ~EVMCS1_UNSUPPORTED_PINCTRL;
break;

View File

@ -59,12 +59,12 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs);
SECONDARY_EXEC_SHADOW_VMCS | \
SECONDARY_EXEC_TSC_SCALING | \
SECONDARY_EXEC_PAUSE_LOOP_EXITING)
#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL \
(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING)
#if IS_ENABLED(CONFIG_HYPERV)
struct evmcs_field {
u16 offset;
u16 clean_field;
@ -73,26 +73,56 @@ struct evmcs_field {
extern const struct evmcs_field vmcs_field_to_evmcs_1[];
extern const unsigned int nr_evmcs_1_fields;
static __always_inline int get_evmcs_offset(unsigned long field,
u16 *clean_field)
static __always_inline int evmcs_field_offset(unsigned long field,
u16 *clean_field)
{
unsigned int index = ROL16(field, 6);
const struct evmcs_field *evmcs_field;
if (unlikely(index >= nr_evmcs_1_fields)) {
WARN_ONCE(1, "KVM: accessing unsupported EVMCS field %lx\n",
field);
if (unlikely(index >= nr_evmcs_1_fields))
return -ENOENT;
}
evmcs_field = &vmcs_field_to_evmcs_1[index];
/*
* Use offset=0 to detect holes in eVMCS. This offset belongs to
* 'revision_id' but this field has no encoding and is supposed to
* be accessed directly.
*/
if (unlikely(!evmcs_field->offset))
return -ENOENT;
if (clean_field)
*clean_field = evmcs_field->clean_field;
return evmcs_field->offset;
}
static inline u64 evmcs_read_any(struct hv_enlightened_vmcs *evmcs,
unsigned long field, u16 offset)
{
/*
* vmcs12_read_any() doesn't care whether the supplied structure
* is 'struct vmcs12' or 'struct hv_enlightened_vmcs' as it takes
* the exact offset of the required field, use it for convenience
* here.
*/
return vmcs12_read_any((void *)evmcs, field, offset);
}
#if IS_ENABLED(CONFIG_HYPERV)
static __always_inline int get_evmcs_offset(unsigned long field,
u16 *clean_field)
{
int offset = evmcs_field_offset(field, clean_field);
WARN_ONCE(offset < 0, "KVM: accessing unsupported EVMCS field %lx\n",
field);
return offset;
}
static __always_inline void evmcs_write64(unsigned long field, u64 value)
{
u16 clean_field;

View File

@ -7,6 +7,7 @@
#include <asm/mmu_context.h>
#include "cpuid.h"
#include "evmcs.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
@ -4851,18 +4852,20 @@ static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
/*
* We should allocate a shadow vmcs for vmcs01 only when L1
* executes VMXON and free it when L1 executes VMXOFF.
* As it is invalid to execute VMXON twice, we shouldn't reach
* here when vmcs01 already have an allocated shadow vmcs.
* KVM allocates a shadow VMCS only when L1 executes VMXON and frees it
* when L1 executes VMXOFF or the vCPU is forced out of nested
* operation. VMXON faults if the CPU is already post-VMXON, so it
* should be impossible to already have an allocated shadow VMCS. KVM
* doesn't support virtualization of VMCS shadowing, so vmcs01 should
* always be the loaded VMCS.
*/
WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs))
return loaded_vmcs->shadow_vmcs;
loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
if (loaded_vmcs->shadow_vmcs)
vmcs_clear(loaded_vmcs->shadow_vmcs);
if (!loaded_vmcs->shadow_vmcs) {
loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
if (loaded_vmcs->shadow_vmcs)
vmcs_clear(loaded_vmcs->shadow_vmcs);
}
return loaded_vmcs->shadow_vmcs;
}
@ -5099,27 +5102,49 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
if (!nested_vmx_check_permission(vcpu))
return 1;
/*
* In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
* any VMREAD sets the ALU flags for VMfailInvalid.
*/
if (vmx->nested.current_vmptr == INVALID_GPA ||
(is_guest_mode(vcpu) &&
get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
return nested_vmx_failInvalid(vcpu);
/* Decode instruction info and find the field to read */
field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
offset = vmcs_field_to_offset(field);
if (offset < 0)
return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
if (!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
/*
* In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
* any VMREAD sets the ALU flags for VMfailInvalid.
*/
if (vmx->nested.current_vmptr == INVALID_GPA ||
(is_guest_mode(vcpu) &&
get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
return nested_vmx_failInvalid(vcpu);
if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
offset = get_vmcs12_field_offset(field);
if (offset < 0)
return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
/* Read the field, zero-extended to a u64 value */
value = vmcs12_read_any(vmcs12, field, offset);
if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
/* Read the field, zero-extended to a u64 value */
value = vmcs12_read_any(vmcs12, field, offset);
} else {
/*
* Hyper-V TLFS (as of 6.0b) explicitly states, that while an
* enlightened VMCS is active VMREAD/VMWRITE instructions are
* unsupported. Unfortunately, certain versions of Windows 11
* don't comply with this requirement which is not enforced in
* genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a
* workaround, as misbehaving guests will panic on VM-Fail.
* Note, enlightened VMCS is incompatible with shadow VMCS so
* all VMREADs from L2 should go to L1.
*/
if (WARN_ON_ONCE(is_guest_mode(vcpu)))
return nested_vmx_failInvalid(vcpu);
offset = evmcs_field_offset(field, NULL);
if (offset < 0)
return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
/* Read the field, zero-extended to a u64 value */
value = evmcs_read_any(vmx->nested.hv_evmcs, field, offset);
}
/*
* Now copy part of this value to register or memory, as requested.
@ -5214,7 +5239,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
offset = vmcs_field_to_offset(field);
offset = get_vmcs12_field_offset(field);
if (offset < 0)
return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
@ -6462,7 +6487,7 @@ static u64 nested_vmx_calc_vmcs_enum_msr(void)
max_idx = 0;
for (i = 0; i < nr_vmcs12_fields; i++) {
/* The vmcs12 table is very, very sparsely populated. */
if (!vmcs_field_to_offset_table[i])
if (!vmcs12_field_offsets[i])
continue;
idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
@ -6771,6 +6796,7 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
}
struct kvm_x86_nested_ops vmx_nested_ops = {
.leave_nested = vmx_leave_nested,
.check_events = vmx_check_nested_events,
.hv_timer_pending = nested_vmx_preemption_timer_pending,
.triple_fault = nested_vmx_triple_fault,

View File

@ -8,7 +8,7 @@
FIELD(number, name), \
[ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32)
const unsigned short vmcs_field_to_offset_table[] = {
const unsigned short vmcs12_field_offsets[] = {
FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
FIELD(POSTED_INTR_NV, posted_intr_nv),
FIELD(GUEST_ES_SELECTOR, guest_es_selector),
@ -151,4 +151,4 @@ const unsigned short vmcs_field_to_offset_table[] = {
FIELD(HOST_RSP, host_rsp),
FIELD(HOST_RIP, host_rip),
};
const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs_field_to_offset_table);
const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs12_field_offsets);

View File

@ -361,10 +361,10 @@ static inline void vmx_check_vmcs12_offsets(void)
CHECK_OFFSET(guest_pml_index, 996);
}
extern const unsigned short vmcs_field_to_offset_table[];
extern const unsigned short vmcs12_field_offsets[];
extern const unsigned int nr_vmcs12_fields;
static inline short vmcs_field_to_offset(unsigned long field)
static inline short get_vmcs12_field_offset(unsigned long field)
{
unsigned short offset;
unsigned int index;
@ -377,7 +377,7 @@ static inline short vmcs_field_to_offset(unsigned long field)
return -ENOENT;
index = array_index_nospec(index, nr_vmcs12_fields);
offset = vmcs_field_to_offset_table[index];
offset = vmcs12_field_offsets[index];
if (offset == 0)
return -ENOENT;
return offset;

View File

@ -1487,11 +1487,12 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
return 0;
}
static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
void *insn, int insn_len)
{
/*
* Emulation of instructions in SGX enclaves is impossible as RIP does
* not point tthe failing instruction, and even if it did, the code
* not point at the failing instruction, and even if it did, the code
* stream is inaccessible. Inject #UD instead of exiting to userspace
* so that guest userspace can't DoS the guest simply by triggering
* emulation (enclaves are CPL3 only).
@ -2603,7 +2604,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
return -EIO;
vmcs_conf->size = vmx_msr_high & 0x1fff;
vmcs_conf->order = get_order(vmcs_conf->size);
vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
vmcs_conf->revision_id = vmx_msr_low;
@ -2628,7 +2628,7 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
struct page *pages;
struct vmcs *vmcs;
pages = __alloc_pages_node(node, flags, vmcs_config.order);
pages = __alloc_pages_node(node, flags, 0);
if (!pages)
return NULL;
vmcs = page_address(pages);
@ -2647,7 +2647,7 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
void free_vmcs(struct vmcs *vmcs)
{
free_pages((unsigned long)vmcs, vmcs_config.order);
free_page((unsigned long)vmcs);
}
/*
@ -4094,10 +4094,14 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
/*
* If 32-bit syscall is enabled, vmx_vcpu_load_vcms rewrites
* HOST_IA32_SYSENTER_ESP.
* SYSENTER is used for 32-bit system calls on either 32-bit or
* 64-bit kernels. It is always zero If neither is allowed, otherwise
* vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may
* have already done so!).
*/
vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32))
vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
@ -4901,8 +4905,33 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
dr6 = vmx_get_exit_qual(vcpu);
if (!(vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
/*
* If the #DB was due to ICEBP, a.k.a. INT1, skip the
* instruction. ICEBP generates a trap-like #DB, but
* despite its interception control being tied to #DB,
* is an instruction intercept, i.e. the VM-Exit occurs
* on the ICEBP itself. Note, skipping ICEBP also
* clears STI and MOVSS blocking.
*
* For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS
* if single-step is enabled in RFLAGS and STI or MOVSS
* blocking is active, as the CPU doesn't set the bit
* on VM-Exit due to #DB interception. VM-Entry has a
* consistency check that a single-step #DB is pending
* in this scenario as the previous instruction cannot
* have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV
* don't modify RFLAGS), therefore the one instruction
* delay when activating single-step breakpoints must
* have already expired. Note, the CPU sets/clears BS
* as appropriate for all other VM-Exits types.
*/
if (is_icebp(intr_info))
WARN_ON(!skip_emulated_instruction(vcpu));
else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)))
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);
kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
return 1;
@ -5397,7 +5426,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
{
gpa_t gpa;
if (!vmx_can_emulate_instruction(vcpu, NULL, 0))
if (!vmx_can_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
return 1;
/*

View File

@ -3535,6 +3535,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (data & ~supported_xss)
return 1;
vcpu->arch.ia32_xss = data;
kvm_update_cpuid_runtime(vcpu);
break;
case MSR_SMI_COUNT:
if (!msr_info->host_initiated)
@ -4229,6 +4230,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_SREGS2:
case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
case KVM_CAP_VCPU_ATTRIBUTES:
case KVM_CAP_SYS_ATTRIBUTES:
r = 1;
break;
case KVM_CAP_EXIT_HYPERCALL:
@ -4331,7 +4333,49 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
break;
}
return r;
}
static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr)
{
void __user *uaddr = (void __user*)(unsigned long)attr->addr;
if ((u64)(unsigned long)uaddr != attr->addr)
return ERR_PTR(-EFAULT);
return uaddr;
}
static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr)
{
u64 __user *uaddr = kvm_get_attr_addr(attr);
if (attr->group)
return -ENXIO;
if (IS_ERR(uaddr))
return PTR_ERR(uaddr);
switch (attr->attr) {
case KVM_X86_XCOMP_GUEST_SUPP:
if (put_user(supported_xcr0, uaddr))
return -EFAULT;
return 0;
default:
return -ENXIO;
break;
}
}
static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr)
{
if (attr->group)
return -ENXIO;
switch (attr->attr) {
case KVM_X86_XCOMP_GUEST_SUPP:
return 0;
default:
return -ENXIO;
}
}
long kvm_arch_dev_ioctl(struct file *filp,
@ -4422,6 +4466,22 @@ long kvm_arch_dev_ioctl(struct file *filp,
case KVM_GET_SUPPORTED_HV_CPUID:
r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp);
break;
case KVM_GET_DEVICE_ATTR: {
struct kvm_device_attr attr;
r = -EFAULT;
if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
break;
r = kvm_x86_dev_get_attr(&attr);
break;
}
case KVM_HAS_DEVICE_ATTR: {
struct kvm_device_attr attr;
r = -EFAULT;
if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
break;
r = kvm_x86_dev_has_attr(&attr);
break;
}
default:
r = -EINVAL;
break;
@ -4860,8 +4920,10 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.apic->sipi_vector = events->sipi_vector;
if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm)
if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
kvm_x86_ops.nested_ops->leave_nested(vcpu);
kvm_smm_changed(vcpu, events->smi.smm);
}
vcpu->arch.smi_pending = events->smi.pending;
@ -5022,11 +5084,11 @@ static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu,
static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr)
{
u64 __user *uaddr = (u64 __user *)(unsigned long)attr->addr;
u64 __user *uaddr = kvm_get_attr_addr(attr);
int r;
if ((u64)(unsigned long)uaddr != attr->addr)
return -EFAULT;
if (IS_ERR(uaddr))
return PTR_ERR(uaddr);
switch (attr->attr) {
case KVM_VCPU_TSC_OFFSET:
@ -5045,12 +5107,12 @@ static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr)
{
u64 __user *uaddr = (u64 __user *)(unsigned long)attr->addr;
u64 __user *uaddr = kvm_get_attr_addr(attr);
struct kvm *kvm = vcpu->kvm;
int r;
if ((u64)(unsigned long)uaddr != attr->addr)
return -EFAULT;
if (IS_ERR(uaddr))
return PTR_ERR(uaddr);
switch (attr->attr) {
case KVM_VCPU_TSC_OFFSET: {
@ -6810,6 +6872,13 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
}
EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
static int kvm_can_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
void *insn, int insn_len)
{
return static_call(kvm_x86_can_emulate_instruction)(vcpu, emul_type,
insn, insn_len);
}
int handle_ud(struct kvm_vcpu *vcpu)
{
static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX };
@ -6817,7 +6886,7 @@ int handle_ud(struct kvm_vcpu *vcpu)
char sig[5]; /* ud2; .ascii "kvm" */
struct x86_exception e;
if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, NULL, 0)))
if (unlikely(!kvm_can_emulate_insn(vcpu, emul_type, NULL, 0)))
return 1;
if (force_emulation_prefix &&
@ -8193,7 +8262,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
bool writeback = true;
bool write_fault_to_spt;
if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, insn, insn_len)))
if (unlikely(!kvm_can_emulate_insn(vcpu, emulation_type, insn, insn_len)))
return 1;
vcpu->arch.l1tf_flush_l1d = true;
@ -9706,7 +9775,7 @@ void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
}
void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
{
if (!lapic_in_kernel(vcpu))
return;
@ -11209,7 +11278,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.msr_misc_features_enables = 0;
vcpu->arch.xcr0 = XFEATURE_MASK_FP;
__kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP);
__kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true);
}
/* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */
@ -11226,8 +11296,6 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1, 0);
kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600);
vcpu->arch.ia32_xss = 0;
static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);

View File

@ -316,10 +316,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
"\tnotq %0\n"
"\t" LOCK_PREFIX "andq %0, %2\n"
"2:\n"
"\t.section .fixup,\"ax\"\n"
"3:\tjmp\t2b\n"
"\t.previous\n"
_ASM_EXTABLE_UA(1b, 3b)
_ASM_EXTABLE_UA(1b, 2b)
: "=r" (evtchn_pending_sel),
"+m" (vi->evtchn_pending_sel),
"+m" (v->arch.xen.evtchn_pending_sel)
@ -335,10 +332,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
"\tnotl %0\n"
"\t" LOCK_PREFIX "andl %0, %2\n"
"2:\n"
"\t.section .fixup,\"ax\"\n"
"3:\tjmp\t2b\n"
"\t.previous\n"
_ASM_EXTABLE_UA(1b, 3b)
_ASM_EXTABLE_UA(1b, 2b)
: "=r" (evtchn_pending_sel32),
"+m" (vi->evtchn_pending_sel),
"+m" (v->arch.xen.evtchn_pending_sel)

View File

@ -1133,6 +1133,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206
#define KVM_CAP_VM_GPA_BITS 207
#define KVM_CAP_XSAVE2 208
#define KVM_CAP_SYS_ATTRIBUTES 209
#ifdef KVM_CAP_IRQ_ROUTING

View File

@ -452,6 +452,9 @@ struct kvm_sync_regs {
#define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001
/* attributes for system fd (group 0) */
#define KVM_X86_XCOMP_GUEST_SUPP 0
struct kvm_vmx_nested_state_data {
__u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
__u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];

View File

@ -1133,6 +1133,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206
#define KVM_CAP_VM_GPA_BITS 207
#define KVM_CAP_XSAVE2 208
#define KVM_CAP_SYS_ATTRIBUTES 209
#ifdef KVM_CAP_IRQ_ROUTING

View File

@ -85,6 +85,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_pi_mmio_test
TEST_GEN_PROGS_x86_64 += x86_64/sev_migrate_tests
TEST_GEN_PROGS_x86_64 += x86_64/amx_test
TEST_GEN_PROGS_x86_64 += access_tracking_perf_test
TEST_GEN_PROGS_x86_64 += demand_paging_test
TEST_GEN_PROGS_x86_64 += dirty_log_test
TEST_GEN_PROGS_x86_64 += dirty_log_perf_test

View File

@ -345,7 +345,6 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
* guest_code - The vCPU's entry point
*/
void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code);
void vm_xsave_req_perm(void);
bool vm_is_unrestricted_guest(struct kvm_vm *vm);

View File

@ -458,6 +458,7 @@ uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void);
void vcpu_set_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid);
struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid);
void vm_xsave_req_perm(int bit);
enum x86_page_size {
X86_PAGE_SIZE_4K = 0,

View File

@ -393,13 +393,6 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
struct kvm_vm *vm;
int i;
#ifdef __x86_64__
/*
* Permission needs to be requested before KVM_SET_CPUID2.
*/
vm_xsave_req_perm();
#endif
/* Force slot0 memory size not small than DEFAULT_GUEST_PHY_PAGES */
if (slot0_mem_pages < DEFAULT_GUEST_PHY_PAGES)
slot0_mem_pages = DEFAULT_GUEST_PHY_PAGES;

View File

@ -665,16 +665,31 @@ static bool is_xfd_supported(void)
return !!(eax & CPUID_XFD_BIT);
}
void vm_xsave_req_perm(void)
void vm_xsave_req_perm(int bit)
{
unsigned long bitmask;
int kvm_fd;
u64 bitmask;
long rc;
struct kvm_device_attr attr = {
.group = 0,
.attr = KVM_X86_XCOMP_GUEST_SUPP,
.addr = (unsigned long) &bitmask
};
kvm_fd = open_kvm_dev_path_or_exit();
rc = ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr);
close(kvm_fd);
if (rc == -1 && (errno == ENXIO || errno == EINVAL))
exit(KSFT_SKIP);
TEST_ASSERT(rc == 0, "KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) error: %ld", rc);
if (!(bitmask & (1ULL << bit)))
exit(KSFT_SKIP);
if (!is_xfd_supported())
return;
exit(KSFT_SKIP);
rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, bit);
rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM,
XSTATE_XTILE_DATA_BIT);
/*
* The older kernel version(<5.15) can't support
* ARCH_REQ_XCOMP_GUEST_PERM and directly return.
@ -684,7 +699,7 @@ void vm_xsave_req_perm(void)
rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &bitmask);
TEST_ASSERT(rc == 0, "prctl(ARCH_GET_XCOMP_GUEST_PERM) error: %ld", rc);
TEST_ASSERT(bitmask & XFEATURE_XTILE_MASK,
TEST_ASSERT(bitmask & (1ULL << bit),
"prctl(ARCH_REQ_XCOMP_GUEST_PERM) failure bitmask=0x%lx",
bitmask);
}

View File

@ -329,6 +329,8 @@ int main(int argc, char *argv[])
u32 amx_offset;
int stage, ret;
vm_xsave_req_perm(XSTATE_XTILE_DATA_BIT);
/* Create VM */
vm = vm_create_default(VCPU_ID, 0, guest_code);

View File

@ -105,7 +105,6 @@ static void guest_code(void *arg)
if (cpu_has_svm()) {
run_guest(svm->vmcb, svm->vmcb_gpa);
svm->vmcb->save.rip += 3;
run_guest(svm->vmcb, svm->vmcb_gpa);
} else {
vmlaunch();

View File

@ -463,8 +463,8 @@ bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
idx = srcu_read_lock(&kvm->irq_srcu);
gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
if (gsi != -1)
hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
link)
hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list,
link, srcu_read_lock_held(&kvm->irq_srcu))
if (kian->gsi == gsi) {
srcu_read_unlock(&kvm->irq_srcu, idx);
return true;
@ -480,8 +480,8 @@ void kvm_notify_acked_gsi(struct kvm *kvm, int gsi)
{
struct kvm_irq_ack_notifier *kian;
hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
link)
hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list,
link, srcu_read_lock_held(&kvm->irq_srcu))
if (kian->gsi == gsi)
kian->irq_acked(kian);
}

View File

@ -2248,7 +2248,6 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn
return NULL;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot);
bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
@ -2463,9 +2462,8 @@ static int kvm_try_get_pfn(kvm_pfn_t pfn)
}
static int hva_to_pfn_remapped(struct vm_area_struct *vma,
unsigned long addr, bool *async,
bool write_fault, bool *writable,
kvm_pfn_t *p_pfn)
unsigned long addr, bool write_fault,
bool *writable, kvm_pfn_t *p_pfn)
{
kvm_pfn_t pfn;
pte_t *ptep;
@ -2575,7 +2573,7 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
if (vma == NULL)
pfn = KVM_PFN_ERR_FAULT;
else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn);
if (r == -EAGAIN)
goto retry;
if (r < 0)