- selftest compilation fix for non-x86
 
 - KVM: avoid warning on s390 in mark_page_dirty
 
 x86:
 - fix page write-protection bug and improve comments
 
 - use binary search to lookup the PMU event filter, add test
 
 - enable_pmu module parameter support for Intel CPUs
 
 - switch blocked_vcpu_on_cpu_lock to raw spinlock
 
 - cleanups of blocked vCPU logic
 
 - partially allow KVM_SET_CPUID{,2} after KVM_RUN (5.16 regression)
 
 - various small fixes
 -----BEGIN PGP SIGNATURE-----
 
 iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmHpmT0UHHBib256aW5p
 QHJlZGhhdC5jb20ACgkQv/vSX3jHroOstggAi1VSpT43oGslQjXNDZacHEARoYQs
 b0XpoW7HXicGSGRMWspCmiAPdJyYTsioEACttAmXUMs7brAgHb9n/vzdlcLh1ymL
 rQw2YFQlfqqB1Ki1iRhNkWlH9xOECsu28WLng6ylrx51GuT/pzWRt+V3EGUFTxIT
 ldW9HgZg2oFJIaLjg2hQVR/8EbBf0QdsAD3KV3tyvhBlXPkyeLOMcGe9onfjZ/NE
 JQeW7FtKtP4SsIFt1KrJpDPjtiwFt3bRM0gfgGw7//clvtKIqt1LYXZiq4C3b7f5
 tfYiC8lO2vnOoYcfeYEmvybbSsoS/CgSliZB32qkwoVvRMIl82YmxtDD+Q==
 =/Mak
 -----END PGP SIGNATURE-----

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull more kvm updates from Paolo Bonzini:
 "Generic:

   - selftest compilation fix for non-x86

   - KVM: avoid warning on s390 in mark_page_dirty

 x86:

   - fix page write-protection bug and improve comments

   - use binary search to lookup the PMU event filter, add test

   - enable_pmu module parameter support for Intel CPUs

   - switch blocked_vcpu_on_cpu_lock to raw spinlock

   - cleanups of blocked vCPU logic

   - partially allow KVM_SET_CPUID{,2} after KVM_RUN (5.16 regression)

   - various small fixes"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (46 commits)
  docs: kvm: fix WARNINGs from api.rst
  selftests: kvm/x86: Fix the warning in lib/x86_64/processor.c
  selftests: kvm/x86: Fix the warning in pmu_event_filter_test.c
  kvm: selftests: Do not indent with spaces
  kvm: selftests: sync uapi/linux/kvm.h with Linux header
  selftests: kvm: add amx_test to .gitignore
  KVM: SVM: Nullify vcpu_(un)blocking() hooks if AVIC is disabled
  KVM: SVM: Move svm_hardware_setup() and its helpers below svm_x86_ops
  KVM: SVM: Drop AVIC's intermediate avic_set_running() helper
  KVM: VMX: Don't do full kick when handling posted interrupt wakeup
  KVM: VMX: Fold fallback path into triggering posted IRQ helper
  KVM: VMX: Pass desired vector instead of bool for triggering posted IRQ
  KVM: VMX: Don't do full kick when triggering posted interrupt "fails"
  KVM: SVM: Skip AVIC and IRTE updates when loading blocking vCPU
  KVM: SVM: Use kvm_vcpu_is_blocking() in AVIC load to handle preemption
  KVM: SVM: Remove unnecessary APICv/AVIC update in vCPU unblocking path
  KVM: SVM: Don't bother checking for "running" AVIC when kicking for IPIs
  KVM: SVM: Signal AVIC doorbell iff vCPU is in guest mode
  KVM: x86: Remove defunct pre_block/post_block kvm_x86_ops hooks
  KVM: x86: Unexport LAPIC's switch_to_{hv,sw}_timer() helpers
  ...
This commit is contained in:
Linus Torvalds 2022-01-22 09:40:01 +02:00
commit 636b5284d8
36 changed files with 1428 additions and 635 deletions

View File

@ -5545,8 +5545,8 @@ the trailing ``'\0'``, is indicated by ``name_size`` in the header.
The Stats Data block contains an array of 64-bit values in the same order
as the descriptors in Descriptors block.
4.42 KVM_GET_XSAVE2
------------------
4.134 KVM_GET_XSAVE2
--------------------
:Capability: KVM_CAP_XSAVE2
:Architectures: x86
@ -7363,7 +7363,7 @@ trap and emulate MSRs that are outside of the scope of KVM as well as
limit the attack surface on KVM's MSR emulation code.
8.28 KVM_CAP_ENFORCE_PV_FEATURE_CPUID
-----------------------------
-------------------------------------
Architectures: x86

View File

@ -55,6 +55,7 @@ KVM_X86_OP_NULL(tlb_remote_flush)
KVM_X86_OP_NULL(tlb_remote_flush_with_range)
KVM_X86_OP(tlb_flush_gva)
KVM_X86_OP(tlb_flush_guest)
KVM_X86_OP(vcpu_pre_run)
KVM_X86_OP(run)
KVM_X86_OP_NULL(handle_exit)
KVM_X86_OP_NULL(skip_emulated_instruction)
@ -98,8 +99,6 @@ KVM_X86_OP(handle_exit_irqoff)
KVM_X86_OP_NULL(request_immediate_exit)
KVM_X86_OP(sched_in)
KVM_X86_OP_NULL(update_cpu_dirty_logging)
KVM_X86_OP_NULL(pre_block)
KVM_X86_OP_NULL(post_block)
KVM_X86_OP_NULL(vcpu_blocking)
KVM_X86_OP_NULL(vcpu_unblocking)
KVM_X86_OP_NULL(update_pi_irte)

View File

@ -1381,6 +1381,7 @@ struct kvm_x86_ops {
*/
void (*tlb_flush_guest)(struct kvm_vcpu *vcpu);
int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu);
int (*handle_exit)(struct kvm_vcpu *vcpu,
enum exit_fastpath_completion exit_fastpath);
@ -1454,18 +1455,6 @@ struct kvm_x86_ops {
const struct kvm_pmu_ops *pmu_ops;
const struct kvm_x86_nested_ops *nested_ops;
/*
* Architecture specific hooks for vCPU blocking due to
* HLT instruction.
* Returns for .pre_block():
* - 0 means continue to block the vCPU.
* - 1 means we cannot block the vCPU since some event
* happens during this period, such as, 'ON' bit in
* posted-interrupts descriptor is set.
*/
int (*pre_block)(struct kvm_vcpu *vcpu);
void (*post_block)(struct kvm_vcpu *vcpu);
void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
void (*vcpu_unblocking)(struct kvm_vcpu *vcpu);

View File

@ -119,6 +119,28 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu,
return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures);
}
/* Check whether the supplied CPUID data is equal to what is already set for the vCPU. */
static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
int nent)
{
struct kvm_cpuid_entry2 *orig;
int i;
if (nent != vcpu->arch.cpuid_nent)
return -EINVAL;
for (i = 0; i < nent; i++) {
orig = &vcpu->arch.cpuid_entries[i];
if (e2[i].function != orig->function ||
e2[i].index != orig->index ||
e2[i].eax != orig->eax || e2[i].ebx != orig->ebx ||
e2[i].ecx != orig->ecx || e2[i].edx != orig->edx)
return -EINVAL;
}
return 0;
}
static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
{
u32 function;
@ -145,14 +167,21 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
}
}
static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu)
static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu,
struct kvm_cpuid_entry2 *entries, int nent)
{
u32 base = vcpu->arch.kvm_cpuid_base;
if (!base)
return NULL;
return kvm_find_cpuid_entry(vcpu, base | KVM_CPUID_FEATURES, 0);
return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, 0);
}
static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu)
{
return __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries,
vcpu->arch.cpuid_nent);
}
void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
@ -167,11 +196,12 @@ void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
vcpu->arch.pv_cpuid.features = best->eax;
}
void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries,
int nent)
{
struct kvm_cpuid_entry2 *best;
best = kvm_find_cpuid_entry(vcpu, 1, 0);
best = cpuid_entry2_find(entries, nent, 1, 0);
if (best) {
/* Update OSXSAVE bit */
if (boot_cpu_has(X86_FEATURE_XSAVE))
@ -182,33 +212,38 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE);
}
best = kvm_find_cpuid_entry(vcpu, 7, 0);
best = cpuid_entry2_find(entries, nent, 7, 0);
if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7)
cpuid_entry_change(best, X86_FEATURE_OSPKE,
kvm_read_cr4_bits(vcpu, X86_CR4_PKE));
best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
best = cpuid_entry2_find(entries, nent, 0xD, 0);
if (best)
best->ebx = xstate_required_size(vcpu->arch.xcr0, false);
best = kvm_find_cpuid_entry(vcpu, 0xD, 1);
best = cpuid_entry2_find(entries, nent, 0xD, 1);
if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) ||
cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
best = kvm_find_kvm_cpuid_features(vcpu);
best = __kvm_find_kvm_cpuid_features(vcpu, entries, nent);
if (kvm_hlt_in_guest(vcpu->kvm) && best &&
(best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) {
best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
best = cpuid_entry2_find(entries, nent, 0x1, 0);
if (best)
cpuid_entry_change(best, X86_FEATURE_MWAIT,
vcpu->arch.ia32_misc_enable_msr &
MSR_IA32_MISC_ENABLE_MWAIT);
}
}
void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
{
__kvm_update_cpuid_runtime(vcpu, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
}
EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime);
static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
@ -298,6 +333,22 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
{
int r;
__kvm_update_cpuid_runtime(vcpu, e2, nent);
/*
* KVM does not correctly handle changing guest CPUID after KVM_RUN, as
* MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
* tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
* faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with
* the core vCPU model on the fly. It would've been better to forbid any
* KVM_SET_CPUID{,2} calls after KVM_RUN altogether but unfortunately
* some VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do
* KVM_SET_CPUID{,2} again. To support this legacy behavior, check
* whether the supplied CPUID data is equal to what's already set.
*/
if (vcpu->arch.last_vmentry_cpu != -1)
return kvm_cpuid_check_equal(vcpu, e2, nent);
r = kvm_check_cpuid(vcpu, e2, nent);
if (r)
return r;
@ -307,7 +358,6 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
vcpu->arch.cpuid_nent = nent;
kvm_update_kvm_cpuid_base(vcpu);
kvm_update_cpuid_runtime(vcpu);
kvm_vcpu_after_set_cpuid(vcpu);
return 0;
@ -795,10 +845,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
perf_get_x86_pmu_capability(&cap);
/*
* Only support guest architectural pmu on a host
* with architectural pmu.
* The guest architecture pmu is only supported if the architecture
* pmu exists on the host and the module parameters allow it.
*/
if (!cap.version)
if (!cap.version || !enable_pmu)
memset(&cap, 0, sizeof(cap));
eax.split.version_id = min(cap.version, 2);
@ -886,6 +936,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
--array->nent;
continue;
}
if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
entry->ecx &= ~BIT_ULL(2);
entry->edx = 0;
}
break;

View File

@ -1950,7 +1950,6 @@ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
{
restart_apic_timer(vcpu->arch.apic);
}
EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
{
@ -1962,7 +1961,6 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
start_sw_timer(apic);
preempt_enable();
}
EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu)
{

View File

@ -5756,6 +5756,7 @@ static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
continue;
flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
start, end - 1, true, flush);
}
@ -5825,15 +5826,27 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
}
/*
* We can flush all the TLBs out of the mmu lock without TLB
* corruption since we just change the spte from writable to
* readonly so that we only need to care the case of changing
* spte from present to present (changing the spte from present
* to nonpresent will flush all the TLBs immediately), in other
* words, the only case we care is mmu_spte_update() where we
* have checked Host-writable | MMU-writable instead of
* PT_WRITABLE_MASK, that means it does not depend on PT_WRITABLE_MASK
* anymore.
* Flush TLBs if any SPTEs had to be write-protected to ensure that
* guest writes are reflected in the dirty bitmap before the memslot
* update completes, i.e. before enabling dirty logging is visible to
* userspace.
*
* Perform the TLB flush outside the mmu_lock to reduce the amount of
* time the lock is held. However, this does mean that another CPU can
* now grab mmu_lock and encounter a write-protected SPTE while CPUs
* still have a writable mapping for the associated GFN in their TLB.
*
* This is safe but requires KVM to be careful when making decisions
* based on the write-protection status of an SPTE. Specifically, KVM
* also write-protects SPTEs to monitor changes to guest page tables
* during shadow paging, and must guarantee no CPUs can write to those
* page before the lock is dropped. As mentioned in the previous
* paragraph, a write-protected SPTE is no guarantee that CPU cannot
* perform writes. So to determine if a TLB flush is truly required, KVM
* will clear a separate software-only bit (MMU-writable) and skip the
* flush if-and-only-if this bit was already clear.
*
* See DEFAULT_SPTE_MMU_WRITEABLE for more details.
*/
if (flush)
kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);

View File

@ -216,6 +216,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
new_spte &= ~PT_WRITABLE_MASK;
new_spte &= ~shadow_host_writable_mask;
new_spte &= ~shadow_mmu_writable_mask;
new_spte = mark_spte_for_access_track(new_spte);

View File

@ -60,10 +60,6 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9)
#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10)
/*
* The mask/shift to use for saving the original R/X bits when marking the PTE
* as not-present for access tracking purposes. We do not save the W bit as the
@ -78,6 +74,35 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));
/*
* *_SPTE_HOST_WRITEABLE (aka Host-writable) indicates whether the host permits
* writes to the guest page mapped by the SPTE. This bit is cleared on SPTEs
* that map guest pages in read-only memslots and read-only VMAs.
*
* Invariants:
* - If Host-writable is clear, PT_WRITABLE_MASK must be clear.
*
*
* *_SPTE_MMU_WRITEABLE (aka MMU-writable) indicates whether the shadow MMU
* allows writes to the guest page mapped by the SPTE. This bit is cleared when
* the guest page mapped by the SPTE contains a page table that is being
* monitored for shadow paging. In this case the SPTE can only be made writable
* by unsyncing the shadow page under the mmu_lock.
*
* Invariants:
* - If MMU-writable is clear, PT_WRITABLE_MASK must be clear.
* - If MMU-writable is set, Host-writable must be set.
*
* If MMU-writable is set, PT_WRITABLE_MASK is normally set but can be cleared
* to track writes for dirty logging. For such SPTEs, KVM will locklessly set
* PT_WRITABLE_MASK upon the next write from the guest and record the write in
* the dirty log (see fast_page_fault()).
*/
/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9)
#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10)
/*
* Low ignored bits are at a premium for EPT, use high ignored bits, taking care
* to not overlap the A/D type mask or the saved access bits of access-tracked
@ -316,8 +341,13 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
static inline bool spte_can_locklessly_be_made_writable(u64 spte)
{
return (spte & shadow_host_writable_mask) &&
(spte & shadow_mmu_writable_mask);
if (spte & shadow_mmu_writable_mask) {
WARN_ON_ONCE(!(spte & shadow_host_writable_mask));
return true;
}
WARN_ON_ONCE(spte & PT_WRITABLE_MASK);
return false;
}
static inline u64 get_mmio_spte_generation(u64 spte)

View File

@ -1442,12 +1442,12 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
!is_last_spte(iter.old_spte, iter.level))
continue;
if (!is_writable_pte(iter.old_spte))
break;
new_spte = iter.old_spte &
~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
if (new_spte == iter.old_spte)
break;
tdp_mmu_set_spte(kvm, &iter, new_spte);
spte_set = true;
}

View File

@ -13,6 +13,8 @@
#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
#include <asm/perf_event.h>
#include "x86.h"
#include "cpuid.h"
@ -109,6 +111,9 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
.config = config,
};
if (type == PERF_TYPE_HARDWARE && config >= PERF_COUNT_HW_MAX)
return;
attr.sample_period = get_sample_period(pmc, pmc->counter);
if (in_tx)
@ -169,12 +174,16 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
return true;
}
static int cmp_u64(const void *a, const void *b)
{
return *(__u64 *)a - *(__u64 *)b;
}
void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
{
unsigned config, type = PERF_TYPE_RAW;
struct kvm *kvm = pmc->vcpu->kvm;
struct kvm_pmu_event_filter *filter;
int i;
bool allow_event = true;
if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
@ -189,16 +198,13 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
if (filter) {
for (i = 0; i < filter->nevents; i++)
if (filter->events[i] ==
(eventsel & AMD64_RAW_EVENT_MASK_NB))
break;
if (filter->action == KVM_PMU_EVENT_ALLOW &&
i == filter->nevents)
allow_event = false;
if (filter->action == KVM_PMU_EVENT_DENY &&
i < filter->nevents)
allow_event = false;
__u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB;
if (bsearch(&key, filter->events, filter->nevents,
sizeof(__u64), cmp_u64))
allow_event = filter->action == KVM_PMU_EVENT_ALLOW;
else
allow_event = filter->action == KVM_PMU_EVENT_DENY;
}
if (!allow_event)
return;
@ -573,6 +579,11 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
/* Ensure nevents can't be changed between the user copies. */
*filter = tmp;
/*
* Sort the in-kernel list so that we can search it with bsearch.
*/
sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL);
mutex_lock(&kvm->lock);
filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
mutex_is_locked(&kvm->lock));

View File

@ -295,13 +295,16 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
struct kvm_vcpu *vcpu;
unsigned long i;
/*
* Wake any target vCPUs that are blocking, i.e. waiting for a wake
* event. There's no need to signal doorbells, as hardware has handled
* vCPUs that were in guest at the time of the IPI, and vCPUs that have
* since entered the guest will have processed pending IRQs at VMRUN.
*/
kvm_for_each_vcpu(i, vcpu, kvm) {
bool m = kvm_apic_match_dest(vcpu, source,
icrl & APIC_SHORT_MASK,
GET_APIC_DEST_FIELD(icrh),
icrl & APIC_DEST_MASK);
if (m && !avic_vcpu_is_running(vcpu))
if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
GET_APIC_DEST_FIELD(icrh),
icrl & APIC_DEST_MASK))
kvm_vcpu_wake_up(vcpu);
}
}
@ -672,9 +675,22 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
return -1;
kvm_lapic_set_irr(vec, vcpu->arch.apic);
/*
* Pairs with the smp_mb_*() after setting vcpu->guest_mode in
* vcpu_enter_guest() to ensure the write to the vIRR is ordered before
* the read of guest_mode, which guarantees that either VMRUN will see
* and process the new vIRR entry, or that the below code will signal
* the doorbell if the vCPU is already running in the guest.
*/
smp_mb__after_atomic();
if (avic_vcpu_is_running(vcpu)) {
/*
* Signal the doorbell to tell hardware to inject the IRQ if the vCPU
* is in the guest. If the vCPU is not in the guest, hardware will
* automatically process AVIC interrupts at VMRUN.
*/
if (vcpu->mode == IN_GUEST_MODE) {
int cpu = READ_ONCE(vcpu->cpu);
/*
@ -688,8 +704,13 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
if (cpu != get_cpu())
wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
put_cpu();
} else
} else {
/*
* Wake the vCPU if it was blocking. KVM will then detect the
* pending IRQ when checking if the vCPU has a wake event.
*/
kvm_vcpu_wake_up(vcpu);
}
return 0;
}
@ -957,6 +978,8 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
int h_physical_id = kvm_cpu_get_apicid(cpu);
struct vcpu_svm *svm = to_svm(vcpu);
lockdep_assert_preemption_disabled();
/*
* Since the host physical APIC id is 8 bits,
* we can support host APIC ID upto 255.
@ -964,19 +987,25 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
return;
/*
* No need to update anything if the vCPU is blocking, i.e. if the vCPU
* is being scheduled in after being preempted. The CPU entries in the
* Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
* If the vCPU was migrated, its new CPU value will be stuffed when the
* vCPU unblocks.
*/
if (kvm_vcpu_is_blocking(vcpu))
return;
entry = READ_ONCE(*(svm->avic_physical_id_cache));
WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
if (svm->avic_is_running)
entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
avic_update_iommu_vcpu_affinity(vcpu, h_physical_id,
svm->avic_is_running);
avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
}
void avic_vcpu_put(struct kvm_vcpu *vcpu)
@ -984,42 +1013,56 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
u64 entry;
struct vcpu_svm *svm = to_svm(vcpu);
lockdep_assert_preemption_disabled();
entry = READ_ONCE(*(svm->avic_physical_id_cache));
if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
/* Nothing to do if IsRunning == '0' due to vCPU blocking. */
if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
return;
avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
}
/*
* This function is called during VCPU halt/unhalt.
*/
static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
int cpu = get_cpu();
if (!kvm_vcpu_apicv_active(vcpu))
return;
preempt_disable();
/*
* Unload the AVIC when the vCPU is about to block, _before_
* the vCPU actually blocks.
*
* Any IRQs that arrive before IsRunning=0 will not cause an
* incomplete IPI vmexit on the source, therefore vIRR will also
* be checked by kvm_vcpu_check_block() before blocking. The
* memory barrier implicit in set_current_state orders writing
* IsRunning=0 before reading the vIRR. The processor needs a
* matching memory barrier on interrupt delivery between writing
* IRR and reading IsRunning; the lack of this barrier might be
* the cause of errata #1235).
*/
avic_vcpu_put(vcpu);
preempt_enable();
}
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
int cpu;
if (!kvm_vcpu_apicv_active(vcpu))
return;
cpu = get_cpu();
WARN_ON(cpu != vcpu->cpu);
svm->avic_is_running = is_run;
if (kvm_vcpu_apicv_active(vcpu)) {
if (is_run)
avic_vcpu_load(vcpu, cpu);
else
avic_vcpu_put(vcpu);
}
avic_vcpu_load(vcpu, cpu);
put_cpu();
}
void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
{
avic_set_running(vcpu, false);
}
void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
kvm_vcpu_update_apicv(vcpu);
avic_set_running(vcpu, true);
}

View File

@ -101,7 +101,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
{
struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
if (!pmu)
if (!enable_pmu)
return NULL;
switch (msr) {

View File

@ -192,10 +192,6 @@ module_param(vgif, int, 0444);
static int lbrv = true;
module_param(lbrv, int, 0444);
/* enable/disable PMU virtualization */
bool pmu = true;
module_param(pmu, bool, 0444);
static int tsc_scaling = true;
module_param(tsc_scaling, int, 0444);
@ -873,47 +869,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
}
}
/*
* The default MMIO mask is a single bit (excluding the present bit),
* which could conflict with the memory encryption bit. Check for
* memory encryption support and override the default MMIO mask if
* memory encryption is enabled.
*/
static __init void svm_adjust_mmio_mask(void)
{
unsigned int enc_bit, mask_bit;
u64 msr, mask;
/* If there is no memory encryption support, use existing mask */
if (cpuid_eax(0x80000000) < 0x8000001f)
return;
/* If memory encryption is not enabled, use existing mask */
rdmsrl(MSR_AMD64_SYSCFG, msr);
if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
return;
enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
mask_bit = boot_cpu_data.x86_phys_bits;
/* Increment the mask bit if it is the same as the encryption bit */
if (enc_bit == mask_bit)
mask_bit++;
/*
* If the mask bit location is below 52, then some bits above the
* physical addressing limit will always be reserved, so use the
* rsvd_bits() function to generate the mask. This mask, along with
* the present bit, will be used to generate a page fault with
* PFER.RSV = 1.
*
* If the mask bit location is 52 (or above), then clear the mask.
*/
mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
}
static void svm_hardware_teardown(void)
{
int cpu;
@ -928,198 +883,6 @@ static void svm_hardware_teardown(void)
iopm_base = 0;
}
static __init void svm_set_cpu_caps(void)
{
kvm_set_cpu_caps();
supported_xss = 0;
/* CPUID 0x80000001 and 0x8000000A (SVM features) */
if (nested) {
kvm_cpu_cap_set(X86_FEATURE_SVM);
if (nrips)
kvm_cpu_cap_set(X86_FEATURE_NRIPS);
if (npt_enabled)
kvm_cpu_cap_set(X86_FEATURE_NPT);
if (tsc_scaling)
kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
/* Nested VM can receive #VMEXIT instead of triggering #GP */
kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
}
/* CPUID 0x80000008 */
if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
boot_cpu_has(X86_FEATURE_AMD_SSBD))
kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
/* AMD PMU PERFCTR_CORE CPUID */
if (pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE);
/* CPUID 0x8000001F (SME/SEV features) */
sev_set_cpu_caps();
}
static __init int svm_hardware_setup(void)
{
int cpu;
struct page *iopm_pages;
void *iopm_va;
int r;
unsigned int order = get_order(IOPM_SIZE);
/*
* NX is required for shadow paging and for NPT if the NX huge pages
* mitigation is enabled.
*/
if (!boot_cpu_has(X86_FEATURE_NX)) {
pr_err_ratelimited("NX (Execute Disable) not supported\n");
return -EOPNOTSUPP;
}
kvm_enable_efer_bits(EFER_NX);
iopm_pages = alloc_pages(GFP_KERNEL, order);
if (!iopm_pages)
return -ENOMEM;
iopm_va = page_address(iopm_pages);
memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
init_msrpm_offsets();
supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
kvm_enable_efer_bits(EFER_FFXSR);
if (tsc_scaling) {
if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
tsc_scaling = false;
} else {
pr_info("TSC scaling supported\n");
kvm_has_tsc_control = true;
kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
kvm_tsc_scaling_ratio_frac_bits = 32;
}
}
tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
/* Check for pause filtering support */
if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
pause_filter_count = 0;
pause_filter_thresh = 0;
} else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
pause_filter_thresh = 0;
}
if (nested) {
printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
}
/*
* KVM's MMU doesn't support using 2-level paging for itself, and thus
* NPT isn't supported if the host is using 2-level paging since host
* CR4 is unchanged on VMRUN.
*/
if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
npt_enabled = false;
if (!boot_cpu_has(X86_FEATURE_NPT))
npt_enabled = false;
/* Force VM NPT level equal to the host's paging level */
kvm_configure_mmu(npt_enabled, get_npt_level(),
get_npt_level(), PG_LEVEL_1G);
pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
/* Note, SEV setup consumes npt_enabled. */
sev_hardware_setup();
svm_hv_hardware_setup();
svm_adjust_mmio_mask();
for_each_possible_cpu(cpu) {
r = svm_cpu_init(cpu);
if (r)
goto err;
}
if (nrips) {
if (!boot_cpu_has(X86_FEATURE_NRIPS))
nrips = false;
}
enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC);
if (enable_apicv) {
pr_info("AVIC enabled\n");
amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
}
if (vls) {
if (!npt_enabled ||
!boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
!IS_ENABLED(CONFIG_X86_64)) {
vls = false;
} else {
pr_info("Virtual VMLOAD VMSAVE supported\n");
}
}
if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
svm_gp_erratum_intercept = false;
if (vgif) {
if (!boot_cpu_has(X86_FEATURE_VGIF))
vgif = false;
else
pr_info("Virtual GIF supported\n");
}
if (lbrv) {
if (!boot_cpu_has(X86_FEATURE_LBRV))
lbrv = false;
else
pr_info("LBR virtualization supported\n");
}
if (!pmu)
pr_info("PMU virtualization is disabled\n");
svm_set_cpu_caps();
/*
* It seems that on AMD processors PTE's accessed bit is
* being set by the CPU hardware before the NPF vmexit.
* This is not expected behaviour and our tests fail because
* of it.
* A workaround here is to disable support for
* GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
* In this case userspace can know if there is support using
* KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
* it
* If future AMD CPU models change the behaviour described above,
* this variable can be changed accordingly
*/
allow_smaller_maxphyaddr = !npt_enabled;
return 0;
err:
svm_hardware_teardown();
return r;
}
static void init_seg(struct vmcb_seg *seg)
{
seg->selector = 0;
@ -1444,12 +1207,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
if (err)
goto error_free_vmsa_page;
/* We initialize this flag to true to make sure that the is_running
* bit would be set the first time the vcpu is loaded.
*/
if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm))
svm->avic_is_running = true;
svm->msrpm = svm_vcpu_alloc_msrpm();
if (!svm->msrpm) {
err = -ENOMEM;
@ -3833,6 +3590,11 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
svm_complete_interrupts(vcpu);
}
static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
return 1;
}
static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
@ -4629,8 +4391,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.prepare_guest_switch = svm_prepare_guest_switch,
.vcpu_load = svm_vcpu_load,
.vcpu_put = svm_vcpu_put,
.vcpu_blocking = svm_vcpu_blocking,
.vcpu_unblocking = svm_vcpu_unblocking,
.vcpu_blocking = avic_vcpu_blocking,
.vcpu_unblocking = avic_vcpu_unblocking,
.update_exception_bitmap = svm_update_exception_bitmap,
.get_msr_feature = svm_get_msr_feature,
@ -4662,6 +4424,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.tlb_flush_gva = svm_flush_tlb_gva,
.tlb_flush_guest = svm_flush_tlb,
.vcpu_pre_run = svm_vcpu_pre_run,
.run = svm_vcpu_run,
.handle_exit = handle_exit,
.skip_emulated_instruction = skip_emulated_instruction,
@ -4742,6 +4505,243 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
};
/*
* The default MMIO mask is a single bit (excluding the present bit),
* which could conflict with the memory encryption bit. Check for
* memory encryption support and override the default MMIO mask if
* memory encryption is enabled.
*/
static __init void svm_adjust_mmio_mask(void)
{
unsigned int enc_bit, mask_bit;
u64 msr, mask;
/* If there is no memory encryption support, use existing mask */
if (cpuid_eax(0x80000000) < 0x8000001f)
return;
/* If memory encryption is not enabled, use existing mask */
rdmsrl(MSR_AMD64_SYSCFG, msr);
if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
return;
enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
mask_bit = boot_cpu_data.x86_phys_bits;
/* Increment the mask bit if it is the same as the encryption bit */
if (enc_bit == mask_bit)
mask_bit++;
/*
* If the mask bit location is below 52, then some bits above the
* physical addressing limit will always be reserved, so use the
* rsvd_bits() function to generate the mask. This mask, along with
* the present bit, will be used to generate a page fault with
* PFER.RSV = 1.
*
* If the mask bit location is 52 (or above), then clear the mask.
*/
mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
}
static __init void svm_set_cpu_caps(void)
{
kvm_set_cpu_caps();
supported_xss = 0;
/* CPUID 0x80000001 and 0x8000000A (SVM features) */
if (nested) {
kvm_cpu_cap_set(X86_FEATURE_SVM);
if (nrips)
kvm_cpu_cap_set(X86_FEATURE_NRIPS);
if (npt_enabled)
kvm_cpu_cap_set(X86_FEATURE_NPT);
if (tsc_scaling)
kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
/* Nested VM can receive #VMEXIT instead of triggering #GP */
kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
}
/* CPUID 0x80000008 */
if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
boot_cpu_has(X86_FEATURE_AMD_SSBD))
kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
/* AMD PMU PERFCTR_CORE CPUID */
if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE);
/* CPUID 0x8000001F (SME/SEV features) */
sev_set_cpu_caps();
}
static __init int svm_hardware_setup(void)
{
int cpu;
struct page *iopm_pages;
void *iopm_va;
int r;
unsigned int order = get_order(IOPM_SIZE);
/*
* NX is required for shadow paging and for NPT if the NX huge pages
* mitigation is enabled.
*/
if (!boot_cpu_has(X86_FEATURE_NX)) {
pr_err_ratelimited("NX (Execute Disable) not supported\n");
return -EOPNOTSUPP;
}
kvm_enable_efer_bits(EFER_NX);
iopm_pages = alloc_pages(GFP_KERNEL, order);
if (!iopm_pages)
return -ENOMEM;
iopm_va = page_address(iopm_pages);
memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
init_msrpm_offsets();
supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
kvm_enable_efer_bits(EFER_FFXSR);
if (tsc_scaling) {
if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
tsc_scaling = false;
} else {
pr_info("TSC scaling supported\n");
kvm_has_tsc_control = true;
kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
kvm_tsc_scaling_ratio_frac_bits = 32;
}
}
tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
/* Check for pause filtering support */
if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
pause_filter_count = 0;
pause_filter_thresh = 0;
} else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
pause_filter_thresh = 0;
}
if (nested) {
printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
}
/*
* KVM's MMU doesn't support using 2-level paging for itself, and thus
* NPT isn't supported if the host is using 2-level paging since host
* CR4 is unchanged on VMRUN.
*/
if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
npt_enabled = false;
if (!boot_cpu_has(X86_FEATURE_NPT))
npt_enabled = false;
/* Force VM NPT level equal to the host's paging level */
kvm_configure_mmu(npt_enabled, get_npt_level(),
get_npt_level(), PG_LEVEL_1G);
pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
/* Note, SEV setup consumes npt_enabled. */
sev_hardware_setup();
svm_hv_hardware_setup();
svm_adjust_mmio_mask();
for_each_possible_cpu(cpu) {
r = svm_cpu_init(cpu);
if (r)
goto err;
}
if (nrips) {
if (!boot_cpu_has(X86_FEATURE_NRIPS))
nrips = false;
}
enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC);
if (enable_apicv) {
pr_info("AVIC enabled\n");
amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
} else {
svm_x86_ops.vcpu_blocking = NULL;
svm_x86_ops.vcpu_unblocking = NULL;
}
if (vls) {
if (!npt_enabled ||
!boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
!IS_ENABLED(CONFIG_X86_64)) {
vls = false;
} else {
pr_info("Virtual VMLOAD VMSAVE supported\n");
}
}
if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
svm_gp_erratum_intercept = false;
if (vgif) {
if (!boot_cpu_has(X86_FEATURE_VGIF))
vgif = false;
else
pr_info("Virtual GIF supported\n");
}
if (lbrv) {
if (!boot_cpu_has(X86_FEATURE_LBRV))
lbrv = false;
else
pr_info("LBR virtualization supported\n");
}
if (!enable_pmu)
pr_info("PMU virtualization is disabled\n");
svm_set_cpu_caps();
/*
* It seems that on AMD processors PTE's accessed bit is
* being set by the CPU hardware before the NPF vmexit.
* This is not expected behaviour and our tests fail because
* of it.
* A workaround here is to disable support for
* GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
* In this case userspace can know if there is support using
* KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
* it
* If future AMD CPU models change the behaviour described above,
* this variable can be changed accordingly
*/
allow_smaller_maxphyaddr = !npt_enabled;
return 0;
err:
svm_hardware_teardown();
return r;
}
static struct kvm_x86_init_ops svm_init_ops __initdata = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,

View File

@ -32,7 +32,6 @@
extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
extern bool intercept_smi;
extern bool pmu;
/*
* Clean bits in VMCB.
@ -226,7 +225,6 @@ struct vcpu_svm {
u32 dfr_reg;
struct page *avic_backing_page;
u64 *avic_physical_id_cache;
bool avic_is_running;
/*
* Per-vcpu list of struct amd_svm_iommu_ir:
@ -574,17 +572,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
u64 *entry = svm->avic_physical_id_cache;
if (!entry)
return false;
return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
}
int avic_ga_log_notifier(u32 ga_tag);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
@ -605,8 +592,8 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec);
bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu);
int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set);
void svm_vcpu_blocking(struct kvm_vcpu *vcpu);
void svm_vcpu_unblocking(struct kvm_vcpu *vcpu);
void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
/* sev.c */

View File

@ -5,6 +5,7 @@
#include <asm/vmx.h>
#include "lapic.h"
#include "x86.h"
extern bool __read_mostly enable_vpid;
extern bool __read_mostly flexpriority_enabled;
@ -389,6 +390,9 @@ static inline u64 vmx_get_perf_capabilities(void)
{
u64 perf_cap = 0;
if (!enable_pmu)
return perf_cap;
if (boot_cpu_has(X86_FEATURE_PDCM))
rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap);

View File

@ -21,7 +21,6 @@
#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
static struct kvm_event_hw_type_mapping intel_arch_events[] = {
/* Index must match CPUID 0x0A.EBX bit vector */
[0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
[1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
[2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES },
@ -29,6 +28,7 @@ static struct kvm_event_hw_type_mapping intel_arch_events[] = {
[4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
[5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
[6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
/* The above index must match CPUID 0x0A.EBX bit vector */
[7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES },
};
@ -75,11 +75,17 @@ static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc)
u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
int i;
for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++)
if (intel_arch_events[i].eventsel == event_select &&
intel_arch_events[i].unit_mask == unit_mask &&
(pmc_is_fixed(pmc) || pmu->available_event_types & (1 << i)))
break;
for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) {
if (intel_arch_events[i].eventsel != event_select ||
intel_arch_events[i].unit_mask != unit_mask)
continue;
/* disable event that reported as not present by cpuid */
if ((i < 7) && !(pmu->available_event_types & (1 << i)))
return PERF_COUNT_HW_MAX + 1;
break;
}
if (i == ARRAY_SIZE(intel_arch_events))
return PERF_COUNT_HW_MAX;
@ -481,7 +487,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->reserved_bits = 0xffffffff00200000ull;
entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
if (!entry)
if (!entry || !enable_pmu)
return;
eax.full = entry->eax;
edx.full = entry->edx;

View File

@ -19,7 +19,7 @@
* wake the target vCPUs. vCPUs are removed from the list and the notification
* vector is reset when the vCPU is scheduled in.
*/
static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu);
/*
* Protect the per-CPU list with a per-CPU spinlock to handle task migration.
* When a blocking vCPU is awakened _and_ migrated to a different pCPU, the
@ -27,7 +27,7 @@ static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
* CPU. IRQs must be disabled when taking this lock, otherwise deadlock will
* occur if a wakeup IRQ arrives and attempts to acquire the lock.
*/
static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock);
static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
{
@ -51,7 +51,9 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new)
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct pi_desc old, new;
unsigned long flags;
unsigned int dest;
/*
@ -62,23 +64,34 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
if (!enable_apicv || !lapic_in_kernel(vcpu))
return;
/* Nothing to do if PI.SN and PI.NDST both have the desired value. */
if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
return;
/*
* If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
* PI.NDST: pi_post_block is the one expected to change PID.NDST and the
* wakeup handler expects the vCPU to be on the blocked_vcpu_list that
* matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
* correctly.
* If the vCPU wasn't on the wakeup list and wasn't migrated, then the
* full update can be skipped as neither the vector nor the destination
* needs to be changed.
*/
if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
pi_clear_sn(pi_desc);
goto after_clear_sn;
if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) {
/*
* Clear SN if it was set due to being preempted. Again, do
* this even if there is no assigned device for simplicity.
*/
if (pi_test_and_clear_sn(pi_desc))
goto after_clear_sn;
return;
}
local_irq_save(flags);
/*
* If the vCPU was waiting for wakeup, remove the vCPU from the wakeup
* list of the _previous_ pCPU, which will not be the same as the
* current pCPU if the task was migrated.
*/
if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) {
raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
list_del(&vmx->pi_wakeup_list);
raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
}
/* The full case. Set the new destination and clear SN. */
dest = cpu_physical_id(cpu);
if (!x2apic_mode)
dest = (dest << 8) & 0xFF00;
@ -86,10 +99,22 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
do {
old.control = new.control = READ_ONCE(pi_desc->control);
/*
* Clear SN (as above) and refresh the destination APIC ID to
* handle task migration (@cpu != vcpu->cpu).
*/
new.ndst = dest;
new.sn = 0;
/*
* Restore the notification vector; in the blocking case, the
* descriptor was modified on "put" to use the wakeup vector.
*/
new.nv = POSTED_INTR_VECTOR;
} while (pi_try_set_control(pi_desc, old.control, new.control));
local_irq_restore(flags);
after_clear_sn:
/*
@ -111,83 +136,25 @@ static bool vmx_can_use_vtd_pi(struct kvm *kvm)
irq_remapping_cap(IRQ_POSTING_CAP);
}
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
if (!vmx_can_use_vtd_pi(vcpu->kvm))
return;
/* Set SN when the vCPU is preempted */
if (vcpu->preempted)
pi_set_sn(pi_desc);
}
static void __pi_post_block(struct kvm_vcpu *vcpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
struct pi_desc old, new;
unsigned int dest;
/*
* Remove the vCPU from the wakeup list of the _previous_ pCPU, which
* will not be the same as the current pCPU if the task was migrated.
*/
spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
list_del(&vcpu->blocked_vcpu_list);
spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
dest = cpu_physical_id(vcpu->cpu);
if (!x2apic_mode)
dest = (dest << 8) & 0xFF00;
WARN(pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR,
"Wakeup handler not enabled while the vCPU was blocking");
do {
old.control = new.control = READ_ONCE(pi_desc->control);
new.ndst = dest;
/* set 'NV' to 'notification vector' */
new.nv = POSTED_INTR_VECTOR;
} while (pi_try_set_control(pi_desc, old.control, new.control));
vcpu->pre_pcpu = -1;
}
/*
* This routine does the following things for vCPU which is going
* to be blocked if VT-d PI is enabled.
* - Store the vCPU to the wakeup list, so when interrupts happen
* we can find the right vCPU to wake up.
* - Change the Posted-interrupt descriptor as below:
* 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
* - If 'ON' is set during this process, which means at least one
* interrupt is posted for this vCPU, we cannot block it, in
* this case, return 1, otherwise, return 0.
*
* Put the vCPU on this pCPU's list of vCPUs that needs to be awakened and set
* WAKEUP as the notification vector in the PI descriptor.
*/
int pi_pre_block(struct kvm_vcpu *vcpu)
static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
{
struct pi_desc old, new;
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct pi_desc old, new;
unsigned long flags;
if (!vmx_can_use_vtd_pi(vcpu->kvm) ||
vmx_interrupt_blocked(vcpu))
return 0;
local_irq_save(flags);
vcpu->pre_pcpu = vcpu->cpu;
spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu));
list_add_tail(&vcpu->blocked_vcpu_list,
&per_cpu(blocked_vcpu_on_cpu, vcpu->cpu));
spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu));
raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
list_add_tail(&vmx->pi_wakeup_list,
&per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu));
raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
WARN(pi_desc->sn == 1,
"Posted Interrupt Suppress Notification set before blocking");
WARN(pi_desc->sn, "PI descriptor SN field set before blocking");
do {
old.control = new.control = READ_ONCE(pi_desc->control);
@ -196,24 +163,37 @@ int pi_pre_block(struct kvm_vcpu *vcpu)
new.nv = POSTED_INTR_WAKEUP_VECTOR;
} while (pi_try_set_control(pi_desc, old.control, new.control));
/* We should not block the vCPU if an interrupt is posted for it. */
if (pi_test_on(pi_desc))
__pi_post_block(vcpu);
/*
* Send a wakeup IPI to this CPU if an interrupt may have been posted
* before the notification vector was updated, in which case the IRQ
* will arrive on the non-wakeup vector. An IPI is needed as calling
* try_to_wake_up() from ->sched_out() isn't allowed (IRQs are not
* enabled until it is safe to call try_to_wake_up() on the task being
* scheduled out).
*/
if (pi_test_on(&new))
apic->send_IPI_self(POSTED_INTR_WAKEUP_VECTOR);
local_irq_restore(flags);
return (vcpu->pre_pcpu == -1);
}
void pi_post_block(struct kvm_vcpu *vcpu)
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
{
unsigned long flags;
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
if (vcpu->pre_pcpu == -1)
if (!vmx_can_use_vtd_pi(vcpu->kvm))
return;
local_irq_save(flags);
__pi_post_block(vcpu);
local_irq_restore(flags);
if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu))
pi_enable_wakeup_handler(vcpu);
/*
* Set SN when the vCPU is preempted. Note, the vCPU can both be seen
* as blocking and preempted, e.g. if it's preempted between setting
* its wait state and manually scheduling out.
*/
if (vcpu->preempted)
pi_set_sn(pi_desc);
}
/*
@ -221,24 +201,23 @@ void pi_post_block(struct kvm_vcpu *vcpu)
*/
void pi_wakeup_handler(void)
{
struct kvm_vcpu *vcpu;
int cpu = smp_processor_id();
struct vcpu_vmx *vmx;
spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
blocked_vcpu_list) {
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
list_for_each_entry(vmx, &per_cpu(wakeup_vcpus_on_cpu, cpu),
pi_wakeup_list) {
if (pi_test_on(pi_desc))
kvm_vcpu_kick(vcpu);
if (pi_test_on(&vmx->pi_desc))
kvm_vcpu_wake_up(&vmx->vcpu);
}
spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
}
void __init pi_init_cpu(int cpu)
{
INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
INIT_LIST_HEAD(&per_cpu(wakeup_vcpus_on_cpu, cpu));
raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
}
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
@ -254,7 +233,7 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
* Bail out of the block loop if the VM has an assigned
* device, but the blocking vCPU didn't reconfigure the
* PI.NV to the wakeup vector, i.e. the assigned device
* came along after the initial check in pi_pre_block().
* came along after the initial check in vmx_vcpu_pi_put().
*/
void vmx_pi_start_assignment(struct kvm *kvm)
{

View File

@ -40,6 +40,12 @@ static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
(unsigned long *)&pi_desc->control);
}
static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)
{
return test_and_clear_bit(POSTED_INTR_SN,
(unsigned long *)&pi_desc->control);
}
static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
{
return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
@ -88,8 +94,6 @@ static inline bool pi_test_sn(struct pi_desc *pi_desc)
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
int pi_pre_block(struct kvm_vcpu *vcpu);
void pi_post_block(struct kvm_vcpu *vcpu);
void pi_wakeup_handler(void);
void __init pi_init_cpu(int cpu);
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);

View File

@ -3931,12 +3931,10 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
pt_update_intercept_for_msr(vcpu);
}
static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
bool nested)
static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
int pi_vec)
{
#ifdef CONFIG_SMP
int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
if (vcpu->mode == IN_GUEST_MODE) {
/*
* The vector of interrupt to be delivered to vcpu had
@ -3964,10 +3962,15 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
*/
apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
return true;
return;
}
#endif
return false;
/*
* The vCPU isn't in the guest; wake the vCPU in case it is blocking,
* otherwise do nothing as KVM will grab the highest priority pending
* IRQ via ->sync_pir_to_irr() in vcpu_enter_guest().
*/
kvm_vcpu_wake_up(vcpu);
}
static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
@ -3997,8 +4000,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
smp_mb__after_atomic();
/* the PIR and ON have been set by L1. */
if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
kvm_vcpu_kick(vcpu);
kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR);
return 0;
}
return -1;
@ -4035,9 +4037,7 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
* guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
* posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
*/
if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
kvm_vcpu_kick(vcpu);
kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR);
return 0;
}
@ -5426,6 +5426,14 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu)
return 1;
}
static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
return vmx->emulation_required && !vmx->rmode.vm86_active &&
vcpu->arch.exception.pending;
}
static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@ -5445,8 +5453,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
if (!kvm_emulate_instruction(vcpu, 0))
return 0;
if (vmx->emulation_required && !vmx->rmode.vm86_active &&
vcpu->arch.exception.pending) {
if (vmx_emulation_required_with_pending_exception(vcpu)) {
kvm_prepare_emulation_failure_exit(vcpu);
return 0;
}
@ -5468,6 +5475,16 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
return 1;
}
static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
if (vmx_emulation_required_with_pending_exception(vcpu)) {
kvm_prepare_emulation_failure_exit(vcpu);
return 0;
}
return 1;
}
static void grow_ple_window(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@ -6928,6 +6945,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
vmx = to_vmx(vcpu);
INIT_LIST_HEAD(&vmx->pi_wakeup_list);
err = -ENOMEM;
vmx->vpid = allocate_vpid();
@ -7549,25 +7568,6 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
}
static int vmx_pre_block(struct kvm_vcpu *vcpu)
{
if (pi_pre_block(vcpu))
return 1;
if (kvm_lapic_hv_timer_in_use(vcpu))
kvm_lapic_switch_to_sw_timer(vcpu);
return 0;
}
static void vmx_post_block(struct kvm_vcpu *vcpu)
{
if (kvm_x86_ops.set_hv_timer)
kvm_lapic_switch_to_hv_timer(vcpu);
pi_post_block(vcpu);
}
static void vmx_setup_mce(struct kvm_vcpu *vcpu)
{
if (vcpu->arch.mcg_cap & MCG_LMCE_P)
@ -7710,6 +7710,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.tlb_flush_gva = vmx_flush_tlb_gva,
.tlb_flush_guest = vmx_flush_tlb_guest,
.vcpu_pre_run = vmx_vcpu_pre_run,
.run = vmx_vcpu_run,
.handle_exit = vmx_handle_exit,
.skip_emulated_instruction = vmx_skip_emulated_instruction,
@ -7768,9 +7769,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.cpu_dirty_log_size = PML_ENTITY_NUM,
.update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
.pre_block = vmx_pre_block,
.post_block = vmx_post_block,
.pmu_ops = &intel_pmu_ops,
.nested_ops = &vmx_nested_ops,

View File

@ -317,6 +317,9 @@ struct vcpu_vmx {
/* Posted interrupt descriptor */
struct pi_desc pi_desc;
/* Used if this vCPU is waiting for PI notification wakeup. */
struct list_head pi_wakeup_list;
/* Support for a guest hypervisor (nested VMX) */
struct nested_vmx nested;

View File

@ -187,6 +187,11 @@ module_param(force_emulation_prefix, bool, S_IRUGO);
int __read_mostly pi_inject_timer = -1;
module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
/* Enable/disable PMU virtualization */
bool __read_mostly enable_pmu = true;
EXPORT_SYMBOL_GPL(enable_pmu);
module_param(enable_pmu, bool, 0444);
/*
* Restoring the host value for MSRs that are only consumed when running in
* usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
@ -5230,17 +5235,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_cpuid __user *cpuid_arg = argp;
struct kvm_cpuid cpuid;
/*
* KVM does not correctly handle changing guest CPUID after KVM_RUN, as
* MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
* tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
* faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with
* the core vCPU model on the fly, so fail.
*/
r = -EINVAL;
if (vcpu->arch.last_vmentry_cpu != -1)
goto out;
r = -EFAULT;
if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
@ -5251,14 +5245,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_cpuid2 __user *cpuid_arg = argp;
struct kvm_cpuid2 cpuid;
/*
* KVM_SET_CPUID{,2} after KVM_RUN is forbidded, see the comment in
* KVM_SET_CPUID case above.
*/
r = -EINVAL;
if (vcpu->arch.last_vmentry_cpu != -1)
goto out;
r = -EFAULT;
if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
@ -9945,10 +9931,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
smp_mb__after_srcu_read_unlock();
/*
* This handles the case where a posted interrupt was
* notified with kvm_vcpu_kick. Assigned devices can
* use the POSTED_INTR_VECTOR even if APICv is disabled,
* so do it even if APICv is disabled on this vCPU.
* Process pending posted interrupts to handle the case where the
* notification IRQ arrived in the host, or was never sent (because the
* target vCPU wasn't running). Do this regardless of the vCPU's APICv
* status, KVM doesn't update assigned devices when APICv is inhibited,
* i.e. they can post interrupts even if APICv is temporarily disabled.
*/
if (kvm_lapic_enabled(vcpu))
static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
@ -10113,8 +10100,20 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
{
if (!kvm_arch_vcpu_runnable(vcpu) &&
(!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) {
bool hv_timer;
if (!kvm_arch_vcpu_runnable(vcpu)) {
/*
* Switch to the software timer before halt-polling/blocking as
* the guest's timer may be a break event for the vCPU, and the
* hypervisor timer runs only when the CPU is in guest mode.
* Switch before halt-polling so that KVM recognizes an expired
* timer before blocking.
*/
hv_timer = kvm_lapic_hv_timer_in_use(vcpu);
if (hv_timer)
kvm_lapic_switch_to_sw_timer(vcpu);
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
kvm_vcpu_halt(vcpu);
@ -10122,8 +10121,8 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
kvm_vcpu_block(vcpu);
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
if (kvm_x86_ops.post_block)
static_call(kvm_x86_post_block)(vcpu);
if (hv_timer)
kvm_lapic_switch_to_hv_timer(vcpu);
if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
return 1;
@ -10316,6 +10315,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
r = -EINTR;
goto out;
}
/*
* It should be impossible for the hypervisor timer to be in
* use before KVM has ever run the vCPU.
*/
WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu));
kvm_vcpu_block(vcpu);
if (kvm_apic_accept_events(vcpu) < 0) {
r = 0;
@ -10360,10 +10364,16 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
} else
WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
if (kvm_run->immediate_exit)
if (kvm_run->immediate_exit) {
r = -EINTR;
else
r = vcpu_run(vcpu);
goto out;
}
r = static_call(kvm_x86_vcpu_pre_run)(vcpu);
if (r <= 0)
goto out;
r = vcpu_run(vcpu);
out:
kvm_put_guest_fpu(vcpu);

View File

@ -336,6 +336,7 @@ extern u64 host_xcr0;
extern u64 supported_xcr0;
extern u64 host_xss;
extern u64 supported_xss;
extern bool enable_pmu;
static inline bool kvm_mpx_supported(void)
{

View File

@ -309,9 +309,6 @@ struct kvm_vcpu {
u64 requests;
unsigned long guest_debug;
int pre_pcpu;
struct list_head blocked_vcpu_list;
struct mutex mutex;
struct kvm_run *run;

View File

@ -1131,7 +1131,8 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204
#define KVM_CAP_ARM_MTE 205
#define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206
#define KVM_CAP_XSAVE2 207
#define KVM_CAP_VM_GPA_BITS 207
#define KVM_CAP_XSAVE2 208
#ifdef KVM_CAP_IRQ_ROUTING
@ -1163,11 +1164,20 @@ struct kvm_irq_routing_hv_sint {
__u32 sint;
};
struct kvm_irq_routing_xen_evtchn {
__u32 port;
__u32 vcpu;
__u32 priority;
};
#define KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL ((__u32)(-1))
/* gsi routing entry types */
#define KVM_IRQ_ROUTING_IRQCHIP 1
#define KVM_IRQ_ROUTING_MSI 2
#define KVM_IRQ_ROUTING_S390_ADAPTER 3
#define KVM_IRQ_ROUTING_HV_SINT 4
#define KVM_IRQ_ROUTING_XEN_EVTCHN 5
struct kvm_irq_routing_entry {
__u32 gsi;
@ -1179,6 +1189,7 @@ struct kvm_irq_routing_entry {
struct kvm_irq_routing_msi msi;
struct kvm_irq_routing_s390_adapter adapter;
struct kvm_irq_routing_hv_sint hv_sint;
struct kvm_irq_routing_xen_evtchn xen_evtchn;
__u32 pad[8];
} u;
};
@ -1209,6 +1220,7 @@ struct kvm_x86_mce {
#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1)
#define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2)
#define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3)
#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4)
struct kvm_xen_hvm_config {
__u32 flags;
@ -1552,8 +1564,6 @@ struct kvm_s390_ucas_mapping {
/* Available with KVM_CAP_XSAVE */
#define KVM_GET_XSAVE _IOR(KVMIO, 0xa4, struct kvm_xsave)
#define KVM_SET_XSAVE _IOW(KVMIO, 0xa5, struct kvm_xsave)
/* Available with KVM_CAP_XSAVE2 */
#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave)
/* Available with KVM_CAP_XCRS */
#define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs)
#define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs)
@ -1613,6 +1623,9 @@ struct kvm_enc_region {
#define KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3)
#define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4)
/* Available with KVM_CAP_XSAVE2 */
#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave)
struct kvm_s390_pv_sec_parm {
__u64 origin;
__u64 length;

View File

@ -8,11 +8,12 @@
/s390x/memop
/s390x/resets
/s390x/sync_regs_test
/x86_64/amx_test
/x86_64/cpuid_test
/x86_64/cr4_cpuid_sync_test
/x86_64/debug_regs
/x86_64/evmcs_test
/x86_64/emulator_error_test
/x86_64/get_cpuid_test
/x86_64/get_msr_index_features
/x86_64/kvm_clock_test
/x86_64/kvm_pv_test
@ -22,6 +23,7 @@
/x86_64/mmio_warning_test
/x86_64/mmu_role_test
/x86_64/platform_info_test
/x86_64/pmu_event_filter_test
/x86_64/set_boot_cpu_id
/x86_64/set_sregs_test
/x86_64/sev_migrate_tests
@ -36,6 +38,7 @@
/x86_64/vmx_apic_access_test
/x86_64/vmx_close_while_nested_test
/x86_64/vmx_dirty_log_test
/x86_64/vmx_exception_with_invalid_guest_state
/x86_64/vmx_invalid_nested_guest_state
/x86_64/vmx_preemption_timer_test
/x86_64/vmx_set_nested_state_test

View File

@ -43,11 +43,11 @@ LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c lib/aarch64/handler
LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_handler.c
LIBKVM_riscv = lib/riscv/processor.c lib/riscv/ucall.c
TEST_GEN_PROGS_x86_64 = x86_64/cr4_cpuid_sync_test
TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test
TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test
TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features
TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test
TEST_GEN_PROGS_x86_64 += x86_64/emulator_error_test
TEST_GEN_PROGS_x86_64 += x86_64/get_cpuid_test
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features
@ -56,6 +56,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test
TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test
TEST_GEN_PROGS_x86_64 += x86_64/mmu_role_test
TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test
TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test
TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id
TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test
TEST_GEN_PROGS_x86_64 += x86_64/smm_test
@ -69,6 +70,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_exception_with_invalid_guest_state
TEST_GEN_PROGS_x86_64 += x86_64/vmx_invalid_nested_guest_state
TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test

View File

@ -364,6 +364,24 @@ static inline unsigned long get_xmm(int n)
}
bool is_intel_cpu(void);
bool is_amd_cpu(void);
static inline unsigned int x86_family(unsigned int eax)
{
unsigned int x86;
x86 = (eax >> 8) & 0xf;
if (x86 == 0xf)
x86 += (eax >> 20) & 0xff;
return x86;
}
static inline unsigned int x86_model(unsigned int eax)
{
return ((eax >> 12) & 0xf0) | ((eax >> 4) & 0x0f);
}
struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid);
void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid,
@ -375,6 +393,8 @@ uint64_t kvm_get_feature_msr(uint64_t msr_index);
struct kvm_cpuid2 *kvm_get_supported_cpuid(void);
struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid);
int __vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid,
struct kvm_cpuid2 *cpuid);
void vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid,
struct kvm_cpuid2 *cpuid);
@ -418,6 +438,11 @@ uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr);
void vm_set_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr,
uint64_t pte);
/*
* get_cpuid() - find matching CPUID entry and return pointer to it.
*/
struct kvm_cpuid_entry2 *get_cpuid(struct kvm_cpuid2 *cpuid, uint32_t function,
uint32_t index);
/*
* set_cpuid() - overwrites a matching cpuid entry with the provided value.
* matches based on ent->function && ent->index. returns true

View File

@ -393,10 +393,12 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
struct kvm_vm *vm;
int i;
#ifdef __x86_64__
/*
* Permission needs to be requested before KVM_SET_CPUID2.
*/
vm_xsave_req_perm();
#endif
/* Force slot0 memory size not small than DEFAULT_GUEST_PHY_PAGES */
if (slot0_mem_pages < DEFAULT_GUEST_PHY_PAGES)
@ -497,9 +499,11 @@ void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log)
void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log,
uint64_t first_page, uint32_t num_pages)
{
struct kvm_clear_dirty_log args = { .dirty_bitmap = log, .slot = slot,
.first_page = first_page,
.num_pages = num_pages };
struct kvm_clear_dirty_log args = {
.dirty_bitmap = log, .slot = slot,
.first_page = first_page,
.num_pages = num_pages
};
int ret;
ret = ioctl(vm->fd, KVM_CLEAR_DIRTY_LOG, &args);

View File

@ -886,6 +886,17 @@ kvm_get_supported_cpuid_index(uint32_t function, uint32_t index)
return entry;
}
int __vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid,
struct kvm_cpuid2 *cpuid)
{
struct vcpu *vcpu = vcpu_find(vm, vcpuid);
TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
return ioctl(vcpu->fd, KVM_SET_CPUID2, cpuid);
}
/*
* VM VCPU CPUID Set
*
@ -903,12 +914,9 @@ kvm_get_supported_cpuid_index(uint32_t function, uint32_t index)
void vcpu_set_cpuid(struct kvm_vm *vm,
uint32_t vcpuid, struct kvm_cpuid2 *cpuid)
{
struct vcpu *vcpu = vcpu_find(vm, vcpuid);
int rc;
TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
rc = ioctl(vcpu->fd, KVM_SET_CPUID2, cpuid);
rc = __vcpu_set_cpuid(vm, vcpuid, cpuid);
TEST_ASSERT(rc == 0, "KVM_SET_CPUID2 failed, rc: %i errno: %i",
rc, errno);
@ -1136,25 +1144,25 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid)
list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0]));
list->nmsrs = nmsrs;
r = ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, list);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i",
r);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i",
r);
state = malloc(sizeof(*state) + nmsrs * sizeof(state->msrs.entries[0]));
r = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, &state->events);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i",
r);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i",
r);
r = ioctl(vcpu->fd, KVM_GET_MP_STATE, &state->mp_state);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MP_STATE, r: %i",
r);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MP_STATE, r: %i",
r);
r = ioctl(vcpu->fd, KVM_GET_REGS, &state->regs);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i",
r);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i",
r);
r = vcpu_save_xsave_state(vm, vcpu, state);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i",
r);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i",
r);
if (kvm_check_cap(KVM_CAP_XCRS)) {
r = ioctl(vcpu->fd, KVM_GET_XCRS, &state->xcrs);
@ -1163,17 +1171,17 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid)
}
r = ioctl(vcpu->fd, KVM_GET_SREGS, &state->sregs);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i",
r);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i",
r);
if (nested_size) {
state->nested.size = sizeof(state->nested_);
r = ioctl(vcpu->fd, KVM_GET_NESTED_STATE, &state->nested);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_NESTED_STATE, r: %i",
r);
r);
TEST_ASSERT(state->nested.size <= nested_size,
"Nested state size too big, %i (KVM_CHECK_CAP gave %i)",
state->nested.size, nested_size);
"Nested state size too big, %i (KVM_CHECK_CAP gave %i)",
state->nested.size, nested_size);
} else
state->nested.size = 0;
@ -1181,12 +1189,12 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid)
for (i = 0; i < nmsrs; i++)
state->msrs.entries[i].index = list->indices[i];
r = ioctl(vcpu->fd, KVM_GET_MSRS, &state->msrs);
TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)",
r, r == nmsrs ? -1 : list->indices[r]);
TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)",
r, r == nmsrs ? -1 : list->indices[r]);
r = ioctl(vcpu->fd, KVM_GET_DEBUGREGS, &state->debugregs);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i",
r);
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i",
r);
free(list);
return state;
@ -1199,7 +1207,7 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *s
r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs);
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i",
r);
r);
r = ioctl(vcpu->fd, KVM_SET_MSRS, &state->msrs);
TEST_ASSERT(r == state->msrs.nmsrs,
@ -1214,28 +1222,28 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *s
r = ioctl(vcpu->fd, KVM_SET_XSAVE, state->xsave);
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i",
r);
r);
r = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, &state->events);
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i",
r);
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i",
r);
r = ioctl(vcpu->fd, KVM_SET_MP_STATE, &state->mp_state);
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i",
r);
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i",
r);
r = ioctl(vcpu->fd, KVM_SET_DEBUGREGS, &state->debugregs);
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i",
r);
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i",
r);
r = ioctl(vcpu->fd, KVM_SET_REGS, &state->regs);
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i",
r);
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i",
r);
if (state->nested.size) {
r = ioctl(vcpu->fd, KVM_SET_NESTED_STATE, &state->nested);
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_NESTED_STATE, r: %i",
r);
r);
}
}
@ -1245,10 +1253,10 @@ void kvm_x86_state_cleanup(struct kvm_x86_state *state)
free(state);
}
bool is_intel_cpu(void)
static bool cpu_vendor_string_is(const char *vendor)
{
const uint32_t *chunk = (const uint32_t *)vendor;
int eax, ebx, ecx, edx;
const uint32_t *chunk;
const int leaf = 0;
__asm__ __volatile__(
@ -1257,10 +1265,22 @@ bool is_intel_cpu(void)
"=c"(ecx), "=d"(edx)
: /* input */ "0"(leaf), "2"(0));
chunk = (const uint32_t *)("GenuineIntel");
return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]);
}
bool is_intel_cpu(void)
{
return cpu_vendor_string_is("GenuineIntel");
}
/*
* Exclude early K5 samples with a vendor string of "AMDisbetter!"
*/
bool is_amd_cpu(void)
{
return cpu_vendor_string_is("AuthenticAMD");
}
uint32_t kvm_get_cpuid_max_basic(void)
{
return kvm_get_supported_cpuid_entry(0)->eax;
@ -1384,6 +1404,23 @@ void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid)
}
}
struct kvm_cpuid_entry2 *get_cpuid(struct kvm_cpuid2 *cpuid, uint32_t function,
uint32_t index)
{
int i;
for (i = 0; i < cpuid->nent; i++) {
struct kvm_cpuid_entry2 *cur = &cpuid->entries[i];
if (cur->function == function && cur->index == index)
return cur;
}
TEST_FAIL("CPUID function 0x%x index 0x%x not found ", function, index);
return NULL;
}
bool set_cpuid(struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 *ent)
{
@ -1479,22 +1516,6 @@ struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpui
return cpuid;
}
#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541
#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163
#define X86EMUL_CPUID_VENDOR_AuthenticAMD_edx 0x69746e65
static inline unsigned x86_family(unsigned int eax)
{
unsigned int x86;
x86 = (eax >> 8) & 0xf;
if (x86 == 0xf)
x86 += (eax >> 20) & 0xff;
return x86;
}
unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
{
const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */
@ -1504,11 +1525,7 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
max_gfn = (1ULL << (vm->pa_bits - vm->page_shift)) - 1;
/* Avoid reserved HyperTransport region on AMD processors. */
eax = ecx = 0;
cpuid(&eax, &ebx, &ecx, &edx);
if (ebx != X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx ||
ecx != X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx ||
edx != X86EMUL_CPUID_VENDOR_AuthenticAMD_edx)
if (!is_amd_cpu())
return max_gfn;
/* On parts with <40 physical address bits, the area is fully hidden */
@ -1518,6 +1535,7 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
/* Before family 17h, the HyperTransport area is just below 1T. */
ht_gfn = (1 << 28) - num_ht_pages;
eax = 1;
ecx = 0;
cpuid(&eax, &ebx, &ecx, &edx);
if (x86_family(eax) < 0x17)
goto done;

View File

@ -154,6 +154,34 @@ struct kvm_cpuid2 *vcpu_alloc_cpuid(struct kvm_vm *vm, vm_vaddr_t *p_gva, struct
return guest_cpuids;
}
static void set_cpuid_after_run(struct kvm_vm *vm, struct kvm_cpuid2 *cpuid)
{
struct kvm_cpuid_entry2 *ent;
int rc;
u32 eax, ebx, x;
/* Setting unmodified CPUID is allowed */
rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid);
TEST_ASSERT(!rc, "Setting unmodified CPUID after KVM_RUN failed: %d", rc);
/* Changing CPU features is forbidden */
ent = get_cpuid(cpuid, 0x7, 0);
ebx = ent->ebx;
ent->ebx--;
rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid);
TEST_ASSERT(rc, "Changing CPU features should fail");
ent->ebx = ebx;
/* Changing MAXPHYADDR is forbidden */
ent = get_cpuid(cpuid, 0x80000008, 0);
eax = ent->eax;
x = eax & 0xff;
ent->eax = (eax & ~0xffu) | (x - 1);
rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid);
TEST_ASSERT(rc, "Changing MAXPHYADDR should fail");
ent->eax = eax;
}
int main(void)
{
struct kvm_cpuid2 *supp_cpuid, *cpuid2;
@ -175,5 +203,7 @@ int main(void)
for (stage = 0; stage < 3; stage++)
run_vcpu(vm, VCPU_ID, stage);
set_cpuid_after_run(vm, cpuid2);
kvm_vm_free(vm);
}

View File

@ -0,0 +1,434 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Test for x86 KVM_SET_PMU_EVENT_FILTER.
*
* Copyright (C) 2022, Google LLC.
*
* This work is licensed under the terms of the GNU GPL, version 2.
*
* Verifies the expected behavior of allow lists and deny lists for
* virtual PMU events.
*/
#define _GNU_SOURCE /* for program_invocation_short_name */
#include "test_util.h"
#include "kvm_util.h"
#include "processor.h"
/*
* In lieu of copying perf_event.h into tools...
*/
#define ARCH_PERFMON_EVENTSEL_OS (1ULL << 17)
#define ARCH_PERFMON_EVENTSEL_ENABLE (1ULL << 22)
union cpuid10_eax {
struct {
unsigned int version_id:8;
unsigned int num_counters:8;
unsigned int bit_width:8;
unsigned int mask_length:8;
} split;
unsigned int full;
};
union cpuid10_ebx {
struct {
unsigned int no_unhalted_core_cycles:1;
unsigned int no_instructions_retired:1;
unsigned int no_unhalted_reference_cycles:1;
unsigned int no_llc_reference:1;
unsigned int no_llc_misses:1;
unsigned int no_branch_instruction_retired:1;
unsigned int no_branch_misses_retired:1;
} split;
unsigned int full;
};
/* End of stuff taken from perf_event.h. */
/* Oddly, this isn't in perf_event.h. */
#define ARCH_PERFMON_BRANCHES_RETIRED 5
#define VCPU_ID 0
#define NUM_BRANCHES 42
/*
* This is how the event selector and unit mask are stored in an AMD
* core performance event-select register. Intel's format is similar,
* but the event selector is only 8 bits.
*/
#define EVENT(select, umask) ((select & 0xf00UL) << 24 | (select & 0xff) | \
(umask & 0xff) << 8)
/*
* "Branch instructions retired", from the Intel SDM, volume 3,
* "Pre-defined Architectural Performance Events."
*/
#define INTEL_BR_RETIRED EVENT(0xc4, 0)
/*
* "Retired branch instructions", from Processor Programming Reference
* (PPR) for AMD Family 17h Model 01h, Revision B1 Processors,
* Preliminary Processor Programming Reference (PPR) for AMD Family
* 17h Model 31h, Revision B0 Processors, and Preliminary Processor
* Programming Reference (PPR) for AMD Family 19h Model 01h, Revision
* B1 Processors Volume 1 of 2.
*/
#define AMD_ZEN_BR_RETIRED EVENT(0xc2, 0)
/*
* This event list comprises Intel's eight architectural events plus
* AMD's "retired branch instructions" for Zen[123] (and possibly
* other AMD CPUs).
*/
static const uint64_t event_list[] = {
EVENT(0x3c, 0),
EVENT(0xc0, 0),
EVENT(0x3c, 1),
EVENT(0x2e, 0x4f),
EVENT(0x2e, 0x41),
EVENT(0xc4, 0),
EVENT(0xc5, 0),
EVENT(0xa4, 1),
AMD_ZEN_BR_RETIRED,
};
/*
* If we encounter a #GP during the guest PMU sanity check, then the guest
* PMU is not functional. Inform the hypervisor via GUEST_SYNC(0).
*/
static void guest_gp_handler(struct ex_regs *regs)
{
GUEST_SYNC(0);
}
/*
* Check that we can write a new value to the given MSR and read it back.
* The caller should provide a non-empty set of bits that are safe to flip.
*
* Return on success. GUEST_SYNC(0) on error.
*/
static void check_msr(uint32_t msr, uint64_t bits_to_flip)
{
uint64_t v = rdmsr(msr) ^ bits_to_flip;
wrmsr(msr, v);
if (rdmsr(msr) != v)
GUEST_SYNC(0);
v ^= bits_to_flip;
wrmsr(msr, v);
if (rdmsr(msr) != v)
GUEST_SYNC(0);
}
static void intel_guest_code(void)
{
check_msr(MSR_CORE_PERF_GLOBAL_CTRL, 1);
check_msr(MSR_P6_EVNTSEL0, 0xffff);
check_msr(MSR_IA32_PMC0, 0xffff);
GUEST_SYNC(1);
for (;;) {
uint64_t br0, br1;
wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
wrmsr(MSR_P6_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE |
ARCH_PERFMON_EVENTSEL_OS | INTEL_BR_RETIRED);
wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 1);
br0 = rdmsr(MSR_IA32_PMC0);
__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
br1 = rdmsr(MSR_IA32_PMC0);
GUEST_SYNC(br1 - br0);
}
}
/*
* To avoid needing a check for CPUID.80000001:ECX.PerfCtrExtCore[bit 23],
* this code uses the always-available, legacy K7 PMU MSRs, which alias to
* the first four of the six extended core PMU MSRs.
*/
static void amd_guest_code(void)
{
check_msr(MSR_K7_EVNTSEL0, 0xffff);
check_msr(MSR_K7_PERFCTR0, 0xffff);
GUEST_SYNC(1);
for (;;) {
uint64_t br0, br1;
wrmsr(MSR_K7_EVNTSEL0, 0);
wrmsr(MSR_K7_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE |
ARCH_PERFMON_EVENTSEL_OS | AMD_ZEN_BR_RETIRED);
br0 = rdmsr(MSR_K7_PERFCTR0);
__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
br1 = rdmsr(MSR_K7_PERFCTR0);
GUEST_SYNC(br1 - br0);
}
}
/*
* Run the VM to the next GUEST_SYNC(value), and return the value passed
* to the sync. Any other exit from the guest is fatal.
*/
static uint64_t run_vm_to_sync(struct kvm_vm *vm)
{
struct kvm_run *run = vcpu_state(vm, VCPU_ID);
struct ucall uc;
vcpu_run(vm, VCPU_ID);
TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
"Exit_reason other than KVM_EXIT_IO: %u (%s)\n",
run->exit_reason,
exit_reason_str(run->exit_reason));
get_ucall(vm, VCPU_ID, &uc);
TEST_ASSERT(uc.cmd == UCALL_SYNC,
"Received ucall other than UCALL_SYNC: %lu", uc.cmd);
return uc.args[1];
}
/*
* In a nested environment or if the vPMU is disabled, the guest PMU
* might not work as architected (accessing the PMU MSRs may raise
* #GP, or writes could simply be discarded). In those situations,
* there is no point in running these tests. The guest code will perform
* a sanity check and then GUEST_SYNC(success). In the case of failure,
* the behavior of the guest on resumption is undefined.
*/
static bool sanity_check_pmu(struct kvm_vm *vm)
{
bool success;
vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler);
success = run_vm_to_sync(vm);
vm_install_exception_handler(vm, GP_VECTOR, NULL);
return success;
}
static struct kvm_pmu_event_filter *make_pmu_event_filter(uint32_t nevents)
{
struct kvm_pmu_event_filter *f;
int size = sizeof(*f) + nevents * sizeof(f->events[0]);
f = malloc(size);
TEST_ASSERT(f, "Out of memory");
memset(f, 0, size);
f->nevents = nevents;
return f;
}
static struct kvm_pmu_event_filter *event_filter(uint32_t action)
{
struct kvm_pmu_event_filter *f;
int i;
f = make_pmu_event_filter(ARRAY_SIZE(event_list));
f->action = action;
for (i = 0; i < ARRAY_SIZE(event_list); i++)
f->events[i] = event_list[i];
return f;
}
/*
* Remove the first occurrence of 'event' (if any) from the filter's
* event list.
*/
static struct kvm_pmu_event_filter *remove_event(struct kvm_pmu_event_filter *f,
uint64_t event)
{
bool found = false;
int i;
for (i = 0; i < f->nevents; i++) {
if (found)
f->events[i - 1] = f->events[i];
else
found = f->events[i] == event;
}
if (found)
f->nevents--;
return f;
}
static void test_without_filter(struct kvm_vm *vm)
{
uint64_t count = run_vm_to_sync(vm);
if (count != NUM_BRANCHES)
pr_info("%s: Branch instructions retired = %lu (expected %u)\n",
__func__, count, NUM_BRANCHES);
TEST_ASSERT(count, "Allowed PMU event is not counting");
}
static uint64_t test_with_filter(struct kvm_vm *vm,
struct kvm_pmu_event_filter *f)
{
vm_ioctl(vm, KVM_SET_PMU_EVENT_FILTER, (void *)f);
return run_vm_to_sync(vm);
}
static void test_member_deny_list(struct kvm_vm *vm)
{
struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_DENY);
uint64_t count = test_with_filter(vm, f);
free(f);
if (count)
pr_info("%s: Branch instructions retired = %lu (expected 0)\n",
__func__, count);
TEST_ASSERT(!count, "Disallowed PMU Event is counting");
}
static void test_member_allow_list(struct kvm_vm *vm)
{
struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_ALLOW);
uint64_t count = test_with_filter(vm, f);
free(f);
if (count != NUM_BRANCHES)
pr_info("%s: Branch instructions retired = %lu (expected %u)\n",
__func__, count, NUM_BRANCHES);
TEST_ASSERT(count, "Allowed PMU event is not counting");
}
static void test_not_member_deny_list(struct kvm_vm *vm)
{
struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_DENY);
uint64_t count;
remove_event(f, INTEL_BR_RETIRED);
remove_event(f, AMD_ZEN_BR_RETIRED);
count = test_with_filter(vm, f);
free(f);
if (count != NUM_BRANCHES)
pr_info("%s: Branch instructions retired = %lu (expected %u)\n",
__func__, count, NUM_BRANCHES);
TEST_ASSERT(count, "Allowed PMU event is not counting");
}
static void test_not_member_allow_list(struct kvm_vm *vm)
{
struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_ALLOW);
uint64_t count;
remove_event(f, INTEL_BR_RETIRED);
remove_event(f, AMD_ZEN_BR_RETIRED);
count = test_with_filter(vm, f);
free(f);
if (count)
pr_info("%s: Branch instructions retired = %lu (expected 0)\n",
__func__, count);
TEST_ASSERT(!count, "Disallowed PMU Event is counting");
}
/*
* Check for a non-zero PMU version, at least one general-purpose
* counter per logical processor, an EBX bit vector of length greater
* than 5, and EBX[5] clear.
*/
static bool check_intel_pmu_leaf(struct kvm_cpuid_entry2 *entry)
{
union cpuid10_eax eax = { .full = entry->eax };
union cpuid10_ebx ebx = { .full = entry->ebx };
return eax.split.version_id && eax.split.num_counters > 0 &&
eax.split.mask_length > ARCH_PERFMON_BRANCHES_RETIRED &&
!ebx.split.no_branch_instruction_retired;
}
/*
* Note that CPUID leaf 0xa is Intel-specific. This leaf should be
* clear on AMD hardware.
*/
static bool use_intel_pmu(void)
{
struct kvm_cpuid_entry2 *entry;
entry = kvm_get_supported_cpuid_index(0xa, 0);
return is_intel_cpu() && entry && check_intel_pmu_leaf(entry);
}
static bool is_zen1(uint32_t eax)
{
return x86_family(eax) == 0x17 && x86_model(eax) <= 0x0f;
}
static bool is_zen2(uint32_t eax)
{
return x86_family(eax) == 0x17 &&
x86_model(eax) >= 0x30 && x86_model(eax) <= 0x3f;
}
static bool is_zen3(uint32_t eax)
{
return x86_family(eax) == 0x19 && x86_model(eax) <= 0x0f;
}
/*
* Determining AMD support for a PMU event requires consulting the AMD
* PPR for the CPU or reference material derived therefrom. The AMD
* test code herein has been verified to work on Zen1, Zen2, and Zen3.
*
* Feel free to add more AMD CPUs that are documented to support event
* select 0xc2 umask 0 as "retired branch instructions."
*/
static bool use_amd_pmu(void)
{
struct kvm_cpuid_entry2 *entry;
entry = kvm_get_supported_cpuid_index(1, 0);
return is_amd_cpu() && entry &&
(is_zen1(entry->eax) ||
is_zen2(entry->eax) ||
is_zen3(entry->eax));
}
int main(int argc, char *argv[])
{
void (*guest_code)(void) = NULL;
struct kvm_vm *vm;
int r;
/* Tell stdout not to buffer its content */
setbuf(stdout, NULL);
r = kvm_check_cap(KVM_CAP_PMU_EVENT_FILTER);
if (!r) {
print_skip("KVM_CAP_PMU_EVENT_FILTER not supported");
exit(KSFT_SKIP);
}
if (use_intel_pmu())
guest_code = intel_guest_code;
else if (use_amd_pmu())
guest_code = amd_guest_code;
if (!guest_code) {
print_skip("Don't know how to test this guest PMU");
exit(KSFT_SKIP);
}
vm = vm_create_default(VCPU_ID, 0, guest_code);
vm_init_descriptor_tables(vm);
vcpu_init_descriptor_tables(vm, VCPU_ID);
if (!sanity_check_pmu(vm)) {
print_skip("Guest PMU is not functional");
exit(KSFT_SKIP);
}
test_without_filter(vm);
test_member_deny_list(vm);
test_member_allow_list(vm);
test_not_member_deny_list(vm);
test_not_member_allow_list(vm);
kvm_vm_free(vm);
return 0;
}

View File

@ -77,8 +77,8 @@ static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid, int stage)
switch (get_ucall(vm, vcpuid, &uc)) {
case UCALL_SYNC:
TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
uc.args[1] == stage + 1, "Stage %d: Unexpected register values vmexit, got %lx",
stage + 1, (ulong)uc.args[1]);
uc.args[1] == stage + 1, "Stage %d: Unexpected register values vmexit, got %lx",
stage + 1, (ulong)uc.args[1]);
return;
case UCALL_DONE:
return;

View File

@ -30,8 +30,8 @@ static struct kvm_vm *vm;
static void l2_guest_code(void)
{
/* Exit to L0 */
asm volatile("inb %%dx, %%al"
: : [port] "d" (PORT_L0_EXIT) : "rax");
asm volatile("inb %%dx, %%al"
: : [port] "d" (PORT_L0_EXIT) : "rax");
}
static void l1_guest_code(struct vmx_pages *vmx_pages)

View File

@ -0,0 +1,139 @@
// SPDX-License-Identifier: GPL-2.0-only
#include "test_util.h"
#include "kvm_util.h"
#include "processor.h"
#include <signal.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include "kselftest.h"
#define VCPU_ID 0
static struct kvm_vm *vm;
static void guest_ud_handler(struct ex_regs *regs)
{
/* Loop on the ud2 until guest state is made invalid. */
}
static void guest_code(void)
{
asm volatile("ud2");
}
static void __run_vcpu_with_invalid_state(void)
{
struct kvm_run *run = vcpu_state(vm, VCPU_ID);
vcpu_run(vm, VCPU_ID);
TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR,
"Expected KVM_EXIT_INTERNAL_ERROR, got %d (%s)\n",
run->exit_reason, exit_reason_str(run->exit_reason));
TEST_ASSERT(run->emulation_failure.suberror == KVM_INTERNAL_ERROR_EMULATION,
"Expected emulation failure, got %d\n",
run->emulation_failure.suberror);
}
static void run_vcpu_with_invalid_state(void)
{
/*
* Always run twice to verify KVM handles the case where _KVM_ queues
* an exception with invalid state and then exits to userspace, i.e.
* that KVM doesn't explode if userspace ignores the initial error.
*/
__run_vcpu_with_invalid_state();
__run_vcpu_with_invalid_state();
}
static void set_timer(void)
{
struct itimerval timer;
timer.it_value.tv_sec = 0;
timer.it_value.tv_usec = 200;
timer.it_interval = timer.it_value;
ASSERT_EQ(setitimer(ITIMER_REAL, &timer, NULL), 0);
}
static void set_or_clear_invalid_guest_state(bool set)
{
static struct kvm_sregs sregs;
if (!sregs.cr0)
vcpu_sregs_get(vm, VCPU_ID, &sregs);
sregs.tr.unusable = !!set;
vcpu_sregs_set(vm, VCPU_ID, &sregs);
}
static void set_invalid_guest_state(void)
{
set_or_clear_invalid_guest_state(true);
}
static void clear_invalid_guest_state(void)
{
set_or_clear_invalid_guest_state(false);
}
static void sigalrm_handler(int sig)
{
struct kvm_vcpu_events events;
TEST_ASSERT(sig == SIGALRM, "Unexpected signal = %d", sig);
vcpu_events_get(vm, VCPU_ID, &events);
/*
* If an exception is pending, attempt KVM_RUN with invalid guest,
* otherwise rearm the timer and keep doing so until the timer fires
* between KVM queueing an exception and re-entering the guest.
*/
if (events.exception.pending) {
set_invalid_guest_state();
run_vcpu_with_invalid_state();
} else {
set_timer();
}
}
int main(int argc, char *argv[])
{
if (!is_intel_cpu() || vm_is_unrestricted_guest(NULL)) {
print_skip("Must be run with kvm_intel.unrestricted_guest=0");
exit(KSFT_SKIP);
}
vm = vm_create_default(VCPU_ID, 0, (void *)guest_code);
vm_init_descriptor_tables(vm);
vcpu_init_descriptor_tables(vm, VCPU_ID);
vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler);
/*
* Stuff invalid guest state for L2 by making TR unusuable. The next
* KVM_RUN should induce a TRIPLE_FAULT in L2 as KVM doesn't support
* emulating invalid guest state for L2.
*/
set_invalid_guest_state();
run_vcpu_with_invalid_state();
/*
* Verify KVM also handles the case where userspace gains control while
* an exception is pending and stuffs invalid state. Run with valid
* guest state and a timer firing every 200us, and attempt to enter the
* guest with invalid state when the handler interrupts KVM with an
* exception pending.
*/
clear_invalid_guest_state();
TEST_ASSERT(signal(SIGALRM, sigalrm_handler) != SIG_ERR,
"Failed to register SIGALRM handler, errno = %d (%s)",
errno, strerror(errno));
set_timer();
run_vcpu_with_invalid_state();
}

View File

@ -46,20 +46,20 @@ static struct kvm_vm *vm;
#define MIN_STEAL_TIME 50000
struct pvclock_vcpu_time_info {
u32 version;
u32 pad0;
u64 tsc_timestamp;
u64 system_time;
u32 tsc_to_system_mul;
s8 tsc_shift;
u8 flags;
u8 pad[2];
u32 version;
u32 pad0;
u64 tsc_timestamp;
u64 system_time;
u32 tsc_to_system_mul;
s8 tsc_shift;
u8 flags;
u8 pad[2];
} __attribute__((__packed__)); /* 32 bytes */
struct pvclock_wall_clock {
u32 version;
u32 sec;
u32 nsec;
u32 version;
u32 sec;
u32 nsec;
} __attribute__((__packed__));
struct vcpu_runstate_info {
@ -74,11 +74,11 @@ struct arch_vcpu_info {
};
struct vcpu_info {
uint8_t evtchn_upcall_pending;
uint8_t evtchn_upcall_mask;
unsigned long evtchn_pending_sel;
struct arch_vcpu_info arch;
struct pvclock_vcpu_time_info time;
uint8_t evtchn_upcall_pending;
uint8_t evtchn_upcall_mask;
unsigned long evtchn_pending_sel;
struct arch_vcpu_info arch;
struct pvclock_vcpu_time_info time;
}; /* 64 bytes (x86) */
struct shared_info {
@ -493,7 +493,7 @@ int main(int argc, char *argv[])
vm_ts.tv_sec = wc->sec;
vm_ts.tv_nsec = wc->nsec;
TEST_ASSERT(wc->version && !(wc->version & 1),
TEST_ASSERT(wc->version && !(wc->version & 1),
"Bad wallclock version %x", wc->version);
TEST_ASSERT(cmp_timespec(&min_ts, &vm_ts) <= 0, "VM time too old");
TEST_ASSERT(cmp_timespec(&max_ts, &vm_ts) >= 0, "VM time too new");

View File

@ -427,9 +427,6 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
#endif
kvm_async_pf_vcpu_init(vcpu);
vcpu->pre_pcpu = -1;
INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
kvm_vcpu_set_in_spin_loop(vcpu, false);
kvm_vcpu_set_dy_eligible(vcpu, false);
vcpu->preempted = false;
@ -3163,8 +3160,10 @@ void mark_page_dirty_in_slot(struct kvm *kvm,
{
struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
#ifdef CONFIG_HAVE_KVM_DIRTY_RING
if (WARN_ON_ONCE(!vcpu) || WARN_ON_ONCE(vcpu->kvm != kvm))
return;
#endif
if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
unsigned long rel_gfn = gfn - memslot->base_gfn;