
Merge tag 'kvm-x86-mmu-6.13' of https://github.com/kvm-x86/linux into HEAD

KVM x86 MMU changes for 6.13

 - Clean up KVM's handling of Accessed and Dirty bits to deduplicate code,
   improve documentation, harden against unexpected changes, and simplify
   A/D-disabled MMUs by using the hardware-defined A/D bits to track whether
   a PFN is Accessed and/or Dirty.

 - Elide TLB flushes when aging SPTEs, as has been done in x86's primary
   MMU for over 10 years (a standalone sketch of the resulting flush policy
   follows this list).

 - Batch TLB flushes when zapping collapsible TDP MMU SPTEs, i.e. when
   dirty logging is toggled off, which reduces the time it takes to disable
   dirty logging by ~3x.

 - Recover huge pages in place in the TDP MMU instead of zapping the SP
   and waiting until the page is re-accessed to create a huge mapping.
   Proactively installing huge pages can reduce vCPU jitter in extreme
   scenarios (a sketch of the huge SPTE construction follows this list).

 - Remove support for (poorly) reclaiming page tables in shadow MMUs via
   the primary MMU's shrinker interface.
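
The series distills the flush policy for leaf SPTE changes into a single rule,
visible in the new leaf_spte_change_needs_tlb_flush() helper further down in
the diff: clearing the Accessed (or Dirty) bit never forces an immediate
remote TLB flush, only clearing the MMU-writable bit does. The standalone
sketch below is not KVM code; it illustrates that rule for the A/D-enabled
aging path only, and the bit positions and helper names (SPTE_ACCESSED_BIT,
age_spte(), leaf_change_needs_flush()) are assumptions chosen for
illustration.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SPTE_ACCESSED_BIT     (1ull << 5)   /* assumed bit position */
#define SPTE_MMU_WRITABLE_BIT (1ull << 58)  /* assumed bit position */

/*
 * Mirrors the idea of leaf_spte_change_needs_tlb_flush(): only dropping the
 * MMU-writable bit forces a flush before mmu_lock is dropped.
 */
static bool leaf_change_needs_flush(uint64_t old_spte, uint64_t new_spte)
{
	return (old_spte & SPTE_MMU_WRITABLE_BIT) &&
	       !(new_spte & SPTE_MMU_WRITABLE_BIT);
}

/*
 * "Age" a leaf SPTE: clear the Accessed bit and report whether the page was
 * young.  No flush is requested; stale Accessed TLB entries are tolerated.
 */
static bool age_spte(uint64_t *sptep)
{
	uint64_t old = *sptep;

	*sptep = old & ~SPTE_ACCESSED_BIT;
	return old & SPTE_ACCESSED_BIT;
}

int main(void)
{
	uint64_t spte = SPTE_ACCESSED_BIT | SPTE_MMU_WRITABLE_BIT;
	uint64_t old = spte;
	bool young = age_spte(&spte);

	/* Aging cleared Accessed but left MMU-writable set: no flush needed. */
	printf("young=%d, flush_after_aging=%d\n",
	       young, leaf_change_needs_flush(old, spte));

	/* Write-protection (clearing MMU-writable) is what forces a flush. */
	printf("flush_after_wrprot=%d\n",
	       leaf_change_needs_flush(spte, spte & ~SPTE_MMU_WRITABLE_BIT));
	return 0;
}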
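
A second standalone sketch, again not KVM code, shows the idea behind
recovering a huge page in place: build the huge SPTE directly from one of the
existing small SPTEs by setting the page-size bit and clearing the low address
bits, mirroring the make_huge_spte() helper added in the diff. The constants
and the make_huge_spte_sketch() name are illustrative assumptions.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT         12
#define PAGE_MASK          (~((1ull << PAGE_SHIFT) - 1))
#define PT_PAGE_SIZE_BIT   (1ull << 7)          /* assumed "huge page" bit */
#define HPAGE_SHIFT(level) (PAGE_SHIFT + 9 * ((level) - 1))
#define HPAGE_MASK(level)  (~((1ull << HPAGE_SHIFT(level)) - 1))

static uint64_t make_huge_spte_sketch(uint64_t small_spte, int level)
{
	uint64_t huge_spte = small_spte | PT_PAGE_SIZE_BIT;

	/*
	 * small_spte already points somewhere inside the huge page; masking
	 * off the low address bits (while keeping the non-address attribute
	 * bits below PAGE_SHIFT) yields the huge page's base address.
	 */
	huge_spte &= HPAGE_MASK(level) | ~PAGE_MASK;
	return huge_spte;
}

int main(void)
{
	/* A 4KiB SPTE mapping PFN 0x12345 with some low attribute bits set. */
	uint64_t small = (0x12345ull << PAGE_SHIFT) | 0x63;

	/* Level 2 corresponds to a 2MiB mapping in this sketch. */
	printf("2M SPTE: %#llx\n",
	       (unsigned long long)make_huge_spte_sketch(small, 2));
	return 0;
}
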
Paolo Bonzini 2024-11-13 06:31:54 -05:00
commit c59de14133
11 changed files with 280 additions and 410 deletions


@ -1306,7 +1306,6 @@ struct kvm_arch {
bool pre_fault_allowed;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
struct list_head active_mmu_pages;
struct list_head zapped_obsolete_pages;
/*
* A list of kvm_mmu_page structs that, if zapped, could possibly be
* replaced by an NX huge page. A shadow page is on this list if its
@ -1955,8 +1954,8 @@ void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
const struct kvm_memory_slot *memslot,
u64 start, u64 end,
int target_level);
void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
void kvm_mmu_recover_huge_pages(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);


@ -22,6 +22,7 @@ config KVM_X86
depends on X86_LOCAL_APIC
select KVM_COMMON
select KVM_GENERIC_MMU_NOTIFIER
select KVM_ELIDE_TLB_FLUSH_IF_YOUNG
select HAVE_KVM_IRQCHIP
select HAVE_KVM_PFNCACHE
select HAVE_KVM_DIRTY_RING_TSO


@ -179,7 +179,6 @@ struct kvm_shadow_walk_iterator {
static struct kmem_cache *pte_list_desc_cache;
struct kmem_cache *mmu_page_header_cache;
static struct percpu_counter kvm_total_used_mmu_pages;
static void mmu_spte_set(u64 *sptep, u64 spte);
@ -485,11 +484,12 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
__set_spte(sptep, new_spte);
}
/*
* Update the SPTE (excluding the PFN), but do not track changes in its
* accessed/dirty status.
/* Rules for using mmu_spte_update:
* Update the state bits, it means the mapped pfn is not changed.
*
* Returns true if the TLB needs to be flushed
*/
static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
static bool mmu_spte_update(u64 *sptep, u64 new_spte)
{
u64 old_spte = *sptep;
@ -498,7 +498,7 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
if (!is_shadow_present_pte(old_spte)) {
mmu_spte_set(sptep, new_spte);
return old_spte;
return false;
}
if (!spte_has_volatile_bits(old_spte))
@ -506,49 +506,10 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
else
old_spte = __update_clear_spte_slow(sptep, new_spte);
WARN_ON_ONCE(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
WARN_ON_ONCE(!is_shadow_present_pte(old_spte) ||
spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
return old_spte;
}
/* Rules for using mmu_spte_update:
* Update the state bits, it means the mapped pfn is not changed.
*
* Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote
* TLBs must be flushed. Otherwise rmap_write_protect will find a read-only
* spte, even though the writable spte might be cached on a CPU's TLB.
*
* Returns true if the TLB needs to be flushed
*/
static bool mmu_spte_update(u64 *sptep, u64 new_spte)
{
bool flush = false;
u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
if (!is_shadow_present_pte(old_spte))
return false;
/*
* For the spte updated out of mmu-lock is safe, since
* we always atomically update it, see the comments in
* spte_has_volatile_bits().
*/
if (is_mmu_writable_spte(old_spte) &&
!is_writable_pte(new_spte))
flush = true;
/*
* Flush TLB when accessed/dirty states are changed in the page tables,
* to guarantee consistency between TLB and page tables.
*/
if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte))
flush = true;
if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte))
flush = true;
return flush;
return leaf_spte_change_needs_tlb_flush(old_spte, new_spte);
}
/*
@ -1606,8 +1567,13 @@ static bool kvm_rmap_age_gfn_range(struct kvm *kvm,
clear_bit((ffs(shadow_accessed_mask) - 1),
(unsigned long *)sptep);
} else {
/*
* WARN if mmu_spte_update() signals the need
* for a TLB flush, as Access tracking a SPTE
* should never trigger an _immediate_ flush.
*/
spte = mark_spte_for_access_track(spte);
mmu_spte_update_no_track(sptep, spte);
WARN_ON_ONCE(mmu_spte_update(sptep, spte));
}
young = true;
}
@ -1655,27 +1621,15 @@ static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp)
#endif
}
/*
* This value is the sum of all of the kvm instances's
* kvm->arch.n_used_mmu_pages values. We need a global,
* aggregate version in order to make the slab shrinker
* faster
*/
static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
{
kvm->arch.n_used_mmu_pages += nr;
percpu_counter_add(&kvm_total_used_mmu_pages, nr);
}
static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
kvm_mod_used_mmu_pages(kvm, +1);
kvm->arch.n_used_mmu_pages++;
kvm_account_pgtable_pages((void *)sp->spt, +1);
}
static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
kvm_mod_used_mmu_pages(kvm, -1);
kvm->arch.n_used_mmu_pages--;
kvm_account_pgtable_pages((void *)sp->spt, -1);
}
@ -3147,13 +3101,12 @@ static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
}
int kvm_mmu_max_mapping_level(struct kvm *kvm,
const struct kvm_memory_slot *slot, gfn_t gfn,
int max_level)
const struct kvm_memory_slot *slot, gfn_t gfn)
{
bool is_private = kvm_slot_can_be_private(slot) &&
kvm_mem_is_private(kvm, gfn);
return __kvm_mmu_max_mapping_level(kvm, slot, gfn, max_level, is_private);
return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private);
}
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
@ -3373,7 +3326,7 @@ static bool page_fault_can_be_fast(struct kvm *kvm, struct kvm_page_fault *fault
* by setting the Writable bit, which can be done out of mmu_lock.
*/
if (!fault->present)
return !kvm_ad_enabled();
return !kvm_ad_enabled;
/*
* Note, instruction fetches and writes are mutually exclusive, ignore
@ -3508,8 +3461,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
* uses A/D bits for non-nested MMUs. Thus, if A/D bits are
* enabled, the SPTE can't be an access-tracked SPTE.
*/
if (unlikely(!kvm_ad_enabled()) && is_access_track_spte(spte))
new_spte = restore_acc_track_spte(new_spte);
if (unlikely(!kvm_ad_enabled) && is_access_track_spte(spte))
new_spte = restore_acc_track_spte(new_spte) |
shadow_accessed_mask;
/*
* To keep things simple, only SPTEs that are MMU-writable can
@ -5485,7 +5439,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
role.efer_nx = true;
role.smm = cpu_role.base.smm;
role.guest_mode = cpu_role.base.guest_mode;
role.ad_disabled = !kvm_ad_enabled();
role.ad_disabled = !kvm_ad_enabled;
role.level = kvm_mmu_get_tdp_level(vcpu);
role.direct = true;
role.has_4_byte_gpte = false;
@ -6413,8 +6367,11 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
{
struct kvm_mmu_page *sp, *node;
int nr_zapped, batch = 0;
LIST_HEAD(invalid_list);
bool unstable;
lockdep_assert_held(&kvm->slots_lock);
restart:
list_for_each_entry_safe_reverse(sp, node,
&kvm->arch.active_mmu_pages, link) {
@ -6446,7 +6403,7 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
}
unstable = __kvm_mmu_prepare_zap_page(kvm, sp,
&kvm->arch.zapped_obsolete_pages, &nr_zapped);
&invalid_list, &nr_zapped);
batch += nr_zapped;
if (unstable)
@ -6462,7 +6419,7 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
* kvm_mmu_load()), and the reload in the caller ensure no vCPUs are
* running with an obsolete MMU.
*/
kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
kvm_mmu_commit_zap_page(kvm, &invalid_list);
}
/*
@ -6525,16 +6482,10 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
kvm_tdp_mmu_zap_invalidated_roots(kvm);
}
static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
{
return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
}
void kvm_mmu_init_vm(struct kvm *kvm)
{
kvm->arch.shadow_mmio_value = shadow_mmio_value;
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
@ -6768,7 +6719,7 @@ static void shadow_mmu_split_huge_page(struct kvm *kvm,
continue;
}
spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index);
spte = make_small_spte(kvm, huge_spte, sp->role, index);
mmu_spte_set(sptep, spte);
__rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access);
}
@ -6951,8 +6902,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
* mapping if the indirect sp has level = 1.
*/
if (sp->role.direct &&
sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
PG_LEVEL_NUM)) {
sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn)) {
kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
if (kvm_available_flush_remote_tlbs_range())
@ -6980,8 +6930,8 @@ static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
kvm_flush_remote_tlbs_memslot(kvm, slot);
}
void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *slot)
void kvm_mmu_recover_huge_pages(struct kvm *kvm,
const struct kvm_memory_slot *slot)
{
if (kvm_memslots_have_rmaps(kvm)) {
write_lock(&kvm->mmu_lock);
@ -6991,7 +6941,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
if (tdp_mmu_enabled) {
read_lock(&kvm->mmu_lock);
kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
kvm_tdp_mmu_recover_huge_pages(kvm, slot);
read_unlock(&kvm->mmu_lock);
}
}
@ -7146,72 +7096,6 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
}
}
static unsigned long mmu_shrink_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
struct kvm *kvm;
int nr_to_scan = sc->nr_to_scan;
unsigned long freed = 0;
mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
int idx;
/*
* Never scan more than sc->nr_to_scan VM instances.
* Will not hit this condition practically since we do not try
* to shrink more than one VM and it is very unlikely to see
* !n_used_mmu_pages so many times.
*/
if (!nr_to_scan--)
break;
/*
* n_used_mmu_pages is accessed without holding kvm->mmu_lock
* here. We may skip a VM instance erroneously, but we do not
* want to shrink a VM that only started to populate its MMU
* anyway.
*/
if (!kvm->arch.n_used_mmu_pages &&
!kvm_has_zapped_obsolete_pages(kvm))
continue;
idx = srcu_read_lock(&kvm->srcu);
write_lock(&kvm->mmu_lock);
if (kvm_has_zapped_obsolete_pages(kvm)) {
kvm_mmu_commit_zap_page(kvm,
&kvm->arch.zapped_obsolete_pages);
goto unlock;
}
freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
unlock:
write_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
/*
* unfair on small ones
* per-vm shrinkers cry out
* sadness comes quickly
*/
list_move_tail(&kvm->vm_list, &vm_list);
break;
}
mutex_unlock(&kvm_lock);
return freed;
}
static unsigned long mmu_shrink_count(struct shrinker *shrink,
struct shrink_control *sc)
{
return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
}
static struct shrinker *mmu_shrinker;
static void mmu_destroy_caches(void)
{
kmem_cache_destroy(pte_list_desc_cache);
@ -7338,23 +7222,8 @@ int kvm_mmu_vendor_module_init(void)
if (!mmu_page_header_cache)
goto out;
if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
goto out;
mmu_shrinker = shrinker_alloc(0, "x86-mmu");
if (!mmu_shrinker)
goto out_shrinker;
mmu_shrinker->count_objects = mmu_shrink_count;
mmu_shrinker->scan_objects = mmu_shrink_scan;
mmu_shrinker->seeks = DEFAULT_SEEKS * 10;
shrinker_register(mmu_shrinker);
return 0;
out_shrinker:
percpu_counter_destroy(&kvm_total_used_mmu_pages);
out:
mmu_destroy_caches();
return ret;
@ -7371,8 +7240,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
void kvm_mmu_vendor_module_exit(void)
{
mmu_destroy_caches();
percpu_counter_destroy(&kvm_total_used_mmu_pages);
shrinker_free(mmu_shrinker);
}
/*


@ -346,8 +346,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
}
int kvm_mmu_max_mapping_level(struct kvm *kvm,
const struct kvm_memory_slot *slot, gfn_t gfn,
int max_level);
const struct kvm_memory_slot *slot, gfn_t gfn);
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);


@ -24,6 +24,8 @@ static bool __ro_after_init allow_mmio_caching;
module_param_named(mmio_caching, enable_mmio_caching, bool, 0444);
EXPORT_SYMBOL_GPL(enable_mmio_caching);
bool __read_mostly kvm_ad_enabled;
u64 __read_mostly shadow_host_writable_mask;
u64 __read_mostly shadow_mmu_writable_mask;
u64 __read_mostly shadow_nx_mask;
@ -133,12 +135,6 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
*/
bool spte_has_volatile_bits(u64 spte)
{
/*
* Always atomically update spte if it can be updated
* out of mmu-lock, it can ensure dirty bit is not lost,
* also, it can help us to get a stable is_writable_pte()
* to ensure tlb flush is not missed.
*/
if (!is_writable_pte(spte) && is_mmu_writable_spte(spte))
return true;
@ -179,7 +175,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
spte |= shadow_present_mask;
if (!prefetch || synchronizing)
spte |= spte_shadow_accessed_mask(spte);
spte |= shadow_accessed_mask;
/*
* For simplicity, enforce the NX huge page mitigation even if not
@ -223,42 +219,27 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
spte |= (u64)pfn << PAGE_SHIFT;
if (pte_access & ACC_WRITE_MASK) {
spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask;
/*
* When overwriting an existing leaf SPTE, and the old SPTE was
* writable, skip trying to unsync shadow pages as any relevant
* shadow pages must already be unsync, i.e. the hash lookup is
* unnecessary (and expensive).
*
* The same reasoning applies to dirty page/folio accounting;
* KVM marked the folio dirty when the old SPTE was created,
* thus there's no need to mark the folio dirty again.
*
* Note, both cases rely on KVM not changing PFNs without first
* zapping the old SPTE, which is guaranteed by both the shadow
* MMU and the TDP MMU.
*/
if (is_last_spte(old_spte, level) && is_writable_pte(old_spte))
goto out;
/*
* Unsync shadow pages that are reachable by the new, writable
* SPTE. Write-protect the SPTE if the page can't be unsync'd,
* e.g. it's write-tracked (upper-level SPs) or has one or more
* shadow pages and unsync'ing pages is not allowed.
*
* When overwriting an existing leaf SPTE, and the old SPTE was
* writable, skip trying to unsync shadow pages as any relevant
* shadow pages must already be unsync, i.e. the hash lookup is
* unnecessary (and expensive). Note, this relies on KVM not
* changing PFNs without first zapping the old SPTE, which is
* guaranteed by both the shadow MMU and the TDP MMU.
*/
if (mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, synchronizing, prefetch)) {
if ((!is_last_spte(old_spte, level) || !is_writable_pte(old_spte)) &&
mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, synchronizing, prefetch))
wrprot = true;
pte_access &= ~ACC_WRITE_MASK;
spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
}
else
spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask |
shadow_dirty_mask;
}
if (pte_access & ACC_WRITE_MASK)
spte |= spte_shadow_dirty_mask(spte);
out:
if (prefetch && !synchronizing)
spte = mark_spte_for_access_track(spte);
@ -281,15 +262,15 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
return wrprot;
}
static u64 make_spte_executable(u64 spte)
static u64 modify_spte_protections(u64 spte, u64 set, u64 clear)
{
bool is_access_track = is_access_track_spte(spte);
if (is_access_track)
spte = restore_acc_track_spte(spte);
spte &= ~shadow_nx_mask;
spte |= shadow_x_mask;
KVM_MMU_WARN_ON(set & clear);
spte = (spte | set) & ~clear;
if (is_access_track)
spte = mark_spte_for_access_track(spte);
@ -297,6 +278,16 @@ static u64 make_spte_executable(u64 spte)
return spte;
}
static u64 make_spte_executable(u64 spte)
{
return modify_spte_protections(spte, shadow_x_mask, shadow_nx_mask);
}
static u64 make_spte_nonexecutable(u64 spte)
{
return modify_spte_protections(spte, shadow_nx_mask, shadow_x_mask);
}
/*
* Construct an SPTE that maps a sub-page of the given huge page SPTE where
* `index` identifies which sub-page.
@ -304,8 +295,8 @@ static u64 make_spte_executable(u64 spte)
* This is used during huge page splitting to build the SPTEs that make up the
* new page table.
*/
u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte,
union kvm_mmu_page_role role, int index)
u64 make_small_spte(struct kvm *kvm, u64 huge_spte,
union kvm_mmu_page_role role, int index)
{
u64 child_spte = huge_spte;
@ -333,6 +324,26 @@ u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte,
return child_spte;
}
u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level)
{
u64 huge_spte;
KVM_BUG_ON(!is_shadow_present_pte(small_spte) || level == PG_LEVEL_4K, kvm);
huge_spte = small_spte | PT_PAGE_SIZE_MASK;
/*
* huge_spte already has the address of the sub-page being collapsed
* from small_spte, so just clear the lower address bits to create the
* huge page address.
*/
huge_spte &= KVM_HPAGE_MASK(level) | ~PAGE_MASK;
if (is_nx_huge_page_enabled(kvm))
huge_spte = make_spte_nonexecutable(huge_spte);
return huge_spte;
}
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
{
@ -365,7 +376,7 @@ u64 mark_spte_for_access_track(u64 spte)
spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) <<
SHADOW_ACC_TRACK_SAVED_BITS_SHIFT;
spte &= ~shadow_acc_track_mask;
spte &= ~(shadow_acc_track_mask | shadow_accessed_mask);
return spte;
}
@ -435,9 +446,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_me_spte_mask);
void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
{
kvm_ad_enabled = has_ad_bits;
shadow_user_mask = VMX_EPT_READABLE_MASK;
shadow_accessed_mask = has_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull;
shadow_dirty_mask = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull;
shadow_accessed_mask = VMX_EPT_ACCESS_BIT;
shadow_dirty_mask = VMX_EPT_DIRTY_BIT;
shadow_nx_mask = 0ull;
shadow_x_mask = VMX_EPT_EXECUTABLE_MASK;
/* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */
@ -468,6 +481,8 @@ void kvm_mmu_reset_all_pte_masks(void)
u8 low_phys_bits;
u64 mask;
kvm_ad_enabled = true;
/*
* If the CPU has 46 or less physical address bits, then set an
* appropriate mask to guard against L1TF attacks. Otherwise, it is


@ -167,6 +167,15 @@ static_assert(!(SHADOW_NONPRESENT_VALUE & SPTE_MMU_PRESENT_MASK));
#define SHADOW_NONPRESENT_VALUE 0ULL
#endif
/*
* True if A/D bits are supported in hardware and are enabled by KVM. When
* enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can disable
* A/D bits in EPTP12, SP and SPTE variants are needed to handle the scenario
* where KVM is using A/D bits for L1, but not L2.
*/
extern bool __read_mostly kvm_ad_enabled;
extern u64 __read_mostly shadow_host_writable_mask;
extern u64 __read_mostly shadow_mmu_writable_mask;
extern u64 __read_mostly shadow_nx_mask;
@ -285,17 +294,6 @@ static inline bool is_ept_ve_possible(u64 spte)
(spte & VMX_EPT_RWX_MASK) != VMX_EPT_MISCONFIG_WX_VALUE;
}
/*
* Returns true if A/D bits are supported in hardware and are enabled by KVM.
* When enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can
* disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the
* scenario where KVM is using A/D bits for L1, but not L2.
*/
static inline bool kvm_ad_enabled(void)
{
return !!shadow_accessed_mask;
}
static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
{
return sp->role.ad_disabled;
@ -318,18 +316,6 @@ static inline bool spte_ad_need_write_protect(u64 spte)
return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED;
}
static inline u64 spte_shadow_accessed_mask(u64 spte)
{
KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
}
static inline u64 spte_shadow_dirty_mask(u64 spte)
{
KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
}
static inline bool is_access_track_spte(u64 spte)
{
return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
@ -357,17 +343,7 @@ static inline kvm_pfn_t spte_to_pfn(u64 pte)
static inline bool is_accessed_spte(u64 spte)
{
u64 accessed_mask = spte_shadow_accessed_mask(spte);
return accessed_mask ? spte & accessed_mask
: !is_access_track_spte(spte);
}
static inline bool is_dirty_spte(u64 spte)
{
u64 dirty_mask = spte_shadow_dirty_mask(spte);
return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
return spte & shadow_accessed_mask;
}
static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte,
@ -485,6 +461,33 @@ static inline bool is_mmu_writable_spte(u64 spte)
return spte & shadow_mmu_writable_mask;
}
/*
* If the MMU-writable flag is cleared, i.e. the SPTE is write-protected for
* write-tracking, remote TLBs must be flushed, even if the SPTE was read-only,
* as KVM allows stale Writable TLB entries to exist. When dirty logging, KVM
* flushes TLBs based on whether or not dirty bitmap/ring entries were reaped,
* not whether or not SPTEs were modified, i.e. only the write-tracking case
* needs to flush at the time the SPTE is modified, before dropping mmu_lock.
*
* Don't flush if the Accessed bit is cleared, as access tracking tolerates
* false negatives, e.g. KVM x86 omits TLB flushes even when aging SPTEs for a
* mmu_notifier.clear_flush_young() event.
*
* Lastly, don't flush if the Dirty bit is cleared, as KVM unconditionally
* flushes when enabling dirty logging (see kvm_mmu_slot_apply_flags()), and
* when clearing dirty logs, KVM flushes based on whether or not dirty entries
* were reaped from the bitmap/ring, not whether or not dirty SPTEs were found.
*
* Note, this logic only applies to shadow-present leaf SPTEs. The caller is
* responsible for checking that the old SPTE is shadow-present, and is also
* responsible for determining whether or not a TLB flush is required when
* modifying a shadow-present non-leaf SPTE.
*/
static inline bool leaf_spte_change_needs_tlb_flush(u64 old_spte, u64 new_spte)
{
return is_mmu_writable_spte(old_spte) && !is_mmu_writable_spte(new_spte);
}
static inline u64 get_mmio_spte_generation(u64 spte)
{
u64 gen;
@ -501,8 +504,9 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
u64 old_spte, bool prefetch, bool synchronizing,
bool host_writable, u64 *new_spte);
u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte,
union kvm_mmu_page_role role, int index);
u64 make_small_spte(struct kvm *kvm, u64 huge_spte,
union kvm_mmu_page_role role, int index);
u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level);
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled);
u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access);
u64 mark_spte_for_access_track(u64 spte);


@ -583,48 +583,6 @@ static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
return 0;
}
static inline int __must_check tdp_mmu_zap_spte_atomic(struct kvm *kvm,
struct tdp_iter *iter)
{
int ret;
lockdep_assert_held_read(&kvm->mmu_lock);
/*
* Freeze the SPTE by setting it to a special, non-present value. This
* will stop other threads from immediately installing a present entry
* in its place before the TLBs are flushed.
*
* Delay processing of the zapped SPTE until after TLBs are flushed and
* the FROZEN_SPTE is replaced (see below).
*/
ret = __tdp_mmu_set_spte_atomic(iter, FROZEN_SPTE);
if (ret)
return ret;
kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level);
/*
* No other thread can overwrite the frozen SPTE as they must either
* wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
* overwrite the special frozen SPTE value. Use the raw write helper to
* avoid an unnecessary check on volatile bits.
*/
__kvm_tdp_mmu_write_spte(iter->sptep, SHADOW_NONPRESENT_VALUE);
/*
* Process the zapped SPTE after flushing TLBs, and after replacing
* FROZEN_SPTE with 0. This minimizes the amount of time vCPUs are
* blocked by the FROZEN_SPTE and reduces contention on the child
* SPTEs.
*/
handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
SHADOW_NONPRESENT_VALUE, iter->level, true);
return 0;
}
/*
* tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
* @kvm: KVM instance
@ -680,6 +638,16 @@ static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end)
static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm,
struct tdp_iter *iter)
{
if (!need_resched() && !rwlock_needbreak(&kvm->mmu_lock))
return false;
/* Ensure forward progress has been made before yielding. */
return iter->next_last_level_gfn != iter->yielded_gfn;
}
/*
* Yield if the MMU lock is contended or this thread needs to return control
* to the scheduler.
@ -698,31 +666,27 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
struct tdp_iter *iter,
bool flush, bool shared)
{
WARN_ON_ONCE(iter->yielded);
KVM_MMU_WARN_ON(iter->yielded);
/* Ensure forward progress has been made before yielding. */
if (iter->next_last_level_gfn == iter->yielded_gfn)
if (!tdp_mmu_iter_need_resched(kvm, iter))
return false;
if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
if (flush)
kvm_flush_remote_tlbs(kvm);
if (flush)
kvm_flush_remote_tlbs(kvm);
rcu_read_unlock();
rcu_read_unlock();
if (shared)
cond_resched_rwlock_read(&kvm->mmu_lock);
else
cond_resched_rwlock_write(&kvm->mmu_lock);
if (shared)
cond_resched_rwlock_read(&kvm->mmu_lock);
else
cond_resched_rwlock_write(&kvm->mmu_lock);
rcu_read_lock();
rcu_read_lock();
WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);
WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);
iter->yielded = true;
}
return iter->yielded;
iter->yielded = true;
return true;
}
static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
@ -1033,7 +997,8 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
return RET_PF_RETRY;
else if (is_shadow_present_pte(iter->old_spte) &&
!is_last_spte(iter->old_spte, iter->level))
(!is_last_spte(iter->old_spte, iter->level) ||
WARN_ON_ONCE(leaf_spte_change_needs_tlb_flush(iter->old_spte, new_spte))))
kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);
/*
@ -1073,7 +1038,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
struct kvm_mmu_page *sp, bool shared)
{
u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled);
int ret = 0;
if (shared) {
@ -1190,33 +1155,6 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
return flush;
}
typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
struct kvm_gfn_range *range);
static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
struct kvm_gfn_range *range,
tdp_handler_t handler)
{
struct kvm_mmu_page *root;
struct tdp_iter iter;
bool ret = false;
/*
* Don't support rescheduling, none of the MMU notifiers that funnel
* into this helper allow blocking; it'd be dead, wasteful code.
*/
for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
rcu_read_lock();
tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
ret |= handler(kvm, &iter, range);
rcu_read_unlock();
}
return ret;
}
/*
* Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
* if any of the GFNs in the range have been accessed.
@ -1225,15 +1163,10 @@ static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
* from the clear_young() or clear_flush_young() notifier, which uses the
* return value to determine if the page has been accessed.
*/
static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
struct kvm_gfn_range *range)
static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter)
{
u64 new_spte;
/* If we have a non-accessed entry we don't need to change the pte. */
if (!is_accessed_spte(iter->old_spte))
return false;
if (spte_ad_enabled(iter->old_spte)) {
iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
iter->old_spte,
@ -1249,23 +1182,48 @@ static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
iter->old_spte, new_spte);
return true;
}
static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
struct kvm_gfn_range *range,
bool test_only)
{
struct kvm_mmu_page *root;
struct tdp_iter iter;
bool ret = false;
/*
* Don't support rescheduling, none of the MMU notifiers that funnel
* into this helper allow blocking; it'd be dead, wasteful code. Note,
* this helper must NOT be used to unmap GFNs, as it processes only
* valid roots!
*/
for_each_valid_tdp_mmu_root(kvm, root, range->slot->as_id) {
guard(rcu)();
tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) {
if (!is_accessed_spte(iter.old_spte))
continue;
if (test_only)
return true;
ret = true;
kvm_tdp_mmu_age_spte(&iter);
}
}
return ret;
}
bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
}
static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
struct kvm_gfn_range *range)
{
return is_accessed_spte(iter->old_spte);
return __kvm_tdp_mmu_age_gfn_range(kvm, range, false);
}
bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
return __kvm_tdp_mmu_age_gfn_range(kvm, range, true);
}
/*
@ -1356,7 +1314,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
* not been linked in yet and thus is not reachable from any other CPU.
*/
for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);
sp->spt[i] = make_small_spte(kvm, huge_spte, sp->role, i);
/*
* Replace the huge spte with a pointer to the populated lower level
@ -1489,16 +1447,15 @@ static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp)
* from level, so it is valid to key off any shadow page to determine if
* write protection is needed for an entire tree.
*/
return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled();
return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled;
}
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end)
static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end)
{
const u64 dbit = tdp_mmu_need_write_protect(root) ? PT_WRITABLE_MASK :
shadow_dirty_mask;
struct tdp_iter iter;
bool spte_set = false;
rcu_read_lock();
@ -1519,31 +1476,24 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
goto retry;
spte_set = true;
}
rcu_read_unlock();
return spte_set;
}
/*
* Clear the dirty status (D-bit or W-bit) of all the SPTEs mapping GFNs in the
* memslot. Returns true if an SPTE has been changed and the TLBs need to be
* flushed.
* memslot.
*/
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
const struct kvm_memory_slot *slot)
{
struct kvm_mmu_page *root;
bool spte_set = false;
lockdep_assert_held_read(&kvm->mmu_lock);
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages);
return spte_set;
clear_dirty_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages);
}
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
@ -1602,21 +1552,55 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}
static void zap_collapsible_spte_range(struct kvm *kvm,
struct kvm_mmu_page *root,
const struct kvm_memory_slot *slot)
static int tdp_mmu_make_huge_spte(struct kvm *kvm,
struct tdp_iter *parent,
u64 *huge_spte)
{
struct kvm_mmu_page *root = spte_to_child_sp(parent->old_spte);
gfn_t start = parent->gfn;
gfn_t end = start + KVM_PAGES_PER_HPAGE(parent->level);
struct tdp_iter iter;
tdp_root_for_each_leaf_pte(iter, root, start, end) {
/*
* Use the parent iterator when checking for forward progress so
* that KVM doesn't get stuck continuously trying to yield (i.e.
* returning -EAGAIN here and then failing the forward progress
* check in the caller ad nauseam).
*/
if (tdp_mmu_iter_need_resched(kvm, parent))
return -EAGAIN;
*huge_spte = make_huge_spte(kvm, iter.old_spte, parent->level);
return 0;
}
return -ENOENT;
}
static void recover_huge_pages_range(struct kvm *kvm,
struct kvm_mmu_page *root,
const struct kvm_memory_slot *slot)
{
gfn_t start = slot->base_gfn;
gfn_t end = start + slot->npages;
struct tdp_iter iter;
int max_mapping_level;
bool flush = false;
u64 huge_spte;
int r;
if (WARN_ON_ONCE(kvm_slot_dirty_track_enabled(slot)))
return;
rcu_read_lock();
for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
flush = false;
continue;
}
if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
!is_shadow_present_pte(iter.old_spte))
@ -1640,31 +1624,40 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
if (iter.gfn < start || iter.gfn >= end)
continue;
max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
iter.gfn, PG_LEVEL_NUM);
max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn);
if (max_mapping_level < iter.level)
continue;
/* Note, a successful atomic zap also does a remote TLB flush. */
if (tdp_mmu_zap_spte_atomic(kvm, &iter))
r = tdp_mmu_make_huge_spte(kvm, &iter, &huge_spte);
if (r == -EAGAIN)
goto retry;
else if (r)
continue;
if (tdp_mmu_set_spte_atomic(kvm, &iter, huge_spte))
goto retry;
flush = true;
}
if (flush)
kvm_flush_remote_tlbs_memslot(kvm, slot);
rcu_read_unlock();
}
/*
* Zap non-leaf SPTEs (and free their associated page tables) which could
* be replaced by huge pages, for GFNs within the slot.
* Recover huge page mappings within the slot by replacing non-leaf SPTEs with
* huge SPTEs where possible.
*/
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *slot)
void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm,
const struct kvm_memory_slot *slot)
{
struct kvm_mmu_page *root;
lockdep_assert_held_read(&kvm->mmu_lock);
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
zap_collapsible_spte_range(kvm, root, slot);
recover_huge_pages_range(kvm, root, slot);
}
/*


@ -34,14 +34,14 @@ bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
const struct kvm_memory_slot *slot, int min_level);
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
const struct kvm_memory_slot *slot);
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn, unsigned long mask,
bool wrprot);
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *slot);
void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm,
const struct kvm_memory_slot *slot);
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn,


@ -13104,19 +13104,15 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
if (!log_dirty_pages) {
/*
* Dirty logging tracks sptes in 4k granularity, meaning that
* large sptes have to be split. If live migration succeeds,
* the guest in the source machine will be destroyed and large
* sptes will be created in the destination. However, if the
* guest continues to run in the source machine (for example if
* live migration fails), small sptes will remain around and
* cause bad performance.
* Recover huge page mappings in the slot now that dirty logging
* is disabled, i.e. now that KVM does not have to track guest
* writes at 4KiB granularity.
*
* Scan sptes if dirty logging has been stopped, dropping those
* which can be collapsed into a single large-page spte. Later
* page faults will create the large-page sptes.
* Dirty logging might be disabled by userspace if an ongoing VM
* live migration is cancelled and the VM must continue running
* on the source.
*/
kvm_mmu_zap_collapsible_sptes(kvm, new);
kvm_mmu_recover_huge_pages(kvm, new);
} else {
/*
* Initially-all-set does not require write protecting any page,


@ -100,6 +100,10 @@ config KVM_GENERIC_MMU_NOTIFIER
select MMU_NOTIFIER
bool
config KVM_ELIDE_TLB_FLUSH_IF_YOUNG
depends on KVM_GENERIC_MMU_NOTIFIER
bool
config KVM_GENERIC_MEMORY_ATTRIBUTES
depends on KVM_GENERIC_MMU_NOTIFIER
bool


@ -631,7 +631,8 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm,
static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
unsigned long start,
unsigned long end,
gfn_handler_t handler)
gfn_handler_t handler,
bool flush_on_ret)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
const struct kvm_mmu_notifier_range range = {
@ -639,7 +640,7 @@ static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
.end = end,
.handler = handler,
.on_lock = (void *)kvm_null_fn,
.flush_on_ret = true,
.flush_on_ret = flush_on_ret,
.may_block = false,
};
@ -651,17 +652,7 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn
unsigned long end,
gfn_handler_t handler)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
const struct kvm_mmu_notifier_range range = {
.start = start,
.end = end,
.handler = handler,
.on_lock = (void *)kvm_null_fn,
.flush_on_ret = false,
.may_block = false,
};
return __kvm_handle_hva_range(kvm, &range).ret;
return kvm_handle_hva_range(mn, start, end, handler, false);
}
void kvm_mmu_invalidate_begin(struct kvm *kvm)
@ -826,7 +817,8 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
{
trace_kvm_age_hva(start, end);
return kvm_handle_hva_range(mn, start, end, kvm_age_gfn);
return kvm_handle_hva_range(mn, start, end, kvm_age_gfn,
!IS_ENABLED(CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG));
}
static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,