Mirror of https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git, synced 2024-12-28 16:52:18 +00:00
KVM x86 MMU changes for 6.13
- Cleanup KVM's handling of Accessed and Dirty bits to dedup code, improve
  documentation, harden against unexpected changes, and to simplify
  A/D-disabled MMUs by using the hardware-defined A/D bits to track if a PFN
  is Accessed and/or Dirty.

- Elide TLB flushes when aging SPTEs, as has been done in x86's primary MMU
  for over 10 years.

- Batch TLB flushes when zapping collapsible TDP MMU SPTEs, i.e. when dirty
  logging is toggled off, which reduces the time it takes to disable dirty
  logging by ~3x.

- Recover huge pages in-place in the TDP MMU instead of zapping the SP and
  waiting until the page is re-accessed to create a huge mapping.
  Proactively installing huge pages can reduce vCPU jitter in extreme
  scenarios.

- Remove support for (poorly) reclaiming page tables in shadow MMUs via the
  primary MMU's shrinker interface.

-----BEGIN PGP SIGNATURE-----

iQIzBAABCgAdFiEEKTobbabEP7vbhhN9OlYIJqCjN/0FAmczpgQACgkQOlYIJqCj
N/0qTg//WMEFjC05VrtfwnI8+aCN3au1ALh1Lh1AQzgpVtJfINAmOBAVcIA8JEm9
kX+kApVW0y5y88VgHEb3J43Rbk0LuLQTePQhsfZ8vVJNhNjnxeraOR8wyLsjOjVS
Dpb7AtMxKM/9GqrP5YBFaPPf5YKu/FZFpZfNdKu4esf+Re+usWYpJcTOJOAkQeCL
v9grKLdVZCWEydO9QXrO6EnOW+EWc5Rmrx4521OBggqbeslAjbD6mAhE32T5Eg2C
fdwVAN/XcVv5g1lIof/fHyiXviiztSV+FJCxipxJYsn3HWPgSxBcRvrydsOLY5lT
ghvHn0YptZaxND9oyksP63lrF7LuAb8FI44kr9OOZIZeT0QFVNGBWOn2jv6k1xkr
hLJ1zqKgIwXMgSALlfk2VoywUDLRwv4OAoL31j7m+/nGMlFDtFfY3oARfpZ4kI7K
Taop67iQkHFIh8eX0t3HCPbIop/0oMKINKHC0VmVDgml5l+vIUApRvD6gsXYpnTD
Q+AdQgwYU0lM7J6U9GjebisSanim3zKpUggrG5FdAwGGNjns5HLtzLLyBu+KNHic
aTPs8HCu0jK51Re3fS7Q5IZBzsec15uv1tyCIDVhNuqtocgzXER6Y4YojC8qNPla
XR8H2d/SskJBNkdfbvWsAp+sd/nvtEgv8deEvTmO2SEg5pxhnN8=
=xTZ8
-----END PGP SIGNATURE-----

Merge tag 'kvm-x86-mmu-6.13' of https://github.com/kvm-x86/linux into HEAD
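The second bullet is the key behavioral change for memory aging. Below is a minimal, self-contained sketch of the idea; the bit layout and helper are illustrative stand-ins, not KVM's internal API (KVM clears the bit atomically with clear_bit(), since hardware can set it concurrently).

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ACCESSED_BIT (1ull << 8)   /* stand-in for the hardware Accessed bit */

/*
 * Age one page-table entry: report whether it was used since the last scan
 * and clear the Accessed bit, but do NOT flush the TLB.  A stale Accessed
 * bit cached in the TLB only means the page may be reported young one extra
 * time, which aging tolerates; skipping the flush is what makes aging cheap.
 */
static bool age_entry(uint64_t *entry)
{
	bool young = *entry & ACCESSED_BIT;

	if (young)
		*entry &= ~ACCESSED_BIT;
	return young;
}

int main(void)
{
	uint64_t spte = 0x1000 | ACCESSED_BIT;

	printf("first scan young=%d\n", age_entry(&spte));
	printf("second scan young=%d\n", age_entry(&spte));
	return 0;
}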
This commit is contained in: commit c59de14133
@@ -1306,7 +1306,6 @@ struct kvm_arch {
bool pre_fault_allowed;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
struct list_head active_mmu_pages;
struct list_head zapped_obsolete_pages;
/*
* A list of kvm_mmu_page structs that, if zapped, could possibly be
* replaced by an NX huge page. A shadow page is on this list if its

@@ -1955,8 +1954,8 @@ void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
const struct kvm_memory_slot *memslot,
u64 start, u64 end,
int target_level);
void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
void kvm_mmu_recover_huge_pages(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
@@ -22,6 +22,7 @@ config KVM_X86
depends on X86_LOCAL_APIC
select KVM_COMMON
select KVM_GENERIC_MMU_NOTIFIER
select KVM_ELIDE_TLB_FLUSH_IF_YOUNG
select HAVE_KVM_IRQCHIP
select HAVE_KVM_PFNCACHE
select HAVE_KVM_DIRTY_RING_TSO
@@ -179,7 +179,6 @@ struct kvm_shadow_walk_iterator {

static struct kmem_cache *pte_list_desc_cache;
struct kmem_cache *mmu_page_header_cache;
static struct percpu_counter kvm_total_used_mmu_pages;

static void mmu_spte_set(u64 *sptep, u64 spte);

@@ -485,11 +484,12 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
__set_spte(sptep, new_spte);
}

/*
* Update the SPTE (excluding the PFN), but do not track changes in its
* accessed/dirty status.
/* Rules for using mmu_spte_update:
* Update the state bits, it means the mapped pfn is not changed.
*
* Returns true if the TLB needs to be flushed
*/
static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
static bool mmu_spte_update(u64 *sptep, u64 new_spte)
{
u64 old_spte = *sptep;

@@ -498,7 +498,7 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)

if (!is_shadow_present_pte(old_spte)) {
mmu_spte_set(sptep, new_spte);
return old_spte;
return false;
}

if (!spte_has_volatile_bits(old_spte))

@@ -506,49 +506,10 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
else
old_spte = __update_clear_spte_slow(sptep, new_spte);

WARN_ON_ONCE(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
WARN_ON_ONCE(!is_shadow_present_pte(old_spte) ||
spte_to_pfn(old_spte) != spte_to_pfn(new_spte));

return old_spte;
}

/* Rules for using mmu_spte_update:
* Update the state bits, it means the mapped pfn is not changed.
*
* Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote
* TLBs must be flushed. Otherwise rmap_write_protect will find a read-only
* spte, even though the writable spte might be cached on a CPU's TLB.
*
* Returns true if the TLB needs to be flushed
*/
static bool mmu_spte_update(u64 *sptep, u64 new_spte)
{
bool flush = false;
u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);

if (!is_shadow_present_pte(old_spte))
return false;

/*
* For the spte updated out of mmu-lock is safe, since
* we always atomically update it, see the comments in
* spte_has_volatile_bits().
*/
if (is_mmu_writable_spte(old_spte) &&
!is_writable_pte(new_spte))
flush = true;

/*
* Flush TLB when accessed/dirty states are changed in the page tables,
* to guarantee consistency between TLB and page tables.
*/

if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte))
flush = true;

if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte))
flush = true;

return flush;
return leaf_spte_change_needs_tlb_flush(old_spte, new_spte);
}

/*

@@ -1606,8 +1567,13 @@ static bool kvm_rmap_age_gfn_range(struct kvm *kvm,
clear_bit((ffs(shadow_accessed_mask) - 1),
(unsigned long *)sptep);
} else {
/*
* WARN if mmu_spte_update() signals the need
* for a TLB flush, as Access tracking a SPTE
* should never trigger an _immediate_ flush.
*/
spte = mark_spte_for_access_track(spte);
mmu_spte_update_no_track(sptep, spte);
WARN_ON_ONCE(mmu_spte_update(sptep, spte));
}
young = true;
}

@@ -1655,27 +1621,15 @@ static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp)
#endif
}

/*
* This value is the sum of all of the kvm instances's
* kvm->arch.n_used_mmu_pages values. We need a global,
* aggregate version in order to make the slab shrinker
* faster
*/
static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
{
kvm->arch.n_used_mmu_pages += nr;
percpu_counter_add(&kvm_total_used_mmu_pages, nr);
}

static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
kvm_mod_used_mmu_pages(kvm, +1);
kvm->arch.n_used_mmu_pages++;
kvm_account_pgtable_pages((void *)sp->spt, +1);
}

static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
kvm_mod_used_mmu_pages(kvm, -1);
kvm->arch.n_used_mmu_pages--;
kvm_account_pgtable_pages((void *)sp->spt, -1);
}
@@ -3147,13 +3101,12 @@ static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
}

int kvm_mmu_max_mapping_level(struct kvm *kvm,
const struct kvm_memory_slot *slot, gfn_t gfn,
int max_level)
const struct kvm_memory_slot *slot, gfn_t gfn)
{
bool is_private = kvm_slot_can_be_private(slot) &&
kvm_mem_is_private(kvm, gfn);

return __kvm_mmu_max_mapping_level(kvm, slot, gfn, max_level, is_private);
return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private);
}

void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)

@@ -3373,7 +3326,7 @@ static bool page_fault_can_be_fast(struct kvm *kvm, struct kvm_page_fault *fault
* by setting the Writable bit, which can be done out of mmu_lock.
*/
if (!fault->present)
return !kvm_ad_enabled();
return !kvm_ad_enabled;

/*
* Note, instruction fetches and writes are mutually exclusive, ignore

@@ -3508,8 +3461,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
* uses A/D bits for non-nested MMUs. Thus, if A/D bits are
* enabled, the SPTE can't be an access-tracked SPTE.
*/
if (unlikely(!kvm_ad_enabled()) && is_access_track_spte(spte))
new_spte = restore_acc_track_spte(new_spte);
if (unlikely(!kvm_ad_enabled) && is_access_track_spte(spte))
new_spte = restore_acc_track_spte(new_spte) |
shadow_accessed_mask;

/*
* To keep things simple, only SPTEs that are MMU-writable can

@@ -5485,7 +5439,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
role.efer_nx = true;
role.smm = cpu_role.base.smm;
role.guest_mode = cpu_role.base.guest_mode;
role.ad_disabled = !kvm_ad_enabled();
role.ad_disabled = !kvm_ad_enabled;
role.level = kvm_mmu_get_tdp_level(vcpu);
role.direct = true;
role.has_4_byte_gpte = false;

@@ -6413,8 +6367,11 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
{
struct kvm_mmu_page *sp, *node;
int nr_zapped, batch = 0;
LIST_HEAD(invalid_list);
bool unstable;

lockdep_assert_held(&kvm->slots_lock);

restart:
list_for_each_entry_safe_reverse(sp, node,
&kvm->arch.active_mmu_pages, link) {

@@ -6446,7 +6403,7 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
}

unstable = __kvm_mmu_prepare_zap_page(kvm, sp,
&kvm->arch.zapped_obsolete_pages, &nr_zapped);
&invalid_list, &nr_zapped);
batch += nr_zapped;

if (unstable)

@@ -6462,7 +6419,7 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
* kvm_mmu_load()), and the reload in the caller ensure no vCPUs are
* running with an obsolete MMU.
*/
kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
kvm_mmu_commit_zap_page(kvm, &invalid_list);
}

/*

@@ -6525,16 +6482,10 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
kvm_tdp_mmu_zap_invalidated_roots(kvm);
}

static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
{
return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
}

void kvm_mmu_init_vm(struct kvm *kvm)
{
kvm->arch.shadow_mmio_value = shadow_mmio_value;
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);

@@ -6768,7 +6719,7 @@ static void shadow_mmu_split_huge_page(struct kvm *kvm,
continue;
}

spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index);
spte = make_small_spte(kvm, huge_spte, sp->role, index);
mmu_spte_set(sptep, spte);
__rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access);
}

@@ -6951,8 +6902,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
* mapping if the indirect sp has level = 1.
*/
if (sp->role.direct &&
sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
PG_LEVEL_NUM)) {
sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn)) {
kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);

if (kvm_available_flush_remote_tlbs_range())

@@ -6980,8 +6930,8 @@ static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
kvm_flush_remote_tlbs_memslot(kvm, slot);
}

void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *slot)
void kvm_mmu_recover_huge_pages(struct kvm *kvm,
const struct kvm_memory_slot *slot)
{
if (kvm_memslots_have_rmaps(kvm)) {
write_lock(&kvm->mmu_lock);

@@ -6991,7 +6941,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,

if (tdp_mmu_enabled) {
read_lock(&kvm->mmu_lock);
kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
kvm_tdp_mmu_recover_huge_pages(kvm, slot);
read_unlock(&kvm->mmu_lock);
}
}
@@ -7146,72 +7096,6 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
}
}

static unsigned long mmu_shrink_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
struct kvm *kvm;
int nr_to_scan = sc->nr_to_scan;
unsigned long freed = 0;

mutex_lock(&kvm_lock);

list_for_each_entry(kvm, &vm_list, vm_list) {
int idx;

/*
* Never scan more than sc->nr_to_scan VM instances.
* Will not hit this condition practically since we do not try
* to shrink more than one VM and it is very unlikely to see
* !n_used_mmu_pages so many times.
*/
if (!nr_to_scan--)
break;
/*
* n_used_mmu_pages is accessed without holding kvm->mmu_lock
* here. We may skip a VM instance errorneosly, but we do not
* want to shrink a VM that only started to populate its MMU
* anyway.
*/
if (!kvm->arch.n_used_mmu_pages &&
!kvm_has_zapped_obsolete_pages(kvm))
continue;

idx = srcu_read_lock(&kvm->srcu);
write_lock(&kvm->mmu_lock);

if (kvm_has_zapped_obsolete_pages(kvm)) {
kvm_mmu_commit_zap_page(kvm,
&kvm->arch.zapped_obsolete_pages);
goto unlock;
}

freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);

unlock:
write_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);

/*
* unfair on small ones
* per-vm shrinkers cry out
* sadness comes quickly
*/
list_move_tail(&kvm->vm_list, &vm_list);
break;
}

mutex_unlock(&kvm_lock);
return freed;
}

static unsigned long mmu_shrink_count(struct shrinker *shrink,
struct shrink_control *sc)
{
return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
}

static struct shrinker *mmu_shrinker;

static void mmu_destroy_caches(void)
{
kmem_cache_destroy(pte_list_desc_cache);

@@ -7338,23 +7222,8 @@ int kvm_mmu_vendor_module_init(void)
if (!mmu_page_header_cache)
goto out;

if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
goto out;

mmu_shrinker = shrinker_alloc(0, "x86-mmu");
if (!mmu_shrinker)
goto out_shrinker;

mmu_shrinker->count_objects = mmu_shrink_count;
mmu_shrinker->scan_objects = mmu_shrink_scan;
mmu_shrinker->seeks = DEFAULT_SEEKS * 10;

shrinker_register(mmu_shrinker);

return 0;

out_shrinker:
percpu_counter_destroy(&kvm_total_used_mmu_pages);
out:
mmu_destroy_caches();
return ret;

@@ -7371,8 +7240,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
void kvm_mmu_vendor_module_exit(void)
{
mmu_destroy_caches();
percpu_counter_destroy(&kvm_total_used_mmu_pages);
shrinker_free(mmu_shrinker);
}

/*
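For reference, a compact model of the two aging paths in the kvm_rmap_age_gfn_range() hunk above: with A/D bits the Accessed bit is simply cleared in place, and without A/D bits the SPTE is converted to an access-tracked form. Neither path requests an immediate TLB flush, which is what the WARN_ON_ONCE() around mmu_spte_update() documents. The bit layout and helper below are invented for illustration, not KVM's real masks.

#include <stdbool.h>
#include <stdint.h>

#define A_BIT       (1ull << 8)    /* pretend Accessed bit      */
#define RX_BITS     (3ull << 1)    /* pretend R/X bits          */
#define SAVED_SHIFT 53             /* where saved R/X would go  */

static bool ad_bits_enabled = true;

static bool age_spte(uint64_t *sptep)
{
	uint64_t spte = *sptep;

	if (ad_bits_enabled) {
		if (!(spte & A_BIT))
			return false;
		*sptep = spte & ~A_BIT;   /* KVM uses an atomic clear_bit() */
	} else {
		if (!(spte & RX_BITS))
			return false;     /* already access-tracked */
		/*
		 * "Access tracking": stash R/X in high bits and drop them,
		 * roughly like mark_spte_for_access_track(); the next access
		 * faults and restores them, marking the page young again.
		 */
		*sptep = (spte & ~RX_BITS) | ((spte & RX_BITS) << SAVED_SHIFT);
	}
	return true;
}

int main(void)
{
	uint64_t spte = A_BIT | RX_BITS;

	return !age_spte(&spte);
}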
@@ -346,8 +346,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
}

int kvm_mmu_max_mapping_level(struct kvm *kvm,
const struct kvm_memory_slot *slot, gfn_t gfn,
int max_level);
const struct kvm_memory_slot *slot, gfn_t gfn);
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);
@@ -24,6 +24,8 @@ static bool __ro_after_init allow_mmio_caching;
module_param_named(mmio_caching, enable_mmio_caching, bool, 0444);
EXPORT_SYMBOL_GPL(enable_mmio_caching);

bool __read_mostly kvm_ad_enabled;

u64 __read_mostly shadow_host_writable_mask;
u64 __read_mostly shadow_mmu_writable_mask;
u64 __read_mostly shadow_nx_mask;

@@ -133,12 +135,6 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
*/
bool spte_has_volatile_bits(u64 spte)
{
/*
* Always atomically update spte if it can be updated
* out of mmu-lock, it can ensure dirty bit is not lost,
* also, it can help us to get a stable is_writable_pte()
* to ensure tlb flush is not missed.
*/
if (!is_writable_pte(spte) && is_mmu_writable_spte(spte))
return true;

@@ -179,7 +175,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,

spte |= shadow_present_mask;
if (!prefetch || synchronizing)
spte |= spte_shadow_accessed_mask(spte);
spte |= shadow_accessed_mask;

/*
* For simplicity, enforce the NX huge page mitigation even if not

@@ -223,42 +219,27 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
spte |= (u64)pfn << PAGE_SHIFT;

if (pte_access & ACC_WRITE_MASK) {
spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask;

/*
* When overwriting an existing leaf SPTE, and the old SPTE was
* writable, skip trying to unsync shadow pages as any relevant
* shadow pages must already be unsync, i.e. the hash lookup is
* unnecessary (and expensive).
*
* The same reasoning applies to dirty page/folio accounting;
* KVM marked the folio dirty when the old SPTE was created,
* thus there's no need to mark the folio dirty again.
*
* Note, both cases rely on KVM not changing PFNs without first
* zapping the old SPTE, which is guaranteed by both the shadow
* MMU and the TDP MMU.
*/
if (is_last_spte(old_spte, level) && is_writable_pte(old_spte))
goto out;

/*
* Unsync shadow pages that are reachable by the new, writable
* SPTE. Write-protect the SPTE if the page can't be unsync'd,
* e.g. it's write-tracked (upper-level SPs) or has one or more
* shadow pages and unsync'ing pages is not allowed.
*
* When overwriting an existing leaf SPTE, and the old SPTE was
* writable, skip trying to unsync shadow pages as any relevant
* shadow pages must already be unsync, i.e. the hash lookup is
* unnecessary (and expensive). Note, this relies on KVM not
* changing PFNs without first zapping the old SPTE, which is
* guaranteed by both the shadow MMU and the TDP MMU.
*/
if (mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, synchronizing, prefetch)) {
if ((!is_last_spte(old_spte, level) || !is_writable_pte(old_spte)) &&
mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, synchronizing, prefetch))
wrprot = true;
pte_access &= ~ACC_WRITE_MASK;
spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
}
else
spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask |
shadow_dirty_mask;
}

if (pte_access & ACC_WRITE_MASK)
spte |= spte_shadow_dirty_mask(spte);

out:
if (prefetch && !synchronizing)
spte = mark_spte_for_access_track(spte);
@@ -281,15 +262,15 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
return wrprot;
}

static u64 make_spte_executable(u64 spte)
static u64 modify_spte_protections(u64 spte, u64 set, u64 clear)
{
bool is_access_track = is_access_track_spte(spte);

if (is_access_track)
spte = restore_acc_track_spte(spte);

spte &= ~shadow_nx_mask;
spte |= shadow_x_mask;
KVM_MMU_WARN_ON(set & clear);
spte = (spte | set) & ~clear;

if (is_access_track)
spte = mark_spte_for_access_track(spte);

@@ -297,6 +278,16 @@ static u64 make_spte_executable(u64 spte)
return spte;
}

static u64 make_spte_executable(u64 spte)
{
return modify_spte_protections(spte, shadow_x_mask, shadow_nx_mask);
}

static u64 make_spte_nonexecutable(u64 spte)
{
return modify_spte_protections(spte, shadow_nx_mask, shadow_x_mask);
}

/*
* Construct an SPTE that maps a sub-page of the given huge page SPTE where
* `index` identifies which sub-page.

@@ -304,8 +295,8 @@ static u64 make_spte_executable(u64 spte)
* This is used during huge page splitting to build the SPTEs that make up the
* new page table.
*/
u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte,
union kvm_mmu_page_role role, int index)
u64 make_small_spte(struct kvm *kvm, u64 huge_spte,
union kvm_mmu_page_role role, int index)
{
u64 child_spte = huge_spte;

@@ -333,6 +324,26 @@ u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte,
return child_spte;
}

u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level)
{
u64 huge_spte;

KVM_BUG_ON(!is_shadow_present_pte(small_spte) || level == PG_LEVEL_4K, kvm);

huge_spte = small_spte | PT_PAGE_SIZE_MASK;

/*
* huge_spte already has the address of the sub-page being collapsed
* from small_spte, so just clear the lower address bits to create the
* huge page address.
*/
huge_spte &= KVM_HPAGE_MASK(level) | ~PAGE_MASK;

if (is_nx_huge_page_enabled(kvm))
huge_spte = make_spte_nonexecutable(huge_spte);

return huge_spte;
}

u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
{

@@ -365,7 +376,7 @@ u64 mark_spte_for_access_track(u64 spte)

spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) <<
SHADOW_ACC_TRACK_SAVED_BITS_SHIFT;
spte &= ~shadow_acc_track_mask;
spte &= ~(shadow_acc_track_mask | shadow_accessed_mask);

return spte;
}

@@ -435,9 +446,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_me_spte_mask);

void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
{
kvm_ad_enabled = has_ad_bits;

shadow_user_mask = VMX_EPT_READABLE_MASK;
shadow_accessed_mask = has_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull;
shadow_dirty_mask = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull;
shadow_accessed_mask = VMX_EPT_ACCESS_BIT;
shadow_dirty_mask = VMX_EPT_DIRTY_BIT;
shadow_nx_mask = 0ull;
shadow_x_mask = VMX_EPT_EXECUTABLE_MASK;
/* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */

@@ -468,6 +481,8 @@ void kvm_mmu_reset_all_pte_masks(void)
u8 low_phys_bits;
u64 mask;

kvm_ad_enabled = true;

/*
* If the CPU has 46 or less physical address bits, then set an
* appropriate mask to guard against L1TF attacks. Otherwise, it is
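The address math in make_huge_spte() above is the subtle part: keep the low attribute bits and the huge-page-aligned address, and clear only the address bits below the huge-page boundary. Here is a small worked example of that mask arithmetic for a 2MiB page; the constants are local stand-ins, not taken from KVM's headers.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_MASK       (~((1ull << PAGE_SHIFT) - 1))
#define HPAGE_2M_MASK   (~((1ull << 21) - 1))
#define PAGE_SIZE_BIT   (1ull << 7)   /* stand-in for PT_PAGE_SIZE_MASK */

int main(void)
{
	/* An arbitrary 4KiB leaf entry: PFN plus some low attribute bits. */
	uint64_t small_spte = (0x1234567ull << PAGE_SHIFT) | 0x63;
	uint64_t huge_spte = small_spte | PAGE_SIZE_BIT;

	/* Keep attributes (low 12 bits) and the 2MiB-aligned address,
	 * clearing address bits 12-20 so the entry maps the huge page. */
	huge_spte &= HPAGE_2M_MASK | ~PAGE_MASK;

	printf("small PA %#llx -> huge PA %#llx\n",
	       (unsigned long long)(small_spte & PAGE_MASK),
	       (unsigned long long)(huge_spte & PAGE_MASK));
	return 0;
}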
@@ -167,6 +167,15 @@ static_assert(!(SHADOW_NONPRESENT_VALUE & SPTE_MMU_PRESENT_MASK));
#define SHADOW_NONPRESENT_VALUE 0ULL
#endif

/*
* True if A/D bits are supported in hardware and are enabled by KVM. When
* enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can disable
* A/D bits in EPTP12, SP and SPTE variants are needed to handle the scenario
* where KVM is using A/D bits for L1, but not L2.
*/
extern bool __read_mostly kvm_ad_enabled;

extern u64 __read_mostly shadow_host_writable_mask;
extern u64 __read_mostly shadow_mmu_writable_mask;
extern u64 __read_mostly shadow_nx_mask;

@@ -285,17 +294,6 @@ static inline bool is_ept_ve_possible(u64 spte)
(spte & VMX_EPT_RWX_MASK) != VMX_EPT_MISCONFIG_WX_VALUE;
}

/*
* Returns true if A/D bits are supported in hardware and are enabled by KVM.
* When enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can
* disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the
* scenario where KVM is using A/D bits for L1, but not L2.
*/
static inline bool kvm_ad_enabled(void)
{
return !!shadow_accessed_mask;
}

static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
{
return sp->role.ad_disabled;

@@ -318,18 +316,6 @@ static inline bool spte_ad_need_write_protect(u64 spte)
return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED;
}

static inline u64 spte_shadow_accessed_mask(u64 spte)
{
KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
}

static inline u64 spte_shadow_dirty_mask(u64 spte)
{
KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
}

static inline bool is_access_track_spte(u64 spte)
{
return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;

@@ -357,17 +343,7 @@ static inline kvm_pfn_t spte_to_pfn(u64 pte)

static inline bool is_accessed_spte(u64 spte)
{
u64 accessed_mask = spte_shadow_accessed_mask(spte);

return accessed_mask ? spte & accessed_mask
: !is_access_track_spte(spte);
}

static inline bool is_dirty_spte(u64 spte)
{
u64 dirty_mask = spte_shadow_dirty_mask(spte);

return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
return spte & shadow_accessed_mask;
}

static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte,

@@ -485,6 +461,33 @@ static inline bool is_mmu_writable_spte(u64 spte)
return spte & shadow_mmu_writable_mask;
}

/*
* If the MMU-writable flag is cleared, i.e. the SPTE is write-protected for
* write-tracking, remote TLBs must be flushed, even if the SPTE was read-only,
* as KVM allows stale Writable TLB entries to exist. When dirty logging, KVM
* flushes TLBs based on whether or not dirty bitmap/ring entries were reaped,
* not whether or not SPTEs were modified, i.e. only the write-tracking case
* needs to flush at the time the SPTEs is modified, before dropping mmu_lock.
*
* Don't flush if the Accessed bit is cleared, as access tracking tolerates
* false negatives, e.g. KVM x86 omits TLB flushes even when aging SPTEs for a
* mmu_notifier.clear_flush_young() event.
*
* Lastly, don't flush if the Dirty bit is cleared, as KVM unconditionally
* flushes when enabling dirty logging (see kvm_mmu_slot_apply_flags()), and
* when clearing dirty logs, KVM flushes based on whether or not dirty entries
* were reaped from the bitmap/ring, not whether or not dirty SPTEs were found.
*
* Note, this logic only applies to shadow-present leaf SPTEs. The caller is
* responsible for checking that the old SPTE is shadow-present, and is also
* responsible for determining whether or not a TLB flush is required when
* modifying a shadow-present non-leaf SPTE.
*/
static inline bool leaf_spte_change_needs_tlb_flush(u64 old_spte, u64 new_spte)
{
return is_mmu_writable_spte(old_spte) && !is_mmu_writable_spte(new_spte);
}

static inline u64 get_mmio_spte_generation(u64 spte)
{
u64 gen;

@@ -501,8 +504,9 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
u64 old_spte, bool prefetch, bool synchronizing,
bool host_writable, u64 *new_spte);
u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte,
union kvm_mmu_page_role role, int index);
u64 make_small_spte(struct kvm *kvm, u64 huge_spte,
union kvm_mmu_page_role role, int index);
u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level);
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled);
u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access);
u64 mark_spte_for_access_track(u64 spte);
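The long comment above condenses the new flush policy into a single rule: only losing MMU-writability forces an immediate remote flush. A tiny self-contained test of that rule, with illustrative bit positions rather than KVM's real masks, looks like this.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define MMU_WRITABLE (1ull << 58)   /* illustrative positions only */
#define ACCESSED     (1ull << 8)
#define DIRTY        (1ull << 9)

/* Same rule as leaf_spte_change_needs_tlb_flush() above. */
static bool needs_flush(uint64_t old_spte, uint64_t new_spte)
{
	return (old_spte & MMU_WRITABLE) && !(new_spte & MMU_WRITABLE);
}

int main(void)
{
	uint64_t spte = MMU_WRITABLE | ACCESSED | DIRTY;

	/* Write-protecting for write-tracking: stale writable TLB entries may exist, so flush. */
	assert(needs_flush(spte, spte & ~MMU_WRITABLE));
	/* Clearing Accessed (aging) or Dirty (dirty logging): no immediate flush required. */
	assert(!needs_flush(spte, spte & ~ACCESSED));
	assert(!needs_flush(spte, spte & ~DIRTY));
	return 0;
}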
@@ -583,48 +583,6 @@ static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
return 0;
}

static inline int __must_check tdp_mmu_zap_spte_atomic(struct kvm *kvm,
struct tdp_iter *iter)
{
int ret;

lockdep_assert_held_read(&kvm->mmu_lock);

/*
* Freeze the SPTE by setting it to a special, non-present value. This
* will stop other threads from immediately installing a present entry
* in its place before the TLBs are flushed.
*
* Delay processing of the zapped SPTE until after TLBs are flushed and
* the FROZEN_SPTE is replaced (see below).
*/
ret = __tdp_mmu_set_spte_atomic(iter, FROZEN_SPTE);
if (ret)
return ret;

kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level);

/*
* No other thread can overwrite the frozen SPTE as they must either
* wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
* overwrite the special frozen SPTE value. Use the raw write helper to
* avoid an unnecessary check on volatile bits.
*/
__kvm_tdp_mmu_write_spte(iter->sptep, SHADOW_NONPRESENT_VALUE);

/*
* Process the zapped SPTE after flushing TLBs, and after replacing
* FROZEN_SPTE with 0. This minimizes the amount of time vCPUs are
* blocked by the FROZEN_SPTE and reduces contention on the child
* SPTEs.
*/
handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
SHADOW_NONPRESENT_VALUE, iter->level, true);

return 0;
}

/*
* tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
* @kvm: KVM instance

@@ -680,6 +638,16 @@ static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end)

static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm,
struct tdp_iter *iter)
{
if (!need_resched() && !rwlock_needbreak(&kvm->mmu_lock))
return false;

/* Ensure forward progress has been made before yielding. */
return iter->next_last_level_gfn != iter->yielded_gfn;
}

/*
* Yield if the MMU lock is contended or this thread needs to return control
* to the scheduler.

@@ -698,31 +666,27 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
struct tdp_iter *iter,
bool flush, bool shared)
{
WARN_ON_ONCE(iter->yielded);
KVM_MMU_WARN_ON(iter->yielded);

/* Ensure forward progress has been made before yielding. */
if (iter->next_last_level_gfn == iter->yielded_gfn)
if (!tdp_mmu_iter_need_resched(kvm, iter))
return false;

if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
if (flush)
kvm_flush_remote_tlbs(kvm);
if (flush)
kvm_flush_remote_tlbs(kvm);

rcu_read_unlock();
rcu_read_unlock();

if (shared)
cond_resched_rwlock_read(&kvm->mmu_lock);
else
cond_resched_rwlock_write(&kvm->mmu_lock);
if (shared)
cond_resched_rwlock_read(&kvm->mmu_lock);
else
cond_resched_rwlock_write(&kvm->mmu_lock);

rcu_read_lock();
rcu_read_lock();

WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);
WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);

iter->yielded = true;
}

return iter->yielded;
iter->yielded = true;
return true;
}

static inline gfn_t tdp_mmu_max_gfn_exclusive(void)

@@ -1033,7 +997,8 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
return RET_PF_RETRY;
else if (is_shadow_present_pte(iter->old_spte) &&
!is_last_spte(iter->old_spte, iter->level))
(!is_last_spte(iter->old_spte, iter->level) ||
WARN_ON_ONCE(leaf_spte_change_needs_tlb_flush(iter->old_spte, new_spte))))
kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);

/*

@@ -1073,7 +1038,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
struct kvm_mmu_page *sp, bool shared)
{
u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled);
int ret = 0;

if (shared) {

@@ -1190,33 +1155,6 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
return flush;
}

typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
struct kvm_gfn_range *range);

static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
struct kvm_gfn_range *range,
tdp_handler_t handler)
{
struct kvm_mmu_page *root;
struct tdp_iter iter;
bool ret = false;

/*
* Don't support rescheduling, none of the MMU notifiers that funnel
* into this helper allow blocking; it'd be dead, wasteful code.
*/
for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
rcu_read_lock();

tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
ret |= handler(kvm, &iter, range);

rcu_read_unlock();
}

return ret;
}

/*
* Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
* if any of the GFNs in the range have been accessed.

@@ -1225,15 +1163,10 @@ static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
* from the clear_young() or clear_flush_young() notifier, which uses the
* return value to determine if the page has been accessed.
*/
static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
struct kvm_gfn_range *range)
static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter)
{
u64 new_spte;

/* If we have a non-accessed entry we don't need to change the pte. */
if (!is_accessed_spte(iter->old_spte))
return false;

if (spte_ad_enabled(iter->old_spte)) {
iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
iter->old_spte,
@@ -1249,23 +1182,48 @@ static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,

trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
iter->old_spte, new_spte);
return true;
}

static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
struct kvm_gfn_range *range,
bool test_only)
{
struct kvm_mmu_page *root;
struct tdp_iter iter;
bool ret = false;

/*
* Don't support rescheduling, none of the MMU notifiers that funnel
* into this helper allow blocking; it'd be dead, wasteful code. Note,
* this helper must NOT be used to unmap GFNs, as it processes only
* valid roots!
*/
for_each_valid_tdp_mmu_root(kvm, root, range->slot->as_id) {
guard(rcu)();

tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) {
if (!is_accessed_spte(iter.old_spte))
continue;

if (test_only)
return true;

ret = true;
kvm_tdp_mmu_age_spte(&iter);
}
}

return ret;
}

bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
}

static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
struct kvm_gfn_range *range)
{
return is_accessed_spte(iter->old_spte);
return __kvm_tdp_mmu_age_gfn_range(kvm, range, false);
}

bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
return __kvm_tdp_mmu_age_gfn_range(kvm, range, true);
}

/*

@@ -1356,7 +1314,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
* not been linked in yet and thus is not reachable from any other CPU.
*/
for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);
sp->spt[i] = make_small_spte(kvm, huge_spte, sp->role, i);

/*
* Replace the huge spte with a pointer to the populated lower level

@@ -1489,16 +1447,15 @@ static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp)
* from level, so it is valid to key off any shadow page to determine if
* write protection is needed for an entire tree.
*/
return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled();
return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled;
}

static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end)
static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end)
{
const u64 dbit = tdp_mmu_need_write_protect(root) ? PT_WRITABLE_MASK :
shadow_dirty_mask;
struct tdp_iter iter;
bool spte_set = false;

rcu_read_lock();

@@ -1519,31 +1476,24 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,

if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
goto retry;

spte_set = true;
}

rcu_read_unlock();
return spte_set;
}

/*
* Clear the dirty status (D-bit or W-bit) of all the SPTEs mapping GFNs in the
* memslot. Returns true if an SPTE has been changed and the TLBs need to be
* flushed.
* memslot.
*/
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
const struct kvm_memory_slot *slot)
{
struct kvm_mmu_page *root;
bool spte_set = false;

lockdep_assert_held_read(&kvm->mmu_lock);
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages);

return spte_set;
clear_dirty_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages);
}

static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
@@ -1602,21 +1552,55 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}

static void zap_collapsible_spte_range(struct kvm *kvm,
struct kvm_mmu_page *root,
const struct kvm_memory_slot *slot)
static int tdp_mmu_make_huge_spte(struct kvm *kvm,
struct tdp_iter *parent,
u64 *huge_spte)
{
struct kvm_mmu_page *root = spte_to_child_sp(parent->old_spte);
gfn_t start = parent->gfn;
gfn_t end = start + KVM_PAGES_PER_HPAGE(parent->level);
struct tdp_iter iter;

tdp_root_for_each_leaf_pte(iter, root, start, end) {
/*
* Use the parent iterator when checking for forward progress so
* that KVM doesn't get stuck continuously trying to yield (i.e.
* returning -EAGAIN here and then failing the forward progress
* check in the caller ad nauseam).
*/
if (tdp_mmu_iter_need_resched(kvm, parent))
return -EAGAIN;

*huge_spte = make_huge_spte(kvm, iter.old_spte, parent->level);
return 0;
}

return -ENOENT;
}

static void recover_huge_pages_range(struct kvm *kvm,
struct kvm_mmu_page *root,
const struct kvm_memory_slot *slot)
{
gfn_t start = slot->base_gfn;
gfn_t end = start + slot->npages;
struct tdp_iter iter;
int max_mapping_level;
bool flush = false;
u64 huge_spte;
int r;

if (WARN_ON_ONCE(kvm_slot_dirty_track_enabled(slot)))
return;

rcu_read_lock();

for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
flush = false;
continue;
}

if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
!is_shadow_present_pte(iter.old_spte))

@@ -1640,31 +1624,40 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
if (iter.gfn < start || iter.gfn >= end)
continue;

max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
iter.gfn, PG_LEVEL_NUM);
max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn);
if (max_mapping_level < iter.level)
continue;

/* Note, a successful atomic zap also does a remote TLB flush. */
if (tdp_mmu_zap_spte_atomic(kvm, &iter))
r = tdp_mmu_make_huge_spte(kvm, &iter, &huge_spte);
if (r == -EAGAIN)
goto retry;
else if (r)
continue;

if (tdp_mmu_set_spte_atomic(kvm, &iter, huge_spte))
goto retry;

flush = true;
}

if (flush)
kvm_flush_remote_tlbs_memslot(kvm, slot);

rcu_read_unlock();
}

/*
* Zap non-leaf SPTEs (and free their associated page tables) which could
* be replaced by huge pages, for GFNs within the slot.
* Recover huge page mappings within the slot by replacing non-leaf SPTEs with
* huge SPTEs where possible.
*/
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *slot)
void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm,
const struct kvm_memory_slot *slot)
{
struct kvm_mmu_page *root;

lockdep_assert_held_read(&kvm->mmu_lock);
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
zap_collapsible_spte_range(kvm, root, slot);
recover_huge_pages_range(kvm, root, slot);
}

/*
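The flush handling in recover_huge_pages_range() above is the batching described in the commit message: instead of flushing per modified entry, a pending flush is remembered and issued only when the walk yields the lock or once at the end. A toy, self-contained model of that pattern (stand-in helpers, invented numbers) follows.

#include <stdbool.h>
#include <stdio.h>

static void flush_tlbs(void) { puts("flush"); }

int main(void)
{
	bool flush = false;

	for (int i = 0; i < 1000; i++) {
		bool modified = !(i % 100);        /* pretend ~1% of entries change  */
		bool must_yield = i && !(i % 400); /* pretend the lock is contended  */

		if (must_yield) {
			if (flush)
				flush_tlbs();      /* flush before dropping the lock */
			flush = false;             /* start a new batch afterwards   */
		}
		if (modified)
			flush = true;
	}
	if (flush)
		flush_tlbs();                      /* one final flush for the batch  */
	return 0;
}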
@@ -34,14 +34,14 @@ bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);

bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
const struct kvm_memory_slot *slot, int min_level);
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
const struct kvm_memory_slot *slot);
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn, unsigned long mask,
bool wrprot);
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *slot);
void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm,
const struct kvm_memory_slot *slot);

bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn,
@@ -13104,19 +13104,15 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,

if (!log_dirty_pages) {
/*
* Dirty logging tracks sptes in 4k granularity, meaning that
* large sptes have to be split. If live migration succeeds,
* the guest in the source machine will be destroyed and large
* sptes will be created in the destination. However, if the
* guest continues to run in the source machine (for example if
* live migration fails), small sptes will remain around and
* cause bad performance.
* Recover huge page mappings in the slot now that dirty logging
* is disabled, i.e. now that KVM does not have to track guest
* writes at 4KiB granularity.
*
* Scan sptes if dirty logging has been stopped, dropping those
* which can be collapsed into a single large-page spte. Later
* page faults will create the large-page sptes.
* Dirty logging might be disabled by userspace if an ongoing VM
* live migration is cancelled and the VM must continue running
* on the source.
*/
kvm_mmu_zap_collapsible_sptes(kvm, new);
kvm_mmu_recover_huge_pages(kvm, new);
} else {
/*
* Initially-all-set does not require write protecting any page,
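The shape of the decision in kvm_mmu_slot_apply_flags() above, reduced to a standalone sketch; the two helpers are stand-ins for kvm_mmu_recover_huge_pages() and the dirty-logging setup path, not the real functions.

#include <stdbool.h>
#include <stdio.h>

static void recover_huge_pages(void)     { puts("recover huge pages"); }
static void start_tracking_writes(void)  { puts("start tracking writes"); }

static void apply_dirty_logging_change(bool log_dirty_pages)
{
	if (!log_dirty_pages)
		recover_huge_pages();     /* e.g. a live migration was cancelled */
	else
		start_tracking_writes();
}

int main(void)
{
	apply_dirty_logging_change(false);
	return 0;
}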
@@ -100,6 +100,10 @@ config KVM_GENERIC_MMU_NOTIFIER
select MMU_NOTIFIER
bool

config KVM_ELIDE_TLB_FLUSH_IF_YOUNG
depends on KVM_GENERIC_MMU_NOTIFIER
bool

config KVM_GENERIC_MEMORY_ATTRIBUTES
depends on KVM_GENERIC_MMU_NOTIFIER
bool
@@ -631,7 +631,8 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm,
static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
unsigned long start,
unsigned long end,
gfn_handler_t handler)
gfn_handler_t handler,
bool flush_on_ret)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
const struct kvm_mmu_notifier_range range = {

@@ -639,7 +640,7 @@ static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
.end = end,
.handler = handler,
.on_lock = (void *)kvm_null_fn,
.flush_on_ret = true,
.flush_on_ret = flush_on_ret,
.may_block = false,
};

@@ -651,17 +652,7 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn
unsigned long end,
gfn_handler_t handler)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
const struct kvm_mmu_notifier_range range = {
.start = start,
.end = end,
.handler = handler,
.on_lock = (void *)kvm_null_fn,
.flush_on_ret = false,
.may_block = false,
};

return __kvm_handle_hva_range(kvm, &range).ret;
return kvm_handle_hva_range(mn, start, end, handler, false);
}

void kvm_mmu_invalidate_begin(struct kvm *kvm)

@@ -826,7 +817,8 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
{
trace_kvm_age_hva(start, end);

return kvm_handle_hva_range(mn, start, end, kvm_age_gfn);
return kvm_handle_hva_range(mn, start, end, kvm_age_gfn,
!IS_ENABLED(CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG));
}

static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
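To tie the generic and x86 pieces together: clear_flush_young() now passes flush_on_ret = false when the architecture selects KVM_ELIDE_TLB_FLUSH_IF_YOUNG (x86 does, per the Kconfig hunk earlier). A minimal stand-in model of that plumbing, not the real mmu_notifier API, is shown below.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for kvm_handle_hva_range(): returns whether any page was young. */
static bool handle_hva_range(bool flush_on_ret)
{
	bool found_young = true;   /* pretend the aging handler found young pages */

	if (found_young && flush_on_ret)
		puts("flush TLBs");
	return found_young;
}

int main(void)
{
	bool elide = true;         /* i.e. CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG=y */

	handle_hva_range(/*flush_on_ret=*/!elide);
	return 0;
}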