kvm: x86: mmu: allow A/D bits to be disabled in an mmu

Adds the plumbing to disable A/D bits in the MMU based on a new role
bit, ad_disabled. When A/D is disabled, the MMU operates as though A/D
aren't available (i.e., using access tracking faults instead).

To avoid SP -> kvm_mmu_page.role.ad_disabled lookups all over the
place, A/D disablement is now stored in the SPTE. This state is stored
in the SPTE by tweaking the use of SPTE_SPECIAL_MASK for access
tracking. Rather than just setting SPTE_SPECIAL_MASK when an
access-tracking SPTE is non-present, we now always set
SPTE_SPECIAL_MASK for access-tracking SPTEs.

Signed-off-by: Peter Feiner <pfeiner@google.com>
[Use role.ad_disabled even for direct (non-shadow) EPT page tables.  Add
 documentation and a few MMU_WARN_ONs. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
commit ac8d57e573
parent dcdca5fed5
Author:    Peter Feiner <pfeiner@google.com>
Date:      2017-06-30 17:26:31 -07:00
Committer: Paolo Bonzini

4 changed files with 95 additions and 33 deletions
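
The key idea above -- recording "this MMU has A/D disabled" in the SPTE itself by always setting SPTE_SPECIAL_MASK on such SPTEs -- can be illustrated with a small standalone sketch. This is not the kernel code: the mask values below are made-up stand-ins for SPTE_SPECIAL_MASK and the shadow_*_mask variables in arch/x86/kvm/mmu.c, and "non-present" stands in for the cleared R/W/X bits that the real shadow_acc_track_mask covers.

/* Hypothetical, self-contained illustration -- not the kernel's definitions. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SPTE_SPECIAL_MASK    (1ULL << 62)  /* stand-in value */
#define SHADOW_PRESENT_MASK  (1ULL << 11)  /* stand-in value */
#define SHADOW_ACCESSED_MASK (1ULL << 8)   /* stand-in value */

static const uint64_t shadow_acc_track_value = SPTE_SPECIAL_MASK;

/* After this patch the special bit is set on *every* SPTE of an A/D-disabled
 * MMU, so the A/D policy can be read back from the SPTE alone. */
static bool spte_ad_enabled(uint64_t spte)
{
        return !(spte & shadow_acc_track_value);
}

/* An access-tracking SPTE is an A/D-disabled SPTE whose permissions have been
 * stripped; "not present" stands in for that here. */
static bool is_access_track_spte(uint64_t spte)
{
        return !spte_ad_enabled(spte) && !(spte & SHADOW_PRESENT_MASK);
}

int main(void)
{
        uint64_t ad_spte    = SHADOW_PRESENT_MASK | SHADOW_ACCESSED_MASK;
        uint64_t nad_spte   = SHADOW_PRESENT_MASK | shadow_acc_track_value;
        uint64_t track_spte = shadow_acc_track_value;   /* non-present */

        printf("A/D-enabled  SPTE: ad=%d track=%d\n",
               spte_ad_enabled(ad_spte), is_access_track_spte(ad_spte));
        printf("A/D-disabled SPTE: ad=%d track=%d\n",
               spte_ad_enabled(nad_spte), is_access_track_spte(nad_spte));
        printf("Access-track SPTE: ad=%d track=%d\n",
               spte_ad_enabled(track_spte), is_access_track_spte(track_spte));
        return 0;
}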

--- a/Documentation/virtual/kvm/mmu.txt
+++ b/Documentation/virtual/kvm/mmu.txt
@@ -179,6 +179,10 @@ Shadow pages contain the following information:
     shadow page; it is also used to go back from a struct kvm_mmu_page
     to a memslot, through the kvm_memslots_for_spte_role macro and
     __gfn_to_memslot.
+  role.ad_disabled:
+    Is 1 if the MMU instance cannot use A/D bits.  EPT did not have A/D
+    bits before Haswell; shadow EPT page tables also cannot use A/D bits
+    if the L1 hypervisor does not enable them.
   gfn:
     Either the guest page table containing the translations shadowed by this
     page, or the base page frame for linear translations.  See role.direct.

--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -257,7 +257,8 @@ union kvm_mmu_page_role {
 		unsigned cr0_wp:1;
 		unsigned smep_andnot_wp:1;
 		unsigned smap_andnot_wp:1;
-		unsigned :8;
+		unsigned ad_disabled:1;
+		unsigned :7;
 
 		/*
 		 * This is left at the top of the word so that
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -187,10 +187,9 @@ static u64 __read_mostly shadow_mmio_value;
 static u64 __read_mostly shadow_present_mask;
 
 /*
- * The mask/value to distinguish a PTE that has been marked not-present for
- * access tracking purposes.
- * The mask would be either 0 if access tracking is disabled, or
- * SPTE_SPECIAL_MASK|VMX_EPT_RWX_MASK if access tracking is enabled.
+ * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value.
+ * Non-present SPTEs with shadow_acc_track_value set are in place for access
+ * tracking.
  */
 static u64 __read_mostly shadow_acc_track_mask;
 static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK;
@@ -216,10 +215,32 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 
+static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
+{
+	return sp->role.ad_disabled;
+}
+
+static inline bool spte_ad_enabled(u64 spte)
+{
+	MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
+	return !(spte & shadow_acc_track_value);
+}
+
+static inline u64 spte_shadow_accessed_mask(u64 spte)
+{
+	MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
+	return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
+}
+
+static inline u64 spte_shadow_dirty_mask(u64 spte)
+{
+	MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
+	return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
+}
+
 static inline bool is_access_track_spte(u64 spte)
 {
-	/* Always false if shadow_acc_track_mask is zero. */
-	return (spte & shadow_acc_track_mask) == shadow_acc_track_value;
+	return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
 }
 
 /*
@@ -329,10 +350,9 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
 		u64 acc_track_mask)
 {
-	if (acc_track_mask != 0)
-		acc_track_mask |= SPTE_SPECIAL_MASK;
 	BUG_ON(!dirty_mask != !accessed_mask);
 	BUG_ON(!accessed_mask && !acc_track_mask);
+	BUG_ON(acc_track_mask & shadow_acc_track_value);
 
 	shadow_user_mask = user_mask;
 	shadow_accessed_mask = accessed_mask;
@@ -341,7 +361,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 	shadow_x_mask = x_mask;
 	shadow_present_mask = p_mask;
 	shadow_acc_track_mask = acc_track_mask;
-	WARN_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0);
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
@@ -561,7 +580,7 @@ static bool spte_has_volatile_bits(u64 spte)
 	    is_access_track_spte(spte))
 		return true;
 
-	if (shadow_accessed_mask) {
+	if (spte_ad_enabled(spte)) {
 		if ((spte & shadow_accessed_mask) == 0 ||
 		    (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
 			return true;
@@ -572,14 +591,17 @@ static bool spte_has_volatile_bits(u64 spte)
 
 static bool is_accessed_spte(u64 spte)
 {
-	return shadow_accessed_mask ? spte & shadow_accessed_mask
-				    : !is_access_track_spte(spte);
+	u64 accessed_mask = spte_shadow_accessed_mask(spte);
+
+	return accessed_mask ? spte & accessed_mask
+			     : !is_access_track_spte(spte);
 }
 
 static bool is_dirty_spte(u64 spte)
 {
-	return shadow_dirty_mask ? spte & shadow_dirty_mask
-				 : spte & PT_WRITABLE_MASK;
+	u64 dirty_mask = spte_shadow_dirty_mask(spte);
+
+	return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
 }
 
 /* Rules for using mmu_spte_set:
@@ -719,10 +741,10 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
 
 static u64 mark_spte_for_access_track(u64 spte)
 {
-	if (shadow_accessed_mask != 0)
+	if (spte_ad_enabled(spte))
 		return spte & ~shadow_accessed_mask;
 
-	if (shadow_acc_track_mask == 0 || is_access_track_spte(spte))
+	if (is_access_track_spte(spte))
 		return spte;
 
 	/*
@@ -741,7 +763,6 @@ static u64 mark_spte_for_access_track(u64 spte)
 	spte |= (spte & shadow_acc_track_saved_bits_mask) <<
 		shadow_acc_track_saved_bits_shift;
 	spte &= ~shadow_acc_track_mask;
-	spte |= shadow_acc_track_value;
 
 	return spte;
 }
@@ -753,6 +774,7 @@ static u64 restore_acc_track_spte(u64 spte)
 	u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
 			 & shadow_acc_track_saved_bits_mask;
 
+	WARN_ON_ONCE(spte_ad_enabled(spte));
 	WARN_ON_ONCE(!is_access_track_spte(spte));
 
 	new_spte &= ~shadow_acc_track_mask;
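
Two details in the access-tracking hunks above are worth spelling out: mark_spte_for_access_track() parks the permission bits in a software-available area of the SPTE and clears them so the next access faults, and the patch drops the explicit "spte |= shadow_acc_track_value" because every SPTE of an A/D-disabled MMU now carries that bit already. A standalone sketch of the save/restore round trip follows; the shift and mask constants are stand-ins loosely modelled on the EPT case (R/W/X in bits 0-2), not the kernel's definitions.

/* Hedged sketch of the access-tracking save/restore dance; stand-in values. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define ACC_TRACK_SAVED_BITS_MASK  0x7ULL  /* R/W/X */
#define ACC_TRACK_SAVED_BITS_SHIFT 52      /* software-available bits */
#define ACC_TRACK_MASK             0x7ULL  /* bits cleared to force a fault */

static uint64_t mark_spte_for_access_track(uint64_t spte)
{
        /* Park the permission bits where the CPU ignores them... */
        spte |= (spte & ACC_TRACK_SAVED_BITS_MASK) << ACC_TRACK_SAVED_BITS_SHIFT;
        /* ...then clear them so the next guest access faults and is noticed. */
        spte &= ~ACC_TRACK_MASK;
        return spte;
}

static uint64_t restore_acc_track_spte(uint64_t spte)
{
        uint64_t saved = (spte >> ACC_TRACK_SAVED_BITS_SHIFT)
                         & ACC_TRACK_SAVED_BITS_MASK;

        /* Drop the parked copy and put the permissions back in place. */
        spte &= ~(ACC_TRACK_SAVED_BITS_MASK << ACC_TRACK_SAVED_BITS_SHIFT);
        return spte | saved;
}

int main(void)
{
        uint64_t pfn_bits = 0x123ULL << 12;
        uint64_t spte = pfn_bits | 0x7;               /* present, R/W/X */
        uint64_t tracked = mark_spte_for_access_track(spte);

        assert((tracked & 0x7) == 0);                 /* faults on next access */
        assert(restore_acc_track_spte(tracked) == spte);
        printf("spte %#llx -> tracked %#llx -> restored ok\n",
               (unsigned long long)spte, (unsigned long long)tracked);
        return 0;
}
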
@@ -771,7 +793,7 @@ static bool mmu_spte_age(u64 *sptep)
 	if (!is_accessed_spte(spte))
 		return false;
 
-	if (shadow_accessed_mask) {
+	if (spte_ad_enabled(spte)) {
 		clear_bit((ffs(shadow_accessed_mask) - 1),
 			  (unsigned long *)sptep);
 	} else {
@@ -1402,6 +1424,22 @@ static bool spte_clear_dirty(u64 *sptep)
 	return mmu_spte_update(sptep, spte);
 }
 
+static bool wrprot_ad_disabled_spte(u64 *sptep)
+{
+	bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
+					       (unsigned long *)sptep);
+	if (was_writable)
+		kvm_set_pfn_dirty(spte_to_pfn(*sptep));
+
+	return was_writable;
+}
+
+/*
+ * Gets the GFN ready for another round of dirty logging by clearing the
+ *	- D bit on ad-enabled SPTEs, and
+ *	- W bit on ad-disabled SPTEs.
+ * Returns true iff any D or W bits were cleared.
+ */
 static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
 {
 	u64 *sptep;
@@ -1409,7 +1447,10 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
 	bool flush = false;
 
 	for_each_rmap_spte(rmap_head, &iter, sptep)
-		flush |= spte_clear_dirty(sptep);
+		if (spte_ad_enabled(*sptep))
+			flush |= spte_clear_dirty(sptep);
+		else
+			flush |= wrprot_ad_disabled_spte(sptep);
 
 	return flush;
 }
@@ -1432,7 +1473,8 @@ static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
 	bool flush = false;
 
 	for_each_rmap_spte(rmap_head, &iter, sptep)
-		flush |= spte_set_dirty(sptep);
+		if (spte_ad_enabled(*sptep))
+			flush |= spte_set_dirty(sptep);
 
 	return flush;
 }
@@ -1464,7 +1506,8 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
 }
 
 /**
- * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages
+ * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
+ *	protect the page if the D-bit isn't supported.
  * @kvm: kvm instance
  * @slot: slot to clear D-bit
  * @gfn_offset: start of the BITS_PER_LONG pages we care about
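
The new comment above __rmap_clear_dirty() sums up the dirty-logging policy: with A/D bits, clear the D bit; without them, drop the W bit so the next write faults and gets logged. A self-contained sketch of that branch follows; the helper name clear_dirty_for_logging and the mask values are invented stand-ins, not kernel symbols.

/* Hedged sketch of the dirty-logging policy; stand-in masks and names. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SPTE_AD_DISABLED (1ULL << 62)  /* stand-in for shadow_acc_track_value */
#define SPTE_WRITABLE    (1ULL << 1)   /* stand-in for PT_WRITABLE_MASK */
#define SPTE_DIRTY       (1ULL << 9)   /* stand-in for shadow_dirty_mask */

static bool spte_ad_enabled(uint64_t spte)
{
        return !(spte & SPTE_AD_DISABLED);
}

/* Returns true if the SPTE changed and a TLB flush is needed. */
static bool clear_dirty_for_logging(uint64_t *sptep)
{
        if (spte_ad_enabled(*sptep)) {
                /* Hardware D bit available: just clear it. */
                if (!(*sptep & SPTE_DIRTY))
                        return false;
                *sptep &= ~SPTE_DIRTY;
                return true;
        }
        /* No D bit: write-protect so the next write faults and is logged. */
        if (!(*sptep & SPTE_WRITABLE))
                return false;
        *sptep &= ~SPTE_WRITABLE;
        return true;
}

int main(void)
{
        uint64_t ad  = SPTE_WRITABLE | SPTE_DIRTY;
        uint64_t nad = SPTE_AD_DISABLED | SPTE_WRITABLE;

        printf("ad-enabled:  flush=%d dirty=%d\n",
               clear_dirty_for_logging(&ad), !!(ad & SPTE_DIRTY));
        printf("ad-disabled: flush=%d writable=%d\n",
               clear_dirty_for_logging(&nad), !!(nad & SPTE_WRITABLE));
        return 0;
}
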
@@ -2389,7 +2432,12 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
 	BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
 
 	spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
-	       shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
+	       shadow_user_mask | shadow_x_mask;
+
+	if (sp_ad_disabled(sp))
+		spte |= shadow_acc_track_value;
+	else
+		spte |= shadow_accessed_mask;
 
 	mmu_spte_set(sptep, spte);
 
@@ -2657,10 +2705,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 {
 	u64 spte = 0;
 	int ret = 0;
+	struct kvm_mmu_page *sp;
 
 	if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
 		return 0;
 
+	sp = page_header(__pa(sptep));
+	if (sp_ad_disabled(sp))
+		spte |= shadow_acc_track_value;
+
 	/*
 	 * For the EPT case, shadow_present_mask is 0 if hardware
 	 * supports exec-only page table entries.  In that case,
@@ -2669,7 +2722,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	 */
 	spte |= shadow_present_mask;
 	if (!speculative)
-		spte |= shadow_accessed_mask;
+		spte |= spte_shadow_accessed_mask(spte);
 
 	if (pte_access & ACC_EXEC_MASK)
 		spte |= shadow_x_mask;
@@ -2726,7 +2779,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
 	if (pte_access & ACC_WRITE_MASK) {
 		kvm_vcpu_mark_page_dirty(vcpu, gfn);
-		spte |= spte_shadow_dirty_mask(spte);
+		spte |= spte_shadow_dirty_mask(spte);
 	}
 
 	if (speculative)
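
Both link_shadow_page() and set_spte() above follow the same pattern: a new SPTE first inherits the A/D-disabled marker from its shadow page's role, after which the per-SPTE derived masks quietly become no-ops when A/D is off. A hedged standalone sketch of that flow follows; the names make_leaf_spte and the *_sketch types are invented for the illustration, and the mask values are stand-ins.

/* Hedged sketch of how a new leaf SPTE picks up its A/D state from the
 * shadow page's role; simplified stand-in names and values. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SPTE_AD_DISABLED (1ULL << 62)  /* stand-in for shadow_acc_track_value */
#define SPTE_PRESENT     (1ULL << 11)
#define SPTE_ACCESSED    (1ULL << 8)
#define SPTE_DIRTY       (1ULL << 9)

struct mmu_page_role_sketch { unsigned ad_disabled:1; };
struct mmu_page_sketch      { struct mmu_page_role_sketch role; };

static uint64_t spte_shadow_accessed_mask(uint64_t spte)
{
        return (spte & SPTE_AD_DISABLED) ? 0 : SPTE_ACCESSED;
}

static uint64_t spte_shadow_dirty_mask(uint64_t spte)
{
        return (spte & SPTE_AD_DISABLED) ? 0 : SPTE_DIRTY;
}

static uint64_t make_leaf_spte(const struct mmu_page_sketch *sp, bool write)
{
        uint64_t spte = 0;

        /* First copy the page's A/D policy into the SPTE itself... */
        if (sp->role.ad_disabled)
                spte |= SPTE_AD_DISABLED;

        spte |= SPTE_PRESENT;
        /* ...so the derived masks become no-ops when A/D is off. */
        spte |= spte_shadow_accessed_mask(spte);
        if (write)
                spte |= spte_shadow_dirty_mask(spte);
        return spte;
}

int main(void)
{
        struct mmu_page_sketch ad = { { 0 } }, nad = { { 1 } };

        printf("ad-enabled spte:  %#llx\n",
               (unsigned long long)make_leaf_spte(&ad, true));
        printf("ad-disabled spte: %#llx\n",
               (unsigned long long)make_leaf_spte(&nad, true));
        return 0;
}
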
@@ -2868,16 +2921,16 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
 {
 	struct kvm_mmu_page *sp;
 
+	sp = page_header(__pa(sptep));
+
 	/*
-	 * Since it's no accessed bit on EPT, it's no way to
-	 * distinguish between actually accessed translations
-	 * and prefetched, so disable pte prefetch if EPT is
-	 * enabled.
+	 * Without accessed bits, there's no way to distinguish between
+	 * actually accessed translations and prefetched, so disable pte
+	 * prefetch if accessed bits aren't available.
 	 */
-	if (!shadow_accessed_mask)
+	if (sp_ad_disabled(sp))
 		return;
 
-	sp = page_header(__pa(sptep));
 	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
 		return;
 
@@ -4278,6 +4331,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 
 	context->base_role.word = 0;
 	context->base_role.smm = is_smm(vcpu);
+	context->base_role.ad_disabled = (shadow_accessed_mask == 0);
 	context->page_fault = tdp_page_fault;
 	context->sync_page = nonpaging_sync_page;
 	context->invlpg = nonpaging_invlpg;
@@ -4624,6 +4678,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	mask.smep_andnot_wp = 1;
 	mask.smap_andnot_wp = 1;
 	mask.smm = 1;
+	mask.ad_disabled = 1;
 
 	/*
 	 * If we don't have indirect shadow pages, it means no page is

--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -30,8 +30,9 @@
 									\
 	role.word = __entry->role;					\
 									\
-	trace_seq_printf(p, "sp gen %lx gfn %llx %u%s q%u%s %s%s"	\
-			 " %snxe root %u %s%c", __entry->mmu_valid_gen,	\
+	trace_seq_printf(p, "sp gen %lx gfn %llx l%u%s q%u%s %s%s"	\
+			 " %snxe %sad root %u %s%c",			\
+			 __entry->mmu_valid_gen,			\
 			 __entry->gfn, role.level,			\
 			 role.cr4_pae ? " pae" : "",			\
 			 role.quadrant,					\
@@ -39,6 +40,7 @@
 			 access_str[role.access],			\
 			 role.invalid ? " invalid" : "",		\
 			 role.nxe ? "" : "!",				\
+			 role.ad_disabled ? "!" : "",			\
 			 __entry->root_count,				\
 			 __entry->unsync ? "unsync" : "sync", 0);	\
 	saved_ptr;							\