Merge branch 'kvm-prefault' into HEAD

Pre-population has been requested several times to mitigate KVM page faults
during guest boot or after live migration.  It is also required by TDX
before filling in the initial guest memory with measured contents.
Introduce it as a generic API.
This commit is contained in:
Paolo Bonzini 2024-07-12 11:18:45 -04:00
commit f3996d4d79
12 changed files with 394 additions and 26 deletions

View File

@ -6352,6 +6352,61 @@ a single guest_memfd file, but the bound ranges must not overlap).
See KVM_SET_USER_MEMORY_REGION2 for additional details.
4.143 KVM_PRE_FAULT_MEMORY
------------------------
:Capability: KVM_CAP_PRE_FAULT_MEMORY
:Architectures: none
:Type: vcpu ioctl
:Parameters: struct kvm_pre_fault_memory (in/out)
:Returns: 0 if at least one page is processed, < 0 on error
Errors:
========== ===============================================================
EINVAL The specified `gpa` and `size` were invalid (e.g. not
page aligned, causes an overflow, or size is zero).
ENOENT The specified `gpa` is outside defined memslots.
EINTR An unmasked signal is pending and no page was processed.
EFAULT The parameter address was invalid.
EOPNOTSUPP Mapping memory for a GPA is unsupported by the
hypervisor, and/or for the current vCPU state/mode.
EIO unexpected error conditions (also causes a WARN)
========== ===============================================================
::
struct kvm_pre_fault_memory {
/* in/out */
__u64 gpa;
__u64 size;
/* in */
__u64 flags;
__u64 padding[5];
};
KVM_PRE_FAULT_MEMORY populates KVM's stage-2 page tables used to map memory
for the current vCPU state. KVM maps memory as if the vCPU generated a
stage-2 read page fault, e.g. faults in memory as needed, but doesn't break
CoW. However, KVM does not mark any newly created stage-2 PTE as Accessed.
In some cases, multiple vCPUs might share the page tables. In this
case, the ioctl can be called in parallel.
When the ioctl returns, the input values are updated to point to the
remaining range. If `size` > 0 on return, the caller can just issue
the ioctl again with the same `struct kvm_pre_fault_memory` argument.
Shadow page tables cannot support this ioctl because they
are indexed by virtual address or nested guest physical address.
Calling this ioctl when the guest is using shadow page tables (for
example because it is running a nested guest with nested page tables)
will fail with `EOPNOTSUPP` even if `KVM_CHECK_EXTENSION` reports
the capability to be present.
`flags` must currently be zero.
5. The kvm_run structure
========================

View File

@ -44,6 +44,7 @@ config KVM
select KVM_VFIO
select HAVE_KVM_PM_NOTIFIER if PM
select KVM_GENERIC_HARDWARE_ENABLING
select KVM_GENERIC_PRE_FAULT_MEMORY
select KVM_WERROR if WERROR
help
Support hosting fully virtualized guest machines using hardware

View File

@ -4291,7 +4291,16 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu))
return;
kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, true, NULL);
r = kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code,
true, NULL, NULL);
/*
* Account fixed page faults, otherwise they'll never be counted, but
* ignore stats for all other return types. Page-ready "faults" aren't
* truly spurious and never trigger emulation.
*/
if (r == RET_PF_FIXED)
vcpu->stat.pf_fixed++;
}
static inline u8 kvm_max_level_for_order(int order)
@ -4700,6 +4709,79 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
return direct_page_fault(vcpu, fault);
}
/*
 * Fault in the mapping for @gpa as a prefetch fault, retrying for as long as
 * the fault handler reports RET_PF_RETRY.
 *
 * @level is handed through to kvm_mmu_do_page_fault(), which stores the
 * fault's goal level in it.
 *
 * Returns 0 when the fault was fixed or was spurious, -EOPNOTSUPP when the
 * vCPU's MMU is not using the TDP page fault handler, -EINTR when a signal
 * is pending, -ENOENT when the result is RET_PF_EMULATE, -EIO (with a
 * one-time WARN) for any other RET_PF_* result, or the negative error
 * returned by the fault handler.
 */
static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
			    u8 *level)
{
	int ret;

	/*
	 * Restrict to TDP page fault, since that's the only case where the MMU
	 * is indexed by GPA.
	 */
	if (vcpu->arch.mmu->page_fault != kvm_tdp_page_fault)
		return -EOPNOTSUPP;

	for (;;) {
		if (signal_pending(current))
			return -EINTR;

		cond_resched();

		ret = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level);
		if (ret != RET_PF_RETRY)
			break;
	}

	if (ret < 0)
		return ret;

	if (ret == RET_PF_FIXED || ret == RET_PF_SPURIOUS)
		return 0;

	if (ret == RET_PF_EMULATE)
		return -ENOENT;

	/* RET_PF_RETRY, RET_PF_CONTINUE, RET_PF_INVALID and anything else. */
	WARN_ONCE(1, "could not fix page fault during prefault");
	return -EIO;
}
/*
 * Pre-fault the mapping that covers range->gpa on behalf of @vcpu.
 *
 * Returns the number of bytes of @range, starting at range->gpa, that are
 * covered by the mapping just established (never more than range->size),
 * or a negative errno on failure.  @range itself is not modified.
 */
long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
				    struct kvm_pre_fault_memory *range)
{
	u64 error_code = PFERR_GUEST_FINAL_MASK;
	u8 level = PG_LEVEL_4K;
	u64 end;
	int r;

	/*
	 * reload is efficient when called repeatedly, so we can do it on
	 * every iteration.  Bail out if it fails: without a valid, loaded
	 * root there is nothing to fault the mapping into.
	 */
	r = kvm_mmu_reload(vcpu);
	if (r)
		return r;

	if (kvm_arch_has_private_mem(vcpu->kvm) &&
	    kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa)))
		error_code |= PFERR_PRIVATE_ACCESS;

	/*
	 * Shadow paging uses GVA for kvm page fault, so restrict to
	 * two-dimensional paging.
	 */
	r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level);
	if (r < 0)
		return r;

	/*
	 * If the mapping that covers range->gpa can use a huge page, it
	 * may start below it or end after range->gpa + range->size.
	 */
	end = (range->gpa & KVM_HPAGE_MASK(level)) + KVM_HPAGE_SIZE(level);
	return min(range->size, end - range->gpa);
}
static void nonpaging_init_context(struct kvm_mmu *context)
{
context->page_fault = nonpaging_page_fault;
@ -5925,14 +6007,24 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
}
if (r == RET_PF_INVALID) {
vcpu->stat.pf_taken++;
r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false,
&emulation_type);
&emulation_type, NULL);
if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
return -EIO;
}
if (r < 0)
return r;
if (r == RET_PF_FIXED)
vcpu->stat.pf_fixed++;
else if (r == RET_PF_EMULATE)
vcpu->stat.pf_emulate++;
else if (r == RET_PF_SPURIOUS)
vcpu->stat.pf_spurious++;
if (r != RET_PF_EMULATE)
return 1;

View File

@ -288,7 +288,8 @@ static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
}
static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
u64 err, bool prefetch, int *emulation_type)
u64 err, bool prefetch,
int *emulation_type, u8 *level)
{
struct kvm_page_fault fault = {
.addr = cr2_or_gpa,
@ -318,14 +319,6 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn);
}
/*
* Async #PF "faults", a.k.a. prefetch faults, are not faults from the
* guest perspective and have already been counted at the time of the
* original fault.
*/
if (!prefetch)
vcpu->stat.pf_taken++;
if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && fault.is_tdp)
r = kvm_tdp_page_fault(vcpu, &fault);
else
@ -344,20 +337,9 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
if (fault.write_fault_to_shadow_pgtable && emulation_type)
*emulation_type |= EMULTYPE_WRITE_PF_TO_SP;
if (level)
*level = fault.goal_level;
/*
* Similar to above, prefetch faults aren't truly spurious, and the
* async #PF path doesn't do emulation. Do count faults that are fixed
* by the async #PF handler though, otherwise they'll never be counted.
*/
if (r == RET_PF_FIXED)
vcpu->stat.pf_fixed++;
else if (prefetch)
;
else if (r == RET_PF_EMULATE)
vcpu->stat.pf_emulate++;
else if (r == RET_PF_SPURIOUS)
vcpu->stat.pf_spurious++;
return r;
}

View File

@ -4705,6 +4705,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_MEMORY_FAULT_INFO:
r = 1;
break;
case KVM_CAP_PRE_FAULT_MEMORY:
r = tdp_enabled;
break;
case KVM_CAP_EXIT_HYPERCALL:
r = KVM_EXIT_HYPERCALL_VALID_MASK;
break;

View File

@ -2477,4 +2477,9 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages
void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
#endif
#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
struct kvm_pre_fault_memory *range);
#endif
#endif

View File

@ -917,6 +917,7 @@ struct kvm_enable_cap {
#define KVM_CAP_MEMORY_ATTRIBUTES 233
#define KVM_CAP_GUEST_MEMFD 234
#define KVM_CAP_VM_TYPES 235
#define KVM_CAP_PRE_FAULT_MEMORY 236
struct kvm_irq_routing_irqchip {
__u32 irqchip;
@ -1548,4 +1549,13 @@ struct kvm_create_guest_memfd {
__u64 reserved[6];
};
#define KVM_PRE_FAULT_MEMORY _IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory)
/* Argument of the KVM_PRE_FAULT_MEMORY vCPU ioctl. */
struct kvm_pre_fault_memory {
/* in/out: start of the range; advanced past the processed part on return */
__u64 gpa;
/* in/out: length of the range in bytes; reduced to what remains on return */
__u64 size;
/* in: must currently be zero */
__u64 flags;
__u64 padding[5];
};
#endif /* __LINUX_KVM_H */

View File

@ -917,6 +917,7 @@ struct kvm_enable_cap {
#define KVM_CAP_MEMORY_ATTRIBUTES 233
#define KVM_CAP_GUEST_MEMFD 234
#define KVM_CAP_VM_TYPES 235
#define KVM_CAP_PRE_FAULT_MEMORY 236
struct kvm_irq_routing_irqchip {
__u32 irqchip;
@ -1221,9 +1222,9 @@ struct kvm_vfio_spapr_tce {
/* Available with KVM_CAP_SPAPR_RESIZE_HPT */
#define KVM_PPC_RESIZE_HPT_PREPARE _IOR(KVMIO, 0xad, struct kvm_ppc_resize_hpt)
#define KVM_PPC_RESIZE_HPT_COMMIT _IOR(KVMIO, 0xae, struct kvm_ppc_resize_hpt)
/* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_MMU_HASH_V3 */
/* Available with KVM_CAP_PPC_MMU_RADIX or KVM_CAP_PPC_MMU_HASH_V3 */
#define KVM_PPC_CONFIGURE_V3_MMU _IOW(KVMIO, 0xaf, struct kvm_ppc_mmuv3_cfg)
/* Available with KVM_CAP_PPC_RADIX_MMU */
/* Available with KVM_CAP_PPC_MMU_RADIX */
#define KVM_PPC_GET_RMMU_INFO _IOW(KVMIO, 0xb0, struct kvm_ppc_rmmu_info)
/* Available with KVM_CAP_PPC_GET_CPU_CHAR */
#define KVM_PPC_GET_CPU_CHAR _IOR(KVMIO, 0xb1, struct kvm_ppc_cpu_char)
@ -1548,4 +1549,13 @@ struct kvm_create_guest_memfd {
__u64 reserved[6];
};
#define KVM_PRE_FAULT_MEMORY _IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory)
/* Argument of the KVM_PRE_FAULT_MEMORY vCPU ioctl. */
struct kvm_pre_fault_memory {
/* in/out: start of the range; advanced past the processed part on return */
__u64 gpa;
/* in/out: length of the range in bytes; reduced to what remains on return */
__u64 size;
/* in: must currently be zero */
__u64 flags;
__u64 padding[5];
};
#endif /* __LINUX_KVM_H */

View File

@ -145,6 +145,7 @@ TEST_GEN_PROGS_x86_64 += set_memory_region_test
TEST_GEN_PROGS_x86_64 += steal_time
TEST_GEN_PROGS_x86_64 += kvm_binary_stats_test
TEST_GEN_PROGS_x86_64 += system_counter_offset_test
TEST_GEN_PROGS_x86_64 += pre_fault_memory_test
# Compiled outputs used by test targets
TEST_GEN_PROGS_EXTENDED_x86_64 += x86_64/nx_huge_pages_test

View File

@ -0,0 +1,146 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2024, Intel, Inc
*
* Author:
* Isaku Yamahata <isaku.yamahata at gmail.com>
*/
#include <linux/sizes.h>
#include <test_util.h>
#include <kvm_util.h>
#include <processor.h>
/* Arbitrarily chosen values */
#define TEST_SIZE (SZ_2M + PAGE_SIZE)
#define TEST_NPAGES (TEST_SIZE / PAGE_SIZE)
#define TEST_SLOT 10
/*
 * Guest entry point: read one 64-bit word from the start of each of the
 * TEST_NPAGES pages beginning at @base_gpa, then issue GUEST_DONE().
 */
static void guest_code(uint64_t base_gpa)
{
	volatile uint64_t val __used;
	int page = 0;

	while (page < TEST_NPAGES) {
		uint64_t addr = base_gpa + page * PAGE_SIZE;

		val = *(uint64_t *)addr;
		page++;
	}

	GUEST_DONE();
}
/*
 * Issue KVM_PRE_FAULT_MEMORY on [@gpa, @gpa + @size) for @vcpu, re-issuing
 * the ioctl with the updated range until either nothing is left or the
 * ioctl fails with an errno other than EINTR.
 *
 * Asserts that exactly @left bytes remain unprocessed at the end.  When
 * @left is zero the final ioctl must have succeeded; otherwise it must have
 * failed with ENOENT.
 */
static void pre_fault_memory(struct kvm_vcpu *vcpu, u64 gpa, u64 size,
			     u64 left)
{
	struct kvm_pre_fault_memory range = {
		.gpa = gpa,
		.size = size,
		.flags = 0,
	};
	u64 prev;
	int ret, save_errno;

	do {
		prev = range.size;
		ret = __vcpu_ioctl(vcpu, KVM_PRE_FAULT_MEMORY, &range);
		/* Capture errno right away, before any later call can change it. */
		save_errno = errno;
		/* range.size must shrink on success and stay untouched on failure. */
		TEST_ASSERT((range.size < prev) ^ (ret < 0),
			    "%sexpecting range.size to change on %s",
			    ret < 0 ? "not " : "",
			    ret < 0 ? "failure" : "success");
	} while (ret >= 0 ? range.size : save_errno == EINTR);

	/* Both values are unsigned, so print them with unsigned conversions. */
	TEST_ASSERT(range.size == left,
		    "Completed with %llu bytes left, expected %" PRIu64,
		    range.size, left);

	if (left == 0)
		__TEST_ASSERT_VM_VCPU_IOCTL(!ret, "KVM_PRE_FAULT_MEMORY", ret, vcpu->vm);
	else
		/* A missing memory slot causes RET_PF_EMULATE, which results in -ENOENT. */
		__TEST_ASSERT_VM_VCPU_IOCTL(ret && save_errno == ENOENT,
					    "KVM_PRE_FAULT_MEMORY", ret, vcpu->vm);
}
/*
 * Create a VM of type @vm_type with one vCPU, add a TEST_NPAGES-page memslot
 * near the top of the guest physical address space (backed by guest_memfd
 * and marked private when @private is set), pre-fault it with
 * KVM_PRE_FAULT_MEMORY, then run the guest, which reads every page, and
 * check that it finishes with UCALL_DONE.
 */
static void __test_pre_fault_memory(unsigned long vm_type, bool private)
{
	const struct vm_shape shape = {
		.mode = VM_MODE_DEFAULT,
		.type = vm_type,
	};
	uint64_t page_size, align;
	uint64_t gpa, gva;
	struct kvm_vcpu *vcpu;
	struct kvm_vm *vm;
	struct ucall uc;
	uint64_t cmd;

	vm = vm_create_shape_with_one_vcpu(shape, &vcpu, guest_code);

	page_size = vm_guest_mode_params[VM_MODE_DEFAULT].page_size;
#ifdef __s390x__
	align = max(0x100000UL, page_size);
#else
	align = SZ_2M;
#endif

	/* Place the test region at the top of guest physical memory, aligned down. */
	gpa = align_down((vm->max_gfn - TEST_NPAGES) * page_size, align);
	gva = gpa & ((1ULL << (vm->va_bits - 1)) - 1);

	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, gpa, TEST_SLOT,
				    TEST_NPAGES,
				    private ? KVM_MEM_GUEST_MEMFD : 0);
	virt_map(vm, gva, gpa, TEST_NPAGES);

	if (private)
		vm_mem_set_private(vm, gpa, TEST_SIZE);

	/* The first 2M of the slot is expected to be processed completely. */
	pre_fault_memory(vcpu, gpa, SZ_2M, 0);
	/* Two pages starting at the slot's last page: one page is expected to remain. */
	pre_fault_memory(vcpu, gpa + SZ_2M, PAGE_SIZE * 2, PAGE_SIZE);
	/* Entirely past the slot: nothing is expected to be processed. */
	pre_fault_memory(vcpu, gpa + TEST_SIZE, PAGE_SIZE, PAGE_SIZE);

	vcpu_args_set(vcpu, 1, gva);
	vcpu_run(vcpu);

	TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_IO,
		    "Wanted KVM_EXIT_IO, got exit reason: %u (%s)",
		    vcpu->run->exit_reason,
		    exit_reason_str(vcpu->run->exit_reason));

	cmd = get_ucall(vcpu, &uc);
	if (cmd == UCALL_ABORT) {
		REPORT_GUEST_ASSERT(uc);
	} else if (cmd != UCALL_DONE) {
		TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd);
	}

	kvm_vm_free(vm);
}
/*
 * Run the pre-fault test for @vm_type, or log a message and skip it when a
 * non-zero @vm_type is not reported in the KVM_CAP_VM_TYPES bitmask.
 */
static void test_pre_fault_memory(unsigned long vm_type, bool private)
{
	/* VM type 0 is never skipped, so the capability is only queried otherwise. */
	bool supported = !vm_type ||
			 (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(vm_type));

	if (supported) {
		__test_pre_fault_memory(vm_type, private);
		return;
	}

	pr_info("Skipping tests for vm_type 0x%lx\n", vm_type);
}
/*
 * Test entry point: requires KVM_CAP_PRE_FAULT_MEMORY, then runs the test on
 * VM type 0 and, on x86-64, on KVM_X86_SW_PROTECTED_VM with private=false
 * followed by private=true.
 */
int main(int argc, char *argv[])
{
	TEST_REQUIRE(kvm_check_cap(KVM_CAP_PRE_FAULT_MEMORY));

	test_pre_fault_memory(0, false);

#ifdef __x86_64__
	for (int private = 0; private < 2; private++)
		test_pre_fault_memory(KVM_X86_SW_PROTECTED_VM, private);
#endif

	return 0;
}

View File

@ -67,6 +67,9 @@ config HAVE_KVM_INVALID_WAKEUPS
config KVM_GENERIC_DIRTYLOG_READ_PROTECT
bool
config KVM_GENERIC_PRE_FAULT_MEMORY
bool
config KVM_COMPAT
def_bool y
depends on KVM && COMPAT && !(S390 || ARM64 || RISCV)

View File

@ -4373,6 +4373,52 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
return fd;
}
#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
/*
 * Generic part of KVM_PRE_FAULT_MEMORY: validate @range, then repeatedly ask
 * the architecture code to pre-fault the start of the range, advancing
 * range->gpa and shrinking range->size by what was processed each time.
 *
 * The loop stops when the range is exhausted, a signal is pending, or the
 * architecture code returns an error (or, unexpectedly, zero).  On return
 * @range describes whatever is left.
 *
 * Returns -EINVAL for non-zero flags, a gpa or size that is not page
 * aligned, a zero size or a range that wraps; 0 if any part of the range
 * was processed; otherwise the last result (-EINTR or the value from the
 * architecture code).
 */
static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
				     struct kvm_pre_fault_memory *range)
{
	u64 initial_size;
	int srcu_idx;
	long ret;

	if (range->flags)
		return -EINVAL;

	if (!PAGE_ALIGNED(range->gpa) || !PAGE_ALIGNED(range->size))
		return -EINVAL;

	/* Catches both a zero size and gpa + size overflowing. */
	if (range->gpa + range->size <= range->gpa)
		return -EINVAL;

	vcpu_load(vcpu);
	srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);

	initial_size = range->size;
	for (;;) {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		ret = kvm_arch_vcpu_pre_fault_memory(vcpu, range);
		/* Zero and -EIO are unexpected: warn once and stop, as for any error. */
		if (WARN_ON_ONCE(ret == 0 || ret == -EIO) || ret < 0)
			break;

		range->size -= ret;
		range->gpa += ret;
		cond_resched();

		if (!range->size)
			break;
	}

	srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
	vcpu_put(vcpu);

	/* Return success if at least one page was mapped successfully. */
	if (initial_size != range->size)
		return 0;

	return ret;
}
#endif
static long kvm_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@ -4573,6 +4619,20 @@ static long kvm_vcpu_ioctl(struct file *filp,
r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
break;
}
#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
case KVM_PRE_FAULT_MEMORY: {
struct kvm_pre_fault_memory range;
r = -EFAULT;
if (copy_from_user(&range, argp, sizeof(range)))
break;
r = kvm_vcpu_pre_fault_memory(vcpu, &range);
/* Pass back leftover range. */
if (copy_to_user(argp, &range, sizeof(range)))
r = -EFAULT;
break;
}
#endif
default:
r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
}