mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2025-01-20 04:24:13 +00:00
Merge branch kvm-arm64/eager-page-splitting into kvmarm/next
* kvm-arm64/eager-page-splitting: : Eager Page Splitting, courtesy of Ricardo Koller. : : Dirty logging performance is dominated by the cost of splitting : hugepages to PTE granularity. On systems that mere mortals can get their : hands on, each fault incurs the cost of a full break-before-make : pattern, wherein the broadcast invalidation and ensuing serialization : significantly increases fault latency. : : The goal of eager page splitting is to move the cost of hugepage : splitting out of the stage-2 fault path and instead into the ioctls : responsible for managing the dirty log: : : - If manual protection is enabled for the VM, hugepage splitting : happens in the KVM_CLEAR_DIRTY_LOG ioctl. This is desirable as it : provides userspace granular control over hugepage splitting. : : - Otherwise, if userspace relies on the legacy dirty log behavior : (clear on collection), hugepage splitting is done at the moment dirty : logging is enabled for a particular memslot. : : Support for eager page splitting requires explicit opt-in from : userspace, which is realized through the : KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE capability. arm64: kvm: avoid overflow in integer division KVM: arm64: Use local TLBI on permission relaxation KVM: arm64: Split huge pages during KVM_CLEAR_DIRTY_LOG KVM: arm64: Open-code kvm_mmu_write_protect_pt_masked() KVM: arm64: Split huge pages when dirty logging is enabled KVM: arm64: Add kvm_uninit_stage2_mmu() KVM: arm64: Refactor kvm_arch_commit_memory_region() KVM: arm64: Add kvm_pgtable_stage2_split() KVM: arm64: Add KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE KVM: arm64: Export kvm_are_all_memslots_empty() KVM: arm64: Add helper for creating unlinked stage2 subtrees KVM: arm64: Add KVM_PGTABLE_WALK flags for skipping CMOs and BBM TLBIs KVM: arm64: Rename free_removed to free_unlinked Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
This commit is contained in:
commit
83510396c0
@ -8445,6 +8445,33 @@ structure.
|
|||||||
When getting the Modified Change Topology Report value, the attr->addr
|
When getting the Modified Change Topology Report value, the attr->addr
|
||||||
must point to a byte where the value will be stored or retrieved from.
|
must point to a byte where the value will be stored or retrieved from.
|
||||||
|
|
||||||
|
8.40 KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE
|
||||||
|
---------------------------------------
|
||||||
|
|
||||||
|
:Capability: KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE
|
||||||
|
:Architectures: arm64
|
||||||
|
:Type: vm
|
||||||
|
:Parameters: arg[0] is the new split chunk size.
|
||||||
|
:Returns: 0 on success, -EINVAL if any memslot was already created or if a non-zero chunk size is not a supported block size.
|
||||||
|
|
||||||
|
This capability sets the chunk size used in Eager Page Splitting.
|
||||||
|
|
||||||
|
Eager Page Splitting improves the performance of dirty-logging (used
|
||||||
|
in live migrations) when guest memory is backed by huge-pages. It
|
||||||
|
avoids splitting huge-pages (into PAGE_SIZE pages) on fault, by doing
|
||||||
|
it eagerly when enabling dirty logging (with the
|
||||||
|
KVM_MEM_LOG_DIRTY_PAGES flag for a memory region), or when using
|
||||||
|
KVM_CLEAR_DIRTY_LOG.
|
||||||
|
|
||||||
|
The chunk size specifies how many pages to break at a time, using a
|
||||||
|
single allocation for each chunk. The bigger the chunk size, the more pages
|
||||||
|
need to be allocated ahead of time.
|
||||||
|
|
||||||
|
The chunk size needs to be a valid block size. The list of acceptable
|
||||||
|
block sizes is exposed in KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES as a
|
||||||
|
64-bit bitmap (each bit describing a block size). The default value is
|
||||||
|
0, to disable the eager page splitting.
|
||||||
|
|
||||||
9. Known KVM API problems
|
9. Known KVM API problems
|
||||||
=========================
|
=========================
|
||||||
|
|
||||||
|
@ -68,6 +68,7 @@ enum __kvm_host_smccc_func {
|
|||||||
__KVM_HOST_SMCCC_FUNC___kvm_vcpu_run,
|
__KVM_HOST_SMCCC_FUNC___kvm_vcpu_run,
|
||||||
__KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context,
|
__KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context,
|
||||||
__KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa,
|
__KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa,
|
||||||
|
__KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa_nsh,
|
||||||
__KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid,
|
__KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid,
|
||||||
__KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context,
|
__KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context,
|
||||||
__KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff,
|
__KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff,
|
||||||
@ -225,6 +226,9 @@ extern void __kvm_flush_vm_context(void);
|
|||||||
extern void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu);
|
extern void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu);
|
||||||
extern void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
|
extern void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
|
||||||
int level);
|
int level);
|
||||||
|
extern void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
|
||||||
|
phys_addr_t ipa,
|
||||||
|
int level);
|
||||||
extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu);
|
extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu);
|
||||||
|
|
||||||
extern void __kvm_timer_set_cntvoff(u64 cntvoff);
|
extern void __kvm_timer_set_cntvoff(u64 cntvoff);
|
||||||
|
@ -159,6 +159,21 @@ struct kvm_s2_mmu {
|
|||||||
/* The last vcpu id that ran on each physical CPU */
|
/* The last vcpu id that ran on each physical CPU */
|
||||||
int __percpu *last_vcpu_ran;
|
int __percpu *last_vcpu_ran;
|
||||||
|
|
||||||
|
#define KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT 0
|
||||||
|
/*
|
||||||
|
* Memory cache used to split
|
||||||
|
* KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE worth of huge pages. It
|
||||||
|
* is used to allocate stage2 page tables while splitting huge
|
||||||
|
* pages. The choice of KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE
|
||||||
|
* influences both the capacity of the split page cache, and
|
||||||
|
* how often KVM reschedules. Be wary of raising CHUNK_SIZE
|
||||||
|
* too high.
|
||||||
|
*
|
||||||
|
* Protected by kvm->slots_lock.
|
||||||
|
*/
|
||||||
|
struct kvm_mmu_memory_cache split_page_cache;
|
||||||
|
uint64_t split_page_chunk_size;
|
||||||
|
|
||||||
struct kvm_arch *arch;
|
struct kvm_arch *arch;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -172,6 +172,7 @@ void __init free_hyp_pgds(void);
|
|||||||
|
|
||||||
void stage2_unmap_vm(struct kvm *kvm);
|
void stage2_unmap_vm(struct kvm *kvm);
|
||||||
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type);
|
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type);
|
||||||
|
void kvm_uninit_stage2_mmu(struct kvm *kvm);
|
||||||
void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu);
|
void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu);
|
||||||
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
|
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
|
||||||
phys_addr_t pa, unsigned long size, bool writable);
|
phys_addr_t pa, unsigned long size, bool writable);
|
||||||
|
@ -92,6 +92,24 @@ static inline bool kvm_level_supports_block_mapping(u32 level)
|
|||||||
return level >= KVM_PGTABLE_MIN_BLOCK_LEVEL;
|
return level >= KVM_PGTABLE_MIN_BLOCK_LEVEL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline u32 kvm_supported_block_sizes(void)
|
||||||
|
{
|
||||||
|
u32 level = KVM_PGTABLE_MIN_BLOCK_LEVEL;
|
||||||
|
u32 r = 0;
|
||||||
|
|
||||||
|
for (; level < KVM_PGTABLE_MAX_LEVELS; level++)
|
||||||
|
r |= BIT(kvm_granule_shift(level));
|
||||||
|
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Tell whether @size is one of the sizes advertised by
 * kvm_supported_block_sizes().
 *
 * @size must first pass the IS_ALIGNED(size, size) power-of-two test; it is
 * then accepted only if it shares a bit with the supported-sizes bitmap.
 */
static inline bool kvm_is_block_size_supported(u64 size)
{
	if (!IS_ALIGNED(size, size))
		return false;

	return (size & kvm_supported_block_sizes()) != 0;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* struct kvm_pgtable_mm_ops - Memory management callbacks.
|
* struct kvm_pgtable_mm_ops - Memory management callbacks.
|
||||||
* @zalloc_page: Allocate a single zeroed memory page.
|
* @zalloc_page: Allocate a single zeroed memory page.
|
||||||
@ -104,7 +122,7 @@ static inline bool kvm_level_supports_block_mapping(u32 level)
|
|||||||
* allocation is physically contiguous.
|
* allocation is physically contiguous.
|
||||||
* @free_pages_exact: Free an exact number of memory pages previously
|
* @free_pages_exact: Free an exact number of memory pages previously
|
||||||
* allocated by zalloc_pages_exact.
|
* allocated by zalloc_pages_exact.
|
||||||
* @free_removed_table: Free a removed paging structure by unlinking and
|
* @free_unlinked_table: Free an unlinked paging structure by unlinking and
|
||||||
* dropping references.
|
* dropping references.
|
||||||
* @get_page: Increment the refcount on a page.
|
* @get_page: Increment the refcount on a page.
|
||||||
* @put_page: Decrement the refcount on a page. When the
|
* @put_page: Decrement the refcount on a page. When the
|
||||||
@ -124,7 +142,7 @@ struct kvm_pgtable_mm_ops {
|
|||||||
void* (*zalloc_page)(void *arg);
|
void* (*zalloc_page)(void *arg);
|
||||||
void* (*zalloc_pages_exact)(size_t size);
|
void* (*zalloc_pages_exact)(size_t size);
|
||||||
void (*free_pages_exact)(void *addr, size_t size);
|
void (*free_pages_exact)(void *addr, size_t size);
|
||||||
void (*free_removed_table)(void *addr, u32 level);
|
void (*free_unlinked_table)(void *addr, u32 level);
|
||||||
void (*get_page)(void *addr);
|
void (*get_page)(void *addr);
|
||||||
void (*put_page)(void *addr);
|
void (*put_page)(void *addr);
|
||||||
int (*page_count)(void *addr);
|
int (*page_count)(void *addr);
|
||||||
@ -195,6 +213,12 @@ typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end,
|
|||||||
* with other software walkers.
|
* with other software walkers.
|
||||||
* @KVM_PGTABLE_WALK_HANDLE_FAULT: Indicates the page-table walk was
|
* @KVM_PGTABLE_WALK_HANDLE_FAULT: Indicates the page-table walk was
|
||||||
* invoked from a fault handler.
|
* invoked from a fault handler.
|
||||||
|
* @KVM_PGTABLE_WALK_SKIP_BBM_TLBI: Visit and update table entries
|
||||||
|
* without Break-before-make's
|
||||||
|
* TLB invalidation.
|
||||||
|
* @KVM_PGTABLE_WALK_SKIP_CMO: Visit and update table entries
|
||||||
|
* without Cache maintenance
|
||||||
|
* operations required.
|
||||||
*/
|
*/
|
||||||
enum kvm_pgtable_walk_flags {
|
enum kvm_pgtable_walk_flags {
|
||||||
KVM_PGTABLE_WALK_LEAF = BIT(0),
|
KVM_PGTABLE_WALK_LEAF = BIT(0),
|
||||||
@ -202,6 +226,8 @@ enum kvm_pgtable_walk_flags {
|
|||||||
KVM_PGTABLE_WALK_TABLE_POST = BIT(2),
|
KVM_PGTABLE_WALK_TABLE_POST = BIT(2),
|
||||||
KVM_PGTABLE_WALK_SHARED = BIT(3),
|
KVM_PGTABLE_WALK_SHARED = BIT(3),
|
||||||
KVM_PGTABLE_WALK_HANDLE_FAULT = BIT(4),
|
KVM_PGTABLE_WALK_HANDLE_FAULT = BIT(4),
|
||||||
|
KVM_PGTABLE_WALK_SKIP_BBM_TLBI = BIT(5),
|
||||||
|
KVM_PGTABLE_WALK_SKIP_CMO = BIT(6),
|
||||||
};
|
};
|
||||||
|
|
||||||
struct kvm_pgtable_visit_ctx {
|
struct kvm_pgtable_visit_ctx {
|
||||||
@ -441,7 +467,7 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
|
|||||||
void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
|
void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* kvm_pgtable_stage2_free_removed() - Free a removed stage-2 paging structure.
|
* kvm_pgtable_stage2_free_unlinked() - Free an unlinked stage-2 paging structure.
|
||||||
* @mm_ops: Memory management callbacks.
|
* @mm_ops: Memory management callbacks.
|
||||||
* @pgtable: Unlinked stage-2 paging structure to be freed.
|
* @pgtable: Unlinked stage-2 paging structure to be freed.
|
||||||
* @level: Level of the stage-2 paging structure to be freed.
|
* @level: Level of the stage-2 paging structure to be freed.
|
||||||
@ -449,7 +475,33 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
|
|||||||
* The page-table is assumed to be unreachable by any hardware walkers prior to
|
* The page-table is assumed to be unreachable by any hardware walkers prior to
|
||||||
* freeing and therefore no TLB invalidation is performed.
|
* freeing and therefore no TLB invalidation is performed.
|
||||||
*/
|
*/
|
||||||
void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level);
|
void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* kvm_pgtable_stage2_create_unlinked() - Create an unlinked stage-2 paging structure.
|
||||||
|
* @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
|
||||||
|
* @phys: Physical address of the memory to map.
|
||||||
|
* @level: Starting level of the stage-2 paging structure to be created.
|
||||||
|
* @prot: Permissions and attributes for the mapping.
|
||||||
|
* @mc: Cache of pre-allocated and zeroed memory from which to allocate
|
||||||
|
* page-table pages.
|
||||||
|
* @force_pte: Force mappings to PAGE_SIZE granularity.
|
||||||
|
*
|
||||||
|
* Returns an unlinked page-table tree. This new page-table tree is
|
||||||
|
* not reachable (i.e., it is unlinked) from the root pgd and it's
|
||||||
|
* therefore unreachable by the hardware page-table walker. No TLB
|
||||||
|
* invalidation or CMOs are performed.
|
||||||
|
*
|
||||||
|
* If device attributes are not explicitly requested in @prot, then the
|
||||||
|
* mapping will be normal, cacheable.
|
||||||
|
*
|
||||||
|
* Return: The fully populated (unlinked) stage-2 paging structure, or
|
||||||
|
* an ERR_PTR(error) on failure.
|
||||||
|
*/
|
||||||
|
kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
|
||||||
|
u64 phys, u32 level,
|
||||||
|
enum kvm_pgtable_prot prot,
|
||||||
|
void *mc, bool force_pte);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table.
|
* kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table.
|
||||||
@ -620,6 +672,25 @@ bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
|
|||||||
*/
|
*/
|
||||||
int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
|
int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* kvm_pgtable_stage2_split() - Split a range of huge pages into leaf PTEs pointing
|
||||||
|
* to PAGE_SIZE guest pages.
|
||||||
|
* @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
|
||||||
|
* @addr: Intermediate physical address from which to split.
|
||||||
|
* @size: Size of the range.
|
||||||
|
* @mc: Cache of pre-allocated and zeroed memory from which to allocate
|
||||||
|
* page-table pages.
|
||||||
|
*
|
||||||
|
* The function tries to split any level 1 or 2 entry that overlaps
|
||||||
|
* with the input range (given by @addr and @size).
|
||||||
|
*
|
||||||
|
* Return: 0 on success, negative error code on failure. Note that
|
||||||
|
* kvm_pgtable_stage2_split() is best effort: it tries to break as many
|
||||||
|
* blocks in the input range as allowed by the capacity of @mc.
|
||||||
|
*/
|
||||||
|
int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
|
||||||
|
struct kvm_mmu_memory_cache *mc);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* kvm_pgtable_walk() - Walk a page-table.
|
* kvm_pgtable_walk() - Walk a page-table.
|
||||||
* @pgt: Page-table structure initialised by kvm_pgtable_*_init().
|
* @pgt: Page-table structure initialised by kvm_pgtable_*_init().
|
||||||
|
@ -65,6 +65,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
|
|||||||
struct kvm_enable_cap *cap)
|
struct kvm_enable_cap *cap)
|
||||||
{
|
{
|
||||||
int r;
|
int r;
|
||||||
|
u64 new_cap;
|
||||||
|
|
||||||
if (cap->flags)
|
if (cap->flags)
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
@ -89,6 +90,24 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
|
|||||||
r = 0;
|
r = 0;
|
||||||
set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags);
|
set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags);
|
||||||
break;
|
break;
|
||||||
|
case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE:
|
||||||
|
new_cap = cap->args[0];
|
||||||
|
|
||||||
|
mutex_lock(&kvm->slots_lock);
|
||||||
|
/*
|
||||||
|
* To keep things simple, allow changing the chunk
|
||||||
|
* size only when no memory slots have been created.
|
||||||
|
*/
|
||||||
|
if (!kvm_are_all_memslots_empty(kvm)) {
|
||||||
|
r = -EINVAL;
|
||||||
|
} else if (new_cap && !kvm_is_block_size_supported(new_cap)) {
|
||||||
|
r = -EINVAL;
|
||||||
|
} else {
|
||||||
|
r = 0;
|
||||||
|
kvm->arch.mmu.split_page_chunk_size = new_cap;
|
||||||
|
}
|
||||||
|
mutex_unlock(&kvm->slots_lock);
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
r = -EINVAL;
|
r = -EINVAL;
|
||||||
break;
|
break;
|
||||||
@ -302,6 +321,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
|
|||||||
case KVM_CAP_ARM_PTRAUTH_GENERIC:
|
case KVM_CAP_ARM_PTRAUTH_GENERIC:
|
||||||
r = system_has_full_ptr_auth();
|
r = system_has_full_ptr_auth();
|
||||||
break;
|
break;
|
||||||
|
case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE:
|
||||||
|
if (kvm)
|
||||||
|
r = kvm->arch.mmu.split_page_chunk_size;
|
||||||
|
else
|
||||||
|
r = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
|
||||||
|
break;
|
||||||
|
case KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES:
|
||||||
|
r = kvm_supported_block_sizes();
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
r = 0;
|
r = 0;
|
||||||
}
|
}
|
||||||
|
@ -125,6 +125,15 @@ static void handle___kvm_tlb_flush_vmid_ipa(struct kvm_cpu_context *host_ctxt)
|
|||||||
__kvm_tlb_flush_vmid_ipa(kern_hyp_va(mmu), ipa, level);
|
__kvm_tlb_flush_vmid_ipa(kern_hyp_va(mmu), ipa, level);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void handle___kvm_tlb_flush_vmid_ipa_nsh(struct kvm_cpu_context *host_ctxt)
|
||||||
|
{
|
||||||
|
DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1);
|
||||||
|
DECLARE_REG(phys_addr_t, ipa, host_ctxt, 2);
|
||||||
|
DECLARE_REG(int, level, host_ctxt, 3);
|
||||||
|
|
||||||
|
__kvm_tlb_flush_vmid_ipa_nsh(kern_hyp_va(mmu), ipa, level);
|
||||||
|
}
|
||||||
|
|
||||||
static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt)
|
static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt)
|
||||||
{
|
{
|
||||||
DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1);
|
DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1);
|
||||||
@ -315,6 +324,7 @@ static const hcall_t host_hcall[] = {
|
|||||||
HANDLE_FUNC(__kvm_vcpu_run),
|
HANDLE_FUNC(__kvm_vcpu_run),
|
||||||
HANDLE_FUNC(__kvm_flush_vm_context),
|
HANDLE_FUNC(__kvm_flush_vm_context),
|
||||||
HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa),
|
HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa),
|
||||||
|
HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa_nsh),
|
||||||
HANDLE_FUNC(__kvm_tlb_flush_vmid),
|
HANDLE_FUNC(__kvm_tlb_flush_vmid),
|
||||||
HANDLE_FUNC(__kvm_flush_cpu_context),
|
HANDLE_FUNC(__kvm_flush_cpu_context),
|
||||||
HANDLE_FUNC(__kvm_timer_set_cntvoff),
|
HANDLE_FUNC(__kvm_timer_set_cntvoff),
|
||||||
|
@ -91,9 +91,9 @@ static void host_s2_put_page(void *addr)
|
|||||||
hyp_put_page(&host_s2_pool, addr);
|
hyp_put_page(&host_s2_pool, addr);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void host_s2_free_removed_table(void *addr, u32 level)
|
static void host_s2_free_unlinked_table(void *addr, u32 level)
|
||||||
{
|
{
|
||||||
kvm_pgtable_stage2_free_removed(&host_mmu.mm_ops, addr, level);
|
kvm_pgtable_stage2_free_unlinked(&host_mmu.mm_ops, addr, level);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int prepare_s2_pool(void *pgt_pool_base)
|
static int prepare_s2_pool(void *pgt_pool_base)
|
||||||
@ -110,7 +110,7 @@ static int prepare_s2_pool(void *pgt_pool_base)
|
|||||||
host_mmu.mm_ops = (struct kvm_pgtable_mm_ops) {
|
host_mmu.mm_ops = (struct kvm_pgtable_mm_ops) {
|
||||||
.zalloc_pages_exact = host_s2_zalloc_pages_exact,
|
.zalloc_pages_exact = host_s2_zalloc_pages_exact,
|
||||||
.zalloc_page = host_s2_zalloc_page,
|
.zalloc_page = host_s2_zalloc_page,
|
||||||
.free_removed_table = host_s2_free_removed_table,
|
.free_unlinked_table = host_s2_free_unlinked_table,
|
||||||
.phys_to_virt = hyp_phys_to_virt,
|
.phys_to_virt = hyp_phys_to_virt,
|
||||||
.virt_to_phys = hyp_virt_to_phys,
|
.virt_to_phys = hyp_virt_to_phys,
|
||||||
.page_count = hyp_page_count,
|
.page_count = hyp_page_count,
|
||||||
|
@ -130,6 +130,58 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
|
|||||||
__tlb_switch_to_host(&cxt);
|
__tlb_switch_to_host(&cxt);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
|
||||||
|
phys_addr_t ipa, int level)
|
||||||
|
{
|
||||||
|
struct tlb_inv_context cxt;
|
||||||
|
|
||||||
|
/* Switch to requested VMID */
|
||||||
|
__tlb_switch_to_guest(mmu, &cxt, true);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We could do so much better if we had the VA as well.
|
||||||
|
* Instead, we invalidate Stage-2 for this IPA, and the
|
||||||
|
* whole of Stage-1. Weep...
|
||||||
|
*/
|
||||||
|
ipa >>= 12;
|
||||||
|
__tlbi_level(ipas2e1, ipa, level);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We have to ensure completion of the invalidation at Stage-2,
|
||||||
|
* since a table walk on another CPU could refill a TLB with a
|
||||||
|
* complete (S1 + S2) walk based on the old Stage-2 mapping if
|
||||||
|
* the Stage-1 invalidation happened first.
|
||||||
|
*/
|
||||||
|
dsb(nsh);
|
||||||
|
__tlbi(vmalle1);
|
||||||
|
dsb(nsh);
|
||||||
|
isb();
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the host is running at EL1 and we have a VPIPT I-cache,
|
||||||
|
* then we must perform I-cache maintenance at EL2 in order for
|
||||||
|
* it to have an effect on the guest. Since the guest cannot hit
|
||||||
|
* I-cache lines allocated with a different VMID, we don't need
|
||||||
|
* to worry about junk out of guest reset (we nuke the I-cache on
|
||||||
|
* VMID rollover), but we do need to be careful when remapping
|
||||||
|
* executable pages for the same guest. This can happen when KSM
|
||||||
|
* takes a CoW fault on an executable page, copies the page into
|
||||||
|
* a page that was previously mapped in the guest and then needs
|
||||||
|
* to invalidate the guest view of the I-cache for that page
|
||||||
|
* from EL1. To solve this, we invalidate the entire I-cache when
|
||||||
|
* unmapping a page from a guest if we have a VPIPT I-cache but
|
||||||
|
* the host is running at EL1. As above, we could do better if
|
||||||
|
* we had the VA.
|
||||||
|
*
|
||||||
|
* The moral of this story is: if you have a VPIPT I-cache, then
|
||||||
|
* you should be running with VHE enabled.
|
||||||
|
*/
|
||||||
|
if (icache_is_vpipt())
|
||||||
|
icache_inval_all_pou();
|
||||||
|
|
||||||
|
__tlb_switch_to_host(&cxt);
|
||||||
|
}
|
||||||
|
|
||||||
void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
|
void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
|
||||||
{
|
{
|
||||||
struct tlb_inv_context cxt;
|
struct tlb_inv_context cxt;
|
||||||
|
@ -63,6 +63,16 @@ struct kvm_pgtable_walk_data {
|
|||||||
const u64 end;
|
const u64 end;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static bool kvm_pgtable_walk_skip_bbm_tlbi(const struct kvm_pgtable_visit_ctx *ctx)
|
||||||
|
{
|
||||||
|
return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_BBM_TLBI);
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx)
|
||||||
|
{
|
||||||
|
return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_CMO);
|
||||||
|
}
|
||||||
|
|
||||||
static bool kvm_phys_is_valid(u64 phys)
|
static bool kvm_phys_is_valid(u64 phys)
|
||||||
{
|
{
|
||||||
return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_EL1_PARANGE_MAX));
|
return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_EL1_PARANGE_MAX));
|
||||||
@ -743,14 +753,17 @@ static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx,
|
|||||||
if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED))
|
if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
/*
|
if (!kvm_pgtable_walk_skip_bbm_tlbi(ctx)) {
|
||||||
* Perform the appropriate TLB invalidation based on the evicted pte
|
/*
|
||||||
* value (if any).
|
* Perform the appropriate TLB invalidation based on the
|
||||||
*/
|
* evicted pte value (if any).
|
||||||
if (kvm_pte_table(ctx->old, ctx->level))
|
*/
|
||||||
kvm_call_hyp(__kvm_tlb_flush_vmid, mmu);
|
if (kvm_pte_table(ctx->old, ctx->level))
|
||||||
else if (kvm_pte_valid(ctx->old))
|
kvm_call_hyp(__kvm_tlb_flush_vmid, mmu);
|
||||||
kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level);
|
else if (kvm_pte_valid(ctx->old))
|
||||||
|
kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu,
|
||||||
|
ctx->addr, ctx->level);
|
||||||
|
}
|
||||||
|
|
||||||
if (stage2_pte_is_counted(ctx->old))
|
if (stage2_pte_is_counted(ctx->old))
|
||||||
mm_ops->put_page(ctx->ptep);
|
mm_ops->put_page(ctx->ptep);
|
||||||
@ -857,11 +870,13 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
|
|||||||
return -EAGAIN;
|
return -EAGAIN;
|
||||||
|
|
||||||
/* Perform CMOs before installation of the guest stage-2 PTE */
|
/* Perform CMOs before installation of the guest stage-2 PTE */
|
||||||
if (mm_ops->dcache_clean_inval_poc && stage2_pte_cacheable(pgt, new))
|
if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->dcache_clean_inval_poc &&
|
||||||
|
stage2_pte_cacheable(pgt, new))
|
||||||
mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops),
|
mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops),
|
||||||
granule);
|
granule);
|
||||||
|
|
||||||
if (mm_ops->icache_inval_pou && stage2_pte_executable(new))
|
if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->icache_inval_pou &&
|
||||||
|
stage2_pte_executable(new))
|
||||||
mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);
|
mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);
|
||||||
|
|
||||||
stage2_make_pte(ctx, new);
|
stage2_make_pte(ctx, new);
|
||||||
@ -883,7 +898,7 @@ static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx,
|
|||||||
if (ret)
|
if (ret)
|
||||||
return ret;
|
return ret;
|
||||||
|
|
||||||
mm_ops->free_removed_table(childp, ctx->level);
|
mm_ops->free_unlinked_table(childp, ctx->level);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -928,7 +943,7 @@ static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx,
|
|||||||
* The TABLE_PRE callback runs for table entries on the way down, looking
|
* The TABLE_PRE callback runs for table entries on the way down, looking
|
||||||
* for table entries which we could conceivably replace with a block entry
|
* for table entries which we could conceivably replace with a block entry
|
||||||
* for this mapping. If it finds one it replaces the entry and calls
|
* for this mapping. If it finds one it replaces the entry and calls
|
||||||
* kvm_pgtable_mm_ops::free_removed_table() to tear down the detached table.
|
* kvm_pgtable_mm_ops::free_unlinked_table() to tear down the detached table.
|
||||||
*
|
*
|
||||||
* Otherwise, the LEAF callback performs the mapping at the existing leaves
|
* Otherwise, the LEAF callback performs the mapping at the existing leaves
|
||||||
* instead.
|
* instead.
|
||||||
@ -1197,7 +1212,7 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
|
|||||||
KVM_PGTABLE_WALK_HANDLE_FAULT |
|
KVM_PGTABLE_WALK_HANDLE_FAULT |
|
||||||
KVM_PGTABLE_WALK_SHARED);
|
KVM_PGTABLE_WALK_SHARED);
|
||||||
if (!ret)
|
if (!ret)
|
||||||
kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level);
|
kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1230,6 +1245,162 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
|
|||||||
return kvm_pgtable_walk(pgt, addr, size, &walker);
|
return kvm_pgtable_walk(pgt, addr, size, &walker);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
|
||||||
|
u64 phys, u32 level,
|
||||||
|
enum kvm_pgtable_prot prot,
|
||||||
|
void *mc, bool force_pte)
|
||||||
|
{
|
||||||
|
struct stage2_map_data map_data = {
|
||||||
|
.phys = phys,
|
||||||
|
.mmu = pgt->mmu,
|
||||||
|
.memcache = mc,
|
||||||
|
.force_pte = force_pte,
|
||||||
|
};
|
||||||
|
struct kvm_pgtable_walker walker = {
|
||||||
|
.cb = stage2_map_walker,
|
||||||
|
.flags = KVM_PGTABLE_WALK_LEAF |
|
||||||
|
KVM_PGTABLE_WALK_SKIP_BBM_TLBI |
|
||||||
|
KVM_PGTABLE_WALK_SKIP_CMO,
|
||||||
|
.arg = &map_data,
|
||||||
|
};
|
||||||
|
/*
|
||||||
|
* The input address (.addr) is irrelevant for walking an
|
||||||
|
* unlinked table. Construct an ambiguous IA range to map
|
||||||
|
* kvm_granule_size(level) worth of memory.
|
||||||
|
*/
|
||||||
|
struct kvm_pgtable_walk_data data = {
|
||||||
|
.walker = &walker,
|
||||||
|
.addr = 0,
|
||||||
|
.end = kvm_granule_size(level),
|
||||||
|
};
|
||||||
|
struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
|
||||||
|
kvm_pte_t *pgtable;
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
if (!IS_ALIGNED(phys, kvm_granule_size(level)))
|
||||||
|
return ERR_PTR(-EINVAL);
|
||||||
|
|
||||||
|
ret = stage2_set_prot_attr(pgt, prot, &map_data.attr);
|
||||||
|
if (ret)
|
||||||
|
return ERR_PTR(ret);
|
||||||
|
|
||||||
|
pgtable = mm_ops->zalloc_page(mc);
|
||||||
|
if (!pgtable)
|
||||||
|
return ERR_PTR(-ENOMEM);
|
||||||
|
|
||||||
|
ret = __kvm_pgtable_walk(&data, mm_ops, (kvm_pteref_t)pgtable,
|
||||||
|
level + 1);
|
||||||
|
if (ret) {
|
||||||
|
kvm_pgtable_stage2_free_unlinked(mm_ops, pgtable, level);
|
||||||
|
mm_ops->put_page(pgtable);
|
||||||
|
return ERR_PTR(ret);
|
||||||
|
}
|
||||||
|
|
||||||
|
return pgtable;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Get the number of page-tables needed to replace a block with a
|
||||||
|
* fully populated tree up to the PTE entries. Note that @level is
|
||||||
|
* interpreted as in "level @level entry".
|
||||||
|
*/
|
||||||
|
static int stage2_block_get_nr_page_tables(u32 level)
|
||||||
|
{
|
||||||
|
switch (level) {
|
||||||
|
case 1:
|
||||||
|
return PTRS_PER_PTE + 1;
|
||||||
|
case 2:
|
||||||
|
return 1;
|
||||||
|
case 3:
|
||||||
|
return 0;
|
||||||
|
default:
|
||||||
|
WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL ||
|
||||||
|
level >= KVM_PGTABLE_MAX_LEVELS);
|
||||||
|
return -EINVAL;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
|
||||||
|
enum kvm_pgtable_walk_flags visit)
|
||||||
|
{
|
||||||
|
struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
|
||||||
|
struct kvm_mmu_memory_cache *mc = ctx->arg;
|
||||||
|
struct kvm_s2_mmu *mmu;
|
||||||
|
kvm_pte_t pte = ctx->old, new, *childp;
|
||||||
|
enum kvm_pgtable_prot prot;
|
||||||
|
u32 level = ctx->level;
|
||||||
|
bool force_pte;
|
||||||
|
int nr_pages;
|
||||||
|
u64 phys;
|
||||||
|
|
||||||
|
/* No huge-pages exist at the last level */
|
||||||
|
if (level == KVM_PGTABLE_MAX_LEVELS - 1)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
/* We only split valid block mappings */
|
||||||
|
if (!kvm_pte_valid(pte))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
nr_pages = stage2_block_get_nr_page_tables(level);
|
||||||
|
if (nr_pages < 0)
|
||||||
|
return nr_pages;
|
||||||
|
|
||||||
|
if (mc->nobjs >= nr_pages) {
|
||||||
|
/* Build a tree mapped down to the PTE granularity. */
|
||||||
|
force_pte = true;
|
||||||
|
} else {
|
||||||
|
/*
|
||||||
|
* Don't force PTEs, so create_unlinked() below does
|
||||||
|
* not populate the tree up to the PTE level. The
|
||||||
|
* consequence is that the call will require a single
|
||||||
|
* page of level 2 entries at level 1, or a single
|
||||||
|
* page of PTEs at level 2. If we are at level 1, the
|
||||||
|
* PTEs will be created recursively.
|
||||||
|
*/
|
||||||
|
force_pte = false;
|
||||||
|
nr_pages = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (mc->nobjs < nr_pages)
|
||||||
|
return -ENOMEM;
|
||||||
|
|
||||||
|
mmu = container_of(mc, struct kvm_s2_mmu, split_page_cache);
|
||||||
|
phys = kvm_pte_to_phys(pte);
|
||||||
|
prot = kvm_pgtable_stage2_pte_prot(pte);
|
||||||
|
|
||||||
|
childp = kvm_pgtable_stage2_create_unlinked(mmu->pgt, phys,
|
||||||
|
level, prot, mc, force_pte);
|
||||||
|
if (IS_ERR(childp))
|
||||||
|
return PTR_ERR(childp);
|
||||||
|
|
||||||
|
if (!stage2_try_break_pte(ctx, mmu)) {
|
||||||
|
kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level);
|
||||||
|
mm_ops->put_page(childp);
|
||||||
|
return -EAGAIN;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Note, the contents of the page table are guaranteed to be made
|
||||||
|
* visible before the new PTE is assigned because stage2_make_pte()
|
||||||
|
* writes the PTE using smp_store_release().
|
||||||
|
*/
|
||||||
|
new = kvm_init_table_pte(childp, mm_ops);
|
||||||
|
stage2_make_pte(ctx, new);
|
||||||
|
dsb(ishst);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Walk the leaf entries covering [addr, addr + size) in @pgt and run
 * stage2_split_walker() on each of them, handing it @mc as the source of
 * new page-table pages. Returns whatever kvm_pgtable_walk() returns.
 */
int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
			     struct kvm_mmu_memory_cache *mc)
{
	struct kvm_pgtable_walker split_walker = {
		.arg	= mc,
		.flags	= KVM_PGTABLE_WALK_LEAF,
		.cb	= stage2_split_walker,
	};
	int ret;

	ret = kvm_pgtable_walk(pgt, addr, size, &split_walker);

	return ret;
}
|
||||||
|
|
||||||
int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
|
int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
|
||||||
struct kvm_pgtable_mm_ops *mm_ops,
|
struct kvm_pgtable_mm_ops *mm_ops,
|
||||||
@ -1299,7 +1470,7 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
|
|||||||
pgt->pgd = NULL;
|
pgt->pgd = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level)
|
void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level)
|
||||||
{
|
{
|
||||||
kvm_pteref_t ptep = (kvm_pteref_t)pgtable;
|
kvm_pteref_t ptep = (kvm_pteref_t)pgtable;
|
||||||
struct kvm_pgtable_walker walker = {
|
struct kvm_pgtable_walker walker = {
|
||||||
|
@ -111,6 +111,38 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
|
|||||||
__tlb_switch_to_host(&cxt);
|
__tlb_switch_to_host(&cxt);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
|
||||||
|
phys_addr_t ipa, int level)
|
||||||
|
{
|
||||||
|
struct tlb_inv_context cxt;
|
||||||
|
|
||||||
|
dsb(nshst);
|
||||||
|
|
||||||
|
/* Switch to requested VMID */
|
||||||
|
__tlb_switch_to_guest(mmu, &cxt);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We could do so much better if we had the VA as well.
|
||||||
|
* Instead, we invalidate Stage-2 for this IPA, and the
|
||||||
|
* whole of Stage-1. Weep...
|
||||||
|
*/
|
||||||
|
ipa >>= 12;
|
||||||
|
__tlbi_level(ipas2e1, ipa, level);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We have to ensure completion of the invalidation at Stage-2,
|
||||||
|
* since a table walk on another CPU could refill a TLB with a
|
||||||
|
* complete (S1 + S2) walk based on the old Stage-2 mapping if
|
||||||
|
* the Stage-1 invalidation happened first.
|
||||||
|
*/
|
||||||
|
dsb(nsh);
|
||||||
|
__tlbi(vmalle1);
|
||||||
|
dsb(nsh);
|
||||||
|
isb();
|
||||||
|
|
||||||
|
__tlb_switch_to_host(&cxt);
|
||||||
|
}
|
||||||
|
|
||||||
void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
|
void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
|
||||||
{
|
{
|
||||||
struct tlb_inv_context cxt;
|
struct tlb_inv_context cxt;
|
||||||
|
@ -31,14 +31,21 @@ static phys_addr_t __ro_after_init hyp_idmap_vector;
|
|||||||
|
|
||||||
static unsigned long __ro_after_init io_map_base;
|
static unsigned long __ro_after_init io_map_base;
|
||||||
|
|
||||||
static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
|
static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end,
|
||||||
|
phys_addr_t size)
|
||||||
{
|
{
|
||||||
phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);
|
|
||||||
phys_addr_t boundary = ALIGN_DOWN(addr + size, size);
|
phys_addr_t boundary = ALIGN_DOWN(addr + size, size);
|
||||||
|
|
||||||
return (boundary - 1 < end - 1) ? boundary : end;
|
return (boundary - 1 < end - 1) ? boundary : end;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Same as __stage2_range_addr_end(), with the step fixed to the granule
 * size of the smallest level that can hold a block mapping.
 */
static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t block_size;

	block_size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);
	return __stage2_range_addr_end(addr, end, block_size);
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
|
* Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
|
||||||
* we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
|
* we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
|
||||||
@ -75,6 +82,79 @@ static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr,
|
|||||||
#define stage2_apply_range_resched(mmu, addr, end, fn) \
|
#define stage2_apply_range_resched(mmu, addr, end, fn) \
|
||||||
stage2_apply_range(mmu, addr, end, fn, true)
|
stage2_apply_range(mmu, addr, end, fn, true)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Get the maximum number of page-table pages needed to split a range
|
||||||
|
* of blocks into PAGE_SIZE PTEs. It assumes the range is already
|
||||||
|
* mapped at level 2, or at level 1 if allowed.
|
||||||
|
*/
|
||||||
|
static int kvm_mmu_split_nr_page_tables(u64 range)
{
	/* One table of PTEs for every PMD-sized piece of the range. */
	int nr_tables = DIV_ROUND_UP(range, PMD_SIZE);

	/*
	 * When blocks may also exist below level 2, add one table for
	 * every PUD-sized piece of the range.
	 */
	if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2)
		nr_tables += DIV_ROUND_UP(range, PUD_SIZE);

	return nr_tables;
}
|
||||||
|
|
||||||
|
static bool need_split_memcache_topup_or_resched(struct kvm *kvm)
|
||||||
|
{
|
||||||
|
struct kvm_mmu_memory_cache *cache;
|
||||||
|
u64 chunk_size, min;
|
||||||
|
|
||||||
|
if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
|
||||||
|
return true;
|
||||||
|
|
||||||
|
chunk_size = kvm->arch.mmu.split_page_chunk_size;
|
||||||
|
min = kvm_mmu_split_nr_page_tables(chunk_size);
|
||||||
|
cache = &kvm->arch.mmu.split_page_cache;
|
||||||
|
return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Split the stage-2 block mappings covering [addr, end), one chunk of
 * split_page_chunk_size bytes at a time. Must be called with
 * kvm->mmu_lock held for write; the lock is dropped and re-taken
 * whenever the split cache needs a top-up or a reschedule is due.
 *
 * Returns 0 when the whole range was processed or when the chunk size is
 * zero, -EINVAL if the stage-2 page table is gone, or the error from the
 * cache top-up or from the split itself.
 */
static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr,
				    phys_addr_t end)
{
	struct kvm_mmu_memory_cache *cache;
	struct kvm_pgtable *pgt;
	u64 chunk_size, next;
	int nr_tables, ret;

	lockdep_assert_held_write(&kvm->mmu_lock);

	chunk_size = kvm->arch.mmu.split_page_chunk_size;
	if (chunk_size == 0)
		return 0;

	nr_tables = kvm_mmu_split_nr_page_tables(chunk_size);
	cache = &kvm->arch.mmu.split_page_cache;

	for (;;) {
		if (need_split_memcache_topup_or_resched(kvm)) {
			write_unlock(&kvm->mmu_lock);
			cond_resched();
			/* Eager page splitting is best-effort. */
			ret = __kvm_mmu_topup_memory_cache(cache, nr_tables,
							   nr_tables);
			write_lock(&kvm->mmu_lock);
			if (ret)
				return ret;
		}

		/* Re-read the page table on every pass; the lock may have been dropped. */
		pgt = kvm->arch.mmu.pgt;
		if (!pgt)
			return -EINVAL;

		next = __stage2_range_addr_end(addr, end, chunk_size);
		ret = kvm_pgtable_stage2_split(pgt, addr, next - addr, cache);
		if (ret)
			return ret;

		addr = next;
		if (addr == end)
			return 0;
	}
}
|
||||||
|
|
||||||
static bool memslot_is_logging(struct kvm_memory_slot *memslot)
|
static bool memslot_is_logging(struct kvm_memory_slot *memslot)
|
||||||
{
|
{
|
||||||
return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
|
return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
|
||||||
@ -131,21 +211,21 @@ static void kvm_s2_free_pages_exact(void *virt, size_t size)
|
|||||||
|
|
||||||
static struct kvm_pgtable_mm_ops kvm_s2_mm_ops;
|
static struct kvm_pgtable_mm_ops kvm_s2_mm_ops;
|
||||||
|
|
||||||
static void stage2_free_removed_table_rcu_cb(struct rcu_head *head)
|
static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head)
|
||||||
{
|
{
|
||||||
struct page *page = container_of(head, struct page, rcu_head);
|
struct page *page = container_of(head, struct page, rcu_head);
|
||||||
void *pgtable = page_to_virt(page);
|
void *pgtable = page_to_virt(page);
|
||||||
u32 level = page_private(page);
|
u32 level = page_private(page);
|
||||||
|
|
||||||
kvm_pgtable_stage2_free_removed(&kvm_s2_mm_ops, pgtable, level);
|
kvm_pgtable_stage2_free_unlinked(&kvm_s2_mm_ops, pgtable, level);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void stage2_free_removed_table(void *addr, u32 level)
|
static void stage2_free_unlinked_table(void *addr, u32 level)
|
||||||
{
|
{
|
||||||
struct page *page = virt_to_page(addr);
|
struct page *page = virt_to_page(addr);
|
||||||
|
|
||||||
set_page_private(page, (unsigned long)level);
|
set_page_private(page, (unsigned long)level);
|
||||||
call_rcu(&page->rcu_head, stage2_free_removed_table_rcu_cb);
|
call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void kvm_host_get_page(void *addr)
|
static void kvm_host_get_page(void *addr)
|
||||||
@ -701,7 +781,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
|
|||||||
.zalloc_page = stage2_memcache_zalloc_page,
|
.zalloc_page = stage2_memcache_zalloc_page,
|
||||||
.zalloc_pages_exact = kvm_s2_zalloc_pages_exact,
|
.zalloc_pages_exact = kvm_s2_zalloc_pages_exact,
|
||||||
.free_pages_exact = kvm_s2_free_pages_exact,
|
.free_pages_exact = kvm_s2_free_pages_exact,
|
||||||
.free_removed_table = stage2_free_removed_table,
|
.free_unlinked_table = stage2_free_unlinked_table,
|
||||||
.get_page = kvm_host_get_page,
|
.get_page = kvm_host_get_page,
|
||||||
.put_page = kvm_s2_put_page,
|
.put_page = kvm_s2_put_page,
|
||||||
.page_count = kvm_host_page_count,
|
.page_count = kvm_host_page_count,
|
||||||
@ -775,6 +855,10 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
|
|||||||
for_each_possible_cpu(cpu)
|
for_each_possible_cpu(cpu)
|
||||||
*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
|
*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
|
||||||
|
|
||||||
|
/* The eager page splitting is disabled by default */
|
||||||
|
mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
|
||||||
|
mmu->split_page_cache.gfp_zero = __GFP_ZERO;
|
||||||
|
|
||||||
mmu->pgt = pgt;
|
mmu->pgt = pgt;
|
||||||
mmu->pgd_phys = __pa(pgt->pgd);
|
mmu->pgd_phys = __pa(pgt->pgd);
|
||||||
return 0;
|
return 0;
|
||||||
@ -786,6 +870,12 @@ out_free_pgtable:
|
|||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void kvm_uninit_stage2_mmu(struct kvm *kvm)
|
||||||
|
{
|
||||||
|
kvm_free_stage2_pgd(&kvm->arch.mmu);
|
||||||
|
kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
|
||||||
|
}
|
||||||
|
|
||||||
static void stage2_unmap_memslot(struct kvm *kvm,
|
static void stage2_unmap_memslot(struct kvm *kvm,
|
||||||
struct kvm_memory_slot *memslot)
|
struct kvm_memory_slot *memslot)
|
||||||
{
|
{
|
||||||
@ -989,17 +1079,45 @@ static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* kvm_mmu_write_protect_pt_masked() - write protect dirty pages
|
* kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE
|
||||||
|
* pages for memory slot
|
||||||
|
* @kvm: The KVM pointer
|
||||||
|
* @slot: The memory slot to split
|
||||||
|
*
|
||||||
|
* Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired,
|
||||||
|
* serializing operations for VM memory regions.
|
||||||
|
*/
|
||||||
|
static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot)
|
||||||
|
{
|
||||||
|
struct kvm_memslots *slots;
|
||||||
|
struct kvm_memory_slot *memslot;
|
||||||
|
phys_addr_t start, end;
|
||||||
|
|
||||||
|
lockdep_assert_held(&kvm->slots_lock);
|
||||||
|
|
||||||
|
slots = kvm_memslots(kvm);
|
||||||
|
memslot = id_to_memslot(slots, slot);
|
||||||
|
|
||||||
|
start = memslot->base_gfn << PAGE_SHIFT;
|
||||||
|
end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
|
||||||
|
|
||||||
|
write_lock(&kvm->mmu_lock);
|
||||||
|
kvm_mmu_split_huge_pages(kvm, start, end);
|
||||||
|
write_unlock(&kvm->mmu_lock);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages.
|
||||||
* @kvm: The KVM pointer
|
* @kvm: The KVM pointer
|
||||||
* @slot: The memory slot associated with mask
|
* @slot: The memory slot associated with mask
|
||||||
* @gfn_offset: The gfn offset in memory slot
|
* @gfn_offset: The gfn offset in memory slot
|
||||||
* @mask: The mask of dirty pages at offset 'gfn_offset' in this memory
|
* @mask: The mask of pages at offset 'gfn_offset' in this memory
|
||||||
* slot to be write protected
|
* slot to enable dirty logging on
|
||||||
*
|
*
|
||||||
* Walks bits set in mask write protects the associated pte's. Caller must
|
* Write-protects selected pages to enable dirty logging, and then
|
||||||
* acquire kvm_mmu_lock.
|
* splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock.
|
||||||
*/
|
*/
|
||||||
static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
|
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
|
||||||
struct kvm_memory_slot *slot,
|
struct kvm_memory_slot *slot,
|
||||||
gfn_t gfn_offset, unsigned long mask)
|
gfn_t gfn_offset, unsigned long mask)
|
||||||
{
|
{
|
||||||
@ -1007,21 +1125,20 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
|
|||||||
phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
|
phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
|
||||||
phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
|
phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
|
||||||
|
|
||||||
stage2_wp_range(&kvm->arch.mmu, start, end);
|
lockdep_assert_held_write(&kvm->mmu_lock);
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
stage2_wp_range(&kvm->arch.mmu, start, end);
|
||||||
* kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
|
|
||||||
* dirty pages.
|
/*
|
||||||
*
|
* Eager-splitting is done when manual-protect is set. We
|
||||||
* It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
|
* also check for initially-all-set because we can avoid
|
||||||
* enable dirty logging for them.
|
* eager-splitting if initially-all-set is false.
|
||||||
*/
|
* Initially-all-set equal false implies that huge-pages were
|
||||||
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
|
* already split when enabling dirty logging: no need to do it
|
||||||
struct kvm_memory_slot *slot,
|
* again.
|
||||||
gfn_t gfn_offset, unsigned long mask)
|
*/
|
||||||
{
|
if (kvm_dirty_log_manual_protect_and_init_set(kvm))
|
||||||
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
|
kvm_mmu_split_huge_pages(kvm, start, end);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
|
static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
|
||||||
@ -1790,20 +1907,42 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
|
|||||||
const struct kvm_memory_slot *new,
|
const struct kvm_memory_slot *new,
|
||||||
enum kvm_mr_change change)
|
enum kvm_mr_change change)
|
||||||
{
|
{
|
||||||
|
bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* At this point memslot has been committed and there is an
|
* At this point memslot has been committed and there is an
|
||||||
* allocated dirty_bitmap[], dirty pages will be tracked while the
|
* allocated dirty_bitmap[], dirty pages will be tracked while the
|
||||||
* memory slot is write protected.
|
* memory slot is write protected.
|
||||||
*/
|
*/
|
||||||
if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
|
if (log_dirty_pages) {
|
||||||
|
|
||||||
|
if (change == KVM_MR_DELETE)
|
||||||
|
return;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If we're with initial-all-set, we don't need to write
|
* Huge and normal pages are write-protected and split
|
||||||
* protect any pages because they're all reported as dirty.
|
* on either of these two cases:
|
||||||
* Huge pages and normal pages will be write protect gradually.
|
*
|
||||||
|
* 1. with initial-all-set: gradually with CLEAR ioctls,
|
||||||
*/
|
*/
|
||||||
if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) {
|
if (kvm_dirty_log_manual_protect_and_init_set(kvm))
|
||||||
kvm_mmu_wp_memory_region(kvm, new->id);
|
return;
|
||||||
}
|
/*
|
||||||
|
* or
|
||||||
|
* 2. without initial-all-set: all in one shot when
|
||||||
|
* enabling dirty logging.
|
||||||
|
*/
|
||||||
|
kvm_mmu_wp_memory_region(kvm, new->id);
|
||||||
|
kvm_mmu_split_memory_region(kvm, new->id);
|
||||||
|
} else {
|
||||||
|
/*
|
||||||
|
* Free any leftovers from the eager page splitting cache. Do
|
||||||
|
* this when deleting, moving, disabling dirty logging, or
|
||||||
|
* creating the memslot (a nop). Doing it for deletes makes
|
||||||
|
* sure we don't leak memory, and there's no need to keep the
|
||||||
|
* cache around for any of the other cases.
|
||||||
|
*/
|
||||||
|
kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1877,7 +2016,7 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
|
|||||||
|
|
||||||
void kvm_arch_flush_shadow_all(struct kvm *kvm)
|
void kvm_arch_flush_shadow_all(struct kvm *kvm)
|
||||||
{
|
{
|
||||||
kvm_free_stage2_pgd(&kvm->arch.mmu);
|
kvm_uninit_stage2_mmu(kvm);
|
||||||
}
|
}
|
||||||
|
|
||||||
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
|
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
|
||||||
|
@ -991,6 +991,8 @@ static inline bool kvm_memslots_empty(struct kvm_memslots *slots)
|
|||||||
return RB_EMPTY_ROOT(&slots->gfn_tree);
|
return RB_EMPTY_ROOT(&slots->gfn_tree);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool kvm_are_all_memslots_empty(struct kvm *kvm);
|
||||||
|
|
||||||
#define kvm_for_each_memslot(memslot, bkt, slots) \
|
#define kvm_for_each_memslot(memslot, bkt, slots) \
|
||||||
hash_for_each(slots->id_hash, bkt, memslot, id_node[slots->node_idx]) \
|
hash_for_each(slots->id_hash, bkt, memslot, id_node[slots->node_idx]) \
|
||||||
if (WARN_ON_ONCE(!memslot->npages)) { \
|
if (WARN_ON_ONCE(!memslot->npages)) { \
|
||||||
|
@ -1190,6 +1190,8 @@ struct kvm_ppc_resize_hpt {
|
|||||||
#define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225
|
#define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225
|
||||||
#define KVM_CAP_PMU_EVENT_MASKED_EVENTS 226
|
#define KVM_CAP_PMU_EVENT_MASKED_EVENTS 226
|
||||||
#define KVM_CAP_COUNTER_OFFSET 227
|
#define KVM_CAP_COUNTER_OFFSET 227
|
||||||
|
#define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228
|
||||||
|
#define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229
|
||||||
|
|
||||||
#ifdef KVM_CAP_IRQ_ROUTING
|
#ifdef KVM_CAP_IRQ_ROUTING
|
||||||
|
|
||||||
|
@ -4602,7 +4602,7 @@ int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
|
|||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool kvm_are_all_memslots_empty(struct kvm *kvm)
|
bool kvm_are_all_memslots_empty(struct kvm *kvm)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
@ -4615,6 +4615,7 @@ static bool kvm_are_all_memslots_empty(struct kvm *kvm)
|
|||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
EXPORT_SYMBOL_GPL(kvm_are_all_memslots_empty);
|
||||||
|
|
||||||
static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
|
static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
|
||||||
struct kvm_enable_cap *cap)
|
struct kvm_enable_cap *cap)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user