Merge branch kvm-arm64/s2-ptdump into kvmarm-master/next

* kvm-arm64/s2-ptdump:
  : .
  : Stage-2 page table dumper, reusing the main ptdump infrastructure,
  : courtesy of Sebastian Ene. From the cover letter:
  :
  : "This series extends the ptdump support to allow dumping the guest
  : stage-2 pagetables. When CONFIG_PTDUMP_STAGE2_DEBUGFS is enabled, ptdump
  : registers the new following files under debugfs:
  : - /sys/debug/kvm/<guest_id>/stage2_page_tables
  : - /sys/debug/kvm/<guest_id>/stage2_levels
  : - /sys/debug/kvm/<guest_id>/ipa_range
  :
  : This allows userspace tools (eg. cat) to dump the stage-2 pagetables by
  : reading the 'stage2_page_tables' file.
  : [...]"
  : .
  KVM: arm64: Register ptdump with debugfs on guest creation
  arm64: ptdump: Don't override the level when operating on the stage-2 tables
  arm64: ptdump: Use the ptdump description from a local context
  arm64: ptdump: Expose the attribute parsing functionality
  KVM: arm64: Move pagetable definitions to common header

Signed-off-by: Marc Zyngier <maz@kernel.org>
This commit is contained in:
Marc Zyngier 2024-09-12 08:38:02 +01:00
commit f625469051
9 changed files with 397 additions and 93 deletions

View File

@ -352,5 +352,11 @@ static inline bool kvm_is_nested_s2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
return &kvm->arch.mmu != mmu;
}
#ifdef CONFIG_PTDUMP_STAGE2_DEBUGFS
void kvm_s2_ptdump_create_debugfs(struct kvm *kvm);
#else
static inline void kvm_s2_ptdump_create_debugfs(struct kvm *kvm) {}
#endif /* CONFIG_PTDUMP_STAGE2_DEBUGFS */
#endif /* __ASSEMBLY__ */
#endif /* __ARM64_KVM_MMU_H__ */

View File

@ -59,6 +59,48 @@ typedef u64 kvm_pte_t;
#define KVM_PHYS_INVALID (-1ULL)
#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2)
#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2)
#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6)
#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO \
({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 2 : 3; })
#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW \
({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 0 : 1; })
#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8)
#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3
#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10)
#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2)
#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6)
#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7)
#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8)
#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3
#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10)
#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 50)
#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55)
#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54)
#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54)
#define KVM_PTE_LEAF_ATTR_HI_S1_GP BIT(50)
#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \
KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
KVM_PTE_LEAF_ATTR_HI_S2_XN)
#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2)
#define KVM_MAX_OWNER_ID 1
/*
* Used to indicate a pte for which a 'break-before-make' sequence is in
* progress.
*/
#define KVM_INVALID_PTE_LOCKED BIT(10)
static inline bool kvm_pte_valid(kvm_pte_t pte)
{
return pte & KVM_PTE_VALID;

View File

@ -5,6 +5,8 @@
#ifndef __ASM_PTDUMP_H
#define __ASM_PTDUMP_H
#include <linux/ptdump.h>
#ifdef CONFIG_PTDUMP_CORE
#include <linux/mm_types.h>
@ -21,14 +23,53 @@ struct ptdump_info {
unsigned long base_addr;
};
struct ptdump_prot_bits {
u64 mask;
u64 val;
const char *set;
const char *clear;
};
struct ptdump_pg_level {
const struct ptdump_prot_bits *bits;
char name[4];
int num;
u64 mask;
};
/*
* The page dumper groups page table entries of the same type into a single
* description. It uses pg_state to track the range information while
* iterating over the pte entries. When the continuity is broken it then
* dumps out a description of the range.
*/
struct ptdump_pg_state {
struct ptdump_state ptdump;
struct ptdump_pg_level *pg_level;
struct seq_file *seq;
const struct addr_marker *marker;
const struct mm_struct *mm;
unsigned long start_address;
int level;
u64 current_prot;
bool check_wx;
unsigned long wx_pages;
unsigned long uxn_pages;
};
void ptdump_walk(struct seq_file *s, struct ptdump_info *info);
void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
u64 val);
#ifdef CONFIG_PTDUMP_DEBUGFS
#define EFI_RUNTIME_MAP_END DEFAULT_MAP_WINDOW_64
void __init ptdump_debugfs_register(struct ptdump_info *info, const char *name);
#else
static inline void ptdump_debugfs_register(struct ptdump_info *info,
const char *name) { }
#endif
#endif /* CONFIG_PTDUMP_DEBUGFS */
#else
static inline void note_page(struct ptdump_state *pt_st, unsigned long addr,
int level, u64 val) { }
#endif /* CONFIG_PTDUMP_CORE */
#endif /* __ASM_PTDUMP_H */

View File

@ -66,4 +66,21 @@ config PROTECTED_NVHE_STACKTRACE
If unsure, or not using protected nVHE (pKVM), say N.
config PTDUMP_STAGE2_DEBUGFS
bool "Present the stage-2 pagetables to debugfs"
depends on KVM
depends on DEBUG_KERNEL
depends on DEBUG_FS
depends on GENERIC_PTDUMP
select PTDUMP_CORE
default n
help
Say Y here if you want to show the stage-2 kernel pagetables
layout in a debugfs file. This information is only useful for kernel developers
who are working in architecture specific areas of the kernel.
It is probably not a good idea to enable this feature in a production
kernel.
If in doubt, say N.
endif # VIRTUALIZATION

View File

@ -27,6 +27,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o
kvm-$(CONFIG_ARM64_PTR_AUTH) += pauth.o
kvm-$(CONFIG_PTDUMP_STAGE2_DEBUGFS) += ptdump.o
always-y := hyp_constants.h hyp-constants.s

View File

@ -230,6 +230,7 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
void kvm_arch_create_vm_debugfs(struct kvm *kvm)
{
kvm_sys_regs_create_debugfs(kvm);
kvm_s2_ptdump_create_debugfs(kvm);
}
static void kvm_destroy_mpidr_data(struct kvm *kvm)

View File

@ -17,48 +17,6 @@
#define KVM_PTE_TYPE_PAGE 1
#define KVM_PTE_TYPE_TABLE 1
#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2)
#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2)
#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6)
#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO \
({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 2 : 3; })
#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW \
({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 0 : 1; })
#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8)
#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3
#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10)
#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2)
#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6)
#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7)
#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8)
#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3
#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10)
#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 50)
#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55)
#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54)
#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54)
#define KVM_PTE_LEAF_ATTR_HI_S1_GP BIT(50)
#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \
KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
KVM_PTE_LEAF_ATTR_HI_S2_XN)
#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2)
#define KVM_MAX_OWNER_ID 1
/*
* Used to indicate a pte for which a 'break-before-make' sequence is in
* progress.
*/
#define KVM_INVALID_PTE_LOCKED BIT(10)
struct kvm_pgtable_walk_data {
struct kvm_pgtable_walker *walker;

268
arch/arm64/kvm/ptdump.c Normal file
View File

@ -0,0 +1,268 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Debug helper used to dump the stage-2 pagetables of the system and their
* associated permissions.
*
* Copyright (C) Google, 2024
* Author: Sebastian Ene <sebastianene@google.com>
*/
#include <linux/debugfs.h>
#include <linux/kvm_host.h>
#include <linux/seq_file.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pgtable.h>
#include <asm/ptdump.h>
#define MARKERS_LEN 2
#define KVM_PGTABLE_MAX_LEVELS (KVM_PGTABLE_LAST_LEVEL + 1)
struct kvm_ptdump_guest_state {
struct kvm *kvm;
struct ptdump_pg_state parser_state;
struct addr_marker ipa_marker[MARKERS_LEN];
struct ptdump_pg_level level[KVM_PGTABLE_MAX_LEVELS];
struct ptdump_range range[MARKERS_LEN];
};
static const struct ptdump_prot_bits stage2_pte_bits[] = {
{
.mask = PTE_VALID,
.val = PTE_VALID,
.set = " ",
.clear = "F",
}, {
.mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID,
.val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID,
.set = "R",
.clear = " ",
}, {
.mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID,
.val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID,
.set = "W",
.clear = " ",
}, {
.mask = KVM_PTE_LEAF_ATTR_HI_S2_XN | PTE_VALID,
.val = PTE_VALID,
.set = " ",
.clear = "X",
}, {
.mask = KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID,
.val = KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID,
.set = "AF",
.clear = " ",
}, {
.mask = PTE_TABLE_BIT | PTE_VALID,
.val = PTE_VALID,
.set = "BLK",
.clear = " ",
},
};
static int kvm_ptdump_visitor(const struct kvm_pgtable_visit_ctx *ctx,
enum kvm_pgtable_walk_flags visit)
{
struct ptdump_pg_state *st = ctx->arg;
struct ptdump_state *pt_st = &st->ptdump;
note_page(pt_st, ctx->addr, ctx->level, ctx->old);
return 0;
}
static int kvm_ptdump_build_levels(struct ptdump_pg_level *level, u32 start_lvl)
{
u32 i;
u64 mask;
if (WARN_ON_ONCE(start_lvl >= KVM_PGTABLE_LAST_LEVEL))
return -EINVAL;
mask = 0;
for (i = 0; i < ARRAY_SIZE(stage2_pte_bits); i++)
mask |= stage2_pte_bits[i].mask;
for (i = start_lvl; i < KVM_PGTABLE_MAX_LEVELS; i++) {
snprintf(level[i].name, sizeof(level[i].name), "%u", i);
level[i].num = ARRAY_SIZE(stage2_pte_bits);
level[i].bits = stage2_pte_bits;
level[i].mask = mask;
}
return 0;
}
static struct kvm_ptdump_guest_state *kvm_ptdump_parser_create(struct kvm *kvm)
{
struct kvm_ptdump_guest_state *st;
struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
struct kvm_pgtable *pgtable = mmu->pgt;
int ret;
st = kzalloc(sizeof(struct kvm_ptdump_guest_state), GFP_KERNEL_ACCOUNT);
if (!st)
return ERR_PTR(-ENOMEM);
ret = kvm_ptdump_build_levels(&st->level[0], pgtable->start_level);
if (ret) {
kfree(st);
return ERR_PTR(ret);
}
st->ipa_marker[0].name = "Guest IPA";
st->ipa_marker[1].start_address = BIT(pgtable->ia_bits);
st->range[0].end = BIT(pgtable->ia_bits);
st->kvm = kvm;
st->parser_state = (struct ptdump_pg_state) {
.marker = &st->ipa_marker[0],
.level = -1,
.pg_level = &st->level[0],
.ptdump.range = &st->range[0],
.start_address = 0,
};
return st;
}
static int kvm_ptdump_guest_show(struct seq_file *m, void *unused)
{
int ret;
struct kvm_ptdump_guest_state *st = m->private;
struct kvm *kvm = st->kvm;
struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
struct ptdump_pg_state *parser_state = &st->parser_state;
struct kvm_pgtable_walker walker = (struct kvm_pgtable_walker) {
.cb = kvm_ptdump_visitor,
.arg = parser_state,
.flags = KVM_PGTABLE_WALK_LEAF,
};
parser_state->seq = m;
write_lock(&kvm->mmu_lock);
ret = kvm_pgtable_walk(mmu->pgt, 0, BIT(mmu->pgt->ia_bits), &walker);
write_unlock(&kvm->mmu_lock);
return ret;
}
static int kvm_ptdump_guest_open(struct inode *m, struct file *file)
{
struct kvm *kvm = m->i_private;
struct kvm_ptdump_guest_state *st;
int ret;
if (!kvm_get_kvm_safe(kvm))
return -ENOENT;
st = kvm_ptdump_parser_create(kvm);
if (IS_ERR(st)) {
ret = PTR_ERR(st);
goto err_with_kvm_ref;
}
ret = single_open(file, kvm_ptdump_guest_show, st);
if (!ret)
return 0;
kfree(st);
err_with_kvm_ref:
kvm_put_kvm(kvm);
return ret;
}
static int kvm_ptdump_guest_close(struct inode *m, struct file *file)
{
struct kvm *kvm = m->i_private;
void *st = ((struct seq_file *)file->private_data)->private;
kfree(st);
kvm_put_kvm(kvm);
return single_release(m, file);
}
static const struct file_operations kvm_ptdump_guest_fops = {
.open = kvm_ptdump_guest_open,
.read = seq_read,
.llseek = seq_lseek,
.release = kvm_ptdump_guest_close,
};
static int kvm_pgtable_range_show(struct seq_file *m, void *unused)
{
struct kvm_pgtable *pgtable = m->private;
seq_printf(m, "%2u\n", pgtable->ia_bits);
return 0;
}
static int kvm_pgtable_levels_show(struct seq_file *m, void *unused)
{
struct kvm_pgtable *pgtable = m->private;
seq_printf(m, "%1d\n", KVM_PGTABLE_MAX_LEVELS - pgtable->start_level);
return 0;
}
static int kvm_pgtable_debugfs_open(struct inode *m, struct file *file,
int (*show)(struct seq_file *, void *))
{
struct kvm *kvm = m->i_private;
struct kvm_pgtable *pgtable;
int ret;
if (!kvm_get_kvm_safe(kvm))
return -ENOENT;
pgtable = kvm->arch.mmu.pgt;
ret = single_open(file, show, pgtable);
if (ret < 0)
kvm_put_kvm(kvm);
return ret;
}
static int kvm_pgtable_range_open(struct inode *m, struct file *file)
{
return kvm_pgtable_debugfs_open(m, file, kvm_pgtable_range_show);
}
static int kvm_pgtable_levels_open(struct inode *m, struct file *file)
{
return kvm_pgtable_debugfs_open(m, file, kvm_pgtable_levels_show);
}
static int kvm_pgtable_debugfs_close(struct inode *m, struct file *file)
{
struct kvm *kvm = m->i_private;
kvm_put_kvm(kvm);
return single_release(m, file);
}
static const struct file_operations kvm_pgtable_range_fops = {
.open = kvm_pgtable_range_open,
.read = seq_read,
.llseek = seq_lseek,
.release = kvm_pgtable_debugfs_close,
};
static const struct file_operations kvm_pgtable_levels_fops = {
.open = kvm_pgtable_levels_open,
.read = seq_read,
.llseek = seq_lseek,
.release = kvm_pgtable_debugfs_close,
};
void kvm_s2_ptdump_create_debugfs(struct kvm *kvm)
{
debugfs_create_file("stage2_page_tables", 0400, kvm->debugfs_dentry,
kvm, &kvm_ptdump_guest_fops);
debugfs_create_file("ipa_range", 0400, kvm->debugfs_dentry, kvm,
&kvm_pgtable_range_fops);
debugfs_create_file("stage2_levels", 0400, kvm->debugfs_dentry,
kvm, &kvm_pgtable_levels_fops);
}

View File

@ -38,33 +38,7 @@
seq_printf(m, fmt); \
})
/*
* The page dumper groups page table entries of the same type into a single
* description. It uses pg_state to track the range information while
* iterating over the pte entries. When the continuity is broken it then
* dumps out a description of the range.
*/
struct pg_state {
struct ptdump_state ptdump;
struct seq_file *seq;
const struct addr_marker *marker;
const struct mm_struct *mm;
unsigned long start_address;
int level;
u64 current_prot;
bool check_wx;
unsigned long wx_pages;
unsigned long uxn_pages;
};
struct prot_bits {
u64 mask;
u64 val;
const char *set;
const char *clear;
};
static const struct prot_bits pte_bits[] = {
static const struct ptdump_prot_bits pte_bits[] = {
{
.mask = PTE_VALID,
.val = PTE_VALID,
@ -143,14 +117,7 @@ static const struct prot_bits pte_bits[] = {
}
};
struct pg_level {
const struct prot_bits *bits;
char name[4];
int num;
u64 mask;
};
static struct pg_level pg_level[] __ro_after_init = {
static struct ptdump_pg_level kernel_pg_levels[] __ro_after_init = {
{ /* pgd */
.name = "PGD",
.bits = pte_bits,
@ -174,7 +141,7 @@ static struct pg_level pg_level[] __ro_after_init = {
},
};
static void dump_prot(struct pg_state *st, const struct prot_bits *bits,
static void dump_prot(struct ptdump_pg_state *st, const struct ptdump_prot_bits *bits,
size_t num)
{
unsigned i;
@ -192,7 +159,7 @@ static void dump_prot(struct pg_state *st, const struct prot_bits *bits,
}
}
static void note_prot_uxn(struct pg_state *st, unsigned long addr)
static void note_prot_uxn(struct ptdump_pg_state *st, unsigned long addr)
{
if (!st->check_wx)
return;
@ -206,7 +173,7 @@ static void note_prot_uxn(struct pg_state *st, unsigned long addr)
st->uxn_pages += (addr - st->start_address) / PAGE_SIZE;
}
static void note_prot_wx(struct pg_state *st, unsigned long addr)
static void note_prot_wx(struct ptdump_pg_state *st, unsigned long addr)
{
if (!st->check_wx)
return;
@ -221,16 +188,17 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr)
st->wx_pages += (addr - st->start_address) / PAGE_SIZE;
}
static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
u64 val)
void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
u64 val)
{
struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
struct ptdump_pg_state *st = container_of(pt_st, struct ptdump_pg_state, ptdump);
struct ptdump_pg_level *pg_level = st->pg_level;
static const char units[] = "KMGTPE";
u64 prot = 0;
/* check if the current level has been folded dynamically */
if ((level == 1 && mm_p4d_folded(st->mm)) ||
(level == 2 && mm_pud_folded(st->mm)))
if (st->mm && ((level == 1 && mm_p4d_folded(st->mm)) ||
(level == 2 && mm_pud_folded(st->mm))))
level = 0;
if (level >= 0)
@ -286,15 +254,16 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
void ptdump_walk(struct seq_file *s, struct ptdump_info *info)
{
unsigned long end = ~0UL;
struct pg_state st;
struct ptdump_pg_state st;
if (info->base_addr < TASK_SIZE_64)
end = TASK_SIZE_64;
st = (struct pg_state){
st = (struct ptdump_pg_state){
.seq = s,
.marker = info->markers,
.mm = info->mm,
.pg_level = &kernel_pg_levels[0],
.level = -1,
.ptdump = {
.note_page = note_page,
@ -312,10 +281,10 @@ static void __init ptdump_initialize(void)
{
unsigned i, j;
for (i = 0; i < ARRAY_SIZE(pg_level); i++)
if (pg_level[i].bits)
for (j = 0; j < pg_level[i].num; j++)
pg_level[i].mask |= pg_level[i].bits[j].mask;
for (i = 0; i < ARRAY_SIZE(kernel_pg_levels); i++)
if (kernel_pg_levels[i].bits)
for (j = 0; j < kernel_pg_levels[i].num; j++)
kernel_pg_levels[i].mask |= kernel_pg_levels[i].bits[j].mask;
}
static struct ptdump_info kernel_ptdump_info __ro_after_init = {
@ -324,12 +293,13 @@ static struct ptdump_info kernel_ptdump_info __ro_after_init = {
bool ptdump_check_wx(void)
{
struct pg_state st = {
struct ptdump_pg_state st = {
.seq = NULL,
.marker = (struct addr_marker[]) {
{ 0, NULL},
{ -1, NULL},
},
.pg_level = &kernel_pg_levels[0],
.level = -1,
.check_wx = true,
.ptdump = {