367 lines
9.1 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0
/*
* ACRN: Memory mapping management
*
* Copyright (C) 2020 Intel Corporation. All rights reserved.
*
* Authors:
* Fei Li <lei1.li@intel.com>
* Shuo Liu <shuo.a.liu@intel.com>
*/
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/slab.h>
fix missing vmalloc.h includes Patch series "Memory allocation profiling", v6. Overview: Low overhead [1] per-callsite memory allocation profiling. Not just for debug kernels, overhead low enough to be deployed in production. Example output: root@moria-kvm:~# sort -rn /proc/allocinfo 127664128 31168 mm/page_ext.c:270 func:alloc_page_ext 56373248 4737 mm/slub.c:2259 func:alloc_slab_page 14880768 3633 mm/readahead.c:247 func:page_cache_ra_unbounded 14417920 3520 mm/mm_init.c:2530 func:alloc_large_system_hash 13377536 234 block/blk-mq.c:3421 func:blk_mq_alloc_rqs 11718656 2861 mm/filemap.c:1919 func:__filemap_get_folio 9192960 2800 kernel/fork.c:307 func:alloc_thread_stack_node 4206592 4 net/netfilter/nf_conntrack_core.c:2567 func:nf_ct_alloc_hashtable 4136960 1010 drivers/staging/ctagmod/ctagmod.c:20 [ctagmod] func:ctagmod_start 3940352 962 mm/memory.c:4214 func:alloc_anon_folio 2894464 22613 fs/kernfs/dir.c:615 func:__kernfs_new_node ... Usage: kconfig options: - CONFIG_MEM_ALLOC_PROFILING - CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT - CONFIG_MEM_ALLOC_PROFILING_DEBUG adds warnings for allocations that weren't accounted because of a missing annotation sysctl: /proc/sys/vm/mem_profiling Runtime info: /proc/allocinfo Notes: [1]: Overhead To measure the overhead we are comparing the following configurations: (1) Baseline with CONFIG_MEMCG_KMEM=n (2) Disabled by default (CONFIG_MEM_ALLOC_PROFILING=y && CONFIG_MEM_ALLOC_PROFILING_BY_DEFAULT=n) (3) Enabled by default (CONFIG_MEM_ALLOC_PROFILING=y && CONFIG_MEM_ALLOC_PROFILING_BY_DEFAULT=y) (4) Enabled at runtime (CONFIG_MEM_ALLOC_PROFILING=y && CONFIG_MEM_ALLOC_PROFILING_BY_DEFAULT=n && /proc/sys/vm/mem_profiling=1) (5) Baseline with CONFIG_MEMCG_KMEM=y && allocating with __GFP_ACCOUNT (6) Disabled by default (CONFIG_MEM_ALLOC_PROFILING=y && CONFIG_MEM_ALLOC_PROFILING_BY_DEFAULT=n) && CONFIG_MEMCG_KMEM=y (7) Enabled by default (CONFIG_MEM_ALLOC_PROFILING=y && CONFIG_MEM_ALLOC_PROFILING_BY_DEFAULT=y) && CONFIG_MEMCG_KMEM=y Performance overhead: To evaluate performance we implemented an in-kernel test executing multiple get_free_page/free_page and kmalloc/kfree calls with allocation sizes growing from 8 to 240 bytes with CPU frequency set to max and CPU affinity set to a specific CPU to minimize the noise. Below are results from running the test on Ubuntu 22.04.2 LTS with 6.8.0-rc1 kernel on 56 core Intel Xeon: kmalloc pgalloc (1 baseline) 6.764s 16.902s (2 default disabled) 6.793s (+0.43%) 17.007s (+0.62%) (3 default enabled) 7.197s (+6.40%) 23.666s (+40.02%) (4 runtime enabled) 7.405s (+9.48%) 23.901s (+41.41%) (5 memcg) 13.388s (+97.94%) 48.460s (+186.71%) (6 def disabled+memcg) 13.332s (+97.10%) 48.105s (+184.61%) (7 def enabled+memcg) 13.446s (+98.78%) 54.963s (+225.18%) Memory overhead: Kernel size: text data bss dec diff (1) 26515311 18890222 17018880 62424413 (2) 26524728 19423818 16740352 62688898 264485 (3) 26524724 19423818 16740352 62688894 264481 (4) 26524728 19423818 16740352 62688898 264485 (5) 26541782 18964374 16957440 62463596 39183 Memory consumption on a 56 core Intel CPU with 125GB of memory: Code tags: 192 kB PageExts: 262144 kB (256MB) SlabExts: 9876 kB (9.6MB) PcpuExts: 512 kB (0.5MB) Total overhead is 0.2% of total memory. Benchmarks: Hackbench tests run 100 times: hackbench -s 512 -l 200 -g 15 -f 25 -P baseline disabled profiling enabled profiling avg 0.3543 0.3559 (+0.0016) 0.3566 (+0.0023) stdev 0.0137 0.0188 0.0077 hackbench -l 10000 baseline disabled profiling enabled profiling avg 6.4218 6.4306 (+0.0088) 6.5077 (+0.0859) stdev 0.0933 0.0286 0.0489 stress-ng tests: stress-ng --class memory --seq 4 -t 60 stress-ng --class cpu --seq 4 -t 60 Results posted at: https://evilpiepirate.org/~kent/memalloc_prof_v4_stress-ng/ [2] https://lore.kernel.org/all/20240306182440.2003814-1-surenb@google.com/ This patch (of 37): The next patch drops vmalloc.h from a system header in order to fix a circular dependency; this adds it to all the files that were pulling it in implicitly. [kent.overstreet@linux.dev: fix arch/alpha/lib/memcpy.c] Link: https://lkml.kernel.org/r/20240327002152.3339937-1-kent.overstreet@linux.dev [surenb@google.com: fix arch/x86/mm/numa_32.c] Link: https://lkml.kernel.org/r/20240402180933.1663992-1-surenb@google.com [kent.overstreet@linux.dev: a few places were depending on sizes.h] Link: https://lkml.kernel.org/r/20240404034744.1664840-1-kent.overstreet@linux.dev [arnd@arndb.de: fix mm/kasan/hw_tags.c] Link: https://lkml.kernel.org/r/20240404124435.3121534-1-arnd@kernel.org [surenb@google.com: fix arc build] Link: https://lkml.kernel.org/r/20240405225115.431056-1-surenb@google.com Link: https://lkml.kernel.org/r/20240321163705.3067592-1-surenb@google.com Link: https://lkml.kernel.org/r/20240321163705.3067592-2-surenb@google.com Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev> Signed-off-by: Suren Baghdasaryan <surenb@google.com> Signed-off-by: Arnd Bergmann <arnd@arndb.de> Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com> Tested-by: Kees Cook <keescook@chromium.org> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Alex Gaynor <alex.gaynor@gmail.com> Cc: Alice Ryhl <aliceryhl@google.com> Cc: Andreas Hindborg <a.hindborg@samsung.com> Cc: Benno Lossin <benno.lossin@proton.me> Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Christoph Lameter <cl@linux.com> Cc: Dennis Zhou <dennis@kernel.org> Cc: Gary Guo <gary@garyguo.net> Cc: Miguel Ojeda <ojeda@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Tejun Heo <tj@kernel.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Wedson Almeida Filho <wedsonaf@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-03-21 09:36:23 -07:00
#include <linux/vmalloc.h>
#include "acrn_drv.h"
static int modify_region(struct acrn_vm *vm, struct vm_memory_region_op *region)
{
struct vm_memory_region_batch *regions;
int ret;
regions = kzalloc(sizeof(*regions), GFP_KERNEL);
if (!regions)
return -ENOMEM;
regions->vmid = vm->vmid;
regions->regions_num = 1;
regions->regions_gpa = virt_to_phys(region);
ret = hcall_set_memory_regions(virt_to_phys(regions));
if (ret < 0)
dev_dbg(acrn_dev.this_device,
"Failed to set memory region for VM[%u]!\n", vm->vmid);
kfree(regions);
return ret;
}
/**
* acrn_mm_region_add() - Set up the EPT mapping of a memory region.
* @vm: User VM.
* @user_gpa: A GPA of User VM.
* @service_gpa: A GPA of Service VM.
* @size: Size of the region.
* @mem_type: Combination of ACRN_MEM_TYPE_*.
* @mem_access_right: Combination of ACRN_MEM_ACCESS_*.
*
* Return: 0 on success, <0 on error.
*/
int acrn_mm_region_add(struct acrn_vm *vm, u64 user_gpa, u64 service_gpa,
u64 size, u32 mem_type, u32 mem_access_right)
{
struct vm_memory_region_op *region;
int ret = 0;
region = kzalloc(sizeof(*region), GFP_KERNEL);
if (!region)
return -ENOMEM;
region->type = ACRN_MEM_REGION_ADD;
region->user_vm_pa = user_gpa;
region->service_vm_pa = service_gpa;
region->size = size;
region->attr = ((mem_type & ACRN_MEM_TYPE_MASK) |
(mem_access_right & ACRN_MEM_ACCESS_RIGHT_MASK));
ret = modify_region(vm, region);
dev_dbg(acrn_dev.this_device,
"%s: user-GPA[%pK] service-GPA[%pK] size[0x%llx].\n",
__func__, (void *)user_gpa, (void *)service_gpa, size);
kfree(region);
return ret;
}
/**
* acrn_mm_region_del() - Del the EPT mapping of a memory region.
* @vm: User VM.
* @user_gpa: A GPA of the User VM.
* @size: Size of the region.
*
* Return: 0 on success, <0 for error.
*/
int acrn_mm_region_del(struct acrn_vm *vm, u64 user_gpa, u64 size)
{
struct vm_memory_region_op *region;
int ret = 0;
region = kzalloc(sizeof(*region), GFP_KERNEL);
if (!region)
return -ENOMEM;
region->type = ACRN_MEM_REGION_DEL;
region->user_vm_pa = user_gpa;
region->service_vm_pa = 0UL;
region->size = size;
region->attr = 0U;
ret = modify_region(vm, region);
dev_dbg(acrn_dev.this_device, "%s: user-GPA[%pK] size[0x%llx].\n",
__func__, (void *)user_gpa, size);
kfree(region);
return ret;
}
int acrn_vm_memseg_map(struct acrn_vm *vm, struct acrn_vm_memmap *memmap)
{
int ret;
if (memmap->type == ACRN_MEMMAP_RAM)
return acrn_vm_ram_map(vm, memmap);
if (memmap->type != ACRN_MEMMAP_MMIO) {
dev_dbg(acrn_dev.this_device,
"Invalid memmap type: %u\n", memmap->type);
return -EINVAL;
}
ret = acrn_mm_region_add(vm, memmap->user_vm_pa,
memmap->service_vm_pa, memmap->len,
ACRN_MEM_TYPE_UC, memmap->attr);
if (ret < 0)
dev_dbg(acrn_dev.this_device,
"Add memory region failed, VM[%u]!\n", vm->vmid);
return ret;
}
int acrn_vm_memseg_unmap(struct acrn_vm *vm, struct acrn_vm_memmap *memmap)
{
int ret;
if (memmap->type != ACRN_MEMMAP_MMIO) {
dev_dbg(acrn_dev.this_device,
"Invalid memmap type: %u\n", memmap->type);
return -EINVAL;
}
ret = acrn_mm_region_del(vm, memmap->user_vm_pa, memmap->len);
if (ret < 0)
dev_dbg(acrn_dev.this_device,
"Del memory region failed, VM[%u]!\n", vm->vmid);
return ret;
}
/**
* acrn_vm_ram_map() - Create a RAM EPT mapping of User VM.
* @vm: The User VM pointer
* @memmap: Info of the EPT mapping
*
* Return: 0 on success, <0 for error.
*/
int acrn_vm_ram_map(struct acrn_vm *vm, struct acrn_vm_memmap *memmap)
{
struct vm_memory_region_batch *regions_info;
drivers/virt/acrn: fix PFNMAP PTE checks in acrn_vm_ram_map() Patch series "mm: follow_pte() improvements and acrn follow_pte() fixes". Patch #1 fixes a bunch of issues I spotted in the acrn driver. It compiles, that's all I know. I'll appreciate some review and testing from acrn folks. Patch #2+#3 improve follow_pte(), passing a VMA instead of the MM, adding more sanity checks, and improving the documentation. Gave it a quick test on x86-64 using VM_PAT that ends up using follow_pte(). This patch (of 3): We currently miss handling various cases, resulting in a dangerous follow_pte() (previously follow_pfn()) usage. (1) We're not checking PTE write permissions. Maybe we should simply always require pte_write() like we do for pin_user_pages_fast(FOLL_WRITE)? Hard to tell, so let's check for ACRN_MEM_ACCESS_WRITE for now. (2) We're not rejecting refcounted pages. As we are not using MMU notifiers, messing with refcounted pages is dangerous and can result in use-after-free. Let's make sure to reject them. (3) We are only looking at the first PTE of a bigger range. We only lookup a single PTE, but memmap->len may span a larger area. Let's loop over all involved PTEs and make sure the PFN range is actually contiguous. Reject everything else: it couldn't have worked either way, and rather made use access PFNs we shouldn't be accessing. Link: https://lkml.kernel.org/r/20240410155527.474777-1-david@redhat.com Link: https://lkml.kernel.org/r/20240410155527.474777-2-david@redhat.com Fixes: 8a6e85f75a83 ("virt: acrn: obtain pa from VMA with PFNMAP flag") Signed-off-by: David Hildenbrand <david@redhat.com> Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Fei Li <fei1.li@intel.com> Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Paolo Bonzini <pbonzini@redhat.com> Cc: Yonghua Huang <yonghua.huang@intel.com> Cc: Sean Christopherson <seanjc@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-04-10 17:55:25 +02:00
int nr_pages, i, order, nr_regions = 0;
struct vm_memory_mapping *region_mapping;
struct vm_memory_region_op *vm_region;
struct page **pages = NULL, *page;
void *remap_vaddr;
int ret, pinned;
u64 user_vm_pa;
struct vm_area_struct *vma;
if (!vm || !memmap)
return -EINVAL;
drivers/virt/acrn: fix PFNMAP PTE checks in acrn_vm_ram_map() Patch series "mm: follow_pte() improvements and acrn follow_pte() fixes". Patch #1 fixes a bunch of issues I spotted in the acrn driver. It compiles, that's all I know. I'll appreciate some review and testing from acrn folks. Patch #2+#3 improve follow_pte(), passing a VMA instead of the MM, adding more sanity checks, and improving the documentation. Gave it a quick test on x86-64 using VM_PAT that ends up using follow_pte(). This patch (of 3): We currently miss handling various cases, resulting in a dangerous follow_pte() (previously follow_pfn()) usage. (1) We're not checking PTE write permissions. Maybe we should simply always require pte_write() like we do for pin_user_pages_fast(FOLL_WRITE)? Hard to tell, so let's check for ACRN_MEM_ACCESS_WRITE for now. (2) We're not rejecting refcounted pages. As we are not using MMU notifiers, messing with refcounted pages is dangerous and can result in use-after-free. Let's make sure to reject them. (3) We are only looking at the first PTE of a bigger range. We only lookup a single PTE, but memmap->len may span a larger area. Let's loop over all involved PTEs and make sure the PFN range is actually contiguous. Reject everything else: it couldn't have worked either way, and rather made use access PFNs we shouldn't be accessing. Link: https://lkml.kernel.org/r/20240410155527.474777-1-david@redhat.com Link: https://lkml.kernel.org/r/20240410155527.474777-2-david@redhat.com Fixes: 8a6e85f75a83 ("virt: acrn: obtain pa from VMA with PFNMAP flag") Signed-off-by: David Hildenbrand <david@redhat.com> Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Fei Li <fei1.li@intel.com> Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Paolo Bonzini <pbonzini@redhat.com> Cc: Yonghua Huang <yonghua.huang@intel.com> Cc: Sean Christopherson <seanjc@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-04-10 17:55:25 +02:00
/* Get the page number of the map region */
nr_pages = memmap->len >> PAGE_SHIFT;
if (!nr_pages)
return -EINVAL;
mmap_read_lock(current->mm);
vma = vma_lookup(current->mm, memmap->vma_base);
if (vma && ((vma->vm_flags & VM_PFNMAP) != 0)) {
drivers/virt/acrn: fix PFNMAP PTE checks in acrn_vm_ram_map() Patch series "mm: follow_pte() improvements and acrn follow_pte() fixes". Patch #1 fixes a bunch of issues I spotted in the acrn driver. It compiles, that's all I know. I'll appreciate some review and testing from acrn folks. Patch #2+#3 improve follow_pte(), passing a VMA instead of the MM, adding more sanity checks, and improving the documentation. Gave it a quick test on x86-64 using VM_PAT that ends up using follow_pte(). This patch (of 3): We currently miss handling various cases, resulting in a dangerous follow_pte() (previously follow_pfn()) usage. (1) We're not checking PTE write permissions. Maybe we should simply always require pte_write() like we do for pin_user_pages_fast(FOLL_WRITE)? Hard to tell, so let's check for ACRN_MEM_ACCESS_WRITE for now. (2) We're not rejecting refcounted pages. As we are not using MMU notifiers, messing with refcounted pages is dangerous and can result in use-after-free. Let's make sure to reject them. (3) We are only looking at the first PTE of a bigger range. We only lookup a single PTE, but memmap->len may span a larger area. Let's loop over all involved PTEs and make sure the PFN range is actually contiguous. Reject everything else: it couldn't have worked either way, and rather made use access PFNs we shouldn't be accessing. Link: https://lkml.kernel.org/r/20240410155527.474777-1-david@redhat.com Link: https://lkml.kernel.org/r/20240410155527.474777-2-david@redhat.com Fixes: 8a6e85f75a83 ("virt: acrn: obtain pa from VMA with PFNMAP flag") Signed-off-by: David Hildenbrand <david@redhat.com> Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Fei Li <fei1.li@intel.com> Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Paolo Bonzini <pbonzini@redhat.com> Cc: Yonghua Huang <yonghua.huang@intel.com> Cc: Sean Christopherson <seanjc@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-04-10 17:55:25 +02:00
unsigned long start_pfn, cur_pfn;
spinlock_t *ptl;
drivers/virt/acrn: fix PFNMAP PTE checks in acrn_vm_ram_map() Patch series "mm: follow_pte() improvements and acrn follow_pte() fixes". Patch #1 fixes a bunch of issues I spotted in the acrn driver. It compiles, that's all I know. I'll appreciate some review and testing from acrn folks. Patch #2+#3 improve follow_pte(), passing a VMA instead of the MM, adding more sanity checks, and improving the documentation. Gave it a quick test on x86-64 using VM_PAT that ends up using follow_pte(). This patch (of 3): We currently miss handling various cases, resulting in a dangerous follow_pte() (previously follow_pfn()) usage. (1) We're not checking PTE write permissions. Maybe we should simply always require pte_write() like we do for pin_user_pages_fast(FOLL_WRITE)? Hard to tell, so let's check for ACRN_MEM_ACCESS_WRITE for now. (2) We're not rejecting refcounted pages. As we are not using MMU notifiers, messing with refcounted pages is dangerous and can result in use-after-free. Let's make sure to reject them. (3) We are only looking at the first PTE of a bigger range. We only lookup a single PTE, but memmap->len may span a larger area. Let's loop over all involved PTEs and make sure the PFN range is actually contiguous. Reject everything else: it couldn't have worked either way, and rather made use access PFNs we shouldn't be accessing. Link: https://lkml.kernel.org/r/20240410155527.474777-1-david@redhat.com Link: https://lkml.kernel.org/r/20240410155527.474777-2-david@redhat.com Fixes: 8a6e85f75a83 ("virt: acrn: obtain pa from VMA with PFNMAP flag") Signed-off-by: David Hildenbrand <david@redhat.com> Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Fei Li <fei1.li@intel.com> Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Paolo Bonzini <pbonzini@redhat.com> Cc: Yonghua Huang <yonghua.huang@intel.com> Cc: Sean Christopherson <seanjc@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-04-10 17:55:25 +02:00
bool writable;
pte_t *ptep;
if ((memmap->vma_base + memmap->len) > vma->vm_end) {
mmap_read_unlock(current->mm);
return -EINVAL;
}
drivers/virt/acrn: fix PFNMAP PTE checks in acrn_vm_ram_map() Patch series "mm: follow_pte() improvements and acrn follow_pte() fixes". Patch #1 fixes a bunch of issues I spotted in the acrn driver. It compiles, that's all I know. I'll appreciate some review and testing from acrn folks. Patch #2+#3 improve follow_pte(), passing a VMA instead of the MM, adding more sanity checks, and improving the documentation. Gave it a quick test on x86-64 using VM_PAT that ends up using follow_pte(). This patch (of 3): We currently miss handling various cases, resulting in a dangerous follow_pte() (previously follow_pfn()) usage. (1) We're not checking PTE write permissions. Maybe we should simply always require pte_write() like we do for pin_user_pages_fast(FOLL_WRITE)? Hard to tell, so let's check for ACRN_MEM_ACCESS_WRITE for now. (2) We're not rejecting refcounted pages. As we are not using MMU notifiers, messing with refcounted pages is dangerous and can result in use-after-free. Let's make sure to reject them. (3) We are only looking at the first PTE of a bigger range. We only lookup a single PTE, but memmap->len may span a larger area. Let's loop over all involved PTEs and make sure the PFN range is actually contiguous. Reject everything else: it couldn't have worked either way, and rather made use access PFNs we shouldn't be accessing. Link: https://lkml.kernel.org/r/20240410155527.474777-1-david@redhat.com Link: https://lkml.kernel.org/r/20240410155527.474777-2-david@redhat.com Fixes: 8a6e85f75a83 ("virt: acrn: obtain pa from VMA with PFNMAP flag") Signed-off-by: David Hildenbrand <david@redhat.com> Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Fei Li <fei1.li@intel.com> Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Paolo Bonzini <pbonzini@redhat.com> Cc: Yonghua Huang <yonghua.huang@intel.com> Cc: Sean Christopherson <seanjc@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-04-10 17:55:25 +02:00
for (i = 0; i < nr_pages; i++) {
ret = follow_pte(vma, memmap->vma_base + i * PAGE_SIZE,
drivers/virt/acrn: fix PFNMAP PTE checks in acrn_vm_ram_map() Patch series "mm: follow_pte() improvements and acrn follow_pte() fixes". Patch #1 fixes a bunch of issues I spotted in the acrn driver. It compiles, that's all I know. I'll appreciate some review and testing from acrn folks. Patch #2+#3 improve follow_pte(), passing a VMA instead of the MM, adding more sanity checks, and improving the documentation. Gave it a quick test on x86-64 using VM_PAT that ends up using follow_pte(). This patch (of 3): We currently miss handling various cases, resulting in a dangerous follow_pte() (previously follow_pfn()) usage. (1) We're not checking PTE write permissions. Maybe we should simply always require pte_write() like we do for pin_user_pages_fast(FOLL_WRITE)? Hard to tell, so let's check for ACRN_MEM_ACCESS_WRITE for now. (2) We're not rejecting refcounted pages. As we are not using MMU notifiers, messing with refcounted pages is dangerous and can result in use-after-free. Let's make sure to reject them. (3) We are only looking at the first PTE of a bigger range. We only lookup a single PTE, but memmap->len may span a larger area. Let's loop over all involved PTEs and make sure the PFN range is actually contiguous. Reject everything else: it couldn't have worked either way, and rather made use access PFNs we shouldn't be accessing. Link: https://lkml.kernel.org/r/20240410155527.474777-1-david@redhat.com Link: https://lkml.kernel.org/r/20240410155527.474777-2-david@redhat.com Fixes: 8a6e85f75a83 ("virt: acrn: obtain pa from VMA with PFNMAP flag") Signed-off-by: David Hildenbrand <david@redhat.com> Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Fei Li <fei1.li@intel.com> Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Paolo Bonzini <pbonzini@redhat.com> Cc: Yonghua Huang <yonghua.huang@intel.com> Cc: Sean Christopherson <seanjc@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-04-10 17:55:25 +02:00
&ptep, &ptl);
if (ret)
break;
cur_pfn = pte_pfn(ptep_get(ptep));
if (i == 0)
start_pfn = cur_pfn;
writable = !!pte_write(ptep_get(ptep));
pte_unmap_unlock(ptep, ptl);
/* Disallow write access if the PTE is not writable. */
if (!writable &&
(memmap->attr & ACRN_MEM_ACCESS_WRITE)) {
ret = -EFAULT;
break;
}
/* Disallow refcounted pages. */
if (pfn_valid(cur_pfn) &&
!PageReserved(pfn_to_page(cur_pfn))) {
ret = -EFAULT;
break;
}
/* Disallow non-contiguous ranges. */
if (cur_pfn != start_pfn + i) {
ret = -EINVAL;
break;
}
}
mmap_read_unlock(current->mm);
if (ret) {
dev_dbg(acrn_dev.this_device,
"Failed to lookup PFN at VMA:%pK.\n", (void *)memmap->vma_base);
return ret;
}
return acrn_mm_region_add(vm, memmap->user_vm_pa,
drivers/virt/acrn: fix PFNMAP PTE checks in acrn_vm_ram_map() Patch series "mm: follow_pte() improvements and acrn follow_pte() fixes". Patch #1 fixes a bunch of issues I spotted in the acrn driver. It compiles, that's all I know. I'll appreciate some review and testing from acrn folks. Patch #2+#3 improve follow_pte(), passing a VMA instead of the MM, adding more sanity checks, and improving the documentation. Gave it a quick test on x86-64 using VM_PAT that ends up using follow_pte(). This patch (of 3): We currently miss handling various cases, resulting in a dangerous follow_pte() (previously follow_pfn()) usage. (1) We're not checking PTE write permissions. Maybe we should simply always require pte_write() like we do for pin_user_pages_fast(FOLL_WRITE)? Hard to tell, so let's check for ACRN_MEM_ACCESS_WRITE for now. (2) We're not rejecting refcounted pages. As we are not using MMU notifiers, messing with refcounted pages is dangerous and can result in use-after-free. Let's make sure to reject them. (3) We are only looking at the first PTE of a bigger range. We only lookup a single PTE, but memmap->len may span a larger area. Let's loop over all involved PTEs and make sure the PFN range is actually contiguous. Reject everything else: it couldn't have worked either way, and rather made use access PFNs we shouldn't be accessing. Link: https://lkml.kernel.org/r/20240410155527.474777-1-david@redhat.com Link: https://lkml.kernel.org/r/20240410155527.474777-2-david@redhat.com Fixes: 8a6e85f75a83 ("virt: acrn: obtain pa from VMA with PFNMAP flag") Signed-off-by: David Hildenbrand <david@redhat.com> Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Fei Li <fei1.li@intel.com> Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Paolo Bonzini <pbonzini@redhat.com> Cc: Yonghua Huang <yonghua.huang@intel.com> Cc: Sean Christopherson <seanjc@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-04-10 17:55:25 +02:00
PFN_PHYS(start_pfn), memmap->len,
ACRN_MEM_TYPE_WB, memmap->attr);
}
mmap_read_unlock(current->mm);
pages = vzalloc(array_size(nr_pages, sizeof(*pages)));
if (!pages)
return -ENOMEM;
/* Lock the pages of user memory map region */
pinned = pin_user_pages_fast(memmap->vma_base,
nr_pages, FOLL_WRITE | FOLL_LONGTERM,
pages);
if (pinned < 0) {
ret = pinned;
goto free_pages;
} else if (pinned != nr_pages) {
ret = -EFAULT;
goto put_pages;
}
/* Create a kernel map for the map region */
remap_vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
if (!remap_vaddr) {
ret = -ENOMEM;
goto put_pages;
}
/* Record Service VM va <-> User VM pa mapping */
mutex_lock(&vm->regions_mapping_lock);
region_mapping = &vm->regions_mapping[vm->regions_mapping_count];
if (vm->regions_mapping_count < ACRN_MEM_MAPPING_MAX) {
region_mapping->pages = pages;
region_mapping->npages = nr_pages;
region_mapping->size = memmap->len;
region_mapping->service_vm_va = remap_vaddr;
region_mapping->user_vm_pa = memmap->user_vm_pa;
vm->regions_mapping_count++;
} else {
dev_warn(acrn_dev.this_device,
"Run out of memory mapping slots!\n");
ret = -ENOMEM;
mutex_unlock(&vm->regions_mapping_lock);
goto unmap_no_count;
}
mutex_unlock(&vm->regions_mapping_lock);
/* Calculate count of vm_memory_region_op */
drivers/virt/acrn: fix PFNMAP PTE checks in acrn_vm_ram_map() Patch series "mm: follow_pte() improvements and acrn follow_pte() fixes". Patch #1 fixes a bunch of issues I spotted in the acrn driver. It compiles, that's all I know. I'll appreciate some review and testing from acrn folks. Patch #2+#3 improve follow_pte(), passing a VMA instead of the MM, adding more sanity checks, and improving the documentation. Gave it a quick test on x86-64 using VM_PAT that ends up using follow_pte(). This patch (of 3): We currently miss handling various cases, resulting in a dangerous follow_pte() (previously follow_pfn()) usage. (1) We're not checking PTE write permissions. Maybe we should simply always require pte_write() like we do for pin_user_pages_fast(FOLL_WRITE)? Hard to tell, so let's check for ACRN_MEM_ACCESS_WRITE for now. (2) We're not rejecting refcounted pages. As we are not using MMU notifiers, messing with refcounted pages is dangerous and can result in use-after-free. Let's make sure to reject them. (3) We are only looking at the first PTE of a bigger range. We only lookup a single PTE, but memmap->len may span a larger area. Let's loop over all involved PTEs and make sure the PFN range is actually contiguous. Reject everything else: it couldn't have worked either way, and rather made use access PFNs we shouldn't be accessing. Link: https://lkml.kernel.org/r/20240410155527.474777-1-david@redhat.com Link: https://lkml.kernel.org/r/20240410155527.474777-2-david@redhat.com Fixes: 8a6e85f75a83 ("virt: acrn: obtain pa from VMA with PFNMAP flag") Signed-off-by: David Hildenbrand <david@redhat.com> Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Fei Li <fei1.li@intel.com> Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Paolo Bonzini <pbonzini@redhat.com> Cc: Yonghua Huang <yonghua.huang@intel.com> Cc: Sean Christopherson <seanjc@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-04-10 17:55:25 +02:00
for (i = 0; i < nr_pages; i += 1 << order) {
page = pages[i];
VM_BUG_ON_PAGE(PageTail(page), page);
order = compound_order(page);
nr_regions++;
}
/* Prepare the vm_memory_region_batch */
regions_info = kzalloc(struct_size(regions_info, regions_op,
nr_regions), GFP_KERNEL);
if (!regions_info) {
ret = -ENOMEM;
goto unmap_kernel_map;
}
regions_info->regions_num = nr_regions;
/* Fill each vm_memory_region_op */
vm_region = regions_info->regions_op;
regions_info->vmid = vm->vmid;
regions_info->regions_gpa = virt_to_phys(vm_region);
user_vm_pa = memmap->user_vm_pa;
drivers/virt/acrn: fix PFNMAP PTE checks in acrn_vm_ram_map() Patch series "mm: follow_pte() improvements and acrn follow_pte() fixes". Patch #1 fixes a bunch of issues I spotted in the acrn driver. It compiles, that's all I know. I'll appreciate some review and testing from acrn folks. Patch #2+#3 improve follow_pte(), passing a VMA instead of the MM, adding more sanity checks, and improving the documentation. Gave it a quick test on x86-64 using VM_PAT that ends up using follow_pte(). This patch (of 3): We currently miss handling various cases, resulting in a dangerous follow_pte() (previously follow_pfn()) usage. (1) We're not checking PTE write permissions. Maybe we should simply always require pte_write() like we do for pin_user_pages_fast(FOLL_WRITE)? Hard to tell, so let's check for ACRN_MEM_ACCESS_WRITE for now. (2) We're not rejecting refcounted pages. As we are not using MMU notifiers, messing with refcounted pages is dangerous and can result in use-after-free. Let's make sure to reject them. (3) We are only looking at the first PTE of a bigger range. We only lookup a single PTE, but memmap->len may span a larger area. Let's loop over all involved PTEs and make sure the PFN range is actually contiguous. Reject everything else: it couldn't have worked either way, and rather made use access PFNs we shouldn't be accessing. Link: https://lkml.kernel.org/r/20240410155527.474777-1-david@redhat.com Link: https://lkml.kernel.org/r/20240410155527.474777-2-david@redhat.com Fixes: 8a6e85f75a83 ("virt: acrn: obtain pa from VMA with PFNMAP flag") Signed-off-by: David Hildenbrand <david@redhat.com> Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Fei Li <fei1.li@intel.com> Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Paolo Bonzini <pbonzini@redhat.com> Cc: Yonghua Huang <yonghua.huang@intel.com> Cc: Sean Christopherson <seanjc@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-04-10 17:55:25 +02:00
for (i = 0; i < nr_pages; i += 1 << order) {
u32 region_size;
page = pages[i];
VM_BUG_ON_PAGE(PageTail(page), page);
order = compound_order(page);
region_size = PAGE_SIZE << order;
vm_region->type = ACRN_MEM_REGION_ADD;
vm_region->user_vm_pa = user_vm_pa;
vm_region->service_vm_pa = page_to_phys(page);
vm_region->size = region_size;
vm_region->attr = (ACRN_MEM_TYPE_WB & ACRN_MEM_TYPE_MASK) |
(memmap->attr & ACRN_MEM_ACCESS_RIGHT_MASK);
vm_region++;
user_vm_pa += region_size;
}
/* Inform the ACRN Hypervisor to set up EPT mappings */
ret = hcall_set_memory_regions(virt_to_phys(regions_info));
if (ret < 0) {
dev_dbg(acrn_dev.this_device,
"Failed to set regions, VM[%u]!\n", vm->vmid);
goto unset_region;
}
kfree(regions_info);
dev_dbg(acrn_dev.this_device,
"%s: VM[%u] service-GVA[%pK] user-GPA[%pK] size[0x%llx]\n",
__func__, vm->vmid,
remap_vaddr, (void *)memmap->user_vm_pa, memmap->len);
return ret;
unset_region:
kfree(regions_info);
unmap_kernel_map:
mutex_lock(&vm->regions_mapping_lock);
vm->regions_mapping_count--;
mutex_unlock(&vm->regions_mapping_lock);
unmap_no_count:
vunmap(remap_vaddr);
put_pages:
for (i = 0; i < pinned; i++)
unpin_user_page(pages[i]);
free_pages:
vfree(pages);
return ret;
}
/**
* acrn_vm_all_ram_unmap() - Destroy a RAM EPT mapping of User VM.
* @vm: The User VM
*/
void acrn_vm_all_ram_unmap(struct acrn_vm *vm)
{
struct vm_memory_mapping *region_mapping;
int i, j;
mutex_lock(&vm->regions_mapping_lock);
for (i = 0; i < vm->regions_mapping_count; i++) {
region_mapping = &vm->regions_mapping[i];
vunmap(region_mapping->service_vm_va);
for (j = 0; j < region_mapping->npages; j++)
unpin_user_page(region_mapping->pages[j]);
vfree(region_mapping->pages);
}
mutex_unlock(&vm->regions_mapping_lock);
}