mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-07 22:03:14 +00:00
617a814f14
this pull request are: "Align kvrealloc() with krealloc()" from Danilo Krummrich. Adds consistency to the APIs and behaviour of these two core allocation functions. This also simplifies/enables Rustification. "Some cleanups for shmem" from Baolin Wang. No functional changes - more code reuse, better function naming, logic simplifications. "mm: some small page fault cleanups" from Josef Bacik. No functional changes - code cleanups only. "Various memory tiering fixes" from Zi Yan. A small fix and a little cleanup. "mm/swap: remove boilerplate" from Yu Zhao. Code cleanups and simplifications and .text shrinkage. "Kernel stack usage histogram" from Pasha Tatashin and Shakeel Butt. This is a feature, it adds new fields to /proc/vmstat such as $ grep kstack /proc/vmstat kstack_1k 3 kstack_2k 188 kstack_4k 11391 kstack_8k 243 kstack_16k 0 which tells us that 11391 processes used 4k of stack while none at all used 16k. Useful for some system tuning things, but particularly useful for "the dynamic kernel stack project". "kmemleak: support for percpu memory leak detect" from Pavel Tikhomirov. Teaches kmemleak to detect leakage of percpu memory. "mm: memcg: page counters optimizations" from Roman Gushchin. "3 independent small optimizations of page counters". "mm: split PTE/PMD PT table Kconfig cleanups+clarifications" from David Hildenbrand. Improves PTE/PMD splitlock detection, makes powerpc/8xx work correctly by design rather than by accident. "mm: remove arch_make_page_accessible()" from David Hildenbrand. Some folio conversions which make arch_make_page_accessible() unneeded. "mm, memcg: cg2 memory{.swap,}.peak write handlers" from David Finkel. Cleans up and fixes our handling of the resetting of the cgroup/process peak-memory-use detector. "Make core VMA operations internal and testable" from Lorenzo Stoakes. Rationalization and encapsulation of the VMA manipulation APIs. With a view to better enable testing of the VMA functions, even from a userspace-only harness. 
"mm: zswap: fixes for global shrinker" from Takero Funaki. Fix issues in the zswap global shrinker, resulting in improved performance. "mm: print the promo watermark in zoneinfo" from Kaiyang Zhao. Fill in some missing info in /proc/zoneinfo. "mm: replace follow_page() by folio_walk" from David Hildenbrand. Code cleanups and rationalizations (conversion to folio_walk()) resulting in the removal of follow_page(). "improving dynamic zswap shrinker protection scheme" from Nhat Pham. Some tuning to improve zswap's dynamic shrinker. Significant reductions in swapin and improvements in performance are shown. "mm: Fix several issues with unaccepted memory" from Kirill Shutemov. Improvements to the new unaccepted memory feature. "mm/mprotect: Fix dax puds" from Peter Xu. Implements mprotect on DAX PUDs. This was missing, although nobody seems to have noticed yet. "Introduce a store type enum for the Maple tree" from Sidhartha Kumar. Cleanups and modest performance improvements for the maple tree library code. "memcg: further decouple v1 code from v2" from Shakeel Butt. Move more cgroup v1 remnants away from the v2 memcg code. "memcg: initiate deprecation of v1 features" from Shakeel Butt. Adds various warnings telling users that memcg v1 features are deprecated. "mm: swap: mTHP swap allocator base on swap cluster order" from Chris Li. Greatly improves the success rate of the mTHP swap allocation. "mm: introduce numa_memblks" from Mike Rapoport. Moves various disparate per-arch implementations of numa_memblk code into generic code. "mm: batch free swaps for zap_pte_range()" from Barry Song. Greatly improves the performance of munmap() of swap-filled ptes. "support large folio swap-out and swap-in for shmem" from Baolin Wang. With this series we no longer split shmem large folios into single-page folios when swapping out shmem. "mm/hugetlb: alloc/free gigantic folios" from Yu Zhao. Nice performance improvements and code reductions for gigantic folios. 
"support shmem mTHP collapse" from Baolin Wang. Adds support for khugepaged's collapsing of shmem mTHP folios. "mm: Optimize mseal checks" from Pedro Falcato. Fixes an mprotect() performance regression due to the addition of mseal(). "Increase the number of bits available in page_type" from Matthew Wilcox. Increases the number of bits available in page_type! "Simplify the page flags a little" from Matthew Wilcox. Many legacy page flags are now folio flags, so the page-based flags and their accessors/mutators can be removed. "mm: store zero pages to be swapped out in a bitmap" from Usama Arif. An optimization which permits us to avoid writing/reading zero-filled zswap pages to backing store. "Avoid MAP_FIXED gap exposure" from Liam Howlett. Fixes a race window which occurs when a MAP_FIXED operation is occurring during an unrelated vma tree walk. "mm: remove vma_merge()" from Lorenzo Stoakes. Major rotorooting of the vma_merge() functionality, making it cleaner, more testable and better tested. "misc fixups for DAMON {self,kunit} tests" from SeongJae Park. Minor fixups of DAMON selftests and kunit tests. "mm: memory_hotplug: improve do_migrate_range()" from Kefeng Wang. Code cleanups and folio conversions. "Shmem mTHP controls and stats improvements" from Ryan Roberts. Cleanups for shmem controls and stats. "mm: count the number of anonymous THPs per size" from Barry Song. Expose additional anon THP stats to userspace for improved tuning. "mm: finish isolate/putback_lru_page()" from Kefeng Wang: more folio conversions and removal of now-unused page-based APIs. "replace per-quota region priorities histogram buffer with per-context one" from SeongJae Park. DAMON histogram rationalization. "Docs/damon: update GitHub repo URLs and maintainer-profile" from SeongJae Park. DAMON documentation updates. 
"mm/vdpa: correct misuse of non-direct-reclaim __GFP_NOFAIL and improve related doc and warn" from Jason Wang: fixes usage of page allocator __GFP_NOFAIL and GFP_ATOMIC flags. "mm: split underused THPs" from Yu Zhao. Improve THP=always policy - this was overprovisioning THPs in sparsely accessed memory areas. "zram: introduce custom comp backends API" frm Sergey Senozhatsky. Add support for zram run-time compression algorithm tuning. "mm: Care about shadow stack guard gap when getting an unmapped area" from Mark Brown. Fix up the various arch_get_unmapped_area() implementations to better respect guard areas. "Improve mem_cgroup_iter()" from Kinsey Ho. Improve the reliability of mem_cgroup_iter() and various code cleanups. "mm: Support huge pfnmaps" from Peter Xu. Extends the usage of huge pfnmap support. "resource: Fix region_intersects() vs add_memory_driver_managed()" from Huang Ying. Fix a bug in region_intersects() for systems with CXL memory. "mm: hwpoison: two more poison recovery" from Kefeng Wang. Teaches a couple more code paths to correctly recover from the encountering of poisoned memry. "mm: enable large folios swap-in support" from Barry Song. Support the swapin of mTHP memory into appropriately-sized folios, rather than into single-page folios. -----BEGIN PGP SIGNATURE----- iHUEABYIAB0WIQTTMBEPP41GrTpTJgfdBJ7gKXxAjgUCZu1BBwAKCRDdBJ7gKXxA jlWNAQDYlqQLun7bgsAN4sSvi27VUuWv1q70jlMXTfmjJAvQqwD/fBFVR6IOOiw7 AkDbKWP2k0hWPiNJBGwoqxdHHx09Xgo= =s0T+ -----END PGP SIGNATURE----- Merge tag 'mm-stable-2024-09-20-02-31' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm Pull MM updates from Andrew Morton: "Along with the usual shower of singleton patches, notable patch series in this pull request are: - "Align kvrealloc() with krealloc()" from Danilo Krummrich. Adds consistency to the APIs and behaviour of these two core allocation functions. This also simplifies/enables Rustification. - "Some cleanups for shmem" from Baolin Wang. 
No functional changes - more code reuse, better function naming, logic simplifications. - "mm: some small page fault cleanups" from Josef Bacik. No functional changes - code cleanups only. - "Various memory tiering fixes" from Zi Yan. A small fix and a little cleanup. - "mm/swap: remove boilerplate" from Yu Zhao. Code cleanups and simplifications and .text shrinkage. - "Kernel stack usage histogram" from Pasha Tatashin and Shakeel Butt. This is a feature, it adds new fields to /proc/vmstat such as $ grep kstack /proc/vmstat kstack_1k 3 kstack_2k 188 kstack_4k 11391 kstack_8k 243 kstack_16k 0 which tells us that 11391 processes used 4k of stack while none at all used 16k. Useful for some system tuning things, but particularly useful for "the dynamic kernel stack project". - "kmemleak: support for percpu memory leak detect" from Pavel Tikhomirov. Teaches kmemleak to detect leakage of percpu memory. - "mm: memcg: page counters optimizations" from Roman Gushchin. "3 independent small optimizations of page counters". - "mm: split PTE/PMD PT table Kconfig cleanups+clarifications" from David Hildenbrand. Improves PTE/PMD splitlock detection, makes powerpc/8xx work correctly by design rather than by accident. - "mm: remove arch_make_page_accessible()" from David Hildenbrand. Some folio conversions which make arch_make_page_accessible() unneeded. - "mm, memcg: cg2 memory{.swap,}.peak write handlers" from David Finkel. Cleans up and fixes our handling of the resetting of the cgroup/process peak-memory-use detector. - "Make core VMA operations internal and testable" from Lorenzo Stoakes. Rationalization and encapsulation of the VMA manipulation APIs. With a view to better enable testing of the VMA functions, even from a userspace-only harness. - "mm: zswap: fixes for global shrinker" from Takero Funaki. Fix issues in the zswap global shrinker, resulting in improved performance. - "mm: print the promo watermark in zoneinfo" from Kaiyang Zhao. 
Fill in some missing info in /proc/zoneinfo. - "mm: replace follow_page() by folio_walk" from David Hildenbrand. Code cleanups and rationalizations (conversion to folio_walk()) resulting in the removal of follow_page(). - "improving dynamic zswap shrinker protection scheme" from Nhat Pham. Some tuning to improve zswap's dynamic shrinker. Significant reductions in swapin and improvements in performance are shown. - "mm: Fix several issues with unaccepted memory" from Kirill Shutemov. Improvements to the new unaccepted memory feature. - "mm/mprotect: Fix dax puds" from Peter Xu. Implements mprotect on DAX PUDs. This was missing, although nobody seems to have noticed yet. - "Introduce a store type enum for the Maple tree" from Sidhartha Kumar. Cleanups and modest performance improvements for the maple tree library code. - "memcg: further decouple v1 code from v2" from Shakeel Butt. Move more cgroup v1 remnants away from the v2 memcg code. - "memcg: initiate deprecation of v1 features" from Shakeel Butt. Adds various warnings telling users that memcg v1 features are deprecated. - "mm: swap: mTHP swap allocator base on swap cluster order" from Chris Li. Greatly improves the success rate of the mTHP swap allocation. - "mm: introduce numa_memblks" from Mike Rapoport. Moves various disparate per-arch implementations of numa_memblk code into generic code. - "mm: batch free swaps for zap_pte_range()" from Barry Song. Greatly improves the performance of munmap() of swap-filled ptes. - "support large folio swap-out and swap-in for shmem" from Baolin Wang. With this series we no longer split shmem large folios into single-page folios when swapping out shmem. - "mm/hugetlb: alloc/free gigantic folios" from Yu Zhao. Nice performance improvements and code reductions for gigantic folios. - "support shmem mTHP collapse" from Baolin Wang. Adds support for khugepaged's collapsing of shmem mTHP folios. - "mm: Optimize mseal checks" from Pedro Falcato. 
Fixes an mprotect() performance regression due to the addition of mseal(). - "Increase the number of bits available in page_type" from Matthew Wilcox. Increases the number of bits available in page_type! - "Simplify the page flags a little" from Matthew Wilcox. Many legacy page flags are now folio flags, so the page-based flags and their accessors/mutators can be removed. - "mm: store zero pages to be swapped out in a bitmap" from Usama Arif. An optimization which permits us to avoid writing/reading zero-filled zswap pages to backing store. - "Avoid MAP_FIXED gap exposure" from Liam Howlett. Fixes a race window which occurs when a MAP_FIXED operation is occurring during an unrelated vma tree walk. - "mm: remove vma_merge()" from Lorenzo Stoakes. Major rotorooting of the vma_merge() functionality, making it cleaner, more testable and better tested. - "misc fixups for DAMON {self,kunit} tests" from SeongJae Park. Minor fixups of DAMON selftests and kunit tests. - "mm: memory_hotplug: improve do_migrate_range()" from Kefeng Wang. Code cleanups and folio conversions. - "Shmem mTHP controls and stats improvements" from Ryan Roberts. Cleanups for shmem controls and stats. - "mm: count the number of anonymous THPs per size" from Barry Song. Expose additional anon THP stats to userspace for improved tuning. - "mm: finish isolate/putback_lru_page()" from Kefeng Wang: more folio conversions and removal of now-unused page-based APIs. - "replace per-quota region priorities histogram buffer with per-context one" from SeongJae Park. DAMON histogram rationalization. - "Docs/damon: update GitHub repo URLs and maintainer-profile" from SeongJae Park. DAMON documentation updates. - "mm/vdpa: correct misuse of non-direct-reclaim __GFP_NOFAIL and improve related doc and warn" from Jason Wang: fixes usage of page allocator __GFP_NOFAIL and GFP_ATOMIC flags. - "mm: split underused THPs" from Yu Zhao. Improve THP=always policy. 
This was overprovisioning THPs in sparsely accessed memory areas. - "zram: introduce custom comp backends API" from Sergey Senozhatsky. Add support for zram run-time compression algorithm tuning. - "mm: Care about shadow stack guard gap when getting an unmapped area" from Mark Brown. Fix up the various arch_get_unmapped_area() implementations to better respect guard areas. - "Improve mem_cgroup_iter()" from Kinsey Ho. Improve the reliability of mem_cgroup_iter() and various code cleanups. - "mm: Support huge pfnmaps" from Peter Xu. Extends the usage of huge pfnmap support. - "resource: Fix region_intersects() vs add_memory_driver_managed()" from Huang Ying. Fix a bug in region_intersects() for systems with CXL memory. - "mm: hwpoison: two more poison recovery" from Kefeng Wang. Teaches a couple more code paths to correctly recover from the encountering of poisoned memory. - "mm: enable large folios swap-in support" from Barry Song. Support the swapin of mTHP memory into appropriately-sized folios, rather than into single-page folios" * tag 'mm-stable-2024-09-20-02-31' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (416 commits) zram: free secondary algorithms names uprobes: turn xol_area->pages[2] into xol_area->page uprobes: introduce the global struct vm_special_mapping xol_mapping Revert "uprobes: use vm_special_mapping close() functionality" mm: support large folios swap-in for sync io devices mm: add nr argument in mem_cgroup_swapin_uncharge_swap() helper to support large folios mm: fix swap_read_folio_zeromap() for large folios with partial zeromap mm/debug_vm_pgtable: Use pxdp_get() for accessing page table entries set_memory: add __must_check to generic stubs mm/vma: return the exact errno in vms_gather_munmap_vmas() memcg: cleanup with !CONFIG_MEMCG_V1 mm/show_mem.c: report alloc tags in human readable units mm: support poison recovery from copy_present_page() mm: support poison recovery from do_cow_fault() resource, kunit: add test case for 
region_intersects() resource: make alloc_free_mem_region() works for iomem_resource mm: z3fold: deprecate CONFIG_Z3FOLD vfio/pci: implement huge_fault support mm/arm64: support large pfn mappings mm/x86: support large pfn mappings ...
5213 lines
133 KiB
C
5213 lines
133 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
|
|
* Copyright (C) 1993 Linus Torvalds
|
|
* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
|
|
* SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
|
|
* Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
|
|
* Numa awareness, Christoph Lameter, SGI, June 2005
|
|
* Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
|
|
*/
|
|
|
|
#include <linux/vmalloc.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/module.h>
|
|
#include <linux/highmem.h>
|
|
#include <linux/sched/signal.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/spinlock.h>
|
|
#include <linux/interrupt.h>
|
|
#include <linux/proc_fs.h>
|
|
#include <linux/seq_file.h>
|
|
#include <linux/set_memory.h>
|
|
#include <linux/debugobjects.h>
|
|
#include <linux/kallsyms.h>
|
|
#include <linux/list.h>
|
|
#include <linux/notifier.h>
|
|
#include <linux/rbtree.h>
|
|
#include <linux/xarray.h>
|
|
#include <linux/io.h>
|
|
#include <linux/rcupdate.h>
|
|
#include <linux/pfn.h>
|
|
#include <linux/kmemleak.h>
|
|
#include <linux/atomic.h>
|
|
#include <linux/compiler.h>
|
|
#include <linux/memcontrol.h>
|
|
#include <linux/llist.h>
|
|
#include <linux/uio.h>
|
|
#include <linux/bitops.h>
|
|
#include <linux/rbtree_augmented.h>
|
|
#include <linux/overflow.h>
|
|
#include <linux/pgtable.h>
|
|
#include <linux/hugetlb.h>
|
|
#include <linux/sched/mm.h>
|
|
#include <asm/tlbflush.h>
|
|
#include <asm/shmparam.h>
|
|
#include <linux/page_owner.h>
|
|
|
|
#define CREATE_TRACE_POINTS
|
|
#include <trace/events/vmalloc.h>
|
|
|
|
#include "internal.h"
|
|
#include "pgalloc-track.h"
|
|
|
|
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
|
|
static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - 1;
|
|
|
|
static int __init set_nohugeiomap(char *str)
|
|
{
|
|
ioremap_max_page_shift = PAGE_SHIFT;
|
|
return 0;
|
|
}
|
|
early_param("nohugeiomap", set_nohugeiomap);
|
|
#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
|
|
static const unsigned int ioremap_max_page_shift = PAGE_SHIFT;
|
|
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
|
|
|
|
/*
 * vmap_allow_huge defaults to true when CONFIG_HAVE_ARCH_HUGE_VMALLOC is
 * set and is cleared by the "nohugevmalloc" early parameter; without that
 * option it is a constant false. Its readers are outside this part of
 * the file.
 */
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
static bool __ro_after_init vmap_allow_huge = true;

/* "nohugevmalloc" handler: @str is not examined; always returns 0. */
static int __init set_nohugevmalloc(char *str)
{
	vmap_allow_huge = false;
	return 0;
}
early_param("nohugevmalloc", set_nohugevmalloc);
#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
static const bool vmap_allow_huge = false;
#endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
|
|
|
|
bool is_vmalloc_addr(const void *x)
|
|
{
|
|
unsigned long addr = (unsigned long)kasan_reset_tag(x);
|
|
|
|
return addr >= VMALLOC_START && addr < VMALLOC_END;
|
|
}
|
|
EXPORT_SYMBOL(is_vmalloc_addr);
|
|
|
|
struct vfree_deferred {
|
|
struct llist_head list;
|
|
struct work_struct wq;
|
|
};
|
|
static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
|
|
|
|
/*** Page table manipulation functions ***/
|
|
static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
|
|
phys_addr_t phys_addr, pgprot_t prot,
|
|
unsigned int max_page_shift, pgtbl_mod_mask *mask)
|
|
{
|
|
pte_t *pte;
|
|
u64 pfn;
|
|
struct page *page;
|
|
unsigned long size = PAGE_SIZE;
|
|
|
|
pfn = phys_addr >> PAGE_SHIFT;
|
|
pte = pte_alloc_kernel_track(pmd, addr, mask);
|
|
if (!pte)
|
|
return -ENOMEM;
|
|
do {
|
|
if (unlikely(!pte_none(ptep_get(pte)))) {
|
|
if (pfn_valid(pfn)) {
|
|
page = pfn_to_page(pfn);
|
|
dump_page(page, "remapping already mapped page");
|
|
}
|
|
BUG();
|
|
}
|
|
|
|
#ifdef CONFIG_HUGETLB_PAGE
|
|
size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
|
|
if (size != PAGE_SIZE) {
|
|
pte_t entry = pfn_pte(pfn, prot);
|
|
|
|
entry = arch_make_huge_pte(entry, ilog2(size), 0);
|
|
set_huge_pte_at(&init_mm, addr, pte, entry, size);
|
|
pfn += PFN_DOWN(size);
|
|
continue;
|
|
}
|
|
#endif
|
|
set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
|
|
pfn++;
|
|
} while (pte += PFN_DOWN(size), addr += size, addr != end);
|
|
*mask |= PGTBL_PTE_MODIFIED;
|
|
return 0;
|
|
}
|
|
|
|
static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
|
|
phys_addr_t phys_addr, pgprot_t prot,
|
|
unsigned int max_page_shift)
|
|
{
|
|
if (max_page_shift < PMD_SHIFT)
|
|
return 0;
|
|
|
|
if (!arch_vmap_pmd_supported(prot))
|
|
return 0;
|
|
|
|
if ((end - addr) != PMD_SIZE)
|
|
return 0;
|
|
|
|
if (!IS_ALIGNED(addr, PMD_SIZE))
|
|
return 0;
|
|
|
|
if (!IS_ALIGNED(phys_addr, PMD_SIZE))
|
|
return 0;
|
|
|
|
if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
|
|
return 0;
|
|
|
|
return pmd_set_huge(pmd, phys_addr, prot);
|
|
}
|
|
|
|
static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
|
|
phys_addr_t phys_addr, pgprot_t prot,
|
|
unsigned int max_page_shift, pgtbl_mod_mask *mask)
|
|
{
|
|
pmd_t *pmd;
|
|
unsigned long next;
|
|
|
|
pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
|
|
if (!pmd)
|
|
return -ENOMEM;
|
|
do {
|
|
next = pmd_addr_end(addr, end);
|
|
|
|
if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
|
|
max_page_shift)) {
|
|
*mask |= PGTBL_PMD_MODIFIED;
|
|
continue;
|
|
}
|
|
|
|
if (vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask))
|
|
return -ENOMEM;
|
|
} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
|
|
return 0;
|
|
}
|
|
|
|
static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
|
|
phys_addr_t phys_addr, pgprot_t prot,
|
|
unsigned int max_page_shift)
|
|
{
|
|
if (max_page_shift < PUD_SHIFT)
|
|
return 0;
|
|
|
|
if (!arch_vmap_pud_supported(prot))
|
|
return 0;
|
|
|
|
if ((end - addr) != PUD_SIZE)
|
|
return 0;
|
|
|
|
if (!IS_ALIGNED(addr, PUD_SIZE))
|
|
return 0;
|
|
|
|
if (!IS_ALIGNED(phys_addr, PUD_SIZE))
|
|
return 0;
|
|
|
|
if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
|
|
return 0;
|
|
|
|
return pud_set_huge(pud, phys_addr, prot);
|
|
}
|
|
|
|
static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
|
|
phys_addr_t phys_addr, pgprot_t prot,
|
|
unsigned int max_page_shift, pgtbl_mod_mask *mask)
|
|
{
|
|
pud_t *pud;
|
|
unsigned long next;
|
|
|
|
pud = pud_alloc_track(&init_mm, p4d, addr, mask);
|
|
if (!pud)
|
|
return -ENOMEM;
|
|
do {
|
|
next = pud_addr_end(addr, end);
|
|
|
|
if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
|
|
max_page_shift)) {
|
|
*mask |= PGTBL_PUD_MODIFIED;
|
|
continue;
|
|
}
|
|
|
|
if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
|
|
max_page_shift, mask))
|
|
return -ENOMEM;
|
|
} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
|
|
return 0;
|
|
}
|
|
|
|
static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
|
|
phys_addr_t phys_addr, pgprot_t prot,
|
|
unsigned int max_page_shift)
|
|
{
|
|
if (max_page_shift < P4D_SHIFT)
|
|
return 0;
|
|
|
|
if (!arch_vmap_p4d_supported(prot))
|
|
return 0;
|
|
|
|
if ((end - addr) != P4D_SIZE)
|
|
return 0;
|
|
|
|
if (!IS_ALIGNED(addr, P4D_SIZE))
|
|
return 0;
|
|
|
|
if (!IS_ALIGNED(phys_addr, P4D_SIZE))
|
|
return 0;
|
|
|
|
if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
|
|
return 0;
|
|
|
|
return p4d_set_huge(p4d, phys_addr, prot);
|
|
}
|
|
|
|
static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
|
|
phys_addr_t phys_addr, pgprot_t prot,
|
|
unsigned int max_page_shift, pgtbl_mod_mask *mask)
|
|
{
|
|
p4d_t *p4d;
|
|
unsigned long next;
|
|
|
|
p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
|
|
if (!p4d)
|
|
return -ENOMEM;
|
|
do {
|
|
next = p4d_addr_end(addr, end);
|
|
|
|
if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
|
|
max_page_shift)) {
|
|
*mask |= PGTBL_P4D_MODIFIED;
|
|
continue;
|
|
}
|
|
|
|
if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
|
|
max_page_shift, mask))
|
|
return -ENOMEM;
|
|
} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
|
|
return 0;
|
|
}
|
|
|
|
static int vmap_range_noflush(unsigned long addr, unsigned long end,
|
|
phys_addr_t phys_addr, pgprot_t prot,
|
|
unsigned int max_page_shift)
|
|
{
|
|
pgd_t *pgd;
|
|
unsigned long start;
|
|
unsigned long next;
|
|
int err;
|
|
pgtbl_mod_mask mask = 0;
|
|
|
|
might_sleep();
|
|
BUG_ON(addr >= end);
|
|
|
|
start = addr;
|
|
pgd = pgd_offset_k(addr);
|
|
do {
|
|
next = pgd_addr_end(addr, end);
|
|
err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
|
|
max_page_shift, &mask);
|
|
if (err)
|
|
break;
|
|
} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
|
|
|
|
if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
|
|
arch_sync_kernel_mappings(start, end);
|
|
|
|
return err;
|
|
}
|
|
|
|
int vmap_page_range(unsigned long addr, unsigned long end,
|
|
phys_addr_t phys_addr, pgprot_t prot)
|
|
{
|
|
int err;
|
|
|
|
err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot),
|
|
ioremap_max_page_shift);
|
|
flush_cache_vmap(addr, end);
|
|
if (!err)
|
|
err = kmsan_ioremap_page_range(addr, end, phys_addr, prot,
|
|
ioremap_max_page_shift);
|
|
return err;
|
|
}
|
|
|
|
int ioremap_page_range(unsigned long addr, unsigned long end,
|
|
phys_addr_t phys_addr, pgprot_t prot)
|
|
{
|
|
struct vm_struct *area;
|
|
|
|
area = find_vm_area((void *)addr);
|
|
if (!area || !(area->flags & VM_IOREMAP)) {
|
|
WARN_ONCE(1, "vm_area at addr %lx is not marked as VM_IOREMAP\n", addr);
|
|
return -EINVAL;
|
|
}
|
|
if (addr != (unsigned long)area->addr ||
|
|
(void *)end != area->addr + get_vm_area_size(area)) {
|
|
WARN_ONCE(1, "ioremap request [%lx,%lx) doesn't match vm_area [%lx, %lx)\n",
|
|
addr, end, (long)area->addr,
|
|
(long)area->addr + get_vm_area_size(area));
|
|
return -ERANGE;
|
|
}
|
|
return vmap_page_range(addr, end, phys_addr, prot);
|
|
}
|
|
|
|
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
|
|
pgtbl_mod_mask *mask)
|
|
{
|
|
pte_t *pte;
|
|
|
|
pte = pte_offset_kernel(pmd, addr);
|
|
do {
|
|
pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
|
|
WARN_ON(!pte_none(ptent) && !pte_present(ptent));
|
|
} while (pte++, addr += PAGE_SIZE, addr != end);
|
|
*mask |= PGTBL_PTE_MODIFIED;
|
|
}
|
|
|
|
static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
|
|
pgtbl_mod_mask *mask)
|
|
{
|
|
pmd_t *pmd;
|
|
unsigned long next;
|
|
int cleared;
|
|
|
|
pmd = pmd_offset(pud, addr);
|
|
do {
|
|
next = pmd_addr_end(addr, end);
|
|
|
|
cleared = pmd_clear_huge(pmd);
|
|
if (cleared || pmd_bad(*pmd))
|
|
*mask |= PGTBL_PMD_MODIFIED;
|
|
|
|
if (cleared)
|
|
continue;
|
|
if (pmd_none_or_clear_bad(pmd))
|
|
continue;
|
|
vunmap_pte_range(pmd, addr, next, mask);
|
|
|
|
cond_resched();
|
|
} while (pmd++, addr = next, addr != end);
|
|
}
|
|
|
|
static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
|
|
pgtbl_mod_mask *mask)
|
|
{
|
|
pud_t *pud;
|
|
unsigned long next;
|
|
int cleared;
|
|
|
|
pud = pud_offset(p4d, addr);
|
|
do {
|
|
next = pud_addr_end(addr, end);
|
|
|
|
cleared = pud_clear_huge(pud);
|
|
if (cleared || pud_bad(*pud))
|
|
*mask |= PGTBL_PUD_MODIFIED;
|
|
|
|
if (cleared)
|
|
continue;
|
|
if (pud_none_or_clear_bad(pud))
|
|
continue;
|
|
vunmap_pmd_range(pud, addr, next, mask);
|
|
} while (pud++, addr = next, addr != end);
|
|
}
|
|
|
|
static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
|
|
pgtbl_mod_mask *mask)
|
|
{
|
|
p4d_t *p4d;
|
|
unsigned long next;
|
|
|
|
p4d = p4d_offset(pgd, addr);
|
|
do {
|
|
next = p4d_addr_end(addr, end);
|
|
|
|
p4d_clear_huge(p4d);
|
|
if (p4d_bad(*p4d))
|
|
*mask |= PGTBL_P4D_MODIFIED;
|
|
|
|
if (p4d_none_or_clear_bad(p4d))
|
|
continue;
|
|
vunmap_pud_range(p4d, addr, next, mask);
|
|
} while (p4d++, addr = next, addr != end);
|
|
}
|
|
|
|
/*
|
|
* vunmap_range_noflush is similar to vunmap_range, but does not
|
|
* flush caches or TLBs.
|
|
*
|
|
* The caller is responsible for calling flush_cache_vunmap() before calling
|
|
* this function, and flush_tlb_kernel_range after it has returned
|
|
* successfully (and before the addresses are expected to cause a page fault
|
|
* or be re-mapped for something else, if TLB flushes are being delayed or
|
|
* coalesced).
|
|
*
|
|
* This is an internal function only. Do not use outside mm/.
|
|
*/
|
|
void __vunmap_range_noflush(unsigned long start, unsigned long end)
|
|
{
|
|
unsigned long next;
|
|
pgd_t *pgd;
|
|
unsigned long addr = start;
|
|
pgtbl_mod_mask mask = 0;
|
|
|
|
BUG_ON(addr >= end);
|
|
pgd = pgd_offset_k(addr);
|
|
do {
|
|
next = pgd_addr_end(addr, end);
|
|
if (pgd_bad(*pgd))
|
|
mask |= PGTBL_PGD_MODIFIED;
|
|
if (pgd_none_or_clear_bad(pgd))
|
|
continue;
|
|
vunmap_p4d_range(pgd, addr, next, &mask);
|
|
} while (pgd++, addr = next, addr != end);
|
|
|
|
if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
|
|
arch_sync_kernel_mappings(start, end);
|
|
}
|
|
|
|
/*
 * Unmap [start, end) without flushing caches or TLBs: the KMSAN hook
 * runs first, then the page tables are cleared by
 * __vunmap_range_noflush().
 */
void vunmap_range_noflush(unsigned long start, unsigned long end)
{
	/* KMSAN hook goes first, then the real page-table teardown */
	kmsan_vunmap_range_noflush(start, end);
	__vunmap_range_noflush(start, end);
}
|
|
|
|
/**
|
|
* vunmap_range - unmap kernel virtual addresses
|
|
* @addr: start of the VM area to unmap
|
|
* @end: end of the VM area to unmap (non-inclusive)
|
|
*
|
|
* Clears any present PTEs in the virtual address range, flushes TLBs and
|
|
* caches. Any subsequent access to the address before it has been re-mapped
|
|
* is a kernel bug.
|
|
*/
|
|
void vunmap_range(unsigned long addr, unsigned long end)
{
	/* 1. flush caches while the mapping still exists */
	flush_cache_vunmap(addr, end);

	/* 2. tear down the page-table entries */
	vunmap_range_noflush(addr, end);

	/* 3. drop stale TLB entries for the range */
	flush_tlb_kernel_range(addr, end);
}
|
|
|
|
static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
|
|
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
|
|
pgtbl_mod_mask *mask)
|
|
{
|
|
pte_t *pte;
|
|
|
|
/*
|
|
* nr is a running index into the array which helps higher level
|
|
* callers keep track of where we're up to.
|
|
*/
|
|
|
|
pte = pte_alloc_kernel_track(pmd, addr, mask);
|
|
if (!pte)
|
|
return -ENOMEM;
|
|
do {
|
|
struct page *page = pages[*nr];
|
|
|
|
if (WARN_ON(!pte_none(ptep_get(pte))))
|
|
return -EBUSY;
|
|
if (WARN_ON(!page))
|
|
return -ENOMEM;
|
|
if (WARN_ON(!pfn_valid(page_to_pfn(page))))
|
|
return -EINVAL;
|
|
|
|
set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
|
|
(*nr)++;
|
|
} while (pte++, addr += PAGE_SIZE, addr != end);
|
|
*mask |= PGTBL_PTE_MODIFIED;
|
|
return 0;
|
|
}
|
|
|
|
static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
|
|
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
|
|
pgtbl_mod_mask *mask)
|
|
{
|
|
pmd_t *pmd;
|
|
unsigned long next;
|
|
|
|
pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
|
|
if (!pmd)
|
|
return -ENOMEM;
|
|
do {
|
|
next = pmd_addr_end(addr, end);
|
|
if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask))
|
|
return -ENOMEM;
|
|
} while (pmd++, addr = next, addr != end);
|
|
return 0;
|
|
}
|
|
|
|
static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
|
|
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
|
|
pgtbl_mod_mask *mask)
|
|
{
|
|
pud_t *pud;
|
|
unsigned long next;
|
|
|
|
pud = pud_alloc_track(&init_mm, p4d, addr, mask);
|
|
if (!pud)
|
|
return -ENOMEM;
|
|
do {
|
|
next = pud_addr_end(addr, end);
|
|
if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask))
|
|
return -ENOMEM;
|
|
} while (pud++, addr = next, addr != end);
|
|
return 0;
|
|
}
|
|
|
|
static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
|
|
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
|
|
pgtbl_mod_mask *mask)
|
|
{
|
|
p4d_t *p4d;
|
|
unsigned long next;
|
|
|
|
p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
|
|
if (!p4d)
|
|
return -ENOMEM;
|
|
do {
|
|
next = p4d_addr_end(addr, end);
|
|
if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask))
|
|
return -ENOMEM;
|
|
} while (p4d++, addr = next, addr != end);
|
|
return 0;
|
|
}
|
|
|
|
static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
|
|
pgprot_t prot, struct page **pages)
|
|
{
|
|
unsigned long start = addr;
|
|
pgd_t *pgd;
|
|
unsigned long next;
|
|
int err = 0;
|
|
int nr = 0;
|
|
pgtbl_mod_mask mask = 0;
|
|
|
|
BUG_ON(addr >= end);
|
|
pgd = pgd_offset_k(addr);
|
|
do {
|
|
next = pgd_addr_end(addr, end);
|
|
if (pgd_bad(*pgd))
|
|
mask |= PGTBL_PGD_MODIFIED;
|
|
err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
|
|
if (err)
|
|
return err;
|
|
} while (pgd++, addr = next, addr != end);
|
|
|
|
if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
|
|
arch_sync_kernel_mappings(start, end);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* vmap_pages_range_noflush is similar to vmap_pages_range, but does not
|
|
* flush caches.
|
|
*
|
|
* The caller is responsible for calling flush_cache_vmap() after this
|
|
* function returns successfully and before the addresses are accessed.
|
|
*
|
|
* This is an internal function only. Do not use outside mm/.
|
|
*/
|
|
int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
|
|
pgprot_t prot, struct page **pages, unsigned int page_shift)
|
|
{
|
|
unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
|
|
|
|
WARN_ON(page_shift < PAGE_SHIFT);
|
|
|
|
if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
|
|
page_shift == PAGE_SHIFT)
|
|
return vmap_small_pages_range_noflush(addr, end, prot, pages);
|
|
|
|
for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
|
|
int err;
|
|
|
|
err = vmap_range_noflush(addr, addr + (1UL << page_shift),
|
|
page_to_phys(pages[i]), prot,
|
|
page_shift);
|
|
if (err)
|
|
return err;
|
|
|
|
addr += 1UL << page_shift;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
|
|
pgprot_t prot, struct page **pages, unsigned int page_shift)
|
|
{
|
|
int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages,
|
|
page_shift);
|
|
|
|
if (ret)
|
|
return ret;
|
|
return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
|
|
}
|
|
|
|
/**
|
|
* vmap_pages_range - map pages to a kernel virtual address
|
|
* @addr: start of the VM area to map
|
|
* @end: end of the VM area to map (non-inclusive)
|
|
* @prot: page protection flags to use
|
|
* @pages: pages to map (always PAGE_SIZE pages)
|
|
* @page_shift: maximum shift that the pages may be mapped with, @pages must
|
|
* be aligned and contiguous up to at least this shift.
|
|
*
|
|
* RETURNS:
|
|
* 0 on success, -errno on failure.
|
|
*/
|
|
static int vmap_pages_range(unsigned long addr, unsigned long end,
|
|
pgprot_t prot, struct page **pages, unsigned int page_shift)
|
|
{
|
|
int err;
|
|
|
|
err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
|
|
flush_cache_vmap(addr, end);
|
|
return err;
|
|
}
|
|
|
|
static int check_sparse_vm_area(struct vm_struct *area, unsigned long start,
|
|
unsigned long end)
|
|
{
|
|
might_sleep();
|
|
if (WARN_ON_ONCE(area->flags & VM_FLUSH_RESET_PERMS))
|
|
return -EINVAL;
|
|
if (WARN_ON_ONCE(area->flags & VM_NO_GUARD))
|
|
return -EINVAL;
|
|
if (WARN_ON_ONCE(!(area->flags & VM_SPARSE)))
|
|
return -EINVAL;
|
|
if ((end - start) >> PAGE_SHIFT > totalram_pages())
|
|
return -E2BIG;
|
|
if (start < (unsigned long)area->addr ||
|
|
(void *)end > area->addr + get_vm_area_size(area))
|
|
return -ERANGE;
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* vm_area_map_pages - map pages inside given sparse vm_area
|
|
* @area: vm_area
|
|
* @start: start address inside vm_area
|
|
* @end: end address inside vm_area
|
|
* @pages: pages to map (always PAGE_SIZE pages)
|
|
*/
|
|
int vm_area_map_pages(struct vm_struct *area, unsigned long start,
|
|
unsigned long end, struct page **pages)
|
|
{
|
|
int err;
|
|
|
|
err = check_sparse_vm_area(area, start, end);
|
|
if (err)
|
|
return err;
|
|
|
|
return vmap_pages_range(start, end, PAGE_KERNEL, pages, PAGE_SHIFT);
|
|
}
|
|
|
|
/**
 * vm_area_unmap_pages - unmap pages inside given sparse vm_area
 * @area: vm_area
 * @start: start address inside vm_area
 * @end: end address inside vm_area
 *
 * Nothing is unmapped when check_sparse_vm_area() rejects the request.
 */
void vm_area_unmap_pages(struct vm_struct *area, unsigned long start,
			 unsigned long end)
{
	if (!check_sparse_vm_area(area, start, end))
		vunmap_range(start, end);
}
|
|
|
|
/*
 * Tell whether @x lies in the module area or in vmalloc space.
 *
 * ARM, x86-64 and sparc64 put modules in a special place, and fall back
 * on vmalloc() if that fails. Others just put it in the vmalloc space.
 * The dedicated module window is only checked when both CONFIG_EXECMEM
 * and MODULES_VADDR are defined.
 *
 * Returns 1 for an address inside [MODULES_VADDR, MODULES_END), otherwise
 * the result of is_vmalloc_addr().
 */
int is_vmalloc_or_module_addr(const void *x)
{
#if defined(CONFIG_EXECMEM) && defined(MODULES_VADDR)
	const unsigned long untagged = (unsigned long)kasan_reset_tag(x);

	if (MODULES_VADDR <= untagged && untagged < MODULES_END)
		return 1;
#endif
	return is_vmalloc_addr(x);
}
EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr);
|
|
|
|
/*
|
|
* Walk a vmap address to the struct page it maps. Huge vmap mappings will
|
|
* return the tail page that corresponds to the base page address, which
|
|
* matches small vmap mappings.
|
|
*/
|
|
struct page *vmalloc_to_page(const void *vmalloc_addr)
|
|
{
|
|
unsigned long addr = (unsigned long) vmalloc_addr;
|
|
struct page *page = NULL;
|
|
pgd_t *pgd = pgd_offset_k(addr);
|
|
p4d_t *p4d;
|
|
pud_t *pud;
|
|
pmd_t *pmd;
|
|
pte_t *ptep, pte;
|
|
|
|
/*
|
|
* XXX we might need to change this if we add VIRTUAL_BUG_ON for
|
|
* architectures that do not vmalloc module space
|
|
*/
|
|
VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
|
|
|
|
if (pgd_none(*pgd))
|
|
return NULL;
|
|
if (WARN_ON_ONCE(pgd_leaf(*pgd)))
|
|
return NULL; /* XXX: no allowance for huge pgd */
|
|
if (WARN_ON_ONCE(pgd_bad(*pgd)))
|
|
return NULL;
|
|
|
|
p4d = p4d_offset(pgd, addr);
|
|
if (p4d_none(*p4d))
|
|
return NULL;
|
|
if (p4d_leaf(*p4d))
|
|
return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
|
|
if (WARN_ON_ONCE(p4d_bad(*p4d)))
|
|
return NULL;
|
|
|
|
pud = pud_offset(p4d, addr);
|
|
if (pud_none(*pud))
|
|
return NULL;
|
|
if (pud_leaf(*pud))
|
|
return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
|
|
if (WARN_ON_ONCE(pud_bad(*pud)))
|
|
return NULL;
|
|
|
|
pmd = pmd_offset(pud, addr);
|
|
if (pmd_none(*pmd))
|
|
return NULL;
|
|
if (pmd_leaf(*pmd))
|
|
return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
|
|
if (WARN_ON_ONCE(pmd_bad(*pmd)))
|
|
return NULL;
|
|
|
|
ptep = pte_offset_kernel(pmd, addr);
|
|
pte = ptep_get(ptep);
|
|
if (pte_present(pte))
|
|
page = pte_page(pte);
|
|
|
|
return page;
|
|
}
|
|
EXPORT_SYMBOL(vmalloc_to_page);
|
|
|
|
/*
 * Map a vmalloc()-space virtual address to the physical page frame number.
 *
 * The page returned by vmalloc_to_page() is converted without a NULL
 * check, so the address is expected to be mapped.
 */
unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
{
	struct page *page = vmalloc_to_page(vmalloc_addr);

	return page_to_pfn(page);
}
EXPORT_SYMBOL(vmalloc_to_pfn);
|
|
|
|
|
|
/*** Global kva allocator ***/
|
|
|
|
/*
 * Debug switch: when non-zero, augment_tree_propagate_check() is built and
 * is run at the end of augment_tree_propagate_from() to verify every
 * subtree_max_size value of the free list.
 */
#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
|
|
/*
 * Debug switch: when non-zero, find_vmap_lowest_match_check() is built and
 * compares the tree search against a linear list search.
 */
#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
|
|
|
|
|
|
/*
 * NOTE(review): presumably serializes access to free_vmap_area_root and
 * free_vmap_area_list; its users are outside this part of the file -
 * confirm.
 */
static DEFINE_SPINLOCK(free_vmap_area_lock);
|
|
/*
 * NOTE(review): presumably set once the vmalloc subsystem has finished its
 * initialization; the writer is outside this part of the file - confirm.
 */
static bool vmap_initialized __read_mostly;
|
|
|
|
/*
|
|
* This kmem_cache is used for vmap_area objects. Instead of
|
|
* allocating from slab we reuse an object from this cache to
|
|
* make things faster. Especially in "no edge" splitting of
|
|
* free block.
|
|
*/
|
|
static struct kmem_cache *vmap_area_cachep;
|
|
|
|
/*
|
|
* This linked list is used in pair with free_vmap_area_root.
|
|
* It gives O(1) access to prev/next to perform fast coalescing.
|
|
*/
|
|
static LIST_HEAD(free_vmap_area_list);
|
|
|
|
/*
|
|
* This augmented red-black tree represents the free vmap space.
|
|
* All vmap_area objects in this tree are sorted by va->va_start
|
|
* address. It is used for allocation and merging when a vmap
|
|
* object is released.
|
|
*
|
|
* Each vmap_area node contains a maximum available free block
|
|
* of its sub-tree, right or left. Therefore it is possible to
|
|
* find a lowest match of free area.
|
|
*/
|
|
static struct rb_root free_vmap_area_root = RB_ROOT;
|
|
|
|
/*
|
|
* Preload a CPU with one object for "no edge" split case. The
|
|
* aim is to get rid of allocations from the atomic context, thus
|
|
* to use more permissive allocation masks.
|
|
*/
|
|
static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
|
|
|
|
/*
 * This structure bundles an rb-tree and a list into one entity that is
 * protected by a single lock. Nodes are kept in ascending order, so the
 * list gives O(1) access to the left/right neighbours of a node and also
 * serves sequential traversal.
 */
struct rb_list {
|
|
/* Tree of the vmap areas, used for lookups. */
struct rb_root root;
|
|
/* The same areas as a list (see the ordering note above). */
struct list_head head;
|
|
/* Protects both @root and @head. */
spinlock_t lock;
|
|
};
|
|
|
|
/*
 * A fast size-segregated storage that holds VAs of up to 1M in size. A
 * pool is a linked list of ready-to-use VAs of one particular size.
 *
 * NOTE(review): the original comment states that an index in the
 * pool-array corresponds to "number of pages + 1"; the indexing code is
 * outside this part of the file - confirm the exact relation.
 */
|
|
#define MAX_VA_SIZE_PAGES 256
|
|
|
|
struct vmap_pool {
|
|
/* List of the VAs kept in this pool. */
struct list_head head;
|
|
/* NOTE(review): presumably the number of VAs on @head - confirm. */
unsigned long len;
|
|
};
|
|
|
|
/*
 * The vmap-node logic. Users work on nodes instead of one global heap,
 * which spreads the accesses and mitigates lock contention.
 *
 * "single" is the one statically allocated node that vmap_nodes points to
 * initially.
 */
|
|
static struct vmap_node {
|
|
/* Simple size segregated storage. */
|
|
struct vmap_pool pool[MAX_VA_SIZE_PAGES];
|
|
/* NOTE(review): presumably protects @pool - confirm against its users. */
spinlock_t pool_lock;
|
|
/* NOTE(review): presumably disables refilling of @pool - confirm. */
bool skip_populate;
|
|
|
|
/* Bookkeeping data of this node. */
|
|
/* Areas in use; searched by find_vmap_area_exceed_addr_lock(). */
struct rb_list busy;
|
|
/* NOTE(review): presumably lazily freed areas awaiting purge - confirm. */
struct rb_list lazy;
|
|
|
|
/*
 * Ready-to-free areas.
 */
|
|
struct list_head purge_list;
|
|
/* NOTE(review): presumably the work item that purges this node - confirm. */
struct work_struct purge_work;
|
|
/* NOTE(review): presumably a count of purged areas - confirm. */
unsigned long nr_purged;
|
|
} single;
|
|
|
|
/*
 * The initial setup consists of one single node, i.e. balancing is fully
 * disabled. Later on, after vmap has been initialized, these parameters
 * are updated based on the system capacity.
 */
|
|
/* Array of nodes; indexed by addr_to_node() and id_to_node(). */
static struct vmap_node *vmap_nodes = &single;
|
|
/* Number of entries in vmap_nodes; used as the modulus for node ids. */
static __read_mostly unsigned int nr_vmap_nodes = 1;
|
|
/* Divisor applied to an address before it is reduced to a node id. */
static __read_mostly unsigned int vmap_zone_size = 1;
|
|
|
|
static inline unsigned int
|
|
addr_to_node_id(unsigned long addr)
|
|
{
|
|
return (addr / vmap_zone_size) % nr_vmap_nodes;
|
|
}
|
|
|
|
static inline struct vmap_node *
|
|
addr_to_node(unsigned long addr)
|
|
{
|
|
return &vmap_nodes[addr_to_node_id(addr)];
|
|
}
|
|
|
|
static inline struct vmap_node *
|
|
id_to_node(unsigned int id)
|
|
{
|
|
return &vmap_nodes[id % nr_vmap_nodes];
|
|
}
|
|
|
|
/*
|
|
* We use the value 0 to represent "no node", that is why
|
|
* an encoded value will be the node-id incremented by 1.
|
|
* It is always greater then 0. A valid node_id which can
|
|
* be encoded is [0:nr_vmap_nodes - 1]. If a passed node_id
|
|
* is not valid 0 is returned.
|
|
*/
|
|
static unsigned int
|
|
encode_vn_id(unsigned int node_id)
|
|
{
|
|
/* Can store U8_MAX [0:254] nodes. */
|
|
if (node_id < nr_vmap_nodes)
|
|
return (node_id + 1) << BITS_PER_BYTE;
|
|
|
|
/* Warn and no node encoded. */
|
|
WARN_ONCE(1, "Encode wrong node id (%u)\n", node_id);
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Returns an encoded node-id, the valid range is within
|
|
* [0:nr_vmap_nodes-1] values. Otherwise nr_vmap_nodes is
|
|
* returned if extracted data is wrong.
|
|
*/
|
|
static unsigned int
|
|
decode_vn_id(unsigned int val)
|
|
{
|
|
unsigned int node_id = (val >> BITS_PER_BYTE) - 1;
|
|
|
|
/* Can store U8_MAX [0:254] nodes. */
|
|
if (node_id < nr_vmap_nodes)
|
|
return node_id;
|
|
|
|
/* If it was _not_ zero, warn. */
|
|
WARN_ONCE(node_id != UINT_MAX,
|
|
"Decode wrong node id (%d)\n", node_id);
|
|
|
|
return nr_vmap_nodes;
|
|
}
|
|
|
|
static bool
|
|
is_vn_id_valid(unsigned int node_id)
|
|
{
|
|
if (node_id < nr_vmap_nodes)
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
static __always_inline unsigned long
|
|
va_size(struct vmap_area *va)
|
|
{
|
|
return (va->va_end - va->va_start);
|
|
}
|
|
|
|
static __always_inline unsigned long
|
|
get_subtree_max_size(struct rb_node *node)
|
|
{
|
|
struct vmap_area *va;
|
|
|
|
va = rb_entry_safe(node, struct vmap_area, rb_node);
|
|
return va ? va->subtree_max_size : 0;
|
|
}
|
|
|
|
/*
 * Generate the augmented rb-tree callbacks "free_vmap_area_rb_augment_cb"
 * for struct vmap_area: the subtree_max_size field is maintained from the
 * per-node value computed by va_size().
 */
RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
|
|
struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)
|
|
|
|
/* Forward declarations; the definitions are further down in this file. */
static void reclaim_and_purge_vmap_areas(void);
|
|
static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
|
|
static void drain_vmap_area_work(struct work_struct *work);
|
|
/* Work item whose handler is drain_vmap_area_work(). */
static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);
|
|
|
|
/* Counter reported by vmalloc_nr_pages(). */
static atomic_long_t nr_vmalloc_pages;
|
|
|
|
unsigned long vmalloc_nr_pages(void)
|
|
{
|
|
return atomic_long_read(&nr_vmalloc_pages);
|
|
}
|
|
|
|
static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root)
|
|
{
|
|
struct rb_node *n = root->rb_node;
|
|
|
|
addr = (unsigned long)kasan_reset_tag((void *)addr);
|
|
|
|
while (n) {
|
|
struct vmap_area *va;
|
|
|
|
va = rb_entry(n, struct vmap_area, rb_node);
|
|
if (addr < va->va_start)
|
|
n = n->rb_left;
|
|
else if (addr >= va->va_end)
|
|
n = n->rb_right;
|
|
else
|
|
return va;
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/* Look up the first VA which satisfies addr < va_end, NULL if none. */
|
|
static struct vmap_area *
|
|
__find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root)
|
|
{
|
|
struct vmap_area *va = NULL;
|
|
struct rb_node *n = root->rb_node;
|
|
|
|
addr = (unsigned long)kasan_reset_tag((void *)addr);
|
|
|
|
while (n) {
|
|
struct vmap_area *tmp;
|
|
|
|
tmp = rb_entry(n, struct vmap_area, rb_node);
|
|
if (tmp->va_end > addr) {
|
|
va = tmp;
|
|
if (tmp->va_start <= addr)
|
|
break;
|
|
|
|
n = n->rb_left;
|
|
} else
|
|
n = n->rb_right;
|
|
}
|
|
|
|
return va;
|
|
}
|
|
|
|
/*
|
|
* Returns a node where a first VA, that satisfies addr < va_end, resides.
|
|
* If success, a node is locked. A user is responsible to unlock it when a
|
|
* VA is no longer needed to be accessed.
|
|
*
|
|
* Returns NULL if nothing found.
|
|
*/
|
|
static struct vmap_node *
|
|
find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va)
|
|
{
|
|
unsigned long va_start_lowest;
|
|
struct vmap_node *vn;
|
|
int i;
|
|
|
|
repeat:
|
|
for (i = 0, va_start_lowest = 0; i < nr_vmap_nodes; i++) {
|
|
vn = &vmap_nodes[i];
|
|
|
|
spin_lock(&vn->busy.lock);
|
|
*va = __find_vmap_area_exceed_addr(addr, &vn->busy.root);
|
|
|
|
if (*va)
|
|
if (!va_start_lowest || (*va)->va_start < va_start_lowest)
|
|
va_start_lowest = (*va)->va_start;
|
|
spin_unlock(&vn->busy.lock);
|
|
}
|
|
|
|
/*
|
|
* Check if found VA exists, it might have gone away. In this case we
|
|
* repeat the search because a VA has been removed concurrently and we
|
|
* need to proceed to the next one, which is a rare case.
|
|
*/
|
|
if (va_start_lowest) {
|
|
vn = addr_to_node(va_start_lowest);
|
|
|
|
spin_lock(&vn->busy.lock);
|
|
*va = __find_vmap_area(va_start_lowest, &vn->busy.root);
|
|
|
|
if (*va)
|
|
return vn;
|
|
|
|
spin_unlock(&vn->busy.lock);
|
|
goto repeat;
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/*
|
|
* This function returns back addresses of parent node
|
|
* and its left or right link for further processing.
|
|
*
|
|
* Otherwise NULL is returned. In that case all further
|
|
* steps regarding inserting of conflicting overlap range
|
|
* have to be declined and actually considered as a bug.
|
|
*/
|
|
static __always_inline struct rb_node **
|
|
find_va_links(struct vmap_area *va,
|
|
struct rb_root *root, struct rb_node *from,
|
|
struct rb_node **parent)
|
|
{
|
|
struct vmap_area *tmp_va;
|
|
struct rb_node **link;
|
|
|
|
if (root) {
|
|
link = &root->rb_node;
|
|
if (unlikely(!*link)) {
|
|
*parent = NULL;
|
|
return link;
|
|
}
|
|
} else {
|
|
link = &from;
|
|
}
|
|
|
|
/*
|
|
* Go to the bottom of the tree. When we hit the last point
|
|
* we end up with parent rb_node and correct direction, i name
|
|
* it link, where the new va->rb_node will be attached to.
|
|
*/
|
|
do {
|
|
tmp_va = rb_entry(*link, struct vmap_area, rb_node);
|
|
|
|
/*
|
|
* During the traversal we also do some sanity check.
|
|
* Trigger the BUG() if there are sides(left/right)
|
|
* or full overlaps.
|
|
*/
|
|
if (va->va_end <= tmp_va->va_start)
|
|
link = &(*link)->rb_left;
|
|
else if (va->va_start >= tmp_va->va_end)
|
|
link = &(*link)->rb_right;
|
|
else {
|
|
WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
|
|
va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);
|
|
|
|
return NULL;
|
|
}
|
|
} while (*link);
|
|
|
|
*parent = &tmp_va->rb_node;
|
|
return link;
|
|
}
|
|
|
|
static __always_inline struct list_head *
|
|
get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
|
|
{
|
|
struct list_head *list;
|
|
|
|
if (unlikely(!parent))
|
|
/*
|
|
* The red-black tree where we try to find VA neighbors
|
|
* before merging or inserting is empty, i.e. it means
|
|
* there is no free vmap space. Normally it does not
|
|
* happen but we handle this case anyway.
|
|
*/
|
|
return NULL;
|
|
|
|
list = &rb_entry(parent, struct vmap_area, rb_node)->list;
|
|
return (&parent->rb_right == link ? list->next : list);
|
|
}
|
|
|
|
static __always_inline void
|
|
__link_va(struct vmap_area *va, struct rb_root *root,
|
|
struct rb_node *parent, struct rb_node **link,
|
|
struct list_head *head, bool augment)
|
|
{
|
|
/*
|
|
* VA is still not in the list, but we can
|
|
* identify its future previous list_head node.
|
|
*/
|
|
if (likely(parent)) {
|
|
head = &rb_entry(parent, struct vmap_area, rb_node)->list;
|
|
if (&parent->rb_right != link)
|
|
head = head->prev;
|
|
}
|
|
|
|
/* Insert to the rb-tree */
|
|
rb_link_node(&va->rb_node, parent, link);
|
|
if (augment) {
|
|
/*
|
|
* Some explanation here. Just perform simple insertion
|
|
* to the tree. We do not set va->subtree_max_size to
|
|
* its current size before calling rb_insert_augmented().
|
|
* It is because we populate the tree from the bottom
|
|
* to parent levels when the node _is_ in the tree.
|
|
*
|
|
* Therefore we set subtree_max_size to zero after insertion,
|
|
* to let __augment_tree_propagate_from() puts everything to
|
|
* the correct order later on.
|
|
*/
|
|
rb_insert_augmented(&va->rb_node,
|
|
root, &free_vmap_area_rb_augment_cb);
|
|
va->subtree_max_size = 0;
|
|
} else {
|
|
rb_insert_color(&va->rb_node, root);
|
|
}
|
|
|
|
/* Address-sort this list */
|
|
list_add(&va->list, head);
|
|
}
|
|
|
|
static __always_inline void
|
|
link_va(struct vmap_area *va, struct rb_root *root,
|
|
struct rb_node *parent, struct rb_node **link,
|
|
struct list_head *head)
|
|
{
|
|
__link_va(va, root, parent, link, head, false);
|
|
}
|
|
|
|
static __always_inline void
|
|
link_va_augment(struct vmap_area *va, struct rb_root *root,
|
|
struct rb_node *parent, struct rb_node **link,
|
|
struct list_head *head)
|
|
{
|
|
__link_va(va, root, parent, link, head, true);
|
|
}
|
|
|
|
static __always_inline void
|
|
__unlink_va(struct vmap_area *va, struct rb_root *root, bool augment)
|
|
{
|
|
if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
|
|
return;
|
|
|
|
if (augment)
|
|
rb_erase_augmented(&va->rb_node,
|
|
root, &free_vmap_area_rb_augment_cb);
|
|
else
|
|
rb_erase(&va->rb_node, root);
|
|
|
|
list_del_init(&va->list);
|
|
RB_CLEAR_NODE(&va->rb_node);
|
|
}
|
|
|
|
static __always_inline void
|
|
unlink_va(struct vmap_area *va, struct rb_root *root)
|
|
{
|
|
__unlink_va(va, root, false);
|
|
}
|
|
|
|
static __always_inline void
|
|
unlink_va_augment(struct vmap_area *va, struct rb_root *root)
|
|
{
|
|
__unlink_va(va, root, true);
|
|
}
|
|
|
|
#if DEBUG_AUGMENT_PROPAGATE_CHECK
/*
 * Recompute what subtree_max_size of @va should be: the largest of its own
 * size and the values stored in its left and right children.
 *
 * Gets called when remove the node and rotate.
 */
static __always_inline unsigned long
compute_subtree_max_size(struct vmap_area *va)
{
	const unsigned long own = va_size(va);
	const unsigned long left = get_subtree_max_size(va->rb_node.rb_left);
	const unsigned long right = get_subtree_max_size(va->rb_node.rb_right);

	return max3(own, left, right);
}

/*
 * Walk free_vmap_area_list and report every VA whose stored
 * subtree_max_size differs from the recomputed value.
 */
static void
augment_tree_propagate_check(void)
{
	struct vmap_area *va;

	list_for_each_entry(va, &free_vmap_area_list, list) {
		if (compute_subtree_max_size(va) == va->subtree_max_size)
			continue;

		pr_emerg("tree is corrupted: %lu, %lu\n",
			va_size(va), va->subtree_max_size);
	}
}
#endif
|
|
|
|
/*
|
|
* This function populates subtree_max_size from bottom to upper
|
|
* levels starting from VA point. The propagation must be done
|
|
* when VA size is modified by changing its va_start/va_end. Or
|
|
* in case of newly inserting of VA to the tree.
|
|
*
|
|
* It means that __augment_tree_propagate_from() must be called:
|
|
* - After VA has been inserted to the tree(free path);
|
|
* - After VA has been shrunk(allocation path);
|
|
* - After VA has been increased(merging path).
|
|
*
|
|
* Please note that, it does not mean that upper parent nodes
|
|
* and their subtree_max_size are recalculated all the time up
|
|
* to the root node.
|
|
*
|
|
* 4--8
|
|
* /\
|
|
* / \
|
|
* / \
|
|
* 2--2 8--8
|
|
*
|
|
* For example if we modify the node 4, shrinking it to 2, then
|
|
* no any modification is required. If we shrink the node 2 to 1
|
|
* its subtree_max_size is updated only, and set to 1. If we shrink
|
|
* the node 8 to 6, then its subtree_max_size is set to 6 and parent
|
|
* node becomes 4--6.
|
|
*/
|
|
static __always_inline void
|
|
augment_tree_propagate_from(struct vmap_area *va)
|
|
{
|
|
/*
|
|
* Populate the tree from bottom towards the root until
|
|
* the calculated maximum available size of checked node
|
|
* is equal to its current one.
|
|
*/
|
|
free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);
|
|
|
|
#if DEBUG_AUGMENT_PROPAGATE_CHECK
|
|
augment_tree_propagate_check();
|
|
#endif
|
|
}
|
|
|
|
static void
|
|
insert_vmap_area(struct vmap_area *va,
|
|
struct rb_root *root, struct list_head *head)
|
|
{
|
|
struct rb_node **link;
|
|
struct rb_node *parent;
|
|
|
|
link = find_va_links(va, root, NULL, &parent);
|
|
if (link)
|
|
link_va(va, root, parent, link, head);
|
|
}
|
|
|
|
static void
|
|
insert_vmap_area_augment(struct vmap_area *va,
|
|
struct rb_node *from, struct rb_root *root,
|
|
struct list_head *head)
|
|
{
|
|
struct rb_node **link;
|
|
struct rb_node *parent;
|
|
|
|
if (from)
|
|
link = find_va_links(va, NULL, from, &parent);
|
|
else
|
|
link = find_va_links(va, root, NULL, &parent);
|
|
|
|
if (link) {
|
|
link_va_augment(va, root, parent, link, head);
|
|
augment_tree_propagate_from(va);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Merge de-allocated chunk of VA memory with previous
|
|
* and next free blocks. If coalesce is not done a new
|
|
* free area is inserted. If VA has been merged, it is
|
|
* freed.
|
|
*
|
|
* Please note, it can return NULL in case of overlap
|
|
* ranges, followed by WARN() report. Despite it is a
|
|
* buggy behaviour, a system can be alive and keep
|
|
* ongoing.
|
|
*/
|
|
static __always_inline struct vmap_area *
|
|
__merge_or_add_vmap_area(struct vmap_area *va,
|
|
struct rb_root *root, struct list_head *head, bool augment)
|
|
{
|
|
struct vmap_area *sibling;
|
|
struct list_head *next;
|
|
struct rb_node **link;
|
|
struct rb_node *parent;
|
|
bool merged = false;
|
|
|
|
/*
|
|
* Find a place in the tree where VA potentially will be
|
|
* inserted, unless it is merged with its sibling/siblings.
|
|
*/
|
|
link = find_va_links(va, root, NULL, &parent);
|
|
if (!link)
|
|
return NULL;
|
|
|
|
/*
|
|
* Get next node of VA to check if merging can be done.
|
|
*/
|
|
next = get_va_next_sibling(parent, link);
|
|
if (unlikely(next == NULL))
|
|
goto insert;
|
|
|
|
/*
|
|
* start end
|
|
* | |
|
|
* |<------VA------>|<-----Next----->|
|
|
* | |
|
|
* start end
|
|
*/
|
|
if (next != head) {
|
|
sibling = list_entry(next, struct vmap_area, list);
|
|
if (sibling->va_start == va->va_end) {
|
|
sibling->va_start = va->va_start;
|
|
|
|
/* Free vmap_area object. */
|
|
kmem_cache_free(vmap_area_cachep, va);
|
|
|
|
/* Point to the new merged area. */
|
|
va = sibling;
|
|
merged = true;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* start end
|
|
* | |
|
|
* |<-----Prev----->|<------VA------>|
|
|
* | |
|
|
* start end
|
|
*/
|
|
if (next->prev != head) {
|
|
sibling = list_entry(next->prev, struct vmap_area, list);
|
|
if (sibling->va_end == va->va_start) {
|
|
/*
|
|
* If both neighbors are coalesced, it is important
|
|
* to unlink the "next" node first, followed by merging
|
|
* with "previous" one. Otherwise the tree might not be
|
|
* fully populated if a sibling's augmented value is
|
|
* "normalized" because of rotation operations.
|
|
*/
|
|
if (merged)
|
|
__unlink_va(va, root, augment);
|
|
|
|
sibling->va_end = va->va_end;
|
|
|
|
/* Free vmap_area object. */
|
|
kmem_cache_free(vmap_area_cachep, va);
|
|
|
|
/* Point to the new merged area. */
|
|
va = sibling;
|
|
merged = true;
|
|
}
|
|
}
|
|
|
|
insert:
|
|
if (!merged)
|
|
__link_va(va, root, parent, link, head, augment);
|
|
|
|
return va;
|
|
}
|
|
|
|
static __always_inline struct vmap_area *
|
|
merge_or_add_vmap_area(struct vmap_area *va,
|
|
struct rb_root *root, struct list_head *head)
|
|
{
|
|
return __merge_or_add_vmap_area(va, root, head, false);
|
|
}
|
|
|
|
static __always_inline struct vmap_area *
|
|
merge_or_add_vmap_area_augment(struct vmap_area *va,
|
|
struct rb_root *root, struct list_head *head)
|
|
{
|
|
va = __merge_or_add_vmap_area(va, root, head, true);
|
|
if (va)
|
|
augment_tree_propagate_from(va);
|
|
|
|
return va;
|
|
}
|
|
|
|
static __always_inline bool
|
|
is_within_this_va(struct vmap_area *va, unsigned long size,
|
|
unsigned long align, unsigned long vstart)
|
|
{
|
|
unsigned long nva_start_addr;
|
|
|
|
if (va->va_start > vstart)
|
|
nva_start_addr = ALIGN(va->va_start, align);
|
|
else
|
|
nva_start_addr = ALIGN(vstart, align);
|
|
|
|
/* Can be overflowed due to big size or alignment. */
|
|
if (nva_start_addr + size < nva_start_addr ||
|
|
nva_start_addr < vstart)
|
|
return false;
|
|
|
|
return (nva_start_addr + size <= va->va_end);
|
|
}
|
|
|
|
/*
|
|
* Find the first free block(lowest start address) in the tree,
|
|
* that will accomplish the request corresponding to passing
|
|
* parameters. Please note, with an alignment bigger than PAGE_SIZE,
|
|
* a search length is adjusted to account for worst case alignment
|
|
* overhead.
|
|
*/
|
|
static __always_inline struct vmap_area *
|
|
find_vmap_lowest_match(struct rb_root *root, unsigned long size,
|
|
unsigned long align, unsigned long vstart, bool adjust_search_size)
|
|
{
|
|
struct vmap_area *va;
|
|
struct rb_node *node;
|
|
unsigned long length;
|
|
|
|
/* Start from the root. */
|
|
node = root->rb_node;
|
|
|
|
/* Adjust the search size for alignment overhead. */
|
|
length = adjust_search_size ? size + align - 1 : size;
|
|
|
|
while (node) {
|
|
va = rb_entry(node, struct vmap_area, rb_node);
|
|
|
|
if (get_subtree_max_size(node->rb_left) >= length &&
|
|
vstart < va->va_start) {
|
|
node = node->rb_left;
|
|
} else {
|
|
if (is_within_this_va(va, size, align, vstart))
|
|
return va;
|
|
|
|
/*
|
|
* Does not make sense to go deeper towards the right
|
|
* sub-tree if it does not have a free block that is
|
|
* equal or bigger to the requested search length.
|
|
*/
|
|
if (get_subtree_max_size(node->rb_right) >= length) {
|
|
node = node->rb_right;
|
|
continue;
|
|
}
|
|
|
|
/*
|
|
* OK. We roll back and find the first right sub-tree,
|
|
* that will satisfy the search criteria. It can happen
|
|
* due to "vstart" restriction or an alignment overhead
|
|
* that is bigger then PAGE_SIZE.
|
|
*/
|
|
while ((node = rb_parent(node))) {
|
|
va = rb_entry(node, struct vmap_area, rb_node);
|
|
if (is_within_this_va(va, size, align, vstart))
|
|
return va;
|
|
|
|
if (get_subtree_max_size(node->rb_right) >= length &&
|
|
vstart <= va->va_start) {
|
|
/*
|
|
* Shift the vstart forward. Please note, we update it with
|
|
* parent's start address adding "1" because we do not want
|
|
* to enter same sub-tree after it has already been checked
|
|
* and no suitable free block found there.
|
|
*/
|
|
vstart = va->va_start + 1;
|
|
node = node->rb_right;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
#include <linux/random.h>

/*
 * Reference implementation for the check below: scan the list @head in
 * order and return the first area that passes is_within_this_va(), or
 * NULL.
 */
static struct vmap_area *
find_vmap_lowest_linear_match(struct list_head *head, unsigned long size,
	unsigned long align, unsigned long vstart)
{
	struct vmap_area *va;

	list_for_each_entry(va, head, list) {
		if (is_within_this_va(va, size, align, vstart))
			return va;
	}

	return NULL;
}

/*
 * Pick a random vstart at or above VMALLOC_START and verify that the tree
 * search and the linear list search agree on the lowest match. A mismatch
 * is reported with pr_emerg().
 */
static void
find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head,
			     unsigned long size, unsigned long align)
{
	struct vmap_area *tree_va, *list_va;
	unsigned long vstart;
	unsigned int rnd;

	get_random_bytes(&rnd, sizeof(rnd));
	vstart = VMALLOC_START + rnd;

	tree_va = find_vmap_lowest_match(root, size, align, vstart, false);
	list_va = find_vmap_lowest_linear_match(head, size, align, vstart);

	if (tree_va == list_va)
		return;

	pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
		tree_va, list_va, vstart);
}
#endif
|
|
|
|
/*
 * How a requested range [start, start + size) sits inside a free
 * vmap_area, as computed by classify_va_fit_type() and acted upon by
 * va_clip().
 */
enum fit_type {
|
|
/* The range is not inside the free area at all. */
NOTHING_FIT = 0,
|
|
/* The range covers the free area exactly. */
FL_FIT_TYPE = 1, /* full fit */
|
|
/* The range starts at va_start and ends before va_end. */
LE_FIT_TYPE = 2, /* left edge fit */
|
|
/* The range starts after va_start and ends at va_end. */
RE_FIT_TYPE = 3, /* right edge fit */
|
|
/* The range touches neither edge; the free area must be split in two. */
NE_FIT_TYPE = 4 /* no edge fit */
|
|
};
|
|
|
|
static __always_inline enum fit_type
|
|
classify_va_fit_type(struct vmap_area *va,
|
|
unsigned long nva_start_addr, unsigned long size)
|
|
{
|
|
enum fit_type type;
|
|
|
|
/* Check if it is within VA. */
|
|
if (nva_start_addr < va->va_start ||
|
|
nva_start_addr + size > va->va_end)
|
|
return NOTHING_FIT;
|
|
|
|
/* Now classify. */
|
|
if (va->va_start == nva_start_addr) {
|
|
if (va->va_end == nva_start_addr + size)
|
|
type = FL_FIT_TYPE;
|
|
else
|
|
type = LE_FIT_TYPE;
|
|
} else if (va->va_end == nva_start_addr + size) {
|
|
type = RE_FIT_TYPE;
|
|
} else {
|
|
type = NE_FIT_TYPE;
|
|
}
|
|
|
|
return type;
|
|
}
|
|
|
|
static __always_inline int
|
|
va_clip(struct rb_root *root, struct list_head *head,
|
|
struct vmap_area *va, unsigned long nva_start_addr,
|
|
unsigned long size)
|
|
{
|
|
struct vmap_area *lva = NULL;
|
|
enum fit_type type = classify_va_fit_type(va, nva_start_addr, size);
|
|
|
|
if (type == FL_FIT_TYPE) {
|
|
/*
|
|
* No need to split VA, it fully fits.
|
|
*
|
|
* | |
|
|
* V NVA V
|
|
* |---------------|
|
|
*/
|
|
unlink_va_augment(va, root);
|
|
kmem_cache_free(vmap_area_cachep, va);
|
|
} else if (type == LE_FIT_TYPE) {
|
|
/*
|
|
* Split left edge of fit VA.
|
|
*
|
|
* | |
|
|
* V NVA V R
|
|
* |-------|-------|
|
|
*/
|
|
va->va_start += size;
|
|
} else if (type == RE_FIT_TYPE) {
|
|
/*
|
|
* Split right edge of fit VA.
|
|
*
|
|
* | |
|
|
* L V NVA V
|
|
* |-------|-------|
|
|
*/
|
|
va->va_end = nva_start_addr;
|
|
} else if (type == NE_FIT_TYPE) {
|
|
/*
|
|
* Split no edge of fit VA.
|
|
*
|
|
* | |
|
|
* L V NVA V R
|
|
* |---|-------|---|
|
|
*/
|
|
lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
|
|
if (unlikely(!lva)) {
|
|
/*
|
|
* For percpu allocator we do not do any pre-allocation
|
|
* and leave it as it is. The reason is it most likely
|
|
* never ends up with NE_FIT_TYPE splitting. In case of
|
|
* percpu allocations offsets and sizes are aligned to
|
|
* fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
|
|
* are its main fitting cases.
|
|
*
|
|
* There are a few exceptions though, as an example it is
|
|
* a first allocation (early boot up) when we have "one"
|
|
* big free space that has to be split.
|
|
*
|
|
* Also we can hit this path in case of regular "vmap"
|
|
* allocations, if "this" current CPU was not preloaded.
|
|
* See the comment in alloc_vmap_area() why. If so, then
|
|
* GFP_NOWAIT is used instead to get an extra object for
|
|
* split purpose. That is rare and most time does not
|
|
* occur.
|
|
*
|
|
* What happens if an allocation gets failed. Basically,
|
|
* an "overflow" path is triggered to purge lazily freed
|
|
* areas to free some memory, then, the "retry" path is
|
|
* triggered to repeat one more time. See more details
|
|
* in alloc_vmap_area() function.
|
|
*/
|
|
lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
|
|
if (!lva)
|
|
return -1;
|
|
}
|
|
|
|
/*
|
|
* Build the remainder.
|
|
*/
|
|
lva->va_start = va->va_start;
|
|
lva->va_end = nva_start_addr;
|
|
|
|
/*
|
|
* Shrink this VA to remaining size.
|
|
*/
|
|
va->va_start = nva_start_addr + size;
|
|
} else {
|
|
return -1;
|
|
}
|
|
|
|
if (type != FL_FIT_TYPE) {
|
|
augment_tree_propagate_from(va);
|
|
|
|
if (lva) /* type == NE_FIT_TYPE */
|
|
insert_vmap_area_augment(lva, &va->rb_node, root, head);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static unsigned long
|
|
va_alloc(struct vmap_area *va,
|
|
struct rb_root *root, struct list_head *head,
|
|
unsigned long size, unsigned long align,
|
|
unsigned long vstart, unsigned long vend)
|
|
{
|
|
unsigned long nva_start_addr;
|
|
int ret;
|
|
|
|
if (va->va_start > vstart)
|
|
nva_start_addr = ALIGN(va->va_start, align);
|
|
else
|
|
nva_start_addr = ALIGN(vstart, align);
|
|
|
|
/* Check the "vend" restriction. */
|
|
if (nva_start_addr + size > vend)
|
|
return vend;
|
|
|
|
/* Update the free vmap_area. */
|
|
ret = va_clip(root, head, va, nva_start_addr, size);
|
|
if (WARN_ON_ONCE(ret))
|
|
return vend;
|
|
|
|
return nva_start_addr;
|
|
}
|
|
|
|
/*
|
|
* Returns a start address of the newly allocated area, if success.
|
|
* Otherwise a vend is returned that indicates failure.
|
|
*/
|
|
static __always_inline unsigned long
|
|
__alloc_vmap_area(struct rb_root *root, struct list_head *head,
|
|
unsigned long size, unsigned long align,
|
|
unsigned long vstart, unsigned long vend)
|
|
{
|
|
bool adjust_search_size = true;
|
|
unsigned long nva_start_addr;
|
|
struct vmap_area *va;
|
|
|
|
/*
|
|
* Do not adjust when:
|
|
* a) align <= PAGE_SIZE, because it does not make any sense.
|
|
* All blocks(their start addresses) are at least PAGE_SIZE
|
|
* aligned anyway;
|
|
* b) a short range where a requested size corresponds to exactly
|
|
* specified [vstart:vend] interval and an alignment > PAGE_SIZE.
|
|
* With adjusted search length an allocation would not succeed.
|
|
*/
|
|
if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size))
|
|
adjust_search_size = false;
|
|
|
|
va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size);
|
|
if (unlikely(!va))
|
|
return vend;
|
|
|
|
nva_start_addr = va_alloc(va, root, head, size, align, vstart, vend);
|
|
if (nva_start_addr == vend)
|
|
return vend;
|
|
|
|
#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
|
|
find_vmap_lowest_match_check(root, head, size, align);
|
|
#endif
|
|
|
|
return nva_start_addr;
|
|
}
|
|
|
|
/*
|
|
* Free a region of KVA allocated by alloc_vmap_area
|
|
*/
|
|
static void free_vmap_area(struct vmap_area *va)
|
|
{
|
|
struct vmap_node *vn = addr_to_node(va->va_start);
|
|
|
|
/*
|
|
* Remove from the busy tree/list.
|
|
*/
|
|
spin_lock(&vn->busy.lock);
|
|
unlink_va(va, &vn->busy.root);
|
|
spin_unlock(&vn->busy.lock);
|
|
|
|
/*
|
|
* Insert/Merge it back to the free tree/list.
|
|
*/
|
|
spin_lock(&free_vmap_area_lock);
|
|
merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
|
|
spin_unlock(&free_vmap_area_lock);
|
|
}
|
|
|
|
static inline void
|
|
preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
|
|
{
|
|
struct vmap_area *va = NULL, *tmp;
|
|
|
|
/*
|
|
* Preload this CPU with one extra vmap_area object. It is used
|
|
* when fit type of free area is NE_FIT_TYPE. It guarantees that
|
|
* a CPU that does an allocation is preloaded.
|
|
*
|
|
* We do it in non-atomic context, thus it allows us to use more
|
|
* permissive allocation masks to be more stable under low memory
|
|
* condition and high memory pressure.
|
|
*/
|
|
if (!this_cpu_read(ne_fit_preload_node))
|
|
va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
|
|
|
|
spin_lock(lock);
|
|
|
|
tmp = NULL;
|
|
if (va && !__this_cpu_try_cmpxchg(ne_fit_preload_node, &tmp, va))
|
|
kmem_cache_free(vmap_area_cachep, va);
|
|
}
|
|
|
|
static struct vmap_pool *
|
|
size_to_va_pool(struct vmap_node *vn, unsigned long size)
|
|
{
|
|
unsigned int idx = (size - 1) / PAGE_SIZE;
|
|
|
|
if (idx < MAX_VA_SIZE_PAGES)
|
|
return &vn->pool[idx];
|
|
|
|
return NULL;
|
|
}
|
|
|
|
static bool
|
|
node_pool_add_va(struct vmap_node *n, struct vmap_area *va)
|
|
{
|
|
struct vmap_pool *vp;
|
|
|
|
vp = size_to_va_pool(n, va_size(va));
|
|
if (!vp)
|
|
return false;
|
|
|
|
spin_lock(&n->pool_lock);
|
|
list_add(&va->list, &vp->head);
|
|
WRITE_ONCE(vp->len, vp->len + 1);
|
|
spin_unlock(&n->pool_lock);
|
|
|
|
return true;
|
|
}
|
|
|
|
static struct vmap_area *
|
|
node_pool_del_va(struct vmap_node *vn, unsigned long size,
|
|
unsigned long align, unsigned long vstart,
|
|
unsigned long vend)
|
|
{
|
|
struct vmap_area *va = NULL;
|
|
struct vmap_pool *vp;
|
|
int err = 0;
|
|
|
|
vp = size_to_va_pool(vn, size);
|
|
if (!vp || list_empty(&vp->head))
|
|
return NULL;
|
|
|
|
spin_lock(&vn->pool_lock);
|
|
if (!list_empty(&vp->head)) {
|
|
va = list_first_entry(&vp->head, struct vmap_area, list);
|
|
|
|
if (IS_ALIGNED(va->va_start, align)) {
|
|
/*
|
|
* Do some sanity check and emit a warning
|
|
* if one of below checks detects an error.
|
|
*/
|
|
err |= (va_size(va) != size);
|
|
err |= (va->va_start < vstart);
|
|
err |= (va->va_end > vend);
|
|
|
|
if (!WARN_ON_ONCE(err)) {
|
|
list_del_init(&va->list);
|
|
WRITE_ONCE(vp->len, vp->len - 1);
|
|
} else {
|
|
va = NULL;
|
|
}
|
|
} else {
|
|
list_move_tail(&va->list, &vp->head);
|
|
va = NULL;
|
|
}
|
|
}
|
|
spin_unlock(&vn->pool_lock);
|
|
|
|
return va;
|
|
}
|
|
|
|
static struct vmap_area *
|
|
node_alloc(unsigned long size, unsigned long align,
|
|
unsigned long vstart, unsigned long vend,
|
|
unsigned long *addr, unsigned int *vn_id)
|
|
{
|
|
struct vmap_area *va;
|
|
|
|
*vn_id = 0;
|
|
*addr = vend;
|
|
|
|
/*
|
|
* Fallback to a global heap if not vmalloc or there
|
|
* is only one node.
|
|
*/
|
|
if (vstart != VMALLOC_START || vend != VMALLOC_END ||
|
|
nr_vmap_nodes == 1)
|
|
return NULL;
|
|
|
|
*vn_id = raw_smp_processor_id() % nr_vmap_nodes;
|
|
va = node_pool_del_va(id_to_node(*vn_id), size, align, vstart, vend);
|
|
*vn_id = encode_vn_id(*vn_id);
|
|
|
|
if (va)
|
|
*addr = va->va_start;
|
|
|
|
return va;
|
|
}
|
|
|
|
static inline void setup_vmalloc_vm(struct vm_struct *vm,
|
|
struct vmap_area *va, unsigned long flags, const void *caller)
|
|
{
|
|
vm->flags = flags;
|
|
vm->addr = (void *)va->va_start;
|
|
vm->size = va_size(va);
|
|
vm->caller = caller;
|
|
va->vm = vm;
|
|
}
|
|
|
|
/*
|
|
* Allocate a region of KVA of the specified size and alignment, within the
|
|
* vstart and vend. If vm is passed in, the two will also be bound.
|
|
*/
|
|
static struct vmap_area *alloc_vmap_area(unsigned long size,
|
|
unsigned long align,
|
|
unsigned long vstart, unsigned long vend,
|
|
int node, gfp_t gfp_mask,
|
|
unsigned long va_flags, struct vm_struct *vm)
|
|
{
|
|
struct vmap_node *vn;
|
|
struct vmap_area *va;
|
|
unsigned long freed;
|
|
unsigned long addr;
|
|
unsigned int vn_id;
|
|
int purged = 0;
|
|
int ret;
|
|
|
|
if (unlikely(!size || offset_in_page(size) || !is_power_of_2(align)))
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
if (unlikely(!vmap_initialized))
|
|
return ERR_PTR(-EBUSY);
|
|
|
|
might_sleep();
|
|
|
|
/*
|
|
* If a VA is obtained from a global heap(if it fails here)
|
|
* it is anyway marked with this "vn_id" so it is returned
|
|
* to this pool's node later. Such way gives a possibility
|
|
* to populate pools based on users demand.
|
|
*
|
|
* On success a ready to go VA is returned.
|
|
*/
|
|
va = node_alloc(size, align, vstart, vend, &addr, &vn_id);
|
|
if (!va) {
|
|
gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
|
|
|
|
va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
|
|
if (unlikely(!va))
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
/*
|
|
* Only scan the relevant parts containing pointers to other objects
|
|
* to avoid false negatives.
|
|
*/
|
|
kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
|
|
}
|
|
|
|
retry:
|
|
if (addr == vend) {
|
|
preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
|
|
addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
|
|
size, align, vstart, vend);
|
|
spin_unlock(&free_vmap_area_lock);
|
|
}
|
|
|
|
trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend);
|
|
|
|
/*
|
|
* If an allocation fails, the "vend" address is
|
|
* returned. Therefore trigger the overflow path.
|
|
*/
|
|
if (unlikely(addr == vend))
|
|
goto overflow;
|
|
|
|
va->va_start = addr;
|
|
va->va_end = addr + size;
|
|
va->vm = NULL;
|
|
va->flags = (va_flags | vn_id);
|
|
|
|
if (vm) {
|
|
vm->addr = (void *)va->va_start;
|
|
vm->size = va_size(va);
|
|
va->vm = vm;
|
|
}
|
|
|
|
vn = addr_to_node(va->va_start);
|
|
|
|
spin_lock(&vn->busy.lock);
|
|
insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
|
|
spin_unlock(&vn->busy.lock);
|
|
|
|
BUG_ON(!IS_ALIGNED(va->va_start, align));
|
|
BUG_ON(va->va_start < vstart);
|
|
BUG_ON(va->va_end > vend);
|
|
|
|
ret = kasan_populate_vmalloc(addr, size);
|
|
if (ret) {
|
|
free_vmap_area(va);
|
|
return ERR_PTR(ret);
|
|
}
|
|
|
|
return va;
|
|
|
|
overflow:
|
|
if (!purged) {
|
|
reclaim_and_purge_vmap_areas();
|
|
purged = 1;
|
|
goto retry;
|
|
}
|
|
|
|
freed = 0;
|
|
blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
|
|
|
|
if (freed > 0) {
|
|
purged = 0;
|
|
goto retry;
|
|
}
|
|
|
|
if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
|
|
pr_warn("vmalloc_node_range for size %lu failed: Address range restricted to %#lx - %#lx\n",
|
|
size, vstart, vend);
|
|
|
|
kmem_cache_free(vmap_area_cachep, va);
|
|
return ERR_PTR(-EBUSY);
|
|
}
|
|
|
|
int register_vmap_purge_notifier(struct notifier_block *nb)
|
|
{
|
|
return blocking_notifier_chain_register(&vmap_notify_list, nb);
|
|
}
|
|
EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);
|
|
|
|
int unregister_vmap_purge_notifier(struct notifier_block *nb)
|
|
{
|
|
return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
|
|
}
|
|
EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
|
|
|
|
/*
|
|
* lazy_max_pages is the maximum amount of virtual address space we gather up
|
|
* before attempting to purge with a TLB flush.
|
|
*
|
|
* There is a tradeoff here: a larger number will cover more kernel page tables
|
|
* and take slightly longer to purge, but it will linearly reduce the number of
|
|
* global TLB flushes that must be performed. It would seem natural to scale
|
|
* this number up linearly with the number of CPUs (because vmapping activity
|
|
* could also scale linearly with the number of CPUs), however it is likely
|
|
* that in practice, workloads might be constrained in other ways that mean
|
|
* vmap activity will not scale linearly with CPUs. Also, I want to be
|
|
* conservative and not introduce a big latency on huge systems, so go with
|
|
* a less aggressive log scale. It will still be an improvement over the old
|
|
* code, and it will be simple to change the scale factor if we find that it
|
|
* becomes a problem on bigger systems.
|
|
*/
|
|
static unsigned long lazy_max_pages(void)
|
|
{
|
|
unsigned int log;
|
|
|
|
log = fls(num_online_cpus());
|
|
|
|
return log * (32UL * 1024 * 1024 / PAGE_SIZE);
|
|
}
|
|
|
|
static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
|
|
|
|
/*
|
|
* Serialize vmap purging. There is no actual critical section protected
|
|
* by this lock, but we want to avoid concurrent calls for performance
|
|
* reasons and to make the pcpu_get_vm_areas more deterministic.
|
|
*/
|
|
static DEFINE_MUTEX(vmap_purge_lock);
|
|
|
|
/* for per-CPU blocks */
|
|
static void purge_fragmented_blocks_allcpus(void);
|
|
static cpumask_t purge_nodes;
|
|
|
|
static void
|
|
reclaim_list_global(struct list_head *head)
|
|
{
|
|
struct vmap_area *va, *n;
|
|
|
|
if (list_empty(head))
|
|
return;
|
|
|
|
spin_lock(&free_vmap_area_lock);
|
|
list_for_each_entry_safe(va, n, head, list)
|
|
merge_or_add_vmap_area_augment(va,
|
|
&free_vmap_area_root, &free_vmap_area_list);
|
|
spin_unlock(&free_vmap_area_lock);
|
|
}
|
|
|
|
static void
|
|
decay_va_pool_node(struct vmap_node *vn, bool full_decay)
|
|
{
|
|
LIST_HEAD(decay_list);
|
|
struct rb_root decay_root = RB_ROOT;
|
|
struct vmap_area *va, *nva;
|
|
unsigned long n_decay;
|
|
int i;
|
|
|
|
for (i = 0; i < MAX_VA_SIZE_PAGES; i++) {
|
|
LIST_HEAD(tmp_list);
|
|
|
|
if (list_empty(&vn->pool[i].head))
|
|
continue;
|
|
|
|
/* Detach the pool, so no-one can access it. */
|
|
spin_lock(&vn->pool_lock);
|
|
list_replace_init(&vn->pool[i].head, &tmp_list);
|
|
spin_unlock(&vn->pool_lock);
|
|
|
|
if (full_decay)
|
|
WRITE_ONCE(vn->pool[i].len, 0);
|
|
|
|
/* Decay a pool by ~25% out of left objects. */
|
|
n_decay = vn->pool[i].len >> 2;
|
|
|
|
list_for_each_entry_safe(va, nva, &tmp_list, list) {
|
|
list_del_init(&va->list);
|
|
merge_or_add_vmap_area(va, &decay_root, &decay_list);
|
|
|
|
if (!full_decay) {
|
|
WRITE_ONCE(vn->pool[i].len, vn->pool[i].len - 1);
|
|
|
|
if (!--n_decay)
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Attach the pool back if it has been partly decayed.
|
|
* Please note, it is supposed that nobody(other contexts)
|
|
* can populate the pool therefore a simple list replace
|
|
* operation takes place here.
|
|
*/
|
|
if (!full_decay && !list_empty(&tmp_list)) {
|
|
spin_lock(&vn->pool_lock);
|
|
list_replace_init(&tmp_list, &vn->pool[i].head);
|
|
spin_unlock(&vn->pool_lock);
|
|
}
|
|
}
|
|
|
|
reclaim_list_global(&decay_list);
|
|
}
|
|
|
|
static void purge_vmap_node(struct work_struct *work)
|
|
{
|
|
struct vmap_node *vn = container_of(work,
|
|
struct vmap_node, purge_work);
|
|
unsigned long nr_purged_pages = 0;
|
|
struct vmap_area *va, *n_va;
|
|
LIST_HEAD(local_list);
|
|
|
|
vn->nr_purged = 0;
|
|
|
|
list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
|
|
unsigned long nr = va_size(va) >> PAGE_SHIFT;
|
|
unsigned long orig_start = va->va_start;
|
|
unsigned long orig_end = va->va_end;
|
|
unsigned int vn_id = decode_vn_id(va->flags);
|
|
|
|
list_del_init(&va->list);
|
|
|
|
if (is_vmalloc_or_module_addr((void *)orig_start))
|
|
kasan_release_vmalloc(orig_start, orig_end,
|
|
va->va_start, va->va_end);
|
|
|
|
nr_purged_pages += nr;
|
|
vn->nr_purged++;
|
|
|
|
if (is_vn_id_valid(vn_id) && !vn->skip_populate)
|
|
if (node_pool_add_va(vn, va))
|
|
continue;
|
|
|
|
/* Go back to global. */
|
|
list_add(&va->list, &local_list);
|
|
}
|
|
|
|
atomic_long_sub(nr_purged_pages, &vmap_lazy_nr);
|
|
|
|
reclaim_list_global(&local_list);
|
|
}
|
|
|
|
/*
|
|
* Purges all lazily-freed vmap areas.
|
|
*/
|
|
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
|
|
bool full_pool_decay)
|
|
{
|
|
unsigned long nr_purged_areas = 0;
|
|
unsigned int nr_purge_helpers;
|
|
unsigned int nr_purge_nodes;
|
|
struct vmap_node *vn;
|
|
int i;
|
|
|
|
lockdep_assert_held(&vmap_purge_lock);
|
|
|
|
/*
|
|
* Use cpumask to mark which node has to be processed.
|
|
*/
|
|
purge_nodes = CPU_MASK_NONE;
|
|
|
|
for (i = 0; i < nr_vmap_nodes; i++) {
|
|
vn = &vmap_nodes[i];
|
|
|
|
INIT_LIST_HEAD(&vn->purge_list);
|
|
vn->skip_populate = full_pool_decay;
|
|
decay_va_pool_node(vn, full_pool_decay);
|
|
|
|
if (RB_EMPTY_ROOT(&vn->lazy.root))
|
|
continue;
|
|
|
|
spin_lock(&vn->lazy.lock);
|
|
WRITE_ONCE(vn->lazy.root.rb_node, NULL);
|
|
list_replace_init(&vn->lazy.head, &vn->purge_list);
|
|
spin_unlock(&vn->lazy.lock);
|
|
|
|
start = min(start, list_first_entry(&vn->purge_list,
|
|
struct vmap_area, list)->va_start);
|
|
|
|
end = max(end, list_last_entry(&vn->purge_list,
|
|
struct vmap_area, list)->va_end);
|
|
|
|
cpumask_set_cpu(i, &purge_nodes);
|
|
}
|
|
|
|
nr_purge_nodes = cpumask_weight(&purge_nodes);
|
|
if (nr_purge_nodes > 0) {
|
|
flush_tlb_kernel_range(start, end);
|
|
|
|
/* One extra worker is per a lazy_max_pages() full set minus one. */
|
|
nr_purge_helpers = atomic_long_read(&vmap_lazy_nr) / lazy_max_pages();
|
|
nr_purge_helpers = clamp(nr_purge_helpers, 1U, nr_purge_nodes) - 1;
|
|
|
|
for_each_cpu(i, &purge_nodes) {
|
|
vn = &vmap_nodes[i];
|
|
|
|
if (nr_purge_helpers > 0) {
|
|
INIT_WORK(&vn->purge_work, purge_vmap_node);
|
|
|
|
if (cpumask_test_cpu(i, cpu_online_mask))
|
|
schedule_work_on(i, &vn->purge_work);
|
|
else
|
|
schedule_work(&vn->purge_work);
|
|
|
|
nr_purge_helpers--;
|
|
} else {
|
|
vn->purge_work.func = NULL;
|
|
purge_vmap_node(&vn->purge_work);
|
|
nr_purged_areas += vn->nr_purged;
|
|
}
|
|
}
|
|
|
|
for_each_cpu(i, &purge_nodes) {
|
|
vn = &vmap_nodes[i];
|
|
|
|
if (vn->purge_work.func) {
|
|
flush_work(&vn->purge_work);
|
|
nr_purged_areas += vn->nr_purged;
|
|
}
|
|
}
|
|
}
|
|
|
|
trace_purge_vmap_area_lazy(start, end, nr_purged_areas);
|
|
return nr_purged_areas > 0;
|
|
}
|
|
|
|
/*
|
|
* Reclaim vmap areas by purging fragmented blocks and purge_vmap_area_list.
|
|
*/
|
|
static void reclaim_and_purge_vmap_areas(void)
|
|
|
|
{
|
|
mutex_lock(&vmap_purge_lock);
|
|
purge_fragmented_blocks_allcpus();
|
|
__purge_vmap_area_lazy(ULONG_MAX, 0, true);
|
|
mutex_unlock(&vmap_purge_lock);
|
|
}
|
|
|
|
static void drain_vmap_area_work(struct work_struct *work)
|
|
{
|
|
mutex_lock(&vmap_purge_lock);
|
|
__purge_vmap_area_lazy(ULONG_MAX, 0, false);
|
|
mutex_unlock(&vmap_purge_lock);
|
|
}
|
|
|
|
/*
|
|
* Free a vmap area, caller ensuring that the area has been unmapped,
|
|
* unlinked and flush_cache_vunmap had been called for the correct
|
|
* range previously.
|
|
*/
|
|
static void free_vmap_area_noflush(struct vmap_area *va)
|
|
{
|
|
unsigned long nr_lazy_max = lazy_max_pages();
|
|
unsigned long va_start = va->va_start;
|
|
unsigned int vn_id = decode_vn_id(va->flags);
|
|
struct vmap_node *vn;
|
|
unsigned long nr_lazy;
|
|
|
|
if (WARN_ON_ONCE(!list_empty(&va->list)))
|
|
return;
|
|
|
|
nr_lazy = atomic_long_add_return(va_size(va) >> PAGE_SHIFT,
|
|
&vmap_lazy_nr);
|
|
|
|
/*
|
|
* If it was request by a certain node we would like to
|
|
* return it to that node, i.e. its pool for later reuse.
|
|
*/
|
|
vn = is_vn_id_valid(vn_id) ?
|
|
id_to_node(vn_id):addr_to_node(va->va_start);
|
|
|
|
spin_lock(&vn->lazy.lock);
|
|
insert_vmap_area(va, &vn->lazy.root, &vn->lazy.head);
|
|
spin_unlock(&vn->lazy.lock);
|
|
|
|
trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max);
|
|
|
|
/* After this point, we may free va at any time */
|
|
if (unlikely(nr_lazy > nr_lazy_max))
|
|
schedule_work(&drain_vmap_work);
|
|
}
|
|
|
|
/*
|
|
* Free and unmap a vmap area
|
|
*/
|
|
static void free_unmap_vmap_area(struct vmap_area *va)
|
|
{
|
|
flush_cache_vunmap(va->va_start, va->va_end);
|
|
vunmap_range_noflush(va->va_start, va->va_end);
|
|
if (debug_pagealloc_enabled_static())
|
|
flush_tlb_kernel_range(va->va_start, va->va_end);
|
|
|
|
free_vmap_area_noflush(va);
|
|
}
|
|
|
|
struct vmap_area *find_vmap_area(unsigned long addr)
|
|
{
|
|
struct vmap_node *vn;
|
|
struct vmap_area *va;
|
|
int i, j;
|
|
|
|
if (unlikely(!vmap_initialized))
|
|
return NULL;
|
|
|
|
/*
|
|
* An addr_to_node_id(addr) converts an address to a node index
|
|
* where a VA is located. If VA spans several zones and passed
|
|
* addr is not the same as va->va_start, what is not common, we
|
|
* may need to scan extra nodes. See an example:
|
|
*
|
|
* <----va---->
|
|
* -|-----|-----|-----|-----|-
|
|
* 1 2 0 1
|
|
*
|
|
* VA resides in node 1 whereas it spans 1, 2 an 0. If passed
|
|
* addr is within 2 or 0 nodes we should do extra work.
|
|
*/
|
|
i = j = addr_to_node_id(addr);
|
|
do {
|
|
vn = &vmap_nodes[i];
|
|
|
|
spin_lock(&vn->busy.lock);
|
|
va = __find_vmap_area(addr, &vn->busy.root);
|
|
spin_unlock(&vn->busy.lock);
|
|
|
|
if (va)
|
|
return va;
|
|
} while ((i = (i + 1) % nr_vmap_nodes) != j);
|
|
|
|
return NULL;
|
|
}
|
|
|
|
static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
|
|
{
|
|
struct vmap_node *vn;
|
|
struct vmap_area *va;
|
|
int i, j;
|
|
|
|
/*
|
|
* Check the comment in the find_vmap_area() about the loop.
|
|
*/
|
|
i = j = addr_to_node_id(addr);
|
|
do {
|
|
vn = &vmap_nodes[i];
|
|
|
|
spin_lock(&vn->busy.lock);
|
|
va = __find_vmap_area(addr, &vn->busy.root);
|
|
if (va)
|
|
unlink_va(va, &vn->busy.root);
|
|
spin_unlock(&vn->busy.lock);
|
|
|
|
if (va)
|
|
return va;
|
|
} while ((i = (i + 1) % nr_vmap_nodes) != j);
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/*** Per cpu kva allocator ***/

/*
 * vmap space is limited especially on 32 bit architectures. Ensure there is
 * room for at least 16 percpu vmap blocks per CPU.
 *
 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
 * to #define VMALLOC_SPACE		(VMALLOC_END-VMALLOC_START). Guess
 * instead (we just need a rough idea): 128MB on 32 bit, 128GB otherwise.
 */
#if BITS_PER_LONG == 32
#define VMALLOC_SPACE		(128UL << 20)
#else
#define VMALLOC_SPACE		(128UL << 30)
#endif

#define VMALLOC_PAGES		(VMALLOC_SPACE / PAGE_SIZE)
#define VMAP_MAX_ALLOC		BITS_PER_LONG	/* 256K with 4K pages (64 bit) */
#define VMAP_BBMAP_BITS_MAX	1024	/* 4MB with 4K pages */
#define VMAP_BBMAP_BITS_MIN	(VMAP_MAX_ALLOC * 2)
#define VMAP_MIN(x, y)		((x) < (y) ? (x) : (y)) /* can't use min() */
#define VMAP_MAX(x, y)		((x) > (y) ? (x) : (y)) /* can't use max() */

/*
 * Number of pages per vmap block: a sixteenth of the per-CPU share of
 * VMALLOC_PAGES, kept within [VMAP_BBMAP_BITS_MIN, VMAP_BBMAP_BITS_MAX].
 */
#define VMAP_BBMAP_BITS		\
		VMAP_MIN(VMAP_BBMAP_BITS_MAX,	\
		VMAP_MAX(VMAP_BBMAP_BITS_MIN,	\
			VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))

#define VMAP_BLOCK_SIZE		(VMAP_BBMAP_BITS * PAGE_SIZE)

/*
 * Purge threshold to prevent overeager purging of fragmented blocks for
 * regular operations: Purge if vb->free is less than 1/4 of the capacity.
 */
#define VMAP_PURGE_THRESHOLD	(VMAP_BBMAP_BITS / 4)

#define VMAP_RAM		0x1 /* indicates vm_map_ram area*/
#define VMAP_BLOCK		0x2 /* mark out the vmap_block sub-type*/
#define VMAP_FLAGS_MASK		(VMAP_RAM | VMAP_BLOCK)
|
|
|
|
/*
 * Per-CPU queue of vmap blocks. It is also used as a slot of the hash
 * that maps an address to its vmap_block, see addr_to_vb_xa().
 */
struct vmap_block_queue {
	/* protects updates of the "free" list */
	spinlock_t lock;
	/* blocks that still have free space, walked under RCU */
	struct list_head free;

	/*
	 * An xarray requires an extra memory dynamically to
	 * be allocated. If it is an issue, we can use rb-tree
	 * instead.
	 */
	struct xarray vmap_blocks;
};
|
|
|
|
struct vmap_block {
|
|
spinlock_t lock;
|
|
struct vmap_area *va;
|
|
unsigned long free, dirty;
|
|
DECLARE_BITMAP(used_map, VMAP_BBMAP_BITS);
|
|
unsigned long dirty_min, dirty_max; /*< dirty range */
|
|
struct list_head free_list;
|
|
struct rcu_head rcu_head;
|
|
struct list_head purge;
|
|
unsigned int cpu;
|
|
};
|
|
|
|
/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
|
|
static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
|
|
|
|
/*
|
|
* In order to fast access to any "vmap_block" associated with a
|
|
* specific address, we use a hash.
|
|
*
|
|
* A per-cpu vmap_block_queue is used in both ways, to serialize
|
|
* an access to free block chains among CPUs(alloc path) and it
|
|
* also acts as a vmap_block hash(alloc/free paths). It means we
|
|
* overload it, since we already have the per-cpu array which is
|
|
* used as a hash table. When used as a hash a 'cpu' passed to
|
|
* per_cpu() is not actually a CPU but rather a hash index.
|
|
*
|
|
* A hash function is addr_to_vb_xa() which hashes any address
|
|
* to a specific index(in a hash) it belongs to. This then uses a
|
|
* per_cpu() macro to access an array with generated index.
|
|
*
|
|
* An example:
|
|
*
|
|
* CPU_1 CPU_2 CPU_0
|
|
* | | |
|
|
* V V V
|
|
* 0 10 20 30 40 50 60
|
|
* |------|------|------|------|------|------|...<vmap address space>
|
|
* CPU0 CPU1 CPU2 CPU0 CPU1 CPU2
|
|
*
|
|
* - CPU_1 invokes vm_unmap_ram(6), 6 belongs to CPU0 zone, thus
|
|
* it access: CPU0/INDEX0 -> vmap_blocks -> xa_lock;
|
|
*
|
|
* - CPU_2 invokes vm_unmap_ram(11), 11 belongs to CPU1 zone, thus
|
|
* it access: CPU1/INDEX1 -> vmap_blocks -> xa_lock;
|
|
*
|
|
* - CPU_0 invokes vm_unmap_ram(20), 20 belongs to CPU2 zone, thus
|
|
* it access: CPU2/INDEX2 -> vmap_blocks -> xa_lock.
|
|
*
|
|
* This technique almost always avoids lock contention on insert/remove,
|
|
* however xarray spinlocks protect against any contention that remains.
|
|
*/
|
|
static struct xarray *
|
|
addr_to_vb_xa(unsigned long addr)
|
|
{
|
|
int index = (addr / VMAP_BLOCK_SIZE) % nr_cpu_ids;
|
|
|
|
/*
|
|
* Please note, nr_cpu_ids points on a highest set
|
|
* possible bit, i.e. we never invoke cpumask_next()
|
|
* if an index points on it which is nr_cpu_ids - 1.
|
|
*/
|
|
if (!cpu_possible(index))
|
|
index = cpumask_next(index, cpu_possible_mask);
|
|
|
|
return &per_cpu(vmap_block_queue, index).vmap_blocks;
|
|
}
|
|
|
|
/*
|
|
* We should probably have a fallback mechanism to allocate virtual memory
|
|
* out of partially filled vmap blocks. However vmap block sizing should be
|
|
* fairly reasonable according to the vmalloc size, so it shouldn't be a
|
|
* big problem.
|
|
*/
|
|
|
|
static unsigned long addr_to_vb_idx(unsigned long addr)
|
|
{
|
|
addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
|
|
addr /= VMAP_BLOCK_SIZE;
|
|
return addr;
|
|
}
|
|
|
|
static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
|
|
{
|
|
unsigned long addr;
|
|
|
|
addr = va_start + (pages_off << PAGE_SHIFT);
|
|
BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
|
|
return (void *)addr;
|
|
}
|
|
|
|
/**
|
|
* new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
|
|
* block. Of course pages number can't exceed VMAP_BBMAP_BITS
|
|
* @order: how many 2^order pages should be occupied in newly allocated block
|
|
* @gfp_mask: flags for the page level allocator
|
|
*
|
|
* Return: virtual address in a newly allocated block or ERR_PTR(-errno)
|
|
*/
|
|
static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
|
|
{
|
|
struct vmap_block_queue *vbq;
|
|
struct vmap_block *vb;
|
|
struct vmap_area *va;
|
|
struct xarray *xa;
|
|
unsigned long vb_idx;
|
|
int node, err;
|
|
void *vaddr;
|
|
|
|
node = numa_node_id();
|
|
|
|
vb = kmalloc_node(sizeof(struct vmap_block),
|
|
gfp_mask & GFP_RECLAIM_MASK, node);
|
|
if (unlikely(!vb))
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
|
|
VMALLOC_START, VMALLOC_END,
|
|
node, gfp_mask,
|
|
VMAP_RAM|VMAP_BLOCK, NULL);
|
|
if (IS_ERR(va)) {
|
|
kfree(vb);
|
|
return ERR_CAST(va);
|
|
}
|
|
|
|
vaddr = vmap_block_vaddr(va->va_start, 0);
|
|
spin_lock_init(&vb->lock);
|
|
vb->va = va;
|
|
/* At least something should be left free */
|
|
BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
|
|
bitmap_zero(vb->used_map, VMAP_BBMAP_BITS);
|
|
vb->free = VMAP_BBMAP_BITS - (1UL << order);
|
|
vb->dirty = 0;
|
|
vb->dirty_min = VMAP_BBMAP_BITS;
|
|
vb->dirty_max = 0;
|
|
bitmap_set(vb->used_map, 0, (1UL << order));
|
|
INIT_LIST_HEAD(&vb->free_list);
|
|
vb->cpu = raw_smp_processor_id();
|
|
|
|
xa = addr_to_vb_xa(va->va_start);
|
|
vb_idx = addr_to_vb_idx(va->va_start);
|
|
err = xa_insert(xa, vb_idx, vb, gfp_mask);
|
|
if (err) {
|
|
kfree(vb);
|
|
free_vmap_area(va);
|
|
return ERR_PTR(err);
|
|
}
|
|
/*
|
|
* list_add_tail_rcu could happened in another core
|
|
* rather than vb->cpu due to task migration, which
|
|
* is safe as list_add_tail_rcu will ensure the list's
|
|
* integrity together with list_for_each_rcu from read
|
|
* side.
|
|
*/
|
|
vbq = per_cpu_ptr(&vmap_block_queue, vb->cpu);
|
|
spin_lock(&vbq->lock);
|
|
list_add_tail_rcu(&vb->free_list, &vbq->free);
|
|
spin_unlock(&vbq->lock);
|
|
|
|
return vaddr;
|
|
}
|
|
|
|
static void free_vmap_block(struct vmap_block *vb)
|
|
{
|
|
struct vmap_node *vn;
|
|
struct vmap_block *tmp;
|
|
struct xarray *xa;
|
|
|
|
xa = addr_to_vb_xa(vb->va->va_start);
|
|
tmp = xa_erase(xa, addr_to_vb_idx(vb->va->va_start));
|
|
BUG_ON(tmp != vb);
|
|
|
|
vn = addr_to_node(vb->va->va_start);
|
|
spin_lock(&vn->busy.lock);
|
|
unlink_va(vb->va, &vn->busy.root);
|
|
spin_unlock(&vn->busy.lock);
|
|
|
|
free_vmap_area_noflush(vb->va);
|
|
kfree_rcu(vb, rcu_head);
|
|
}
|
|
|
|
/*
 * Move @vb to @purge_list if it is purgeable.
 *
 * A block is purgeable when all of its pages are either free or dirty
 * (free + dirty == VMAP_BBMAP_BITS) and it is not fully dirty. Unless
 * @force_purge is set, the block must additionally have fewer than
 * VMAP_PURGE_THRESHOLD free pages.
 *
 * Callers in this file hold vb->lock. Returns true if the block has been
 * queued for purging.
 */
static bool purge_fragmented_block(struct vmap_block *vb,
		struct list_head *purge_list, bool force_purge)
{
	struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, vb->cpu);

	if (vb->free + vb->dirty != VMAP_BBMAP_BITS)
		return false;

	if (vb->dirty == VMAP_BBMAP_BITS)
		return false;

	/* Don't overeagerly purge usable blocks unless requested */
	if (!force_purge && vb->free >= VMAP_PURGE_THRESHOLD)
		return false;

	/* prevent further allocs after releasing lock */
	WRITE_ONCE(vb->free, 0);
	/* prevent purging it again */
	WRITE_ONCE(vb->dirty, VMAP_BBMAP_BITS);
	/* the whole block is dirty now */
	vb->dirty_min = 0;
	vb->dirty_max = VMAP_BBMAP_BITS;

	spin_lock(&vbq->lock);
	list_del_rcu(&vb->free_list);
	spin_unlock(&vbq->lock);

	list_add_tail(&vb->purge, purge_list);
	return true;
}
|
|
|
|
static void free_purged_blocks(struct list_head *purge_list)
|
|
{
|
|
struct vmap_block *vb, *n_vb;
|
|
|
|
list_for_each_entry_safe(vb, n_vb, purge_list, purge) {
|
|
list_del(&vb->purge);
|
|
free_vmap_block(vb);
|
|
}
|
|
}
|
|
|
|
static void purge_fragmented_blocks(int cpu)
|
|
{
|
|
LIST_HEAD(purge);
|
|
struct vmap_block *vb;
|
|
struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
|
|
|
|
rcu_read_lock();
|
|
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
|
|
unsigned long free = READ_ONCE(vb->free);
|
|
unsigned long dirty = READ_ONCE(vb->dirty);
|
|
|
|
if (free + dirty != VMAP_BBMAP_BITS ||
|
|
dirty == VMAP_BBMAP_BITS)
|
|
continue;
|
|
|
|
spin_lock(&vb->lock);
|
|
purge_fragmented_block(vb, &purge, true);
|
|
spin_unlock(&vb->lock);
|
|
}
|
|
rcu_read_unlock();
|
|
free_purged_blocks(&purge);
|
|
}
|
|
|
|
static void purge_fragmented_blocks_allcpus(void)
|
|
{
|
|
int cpu;
|
|
|
|
for_each_possible_cpu(cpu)
|
|
purge_fragmented_blocks(cpu);
|
|
}
|
|
|
|
/*
 * Allocate @size bytes of address space from a vmap block.
 *
 * @size must be page aligned and must not exceed
 * PAGE_SIZE * VMAP_MAX_ALLOC. The request is rounded up to 2^order pages
 * and taken right after the already handed out pages of the first
 * suitable block on this CPU's free list. If no block fits, a new block
 * is allocated.
 *
 * Returns the address, or an ERR_PTR() (-EINVAL for a zero @size or the
 * error from new_vmap_block()).
 */
static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
{
	struct vmap_block_queue *vbq;
	struct vmap_block *vb;
	unsigned long nr_pages;
	void *vaddr = NULL;
	unsigned int order;

	BUG_ON(offset_in_page(size));
	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
	if (WARN_ON(size == 0)) {
		/*
		 * Allocating 0 bytes isn't what caller wants since
		 * get_order(0) returns funny result. Just warn and terminate
		 * early.
		 */
		return ERR_PTR(-EINVAL);
	}
	order = get_order(size);
	nr_pages = 1UL << order;

	rcu_read_lock();
	vbq = raw_cpu_ptr(&vmap_block_queue);
	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
		unsigned long pages_off;

		/* Cheap lockless check first, repeated under the lock. */
		if (READ_ONCE(vb->free) < nr_pages)
			continue;

		spin_lock(&vb->lock);
		if (vb->free < nr_pages) {
			spin_unlock(&vb->lock);
			continue;
		}

		/* Allocate right behind the pages handed out so far. */
		pages_off = VMAP_BBMAP_BITS - vb->free;
		vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
		WRITE_ONCE(vb->free, vb->free - nr_pages);
		bitmap_set(vb->used_map, pages_off, nr_pages);

		/* A block without free pages leaves the free list. */
		if (vb->free == 0) {
			spin_lock(&vbq->lock);
			list_del_rcu(&vb->free_list);
			spin_unlock(&vbq->lock);
		}

		spin_unlock(&vb->lock);
		break;
	}

	rcu_read_unlock();

	/* Allocate new block if nothing was found */
	if (!vaddr)
		vaddr = new_vmap_block(order, gfp_mask);

	return vaddr;
}
|
|
|
|
static void vb_free(unsigned long addr, unsigned long size)
|
|
{
|
|
unsigned long offset;
|
|
unsigned int order;
|
|
struct vmap_block *vb;
|
|
struct xarray *xa;
|
|
|
|
BUG_ON(offset_in_page(size));
|
|
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
|
|
|
|
flush_cache_vunmap(addr, addr + size);
|
|
|
|
order = get_order(size);
|
|
offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
|
|
|
|
xa = addr_to_vb_xa(addr);
|
|
vb = xa_load(xa, addr_to_vb_idx(addr));
|
|
|
|
spin_lock(&vb->lock);
|
|
bitmap_clear(vb->used_map, offset, (1UL << order));
|
|
spin_unlock(&vb->lock);
|
|
|
|
vunmap_range_noflush(addr, addr + size);
|
|
|
|
if (debug_pagealloc_enabled_static())
|
|
flush_tlb_kernel_range(addr, addr + size);
|
|
|
|
spin_lock(&vb->lock);
|
|
|
|
/* Expand the not yet TLB flushed dirty range */
|
|
vb->dirty_min = min(vb->dirty_min, offset);
|
|
vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
|
|
|
|
WRITE_ONCE(vb->dirty, vb->dirty + (1UL << order));
|
|
if (vb->dirty == VMAP_BBMAP_BITS) {
|
|
BUG_ON(vb->free);
|
|
spin_unlock(&vb->lock);
|
|
free_vmap_block(vb);
|
|
} else
|
|
spin_unlock(&vb->lock);
|
|
}
|
|
|
|
static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
|
|
{
|
|
LIST_HEAD(purge_list);
|
|
int cpu;
|
|
|
|
if (unlikely(!vmap_initialized))
|
|
return;
|
|
|
|
mutex_lock(&vmap_purge_lock);
|
|
|
|
for_each_possible_cpu(cpu) {
|
|
struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
|
|
struct vmap_block *vb;
|
|
unsigned long idx;
|
|
|
|
rcu_read_lock();
|
|
xa_for_each(&vbq->vmap_blocks, idx, vb) {
|
|
spin_lock(&vb->lock);
|
|
|
|
/*
|
|
* Try to purge a fragmented block first. If it's
|
|
* not purgeable, check whether there is dirty
|
|
* space to be flushed.
|
|
*/
|
|
if (!purge_fragmented_block(vb, &purge_list, false) &&
|
|
vb->dirty_max && vb->dirty != VMAP_BBMAP_BITS) {
|
|
unsigned long va_start = vb->va->va_start;
|
|
unsigned long s, e;
|
|
|
|
s = va_start + (vb->dirty_min << PAGE_SHIFT);
|
|
e = va_start + (vb->dirty_max << PAGE_SHIFT);
|
|
|
|
start = min(s, start);
|
|
end = max(e, end);
|
|
|
|
/* Prevent that this is flushed again */
|
|
vb->dirty_min = VMAP_BBMAP_BITS;
|
|
vb->dirty_max = 0;
|
|
|
|
flush = 1;
|
|
}
|
|
spin_unlock(&vb->lock);
|
|
}
|
|
rcu_read_unlock();
|
|
}
|
|
free_purged_blocks(&purge_list);
|
|
|
|
if (!__purge_vmap_area_lazy(start, end, false) && flush)
|
|
flush_tlb_kernel_range(start, end);
|
|
mutex_unlock(&vmap_purge_lock);
|
|
}
|
|
|
|
/**
|
|
* vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
|
|
*
|
|
* The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
|
|
* to amortize TLB flushing overheads. What this means is that any page you
|
|
* have now, may, in a former life, have been mapped into kernel virtual
|
|
* address by the vmap layer and so there might be some CPUs with TLB entries
|
|
* still referencing that page (additional to the regular 1:1 kernel mapping).
|
|
*
|
|
* vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
|
|
* be sure that none of the pages we have control over will have any aliases
|
|
* from the vmap layer.
|
|
*/
|
|
void vm_unmap_aliases(void)
|
|
{
|
|
unsigned long start = ULONG_MAX, end = 0;
|
|
int flush = 0;
|
|
|
|
_vm_unmap_aliases(start, end, flush);
|
|
}
|
|
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
|
|
|
|
/**
|
|
* vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
|
|
* @mem: the pointer returned by vm_map_ram
|
|
* @count: the count passed to that vm_map_ram call (cannot unmap partial)
|
|
*/
|
|
void vm_unmap_ram(const void *mem, unsigned int count)
|
|
{
|
|
unsigned long size = (unsigned long)count << PAGE_SHIFT;
|
|
unsigned long addr = (unsigned long)kasan_reset_tag(mem);
|
|
struct vmap_area *va;
|
|
|
|
might_sleep();
|
|
BUG_ON(!addr);
|
|
BUG_ON(addr < VMALLOC_START);
|
|
BUG_ON(addr > VMALLOC_END);
|
|
BUG_ON(!PAGE_ALIGNED(addr));
|
|
|
|
kasan_poison_vmalloc(mem, size);
|
|
|
|
if (likely(count <= VMAP_MAX_ALLOC)) {
|
|
debug_check_no_locks_freed(mem, size);
|
|
vb_free(addr, size);
|
|
return;
|
|
}
|
|
|
|
va = find_unlink_vmap_area(addr);
|
|
if (WARN_ON_ONCE(!va))
|
|
return;
|
|
|
|
debug_check_no_locks_freed((void *)va->va_start, va_size(va));
|
|
free_unmap_vmap_area(va);
|
|
}
|
|
EXPORT_SYMBOL(vm_unmap_ram);
|
|
|
|
/**
|
|
* vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
|
|
* @pages: an array of pointers to the pages to be mapped
|
|
* @count: number of pages
|
|
* @node: prefer to allocate data structures on this node
|
|
*
|
|
* If you use this function for less than VMAP_MAX_ALLOC pages, it could be
|
|
* faster than vmap so it's good. But if you mix long-life and short-life
|
|
* objects with vm_map_ram(), it could consume lots of address space through
|
|
* fragmentation (especially on a 32bit machine). You could see failures in
|
|
* the end. Please use this function for short-lived objects.
|
|
*
|
|
* Returns: a pointer to the address that has been mapped, or %NULL on failure
|
|
*/
|
|
void *vm_map_ram(struct page **pages, unsigned int count, int node)
|
|
{
|
|
unsigned long size = (unsigned long)count << PAGE_SHIFT;
|
|
unsigned long addr;
|
|
void *mem;
|
|
|
|
if (likely(count <= VMAP_MAX_ALLOC)) {
|
|
mem = vb_alloc(size, GFP_KERNEL);
|
|
if (IS_ERR(mem))
|
|
return NULL;
|
|
addr = (unsigned long)mem;
|
|
} else {
|
|
struct vmap_area *va;
|
|
va = alloc_vmap_area(size, PAGE_SIZE,
|
|
VMALLOC_START, VMALLOC_END,
|
|
node, GFP_KERNEL, VMAP_RAM,
|
|
NULL);
|
|
if (IS_ERR(va))
|
|
return NULL;
|
|
|
|
addr = va->va_start;
|
|
mem = (void *)addr;
|
|
}
|
|
|
|
if (vmap_pages_range(addr, addr + size, PAGE_KERNEL,
|
|
pages, PAGE_SHIFT) < 0) {
|
|
vm_unmap_ram(mem, count);
|
|
return NULL;
|
|
}
|
|
|
|
/*
|
|
* Mark the pages as accessible, now that they are mapped.
|
|
* With hardware tag-based KASAN, marking is skipped for
|
|
* non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
|
|
*/
|
|
mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL);
|
|
|
|
return mem;
|
|
}
|
|
EXPORT_SYMBOL(vm_map_ram);
|
|
|
|
/*
 * Init-time list of vm_structs, linked through ->next and filled by
 * vm_area_add_early() / vm_area_register_early().
 */
static struct vm_struct *vmlist __initdata;
|
|
|
|
/*
 * Page order recorded for @vm.  Without CONFIG_HAVE_ARCH_HUGE_VMALLOC no
 * order is stored and 0 is returned.
 */
static inline unsigned int vm_area_page_order(struct vm_struct *vm)
{
#ifndef CONFIG_HAVE_ARCH_HUGE_VMALLOC
	return 0;
#else
	return vm->page_order;
#endif
}
|
|
|
|
/*
 * Record the page order for @vm.  Without CONFIG_HAVE_ARCH_HUGE_VMALLOC
 * there is nowhere to store it, so any non-zero @order is a BUG.
 */
static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
{
#ifndef CONFIG_HAVE_ARCH_HUGE_VMALLOC
	BUG_ON(order != 0);
#else
	vm->page_order = order;
#endif
}
|
|
|
|
/**
|
|
* vm_area_add_early - add vmap area early during boot
|
|
* @vm: vm_struct to add
|
|
*
|
|
* This function is used to add fixed kernel vm area to vmlist before
|
|
* vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags
|
|
* should contain proper values and the other fields should be zero.
|
|
*
|
|
* DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
|
|
*/
|
|
void __init vm_area_add_early(struct vm_struct *vm)
|
|
{
|
|
struct vm_struct *tmp, **p;
|
|
|
|
BUG_ON(vmap_initialized);
|
|
for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
|
|
if (tmp->addr >= vm->addr) {
|
|
BUG_ON(tmp->addr < vm->addr + vm->size);
|
|
break;
|
|
} else
|
|
BUG_ON(tmp->addr + tmp->size > vm->addr);
|
|
}
|
|
vm->next = *p;
|
|
*p = vm;
|
|
}
|
|
|
|
/**
|
|
* vm_area_register_early - register vmap area early during boot
|
|
* @vm: vm_struct to register
|
|
* @align: requested alignment
|
|
*
|
|
* This function is used to register kernel vm area before
|
|
* vmalloc_init() is called. @vm->size and @vm->flags should contain
|
|
* proper values on entry and other fields should be zero. On return,
|
|
* vm->addr contains the allocated address.
|
|
*
|
|
* DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
|
|
*/
|
|
void __init vm_area_register_early(struct vm_struct *vm, size_t align)
|
|
{
|
|
unsigned long addr = ALIGN(VMALLOC_START, align);
|
|
struct vm_struct *cur, **p;
|
|
|
|
BUG_ON(vmap_initialized);
|
|
|
|
for (p = &vmlist; (cur = *p) != NULL; p = &cur->next) {
|
|
if ((unsigned long)cur->addr - addr >= vm->size)
|
|
break;
|
|
addr = ALIGN((unsigned long)cur->addr + cur->size, align);
|
|
}
|
|
|
|
BUG_ON(addr > VMALLOC_END - vm->size);
|
|
vm->addr = (void *)addr;
|
|
vm->next = *p;
|
|
*p = vm;
|
|
kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
|
|
}
|
|
|
|
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
|
|
{
|
|
/*
|
|
* Before removing VM_UNINITIALIZED,
|
|
* we should make sure that vm has proper values.
|
|
* Pair with smp_rmb() in show_numa_info().
|
|
*/
|
|
smp_wmb();
|
|
vm->flags &= ~VM_UNINITIALIZED;
|
|
}
|
|
|
|
static struct vm_struct *__get_vm_area_node(unsigned long size,
|
|
unsigned long align, unsigned long shift, unsigned long flags,
|
|
unsigned long start, unsigned long end, int node,
|
|
gfp_t gfp_mask, const void *caller)
|
|
{
|
|
struct vmap_area *va;
|
|
struct vm_struct *area;
|
|
unsigned long requested_size = size;
|
|
|
|
BUG_ON(in_interrupt());
|
|
size = ALIGN(size, 1ul << shift);
|
|
if (unlikely(!size))
|
|
return NULL;
|
|
|
|
if (flags & VM_IOREMAP)
|
|
align = 1ul << clamp_t(int, get_count_order_long(size),
|
|
PAGE_SHIFT, IOREMAP_MAX_ORDER);
|
|
|
|
area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
|
|
if (unlikely(!area))
|
|
return NULL;
|
|
|
|
if (!(flags & VM_NO_GUARD))
|
|
size += PAGE_SIZE;
|
|
|
|
area->flags = flags;
|
|
area->caller = caller;
|
|
|
|
va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area);
|
|
if (IS_ERR(va)) {
|
|
kfree(area);
|
|
return NULL;
|
|
}
|
|
|
|
/*
|
|
* Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
|
|
* best-effort approach, as they can be mapped outside of vmalloc code.
|
|
* For VM_ALLOC mappings, the pages are marked as accessible after
|
|
* getting mapped in __vmalloc_node_range().
|
|
* With hardware tag-based KASAN, marking is skipped for
|
|
* non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
|
|
*/
|
|
if (!(flags & VM_ALLOC))
|
|
area->addr = kasan_unpoison_vmalloc(area->addr, requested_size,
|
|
KASAN_VMALLOC_PROT_NORMAL);
|
|
|
|
return area;
|
|
}
|
|
|
|
struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
|
|
unsigned long start, unsigned long end,
|
|
const void *caller)
|
|
{
|
|
return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end,
|
|
NUMA_NO_NODE, GFP_KERNEL, caller);
|
|
}
|
|
|
|
/**
|
|
* get_vm_area - reserve a contiguous kernel virtual area
|
|
* @size: size of the area
|
|
* @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
|
|
*
|
|
* Search an area of @size in the kernel virtual mapping area,
|
|
* and reserved it for out purposes. Returns the area descriptor
|
|
* on success or %NULL on failure.
|
|
*
|
|
* Return: the area descriptor on success or %NULL on failure.
|
|
*/
|
|
struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
|
|
{
|
|
return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
|
|
VMALLOC_START, VMALLOC_END,
|
|
NUMA_NO_NODE, GFP_KERNEL,
|
|
__builtin_return_address(0));
|
|
}
|
|
|
|
struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
|
|
const void *caller)
|
|
{
|
|
return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
|
|
VMALLOC_START, VMALLOC_END,
|
|
NUMA_NO_NODE, GFP_KERNEL, caller);
|
|
}
|
|
|
|
/**
|
|
* find_vm_area - find a continuous kernel virtual area
|
|
* @addr: base address
|
|
*
|
|
* Search for the kernel VM area starting at @addr, and return it.
|
|
* It is up to the caller to do all required locking to keep the returned
|
|
* pointer valid.
|
|
*
|
|
* Return: the area descriptor on success or %NULL on failure.
|
|
*/
|
|
struct vm_struct *find_vm_area(const void *addr)
|
|
{
|
|
struct vmap_area *va;
|
|
|
|
va = find_vmap_area((unsigned long)addr);
|
|
if (!va)
|
|
return NULL;
|
|
|
|
return va->vm;
|
|
}
|
|
|
|
/**
|
|
* remove_vm_area - find and remove a continuous kernel virtual area
|
|
* @addr: base address
|
|
*
|
|
* Search for the kernel VM area starting at @addr, and remove it.
|
|
* This function returns the found VM area, but using it is NOT safe
|
|
* on SMP machines, except for its size or flags.
|
|
*
|
|
* Return: the area descriptor on success or %NULL on failure.
|
|
*/
|
|
struct vm_struct *remove_vm_area(const void *addr)
|
|
{
|
|
struct vmap_area *va;
|
|
struct vm_struct *vm;
|
|
|
|
might_sleep();
|
|
|
|
if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
|
|
addr))
|
|
return NULL;
|
|
|
|
va = find_unlink_vmap_area((unsigned long)addr);
|
|
if (!va || !va->vm)
|
|
return NULL;
|
|
vm = va->vm;
|
|
|
|
debug_check_no_locks_freed(vm->addr, get_vm_area_size(vm));
|
|
debug_check_no_obj_freed(vm->addr, get_vm_area_size(vm));
|
|
kasan_free_module_shadow(vm);
|
|
kasan_poison_vmalloc(vm->addr, get_vm_area_size(vm));
|
|
|
|
free_unmap_vmap_area(va);
|
|
return vm;
|
|
}
|
|
|
|
static inline void set_area_direct_map(const struct vm_struct *area,
|
|
int (*set_direct_map)(struct page *page))
|
|
{
|
|
int i;
|
|
|
|
/* HUGE_VMALLOC passes small pages to set_direct_map */
|
|
for (i = 0; i < area->nr_pages; i++)
|
|
if (page_address(area->pages[i]))
|
|
set_direct_map(area->pages[i]);
|
|
}
|
|
|
|
/*
|
|
* Flush the vm mapping and reset the direct map.
|
|
*/
|
|
static void vm_reset_perms(struct vm_struct *area)
|
|
{
|
|
unsigned long start = ULONG_MAX, end = 0;
|
|
unsigned int page_order = vm_area_page_order(area);
|
|
int flush_dmap = 0;
|
|
int i;
|
|
|
|
/*
|
|
* Find the start and end range of the direct mappings to make sure that
|
|
* the vm_unmap_aliases() flush includes the direct map.
|
|
*/
|
|
for (i = 0; i < area->nr_pages; i += 1U << page_order) {
|
|
unsigned long addr = (unsigned long)page_address(area->pages[i]);
|
|
|
|
if (addr) {
|
|
unsigned long page_size;
|
|
|
|
page_size = PAGE_SIZE << page_order;
|
|
start = min(addr, start);
|
|
end = max(addr + page_size, end);
|
|
flush_dmap = 1;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Set direct map to something invalid so that it won't be cached if
|
|
* there are any accesses after the TLB flush, then flush the TLB and
|
|
* reset the direct map permissions to the default.
|
|
*/
|
|
set_area_direct_map(area, set_direct_map_invalid_noflush);
|
|
_vm_unmap_aliases(start, end, flush_dmap);
|
|
set_area_direct_map(area, set_direct_map_default_noflush);
|
|
}
|
|
|
|
static void delayed_vfree_work(struct work_struct *w)
|
|
{
|
|
struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
|
|
struct llist_node *t, *llnode;
|
|
|
|
llist_for_each_safe(llnode, t, llist_del_all(&p->list))
|
|
vfree(llnode);
|
|
}
|
|
|
|
/**
|
|
* vfree_atomic - release memory allocated by vmalloc()
|
|
* @addr: memory base address
|
|
*
|
|
* This one is just like vfree() but can be called in any atomic context
|
|
* except NMIs.
|
|
*/
|
|
void vfree_atomic(const void *addr)
|
|
{
|
|
struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
|
|
|
|
BUG_ON(in_nmi());
|
|
kmemleak_free(addr);
|
|
|
|
/*
|
|
* Use raw_cpu_ptr() because this can be called from preemptible
|
|
* context. Preemption is absolutely fine here, because the llist_add()
|
|
* implementation is lockless, so it works even if we are adding to
|
|
* another cpu's list. schedule_work() should be fine with this too.
|
|
*/
|
|
if (addr && llist_add((struct llist_node *)addr, &p->list))
|
|
schedule_work(&p->wq);
|
|
}
|
|
|
|
/**
|
|
* vfree - Release memory allocated by vmalloc()
|
|
* @addr: Memory base address
|
|
*
|
|
* Free the virtually continuous memory area starting at @addr, as obtained
|
|
* from one of the vmalloc() family of APIs. This will usually also free the
|
|
* physical memory underlying the virtual allocation, but that memory is
|
|
* reference counted, so it will not be freed until the last user goes away.
|
|
*
|
|
* If @addr is NULL, no operation is performed.
|
|
*
|
|
* Context:
|
|
* May sleep if called *not* from interrupt context.
|
|
* Must not be called in NMI context (strictly speaking, it could be
|
|
* if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
|
|
* conventions for vfree() arch-dependent would be a really bad idea).
|
|
*/
|
|
void vfree(const void *addr)
|
|
{
|
|
struct vm_struct *vm;
|
|
int i;
|
|
|
|
if (unlikely(in_interrupt())) {
|
|
vfree_atomic(addr);
|
|
return;
|
|
}
|
|
|
|
BUG_ON(in_nmi());
|
|
kmemleak_free(addr);
|
|
might_sleep();
|
|
|
|
if (!addr)
|
|
return;
|
|
|
|
vm = remove_vm_area(addr);
|
|
if (unlikely(!vm)) {
|
|
WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
|
|
addr);
|
|
return;
|
|
}
|
|
|
|
if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
|
|
vm_reset_perms(vm);
|
|
for (i = 0; i < vm->nr_pages; i++) {
|
|
struct page *page = vm->pages[i];
|
|
|
|
BUG_ON(!page);
|
|
mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
|
|
/*
|
|
* High-order allocs for huge vmallocs are split, so
|
|
* can be freed as an array of order-0 allocations
|
|
*/
|
|
__free_page(page);
|
|
cond_resched();
|
|
}
|
|
atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages);
|
|
kvfree(vm->pages);
|
|
kfree(vm);
|
|
}
|
|
EXPORT_SYMBOL(vfree);
|
|
|
|
/**
|
|
* vunmap - release virtual mapping obtained by vmap()
|
|
* @addr: memory base address
|
|
*
|
|
* Free the virtually contiguous memory area starting at @addr,
|
|
* which was created from the page array passed to vmap().
|
|
*
|
|
* Must not be called in interrupt context.
|
|
*/
|
|
void vunmap(const void *addr)
|
|
{
|
|
struct vm_struct *vm;
|
|
|
|
BUG_ON(in_interrupt());
|
|
might_sleep();
|
|
|
|
if (!addr)
|
|
return;
|
|
vm = remove_vm_area(addr);
|
|
if (unlikely(!vm)) {
|
|
WARN(1, KERN_ERR "Trying to vunmap() nonexistent vm area (%p)\n",
|
|
addr);
|
|
return;
|
|
}
|
|
kfree(vm);
|
|
}
|
|
EXPORT_SYMBOL(vunmap);
|
|
|
|
/**
|
|
* vmap - map an array of pages into virtually contiguous space
|
|
* @pages: array of page pointers
|
|
* @count: number of pages to map
|
|
* @flags: vm_area->flags
|
|
* @prot: page protection for the mapping
|
|
*
|
|
* Maps @count pages from @pages into contiguous kernel virtual space.
|
|
* If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself
|
|
* (which must be kmalloc or vmalloc memory) and one reference per pages in it
|
|
* are transferred from the caller to vmap(), and will be freed / dropped when
|
|
* vfree() is called on the return value.
|
|
*
|
|
* Return: the address of the area or %NULL on failure
|
|
*/
|
|
void *vmap(struct page **pages, unsigned int count,
|
|
unsigned long flags, pgprot_t prot)
|
|
{
|
|
struct vm_struct *area;
|
|
unsigned long addr;
|
|
unsigned long size; /* In bytes */
|
|
|
|
might_sleep();
|
|
|
|
if (WARN_ON_ONCE(flags & VM_FLUSH_RESET_PERMS))
|
|
return NULL;
|
|
|
|
/*
|
|
* Your top guard is someone else's bottom guard. Not having a top
|
|
* guard compromises someone else's mappings too.
|
|
*/
|
|
if (WARN_ON_ONCE(flags & VM_NO_GUARD))
|
|
flags &= ~VM_NO_GUARD;
|
|
|
|
if (count > totalram_pages())
|
|
return NULL;
|
|
|
|
size = (unsigned long)count << PAGE_SHIFT;
|
|
area = get_vm_area_caller(size, flags, __builtin_return_address(0));
|
|
if (!area)
|
|
return NULL;
|
|
|
|
addr = (unsigned long)area->addr;
|
|
if (vmap_pages_range(addr, addr + size, pgprot_nx(prot),
|
|
pages, PAGE_SHIFT) < 0) {
|
|
vunmap(area->addr);
|
|
return NULL;
|
|
}
|
|
|
|
if (flags & VM_MAP_PUT_PAGES) {
|
|
area->pages = pages;
|
|
area->nr_pages = count;
|
|
}
|
|
return area->addr;
|
|
}
|
|
EXPORT_SYMBOL(vmap);
|
|
|
|
#ifdef CONFIG_VMAP_PFN
|
|
/* State handed to vmap_pfn_apply() through its private pointer. */
struct vmap_pfn_data {
|
|
unsigned long *pfns; /* PFN array supplied to vmap_pfn() */
|
|
pgprot_t prot; /* protection used when building each PTE */
|
|
unsigned int idx; /* index into pfns of the next PFN to map */
|
|
};
|
|
|
|
/*
 * apply_to_page_range() callback used by vmap_pfn(): install a special PTE
 * for the next PFN of the array described by @private and advance the
 * index.  A PFN for which pfn_valid() is true is refused with a one-time
 * warning and -EINVAL.
 */
static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
{
	struct vmap_pfn_data *cursor = private;
	unsigned long pfn = cursor->pfns[cursor->idx];
	pte_t entry;

	if (WARN_ON_ONCE(pfn_valid(pfn)))
		return -EINVAL;

	entry = pfn_pte(pfn, cursor->prot);
	entry = pte_mkspecial(entry);
	set_pte_at(&init_mm, addr, pte, entry);

	cursor->idx++;
	return 0;
}
|
|
|
|
/**
|
|
* vmap_pfn - map an array of PFNs into virtually contiguous space
|
|
* @pfns: array of PFNs
|
|
* @count: number of pages to map
|
|
* @prot: page protection for the mapping
|
|
*
|
|
* Maps @count PFNs from @pfns into contiguous kernel virtual space and returns
|
|
* the start address of the mapping.
|
|
*/
|
|
void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
|
|
{
|
|
struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
|
|
struct vm_struct *area;
|
|
|
|
area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP,
|
|
__builtin_return_address(0));
|
|
if (!area)
|
|
return NULL;
|
|
if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
|
|
count * PAGE_SIZE, vmap_pfn_apply, &data)) {
|
|
free_vm_area(area);
|
|
return NULL;
|
|
}
|
|
|
|
flush_cache_vmap((unsigned long)area->addr,
|
|
(unsigned long)area->addr + count * PAGE_SIZE);
|
|
|
|
return area->addr;
|
|
}
|
|
EXPORT_SYMBOL_GPL(vmap_pfn);
|
|
#endif /* CONFIG_VMAP_PFN */
|
|
|
|
static inline unsigned int
|
|
vm_area_alloc_pages(gfp_t gfp, int nid,
|
|
unsigned int order, unsigned int nr_pages, struct page **pages)
|
|
{
|
|
unsigned int nr_allocated = 0;
|
|
struct page *page;
|
|
int i;
|
|
|
|
/*
|
|
* For order-0 pages we make use of bulk allocator, if
|
|
* the page array is partly or not at all populated due
|
|
* to fails, fallback to a single page allocator that is
|
|
* more permissive.
|
|
*/
|
|
if (!order) {
|
|
while (nr_allocated < nr_pages) {
|
|
unsigned int nr, nr_pages_request;
|
|
|
|
/*
|
|
* A maximum allowed request is hard-coded and is 100
|
|
* pages per call. That is done in order to prevent a
|
|
* long preemption off scenario in the bulk-allocator
|
|
* so the range is [1:100].
|
|
*/
|
|
nr_pages_request = min(100U, nr_pages - nr_allocated);
|
|
|
|
/* memory allocation should consider mempolicy, we can't
|
|
* wrongly use nearest node when nid == NUMA_NO_NODE,
|
|
* otherwise memory may be allocated in only one node,
|
|
* but mempolicy wants to alloc memory by interleaving.
|
|
*/
|
|
if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
|
|
nr = alloc_pages_bulk_array_mempolicy_noprof(gfp,
|
|
nr_pages_request,
|
|
pages + nr_allocated);
|
|
else
|
|
nr = alloc_pages_bulk_array_node_noprof(gfp, nid,
|
|
nr_pages_request,
|
|
pages + nr_allocated);
|
|
|
|
nr_allocated += nr;
|
|
cond_resched();
|
|
|
|
/*
|
|
* If zero or pages were obtained partly,
|
|
* fallback to a single page allocator.
|
|
*/
|
|
if (nr != nr_pages_request)
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* High-order pages or fallback path if "bulk" fails. */
|
|
while (nr_allocated < nr_pages) {
|
|
if (!(gfp & __GFP_NOFAIL) && fatal_signal_pending(current))
|
|
break;
|
|
|
|
if (nid == NUMA_NO_NODE)
|
|
page = alloc_pages_noprof(gfp, order);
|
|
else
|
|
page = alloc_pages_node_noprof(nid, gfp, order);
|
|
|
|
if (unlikely(!page))
|
|
break;
|
|
|
|
/*
|
|
* High-order allocations must be able to be treated as
|
|
* independent small pages by callers (as they can with
|
|
* small-page vmallocs). Some drivers do their own refcounting
|
|
* on vmalloc_to_page() pages, some use page->mapping,
|
|
* page->lru, etc.
|
|
*/
|
|
if (order)
|
|
split_page(page, order);
|
|
|
|
/*
|
|
* Careful, we allocate and map page-order pages, but
|
|
* tracking is done per PAGE_SIZE page so as to keep the
|
|
* vm_struct APIs independent of the physical/mapped size.
|
|
*/
|
|
for (i = 0; i < (1U << order); i++)
|
|
pages[nr_allocated + i] = page + i;
|
|
|
|
cond_resched();
|
|
nr_allocated += 1U << order;
|
|
}
|
|
|
|
return nr_allocated;
|
|
}
|
|
|
|
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
|
|
pgprot_t prot, unsigned int page_shift,
|
|
int node)
|
|
{
|
|
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
|
|
bool nofail = gfp_mask & __GFP_NOFAIL;
|
|
unsigned long addr = (unsigned long)area->addr;
|
|
unsigned long size = get_vm_area_size(area);
|
|
unsigned long array_size;
|
|
unsigned int nr_small_pages = size >> PAGE_SHIFT;
|
|
unsigned int page_order;
|
|
unsigned int flags;
|
|
int ret;
|
|
|
|
array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
|
|
|
|
if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
|
|
gfp_mask |= __GFP_HIGHMEM;
|
|
|
|
/* Please note that the recursion is strictly bounded. */
|
|
if (array_size > PAGE_SIZE) {
|
|
area->pages = __vmalloc_node_noprof(array_size, 1, nested_gfp, node,
|
|
area->caller);
|
|
} else {
|
|
area->pages = kmalloc_node_noprof(array_size, nested_gfp, node);
|
|
}
|
|
|
|
if (!area->pages) {
|
|
warn_alloc(gfp_mask, NULL,
|
|
"vmalloc error: size %lu, failed to allocated page array size %lu",
|
|
nr_small_pages * PAGE_SIZE, array_size);
|
|
free_vm_area(area);
|
|
return NULL;
|
|
}
|
|
|
|
set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
|
|
page_order = vm_area_page_order(area);
|
|
|
|
/*
|
|
* High-order nofail allocations are really expensive and
|
|
* potentially dangerous (pre-mature OOM, disruptive reclaim
|
|
* and compaction etc.
|
|
*
|
|
* Please note, the __vmalloc_node_range_noprof() falls-back
|
|
* to order-0 pages if high-order attempt is unsuccessful.
|
|
*/
|
|
area->nr_pages = vm_area_alloc_pages((page_order ?
|
|
gfp_mask & ~__GFP_NOFAIL : gfp_mask) | __GFP_NOWARN,
|
|
node, page_order, nr_small_pages, area->pages);
|
|
|
|
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
|
|
if (gfp_mask & __GFP_ACCOUNT) {
|
|
int i;
|
|
|
|
for (i = 0; i < area->nr_pages; i++)
|
|
mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1);
|
|
}
|
|
|
|
/*
|
|
* If not enough pages were obtained to accomplish an
|
|
* allocation request, free them via vfree() if any.
|
|
*/
|
|
if (area->nr_pages != nr_small_pages) {
|
|
/*
|
|
* vm_area_alloc_pages() can fail due to insufficient memory but
|
|
* also:-
|
|
*
|
|
* - a pending fatal signal
|
|
* - insufficient huge page-order pages
|
|
*
|
|
* Since we always retry allocations at order-0 in the huge page
|
|
* case a warning for either is spurious.
|
|
*/
|
|
if (!fatal_signal_pending(current) && page_order == 0)
|
|
warn_alloc(gfp_mask, NULL,
|
|
"vmalloc error: size %lu, failed to allocate pages",
|
|
area->nr_pages * PAGE_SIZE);
|
|
goto fail;
|
|
}
|
|
|
|
/*
|
|
* page tables allocations ignore external gfp mask, enforce it
|
|
* by the scope API
|
|
*/
|
|
if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
|
|
flags = memalloc_nofs_save();
|
|
else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
|
|
flags = memalloc_noio_save();
|
|
|
|
do {
|
|
ret = vmap_pages_range(addr, addr + size, prot, area->pages,
|
|
page_shift);
|
|
if (nofail && (ret < 0))
|
|
schedule_timeout_uninterruptible(1);
|
|
} while (nofail && (ret < 0));
|
|
|
|
if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
|
|
memalloc_nofs_restore(flags);
|
|
else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
|
|
memalloc_noio_restore(flags);
|
|
|
|
if (ret < 0) {
|
|
warn_alloc(gfp_mask, NULL,
|
|
"vmalloc error: size %lu, failed to map pages",
|
|
area->nr_pages * PAGE_SIZE);
|
|
goto fail;
|
|
}
|
|
|
|
return area->addr;
|
|
|
|
fail:
|
|
vfree(area->addr);
|
|
return NULL;
|
|
}
|
|
|
|
/**
|
|
* __vmalloc_node_range - allocate virtually contiguous memory
|
|
* @size: allocation size
|
|
* @align: desired alignment
|
|
* @start: vm area range start
|
|
* @end: vm area range end
|
|
* @gfp_mask: flags for the page level allocator
|
|
* @prot: protection mask for the allocated pages
|
|
* @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
|
|
* @node: node to use for allocation or NUMA_NO_NODE
|
|
* @caller: caller's return address
|
|
*
|
|
* Allocate enough pages to cover @size from the page level
|
|
* allocator with @gfp_mask flags. Please note that the full set of gfp
|
|
* flags are not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all
|
|
* supported.
|
|
* Zone modifiers are not supported. From the reclaim modifiers
|
|
* __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported)
|
|
* and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and
|
|
* __GFP_RETRY_MAYFAIL are not supported).
|
|
*
|
|
* __GFP_NOWARN can be used to suppress failures messages.
|
|
*
|
|
* Map them into contiguous kernel virtual space, using a pagetable
|
|
* protection of @prot.
|
|
*
|
|
* Return: the address of the area or %NULL on failure
|
|
*/
|
|
void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
|
|
unsigned long start, unsigned long end, gfp_t gfp_mask,
|
|
pgprot_t prot, unsigned long vm_flags, int node,
|
|
const void *caller)
|
|
{
|
|
struct vm_struct *area;
|
|
void *ret;
|
|
kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
|
|
unsigned long real_size = size;
|
|
unsigned long real_align = align;
|
|
unsigned int shift = PAGE_SHIFT;
|
|
|
|
if (WARN_ON_ONCE(!size))
|
|
return NULL;
|
|
|
|
if ((size >> PAGE_SHIFT) > totalram_pages()) {
|
|
warn_alloc(gfp_mask, NULL,
|
|
"vmalloc error: size %lu, exceeds total pages",
|
|
real_size);
|
|
return NULL;
|
|
}
|
|
|
|
if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) {
|
|
unsigned long size_per_node;
|
|
|
|
/*
|
|
* Try huge pages. Only try for PAGE_KERNEL allocations,
|
|
* others like modules don't yet expect huge pages in
|
|
* their allocations due to apply_to_page_range not
|
|
* supporting them.
|
|
*/
|
|
|
|
size_per_node = size;
|
|
if (node == NUMA_NO_NODE)
|
|
size_per_node /= num_online_nodes();
|
|
if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE)
|
|
shift = PMD_SHIFT;
|
|
else
|
|
shift = arch_vmap_pte_supported_shift(size_per_node);
|
|
|
|
align = max(real_align, 1UL << shift);
|
|
size = ALIGN(real_size, 1UL << shift);
|
|
}
|
|
|
|
again:
|
|
area = __get_vm_area_node(real_size, align, shift, VM_ALLOC |
|
|
VM_UNINITIALIZED | vm_flags, start, end, node,
|
|
gfp_mask, caller);
|
|
if (!area) {
|
|
bool nofail = gfp_mask & __GFP_NOFAIL;
|
|
warn_alloc(gfp_mask, NULL,
|
|
"vmalloc error: size %lu, vm_struct allocation failed%s",
|
|
real_size, (nofail) ? ". Retrying." : "");
|
|
if (nofail) {
|
|
schedule_timeout_uninterruptible(1);
|
|
goto again;
|
|
}
|
|
goto fail;
|
|
}
|
|
|
|
/*
|
|
* Prepare arguments for __vmalloc_area_node() and
|
|
* kasan_unpoison_vmalloc().
|
|
*/
|
|
if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) {
|
|
if (kasan_hw_tags_enabled()) {
|
|
/*
|
|
* Modify protection bits to allow tagging.
|
|
* This must be done before mapping.
|
|
*/
|
|
prot = arch_vmap_pgprot_tagged(prot);
|
|
|
|
/*
|
|
* Skip page_alloc poisoning and zeroing for physical
|
|
* pages backing VM_ALLOC mapping. Memory is instead
|
|
* poisoned and zeroed by kasan_unpoison_vmalloc().
|
|
*/
|
|
gfp_mask |= __GFP_SKIP_KASAN | __GFP_SKIP_ZERO;
|
|
}
|
|
|
|
/* Take note that the mapping is PAGE_KERNEL. */
|
|
kasan_flags |= KASAN_VMALLOC_PROT_NORMAL;
|
|
}
|
|
|
|
/* Allocate physical pages and map them into vmalloc space. */
|
|
ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
|
|
if (!ret)
|
|
goto fail;
|
|
|
|
/*
|
|
* Mark the pages as accessible, now that they are mapped.
|
|
* The condition for setting KASAN_VMALLOC_INIT should complement the
|
|
* one in post_alloc_hook() with regards to the __GFP_SKIP_ZERO check
|
|
* to make sure that memory is initialized under the same conditions.
|
|
* Tag-based KASAN modes only assign tags to normal non-executable
|
|
* allocations, see __kasan_unpoison_vmalloc().
|
|
*/
|
|
kasan_flags |= KASAN_VMALLOC_VM_ALLOC;
|
|
if (!want_init_on_free() && want_init_on_alloc(gfp_mask) &&
|
|
(gfp_mask & __GFP_SKIP_ZERO))
|
|
kasan_flags |= KASAN_VMALLOC_INIT;
|
|
/* KASAN_VMALLOC_PROT_NORMAL already set if required. */
|
|
area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags);
|
|
|
|
/*
|
|
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
|
|
* flag. It means that vm_struct is not fully initialized.
|
|
* Now, it is fully initialized, so remove this flag here.
|
|
*/
|
|
clear_vm_uninitialized_flag(area);
|
|
|
|
size = PAGE_ALIGN(size);
|
|
if (!(vm_flags & VM_DEFER_KMEMLEAK))
|
|
kmemleak_vmalloc(area, size, gfp_mask);
|
|
|
|
return area->addr;
|
|
|
|
fail:
|
|
if (shift > PAGE_SHIFT) {
|
|
shift = PAGE_SHIFT;
|
|
align = real_align;
|
|
size = real_size;
|
|
goto again;
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/**
|
|
* __vmalloc_node - allocate virtually contiguous memory
|
|
* @size: allocation size
|
|
* @align: desired alignment
|
|
* @gfp_mask: flags for the page level allocator
|
|
* @node: node to use for allocation or NUMA_NO_NODE
|
|
* @caller: caller's return address
|
|
*
|
|
* Allocate enough pages to cover @size from the page level allocator with
|
|
* @gfp_mask flags. Map them into contiguous kernel virtual space.
|
|
*
|
|
* Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
|
|
* and __GFP_NOFAIL are not supported
|
|
*
|
|
* Any use of gfp flags outside of GFP_KERNEL should be consulted
|
|
* with mm people.
|
|
*
|
|
* Return: pointer to the allocated memory or %NULL on error
|
|
*/
|
|
void *__vmalloc_node_noprof(unsigned long size, unsigned long align,
|
|
gfp_t gfp_mask, int node, const void *caller)
|
|
{
|
|
return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END,
|
|
gfp_mask, PAGE_KERNEL, 0, node, caller);
|
|
}
|
|
/*
|
|
* This is only for performance analysis of vmalloc and stress purpose.
|
|
* It is required by vmalloc test module, therefore do not use it other
|
|
* than that.
|
|
*/
|
|
#ifdef CONFIG_TEST_VMALLOC_MODULE
|
|
EXPORT_SYMBOL_GPL(__vmalloc_node_noprof);
|
|
#endif
|
|
|
|
/*
 * Allocate @size bytes of virtually contiguous memory using @gfp_mask.
 * No NUMA node preference is expressed (NUMA_NO_NODE), the alignment
 * argument is 1, and the return address of this function's caller is passed
 * on as the @caller argument of __vmalloc_node_noprof().
 */
void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask)
{
	const void *caller = __builtin_return_address(0);

	return __vmalloc_node_noprof(size, 1, gfp_mask, NUMA_NO_NODE, caller);
}
EXPORT_SYMBOL(__vmalloc_noprof);
|
|
|
|
/**
|
|
* vmalloc - allocate virtually contiguous memory
|
|
* @size: allocation size
|
|
*
|
|
* Allocate enough pages to cover @size from the page level
|
|
* allocator and map them into contiguous kernel virtual space.
|
|
*
|
|
* For tight control over page level allocator and protection flags
|
|
* use __vmalloc() instead.
|
|
*
|
|
* Return: pointer to the allocated memory or %NULL on error
|
|
*/
|
|
void *vmalloc_noprof(unsigned long size)
|
|
{
|
|
return __vmalloc_node_noprof(size, 1, GFP_KERNEL, NUMA_NO_NODE,
|
|
__builtin_return_address(0));
|
|
}
|
|
EXPORT_SYMBOL(vmalloc_noprof);
|
|
|
|
/**
|
|
* vmalloc_huge - allocate virtually contiguous memory, allow huge pages
|
|
* @size: allocation size
|
|
* @gfp_mask: flags for the page level allocator
|
|
*
|
|
* Allocate enough pages to cover @size from the page level
|
|
* allocator and map them into contiguous kernel virtual space.
|
|
* If @size is greater than or equal to PMD_SIZE, allow using
|
|
* huge pages for the memory
|
|
*
|
|
* Return: pointer to the allocated memory or %NULL on error
|
|
*/
|
|
void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask)
|
|
{
|
|
return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
|
|
gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
|
|
NUMA_NO_NODE, __builtin_return_address(0));
|
|
}
|
|
EXPORT_SYMBOL_GPL(vmalloc_huge_noprof);
|
|
|
|
/**
|
|
* vzalloc - allocate virtually contiguous memory with zero fill
|
|
* @size: allocation size
|
|
*
|
|
* Allocate enough pages to cover @size from the page level
|
|
* allocator and map them into contiguous kernel virtual space.
|
|
* The memory allocated is set to zero.
|
|
*
|
|
* For tight control over page level allocator and protection flags
|
|
* use __vmalloc() instead.
|
|
*
|
|
* Return: pointer to the allocated memory or %NULL on error
|
|
*/
|
|
void *vzalloc_noprof(unsigned long size)
|
|
{
|
|
return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
|
|
__builtin_return_address(0));
|
|
}
|
|
EXPORT_SYMBOL(vzalloc_noprof);
|
|
|
|
/**
|
|
* vmalloc_user - allocate zeroed virtually contiguous memory for userspace
|
|
* @size: allocation size
|
|
*
|
|
* The resulting memory area is zeroed so it can be mapped to userspace
|
|
* without leaking data.
|
|
*
|
|
* Return: pointer to the allocated memory or %NULL on error
|
|
*/
|
|
void *vmalloc_user_noprof(unsigned long size)
|
|
{
|
|
return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END,
|
|
GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
|
|
VM_USERMAP, NUMA_NO_NODE,
|
|
__builtin_return_address(0));
|
|
}
|
|
EXPORT_SYMBOL(vmalloc_user_noprof);
|
|
|
|
/**
|
|
* vmalloc_node - allocate memory on a specific node
|
|
* @size: allocation size
|
|
* @node: numa node
|
|
*
|
|
* Allocate enough pages to cover @size from the page level
|
|
* allocator and map them into contiguous kernel virtual space.
|
|
*
|
|
* For tight control over page level allocator and protection flags
|
|
* use __vmalloc() instead.
|
|
*
|
|
* Return: pointer to the allocated memory or %NULL on error
|
|
*/
|
|
void *vmalloc_node_noprof(unsigned long size, int node)
|
|
{
|
|
return __vmalloc_node_noprof(size, 1, GFP_KERNEL, node,
|
|
__builtin_return_address(0));
|
|
}
|
|
EXPORT_SYMBOL(vmalloc_node_noprof);
|
|
|
|
/**
|
|
* vzalloc_node - allocate memory on a specific node with zero fill
|
|
* @size: allocation size
|
|
* @node: numa node
|
|
*
|
|
* Allocate enough pages to cover @size from the page level
|
|
* allocator and map them into contiguous kernel virtual space.
|
|
* The memory allocated is set to zero.
|
|
*
|
|
* Return: pointer to the allocated memory or %NULL on error
|
|
*/
|
|
void *vzalloc_node_noprof(unsigned long size, int node)
|
|
{
|
|
return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, node,
|
|
__builtin_return_address(0));
|
|
}
|
|
EXPORT_SYMBOL(vzalloc_node_noprof);
|
|
|
|
/**
|
|
* vrealloc - reallocate virtually contiguous memory; contents remain unchanged
|
|
* @p: object to reallocate memory for
|
|
* @size: the size to reallocate
|
|
* @flags: the flags for the page level allocator
|
|
*
|
|
* If @p is %NULL, vrealloc() behaves exactly like vmalloc(). If @size is 0 and
|
|
* @p is not a %NULL pointer, the object pointed to is freed.
|
|
*
|
|
* If __GFP_ZERO logic is requested, callers must ensure that, starting with the
|
|
* initial memory allocation, every subsequent call to this API for the same
|
|
* memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
|
|
* __GFP_ZERO is not fully honored by this API.
|
|
*
|
|
* In any case, the contents of the object pointed to are preserved up to the
|
|
* lesser of the new and old sizes.
|
|
*
|
|
* This function must not be called concurrently with itself or vfree() for the
|
|
* same memory allocation.
|
|
*
|
|
* Return: pointer to the allocated memory; %NULL if @size is zero or in case of
|
|
* failure
|
|
*/
|
|
void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
|
|
{
|
|
size_t old_size = 0;
|
|
void *n;
|
|
|
|
if (!size) {
|
|
vfree(p);
|
|
return NULL;
|
|
}
|
|
|
|
if (p) {
|
|
struct vm_struct *vm;
|
|
|
|
vm = find_vm_area(p);
|
|
if (unlikely(!vm)) {
|
|
WARN(1, "Trying to vrealloc() nonexistent vm area (%p)\n", p);
|
|
return NULL;
|
|
}
|
|
|
|
old_size = get_vm_area_size(vm);
|
|
}
|
|
|
|
/*
|
|
* TODO: Shrink the vm_area, i.e. unmap and free unused pages. What
|
|
* would be a good heuristic for when to shrink the vm_area?
|
|
*/
|
|
if (size <= old_size) {
|
|
/* Zero out spare memory. */
|
|
if (want_init_on_alloc(flags))
|
|
memset((void *)p + size, 0, old_size - size);
|
|
|
|
return (void *)p;
|
|
}
|
|
|
|
/* TODO: Grow the vm_area, i.e. allocate and map additional pages. */
|
|
n = __vmalloc_noprof(size, flags);
|
|
if (!n)
|
|
return NULL;
|
|
|
|
if (p) {
|
|
memcpy(n, p, old_size);
|
|
vfree(p);
|
|
}
|
|
|
|
return n;
|
|
}
|
|
|
|
/*
 * GFP mask used for the 32bit-addressable vmalloc variants.
 *
 * 64b systems should always have either DMA or DMA32 zones; GFP_DMA is only
 * chosen on a 64b system that has a DMA zone but no DMA32 zone. Everywhere
 * else GFP_DMA32 is used: it selects the DMA32 zone where one exists and
 * otherwise should do the right thing and use the normal zone.
 */
#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) && \
	!defined(CONFIG_ZONE_DMA32)
#define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
#else
#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#endif
|
|
|
|
/**
|
|
* vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
|
|
* @size: allocation size
|
|
*
|
|
* Allocate enough 32bit PA addressable pages to cover @size from the
|
|
* page level allocator and map them into contiguous kernel virtual space.
|
|
*
|
|
* Return: pointer to the allocated memory or %NULL on error
|
|
*/
|
|
void *vmalloc_32_noprof(unsigned long size)
|
|
{
|
|
return __vmalloc_node_noprof(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
|
|
__builtin_return_address(0));
|
|
}
|
|
EXPORT_SYMBOL(vmalloc_32_noprof);
|
|
|
|
/**
|
|
* vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
|
|
* @size: allocation size
|
|
*
|
|
* The resulting memory area is 32bit addressable and zeroed so it can be
|
|
* mapped to userspace without leaking data.
|
|
*
|
|
* Return: pointer to the allocated memory or %NULL on error
|
|
*/
|
|
void *vmalloc_32_user_noprof(unsigned long size)
|
|
{
|
|
return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END,
|
|
GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
|
|
VM_USERMAP, NUMA_NO_NODE,
|
|
__builtin_return_address(0));
|
|
}
|
|
EXPORT_SYMBOL(vmalloc_32_user_noprof);
|
|
|
|
/*
|
|
* Atomically zero bytes in the iterator.
|
|
*
|
|
* Returns the number of zeroed bytes.
|
|
*/
|
|
static size_t zero_iter(struct iov_iter *iter, size_t count)
|
|
{
|
|
size_t remains = count;
|
|
|
|
while (remains > 0) {
|
|
size_t num, copied;
|
|
|
|
num = min_t(size_t, remains, PAGE_SIZE);
|
|
copied = copy_page_to_iter_nofault(ZERO_PAGE(0), 0, num, iter);
|
|
remains -= copied;
|
|
|
|
if (copied < num)
|
|
break;
|
|
}
|
|
|
|
return count - remains;
|
|
}
|
|
|
|
/*
|
|
* small helper routine, copy contents to iter from addr.
|
|
* If the page is not present, fill zero.
|
|
*
|
|
* Returns the number of copied bytes.
|
|
*/
|
|
static size_t aligned_vread_iter(struct iov_iter *iter,
|
|
const char *addr, size_t count)
|
|
{
|
|
size_t remains = count;
|
|
struct page *page;
|
|
|
|
while (remains > 0) {
|
|
unsigned long offset, length;
|
|
size_t copied = 0;
|
|
|
|
offset = offset_in_page(addr);
|
|
length = PAGE_SIZE - offset;
|
|
if (length > remains)
|
|
length = remains;
|
|
page = vmalloc_to_page(addr);
|
|
/*
|
|
* To do safe access to this _mapped_ area, we need lock. But
|
|
* adding lock here means that we need to add overhead of
|
|
* vmalloc()/vfree() calls for this _debug_ interface, rarely
|
|
* used. Instead of that, we'll use an local mapping via
|
|
* copy_page_to_iter_nofault() and accept a small overhead in
|
|
* this access function.
|
|
*/
|
|
if (page)
|
|
copied = copy_page_to_iter_nofault(page, offset,
|
|
length, iter);
|
|
else
|
|
copied = zero_iter(iter, length);
|
|
|
|
addr += copied;
|
|
remains -= copied;
|
|
|
|
if (copied != length)
|
|
break;
|
|
}
|
|
|
|
return count - remains;
|
|
}
|
|
|
|
/*
|
|
* Read from a vm_map_ram region of memory.
|
|
*
|
|
* Returns the number of copied bytes.
|
|
*/
|
|
static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr,
|
|
size_t count, unsigned long flags)
|
|
{
|
|
char *start;
|
|
struct vmap_block *vb;
|
|
struct xarray *xa;
|
|
unsigned long offset;
|
|
unsigned int rs, re;
|
|
size_t remains, n;
|
|
|
|
/*
|
|
* If it's area created by vm_map_ram() interface directly, but
|
|
* not further subdividing and delegating management to vmap_block,
|
|
* handle it here.
|
|
*/
|
|
if (!(flags & VMAP_BLOCK))
|
|
return aligned_vread_iter(iter, addr, count);
|
|
|
|
remains = count;
|
|
|
|
/*
|
|
* Area is split into regions and tracked with vmap_block, read out
|
|
* each region and zero fill the hole between regions.
|
|
*/
|
|
xa = addr_to_vb_xa((unsigned long) addr);
|
|
vb = xa_load(xa, addr_to_vb_idx((unsigned long)addr));
|
|
if (!vb)
|
|
goto finished_zero;
|
|
|
|
spin_lock(&vb->lock);
|
|
if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
|
|
spin_unlock(&vb->lock);
|
|
goto finished_zero;
|
|
}
|
|
|
|
for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
|
|
size_t copied;
|
|
|
|
if (remains == 0)
|
|
goto finished;
|
|
|
|
start = vmap_block_vaddr(vb->va->va_start, rs);
|
|
|
|
if (addr < start) {
|
|
size_t to_zero = min_t(size_t, start - addr, remains);
|
|
size_t zeroed = zero_iter(iter, to_zero);
|
|
|
|
addr += zeroed;
|
|
remains -= zeroed;
|
|
|
|
if (remains == 0 || zeroed != to_zero)
|
|
goto finished;
|
|
}
|
|
|
|
/*it could start reading from the middle of used region*/
|
|
offset = offset_in_page(addr);
|
|
n = ((re - rs + 1) << PAGE_SHIFT) - offset;
|
|
if (n > remains)
|
|
n = remains;
|
|
|
|
copied = aligned_vread_iter(iter, start + offset, n);
|
|
|
|
addr += copied;
|
|
remains -= copied;
|
|
|
|
if (copied != n)
|
|
goto finished;
|
|
}
|
|
|
|
spin_unlock(&vb->lock);
|
|
|
|
finished_zero:
|
|
/* zero-fill the left dirty or free regions */
|
|
return count - remains + zero_iter(iter, remains);
|
|
finished:
|
|
/* We couldn't copy/zero everything */
|
|
spin_unlock(&vb->lock);
|
|
return count - remains;
|
|
}
|
|
|
|
/**
|
|
* vread_iter() - read vmalloc area in a safe way to an iterator.
|
|
* @iter: the iterator to which data should be written.
|
|
* @addr: vm address.
|
|
* @count: number of bytes to be read.
|
|
*
|
|
* This function checks that addr is a valid vmalloc'ed area, and
|
|
* copy data from that area to a given buffer. If the given memory range
|
|
* of [addr...addr+count) includes some valid address, data is copied to
|
|
* proper area of @buf. If there are memory holes, they'll be zero-filled.
|
|
* IOREMAP area is treated as memory hole and no copy is done.
|
|
*
|
|
* If [addr...addr+count) doesn't includes any intersects with alive
|
|
* vm_struct area, returns 0. @buf should be kernel's buffer.
|
|
*
|
|
* Note: In usual ops, vread() is never necessary because the caller
|
|
* should know vmalloc() area is valid and can use memcpy().
|
|
* This is for routines which have to access vmalloc area without
|
|
* any information, as /proc/kcore.
|
|
*
|
|
* Return: number of bytes for which addr and buf should be increased
|
|
* (same number as @count) or %0 if [addr...addr+count) doesn't
|
|
* include any intersection with valid vmalloc area
|
|
*/
|
|
long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
|
|
{
|
|
struct vmap_node *vn;
|
|
struct vmap_area *va;
|
|
struct vm_struct *vm;
|
|
char *vaddr;
|
|
size_t n, size, flags, remains;
|
|
unsigned long next;
|
|
|
|
addr = kasan_reset_tag(addr);
|
|
|
|
/* Don't allow overflow */
|
|
if ((unsigned long) addr + count < count)
|
|
count = -(unsigned long) addr;
|
|
|
|
remains = count;
|
|
|
|
vn = find_vmap_area_exceed_addr_lock((unsigned long) addr, &va);
|
|
if (!vn)
|
|
goto finished_zero;
|
|
|
|
/* no intersects with alive vmap_area */
|
|
if ((unsigned long)addr + remains <= va->va_start)
|
|
goto finished_zero;
|
|
|
|
do {
|
|
size_t copied;
|
|
|
|
if (remains == 0)
|
|
goto finished;
|
|
|
|
vm = va->vm;
|
|
flags = va->flags & VMAP_FLAGS_MASK;
|
|
/*
|
|
* VMAP_BLOCK indicates a sub-type of vm_map_ram area, need
|
|
* be set together with VMAP_RAM.
|
|
*/
|
|
WARN_ON(flags == VMAP_BLOCK);
|
|
|
|
if (!vm && !flags)
|
|
goto next_va;
|
|
|
|
if (vm && (vm->flags & VM_UNINITIALIZED))
|
|
goto next_va;
|
|
|
|
/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
|
|
smp_rmb();
|
|
|
|
vaddr = (char *) va->va_start;
|
|
size = vm ? get_vm_area_size(vm) : va_size(va);
|
|
|
|
if (addr >= vaddr + size)
|
|
goto next_va;
|
|
|
|
if (addr < vaddr) {
|
|
size_t to_zero = min_t(size_t, vaddr - addr, remains);
|
|
size_t zeroed = zero_iter(iter, to_zero);
|
|
|
|
addr += zeroed;
|
|
remains -= zeroed;
|
|
|
|
if (remains == 0 || zeroed != to_zero)
|
|
goto finished;
|
|
}
|
|
|
|
n = vaddr + size - addr;
|
|
if (n > remains)
|
|
n = remains;
|
|
|
|
if (flags & VMAP_RAM)
|
|
copied = vmap_ram_vread_iter(iter, addr, n, flags);
|
|
else if (!(vm && (vm->flags & (VM_IOREMAP | VM_SPARSE))))
|
|
copied = aligned_vread_iter(iter, addr, n);
|
|
else /* IOREMAP | SPARSE area is treated as memory hole */
|
|
copied = zero_iter(iter, n);
|
|
|
|
addr += copied;
|
|
remains -= copied;
|
|
|
|
if (copied != n)
|
|
goto finished;
|
|
|
|
next_va:
|
|
next = va->va_end;
|
|
spin_unlock(&vn->busy.lock);
|
|
} while ((vn = find_vmap_area_exceed_addr_lock(next, &va)));
|
|
|
|
finished_zero:
|
|
if (vn)
|
|
spin_unlock(&vn->busy.lock);
|
|
|
|
/* zero-fill memory holes */
|
|
return count - remains + zero_iter(iter, remains);
|
|
finished:
|
|
/* Nothing remains, or We couldn't copy/zero everything. */
|
|
if (vn)
|
|
spin_unlock(&vn->busy.lock);
|
|
|
|
return count - remains;
|
|
}
|
|
|
|
/**
|
|
* remap_vmalloc_range_partial - map vmalloc pages to userspace
|
|
* @vma: vma to cover
|
|
* @uaddr: target user address to start at
|
|
* @kaddr: virtual address of vmalloc kernel memory
|
|
* @pgoff: offset from @kaddr to start at
|
|
* @size: size of map area
|
|
*
|
|
* Returns: 0 for success, -Exxx on failure
|
|
*
|
|
* This function checks that @kaddr is a valid vmalloc'ed area,
|
|
* and that it is big enough to cover the range starting at
|
|
* @uaddr in @vma. Will return failure if that criteria isn't
|
|
* met.
|
|
*
|
|
* Similar to remap_pfn_range() (see mm/memory.c)
|
|
*/
|
|
int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
|
|
void *kaddr, unsigned long pgoff,
|
|
unsigned long size)
|
|
{
|
|
struct vm_struct *area;
|
|
unsigned long off;
|
|
unsigned long end_index;
|
|
|
|
if (check_shl_overflow(pgoff, PAGE_SHIFT, &off))
|
|
return -EINVAL;
|
|
|
|
size = PAGE_ALIGN(size);
|
|
|
|
if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
|
|
return -EINVAL;
|
|
|
|
area = find_vm_area(kaddr);
|
|
if (!area)
|
|
return -EINVAL;
|
|
|
|
if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT)))
|
|
return -EINVAL;
|
|
|
|
if (check_add_overflow(size, off, &end_index) ||
|
|
end_index > get_vm_area_size(area))
|
|
return -EINVAL;
|
|
kaddr += off;
|
|
|
|
do {
|
|
struct page *page = vmalloc_to_page(kaddr);
|
|
int ret;
|
|
|
|
ret = vm_insert_page(vma, uaddr, page);
|
|
if (ret)
|
|
return ret;
|
|
|
|
uaddr += PAGE_SIZE;
|
|
kaddr += PAGE_SIZE;
|
|
size -= PAGE_SIZE;
|
|
} while (size > 0);
|
|
|
|
vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* remap_vmalloc_range - map vmalloc pages to userspace
|
|
* @vma: vma to cover (map full range of vma)
|
|
* @addr: vmalloc memory
|
|
* @pgoff: number of pages into addr before first page to map
|
|
*
|
|
* Returns: 0 for success, -Exxx on failure
|
|
*
|
|
* This function checks that addr is a valid vmalloc'ed area, and
|
|
* that it is big enough to cover the vma. Will return failure if
|
|
* that criteria isn't met.
|
|
*
|
|
* Similar to remap_pfn_range() (see mm/memory.c)
|
|
*/
|
|
int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
|
|
unsigned long pgoff)
|
|
{
|
|
return remap_vmalloc_range_partial(vma, vma->vm_start,
|
|
addr, pgoff,
|
|
vma->vm_end - vma->vm_start);
|
|
}
|
|
EXPORT_SYMBOL(remap_vmalloc_range);
|
|
|
|
void free_vm_area(struct vm_struct *area)
|
|
{
|
|
struct vm_struct *ret;
|
|
ret = remove_vm_area(area->addr);
|
|
BUG_ON(ret != area);
|
|
kfree(area);
|
|
}
|
|
EXPORT_SYMBOL_GPL(free_vm_area);
|
|
|
|
#ifdef CONFIG_SMP
|
|
static struct vmap_area *node_to_va(struct rb_node *n)
|
|
{
|
|
return rb_entry_safe(n, struct vmap_area, rb_node);
|
|
}
|
|
|
|
/**
|
|
* pvm_find_va_enclose_addr - find the vmap_area @addr belongs to
|
|
* @addr: target address
|
|
*
|
|
* Returns: vmap_area if it is found. If there is no such area
|
|
* the first highest(reverse order) vmap_area is returned
|
|
* i.e. va->va_start < addr && va->va_end < addr or NULL
|
|
* if there are no any areas before @addr.
|
|
*/
|
|
static struct vmap_area *
|
|
pvm_find_va_enclose_addr(unsigned long addr)
|
|
{
|
|
struct vmap_area *va, *tmp;
|
|
struct rb_node *n;
|
|
|
|
n = free_vmap_area_root.rb_node;
|
|
va = NULL;
|
|
|
|
while (n) {
|
|
tmp = rb_entry(n, struct vmap_area, rb_node);
|
|
if (tmp->va_start <= addr) {
|
|
va = tmp;
|
|
if (tmp->va_end >= addr)
|
|
break;
|
|
|
|
n = n->rb_right;
|
|
} else {
|
|
n = n->rb_left;
|
|
}
|
|
}
|
|
|
|
return va;
|
|
}
|
|
|
|
/**
|
|
* pvm_determine_end_from_reverse - find the highest aligned address
|
|
* of free block below VMALLOC_END
|
|
* @va:
|
|
* in - the VA we start the search(reverse order);
|
|
* out - the VA with the highest aligned end address.
|
|
* @align: alignment for required highest address
|
|
*
|
|
* Returns: determined end address within vmap_area
|
|
*/
|
|
static unsigned long
|
|
pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
|
|
{
|
|
unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
|
|
unsigned long addr;
|
|
|
|
if (likely(*va)) {
|
|
list_for_each_entry_from_reverse((*va),
|
|
&free_vmap_area_list, list) {
|
|
addr = min((*va)->va_end & ~(align - 1), vmalloc_end);
|
|
if ((*va)->va_start < addr)
|
|
return addr;
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
|
|
* @offsets: array containing offset of each area
|
|
* @sizes: array containing size of each area
|
|
* @nr_vms: the number of areas to allocate
|
|
* @align: alignment, all entries in @offsets and @sizes must be aligned to this
|
|
*
|
|
* Returns: kmalloc'd vm_struct pointer array pointing to allocated
|
|
* vm_structs on success, %NULL on failure
|
|
*
|
|
* Percpu allocator wants to use congruent vm areas so that it can
|
|
* maintain the offsets among percpu areas. This function allocates
|
|
* congruent vmalloc areas for it with GFP_KERNEL. These areas tend to
|
|
* be scattered pretty far, distance between two areas easily going up
|
|
* to gigabytes. To avoid interacting with regular vmallocs, these
|
|
* areas are allocated from top.
|
|
*
|
|
* Despite its complicated look, this allocator is rather simple. It
|
|
* does everything top-down and scans free blocks from the end looking
|
|
* for matching base. While scanning, if any of the areas do not fit the
|
|
* base address is pulled down to fit the area. Scanning is repeated till
|
|
* all the areas fit and then all necessary data structures are inserted
|
|
* and the result is returned.
|
|
*/
|
|
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
|
|
const size_t *sizes, int nr_vms,
|
|
size_t align)
|
|
{
|
|
const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
|
|
const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
|
|
struct vmap_area **vas, *va;
|
|
struct vm_struct **vms;
|
|
int area, area2, last_area, term_area;
|
|
unsigned long base, start, size, end, last_end, orig_start, orig_end;
|
|
bool purged = false;
|
|
|
|
/* verify parameters and allocate data structures */
|
|
BUG_ON(offset_in_page(align) || !is_power_of_2(align));
|
|
for (last_area = 0, area = 0; area < nr_vms; area++) {
|
|
start = offsets[area];
|
|
end = start + sizes[area];
|
|
|
|
/* is everything aligned properly? */
|
|
BUG_ON(!IS_ALIGNED(offsets[area], align));
|
|
BUG_ON(!IS_ALIGNED(sizes[area], align));
|
|
|
|
/* detect the area with the highest address */
|
|
if (start > offsets[last_area])
|
|
last_area = area;
|
|
|
|
for (area2 = area + 1; area2 < nr_vms; area2++) {
|
|
unsigned long start2 = offsets[area2];
|
|
unsigned long end2 = start2 + sizes[area2];
|
|
|
|
BUG_ON(start2 < end && start < end2);
|
|
}
|
|
}
|
|
last_end = offsets[last_area] + sizes[last_area];
|
|
|
|
if (vmalloc_end - vmalloc_start < last_end) {
|
|
WARN_ON(true);
|
|
return NULL;
|
|
}
|
|
|
|
vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
|
|
vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
|
|
if (!vas || !vms)
|
|
goto err_free2;
|
|
|
|
for (area = 0; area < nr_vms; area++) {
|
|
vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
|
|
vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
|
|
if (!vas[area] || !vms[area])
|
|
goto err_free;
|
|
}
|
|
retry:
|
|
spin_lock(&free_vmap_area_lock);
|
|
|
|
/* start scanning - we scan from the top, begin with the last area */
|
|
area = term_area = last_area;
|
|
start = offsets[area];
|
|
end = start + sizes[area];
|
|
|
|
va = pvm_find_va_enclose_addr(vmalloc_end);
|
|
base = pvm_determine_end_from_reverse(&va, align) - end;
|
|
|
|
while (true) {
|
|
/*
|
|
* base might have underflowed, add last_end before
|
|
* comparing.
|
|
*/
|
|
if (base + last_end < vmalloc_start + last_end)
|
|
goto overflow;
|
|
|
|
/*
|
|
* Fitting base has not been found.
|
|
*/
|
|
if (va == NULL)
|
|
goto overflow;
|
|
|
|
/*
|
|
* If required width exceeds current VA block, move
|
|
* base downwards and then recheck.
|
|
*/
|
|
if (base + end > va->va_end) {
|
|
base = pvm_determine_end_from_reverse(&va, align) - end;
|
|
term_area = area;
|
|
continue;
|
|
}
|
|
|
|
/*
|
|
* If this VA does not fit, move base downwards and recheck.
|
|
*/
|
|
if (base + start < va->va_start) {
|
|
va = node_to_va(rb_prev(&va->rb_node));
|
|
base = pvm_determine_end_from_reverse(&va, align) - end;
|
|
term_area = area;
|
|
continue;
|
|
}
|
|
|
|
/*
|
|
* This area fits, move on to the previous one. If
|
|
* the previous one is the terminal one, we're done.
|
|
*/
|
|
area = (area + nr_vms - 1) % nr_vms;
|
|
if (area == term_area)
|
|
break;
|
|
|
|
start = offsets[area];
|
|
end = start + sizes[area];
|
|
va = pvm_find_va_enclose_addr(base + end);
|
|
}
|
|
|
|
/* we've found a fitting base, insert all va's */
|
|
for (area = 0; area < nr_vms; area++) {
|
|
int ret;
|
|
|
|
start = base + offsets[area];
|
|
size = sizes[area];
|
|
|
|
va = pvm_find_va_enclose_addr(start);
|
|
if (WARN_ON_ONCE(va == NULL))
|
|
/* It is a BUG(), but trigger recovery instead. */
|
|
goto recovery;
|
|
|
|
ret = va_clip(&free_vmap_area_root,
|
|
&free_vmap_area_list, va, start, size);
|
|
if (WARN_ON_ONCE(unlikely(ret)))
|
|
/* It is a BUG(), but trigger recovery instead. */
|
|
goto recovery;
|
|
|
|
/* Allocated area. */
|
|
va = vas[area];
|
|
va->va_start = start;
|
|
va->va_end = start + size;
|
|
}
|
|
|
|
spin_unlock(&free_vmap_area_lock);
|
|
|
|
/* populate the kasan shadow space */
|
|
for (area = 0; area < nr_vms; area++) {
|
|
if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
|
|
goto err_free_shadow;
|
|
}
|
|
|
|
/* insert all vm's */
|
|
for (area = 0; area < nr_vms; area++) {
|
|
struct vmap_node *vn = addr_to_node(vas[area]->va_start);
|
|
|
|
spin_lock(&vn->busy.lock);
|
|
insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
|
|
setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
|
|
pcpu_get_vm_areas);
|
|
spin_unlock(&vn->busy.lock);
|
|
}
|
|
|
|
/*
|
|
* Mark allocated areas as accessible. Do it now as a best-effort
|
|
* approach, as they can be mapped outside of vmalloc code.
|
|
* With hardware tag-based KASAN, marking is skipped for
|
|
* non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
|
|
*/
|
|
for (area = 0; area < nr_vms; area++)
|
|
vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr,
|
|
vms[area]->size, KASAN_VMALLOC_PROT_NORMAL);
|
|
|
|
kfree(vas);
|
|
return vms;
|
|
|
|
recovery:
|
|
/*
|
|
* Remove previously allocated areas. There is no
|
|
* need in removing these areas from the busy tree,
|
|
* because they are inserted only on the final step
|
|
* and when pcpu_get_vm_areas() is success.
|
|
*/
|
|
while (area--) {
|
|
orig_start = vas[area]->va_start;
|
|
orig_end = vas[area]->va_end;
|
|
va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
|
|
&free_vmap_area_list);
|
|
if (va)
|
|
kasan_release_vmalloc(orig_start, orig_end,
|
|
va->va_start, va->va_end);
|
|
vas[area] = NULL;
|
|
}
|
|
|
|
overflow:
|
|
spin_unlock(&free_vmap_area_lock);
|
|
if (!purged) {
|
|
reclaim_and_purge_vmap_areas();
|
|
purged = true;
|
|
|
|
/* Before "retry", check if we recover. */
|
|
for (area = 0; area < nr_vms; area++) {
|
|
if (vas[area])
|
|
continue;
|
|
|
|
vas[area] = kmem_cache_zalloc(
|
|
vmap_area_cachep, GFP_KERNEL);
|
|
if (!vas[area])
|
|
goto err_free;
|
|
}
|
|
|
|
goto retry;
|
|
}
|
|
|
|
err_free:
|
|
for (area = 0; area < nr_vms; area++) {
|
|
if (vas[area])
|
|
kmem_cache_free(vmap_area_cachep, vas[area]);
|
|
|
|
kfree(vms[area]);
|
|
}
|
|
err_free2:
|
|
kfree(vas);
|
|
kfree(vms);
|
|
return NULL;
|
|
|
|
err_free_shadow:
|
|
spin_lock(&free_vmap_area_lock);
|
|
/*
|
|
* We release all the vmalloc shadows, even the ones for regions that
|
|
* hadn't been successfully added. This relies on kasan_release_vmalloc
|
|
* being able to tolerate this case.
|
|
*/
|
|
for (area = 0; area < nr_vms; area++) {
|
|
orig_start = vas[area]->va_start;
|
|
orig_end = vas[area]->va_end;
|
|
va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
|
|
&free_vmap_area_list);
|
|
if (va)
|
|
kasan_release_vmalloc(orig_start, orig_end,
|
|
va->va_start, va->va_end);
|
|
vas[area] = NULL;
|
|
kfree(vms[area]);
|
|
}
|
|
spin_unlock(&free_vmap_area_lock);
|
|
kfree(vas);
|
|
kfree(vms);
|
|
return NULL;
|
|
}
|
|
|
|
/**
 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
 * @nr_vms: the number of allocated areas
 *
 * Free each vm_struct with free_vm_area(), in array order, and then free
 * the array allocated by pcpu_get_vm_areas().
 */
void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
{
	int idx = 0;

	while (idx < nr_vms) {
		free_vm_area(vms[idx]);
		idx++;
	}

	kfree(vms);
}
|
|
#endif /* CONFIG_SMP */
|
|
|
|
#ifdef CONFIG_PRINTK
|
|
bool vmalloc_dump_obj(void *object)
|
|
{
|
|
const void *caller;
|
|
struct vm_struct *vm;
|
|
struct vmap_area *va;
|
|
struct vmap_node *vn;
|
|
unsigned long addr;
|
|
unsigned int nr_pages;
|
|
|
|
addr = PAGE_ALIGN((unsigned long) object);
|
|
vn = addr_to_node(addr);
|
|
|
|
if (!spin_trylock(&vn->busy.lock))
|
|
return false;
|
|
|
|
va = __find_vmap_area(addr, &vn->busy.root);
|
|
if (!va || !va->vm) {
|
|
spin_unlock(&vn->busy.lock);
|
|
return false;
|
|
}
|
|
|
|
vm = va->vm;
|
|
addr = (unsigned long) vm->addr;
|
|
caller = vm->caller;
|
|
nr_pages = vm->nr_pages;
|
|
spin_unlock(&vn->busy.lock);
|
|
|
|
pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
|
|
nr_pages, addr, caller);
|
|
|
|
return true;
|
|
}
|
|
#endif
|
|
|
|
#ifdef CONFIG_PROC_FS
|
|
static void show_numa_info(struct seq_file *m, struct vm_struct *v)
|
|
{
|
|
if (IS_ENABLED(CONFIG_NUMA)) {
|
|
unsigned int nr, *counters = m->private;
|
|
unsigned int step = 1U << vm_area_page_order(v);
|
|
|
|
if (!counters)
|
|
return;
|
|
|
|
if (v->flags & VM_UNINITIALIZED)
|
|
return;
|
|
/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
|
|
smp_rmb();
|
|
|
|
memset(counters, 0, nr_node_ids * sizeof(unsigned int));
|
|
|
|
for (nr = 0; nr < v->nr_pages; nr += step)
|
|
counters[page_to_nid(v->pages[nr])] += step;
|
|
for_each_node_state(nr, N_HIGH_MEMORY)
|
|
if (counters[nr])
|
|
seq_printf(m, " N%u=%u", nr, counters[nr]);
|
|
}
|
|
}
|
|
|
|
static void show_purge_info(struct seq_file *m)
|
|
{
|
|
struct vmap_node *vn;
|
|
struct vmap_area *va;
|
|
int i;
|
|
|
|
for (i = 0; i < nr_vmap_nodes; i++) {
|
|
vn = &vmap_nodes[i];
|
|
|
|
spin_lock(&vn->lazy.lock);
|
|
list_for_each_entry(va, &vn->lazy.head, list) {
|
|
seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
|
|
(void *)va->va_start, (void *)va->va_end,
|
|
va_size(va));
|
|
}
|
|
spin_unlock(&vn->lazy.lock);
|
|
}
|
|
}
|
|
|
|
static int vmalloc_info_show(struct seq_file *m, void *p)
|
|
{
|
|
struct vmap_node *vn;
|
|
struct vmap_area *va;
|
|
struct vm_struct *v;
|
|
int i;
|
|
|
|
for (i = 0; i < nr_vmap_nodes; i++) {
|
|
vn = &vmap_nodes[i];
|
|
|
|
spin_lock(&vn->busy.lock);
|
|
list_for_each_entry(va, &vn->busy.head, list) {
|
|
if (!va->vm) {
|
|
if (va->flags & VMAP_RAM)
|
|
seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
|
|
(void *)va->va_start, (void *)va->va_end,
|
|
va_size(va));
|
|
|
|
continue;
|
|
}
|
|
|
|
v = va->vm;
|
|
|
|
seq_printf(m, "0x%pK-0x%pK %7ld",
|
|
v->addr, v->addr + v->size, v->size);
|
|
|
|
if (v->caller)
|
|
seq_printf(m, " %pS", v->caller);
|
|
|
|
if (v->nr_pages)
|
|
seq_printf(m, " pages=%d", v->nr_pages);
|
|
|
|
if (v->phys_addr)
|
|
seq_printf(m, " phys=%pa", &v->phys_addr);
|
|
|
|
if (v->flags & VM_IOREMAP)
|
|
seq_puts(m, " ioremap");
|
|
|
|
if (v->flags & VM_SPARSE)
|
|
seq_puts(m, " sparse");
|
|
|
|
if (v->flags & VM_ALLOC)
|
|
seq_puts(m, " vmalloc");
|
|
|
|
if (v->flags & VM_MAP)
|
|
seq_puts(m, " vmap");
|
|
|
|
if (v->flags & VM_USERMAP)
|
|
seq_puts(m, " user");
|
|
|
|
if (v->flags & VM_DMA_COHERENT)
|
|
seq_puts(m, " dma-coherent");
|
|
|
|
if (is_vmalloc_addr(v->pages))
|
|
seq_puts(m, " vpages");
|
|
|
|
show_numa_info(m, v);
|
|
seq_putc(m, '\n');
|
|
}
|
|
spin_unlock(&vn->busy.lock);
|
|
}
|
|
|
|
/*
|
|
* As a final step, dump "unpurged" areas.
|
|
*/
|
|
show_purge_info(m);
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Register the "vmallocinfo" proc file (mode 0400), shown through
 * vmalloc_info_show().
 *
 * With CONFIG_NUMA enabled, a scratch array of nr_node_ids counters is
 * allocated and handed to the proc file as its private data. The array is
 * sized with kmalloc_array() so that the element count times element size
 * multiplication is overflow-checked instead of being open-coded. If the
 * allocation fails the private data is left NULL; show_numa_info() prints
 * nothing when it finds no counters buffer.
 *
 * Return: always 0.
 */
static int __init proc_vmalloc_init(void)
{
	void *priv_data = NULL;

	if (IS_ENABLED(CONFIG_NUMA))
		priv_data = kmalloc_array(nr_node_ids, sizeof(unsigned int),
					  GFP_KERNEL);

	proc_create_single_data("vmallocinfo",
		0400, NULL, vmalloc_info_show, priv_data);

	return 0;
}
module_init(proc_vmalloc_init);
|
|
|
|
#endif
|
|
|
|
static void __init vmap_init_free_space(void)
|
|
{
|
|
unsigned long vmap_start = 1;
|
|
const unsigned long vmap_end = ULONG_MAX;
|
|
struct vmap_area *free;
|
|
struct vm_struct *busy;
|
|
|
|
/*
|
|
* B F B B B F
|
|
* -|-----|.....|-----|-----|-----|.....|-
|
|
* | The KVA space |
|
|
* |<--------------------------------->|
|
|
*/
|
|
for (busy = vmlist; busy; busy = busy->next) {
|
|
if ((unsigned long) busy->addr - vmap_start > 0) {
|
|
free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
|
|
if (!WARN_ON_ONCE(!free)) {
|
|
free->va_start = vmap_start;
|
|
free->va_end = (unsigned long) busy->addr;
|
|
|
|
insert_vmap_area_augment(free, NULL,
|
|
&free_vmap_area_root,
|
|
&free_vmap_area_list);
|
|
}
|
|
}
|
|
|
|
vmap_start = (unsigned long) busy->addr + busy->size;
|
|
}
|
|
|
|
if (vmap_end - vmap_start > 0) {
|
|
free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
|
|
if (!WARN_ON_ONCE(!free)) {
|
|
free->va_start = vmap_start;
|
|
free->va_end = vmap_end;
|
|
|
|
insert_vmap_area_augment(free, NULL,
|
|
&free_vmap_area_root,
|
|
&free_vmap_area_list);
|
|
}
|
|
}
|
|
}
|
|
|
|
static void vmap_init_nodes(void)
|
|
{
|
|
struct vmap_node *vn;
|
|
int i, n;
|
|
|
|
#if BITS_PER_LONG == 64
|
|
/*
|
|
* A high threshold of max nodes is fixed and bound to 128,
|
|
* thus a scale factor is 1 for systems where number of cores
|
|
* are less or equal to specified threshold.
|
|
*
|
|
* As for NUMA-aware notes. For bigger systems, for example
|
|
* NUMA with multi-sockets, where we can end-up with thousands
|
|
* of cores in total, a "sub-numa-clustering" should be added.
|
|
*
|
|
* In this case a NUMA domain is considered as a single entity
|
|
* with dedicated sub-nodes in it which describe one group or
|
|
* set of cores. Therefore a per-domain purging is supposed to
|
|
* be added as well as a per-domain balancing.
|
|
*/
|
|
n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);
|
|
|
|
if (n > 1) {
|
|
vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN);
|
|
if (vn) {
|
|
/* Node partition is 16 pages. */
|
|
vmap_zone_size = (1 << 4) * PAGE_SIZE;
|
|
nr_vmap_nodes = n;
|
|
vmap_nodes = vn;
|
|
} else {
|
|
pr_err("Failed to allocate an array. Disable a node layer\n");
|
|
}
|
|
}
|
|
#endif
|
|
|
|
for (n = 0; n < nr_vmap_nodes; n++) {
|
|
vn = &vmap_nodes[n];
|
|
vn->busy.root = RB_ROOT;
|
|
INIT_LIST_HEAD(&vn->busy.head);
|
|
spin_lock_init(&vn->busy.lock);
|
|
|
|
vn->lazy.root = RB_ROOT;
|
|
INIT_LIST_HEAD(&vn->lazy.head);
|
|
spin_lock_init(&vn->lazy.lock);
|
|
|
|
for (i = 0; i < MAX_VA_SIZE_PAGES; i++) {
|
|
INIT_LIST_HEAD(&vn->pool[i].head);
|
|
WRITE_ONCE(vn->pool[i].len, 0);
|
|
}
|
|
|
|
spin_lock_init(&vn->pool_lock);
|
|
}
|
|
}
|
|
|
|
static unsigned long
|
|
vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
|
|
{
|
|
unsigned long count;
|
|
struct vmap_node *vn;
|
|
int i, j;
|
|
|
|
for (count = 0, i = 0; i < nr_vmap_nodes; i++) {
|
|
vn = &vmap_nodes[i];
|
|
|
|
for (j = 0; j < MAX_VA_SIZE_PAGES; j++)
|
|
count += READ_ONCE(vn->pool[j].len);
|
|
}
|
|
|
|
return count ? count : SHRINK_EMPTY;
|
|
}
|
|
|
|
static unsigned long
|
|
vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < nr_vmap_nodes; i++)
|
|
decay_va_pool_node(&vmap_nodes[i], true);
|
|
|
|
return SHRINK_STOP;
|
|
}
|
|
|
|
/*
 * Boot-time initialisation of the vmalloc layer, in this order:
 *
 *  1. create the vmap_area slab cache;
 *  2. initialise the per-CPU vmap block queues and deferred-vfree state;
 *  3. set up the vmap nodes;
 *  4. import every entry already on vmlist as a busy area of its node;
 *  5. build the free space from the remaining gaps and set
 *     vmap_initialized;
 *  6. allocate and register the "vmap-node" shrinker.
 *
 * A failure to allocate the shrinker is only logged.
 */
void __init vmalloc_init(void)
{
	struct shrinker *node_shrinker;
	struct vm_struct *vm;
	int cpu;

	/* Create the cache for vmap_area objects. */
	vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);

	for_each_possible_cpu(cpu) {
		struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
		struct vfree_deferred *deferred = &per_cpu(vfree_deferred, cpu);

		/* Per-CPU vmap block queue. */
		spin_lock_init(&vbq->lock);
		INIT_LIST_HEAD(&vbq->free);
		xa_init(&vbq->vmap_blocks);

		/* Per-CPU deferred vfree list and its work item. */
		init_llist_head(&deferred->list);
		INIT_WORK(&deferred->wq, delayed_vfree_work);
	}

	/* Setup nodes before importing vmlist. */
	vmap_init_nodes();

	/* Import existing vmlist entries. */
	for (vm = vmlist; vm; vm = vm->next) {
		struct vmap_area *va;
		struct vmap_node *vn;

		va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
		if (WARN_ON_ONCE(!va))
			continue;

		va->va_start = (unsigned long)vm->addr;
		va->va_end = va->va_start + vm->size;
		va->vm = vm;

		vn = addr_to_node(va->va_start);
		insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
	}

	/* Now we can initialize a free vmap space. */
	vmap_init_free_space();
	vmap_initialized = true;

	node_shrinker = shrinker_alloc(0, "vmap-node");
	if (!node_shrinker) {
		pr_err("Failed to allocate vmap-node shrinker!\n");
		return;
	}

	node_shrinker->count_objects = vmap_node_shrink_count;
	node_shrinker->scan_objects = vmap_node_shrink_scan;
	shrinker_register(node_shrinker);
}
|