mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-06 05:02:31 +00:00
a08a2ae346
Physical memory hotadd has to allocate a memmap (struct page array) for the newly added memory section. Currently, alloc_pages_node() is used for those allocations. This has some disadvantages: a) existing memory is consumed for that purpose (e.g. ~2MB per 128MB memory section on x86_64). This can even lead to extreme cases where the system goes OOM because the physically hotplugged memory depletes the available memory before it is onlined. b) if the whole node is movable then we have off-node struct pages, which have performance drawbacks. c) There might be no PMD_ALIGNED chunks, so the memmap array gets populated with base pages. This can be improved when CONFIG_SPARSEMEM_VMEMMAP is enabled. Vmemmap page tables can map arbitrary memory. That means that we can reserve a part of the physically hotadded memory to back vmemmap page tables. This implementation uses the beginning of the hotplugged memory for that purpose. There are some non-obvious things to consider though. Vmemmap pages are allocated/freed during the memory hotplug events (add_memory_resource(), try_remove_memory()) when the memory is added/removed. This means that the reserved physical range is not online although it is used. The most obvious side effect is that pfn_to_online_page() returns NULL for those pfns. The current design expects that this should be OK as the hotplugged memory is considered garbage until it is onlined. For example, hibernation wouldn't save the content of those vmemmaps into the image so it wouldn't be restored on resume, but this should be OK as there is no real content to recover anyway while metadata is reachable from other data structures (e.g. vmemmap page tables). The reserved space is therefore (de)initialized during the {on,off}line events (mhp_{de}init_memmap_on_memory). That is done by extracting page allocator independent initialization from the regular onlining path. 
The primary reason to handle the reserved space outside of {on,off}line_pages is to make each initialization specific to the purpose rather than special-casing them in a single function. As per above, the functions that are introduced are: - mhp_init_memmap_on_memory: Initializes vmemmap pages by calling move_pfn_range_to_zone(), calls kasan_add_zero_shadow(), and onlines as many sections as vmemmap pages fully span. - mhp_deinit_memmap_on_memory: Offlines as many sections as vmemmap pages fully span, removes the range from the zone by remove_pfn_range_from_zone(), and calls kasan_remove_zero_shadow() for the range. The new function memory_block_online() calls mhp_init_memmap_on_memory() before doing the actual online_pages(). Should online_pages() fail, we clean up by calling mhp_deinit_memmap_on_memory(). Adjusting of present_pages is done at the end once we know that online_pages() succeeded. On offline, memory_block_offline() needs to unaccount vmemmap pages from present_pages before calling offline_pages(). This is necessary because offline_pages() tears down some structures based on whether the node or the zone becomes empty. If offline_pages() fails, we account back vmemmap pages. If it succeeds, we call mhp_deinit_memmap_on_memory(). Hot-remove: We need to be careful when removing memory, as adding and removing memory needs to be done with the same granularity. To check that this assumption is not violated, we check the memory range we want to remove and if a) any memory block has vmemmap pages and b) the range spans more than a single memory block, we scream out loud and refuse to proceed. If all is good and the range was using memmap on memory (aka vmemmap pages), we construct an altmap structure so free_hugepage_table does the right thing and calls vmem_altmap_free instead of free_pagetable. 
Link: https://lkml.kernel.org/r/20210421102701.25051-5-osalvador@suse.de Signed-off-by: Oscar Salvador <osalvador@suse.de> Reviewed-by: David Hildenbrand <david@redhat.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Anshuman Khandual <anshuman.khandual@arm.com> Cc: Pavel Tatashin <pasha.tatashin@soleen.com> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
198 lines
6.0 KiB
C
198 lines
6.0 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _LINUX_MEMREMAP_H_
|
|
#define _LINUX_MEMREMAP_H_
|
|
#include <linux/range.h>
|
|
#include <linux/ioport.h>
|
|
#include <linux/percpu-refcount.h>
|
|
|
|
struct resource;
|
|
struct device;
|
|
|
|
/**
 * struct vmem_altmap - pre-allocated storage for vmemmap_populate
 * @base_pfn: first pfn of the entire dev_pagemap mapping
 * @end_pfn: NOTE(review): this member had no kernel-doc entry; presumably the
 *	pfn that bounds the mapping at its upper end - confirm against users
 * @reserve: pages that are mapped but kept back for driver use
 *	(relative to @base_pfn)
 * @free: free pages set aside in the mapping for memmap storage
 * @align: pages reserved to meet allocation alignments
 * @alloc: running count of pages consumed, private to vmemmap_populate()
 */
struct vmem_altmap {
	unsigned long		base_pfn;
	const unsigned long	end_pfn;
	const unsigned long	reserve;
	unsigned long		free;
	unsigned long		align;
	unsigned long		alloc;
};
|
|
|
|
/*
 * ZONE_DEVICE memory is specialized into several types, each with a
 * different usage.
 *
 * MEMORY_DEVICE_PRIVATE:
 * Device memory that is not directly addressable by the CPU: CPU can neither
 * read nor write private memory. We still have struct pages backing the
 * device memory in this case. That simplifies the implementation, but keep in
 * mind that at certain points the struct page must be treated as an opaque
 * object, rather than a "normal" struct page.
 *
 * A more complete discussion of unaddressable memory may be found in
 * include/linux/hmm.h and Documentation/vm/hmm.rst.
 *
 * MEMORY_DEVICE_FS_DAX:
 * Host memory that has similar access semantics as System RAM, i.e. DMA
 * coherent and supports page pinning. To help coordinate page pinning with
 * other operations, MEMORY_DEVICE_FS_DAX arranges for a wakeup event whenever
 * a page is unpinned and becomes idle. This wakeup is used to coordinate
 * physical address space management (ex: fs truncate/hole punch) vs pinned
 * pages (ex: device dma).
 *
 * MEMORY_DEVICE_GENERIC:
 * Host memory that has similar access semantics as System RAM, i.e. DMA
 * coherent and supports page pinning. This is for example used by DAX devices
 * that expose memory using a character device.
 *
 * MEMORY_DEVICE_PCI_P2PDMA:
 * Device memory residing in a PCI BAR intended for use with Peer-to-Peer
 * transactions.
 */
enum memory_type {
	/* 0 is reserved to catch uninitialized type fields */
	MEMORY_DEVICE_PRIVATE = 1,
	MEMORY_DEVICE_FS_DAX,
	MEMORY_DEVICE_GENERIC,
	MEMORY_DEVICE_PCI_P2PDMA,
};
|
|
|
|
struct dev_pagemap_ops {
|
|
/*
|
|
* Called once the page refcount reaches 1. (ZONE_DEVICE pages never
|
|
* reach 0 refcount unless there is a refcount bug. This allows the
|
|
* device driver to implement its own memory management.)
|
|
*/
|
|
void (*page_free)(struct page *page);
|
|
|
|
/*
|
|
* Transition the refcount in struct dev_pagemap to the dead state.
|
|
*/
|
|
void (*kill)(struct dev_pagemap *pgmap);
|
|
|
|
/*
|
|
* Wait for refcount in struct dev_pagemap to be idle and reap it.
|
|
*/
|
|
void (*cleanup)(struct dev_pagemap *pgmap);
|
|
|
|
/*
|
|
* Used for private (un-addressable) device memory only. Must migrate
|
|
* the page back to a CPU accessible page.
|
|
*/
|
|
vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf);
|
|
};
|
|
|
|
/*
 * Bit in dev_pagemap.flags: pgmap_altmap() hands out &pgmap->altmap only
 * when this bit is set and returns NULL otherwise.
 */
#define PGMAP_ALTMAP_VALID (1 << 0)
|
|
|
|
/**
|
|
* struct dev_pagemap - metadata for ZONE_DEVICE mappings
|
|
* @altmap: pre-allocated/reserved memory for vmemmap allocations
|
|
* @ref: reference count that pins the devm_memremap_pages() mapping
|
|
* @internal_ref: internal reference if @ref is not provided by the caller
|
|
* @done: completion for @internal_ref
|
|
* @type: memory type: see MEMORY_* in memory_hotplug.h
|
|
* @flags: PGMAP_* flags to specify defailed behavior
|
|
* @ops: method table
|
|
* @owner: an opaque pointer identifying the entity that manages this
|
|
* instance. Used by various helpers to make sure that no
|
|
* foreign ZONE_DEVICE memory is accessed.
|
|
* @nr_range: number of ranges to be mapped
|
|
* @range: range to be mapped when nr_range == 1
|
|
* @ranges: array of ranges to be mapped when nr_range > 1
|
|
*/
|
|
struct dev_pagemap {
|
|
struct vmem_altmap altmap;
|
|
struct percpu_ref *ref;
|
|
struct percpu_ref internal_ref;
|
|
struct completion done;
|
|
enum memory_type type;
|
|
unsigned int flags;
|
|
const struct dev_pagemap_ops *ops;
|
|
void *owner;
|
|
int nr_range;
|
|
union {
|
|
struct range range;
|
|
struct range ranges[0];
|
|
};
|
|
};
|
|
|
|
static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
|
|
{
|
|
if (pgmap->flags & PGMAP_ALTMAP_VALID)
|
|
return &pgmap->altmap;
|
|
return NULL;
|
|
}
|
|
|
|
#ifdef CONFIG_ZONE_DEVICE
|
|
/*
 * ZONE_DEVICE interfaces; only the prototypes live in this header.
 * The #else branch supplies inline fallbacks for all of them except
 * memremap_pages() and memunmap_pages().
 */
void *memremap_pages(struct dev_pagemap *pgmap, int nid);
void memunmap_pages(struct dev_pagemap *pgmap);

void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap);

struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
				    struct dev_pagemap *pgmap);
bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn);

unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
unsigned long memremap_compat_align(void);
|
|
#else
|
|
static inline void *devm_memremap_pages(struct device *dev,
|
|
struct dev_pagemap *pgmap)
|
|
{
|
|
/*
|
|
* Fail attempts to call devm_memremap_pages() without
|
|
* ZONE_DEVICE support enabled, this requires callers to fall
|
|
* back to plain devm_memremap() based on config
|
|
*/
|
|
WARN_ON_ONCE(1);
|
|
return ERR_PTR(-ENXIO);
|
|
}
|
|
|
|
/* Fallback for builds without ZONE_DEVICE support: does nothing. */
static inline void devm_memunmap_pages(struct device *dev,
				       struct dev_pagemap *pgmap)
{
}
|
|
|
|
/*
 * Fallback for builds without ZONE_DEVICE support: always returns NULL,
 * regardless of @pfn and @pgmap.
 */
static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
						  struct dev_pagemap *pgmap)
{
	return NULL;
}
|
|
|
|
/*
 * Fallback for builds without ZONE_DEVICE support: always returns false,
 * regardless of @pgmap and @pfn.
 */
static inline bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn)
{
	return false;
}
|
|
|
|
/*
 * Fallback for builds without ZONE_DEVICE support: always returns 0 and
 * never looks at @altmap.
 */
static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
{
	return 0;
}
|
|
|
|
/* Fallback for builds without ZONE_DEVICE support: does nothing. */
static inline void vmem_altmap_free(struct vmem_altmap *altmap,
				    unsigned long nr_pfns)
{
}
|
|
|
|
/* when memremap_pages() is disabled all archs can remap a single page */
|
|
static inline unsigned long memremap_compat_align(void)
|
|
{
|
|
return PAGE_SIZE;
|
|
}
|
|
#endif /* CONFIG_ZONE_DEVICE */
|
|
|
|
static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
|
|
{
|
|
if (pgmap)
|
|
percpu_ref_put(pgmap->ref);
|
|
}
|
|
|
|
#endif /* _LINUX_MEMREMAP_H_ */
|