mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-04 04:02:26 +00:00
Merge branch 'akpm' (patches from Andrew)
Merge second patch-bomb from Andrew Morton: "Almost all of the rest of MM. There was an unusually large amount of MM material this time" * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (141 commits) zpool: remove no-op module init/exit mm: zbud: constify the zbud_ops mm: zpool: constify the zpool_ops mm: swap: zswap: maybe_preload & refactoring zram: unify error reporting zsmalloc: remove null check from destroy_handle_cache() zsmalloc: do not take class lock in zs_shrinker_count() zsmalloc: use class->pages_per_zspage zsmalloc: consider ZS_ALMOST_FULL as migrate source zsmalloc: partial page ordering within a fullness_list zsmalloc: use shrinker to trigger auto-compaction zsmalloc: account the number of compacted pages zsmalloc/zram: introduce zs_pool_stats api zsmalloc: cosmetic compaction code adjustments zsmalloc: introduce zs_can_compact() function zsmalloc: always keep per-class stats zsmalloc: drop unused variable `nr_to_migrate' mm/memblock.c: fix comment in __next_mem_range() mm/page_alloc.c: fix type information of memoryless node memory-hotplug: fix comments in zone_spanned_pages_in_node() and zone_spanned_pages_in_node() ...
This commit is contained in:
commit
f6f7a63692
@ -104,6 +104,13 @@ crossing restrictions, pass 0 for alloc; passing 4096 says memory allocated
|
||||
from this pool must not cross 4KByte boundaries.
|
||||
|
||||
|
||||
void *dma_pool_zalloc(struct dma_pool *pool, gfp_t mem_flags,
|
||||
dma_addr_t *handle)
|
||||
|
||||
Wraps dma_pool_alloc() and also zeroes the returned memory if the
|
||||
allocation attempt succeeded.
|
||||
|
||||
|
||||
void *dma_pool_alloc(struct dma_pool *pool, gfp_t gfp_flags,
|
||||
dma_addr_t *dma_handle);
|
||||
|
||||
|
@ -144,7 +144,8 @@ mem_used_max RW the maximum amount memory zram have consumed to
|
||||
store compressed data
|
||||
mem_limit RW the maximum amount of memory ZRAM can use to store
|
||||
the compressed data
|
||||
num_migrated RO the number of objects migrated migrated by compaction
|
||||
pages_compacted RO the number of pages freed during compaction
|
||||
(available only via zram<id>/mm_stat node)
|
||||
compact WO trigger memory compaction
|
||||
|
||||
WARNING
|
||||
|
@ -60,9 +60,10 @@ Filesystem support consists of
|
||||
- implementing the direct_IO address space operation, and calling
|
||||
dax_do_io() instead of blockdev_direct_IO() if S_DAX is set
|
||||
- implementing an mmap file operation for DAX files which sets the
|
||||
VM_MIXEDMAP flag on the VMA, and setting the vm_ops to include handlers
|
||||
for fault and page_mkwrite (which should probably call dax_fault() and
|
||||
dax_mkwrite(), passing the appropriate get_block() callback)
|
||||
VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to
|
||||
include handlers for fault, pmd_fault and page_mkwrite (which should
|
||||
probably call dax_fault(), dax_pmd_fault() and dax_mkwrite(), passing the
|
||||
appropriate get_block() callback)
|
||||
- calling dax_truncate_page() instead of block_truncate_page() for DAX files
|
||||
- calling dax_zero_page_range() instead of zero_user() for DAX files
|
||||
- ensuring that there is sufficient locking between reads, writes,
|
||||
|
@ -424,6 +424,7 @@ Private_Dirty: 0 kB
|
||||
Referenced: 892 kB
|
||||
Anonymous: 0 kB
|
||||
Swap: 0 kB
|
||||
SwapPss: 0 kB
|
||||
KernelPageSize: 4 kB
|
||||
MMUPageSize: 4 kB
|
||||
Locked: 374 kB
|
||||
@ -433,16 +434,23 @@ the first of these lines shows the same information as is displayed for the
|
||||
mapping in /proc/PID/maps. The remaining lines show the size of the mapping
|
||||
(size), the amount of the mapping that is currently resident in RAM (RSS), the
|
||||
process' proportional share of this mapping (PSS), the number of clean and
|
||||
dirty private pages in the mapping. Note that even a page which is part of a
|
||||
MAP_SHARED mapping, but has only a single pte mapped, i.e. is currently used
|
||||
by only one process, is accounted as private and not as shared. "Referenced"
|
||||
indicates the amount of memory currently marked as referenced or accessed.
|
||||
dirty private pages in the mapping.
|
||||
|
||||
The "proportional set size" (PSS) of a process is the count of pages it has
|
||||
in memory, where each page is divided by the number of processes sharing it.
|
||||
So if a process has 1000 pages all to itself, and 1000 shared with one other
|
||||
process, its PSS will be 1500.
|
||||
Note that even a page which is part of a MAP_SHARED mapping, but has only
|
||||
a single pte mapped, i.e. is currently used by only one process, is accounted
|
||||
as private and not as shared.
|
||||
"Referenced" indicates the amount of memory currently marked as referenced or
|
||||
accessed.
|
||||
"Anonymous" shows the amount of memory that does not belong to any file. Even
|
||||
a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE
|
||||
and a page is modified, the file page is replaced by a private anonymous copy.
|
||||
"Swap" shows how much would-be-anonymous memory is also used, but out on
|
||||
swap.
|
||||
|
||||
"SwapPss" shows proportional swap share of this mapping.
|
||||
"VmFlags" field deserves a separate description. This member represents the kernel
|
||||
flags associated with the particular virtual memory area in two letter encoded
|
||||
manner. The codes are the following:
|
||||
|
@ -349,7 +349,7 @@ zone[i]'s protection[j] is calculated by following expression.
|
||||
|
||||
(i < j):
|
||||
zone[i]->protection[j]
|
||||
= (total sums of present_pages from zone[i+1] to zone[j] on the node)
|
||||
= (total sums of managed_pages from zone[i+1] to zone[j] on the node)
|
||||
/ lowmem_reserve_ratio[i];
|
||||
(i = j):
|
||||
(should not be protected) = 0;
|
||||
@ -360,7 +360,7 @@ The default values of lowmem_reserve_ratio[i] are
|
||||
256 (if zone[i] means DMA or DMA32 zone)
|
||||
32 (others).
|
||||
As the expression above shows, they are the reciprocals of the ratios.
|
||||
256 means 1/256. # of protection pages becomes about "0.39%" of total present
|
||||
256 means 1/256. # of protection pages becomes about "0.39%" of total managed
|
||||
pages of higher zones on the node.
|
||||
|
||||
If you would like to protect more pages, smaller values are effective.
|
||||
|
@ -75,7 +75,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.:
|
||||
|
||||
'e' - Send a SIGTERM to all processes, except for init.
|
||||
|
||||
'f' - Will call oom_kill to kill a memory hog process.
|
||||
'f' - Will call the oom killer to kill a memory hog process, but will not
|
||||
panic if nothing can be killed.
|
||||
|
||||
'g' - Used by kgdb (kernel debugger)
|
||||
|
||||
|
@ -329,7 +329,14 @@ Examples
|
||||
|
||||
3) hugepage-mmap: see tools/testing/selftests/vm/hugepage-mmap.c
|
||||
|
||||
4) The libhugetlbfs (http://libhugetlbfs.sourceforge.net) library provides a
|
||||
wide range of userspace tools to help with huge page usability, environment
|
||||
setup, and control. Furthermore it provides useful test cases that should be
|
||||
used when modifying code to ensure no regressions are introduced.
|
||||
4) The libhugetlbfs (https://github.com/libhugetlbfs/libhugetlbfs) library
|
||||
provides a wide range of userspace tools to help with huge page usability,
|
||||
environment setup, and control.
|
||||
|
||||
Kernel development regression testing
|
||||
=====================================
|
||||
|
||||
The most complete set of hugetlb tests is in the libhugetlbfs repository.
|
||||
If you modify any hugetlb related code, use the libhugetlbfs test suite
|
||||
to check for regressions. In addition, if you add any new hugetlb
|
||||
functionality, please add appropriate tests to libhugetlbfs.
|
||||
|
@ -16,11 +16,17 @@ There are three components to pagemap:
|
||||
* Bits 0-4 swap type if swapped
|
||||
* Bits 5-54 swap offset if swapped
|
||||
* Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
|
||||
* Bits 56-60 zero
|
||||
* Bit 61 page is file-page or shared-anon
|
||||
* Bit 56 page exclusively mapped (since 4.2)
|
||||
* Bits 57-60 zero
|
||||
* Bit 61 page is file-page or shared-anon (since 3.5)
|
||||
* Bit 62 page swapped
|
||||
* Bit 63 page present
|
||||
|
||||
Since Linux 4.0 only users with the CAP_SYS_ADMIN capability can get PFNs.
|
||||
In 4.0 and 4.1 opens by unprivileged users fail with -EPERM. Starting from
|
||||
4.2 the PFN field is zeroed if the user does not have CAP_SYS_ADMIN.
|
||||
Reason: information about PFNs helps in exploiting the Rowhammer vulnerability.
|
||||
|
||||
If the page is not present but in swap, then the PFN contains an
|
||||
encoding of the swap file number and the page's offset into the
|
||||
swap. Unmapped pages return a null PFN. This allows determining
|
||||
@ -159,3 +165,8 @@ Other notes:
|
||||
Reading from any of the files will return -EINVAL if you are not starting
|
||||
the read on an 8-byte boundary (e.g., if you sought an odd number of bytes
|
||||
into the file), or if the size of the read is not a multiple of 8 bytes.
|
||||
|
||||
Before Linux 3.11 pagemap bits 55-60 were used for "page-shift" (which is
|
||||
always 12 on most architectures). Since Linux 3.11 their meaning changes
|
||||
after the first clear of soft-dirty bits. Since Linux 4.2 they are used for
|
||||
flags unconditionally.
|
||||
|
@ -339,6 +339,67 @@ static void __init request_standard_resources(void)
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BLK_DEV_INITRD
|
||||
/*
|
||||
* Relocate initrd if it is not completely within the linear mapping.
|
||||
* This would be the case if mem= cuts out all or part of it.
|
||||
*/
|
||||
static void __init relocate_initrd(void)
|
||||
{
|
||||
phys_addr_t orig_start = __virt_to_phys(initrd_start);
|
||||
phys_addr_t orig_end = __virt_to_phys(initrd_end);
|
||||
phys_addr_t ram_end = memblock_end_of_DRAM();
|
||||
phys_addr_t new_start;
|
||||
unsigned long size, to_free = 0;
|
||||
void *dest;
|
||||
|
||||
if (orig_end <= ram_end)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Any of the original initrd which overlaps the linear map should
|
||||
* be freed after relocating.
|
||||
*/
|
||||
if (orig_start < ram_end)
|
||||
to_free = ram_end - orig_start;
|
||||
|
||||
size = orig_end - orig_start;
|
||||
|
||||
/* initrd needs to be relocated completely inside linear mapping */
|
||||
new_start = memblock_find_in_range(0, PFN_PHYS(max_pfn),
|
||||
size, PAGE_SIZE);
|
||||
if (!new_start)
|
||||
panic("Cannot relocate initrd of size %ld\n", size);
|
||||
memblock_reserve(new_start, size);
|
||||
|
||||
initrd_start = __phys_to_virt(new_start);
|
||||
initrd_end = initrd_start + size;
|
||||
|
||||
pr_info("Moving initrd from [%llx-%llx] to [%llx-%llx]\n",
|
||||
orig_start, orig_start + size - 1,
|
||||
new_start, new_start + size - 1);
|
||||
|
||||
dest = (void *)initrd_start;
|
||||
|
||||
if (to_free) {
|
||||
memcpy(dest, (void *)__phys_to_virt(orig_start), to_free);
|
||||
dest += to_free;
|
||||
}
|
||||
|
||||
copy_from_early_mem(dest, orig_start + to_free, size - to_free);
|
||||
|
||||
if (to_free) {
|
||||
pr_info("Freeing original RAMDISK from [%llx-%llx]\n",
|
||||
orig_start, orig_start + to_free - 1);
|
||||
memblock_free(orig_start, to_free);
|
||||
}
|
||||
}
|
||||
#else
|
||||
static inline void __init relocate_initrd(void)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID };
|
||||
|
||||
void __init setup_arch(char **cmdline_p)
|
||||
@ -372,6 +433,7 @@ void __init setup_arch(char **cmdline_p)
|
||||
acpi_boot_table_init();
|
||||
|
||||
paging_init();
|
||||
relocate_initrd();
|
||||
request_standard_resources();
|
||||
|
||||
early_ioremap_reset();
|
||||
|
@ -1140,13 +1140,9 @@ sba_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
|
||||
|
||||
#ifdef CONFIG_NUMA
|
||||
{
|
||||
int node = ioc->node;
|
||||
struct page *page;
|
||||
|
||||
if (node == NUMA_NO_NODE)
|
||||
node = numa_node_id();
|
||||
|
||||
page = alloc_pages_exact_node(node, flags, get_order(size));
|
||||
page = alloc_pages_node(ioc->node, flags, get_order(size));
|
||||
if (unlikely(!page))
|
||||
return NULL;
|
||||
|
||||
|
@ -97,7 +97,7 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid)
|
||||
|
||||
/* attempt to allocate a granule's worth of cached memory pages */
|
||||
|
||||
page = alloc_pages_exact_node(nid,
|
||||
page = __alloc_pages_node(nid,
|
||||
GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
|
||||
IA64_GRANULE_SHIFT-PAGE_SHIFT);
|
||||
if (!page) {
|
||||
|
@ -92,7 +92,7 @@ static void *sn_dma_alloc_coherent(struct device *dev, size_t size,
|
||||
*/
|
||||
node = pcibus_to_node(pdev->bus);
|
||||
if (likely(node >=0)) {
|
||||
struct page *p = alloc_pages_exact_node(node,
|
||||
struct page *p = __alloc_pages_node(node,
|
||||
flags, get_order(size));
|
||||
|
||||
if (likely(p))
|
||||
|
@ -123,7 +123,7 @@ static int __init cbe_ptcal_enable_on_node(int nid, int order)
|
||||
|
||||
area->nid = nid;
|
||||
area->order = order;
|
||||
area->pages = alloc_pages_exact_node(area->nid,
|
||||
area->pages = __alloc_pages_node(area->nid,
|
||||
GFP_KERNEL|__GFP_THISNODE,
|
||||
area->order);
|
||||
|
||||
|
@ -14,7 +14,7 @@
|
||||
#include <asm-generic/4level-fixup.h>
|
||||
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/mm_types.h>
|
||||
#include <asm/types.h>
|
||||
#include <asm/pgtsrmmu.h>
|
||||
#include <asm/vaddrs.h>
|
||||
|
@ -317,15 +317,12 @@ static u64 __init get_ramdisk_size(void)
|
||||
return ramdisk_size;
|
||||
}
|
||||
|
||||
#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
|
||||
static void __init relocate_initrd(void)
|
||||
{
|
||||
/* Assume only end is not page aligned */
|
||||
u64 ramdisk_image = get_ramdisk_image();
|
||||
u64 ramdisk_size = get_ramdisk_size();
|
||||
u64 area_size = PAGE_ALIGN(ramdisk_size);
|
||||
unsigned long slop, clen, mapaddr;
|
||||
char *p, *q;
|
||||
|
||||
/* We need to move the initrd down into directly mapped mem */
|
||||
relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
|
||||
@ -343,25 +340,8 @@ static void __init relocate_initrd(void)
|
||||
printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
|
||||
relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
|
||||
|
||||
q = (char *)initrd_start;
|
||||
copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size);
|
||||
|
||||
/* Copy the initrd */
|
||||
while (ramdisk_size) {
|
||||
slop = ramdisk_image & ~PAGE_MASK;
|
||||
clen = ramdisk_size;
|
||||
if (clen > MAX_MAP_CHUNK-slop)
|
||||
clen = MAX_MAP_CHUNK-slop;
|
||||
mapaddr = ramdisk_image & PAGE_MASK;
|
||||
p = early_memremap(mapaddr, clen+slop);
|
||||
memcpy(q, p+slop, clen);
|
||||
early_memunmap(p, clen+slop);
|
||||
q += clen;
|
||||
ramdisk_image += clen;
|
||||
ramdisk_size -= clen;
|
||||
}
|
||||
|
||||
ramdisk_image = get_ramdisk_image();
|
||||
ramdisk_size = get_ramdisk_size();
|
||||
printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
|
||||
" [mem %#010llx-%#010llx]\n",
|
||||
ramdisk_image, ramdisk_image + ramdisk_size - 1,
|
||||
|
@ -3150,7 +3150,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
|
||||
struct page *pages;
|
||||
struct vmcs *vmcs;
|
||||
|
||||
pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
|
||||
pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
|
||||
if (!pages)
|
||||
return NULL;
|
||||
vmcs = page_address(pages);
|
||||
|
@ -246,8 +246,10 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
|
||||
bi->start = max(bi->start, low);
|
||||
bi->end = min(bi->end, high);
|
||||
|
||||
/* and there's no empty block */
|
||||
if (bi->start >= bi->end)
|
||||
/* and there's no empty or non-exist block */
|
||||
if (bi->start >= bi->end ||
|
||||
!memblock_overlaps_region(&memblock.memory,
|
||||
bi->start, bi->end - bi->start))
|
||||
numa_remove_memblk_from(i--, mi);
|
||||
}
|
||||
|
||||
|
@ -388,7 +388,6 @@ static ssize_t comp_algorithm_store(struct device *dev,
|
||||
static ssize_t compact_store(struct device *dev,
|
||||
struct device_attribute *attr, const char *buf, size_t len)
|
||||
{
|
||||
unsigned long nr_migrated;
|
||||
struct zram *zram = dev_to_zram(dev);
|
||||
struct zram_meta *meta;
|
||||
|
||||
@ -399,8 +398,7 @@ static ssize_t compact_store(struct device *dev,
|
||||
}
|
||||
|
||||
meta = zram->meta;
|
||||
nr_migrated = zs_compact(meta->mem_pool);
|
||||
atomic64_add(nr_migrated, &zram->stats.num_migrated);
|
||||
zs_compact(meta->mem_pool);
|
||||
up_read(&zram->init_lock);
|
||||
|
||||
return len;
|
||||
@ -428,26 +426,31 @@ static ssize_t mm_stat_show(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
struct zram *zram = dev_to_zram(dev);
|
||||
struct zs_pool_stats pool_stats;
|
||||
u64 orig_size, mem_used = 0;
|
||||
long max_used;
|
||||
ssize_t ret;
|
||||
|
||||
memset(&pool_stats, 0x00, sizeof(struct zs_pool_stats));
|
||||
|
||||
down_read(&zram->init_lock);
|
||||
if (init_done(zram))
|
||||
if (init_done(zram)) {
|
||||
mem_used = zs_get_total_pages(zram->meta->mem_pool);
|
||||
zs_pool_stats(zram->meta->mem_pool, &pool_stats);
|
||||
}
|
||||
|
||||
orig_size = atomic64_read(&zram->stats.pages_stored);
|
||||
max_used = atomic_long_read(&zram->stats.max_used_pages);
|
||||
|
||||
ret = scnprintf(buf, PAGE_SIZE,
|
||||
"%8llu %8llu %8llu %8lu %8ld %8llu %8llu\n",
|
||||
"%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n",
|
||||
orig_size << PAGE_SHIFT,
|
||||
(u64)atomic64_read(&zram->stats.compr_data_size),
|
||||
mem_used << PAGE_SHIFT,
|
||||
zram->limit_pages << PAGE_SHIFT,
|
||||
max_used << PAGE_SHIFT,
|
||||
(u64)atomic64_read(&zram->stats.zero_pages),
|
||||
(u64)atomic64_read(&zram->stats.num_migrated));
|
||||
pool_stats.pages_compacted);
|
||||
up_read(&zram->init_lock);
|
||||
|
||||
return ret;
|
||||
@ -619,7 +622,7 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
|
||||
uncmem = user_mem;
|
||||
|
||||
if (!uncmem) {
|
||||
pr_info("Unable to allocate temp memory\n");
|
||||
pr_err("Unable to allocate temp memory\n");
|
||||
ret = -ENOMEM;
|
||||
goto out_cleanup;
|
||||
}
|
||||
@ -716,7 +719,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
|
||||
|
||||
handle = zs_malloc(meta->mem_pool, clen);
|
||||
if (!handle) {
|
||||
pr_info("Error allocating memory for compressed page: %u, size=%zu\n",
|
||||
pr_err("Error allocating memory for compressed page: %u, size=%zu\n",
|
||||
index, clen);
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
@ -1036,7 +1039,7 @@ static ssize_t disksize_store(struct device *dev,
|
||||
|
||||
comp = zcomp_create(zram->compressor, zram->max_comp_streams);
|
||||
if (IS_ERR(comp)) {
|
||||
pr_info("Cannot initialise %s compressing backend\n",
|
||||
pr_err("Cannot initialise %s compressing backend\n",
|
||||
zram->compressor);
|
||||
err = PTR_ERR(comp);
|
||||
goto out_free_meta;
|
||||
@ -1214,7 +1217,7 @@ static int zram_add(void)
|
||||
/* gendisk structure */
|
||||
zram->disk = alloc_disk(1);
|
||||
if (!zram->disk) {
|
||||
pr_warn("Error allocating disk structure for device %d\n",
|
||||
pr_err("Error allocating disk structure for device %d\n",
|
||||
device_id);
|
||||
ret = -ENOMEM;
|
||||
goto out_free_queue;
|
||||
@ -1263,7 +1266,8 @@ static int zram_add(void)
|
||||
ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj,
|
||||
&zram_disk_attr_group);
|
||||
if (ret < 0) {
|
||||
pr_warn("Error creating sysfs group");
|
||||
pr_err("Error creating sysfs group for device %d\n",
|
||||
device_id);
|
||||
goto out_free_disk;
|
||||
}
|
||||
strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));
|
||||
@ -1403,13 +1407,13 @@ static int __init zram_init(void)
|
||||
|
||||
ret = class_register(&zram_control_class);
|
||||
if (ret) {
|
||||
pr_warn("Unable to register zram-control class\n");
|
||||
pr_err("Unable to register zram-control class\n");
|
||||
return ret;
|
||||
}
|
||||
|
||||
zram_major = register_blkdev(0, "zram");
|
||||
if (zram_major <= 0) {
|
||||
pr_warn("Unable to get major number\n");
|
||||
pr_err("Unable to get major number\n");
|
||||
class_unregister(&zram_control_class);
|
||||
return -EBUSY;
|
||||
}
|
||||
|
@ -78,7 +78,6 @@ struct zram_stats {
|
||||
atomic64_t compr_data_size; /* compressed size of pages stored */
|
||||
atomic64_t num_reads; /* failed + successful */
|
||||
atomic64_t num_writes; /* --do-- */
|
||||
atomic64_t num_migrated; /* no. of migrated object */
|
||||
atomic64_t failed_reads; /* can happen when memory is too low */
|
||||
atomic64_t failed_writes; /* can happen when memory is too low */
|
||||
atomic64_t invalid_io; /* non-page-aligned I/O requests */
|
||||
|
@ -239,7 +239,7 @@ xpc_create_gru_mq_uv(unsigned int mq_size, int cpu, char *irq_name,
|
||||
mq->mmr_blade = uv_cpu_to_blade_id(cpu);
|
||||
|
||||
nid = cpu_to_node(cpu);
|
||||
page = alloc_pages_exact_node(nid,
|
||||
page = __alloc_pages_node(nid,
|
||||
GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
|
||||
pg_order);
|
||||
if (page == NULL) {
|
||||
|
@ -353,9 +353,16 @@ static struct sysrq_key_op sysrq_term_op = {
|
||||
|
||||
static void moom_callback(struct work_struct *ignored)
|
||||
{
|
||||
const gfp_t gfp_mask = GFP_KERNEL;
|
||||
struct oom_control oc = {
|
||||
.zonelist = node_zonelist(first_memory_node, gfp_mask),
|
||||
.nodemask = NULL,
|
||||
.gfp_mask = gfp_mask,
|
||||
.order = -1,
|
||||
};
|
||||
|
||||
mutex_lock(&oom_lock);
|
||||
if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL),
|
||||
GFP_KERNEL, 0, NULL, true))
|
||||
if (!out_of_memory(&oc))
|
||||
pr_info("OOM request ignored because killer is disabled\n");
|
||||
mutex_unlock(&oom_lock);
|
||||
}
|
||||
|
@ -28,6 +28,7 @@
|
||||
#include <linux/namei.h>
|
||||
#include <linux/log2.h>
|
||||
#include <linux/cleancache.h>
|
||||
#include <linux/dax.h>
|
||||
#include <asm/uaccess.h>
|
||||
#include "internal.h"
|
||||
|
||||
|
197
fs/dax.c
197
fs/dax.c
@ -283,7 +283,6 @@ static int copy_user_bh(struct page *to, struct buffer_head *bh,
|
||||
static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
|
||||
struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||
{
|
||||
struct address_space *mapping = inode->i_mapping;
|
||||
sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
|
||||
unsigned long vaddr = (unsigned long)vmf->virtual_address;
|
||||
void __pmem *addr;
|
||||
@ -291,8 +290,6 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
|
||||
pgoff_t size;
|
||||
int error;
|
||||
|
||||
i_mmap_lock_read(mapping);
|
||||
|
||||
/*
|
||||
* Check truncate didn't happen while we were allocating a block.
|
||||
* If it did, this block may or may not be still allocated to the
|
||||
@ -322,8 +319,6 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
|
||||
error = vm_insert_mixed(vma, vaddr, pfn);
|
||||
|
||||
out:
|
||||
i_mmap_unlock_read(mapping);
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
@ -385,15 +380,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
|
||||
* from a read fault and we've raced with a truncate
|
||||
*/
|
||||
error = -EIO;
|
||||
goto unlock_page;
|
||||
goto unlock;
|
||||
}
|
||||
} else {
|
||||
i_mmap_lock_write(mapping);
|
||||
}
|
||||
|
||||
error = get_block(inode, block, &bh, 0);
|
||||
if (!error && (bh.b_size < PAGE_SIZE))
|
||||
error = -EIO; /* fs corruption? */
|
||||
if (error)
|
||||
goto unlock_page;
|
||||
goto unlock;
|
||||
|
||||
if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
|
||||
if (vmf->flags & FAULT_FLAG_WRITE) {
|
||||
@ -404,8 +401,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
|
||||
if (!error && (bh.b_size < PAGE_SIZE))
|
||||
error = -EIO;
|
||||
if (error)
|
||||
goto unlock_page;
|
||||
goto unlock;
|
||||
} else {
|
||||
i_mmap_unlock_write(mapping);
|
||||
return dax_load_hole(mapping, page, vmf);
|
||||
}
|
||||
}
|
||||
@ -417,17 +415,15 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
|
||||
else
|
||||
clear_user_highpage(new_page, vaddr);
|
||||
if (error)
|
||||
goto unlock_page;
|
||||
goto unlock;
|
||||
vmf->page = page;
|
||||
if (!page) {
|
||||
i_mmap_lock_read(mapping);
|
||||
/* Check we didn't race with truncate */
|
||||
size = (i_size_read(inode) + PAGE_SIZE - 1) >>
|
||||
PAGE_SHIFT;
|
||||
if (vmf->pgoff >= size) {
|
||||
i_mmap_unlock_read(mapping);
|
||||
error = -EIO;
|
||||
goto out;
|
||||
goto unlock;
|
||||
}
|
||||
}
|
||||
return VM_FAULT_LOCKED;
|
||||
@ -463,6 +459,8 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
|
||||
WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
|
||||
}
|
||||
|
||||
if (!page)
|
||||
i_mmap_unlock_write(mapping);
|
||||
out:
|
||||
if (error == -ENOMEM)
|
||||
return VM_FAULT_OOM | major;
|
||||
@ -471,11 +469,14 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
|
||||
return VM_FAULT_SIGBUS | major;
|
||||
return VM_FAULT_NOPAGE | major;
|
||||
|
||||
unlock_page:
|
||||
unlock:
|
||||
if (page) {
|
||||
unlock_page(page);
|
||||
page_cache_release(page);
|
||||
} else {
|
||||
i_mmap_unlock_write(mapping);
|
||||
}
|
||||
|
||||
goto out;
|
||||
}
|
||||
EXPORT_SYMBOL(__dax_fault);
|
||||
@ -507,6 +508,176 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(dax_fault);
|
||||
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
/*
|
||||
* The 'colour' (ie low bits) within a PMD of a page offset. This comes up
|
||||
* more often than one might expect in the below function.
|
||||
*/
|
||||
#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
|
||||
|
||||
int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
|
||||
pmd_t *pmd, unsigned int flags, get_block_t get_block,
|
||||
dax_iodone_t complete_unwritten)
|
||||
{
|
||||
struct file *file = vma->vm_file;
|
||||
struct address_space *mapping = file->f_mapping;
|
||||
struct inode *inode = mapping->host;
|
||||
struct buffer_head bh;
|
||||
unsigned blkbits = inode->i_blkbits;
|
||||
unsigned long pmd_addr = address & PMD_MASK;
|
||||
bool write = flags & FAULT_FLAG_WRITE;
|
||||
long length;
|
||||
void *kaddr;
|
||||
pgoff_t size, pgoff;
|
||||
sector_t block, sector;
|
||||
unsigned long pfn;
|
||||
int result = 0;
|
||||
|
||||
/* Fall back to PTEs if we're going to COW */
|
||||
if (write && !(vma->vm_flags & VM_SHARED))
|
||||
return VM_FAULT_FALLBACK;
|
||||
/* If the PMD would extend outside the VMA */
|
||||
if (pmd_addr < vma->vm_start)
|
||||
return VM_FAULT_FALLBACK;
|
||||
if ((pmd_addr + PMD_SIZE) > vma->vm_end)
|
||||
return VM_FAULT_FALLBACK;
|
||||
|
||||
pgoff = linear_page_index(vma, pmd_addr);
|
||||
size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
|
||||
if (pgoff >= size)
|
||||
return VM_FAULT_SIGBUS;
|
||||
/* If the PMD would cover blocks out of the file */
|
||||
if ((pgoff | PG_PMD_COLOUR) >= size)
|
||||
return VM_FAULT_FALLBACK;
|
||||
|
||||
memset(&bh, 0, sizeof(bh));
|
||||
block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
|
||||
|
||||
bh.b_size = PMD_SIZE;
|
||||
i_mmap_lock_write(mapping);
|
||||
length = get_block(inode, block, &bh, write);
|
||||
if (length)
|
||||
return VM_FAULT_SIGBUS;
|
||||
|
||||
/*
|
||||
* If the filesystem isn't willing to tell us the length of a hole,
|
||||
* just fall back to PTEs. Calling get_block 512 times in a loop
|
||||
* would be silly.
|
||||
*/
|
||||
if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE)
|
||||
goto fallback;
|
||||
|
||||
if (buffer_unwritten(&bh) || buffer_new(&bh)) {
|
||||
int i;
|
||||
for (i = 0; i < PTRS_PER_PMD; i++)
|
||||
clear_page(kaddr + i * PAGE_SIZE);
|
||||
count_vm_event(PGMAJFAULT);
|
||||
mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
|
||||
result |= VM_FAULT_MAJOR;
|
||||
}
|
||||
|
||||
/*
|
||||
* If we allocated new storage, make sure no process has any
|
||||
* zero pages covering this hole
|
||||
*/
|
||||
if (buffer_new(&bh)) {
|
||||
i_mmap_unlock_write(mapping);
|
||||
unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
|
||||
i_mmap_lock_write(mapping);
|
||||
}
|
||||
|
||||
/*
|
||||
* If a truncate happened while we were allocating blocks, we may
|
||||
* leave blocks allocated to the file that are beyond EOF. We can't
|
||||
* take i_mutex here, so just leave them hanging; they'll be freed
|
||||
* when the file is deleted.
|
||||
*/
|
||||
size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
|
||||
if (pgoff >= size) {
|
||||
result = VM_FAULT_SIGBUS;
|
||||
goto out;
|
||||
}
|
||||
if ((pgoff | PG_PMD_COLOUR) >= size)
|
||||
goto fallback;
|
||||
|
||||
if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
|
||||
spinlock_t *ptl;
|
||||
pmd_t entry;
|
||||
struct page *zero_page = get_huge_zero_page();
|
||||
|
||||
if (unlikely(!zero_page))
|
||||
goto fallback;
|
||||
|
||||
ptl = pmd_lock(vma->vm_mm, pmd);
|
||||
if (!pmd_none(*pmd)) {
|
||||
spin_unlock(ptl);
|
||||
goto fallback;
|
||||
}
|
||||
|
||||
entry = mk_pmd(zero_page, vma->vm_page_prot);
|
||||
entry = pmd_mkhuge(entry);
|
||||
set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
|
||||
result = VM_FAULT_NOPAGE;
|
||||
spin_unlock(ptl);
|
||||
} else {
|
||||
sector = bh.b_blocknr << (blkbits - 9);
|
||||
length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
|
||||
bh.b_size);
|
||||
if (length < 0) {
|
||||
result = VM_FAULT_SIGBUS;
|
||||
goto out;
|
||||
}
|
||||
if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
|
||||
goto fallback;
|
||||
|
||||
result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write);
|
||||
}
|
||||
|
||||
out:
|
||||
if (buffer_unwritten(&bh))
|
||||
complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
|
||||
|
||||
i_mmap_unlock_write(mapping);
|
||||
|
||||
return result;
|
||||
|
||||
fallback:
|
||||
count_vm_event(THP_FAULT_FALLBACK);
|
||||
result = VM_FAULT_FALLBACK;
|
||||
goto out;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__dax_pmd_fault);
|
||||
|
||||
/**
|
||||
* dax_pmd_fault - handle a PMD fault on a DAX file
|
||||
* @vma: The virtual memory area where the fault occurred
|
||||
* @vmf: The description of the fault
|
||||
* @get_block: The filesystem method used to translate file offsets to blocks
|
||||
*
|
||||
* When a page fault occurs, filesystems may call this helper in their
|
||||
* pmd_fault handler for DAX files.
|
||||
*/
|
||||
int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
|
||||
pmd_t *pmd, unsigned int flags, get_block_t get_block,
|
||||
dax_iodone_t complete_unwritten)
|
||||
{
|
||||
int result;
|
||||
struct super_block *sb = file_inode(vma->vm_file)->i_sb;
|
||||
|
||||
if (flags & FAULT_FLAG_WRITE) {
|
||||
sb_start_pagefault(sb);
|
||||
file_update_time(vma->vm_file);
|
||||
}
|
||||
result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
|
||||
complete_unwritten);
|
||||
if (flags & FAULT_FLAG_WRITE)
|
||||
sb_end_pagefault(sb);
|
||||
|
||||
return result;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(dax_pmd_fault);
|
||||
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
||||
|
||||
/**
|
||||
* dax_pfn_mkwrite - handle first write to DAX page
|
||||
* @vma: The virtual memory area where the fault occurred
|
||||
|
@ -20,6 +20,7 @@
|
||||
|
||||
#include <linux/time.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/dax.h>
|
||||
#include <linux/quotaops.h>
|
||||
#include "ext2.h"
|
||||
#include "xattr.h"
|
||||
@ -31,6 +32,12 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||
return dax_fault(vma, vmf, ext2_get_block, NULL);
|
||||
}
|
||||
|
||||
static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
|
||||
pmd_t *pmd, unsigned int flags)
|
||||
{
|
||||
return dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL);
|
||||
}
|
||||
|
||||
static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||
{
|
||||
return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
|
||||
@ -38,6 +45,7 @@ static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||
|
||||
static const struct vm_operations_struct ext2_dax_vm_ops = {
|
||||
.fault = ext2_dax_fault,
|
||||
.pmd_fault = ext2_dax_pmd_fault,
|
||||
.page_mkwrite = ext2_dax_mkwrite,
|
||||
.pfn_mkwrite = dax_pfn_mkwrite,
|
||||
};
|
||||
@ -49,7 +57,7 @@ static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
|
||||
|
||||
file_accessed(file);
|
||||
vma->vm_ops = &ext2_dax_vm_ops;
|
||||
vma->vm_flags |= VM_MIXEDMAP;
|
||||
vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
|
@ -25,6 +25,7 @@
|
||||
#include <linux/time.h>
|
||||
#include <linux/highuid.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/dax.h>
|
||||
#include <linux/quotaops.h>
|
||||
#include <linux/writeback.h>
|
||||
#include <linux/buffer_head.h>
|
||||
|
@ -2272,6 +2272,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
|
||||
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
|
||||
int ext4_get_block_write(struct inode *inode, sector_t iblock,
|
||||
struct buffer_head *bh_result, int create);
|
||||
int ext4_get_block_dax(struct inode *inode, sector_t iblock,
|
||||
struct buffer_head *bh_result, int create);
|
||||
int ext4_get_block(struct inode *inode, sector_t iblock,
|
||||
struct buffer_head *bh_result, int create);
|
||||
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
|
||||
|
@ -22,6 +22,7 @@
|
||||
#include <linux/fs.h>
|
||||
#include <linux/mount.h>
|
||||
#include <linux/path.h>
|
||||
#include <linux/dax.h>
|
||||
#include <linux/quotaops.h>
|
||||
#include <linux/pagevec.h>
|
||||
#include <linux/uio.h>
|
||||
@ -195,7 +196,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
|
||||
static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
|
||||
{
|
||||
struct inode *inode = bh->b_assoc_map->host;
|
||||
/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
|
||||
/* XXX: breaks on 32-bit > 16TB. Is that even supported? */
|
||||
loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
|
||||
int err;
|
||||
if (!uptodate)
|
||||
@ -206,17 +207,74 @@ static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
|
||||
|
||||
static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||
{
|
||||
return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
|
||||
/* Is this the right get_block? */
|
||||
int result;
|
||||
handle_t *handle = NULL;
|
||||
struct super_block *sb = file_inode(vma->vm_file)->i_sb;
|
||||
bool write = vmf->flags & FAULT_FLAG_WRITE;
|
||||
|
||||
if (write) {
|
||||
sb_start_pagefault(sb);
|
||||
file_update_time(vma->vm_file);
|
||||
handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
|
||||
EXT4_DATA_TRANS_BLOCKS(sb));
|
||||
}
|
||||
|
||||
if (IS_ERR(handle))
|
||||
result = VM_FAULT_SIGBUS;
|
||||
else
|
||||
result = __dax_fault(vma, vmf, ext4_get_block_dax,
|
||||
ext4_end_io_unwritten);
|
||||
|
||||
if (write) {
|
||||
if (!IS_ERR(handle))
|
||||
ext4_journal_stop(handle);
|
||||
sb_end_pagefault(sb);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
|
||||
pmd_t *pmd, unsigned int flags)
|
||||
{
|
||||
int result;
|
||||
handle_t *handle = NULL;
|
||||
struct inode *inode = file_inode(vma->vm_file);
|
||||
struct super_block *sb = inode->i_sb;
|
||||
bool write = flags & FAULT_FLAG_WRITE;
|
||||
|
||||
if (write) {
|
||||
sb_start_pagefault(sb);
|
||||
file_update_time(vma->vm_file);
|
||||
handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
|
||||
ext4_chunk_trans_blocks(inode,
|
||||
PMD_SIZE / PAGE_SIZE));
|
||||
}
|
||||
|
||||
if (IS_ERR(handle))
|
||||
result = VM_FAULT_SIGBUS;
|
||||
else
|
||||
result = __dax_pmd_fault(vma, addr, pmd, flags,
|
||||
ext4_get_block_dax, ext4_end_io_unwritten);
|
||||
|
||||
if (write) {
|
||||
if (!IS_ERR(handle))
|
||||
ext4_journal_stop(handle);
|
||||
sb_end_pagefault(sb);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||
{
|
||||
return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
|
||||
return dax_mkwrite(vma, vmf, ext4_get_block_dax,
|
||||
ext4_end_io_unwritten);
|
||||
}
|
||||
|
||||
static const struct vm_operations_struct ext4_dax_vm_ops = {
|
||||
.fault = ext4_dax_fault,
|
||||
.pmd_fault = ext4_dax_pmd_fault,
|
||||
.page_mkwrite = ext4_dax_mkwrite,
|
||||
.pfn_mkwrite = dax_pfn_mkwrite,
|
||||
};
|
||||
@ -244,7 +302,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
|
||||
file_accessed(file);
|
||||
if (IS_DAX(file_inode(file))) {
|
||||
vma->vm_ops = &ext4_dax_vm_ops;
|
||||
vma->vm_flags |= VM_MIXEDMAP;
|
||||
vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
|
||||
} else {
|
||||
vma->vm_ops = &ext4_file_vm_ops;
|
||||
}
|
||||
|
@ -22,6 +22,7 @@
|
||||
|
||||
#include "ext4_jbd2.h"
|
||||
#include "truncate.h"
|
||||
#include <linux/dax.h>
|
||||
#include <linux/uio.h>
|
||||
|
||||
#include <trace/events/ext4.h>
|
||||
|
@ -22,6 +22,7 @@
|
||||
#include <linux/time.h>
|
||||
#include <linux/highuid.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/dax.h>
|
||||
#include <linux/quotaops.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/buffer_head.h>
|
||||
@ -3020,6 +3021,17 @@ static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
|
||||
EXT4_GET_BLOCKS_NO_LOCK);
|
||||
}
|
||||
|
||||
/*
 * get_block callback used by the ext4 DAX fault paths.
 *
 * Always requests EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_UNWRIT_EXT and
 * adds EXT4_GET_BLOCKS_CREATE when @create is non-zero, then hands the
 * lookup to _ext4_get_block() and returns its result.
 */
int ext4_get_block_dax(struct inode *inode, sector_t iblock,
		   struct buffer_head *bh_result, int create)
{
	const int map_flags = EXT4_GET_BLOCKS_PRE_IO |
			      EXT4_GET_BLOCKS_UNWRIT_EXT |
			      (create ? EXT4_GET_BLOCKS_CREATE : 0);

	ext4_debug("ext4_get_block_dax: inode %lu, create flag %d\n",
		   inode->i_ino, create);

	return _ext4_get_block(inode, iblock, bh_result, map_flags);
}
|
||||
|
||||
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
|
||||
ssize_t size, void *private)
|
||||
{
|
||||
|
@ -12,6 +12,7 @@
|
||||
#include <linux/thread_info.h>
|
||||
#include <asm/current.h>
|
||||
#include <linux/sched.h> /* remove ASAP */
|
||||
#include <linux/falloc.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/mount.h>
|
||||
#include <linux/file.h>
|
||||
@ -84,6 +85,29 @@ static const match_table_t tokens = {
|
||||
{Opt_err, NULL},
|
||||
};
|
||||
|
||||
#ifdef CONFIG_NUMA
|
||||
static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
|
||||
struct inode *inode, pgoff_t index)
|
||||
{
|
||||
vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
|
||||
index);
|
||||
}
|
||||
|
||||
static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
|
||||
{
|
||||
mpol_cond_put(vma->vm_policy);
|
||||
}
|
||||
#else
|
||||
static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
|
||||
struct inode *inode, pgoff_t index)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
static void huge_pagevec_release(struct pagevec *pvec)
|
||||
{
|
||||
int i;
|
||||
@ -293,26 +317,61 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
static void truncate_huge_page(struct page *page)
|
||||
static void remove_huge_page(struct page *page)
|
||||
{
|
||||
ClearPageDirty(page);
|
||||
ClearPageUptodate(page);
|
||||
delete_from_page_cache(page);
|
||||
}
|
||||
|
||||
static void truncate_hugepages(struct inode *inode, loff_t lstart)
|
||||
|
||||
/*
|
||||
* remove_inode_hugepages handles two distinct cases: truncation and hole
|
||||
* punch. There are subtle differences in operation for each case.
|
||||
|
||||
* truncation is indicated by end of range being LLONG_MAX
|
||||
* In this case, we first scan the range and release found pages.
|
||||
* After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
|
||||
* maps and global counts.
|
||||
* hole punch is indicated if end is not LLONG_MAX
|
||||
* In the hole punch case we scan the range and release found pages.
|
||||
* Only when releasing a page is the associated region/reserv map
|
||||
* deleted. The region/reserv map for ranges without associated
|
||||
* pages are not modified.
|
||||
* Note: If the passed end of range value is beyond the end of file, but
|
||||
* not LLONG_MAX this routine still performs a hole punch operation.
|
||||
*/
|
||||
static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
|
||||
loff_t lend)
|
||||
{
|
||||
struct hstate *h = hstate_inode(inode);
|
||||
struct address_space *mapping = &inode->i_data;
|
||||
const pgoff_t start = lstart >> huge_page_shift(h);
|
||||
const pgoff_t end = lend >> huge_page_shift(h);
|
||||
struct vm_area_struct pseudo_vma;
|
||||
struct pagevec pvec;
|
||||
pgoff_t next;
|
||||
int i, freed = 0;
|
||||
long lookup_nr = PAGEVEC_SIZE;
|
||||
bool truncate_op = (lend == LLONG_MAX);
|
||||
|
||||
memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
|
||||
pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
|
||||
pagevec_init(&pvec, 0);
|
||||
next = start;
|
||||
while (1) {
|
||||
if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
|
||||
while (next < end) {
|
||||
/*
|
||||
* Make sure to never grab more pages than we
|
||||
* might possibly need.
|
||||
*/
|
||||
if (end - next < lookup_nr)
|
||||
lookup_nr = end - next;
|
||||
|
||||
/*
|
||||
* This pagevec_lookup() may return pages past 'end',
|
||||
* so we must check for page->index >= end.
|
||||
*/
|
||||
if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) {
|
||||
if (next == start)
|
||||
break;
|
||||
next = start;
|
||||
@ -321,26 +380,69 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
|
||||
|
||||
for (i = 0; i < pagevec_count(&pvec); ++i) {
|
||||
struct page *page = pvec.pages[i];
|
||||
u32 hash;
|
||||
|
||||
hash = hugetlb_fault_mutex_hash(h, current->mm,
|
||||
&pseudo_vma,
|
||||
mapping, next, 0);
|
||||
mutex_lock(&hugetlb_fault_mutex_table[hash]);
|
||||
|
||||
lock_page(page);
|
||||
if (page->index >= end) {
|
||||
unlock_page(page);
|
||||
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
|
||||
next = end; /* we are done */
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* If page is mapped, it was faulted in after being
|
||||
* unmapped. Do nothing in this race case. In the
|
||||
* normal case page is not mapped.
|
||||
*/
|
||||
if (!page_mapped(page)) {
|
||||
bool rsv_on_error = !PagePrivate(page);
|
||||
/*
|
||||
* We must free the huge page and remove
|
||||
* from page cache (remove_huge_page) BEFORE
|
||||
* removing the region/reserve map
|
||||
* (hugetlb_unreserve_pages). In rare out
|
||||
* of memory conditions, removal of the
|
||||
* region/reserve map could fail. Before
|
||||
* free'ing the page, note PagePrivate which
|
||||
* is used in case of error.
|
||||
*/
|
||||
remove_huge_page(page);
|
||||
freed++;
|
||||
if (!truncate_op) {
|
||||
if (unlikely(hugetlb_unreserve_pages(
|
||||
inode, next,
|
||||
next + 1, 1)))
|
||||
hugetlb_fix_reserve_counts(
|
||||
inode, rsv_on_error);
|
||||
}
|
||||
}
|
||||
|
||||
if (page->index > next)
|
||||
next = page->index;
|
||||
|
||||
++next;
|
||||
truncate_huge_page(page);
|
||||
unlock_page(page);
|
||||
freed++;
|
||||
|
||||
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
|
||||
}
|
||||
huge_pagevec_release(&pvec);
|
||||
}
|
||||
BUG_ON(!lstart && mapping->nrpages);
|
||||
hugetlb_unreserve_pages(inode, start, freed);
|
||||
|
||||
if (truncate_op)
|
||||
(void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
|
||||
}
|
||||
|
||||
static void hugetlbfs_evict_inode(struct inode *inode)
|
||||
{
|
||||
struct resv_map *resv_map;
|
||||
|
||||
truncate_hugepages(inode, 0);
|
||||
remove_inode_hugepages(inode, 0, LLONG_MAX);
|
||||
resv_map = (struct resv_map *)inode->i_mapping->private_data;
|
||||
/* root inode doesn't have the resv_map, so we should check it */
|
||||
if (resv_map)
|
||||
@ -349,11 +451,15 @@ static void hugetlbfs_evict_inode(struct inode *inode)
|
||||
}
|
||||
|
||||
static inline void
|
||||
hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
|
||||
hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
|
||||
{
|
||||
struct vm_area_struct *vma;
|
||||
|
||||
vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) {
|
||||
/*
|
||||
* end == 0 indicates that the entire range after
|
||||
* start should be unmapped.
|
||||
*/
|
||||
vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
|
||||
unsigned long v_offset;
|
||||
|
||||
/*
|
||||
@ -362,13 +468,20 @@ hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
|
||||
* which overlap the truncated area starting at pgoff,
|
||||
* and no vma on a 32-bit arch can span beyond the 4GB.
|
||||
*/
|
||||
if (vma->vm_pgoff < pgoff)
|
||||
v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT;
|
||||
if (vma->vm_pgoff < start)
|
||||
v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
|
||||
else
|
||||
v_offset = 0;
|
||||
|
||||
unmap_hugepage_range(vma, vma->vm_start + v_offset,
|
||||
vma->vm_end, NULL);
|
||||
if (end) {
|
||||
end = ((end - start) << PAGE_SHIFT) +
|
||||
vma->vm_start + v_offset;
|
||||
if (end > vma->vm_end)
|
||||
end = vma->vm_end;
|
||||
} else
|
||||
end = vma->vm_end;
|
||||
|
||||
unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
@ -384,12 +497,164 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
|
||||
i_size_write(inode, offset);
|
||||
i_mmap_lock_write(mapping);
|
||||
if (!RB_EMPTY_ROOT(&mapping->i_mmap))
|
||||
hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
|
||||
hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
|
||||
i_mmap_unlock_write(mapping);
|
||||
truncate_hugepages(inode, offset);
|
||||
remove_inode_hugepages(inode, offset, LLONG_MAX);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Punch a hole in a hugetlbfs file.
 *
 * Only huge pages lying entirely inside [offset, offset + len) are removed:
 * the start of the hole is rounded up and the end rounded down to the huge
 * page size.  If no whole huge page is covered nothing is done.  Always
 * returns 0.
 */
static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
	struct hstate *h = hstate_inode(inode);
	struct address_space *mapping = inode->i_mapping;
	const loff_t hpage_size = huge_page_size(h);
	const loff_t hole_start = round_up(offset, hpage_size);
	const loff_t hole_end = round_down(offset + len, hpage_size);

	/* Range does not contain a complete huge page. */
	if (hole_end <= hole_start)
		return 0;

	mutex_lock(&inode->i_mutex);

	/*
	 * First unmap the range from any vma mapping the file (offsets are
	 * passed in PAGE_SIZE units), then drop the pages themselves.
	 */
	i_mmap_lock_write(mapping);
	if (!RB_EMPTY_ROOT(&mapping->i_mmap))
		hugetlb_vmdelete_list(&mapping->i_mmap,
					hole_start >> PAGE_SHIFT,
					hole_end >> PAGE_SHIFT);
	i_mmap_unlock_write(mapping);

	remove_inode_hugepages(inode, hole_start, hole_end);

	mutex_unlock(&inode->i_mutex);

	return 0;
}
|
||||
|
||||
/*
 * fallocate() implementation for hugetlbfs.
 *
 * @mode may contain FALLOC_FL_KEEP_SIZE and/or FALLOC_FL_PUNCH_HOLE; any
 * other bit yields -EOPNOTSUPP.  Hole punching is delegated to
 * hugetlbfs_punch_hole().  Otherwise every huge page index covering
 * [offset, offset + len) that is not already in the page cache gets a
 * zeroed, uptodate huge page added to it.
 *
 * Returns 0 on success, -EINTR if a signal is pending, or the error from
 * inode_newsize_ok(), alloc_huge_page() or huge_add_to_page_cache().
 */
static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
				loff_t len)
{
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	struct hstate *h = hstate_inode(inode);
	struct vm_area_struct pseudo_vma;
	struct mm_struct *mm = current->mm;
	loff_t hpage_size = huge_page_size(h);
	unsigned long hpage_shift = huge_page_shift(h);
	pgoff_t start, index, end;
	int error;
	u32 hash;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		return hugetlbfs_punch_hole(inode, offset, len);

	/*
	 * Default preallocate case.
	 * For this range, start is rounded down and end is rounded up
	 * as well as being converted to page offsets.
	 */
	start = offset >> hpage_shift;
	end = (offset + len + hpage_size - 1) >> hpage_shift;

	mutex_lock(&inode->i_mutex);

	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
	error = inode_newsize_ok(inode, offset + len);
	if (error)
		goto out;

	/*
	 * Initialize a pseudo vma as this is required by the huge page
	 * allocation routines.  If NUMA is configured, use page index
	 * as input to create an allocation policy.
	 */
	memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
	pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
	pseudo_vma.vm_file = file;

	for (index = start; index < end; index++) {
		struct page *page;
		/*
		 * This is supposed to be the vaddr where the page is being
		 * faulted in, but we have no vaddr here.
		 */
		unsigned long addr;
		int avoid_reserve = 0;

		cond_resched();

		/*
		 * fallocate(2) manpage permits EINTR; we may have been
		 * interrupted because we are using up too much memory.
		 */
		if (signal_pending(current)) {
			error = -EINTR;
			break;
		}

		/* Set numa allocation policy based on index */
		hugetlb_set_vma_policy(&pseudo_vma, inode, index);

		/* addr is the offset within the file (zero based) */
		addr = index * hpage_size;

		/* mutex taken here, fault path and hole punch */
		hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
						index, addr);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);

		/* See if already present in mapping to avoid alloc/free */
		page = find_get_page(mapping, index);
		if (page) {
			put_page(page);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			hugetlb_drop_vma_policy(&pseudo_vma);
			continue;
		}

		/* Allocate page and add to page cache */
		page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve);
		hugetlb_drop_vma_policy(&pseudo_vma);
		if (IS_ERR(page)) {
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			error = PTR_ERR(page);
			goto out;
		}
		clear_huge_page(page, addr, pages_per_huge_page(h));
		__SetPageUptodate(page);
		error = huge_add_to_page_cache(page, mapping, index);
		if (unlikely(error)) {
			put_page(page);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out;
		}

		mutex_unlock(&hugetlb_fault_mutex_table[hash]);

		/*
		 * unlock_page because locked by add_to_page_cache()
		 * put_page due to reference from alloc_huge_page()
		 *
		 * Unlock before dropping our reference so the page is never
		 * touched after the reference taken for us is gone.
		 */
		unlock_page(page);
		put_page(page);
	}

	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
		i_size_write(inode, offset + len);
	inode->i_ctime = CURRENT_TIME;
	spin_lock(&inode->i_lock);
	inode->i_private = NULL;
	spin_unlock(&inode->i_lock);
out:
	mutex_unlock(&inode->i_mutex);
	return error;
}
|
||||
|
||||
static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
|
||||
{
|
||||
struct inode *inode = d_inode(dentry);
|
||||
@ -701,7 +966,8 @@ const struct file_operations hugetlbfs_file_operations = {
|
||||
.mmap = hugetlbfs_file_mmap,
|
||||
.fsync = noop_fsync,
|
||||
.get_unmapped_area = hugetlb_get_unmapped_area,
|
||||
.llseek = default_llseek,
|
||||
.llseek = default_llseek,
|
||||
.fallocate = hugetlbfs_fallocate,
|
||||
};
|
||||
|
||||
static const struct inode_operations hugetlbfs_dir_inode_operations = {
|
||||
|
@ -446,6 +446,7 @@ struct mem_size_stats {
|
||||
unsigned long anonymous_thp;
|
||||
unsigned long swap;
|
||||
u64 pss;
|
||||
u64 swap_pss;
|
||||
};
|
||||
|
||||
static void smaps_account(struct mem_size_stats *mss, struct page *page,
|
||||
@ -492,9 +493,20 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
|
||||
} else if (is_swap_pte(*pte)) {
|
||||
swp_entry_t swpent = pte_to_swp_entry(*pte);
|
||||
|
||||
if (!non_swap_entry(swpent))
|
||||
if (!non_swap_entry(swpent)) {
|
||||
int mapcount;
|
||||
|
||||
mss->swap += PAGE_SIZE;
|
||||
else if (is_migration_entry(swpent))
|
||||
mapcount = swp_swapcount(swpent);
|
||||
if (mapcount >= 2) {
|
||||
u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
|
||||
|
||||
do_div(pss_delta, mapcount);
|
||||
mss->swap_pss += pss_delta;
|
||||
} else {
|
||||
mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
|
||||
}
|
||||
} else if (is_migration_entry(swpent))
|
||||
page = migration_entry_to_page(swpent);
|
||||
}
|
||||
|
||||
@ -640,6 +652,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
|
||||
"Anonymous: %8lu kB\n"
|
||||
"AnonHugePages: %8lu kB\n"
|
||||
"Swap: %8lu kB\n"
|
||||
"SwapPss: %8lu kB\n"
|
||||
"KernelPageSize: %8lu kB\n"
|
||||
"MMUPageSize: %8lu kB\n"
|
||||
"Locked: %8lu kB\n",
|
||||
@ -654,6 +667,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
|
||||
mss.anonymous >> 10,
|
||||
mss.anonymous_thp >> 10,
|
||||
mss.swap >> 10,
|
||||
(unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
|
||||
vma_kernel_pagesize(vma) >> 10,
|
||||
vma_mmu_pagesize(vma) >> 10,
|
||||
(vma->vm_flags & VM_LOCKED) ?
|
||||
@ -712,23 +726,6 @@ const struct file_operations proc_tid_smaps_operations = {
|
||||
.release = proc_map_release,
|
||||
};
|
||||
|
||||
/*
|
||||
* We do not want to have constant page-shift bits sitting in
|
||||
* pagemap entries and are about to reuse them some time soon.
|
||||
*
|
||||
* Here's the "migration strategy":
|
||||
* 1. when the system boots these bits remain what they are,
|
||||
* but a warning about future change is printed in log;
|
||||
* 2. once anyone clears soft-dirty bits via clear_refs file,
|
||||
* these flag is set to denote, that user is aware of the
|
||||
* new API and those page-shift bits change their meaning.
|
||||
* The respective warning is printed in dmesg;
|
||||
* 3. In a couple of releases we will remove all the mentions
|
||||
* of page-shift in pagemap entries.
|
||||
*/
|
||||
|
||||
static bool soft_dirty_cleared __read_mostly;
|
||||
|
||||
enum clear_refs_types {
|
||||
CLEAR_REFS_ALL = 1,
|
||||
CLEAR_REFS_ANON,
|
||||
@ -889,13 +886,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
|
||||
if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
|
||||
return -EINVAL;
|
||||
|
||||
if (type == CLEAR_REFS_SOFT_DIRTY) {
|
||||
soft_dirty_cleared = true;
|
||||
pr_warn_once("The pagemap bits 55-60 has changed their meaning!"
|
||||
" See the linux/Documentation/vm/pagemap.txt for "
|
||||
"details.\n");
|
||||
}
|
||||
|
||||
task = get_proc_task(file_inode(file));
|
||||
if (!task)
|
||||
return -ESRCH;
|
||||
@ -963,36 +953,26 @@ typedef struct {
|
||||
struct pagemapread {
|
||||
int pos, len; /* units: PM_ENTRY_BYTES, not bytes */
|
||||
pagemap_entry_t *buffer;
|
||||
bool v2;
|
||||
bool show_pfn;
|
||||
};
|
||||
|
||||
#define PAGEMAP_WALK_SIZE (PMD_SIZE)
|
||||
#define PAGEMAP_WALK_MASK (PMD_MASK)
|
||||
|
||||
#define PM_ENTRY_BYTES sizeof(pagemap_entry_t)
|
||||
#define PM_STATUS_BITS 3
|
||||
#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
|
||||
#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
|
||||
#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
|
||||
#define PM_PSHIFT_BITS 6
|
||||
#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
|
||||
#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
|
||||
#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
|
||||
#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
|
||||
#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
|
||||
/* in "new" pagemap pshift bits are occupied with more status bits */
|
||||
#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
|
||||
#define PM_ENTRY_BYTES sizeof(pagemap_entry_t)
|
||||
#define PM_PFRAME_BITS 55
|
||||
#define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
|
||||
#define PM_SOFT_DIRTY BIT_ULL(55)
|
||||
#define PM_MMAP_EXCLUSIVE BIT_ULL(56)
|
||||
#define PM_FILE BIT_ULL(61)
|
||||
#define PM_SWAP BIT_ULL(62)
|
||||
#define PM_PRESENT BIT_ULL(63)
|
||||
|
||||
#define __PM_SOFT_DIRTY (1LL)
|
||||
#define PM_PRESENT PM_STATUS(4LL)
|
||||
#define PM_SWAP PM_STATUS(2LL)
|
||||
#define PM_FILE PM_STATUS(1LL)
|
||||
#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0)
|
||||
#define PM_END_OF_BUFFER 1
|
||||
|
||||
static inline pagemap_entry_t make_pme(u64 val)
|
||||
static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
|
||||
{
|
||||
return (pagemap_entry_t) { .pme = val };
|
||||
return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
|
||||
}
|
||||
|
||||
static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
|
||||
@ -1013,7 +993,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
|
||||
|
||||
while (addr < end) {
|
||||
struct vm_area_struct *vma = find_vma(walk->mm, addr);
|
||||
pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
|
||||
pagemap_entry_t pme = make_pme(0, 0);
|
||||
/* End of address space hole, which we mark as non-present. */
|
||||
unsigned long hole_end;
|
||||
|
||||
@ -1033,7 +1013,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
|
||||
|
||||
/* Addresses in the VMA. */
|
||||
if (vma->vm_flags & VM_SOFTDIRTY)
|
||||
pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY);
|
||||
pme = make_pme(0, PM_SOFT_DIRTY);
|
||||
for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
|
||||
err = add_to_pagemap(addr, &pme, pm);
|
||||
if (err)
|
||||
@ -1044,67 +1024,42 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
|
||||
return err;
|
||||
}
|
||||
|
||||
static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
|
||||
static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
|
||||
struct vm_area_struct *vma, unsigned long addr, pte_t pte)
|
||||
{
|
||||
u64 frame, flags;
|
||||
u64 frame = 0, flags = 0;
|
||||
struct page *page = NULL;
|
||||
int flags2 = 0;
|
||||
|
||||
if (pte_present(pte)) {
|
||||
frame = pte_pfn(pte);
|
||||
flags = PM_PRESENT;
|
||||
if (pm->show_pfn)
|
||||
frame = pte_pfn(pte);
|
||||
flags |= PM_PRESENT;
|
||||
page = vm_normal_page(vma, addr, pte);
|
||||
if (pte_soft_dirty(pte))
|
||||
flags2 |= __PM_SOFT_DIRTY;
|
||||
flags |= PM_SOFT_DIRTY;
|
||||
} else if (is_swap_pte(pte)) {
|
||||
swp_entry_t entry;
|
||||
if (pte_swp_soft_dirty(pte))
|
||||
flags2 |= __PM_SOFT_DIRTY;
|
||||
flags |= PM_SOFT_DIRTY;
|
||||
entry = pte_to_swp_entry(pte);
|
||||
frame = swp_type(entry) |
|
||||
(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
|
||||
flags = PM_SWAP;
|
||||
flags |= PM_SWAP;
|
||||
if (is_migration_entry(entry))
|
||||
page = migration_entry_to_page(entry);
|
||||
} else {
|
||||
if (vma->vm_flags & VM_SOFTDIRTY)
|
||||
flags2 |= __PM_SOFT_DIRTY;
|
||||
*pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
|
||||
return;
|
||||
}
|
||||
|
||||
if (page && !PageAnon(page))
|
||||
flags |= PM_FILE;
|
||||
if ((vma->vm_flags & VM_SOFTDIRTY))
|
||||
flags2 |= __PM_SOFT_DIRTY;
|
||||
if (page && page_mapcount(page) == 1)
|
||||
flags |= PM_MMAP_EXCLUSIVE;
|
||||
if (vma->vm_flags & VM_SOFTDIRTY)
|
||||
flags |= PM_SOFT_DIRTY;
|
||||
|
||||
*pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
|
||||
return make_pme(frame, flags);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
|
||||
pmd_t pmd, int offset, int pmd_flags2)
|
||||
{
|
||||
/*
|
||||
* Currently pmd for thp is always present because thp can not be
|
||||
* swapped-out, migrated, or HWPOISONed (split in such cases instead.)
|
||||
* This if-check is just to prepare for future implementation.
|
||||
*/
|
||||
if (pmd_present(pmd))
|
||||
*pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
|
||||
| PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
|
||||
else
|
||||
*pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2));
|
||||
}
|
||||
#else
|
||||
static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
|
||||
pmd_t pmd, int offset, int pmd_flags2)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
|
||||
static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
|
||||
struct mm_walk *walk)
|
||||
{
|
||||
struct vm_area_struct *vma = walk->vma;
|
||||
@ -1113,41 +1068,58 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
|
||||
pte_t *pte, *orig_pte;
|
||||
int err = 0;
|
||||
|
||||
if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
|
||||
int pmd_flags2;
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) {
|
||||
u64 flags = 0, frame = 0;
|
||||
pmd_t pmd = *pmdp;
|
||||
|
||||
if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
|
||||
pmd_flags2 = __PM_SOFT_DIRTY;
|
||||
else
|
||||
pmd_flags2 = 0;
|
||||
if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd))
|
||||
flags |= PM_SOFT_DIRTY;
|
||||
|
||||
/*
|
||||
* Currently pmd for thp is always present because thp
|
||||
* can not be swapped-out, migrated, or HWPOISONed
|
||||
* (split in such cases instead.)
|
||||
* This if-check is just to prepare for future implementation.
|
||||
*/
|
||||
if (pmd_present(pmd)) {
|
||||
struct page *page = pmd_page(pmd);
|
||||
|
||||
if (page_mapcount(page) == 1)
|
||||
flags |= PM_MMAP_EXCLUSIVE;
|
||||
|
||||
flags |= PM_PRESENT;
|
||||
if (pm->show_pfn)
|
||||
frame = pmd_pfn(pmd) +
|
||||
((addr & ~PMD_MASK) >> PAGE_SHIFT);
|
||||
}
|
||||
|
||||
for (; addr != end; addr += PAGE_SIZE) {
|
||||
unsigned long offset;
|
||||
pagemap_entry_t pme;
|
||||
pagemap_entry_t pme = make_pme(frame, flags);
|
||||
|
||||
offset = (addr & ~PAGEMAP_WALK_MASK) >>
|
||||
PAGE_SHIFT;
|
||||
thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
|
||||
err = add_to_pagemap(addr, &pme, pm);
|
||||
if (err)
|
||||
break;
|
||||
if (pm->show_pfn && (flags & PM_PRESENT))
|
||||
frame++;
|
||||
}
|
||||
spin_unlock(ptl);
|
||||
return err;
|
||||
}
|
||||
|
||||
if (pmd_trans_unstable(pmd))
|
||||
if (pmd_trans_unstable(pmdp))
|
||||
return 0;
|
||||
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
||||
|
||||
/*
|
||||
* We can assume that @vma always points to a valid one and @end never
|
||||
* goes beyond vma->vm_end.
|
||||
*/
|
||||
orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
|
||||
orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
|
||||
for (; addr < end; pte++, addr += PAGE_SIZE) {
|
||||
pagemap_entry_t pme;
|
||||
|
||||
pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
|
||||
pme = pte_to_pagemap_entry(pm, vma, addr, *pte);
|
||||
err = add_to_pagemap(addr, &pme, pm);
|
||||
if (err)
|
||||
break;
|
||||
@ -1160,40 +1132,44 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
|
||||
}
|
||||
|
||||
#ifdef CONFIG_HUGETLB_PAGE
|
||||
static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
|
||||
pte_t pte, int offset, int flags2)
|
||||
{
|
||||
if (pte_present(pte))
|
||||
*pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) |
|
||||
PM_STATUS2(pm->v2, flags2) |
|
||||
PM_PRESENT);
|
||||
else
|
||||
*pme = make_pme(PM_NOT_PRESENT(pm->v2) |
|
||||
PM_STATUS2(pm->v2, flags2));
|
||||
}
|
||||
|
||||
/* This function walks within one hugetlb entry in the single call */
|
||||
static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
|
||||
static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
|
||||
unsigned long addr, unsigned long end,
|
||||
struct mm_walk *walk)
|
||||
{
|
||||
struct pagemapread *pm = walk->private;
|
||||
struct vm_area_struct *vma = walk->vma;
|
||||
u64 flags = 0, frame = 0;
|
||||
int err = 0;
|
||||
int flags2;
|
||||
pagemap_entry_t pme;
|
||||
pte_t pte;
|
||||
|
||||
if (vma->vm_flags & VM_SOFTDIRTY)
|
||||
flags2 = __PM_SOFT_DIRTY;
|
||||
else
|
||||
flags2 = 0;
|
||||
flags |= PM_SOFT_DIRTY;
|
||||
|
||||
pte = huge_ptep_get(ptep);
|
||||
if (pte_present(pte)) {
|
||||
struct page *page = pte_page(pte);
|
||||
|
||||
if (!PageAnon(page))
|
||||
flags |= PM_FILE;
|
||||
|
||||
if (page_mapcount(page) == 1)
|
||||
flags |= PM_MMAP_EXCLUSIVE;
|
||||
|
||||
flags |= PM_PRESENT;
|
||||
if (pm->show_pfn)
|
||||
frame = pte_pfn(pte) +
|
||||
((addr & ~hmask) >> PAGE_SHIFT);
|
||||
}
|
||||
|
||||
for (; addr != end; addr += PAGE_SIZE) {
|
||||
int offset = (addr & ~hmask) >> PAGE_SHIFT;
|
||||
huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2);
|
||||
pagemap_entry_t pme = make_pme(frame, flags);
|
||||
|
||||
err = add_to_pagemap(addr, &pme, pm);
|
||||
if (err)
|
||||
return err;
|
||||
if (pm->show_pfn && (flags & PM_PRESENT))
|
||||
frame++;
|
||||
}
|
||||
|
||||
cond_resched();
|
||||
@ -1211,7 +1187,9 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
|
||||
* Bits 0-54 page frame number (PFN) if present
|
||||
* Bits 0-4 swap type if swapped
|
||||
* Bits 5-54 swap offset if swapped
|
||||
* Bits 55-60 page shift (page size = 1<<page shift)
|
||||
* Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
|
||||
* Bit 56 page exclusively mapped
|
||||
* Bits 57-60 zero
|
||||
* Bit 61 page is file-page or shared-anon
|
||||
* Bit 62 page swapped
|
||||
* Bit 63 page present
|
||||
@ -1229,42 +1207,37 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
|
||||
static ssize_t pagemap_read(struct file *file, char __user *buf,
|
||||
size_t count, loff_t *ppos)
|
||||
{
|
||||
struct task_struct *task = get_proc_task(file_inode(file));
|
||||
struct mm_struct *mm;
|
||||
struct mm_struct *mm = file->private_data;
|
||||
struct pagemapread pm;
|
||||
int ret = -ESRCH;
|
||||
struct mm_walk pagemap_walk = {};
|
||||
unsigned long src;
|
||||
unsigned long svpfn;
|
||||
unsigned long start_vaddr;
|
||||
unsigned long end_vaddr;
|
||||
int copied = 0;
|
||||
int ret = 0, copied = 0;
|
||||
|
||||
if (!task)
|
||||
if (!mm || !atomic_inc_not_zero(&mm->mm_users))
|
||||
goto out;
|
||||
|
||||
ret = -EINVAL;
|
||||
/* file position must be aligned */
|
||||
if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
|
||||
goto out_task;
|
||||
goto out_mm;
|
||||
|
||||
ret = 0;
|
||||
if (!count)
|
||||
goto out_task;
|
||||
goto out_mm;
|
||||
|
||||
/* do not disclose physical addresses: attack vector */
|
||||
pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);
|
||||
|
||||
pm.v2 = soft_dirty_cleared;
|
||||
pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
|
||||
pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY);
|
||||
ret = -ENOMEM;
|
||||
if (!pm.buffer)
|
||||
goto out_task;
|
||||
goto out_mm;
|
||||
|
||||
mm = mm_access(task, PTRACE_MODE_READ);
|
||||
ret = PTR_ERR(mm);
|
||||
if (!mm || IS_ERR(mm))
|
||||
goto out_free;
|
||||
|
||||
pagemap_walk.pmd_entry = pagemap_pte_range;
|
||||
pagemap_walk.pmd_entry = pagemap_pmd_range;
|
||||
pagemap_walk.pte_hole = pagemap_pte_hole;
|
||||
#ifdef CONFIG_HUGETLB_PAGE
|
||||
pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
|
||||
@ -1275,10 +1248,10 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
|
||||
src = *ppos;
|
||||
svpfn = src / PM_ENTRY_BYTES;
|
||||
start_vaddr = svpfn << PAGE_SHIFT;
|
||||
end_vaddr = TASK_SIZE_OF(task);
|
||||
end_vaddr = mm->task_size;
|
||||
|
||||
/* watch out for wraparound */
|
||||
if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT)
|
||||
if (svpfn > mm->task_size >> PAGE_SHIFT)
|
||||
start_vaddr = end_vaddr;
|
||||
|
||||
/*
|
||||
@ -1305,7 +1278,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
|
||||
len = min(count, PM_ENTRY_BYTES * pm.pos);
|
||||
if (copy_to_user(buf, pm.buffer, len)) {
|
||||
ret = -EFAULT;
|
||||
goto out_mm;
|
||||
goto out_free;
|
||||
}
|
||||
copied += len;
|
||||
buf += len;
|
||||
@ -1315,24 +1288,31 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
|
||||
if (!ret || ret == PM_END_OF_BUFFER)
|
||||
ret = copied;
|
||||
|
||||
out_mm:
|
||||
mmput(mm);
|
||||
out_free:
|
||||
kfree(pm.buffer);
|
||||
out_task:
|
||||
put_task_struct(task);
|
||||
out_mm:
|
||||
mmput(mm);
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int pagemap_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
/* do not disclose physical addresses: attack vector */
|
||||
if (!capable(CAP_SYS_ADMIN))
|
||||
return -EPERM;
|
||||
pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
|
||||
"to stop being page-shift some time soon. See the "
|
||||
"linux/Documentation/vm/pagemap.txt for details.\n");
|
||||
struct mm_struct *mm;
|
||||
|
||||
mm = proc_mem_open(inode, PTRACE_MODE_READ);
|
||||
if (IS_ERR(mm))
|
||||
return PTR_ERR(mm);
|
||||
file->private_data = mm;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int pagemap_release(struct inode *inode, struct file *file)
|
||||
{
|
||||
struct mm_struct *mm = file->private_data;
|
||||
|
||||
if (mm)
|
||||
mmdrop(mm);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1340,6 +1320,7 @@ const struct file_operations proc_pagemap_operations = {
|
||||
.llseek = mem_lseek, /* borrow this */
|
||||
.read = pagemap_read,
|
||||
.open = pagemap_open,
|
||||
.release = pagemap_release,
|
||||
};
|
||||
#endif /* CONFIG_PROC_PAGE_MONITOR */
|
||||
|
||||
|
@ -23,6 +23,7 @@
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/dax.h>
|
||||
#include <linux/buffer_head.h>
|
||||
#include <linux/uio.h>
|
||||
#include <linux/list_lru.h>
|
||||
|
@ -1546,8 +1546,36 @@ xfs_filemap_fault(
|
||||
return ret;
|
||||
}
|
||||
|
||||
STATIC int
|
||||
xfs_filemap_pmd_fault(
|
||||
struct vm_area_struct *vma,
|
||||
unsigned long addr,
|
||||
pmd_t *pmd,
|
||||
unsigned int flags)
|
||||
{
|
||||
struct inode *inode = file_inode(vma->vm_file);
|
||||
struct xfs_inode *ip = XFS_I(inode);
|
||||
int ret;
|
||||
|
||||
if (!IS_DAX(inode))
|
||||
return VM_FAULT_FALLBACK;
|
||||
|
||||
trace_xfs_filemap_pmd_fault(ip);
|
||||
|
||||
sb_start_pagefault(inode->i_sb);
|
||||
file_update_time(vma->vm_file);
|
||||
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
|
||||
ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_direct,
|
||||
xfs_end_io_dax_write);
|
||||
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
|
||||
sb_end_pagefault(inode->i_sb);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * VM operations table for XFS file mappings; xfs_file_mmap() installs it
 * with vma->vm_ops = &xfs_file_vm_ops.
 */
static const struct vm_operations_struct xfs_file_vm_ops = {
|
||||
.fault = xfs_filemap_fault,
|
||||
/* PMD-sized faults; the handler returns VM_FAULT_FALLBACK for non-DAX inodes */
.pmd_fault = xfs_filemap_pmd_fault,
|
||||
.map_pages = filemap_map_pages,
|
||||
.page_mkwrite = xfs_filemap_page_mkwrite,
|
||||
};
|
||||
@ -1560,7 +1588,7 @@ xfs_file_mmap(
|
||||
file_accessed(filp);
|
||||
vma->vm_ops = &xfs_file_vm_ops;
|
||||
if (IS_DAX(file_inode(filp)))
|
||||
vma->vm_flags |= VM_MIXEDMAP;
|
||||
vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -687,6 +687,7 @@ DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
|
||||
DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
|
||||
|
||||
DEFINE_INODE_EVENT(xfs_filemap_fault);
|
||||
DEFINE_INODE_EVENT(xfs_filemap_pmd_fault);
|
||||
DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
|
||||
|
||||
DECLARE_EVENT_CLASS(xfs_iref_class,
|
||||
|
@ -35,6 +35,12 @@ extern void early_ioremap_setup(void);
|
||||
*/
|
||||
extern void early_ioremap_reset(void);
|
||||
|
||||
/*
|
||||
* Early copy from unmapped memory to kernel mapped memory.
|
||||
*/
|
||||
extern void copy_from_early_mem(void *dest, phys_addr_t src,
|
||||
unsigned long size);
|
||||
|
||||
#else
|
||||
static inline void early_ioremap_init(void) { }
|
||||
static inline void early_ioremap_setup(void) { }
|
||||
|
39
include/linux/dax.h
Normal file
39
include/linux/dax.h
Normal file
@ -0,0 +1,39 @@
|
||||
#ifndef _LINUX_DAX_H
|
||||
#define _LINUX_DAX_H
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/mm.h>
|
||||
#include <asm/pgtable.h>
|
||||
|
||||
ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
|
||||
get_block_t, dio_iodone_t, int flags);
|
||||
int dax_clear_blocks(struct inode *, sector_t block, long size);
|
||||
int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
|
||||
int dax_truncate_page(struct inode *, loff_t from, get_block_t);
|
||||
int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
|
||||
dax_iodone_t);
|
||||
int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
|
||||
dax_iodone_t);
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
|
||||
unsigned int flags, get_block_t, dax_iodone_t);
|
||||
int __dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
|
||||
unsigned int flags, get_block_t, dax_iodone_t);
|
||||
#else
|
||||
/*
 * Stub used when CONFIG_TRANSPARENT_HUGEPAGE is not set: ignores all of its
 * arguments and always returns VM_FAULT_FALLBACK.
 */
static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
|
||||
pmd_t *pmd, unsigned int flags, get_block_t gb,
|
||||
dax_iodone_t di)
|
||||
{
|
||||
return VM_FAULT_FALLBACK;
|
||||
}
|
||||
#define __dax_pmd_fault dax_pmd_fault
|
||||
#endif
|
||||
int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
|
||||
#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod)
|
||||
#define __dax_mkwrite(vma, vmf, gb, iod) __dax_fault(vma, vmf, gb, iod)
|
||||
|
||||
static inline bool vma_is_dax(struct vm_area_struct *vma)
|
||||
{
|
||||
return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
|
||||
}
|
||||
#endif
|
@ -24,6 +24,12 @@ void dma_pool_destroy(struct dma_pool *pool);
|
||||
void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
|
||||
dma_addr_t *handle);
|
||||
|
||||
static inline void *dma_pool_zalloc(struct dma_pool *pool, gfp_t mem_flags,
|
||||
dma_addr_t *handle)
|
||||
{
|
||||
return dma_pool_alloc(pool, mem_flags | __GFP_ZERO, handle);
|
||||
}
|
||||
|
||||
void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t addr);
|
||||
|
||||
/*
|
||||
|
@ -52,7 +52,6 @@ struct swap_info_struct;
|
||||
struct seq_file;
|
||||
struct workqueue_struct;
|
||||
struct iov_iter;
|
||||
struct vm_fault;
|
||||
|
||||
extern void __init inode_init(void);
|
||||
extern void __init inode_init_early(void);
|
||||
@ -2678,19 +2677,6 @@ extern loff_t fixed_size_llseek(struct file *file, loff_t offset,
|
||||
extern int generic_file_open(struct inode * inode, struct file * filp);
|
||||
extern int nonseekable_open(struct inode * inode, struct file * filp);
|
||||
|
||||
ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
|
||||
get_block_t, dio_iodone_t, int flags);
|
||||
int dax_clear_blocks(struct inode *, sector_t block, long size);
|
||||
int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
|
||||
int dax_truncate_page(struct inode *, loff_t from, get_block_t);
|
||||
int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
|
||||
dax_iodone_t);
|
||||
int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
|
||||
dax_iodone_t);
|
||||
int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
|
||||
#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod)
|
||||
#define __dax_mkwrite(vma, vmf, gb, iod) __dax_fault(vma, vmf, gb, iod)
|
||||
|
||||
#ifdef CONFIG_BLOCK
|
||||
typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
|
||||
loff_t file_offset);
|
||||
|
@ -63,7 +63,10 @@ struct vm_area_struct;
|
||||
* but it is definitely preferable to use the flag rather than opencode endless
|
||||
* loop around allocator.
|
||||
*
|
||||
* __GFP_NORETRY: The VM implementation must not retry indefinitely.
|
||||
* __GFP_NORETRY: The VM implementation must not retry indefinitely and will
|
||||
* return NULL when direct reclaim and memory compaction have failed to allow
|
||||
* the allocation to succeed. The OOM killer is not called with the current
|
||||
* implementation.
|
||||
*
|
||||
* __GFP_MOVABLE: Flag that this page will be movable by the page migration
|
||||
* mechanism or reclaimed
|
||||
@ -300,22 +303,31 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
|
||||
return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
|
||||
}
|
||||
|
||||
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
|
||||
unsigned int order)
|
||||
/*
|
||||
* Allocate pages, preferring the node given as nid. The node must be valid and
|
||||
* online. For a more general interface, see alloc_pages_node().
|
||||
*/
|
||||
static inline struct page *
|
||||
__alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
|
||||
{
|
||||
/* Unknown node is current node */
|
||||
if (nid < 0)
|
||||
nid = numa_node_id();
|
||||
VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
|
||||
VM_WARN_ON(!node_online(nid));
|
||||
|
||||
return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
|
||||
}
|
||||
|
||||
static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask,
|
||||
/*
|
||||
* Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE,
|
||||
* prefer the current CPU's closest node. Otherwise node must be valid and
|
||||
* online.
|
||||
*/
|
||||
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
|
||||
unsigned int order)
|
||||
{
|
||||
VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES || !node_online(nid));
|
||||
if (nid == NUMA_NO_NODE)
|
||||
nid = numa_mem_id();
|
||||
|
||||
return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
|
||||
return __alloc_pages_node(nid, gfp_mask, order);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NUMA
|
||||
@ -354,7 +366,6 @@ extern unsigned long get_zeroed_page(gfp_t gfp_mask);
|
||||
|
||||
void *alloc_pages_exact(size_t size, gfp_t gfp_mask);
|
||||
void free_pages_exact(void *virt, size_t size);
|
||||
/* This is different from alloc_pages_exact_node !!! */
|
||||
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
|
||||
|
||||
#define __get_free_page(gfp_mask) \
|
||||
|
@ -33,6 +33,8 @@ extern int move_huge_pmd(struct vm_area_struct *vma,
|
||||
extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
unsigned long addr, pgprot_t newprot,
|
||||
int prot_numa);
|
||||
int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *,
|
||||
unsigned long pfn, bool write);
|
||||
|
||||
enum transparent_hugepage_flag {
|
||||
TRANSPARENT_HUGEPAGE_FLAG,
|
||||
@ -122,7 +124,7 @@ extern void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
|
||||
#endif
|
||||
extern int hugepage_madvise(struct vm_area_struct *vma,
|
||||
unsigned long *vm_flags, int advice);
|
||||
extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
|
||||
extern void vma_adjust_trans_huge(struct vm_area_struct *vma,
|
||||
unsigned long start,
|
||||
unsigned long end,
|
||||
long adjust_next);
|
||||
@ -138,15 +140,6 @@ static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
|
||||
unsigned long start,
|
||||
unsigned long end,
|
||||
long adjust_next)
|
||||
{
|
||||
if (!vma->anon_vma || vma->vm_ops)
|
||||
return;
|
||||
__vma_adjust_trans_huge(vma, start, end, adjust_next);
|
||||
}
|
||||
static inline int hpage_nr_pages(struct page *page)
|
||||
{
|
||||
if (unlikely(PageTransHuge(page)))
|
||||
@ -164,6 +157,13 @@ static inline bool is_huge_zero_page(struct page *page)
|
||||
return ACCESS_ONCE(huge_zero_page) == page;
|
||||
}
|
||||
|
||||
/*
 * Report whether @pmd maps the huge zero page: looks up the page behind the
 * pmd with pmd_page() and hands it to is_huge_zero_page().
 */
static inline bool is_huge_zero_pmd(pmd_t pmd)
{
	struct page *mapped = pmd_page(pmd);

	return is_huge_zero_page(mapped);
}
|
||||
|
||||
struct page *get_huge_zero_page(void);
|
||||
|
||||
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
|
||||
#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
|
||||
#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
|
||||
|
@ -35,6 +35,9 @@ struct resv_map {
|
||||
struct kref refs;
|
||||
spinlock_t lock;
|
||||
struct list_head regions;
|
||||
long adds_in_progress;
|
||||
struct list_head region_cache;
|
||||
long region_cache_count;
|
||||
};
|
||||
extern struct resv_map *resv_map_alloc(void);
|
||||
void resv_map_release(struct kref *ref);
|
||||
@ -80,11 +83,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
int hugetlb_reserve_pages(struct inode *inode, long from, long to,
|
||||
struct vm_area_struct *vma,
|
||||
vm_flags_t vm_flags);
|
||||
void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
|
||||
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
|
||||
long freed);
|
||||
int dequeue_hwpoisoned_huge_page(struct page *page);
|
||||
bool isolate_huge_page(struct page *page, struct list_head *list);
|
||||
void putback_active_hugepage(struct page *page);
|
||||
void free_huge_page(struct page *page);
|
||||
void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve);
|
||||
extern struct mutex *hugetlb_fault_mutex_table;
|
||||
u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
|
||||
struct vm_area_struct *vma,
|
||||
struct address_space *mapping,
|
||||
pgoff_t idx, unsigned long address);
|
||||
|
||||
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
|
||||
pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
|
||||
@ -320,9 +330,13 @@ struct huge_bootmem_page {
|
||||
#endif
|
||||
};
|
||||
|
||||
struct page *alloc_huge_page(struct vm_area_struct *vma,
|
||||
unsigned long addr, int avoid_reserve);
|
||||
struct page *alloc_huge_page_node(struct hstate *h, int nid);
|
||||
struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
|
||||
unsigned long addr, int avoid_reserve);
|
||||
int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
|
||||
pgoff_t idx);
|
||||
|
||||
/* arch callback */
|
||||
int __init alloc_bootmem_huge_page(struct hstate *h);
|
||||
@ -471,6 +485,7 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
|
||||
|
||||
#else /* CONFIG_HUGETLB_PAGE */
|
||||
struct hstate {};
|
||||
#define alloc_huge_page(v, a, r) NULL
|
||||
#define alloc_huge_page_node(h, nid) NULL
|
||||
#define alloc_huge_page_noerr(v, a, r) NULL
|
||||
#define alloc_bootmem_huge_page(h) NULL
|
||||
|
@ -77,6 +77,8 @@ int memblock_remove(phys_addr_t base, phys_addr_t size);
|
||||
int memblock_free(phys_addr_t base, phys_addr_t size);
|
||||
int memblock_reserve(phys_addr_t base, phys_addr_t size);
|
||||
void memblock_trim_memory(phys_addr_t align);
|
||||
bool memblock_overlaps_region(struct memblock_type *type,
|
||||
phys_addr_t base, phys_addr_t size);
|
||||
int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size);
|
||||
int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
|
||||
int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
|
||||
@ -323,7 +325,7 @@ void memblock_enforce_memory_limit(phys_addr_t memory_limit);
|
||||
int memblock_is_memory(phys_addr_t addr);
|
||||
int memblock_is_region_memory(phys_addr_t base, phys_addr_t size);
|
||||
int memblock_is_reserved(phys_addr_t addr);
|
||||
int memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
|
||||
bool memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
|
||||
|
||||
extern void __memblock_dump_all(void);
|
||||
|
||||
|
@ -23,6 +23,11 @@
|
||||
#include <linux/vm_event_item.h>
|
||||
#include <linux/hardirq.h>
|
||||
#include <linux/jump_label.h>
|
||||
#include <linux/page_counter.h>
|
||||
#include <linux/vmpressure.h>
|
||||
#include <linux/eventfd.h>
|
||||
#include <linux/mmzone.h>
|
||||
#include <linux/writeback.h>
|
||||
|
||||
struct mem_cgroup;
|
||||
struct page;
|
||||
@ -67,12 +72,221 @@ enum mem_cgroup_events_index {
|
||||
MEMCG_NR_EVENTS,
|
||||
};
|
||||
|
||||
/*
|
||||
* The per-memcg event counter is incremented at every pagein/pageout. With THP,
|
||||
* it will be incremented by the number of pages. This counter is used to
|
||||
* trigger some periodic events. This is straightforward and better
|
||||
* than using jiffies etc. to handle periodic memcg events.
*
* MEM_CGROUP_NTARGETS sizes the targets[] array in struct mem_cgroup_stat_cpu;
* the other enumerators presumably index that array - TODO confirm.
|
||||
*/
|
||||
enum mem_cgroup_events_target {
|
||||
MEM_CGROUP_TARGET_THRESH,
|
||||
MEM_CGROUP_TARGET_SOFTLIMIT,
|
||||
MEM_CGROUP_TARGET_NUMAINFO,
|
||||
/* Number of targets; keep last */
MEM_CGROUP_NTARGETS,
|
||||
};
|
||||
|
||||
/*
|
||||
* Bits in struct cg_proto.flags
* (NOTE(review): presumably bit numbers rather than masks - confirm at the
* call sites.)
|
||||
*/
|
||||
enum cg_proto_flags {
|
||||
/* Currently active and new sockets should be assigned to cgroups */
|
||||
MEMCG_SOCK_ACTIVE,
|
||||
/* It was activated at some point; we must disarm static keys on destruction */
|
||||
MEMCG_SOCK_ACTIVATED,
|
||||
};
|
||||
|
||||
/*
 * Per-memcg protocol accounting state. struct mem_cgroup embeds one of these
 * as tcp_mem when both CONFIG_MEMCG_KMEM and CONFIG_INET are enabled.
 */
struct cg_proto {
|
||||
struct page_counter memory_allocated; /* Current allocated memory. */
|
||||
struct percpu_counter sockets_allocated; /* Current number of sockets. */
|
||||
int memory_pressure;
|
||||
long sysctl_mem[3];
|
||||
/* Bits from enum cg_proto_flags */
unsigned long flags;
|
||||
/*
|
||||
* The memcg field is used to find directly which memcg we belong to.
|
||||
* Each memcg struct can hold more than one cg_proto, so container_of
|
||||
* won't really cut it.
|
||||
*
|
||||
* The elegant solution would be having an inverse function to
|
||||
* proto_cgroup in struct proto, but that means polluting the structure
|
||||
* for everybody, instead of just for memcg users.
|
||||
*/
|
||||
struct mem_cgroup *memcg;
|
||||
};
|
||||
|
||||
#ifdef CONFIG_MEMCG
|
||||
/*
 * Statistics block that struct mem_cgroup reaches through its __percpu
 * pointer 'stat'.
 */
struct mem_cgroup_stat_cpu {
|
||||
/* Added to by mem_cgroup_update_page_stat() via this_cpu_add() */
long count[MEM_CGROUP_STAT_NSTATS];
|
||||
/* Added to by mem_cgroup_events() via this_cpu_add() */
unsigned long events[MEMCG_NR_EVENTS];
|
||||
unsigned long nr_page_events;
|
||||
/* Sized by MEM_CGROUP_NTARGETS (enum mem_cgroup_events_target) */
unsigned long targets[MEM_CGROUP_NTARGETS];
|
||||
};
|
||||
|
||||
/*
 * Iterator state; struct mem_cgroup_per_zone keeps an array of these,
 * iter[DEF_PRIORITY + 1].
 */
struct mem_cgroup_reclaim_iter {
|
||||
/* NOTE(review): presumably the memcg the iteration last stopped at - confirm */
struct mem_cgroup *position;
|
||||
/* scan generation, increased every round-trip */
|
||||
unsigned int generation;
|
||||
};
|
||||
|
||||
/*
|
||||
* per-zone information in memory controller.
*
* The inline helpers in this header recover this structure from a lruvec
* pointer with container_of(lruvec, struct mem_cgroup_per_zone, lruvec).
|
||||
*/
|
||||
struct mem_cgroup_per_zone {
|
||||
struct lruvec lruvec;
|
||||
/* Per-LRU-list sizes, returned by mem_cgroup_get_lru_size() */
unsigned long lru_size[NR_LRU_LISTS];
|
||||
|
||||
struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1];
|
||||
|
||||
struct rb_node tree_node; /* RB tree node */
|
||||
unsigned long usage_in_excess;/* Set to the value by which */
|
||||
/* the soft limit is exceeded*/
|
||||
bool on_tree;
|
||||
struct mem_cgroup *memcg; /* Back pointer, we cannot */
|
||||
/* use container_of */
|
||||
};
|
||||
|
||||
/*
 * Per-node container: one mem_cgroup_per_zone slot for each of MAX_NR_ZONES.
 * struct mem_cgroup keeps pointers to these in its trailing nodeinfo[] array.
 */
struct mem_cgroup_per_node {
|
||||
struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
|
||||
};
|
||||
|
||||
/* One threshold entry: an eventfd context paired with a threshold value. */
struct mem_cgroup_threshold {
	struct eventfd_ctx *eventfd;
	unsigned long threshold;
};

/*
 * For threshold
 *
 * Variable-length array of thresholds. The entries are stored inline after
 * the header, so an instance must be allocated with room for 'size' trailing
 * elements (sizeof(struct) + size * sizeof(entries[0])).
 */
struct mem_cgroup_threshold_ary {
	/* An array index points to threshold just below or equal to usage. */
	int current_threshold;
	/* Size of entries[] */
	unsigned int size;
	/* Array of thresholds; C99 flexible array member, must stay last */
	struct mem_cgroup_threshold entries[];
};
|
||||
|
||||
/*
 * A primary/spare pair of threshold arrays. struct mem_cgroup holds two of
 * these: 'thresholds' for memory usage and 'memsw_thresholds' for mem+swap
 * usage.
 */
struct mem_cgroup_thresholds {
|
||||
/* Primary thresholds array */
|
||||
struct mem_cgroup_threshold_ary *primary;
|
||||
/*
|
||||
* Spare threshold array.
|
||||
* This is needed to make mem_cgroup_unregister_event() "never fail".
|
||||
* It must be able to store at least primary->size - 1 entries.
|
||||
*/
|
||||
struct mem_cgroup_threshold_ary *spare;
|
||||
};
|
||||
|
||||
/*
|
||||
* The memory controller data structure. The memory controller controls both
|
||||
* page cache and RSS per cgroup. We would eventually like to provide
|
||||
* statistics based on the statistics developed by Rik Van Riel for clock-pro,
|
||||
* to help the administrator determine what knobs to tune.
|
||||
*/
|
||||
struct mem_cgroup {
|
||||
struct cgroup_subsys_state css;
|
||||
|
||||
/* Accounted resources */
|
||||
struct page_counter memory;
|
||||
struct page_counter memsw;
|
||||
struct page_counter kmem;
|
||||
|
||||
/* Normal memory consumption range */
|
||||
unsigned long low;
|
||||
unsigned long high;
|
||||
|
||||
unsigned long soft_limit;
|
||||
|
||||
/* vmpressure notifications */
|
||||
struct vmpressure vmpressure;
|
||||
|
||||
/* css_online() has been completed */
|
||||
int initialized;
|
||||
|
||||
/*
|
||||
* Should the accounting and control be hierarchical, per subtree?
|
||||
*/
|
||||
bool use_hierarchy;
|
||||
|
||||
/* protected by memcg_oom_lock */
|
||||
bool oom_lock;
|
||||
int under_oom;
|
||||
|
||||
int swappiness;
|
||||
/* OOM-Killer disable */
|
||||
int oom_kill_disable;
|
||||
|
||||
/* protect arrays of thresholds */
|
||||
struct mutex thresholds_lock;
|
||||
|
||||
/* thresholds for memory usage. RCU-protected */
|
||||
struct mem_cgroup_thresholds thresholds;
|
||||
|
||||
/* thresholds for mem+swap usage. RCU-protected */
|
||||
struct mem_cgroup_thresholds memsw_thresholds;
|
||||
|
||||
/* For oom notifier event fd */
|
||||
struct list_head oom_notify;
|
||||
|
||||
/*
|
||||
* Should we move charges of a task when a task is moved into this
|
||||
* mem_cgroup ? And what type of charges should we move ?
|
||||
*/
|
||||
unsigned long move_charge_at_immigrate;
|
||||
/*
|
||||
* set > 0 if pages under this cgroup are moving to other cgroup.
|
||||
*/
|
||||
atomic_t moving_account;
|
||||
/* taken only while moving_account > 0 */
|
||||
spinlock_t move_lock;
|
||||
struct task_struct *move_lock_task;
|
||||
unsigned long move_lock_flags;
|
||||
/*
|
||||
* percpu counter.
|
||||
*/
|
||||
struct mem_cgroup_stat_cpu __percpu *stat;
|
||||
spinlock_t pcp_counter_lock;
|
||||
|
||||
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
|
||||
struct cg_proto tcp_mem;
|
||||
#endif
|
||||
#if defined(CONFIG_MEMCG_KMEM)
|
||||
/* Index in the kmem_cache->memcg_params.memcg_caches array */
|
||||
int kmemcg_id;
|
||||
bool kmem_acct_activated;
|
||||
bool kmem_acct_active;
|
||||
#endif
|
||||
|
||||
int last_scanned_node;
|
||||
#if MAX_NUMNODES > 1
|
||||
nodemask_t scan_nodes;
|
||||
atomic_t numainfo_events;
|
||||
atomic_t numainfo_updating;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_CGROUP_WRITEBACK
|
||||
struct list_head cgwb_list;
|
||||
struct wb_domain cgwb_domain;
|
||||
#endif
|
||||
|
||||
/* List of events which userspace want to receive */
|
||||
struct list_head event_list;
|
||||
spinlock_t event_list_lock;
|
||||
|
||||
struct mem_cgroup_per_node *nodeinfo[0];
|
||||
/* WARNING: nodeinfo must be the last member here */
|
||||
};
|
||||
extern struct cgroup_subsys_state *mem_cgroup_root_css;
|
||||
|
||||
void mem_cgroup_events(struct mem_cgroup *memcg,
|
||||
/**
|
||||
* mem_cgroup_events - count memory events against a cgroup
|
||||
* @memcg: the memory cgroup
|
||||
* @idx: the event index
|
||||
* @nr: the number of events to account for
|
||||
*/
|
||||
static inline void mem_cgroup_events(struct mem_cgroup *memcg,
|
||||
enum mem_cgroup_events_index idx,
|
||||
unsigned int nr);
|
||||
unsigned int nr)
|
||||
{
|
||||
this_cpu_add(memcg->stat->events[idx], nr);
|
||||
}
|
||||
|
||||
bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
|
||||
|
||||
@ -90,15 +304,31 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
|
||||
struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
|
||||
struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
|
||||
|
||||
bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
|
||||
struct mem_cgroup *root);
|
||||
bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
|
||||
|
||||
extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
|
||||
extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
|
||||
struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
|
||||
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
|
||||
|
||||
extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
|
||||
extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css);
|
||||
struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
|
||||
static inline
|
||||
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
|
||||
return css ? container_of(css, struct mem_cgroup, css) : NULL;
|
||||
}
|
||||
|
||||
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
|
||||
struct mem_cgroup *,
|
||||
struct mem_cgroup_reclaim_cookie *);
|
||||
void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
|
||||
|
||||
static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
|
||||
struct mem_cgroup *root)
|
||||
{
|
||||
if (root == memcg)
|
||||
return true;
|
||||
if (!root->use_hierarchy)
|
||||
return false;
|
||||
return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
|
||||
}
|
||||
|
||||
static inline bool mm_match_cgroup(struct mm_struct *mm,
|
||||
struct mem_cgroup *memcg)
|
||||
@ -114,24 +344,67 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
|
||||
return match;
|
||||
}
|
||||
|
||||
extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
|
||||
extern struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
|
||||
struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
|
||||
|
||||
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
|
||||
struct mem_cgroup *,
|
||||
struct mem_cgroup_reclaim_cookie *);
|
||||
void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
|
||||
static inline bool mem_cgroup_disabled(void)
|
||||
{
|
||||
if (memory_cgrp_subsys.disabled)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* For memory reclaim.
|
||||
*/
|
||||
int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec);
|
||||
bool mem_cgroup_lruvec_online(struct lruvec *lruvec);
|
||||
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
|
||||
unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
|
||||
void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
|
||||
extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
|
||||
struct task_struct *p);
|
||||
|
||||
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
|
||||
int nr_pages);
|
||||
|
||||
static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
|
||||
{
|
||||
struct mem_cgroup_per_zone *mz;
|
||||
struct mem_cgroup *memcg;
|
||||
|
||||
if (mem_cgroup_disabled())
|
||||
return true;
|
||||
|
||||
mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
|
||||
memcg = mz->memcg;
|
||||
|
||||
return !!(memcg->css.flags & CSS_ONLINE);
|
||||
}
|
||||
|
||||
static inline
|
||||
unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
|
||||
{
|
||||
struct mem_cgroup_per_zone *mz;
|
||||
|
||||
mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
|
||||
return mz->lru_size[lru];
|
||||
}
|
||||
|
||||
/*
* Decide whether the inactive anon list of @lruvec is low compared with
* the active anon list.
*
* The combined size of LRU_INACTIVE_ANON and LRU_ACTIVE_ANON is shifted
* right by (30 - PAGE_SHIFT) to give a whole number of gigabytes. The
* ratio is int_sqrt(10 * gb), or 1 when the total is below one gigabyte.
*
* Returns nonzero when inactive * inactive_ratio < active, zero otherwise.
*/
static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
|
||||
{
|
||||
unsigned long inactive_ratio;
|
||||
unsigned long inactive;
|
||||
unsigned long active;
|
||||
unsigned long gb;
|
||||
|
||||
inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
|
||||
active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
|
||||
|
||||
/* pages -> gigabytes */
gb = (inactive + active) >> (30 - PAGE_SHIFT);
|
||||
if (gb)
|
||||
inactive_ratio = int_sqrt(10 * gb);
|
||||
else
|
||||
inactive_ratio = 1;
|
||||
|
||||
return inactive * inactive_ratio < active;
|
||||
}
|
||||
|
||||
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
|
||||
struct task_struct *p);
|
||||
|
||||
static inline void mem_cgroup_oom_enable(void)
|
||||
{
|
||||
@ -156,18 +429,26 @@ bool mem_cgroup_oom_synchronize(bool wait);
|
||||
extern int do_swap_account;
|
||||
#endif
|
||||
|
||||
static inline bool mem_cgroup_disabled(void)
|
||||
{
|
||||
if (memory_cgrp_subsys.disabled)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page);
|
||||
void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
|
||||
enum mem_cgroup_stat_index idx, int val);
|
||||
void mem_cgroup_end_page_stat(struct mem_cgroup *memcg);
|
||||
|
||||
/**
|
||||
* mem_cgroup_update_page_stat - update page state statistics
|
||||
* @memcg: memcg to account against
|
||||
* @idx: page state item to account
|
||||
* @val: number of pages (positive or negative)
|
||||
*
|
||||
* See mem_cgroup_begin_page_stat() for locking requirements.
|
||||
*/
|
||||
static inline void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
|
||||
enum mem_cgroup_stat_index idx, int val)
|
||||
{
|
||||
VM_BUG_ON(!rcu_read_lock_held());
|
||||
|
||||
if (memcg)
|
||||
this_cpu_add(memcg->stat->count[idx], val);
|
||||
}
|
||||
|
||||
static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg,
|
||||
enum mem_cgroup_stat_index idx)
|
||||
{
|
||||
@ -184,13 +465,31 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
|
||||
gfp_t gfp_mask,
|
||||
unsigned long *total_scanned);
|
||||
|
||||
void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
|
||||
static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
|
||||
enum vm_event_item idx)
|
||||
{
|
||||
struct mem_cgroup *memcg;
|
||||
|
||||
if (mem_cgroup_disabled())
|
||||
return;
|
||||
__mem_cgroup_count_vm_event(mm, idx);
|
||||
|
||||
rcu_read_lock();
|
||||
memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
|
||||
if (unlikely(!memcg))
|
||||
goto out;
|
||||
|
||||
switch (idx) {
|
||||
case PGFAULT:
|
||||
this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
|
||||
break;
|
||||
case PGMAJFAULT:
|
||||
this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
|
||||
break;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
out:
|
||||
rcu_read_unlock();
|
||||
}
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
void mem_cgroup_split_huge_fixup(struct page *head);
|
||||
@ -199,8 +498,6 @@ void mem_cgroup_split_huge_fixup(struct page *head);
|
||||
#else /* CONFIG_MEMCG */
|
||||
struct mem_cgroup;
|
||||
|
||||
#define mem_cgroup_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
|
||||
|
||||
static inline void mem_cgroup_events(struct mem_cgroup *memcg,
|
||||
enum mem_cgroup_events_index idx,
|
||||
unsigned int nr)
|
||||
@ -275,12 +572,6 @@ static inline bool task_in_mem_cgroup(struct task_struct *task,
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline struct cgroup_subsys_state
|
||||
*mem_cgroup_css(struct mem_cgroup *memcg)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline struct mem_cgroup *
|
||||
mem_cgroup_iter(struct mem_cgroup *root,
|
||||
struct mem_cgroup *prev,
|
||||
@ -428,8 +719,8 @@ static inline void sock_release_memcg(struct sock *sk)
|
||||
extern struct static_key memcg_kmem_enabled_key;
|
||||
|
||||
extern int memcg_nr_cache_ids;
|
||||
extern void memcg_get_cache_ids(void);
|
||||
extern void memcg_put_cache_ids(void);
|
||||
void memcg_get_cache_ids(void);
|
||||
void memcg_put_cache_ids(void);
|
||||
|
||||
/*
|
||||
* Helper macro to loop through all memcg-specific caches. Callers must still
|
||||
@ -444,7 +735,10 @@ static inline bool memcg_kmem_enabled(void)
|
||||
return static_key_false(&memcg_kmem_enabled_key);
|
||||
}
|
||||
|
||||
bool memcg_kmem_is_active(struct mem_cgroup *memcg);
|
||||
/*
* Return the kmem_acct_active field of @memcg.
*
* @memcg is dereferenced unconditionally, so callers must pass a
* non-NULL pointer.
*/
static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg)
|
||||
{
|
||||
return memcg->kmem_acct_active;
|
||||
}
|
||||
|
||||
/*
|
||||
* In general, we'll do everything in our power to not incur any overhead
|
||||
@ -463,7 +757,15 @@ void __memcg_kmem_commit_charge(struct page *page,
|
||||
struct mem_cgroup *memcg, int order);
|
||||
void __memcg_kmem_uncharge_pages(struct page *page, int order);
|
||||
|
||||
int memcg_cache_id(struct mem_cgroup *memcg);
|
||||
/*
|
||||
* Helper for accessing a memcg's index. It will be used as an index in the
|
||||
* child cache array in kmem_cache, and also to derive its name. This function
|
||||
* will return -1 when this is not a kmem-limited memcg.
|
||||
*
* A NULL @memcg also yields -1; otherwise memcg->kmemcg_id is returned
* unchanged.
*/
|
||||
static inline int memcg_cache_id(struct mem_cgroup *memcg)
|
||||
{
|
||||
return memcg ? memcg->kmemcg_id : -1;
|
||||
}
|
||||
|
||||
struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep);
|
||||
void __memcg_kmem_put_cache(struct kmem_cache *cachep);
|
||||
|
@ -249,6 +249,8 @@ struct vm_operations_struct {
|
||||
void (*close)(struct vm_area_struct * area);
|
||||
int (*mremap)(struct vm_area_struct * area);
|
||||
int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
|
||||
int (*pmd_fault)(struct vm_area_struct *, unsigned long address,
|
||||
pmd_t *, unsigned int flags);
|
||||
void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf);
|
||||
|
||||
/* notification that a previously read-only page is about to become
|
||||
@ -307,18 +309,6 @@ struct inode;
|
||||
#define page_private(page) ((page)->private)
|
||||
#define set_page_private(page, v) ((page)->private = (v))
|
||||
|
||||
/* It's valid only if the page is free path or free_list */
|
||||
static inline void set_freepage_migratetype(struct page *page, int migratetype)
|
||||
{
|
||||
page->index = migratetype;
|
||||
}
|
||||
|
||||
/* It's valid only if the page is free path or free_list */
|
||||
static inline int get_freepage_migratetype(struct page *page)
|
||||
{
|
||||
return page->index;
|
||||
}
|
||||
|
||||
/*
|
||||
* FIXME: take this include out, include page-flags.h in
|
||||
* files which need it (119 of them)
|
||||
@ -359,18 +349,6 @@ static inline int get_page_unless_zero(struct page *page)
|
||||
return atomic_inc_not_zero(&page->_count);
|
||||
}
|
||||
|
||||
/*
|
||||
* Try to drop a ref unless the page has a refcount of one, return false if
|
||||
* that is the case.
|
||||
* This is to make sure that the refcount won't become zero after this drop.
|
||||
* This can be called when MMU is off so it must not access
|
||||
* any of the virtual mappings.
|
||||
*/
|
||||
static inline int put_page_unless_one(struct page *page)
|
||||
{
|
||||
return atomic_add_unless(&page->_count, -1, 1);
|
||||
}
|
||||
|
||||
extern int page_is_ram(unsigned long pfn);
|
||||
|
||||
enum {
|
||||
@ -1267,6 +1245,11 @@ static inline int vma_growsdown(struct vm_area_struct *vma, unsigned long addr)
|
||||
return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN);
|
||||
}
|
||||
|
||||
/*
* A VMA is treated as anonymous when it has no vm_ops installed.
*
* @vma is dereferenced unconditionally, so it must not be NULL.
*/
static inline bool vma_is_anonymous(struct vm_area_struct *vma)
|
||||
{
|
||||
return !vma->vm_ops;
|
||||
}
|
||||
|
||||
static inline int stack_guard_page_start(struct vm_area_struct *vma,
|
||||
unsigned long addr)
|
||||
{
|
||||
@ -2193,6 +2176,7 @@ extern int memory_failure(unsigned long pfn, int trapno, int flags);
|
||||
extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
|
||||
extern int unpoison_memory(unsigned long pfn);
|
||||
extern int get_hwpoison_page(struct page *page);
|
||||
extern void put_hwpoison_page(struct page *page);
|
||||
extern int sysctl_memory_failure_early_kill;
|
||||
extern int sysctl_memory_failure_recovery;
|
||||
extern void shake_page(struct page *p, int access);
|
||||
|
@ -235,7 +235,7 @@ struct page_frag_cache {
|
||||
bool pfmemalloc;
|
||||
};
|
||||
|
||||
typedef unsigned long __nocast vm_flags_t;
|
||||
typedef unsigned long vm_flags_t;
|
||||
|
||||
/*
|
||||
* A region containing a mapping of a non-memory backed file under NOMMU
|
||||
|
@ -12,6 +12,27 @@ struct notifier_block;
|
||||
struct mem_cgroup;
|
||||
struct task_struct;
|
||||
|
||||
/*
|
||||
* Details of the page allocation that triggered the oom killer that are used to
|
||||
* determine what should be killed.
|
||||
*/
|
||||
struct oom_control {
|
||||
/* Used to determine cpuset */
|
||||
struct zonelist *zonelist;
|
||||
|
||||
/* Used to determine mempolicy */
|
||||
nodemask_t *nodemask;
|
||||
|
||||
/* Used to determine cpuset and node locality requirement */
|
||||
const gfp_t gfp_mask;
|
||||
|
||||
/*
|
||||
* order == -1 means the oom kill is required by sysrq, otherwise only
|
||||
* for display purposes.
|
||||
*/
|
||||
const int order;
|
||||
};
|
||||
|
||||
/*
|
||||
* Types of limitations to the nodes from which allocations may occur
|
||||
*/
|
||||
@ -57,21 +78,18 @@ extern unsigned long oom_badness(struct task_struct *p,
|
||||
|
||||
extern int oom_kills_count(void);
|
||||
extern void note_oom_kill(void);
|
||||
extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
|
||||
extern void oom_kill_process(struct oom_control *oc, struct task_struct *p,
|
||||
unsigned int points, unsigned long totalpages,
|
||||
struct mem_cgroup *memcg, nodemask_t *nodemask,
|
||||
const char *message);
|
||||
struct mem_cgroup *memcg, const char *message);
|
||||
|
||||
extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
|
||||
int order, const nodemask_t *nodemask,
|
||||
extern void check_panic_on_oom(struct oom_control *oc,
|
||||
enum oom_constraint constraint,
|
||||
struct mem_cgroup *memcg);
|
||||
|
||||
extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
|
||||
unsigned long totalpages, const nodemask_t *nodemask,
|
||||
bool force_kill);
|
||||
extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
|
||||
struct task_struct *task, unsigned long totalpages);
|
||||
|
||||
extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
|
||||
int order, nodemask_t *mask, bool force_kill);
|
||||
extern bool out_of_memory(struct oom_control *oc);
|
||||
|
||||
extern void exit_oom_victim(void);
|
||||
|
||||
|
@ -65,11 +65,6 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
|
||||
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
|
||||
bool skip_hwpoisoned_pages);
|
||||
|
||||
/*
|
||||
* Internal functions. Changes pageblock's migrate type.
|
||||
*/
|
||||
int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages);
|
||||
void unset_migratetype_isolate(struct page *page, unsigned migratetype);
|
||||
struct page *alloc_migrate_target(struct page *page, unsigned long private,
|
||||
int **resultp);
|
||||
|
||||
|
@ -1227,6 +1227,8 @@ int pci_set_vga_state(struct pci_dev *pdev, bool decode,
|
||||
dma_pool_create(name, &pdev->dev, size, align, allocation)
|
||||
#define pci_pool_destroy(pool) dma_pool_destroy(pool)
|
||||
#define pci_pool_alloc(pool, flags, handle) dma_pool_alloc(pool, flags, handle)
|
||||
#define pci_pool_zalloc(pool, flags, handle) \
|
||||
dma_pool_zalloc(pool, flags, handle)
|
||||
#define pci_pool_free(pool, vaddr, addr) dma_pool_free(pool, vaddr, addr)
|
||||
|
||||
struct msix_entry {
|
||||
|
@ -351,7 +351,15 @@ extern void check_move_unevictable_pages(struct page **, int nr_pages);
|
||||
extern int kswapd_run(int nid);
|
||||
extern void kswapd_stop(int nid);
|
||||
#ifdef CONFIG_MEMCG
|
||||
extern int mem_cgroup_swappiness(struct mem_cgroup *mem);
|
||||
/*
* Return the swappiness value to use for @memcg.
*
* Falls back to the global vm_swappiness when memory cgroups are disabled
* or when @memcg has no css.parent; otherwise returns memcg->swappiness.
*/
static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
|
||||
{
|
||||
/* root ? (a memcg without css.parent is handled as the root) */
|
||||
if (mem_cgroup_disabled() || !memcg->css.parent)
|
||||
return vm_swappiness;
|
||||
|
||||
return memcg->swappiness;
|
||||
}
|
||||
|
||||
#else
|
||||
static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
|
||||
{
|
||||
@ -398,6 +406,9 @@ extern void free_pages_and_swap_cache(struct page **, int);
|
||||
extern struct page *lookup_swap_cache(swp_entry_t);
|
||||
extern struct page *read_swap_cache_async(swp_entry_t, gfp_t,
|
||||
struct vm_area_struct *vma, unsigned long addr);
|
||||
extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t,
|
||||
struct vm_area_struct *vma, unsigned long addr,
|
||||
bool *new_page_allocated);
|
||||
extern struct page *swapin_readahead(swp_entry_t, gfp_t,
|
||||
struct vm_area_struct *vma, unsigned long addr);
|
||||
|
||||
@ -431,6 +442,7 @@ extern unsigned int count_swap_pages(int, int);
|
||||
extern sector_t map_swap_page(struct page *, struct block_device **);
|
||||
extern sector_t swapdev_block(int, pgoff_t);
|
||||
extern int page_swapcount(struct page *);
|
||||
extern int swp_swapcount(swp_entry_t entry);
|
||||
extern struct swap_info_struct *page_swap_info(struct page *);
|
||||
extern int reuse_swap_page(struct page *);
|
||||
extern int try_to_free_swap(struct page *);
|
||||
@ -522,6 +534,11 @@ static inline int page_swapcount(struct page *page)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int swp_swapcount(swp_entry_t entry)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define reuse_swap_page(page) (page_mapcount(page) == 1)
|
||||
|
||||
static inline int try_to_free_swap(struct page *page)
|
||||
|
@ -164,6 +164,9 @@ static inline int is_write_migration_entry(swp_entry_t entry)
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_MEMORY_FAILURE
|
||||
|
||||
extern atomic_long_t num_poisoned_pages __read_mostly;
|
||||
|
||||
/*
|
||||
* Support for hardware poisoned pages
|
||||
*/
|
||||
@ -177,6 +180,31 @@ static inline int is_hwpoison_entry(swp_entry_t entry)
|
||||
{
|
||||
return swp_type(entry) == SWP_HWPOISON;
|
||||
}
|
||||
|
||||
static inline bool test_set_page_hwpoison(struct page *page)
|
||||
{
|
||||
return TestSetPageHWPoison(page);
|
||||
}
|
||||
|
||||
static inline void num_poisoned_pages_inc(void)
|
||||
{
|
||||
atomic_long_inc(&num_poisoned_pages);
|
||||
}
|
||||
|
||||
static inline void num_poisoned_pages_dec(void)
|
||||
{
|
||||
atomic_long_dec(&num_poisoned_pages);
|
||||
}
|
||||
|
||||
static inline void num_poisoned_pages_add(long num)
|
||||
{
|
||||
atomic_long_add(num, &num_poisoned_pages);
|
||||
}
|
||||
|
||||
static inline void num_poisoned_pages_sub(long num)
|
||||
{
|
||||
atomic_long_sub(num, &num_poisoned_pages);
|
||||
}
|
||||
#else
|
||||
|
||||
static inline swp_entry_t make_hwpoison_entry(struct page *page)
|
||||
@ -188,6 +216,15 @@ static inline int is_hwpoison_entry(swp_entry_t swp)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline bool test_set_page_hwpoison(struct page *page)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline void num_poisoned_pages_inc(void)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION)
|
||||
|
@ -9,7 +9,7 @@ struct zbud_ops {
|
||||
int (*evict)(struct zbud_pool *pool, unsigned long handle);
|
||||
};
|
||||
|
||||
struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops);
|
||||
struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops);
|
||||
void zbud_destroy_pool(struct zbud_pool *pool);
|
||||
int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
|
||||
unsigned long *handle);
|
||||
|
@ -37,7 +37,7 @@ enum zpool_mapmode {
|
||||
};
|
||||
|
||||
struct zpool *zpool_create_pool(char *type, char *name,
|
||||
gfp_t gfp, struct zpool_ops *ops);
|
||||
gfp_t gfp, const struct zpool_ops *ops);
|
||||
|
||||
char *zpool_get_type(struct zpool *pool);
|
||||
|
||||
@ -81,7 +81,7 @@ struct zpool_driver {
|
||||
atomic_t refcount;
|
||||
struct list_head list;
|
||||
|
||||
void *(*create)(char *name, gfp_t gfp, struct zpool_ops *ops,
|
||||
void *(*create)(char *name, gfp_t gfp, const struct zpool_ops *ops,
|
||||
struct zpool *zpool);
|
||||
void (*destroy)(void *pool);
|
||||
|
||||
|
@ -34,6 +34,11 @@ enum zs_mapmode {
|
||||
*/
|
||||
};
|
||||
|
||||
struct zs_pool_stats {
|
||||
/* How many pages were migrated (freed) */
|
||||
unsigned long pages_compacted;
|
||||
};
|
||||
|
||||
struct zs_pool;
|
||||
|
||||
struct zs_pool *zs_create_pool(char *name, gfp_t flags);
|
||||
@ -49,4 +54,5 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle);
|
||||
unsigned long zs_get_total_pages(struct zs_pool *pool);
|
||||
unsigned long zs_compact(struct zs_pool *pool);
|
||||
|
||||
void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats);
|
||||
#endif
|
||||
|
@ -1042,42 +1042,9 @@ struct proto {
|
||||
#endif
|
||||
};
|
||||
|
||||
/*
|
||||
* Bits in struct cg_proto.flags
|
||||
*/
|
||||
enum cg_proto_flags {
|
||||
/* Currently active and new sockets should be assigned to cgroups */
|
||||
MEMCG_SOCK_ACTIVE,
|
||||
/* It was ever activated; we must disarm static keys on destruction */
|
||||
MEMCG_SOCK_ACTIVATED,
|
||||
};
|
||||
|
||||
struct cg_proto {
|
||||
struct page_counter memory_allocated; /* Current allocated memory. */
|
||||
struct percpu_counter sockets_allocated; /* Current number of sockets. */
|
||||
int memory_pressure;
|
||||
long sysctl_mem[3];
|
||||
unsigned long flags;
|
||||
/*
|
||||
* memcg field is used to find which memcg we belong directly
|
||||
* Each memcg struct can hold more than one cg_proto, so container_of
|
||||
* won't really cut.
|
||||
*
|
||||
* The elegant solution would be having an inverse function to
|
||||
* proto_cgroup in struct proto, but that means polluting the structure
|
||||
* for everybody, instead of just for memcg users.
|
||||
*/
|
||||
struct mem_cgroup *memcg;
|
||||
};
|
||||
|
||||
int proto_register(struct proto *prot, int alloc_slab);
|
||||
void proto_unregister(struct proto *prot);
|
||||
|
||||
static inline bool memcg_proto_active(struct cg_proto *cg_proto)
|
||||
{
|
||||
return test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
|
||||
}
|
||||
|
||||
#ifdef SOCK_REFCNT_DEBUG
|
||||
static inline void sk_refcnt_debug_inc(struct sock *sk)
|
||||
{
|
||||
|
@ -1342,7 +1342,7 @@ static int cgroup_show_options(struct seq_file *seq,
|
||||
if (root != &cgrp_dfl_root)
|
||||
for_each_subsys(ss, ssid)
|
||||
if (root->subsys_mask & (1 << ssid))
|
||||
seq_show_option(seq, ss->name, NULL);
|
||||
seq_show_option(seq, ss->legacy_name, NULL);
|
||||
if (root->flags & CGRP_ROOT_NOPREFIX)
|
||||
seq_puts(seq, ",noprefix");
|
||||
if (root->flags & CGRP_ROOT_XATTR)
|
||||
|
@ -339,7 +339,7 @@ static int profile_cpu_callback(struct notifier_block *info,
|
||||
node = cpu_to_mem(cpu);
|
||||
per_cpu(cpu_profile_flip, cpu) = 0;
|
||||
if (!per_cpu(cpu_profile_hits, cpu)[1]) {
|
||||
page = alloc_pages_exact_node(node,
|
||||
page = __alloc_pages_node(node,
|
||||
GFP_KERNEL | __GFP_ZERO,
|
||||
0);
|
||||
if (!page)
|
||||
@ -347,7 +347,7 @@ static int profile_cpu_callback(struct notifier_block *info,
|
||||
per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
|
||||
}
|
||||
if (!per_cpu(cpu_profile_hits, cpu)[0]) {
|
||||
page = alloc_pages_exact_node(node,
|
||||
page = __alloc_pages_node(node,
|
||||
GFP_KERNEL | __GFP_ZERO,
|
||||
0);
|
||||
if (!page)
|
||||
@ -543,14 +543,14 @@ static int create_hash_tables(void)
|
||||
int node = cpu_to_mem(cpu);
|
||||
struct page *page;
|
||||
|
||||
page = alloc_pages_exact_node(node,
|
||||
page = __alloc_pages_node(node,
|
||||
GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
|
||||
0);
|
||||
if (!page)
|
||||
goto out_cleanup;
|
||||
per_cpu(cpu_profile_hits, cpu)[1]
|
||||
= (struct profile_hit *)page_address(page);
|
||||
page = alloc_pages_exact_node(node,
|
||||
page = __alloc_pages_node(node,
|
||||
GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
|
||||
0);
|
||||
if (!page)
|
||||
|
@ -38,11 +38,9 @@ void show_mem(unsigned int filter)
|
||||
|
||||
printk("%lu pages RAM\n", total);
|
||||
printk("%lu pages HighMem/MovableOnly\n", highmem);
|
||||
#ifdef CONFIG_CMA
|
||||
printk("%lu pages reserved\n", (reserved - totalcma_pages));
|
||||
printk("%lu pages cma reserved\n", totalcma_pages);
|
||||
#else
|
||||
printk("%lu pages reserved\n", reserved);
|
||||
#ifdef CONFIG_CMA
|
||||
printk("%lu pages cma reserved\n", totalcma_pages);
|
||||
#endif
|
||||
#ifdef CONFIG_QUICKLIST
|
||||
printk("%lu pages in pagetable cache\n",
|
||||
|
@ -236,6 +236,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
|
||||
count += pages;
|
||||
while (pages--)
|
||||
__free_pages_bootmem(page++, cur++, 0);
|
||||
bdata->node_bootmem_map = NULL;
|
||||
|
||||
bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
|
||||
|
||||
@ -294,6 +295,9 @@ static void __init __free(bootmem_data_t *bdata,
|
||||
sidx + bdata->node_min_pfn,
|
||||
eidx + bdata->node_min_pfn);
|
||||
|
||||
if (WARN_ON(bdata->node_bootmem_map == NULL))
|
||||
return;
|
||||
|
||||
if (bdata->hint_idx > sidx)
|
||||
bdata->hint_idx = sidx;
|
||||
|
||||
@ -314,6 +318,9 @@ static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx,
|
||||
eidx + bdata->node_min_pfn,
|
||||
flags);
|
||||
|
||||
if (WARN_ON(bdata->node_bootmem_map == NULL))
|
||||
return 0;
|
||||
|
||||
for (idx = sidx; idx < eidx; idx++)
|
||||
if (test_and_set_bit(idx, bdata->node_bootmem_map)) {
|
||||
if (exclusive) {
|
||||
|
175
mm/compaction.c
175
mm/compaction.c
@ -207,6 +207,13 @@ static inline bool isolation_suitable(struct compact_control *cc,
|
||||
return !get_pageblock_skip(page);
|
||||
}
|
||||
|
||||
/*
* Reset the cached compaction scanner positions of @zone: both
* compact_cached_migrate_pfn[] entries are set to zone->zone_start_pfn and
* compact_cached_free_pfn is set to zone_end_pfn(zone).
*/
static void reset_cached_positions(struct zone *zone)
|
||||
{
|
||||
zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
|
||||
zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
|
||||
zone->compact_cached_free_pfn = zone_end_pfn(zone);
|
||||
}
|
||||
|
||||
/*
|
||||
* This function is called to clear all cached information on pageblocks that
|
||||
* should be skipped for page isolation when the migrate and free page scanner
|
||||
@ -218,9 +225,6 @@ static void __reset_isolation_suitable(struct zone *zone)
|
||||
unsigned long end_pfn = zone_end_pfn(zone);
|
||||
unsigned long pfn;
|
||||
|
||||
zone->compact_cached_migrate_pfn[0] = start_pfn;
|
||||
zone->compact_cached_migrate_pfn[1] = start_pfn;
|
||||
zone->compact_cached_free_pfn = end_pfn;
|
||||
zone->compact_blockskip_flush = false;
|
||||
|
||||
/* Walk the zone and mark every pageblock as suitable for isolation */
|
||||
@ -238,6 +242,8 @@ static void __reset_isolation_suitable(struct zone *zone)
|
||||
|
||||
clear_pageblock_skip(page);
|
||||
}
|
||||
|
||||
reset_cached_positions(zone);
|
||||
}
|
||||
|
||||
void reset_isolation_suitable(pg_data_t *pgdat)
|
||||
@ -431,6 +437,24 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
|
||||
|
||||
if (!valid_page)
|
||||
valid_page = page;
|
||||
|
||||
/*
|
||||
* For compound pages such as THP and hugetlbfs, we can save
|
||||
* potentially a lot of iterations if we skip them at once.
|
||||
* The check is racy, but we can consider only valid values
|
||||
* and the only danger is skipping too much.
|
||||
*/
|
||||
if (PageCompound(page)) {
|
||||
unsigned int comp_order = compound_order(page);
|
||||
|
||||
if (likely(comp_order < MAX_ORDER)) {
|
||||
blockpfn += (1UL << comp_order) - 1;
|
||||
cursor += (1UL << comp_order) - 1;
|
||||
}
|
||||
|
||||
goto isolate_fail;
|
||||
}
|
||||
|
||||
if (!PageBuddy(page))
|
||||
goto isolate_fail;
|
||||
|
||||
@ -490,6 +514,13 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* There is a tiny chance that we have read bogus compound_order(),
|
||||
* so be careful to not go outside of the pageblock.
|
||||
*/
|
||||
if (unlikely(blockpfn > end_pfn))
|
||||
blockpfn = end_pfn;
|
||||
|
||||
trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
|
||||
nr_scanned, total_isolated);
|
||||
|
||||
@ -674,6 +705,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
|
||||
|
||||
/* Time to isolate some pages for migration */
|
||||
for (; low_pfn < end_pfn; low_pfn++) {
|
||||
bool is_lru;
|
||||
|
||||
/*
|
||||
* Periodically drop the lock (if held) regardless of its
|
||||
* contention, to give chance to IRQs. Abort async compaction
|
||||
@ -717,36 +750,35 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
|
||||
* It's possible to migrate LRU pages and balloon pages
|
||||
* Skip any other type of page
|
||||
*/
|
||||
if (!PageLRU(page)) {
|
||||
is_lru = PageLRU(page);
|
||||
if (!is_lru) {
|
||||
if (unlikely(balloon_page_movable(page))) {
|
||||
if (balloon_page_isolate(page)) {
|
||||
/* Successfully isolated */
|
||||
goto isolate_success;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* PageLRU is set. lru_lock normally excludes isolation
|
||||
* splitting and collapsing (collapsing has already happened
|
||||
* if PageLRU is set) but the lock is not necessarily taken
|
||||
* here and it is wasteful to take it just to check transhuge.
|
||||
* Check TransHuge without lock and skip the whole pageblock if
|
||||
* it's either a transhuge or hugetlbfs page, as calling
|
||||
* compound_order() without preventing THP from splitting the
|
||||
* page underneath us may return surprising results.
|
||||
* Regardless of being on LRU, compound pages such as THP and
|
||||
* hugetlbfs are not to be compacted. We can potentially save
|
||||
* a lot of iterations if we skip them at once. The check is
|
||||
* racy, but we can consider only valid values and the only
|
||||
* danger is skipping too much.
|
||||
*/
|
||||
if (PageTransHuge(page)) {
|
||||
if (!locked)
|
||||
low_pfn = ALIGN(low_pfn + 1,
|
||||
pageblock_nr_pages) - 1;
|
||||
else
|
||||
low_pfn += (1 << compound_order(page)) - 1;
|
||||
if (PageCompound(page)) {
|
||||
unsigned int comp_order = compound_order(page);
|
||||
|
||||
if (likely(comp_order < MAX_ORDER))
|
||||
low_pfn += (1UL << comp_order) - 1;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!is_lru)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Migration will fail if an anonymous page is pinned in memory,
|
||||
* so avoid taking lru_lock and isolating it unnecessarily in an
|
||||
@ -763,11 +795,17 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
|
||||
if (!locked)
|
||||
break;
|
||||
|
||||
/* Recheck PageLRU and PageTransHuge under lock */
|
||||
/* Recheck PageLRU and PageCompound under lock */
|
||||
if (!PageLRU(page))
|
||||
continue;
|
||||
if (PageTransHuge(page)) {
|
||||
low_pfn += (1 << compound_order(page)) - 1;
|
||||
|
||||
/*
|
||||
* Page became compound since the non-locked check,
|
||||
* and it's on LRU. It can only be a THP so the order
|
||||
* is safe to read and it's 0 for tail pages.
|
||||
*/
|
||||
if (unlikely(PageCompound(page))) {
|
||||
low_pfn += (1UL << compound_order(page)) - 1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@ -778,7 +816,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
|
||||
if (__isolate_lru_page(page, isolate_mode) != 0)
|
||||
continue;
|
||||
|
||||
VM_BUG_ON_PAGE(PageTransCompound(page), page);
|
||||
VM_BUG_ON_PAGE(PageCompound(page), page);
|
||||
|
||||
/* Successfully isolated */
|
||||
del_page_from_lru_list(page, lruvec, page_lru(page));
|
||||
@ -897,6 +935,16 @@ static bool suitable_migration_target(struct page *page)
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Test whether the free scanner has reached the same or lower pageblock than
|
||||
* the migration scanner, and compaction should thus terminate.
|
||||
*
* Both positions are compared after shifting right by pageblock_order, so
* two pfns inside the same pageblock already count as "met".
*/
|
||||
static inline bool compact_scanners_met(struct compact_control *cc)
|
||||
{
|
||||
return (cc->free_pfn >> pageblock_order)
|
||||
<= (cc->migrate_pfn >> pageblock_order);
|
||||
}
|
||||
|
||||
/*
|
||||
* Based on information in the current compact_control, find blocks
|
||||
* suitable for isolating free pages from and then isolate them.
|
||||
@ -933,8 +981,7 @@ static void isolate_freepages(struct compact_control *cc)
|
||||
* pages on cc->migratepages. We stop searching if the migrate
|
||||
* and free page scanners meet or enough free pages are isolated.
|
||||
*/
|
||||
for (; block_start_pfn >= low_pfn &&
|
||||
cc->nr_migratepages > cc->nr_freepages;
|
||||
for (; block_start_pfn >= low_pfn;
|
||||
block_end_pfn = block_start_pfn,
|
||||
block_start_pfn -= pageblock_nr_pages,
|
||||
isolate_start_pfn = block_start_pfn) {
|
||||
@ -966,6 +1013,8 @@ static void isolate_freepages(struct compact_control *cc)
|
||||
block_end_pfn, freelist, false);
|
||||
|
||||
/*
|
||||
* If we isolated enough freepages, or aborted due to async
|
||||
* compaction being contended, terminate the loop.
|
||||
* Remember where the free scanner should restart next time,
|
||||
* which is where isolate_freepages_block() left off.
|
||||
* But if it scanned the whole pageblock, isolate_start_pfn
|
||||
@ -974,27 +1023,31 @@ static void isolate_freepages(struct compact_control *cc)
|
||||
* In that case we will however want to restart at the start
|
||||
* of the previous pageblock.
|
||||
*/
|
||||
cc->free_pfn = (isolate_start_pfn < block_end_pfn) ?
|
||||
isolate_start_pfn :
|
||||
block_start_pfn - pageblock_nr_pages;
|
||||
|
||||
/*
|
||||
* isolate_freepages_block() might have aborted due to async
|
||||
* compaction being contended
|
||||
*/
|
||||
if (cc->contended)
|
||||
if ((cc->nr_freepages >= cc->nr_migratepages)
|
||||
|| cc->contended) {
|
||||
if (isolate_start_pfn >= block_end_pfn)
|
||||
isolate_start_pfn =
|
||||
block_start_pfn - pageblock_nr_pages;
|
||||
break;
|
||||
} else {
|
||||
/*
|
||||
* isolate_freepages_block() should not terminate
|
||||
* prematurely unless contended, or isolated enough
|
||||
*/
|
||||
VM_BUG_ON(isolate_start_pfn < block_end_pfn);
|
||||
}
|
||||
}
|
||||
|
||||
/* split_free_page does not map the pages */
|
||||
map_pages(freelist);
|
||||
|
||||
/*
|
||||
* If we crossed the migrate scanner, we want to keep it that way
|
||||
* so that compact_finished() may detect this
|
||||
* Record where the free scanner will restart next time. Either we
|
||||
* broke from the loop and set isolate_start_pfn based on the last
|
||||
* call to isolate_freepages_block(), or we met the migration scanner
|
||||
* and the loop terminated due to isolate_start_pfn < low_pfn
|
||||
*/
|
||||
if (block_start_pfn < low_pfn)
|
||||
cc->free_pfn = cc->migrate_pfn;
|
||||
cc->free_pfn = isolate_start_pfn;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1062,6 +1115,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
|
||||
struct compact_control *cc)
|
||||
{
|
||||
unsigned long low_pfn, end_pfn;
|
||||
unsigned long isolate_start_pfn;
|
||||
struct page *page;
|
||||
const isolate_mode_t isolate_mode =
|
||||
(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
|
||||
@ -1110,6 +1164,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
|
||||
continue;
|
||||
|
||||
/* Perform the isolation */
|
||||
isolate_start_pfn = low_pfn;
|
||||
low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
|
||||
isolate_mode);
|
||||
|
||||
@ -1118,6 +1173,15 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
|
||||
return ISOLATE_ABORT;
|
||||
}
|
||||
|
||||
/*
|
||||
* Record where we could have freed pages by migration and not
|
||||
* yet flushed them to buddy allocator.
|
||||
* - this is the lowest page that could have been isolated and
|
||||
* then freed by migration.
|
||||
*/
|
||||
if (cc->nr_migratepages && !cc->last_migrated_pfn)
|
||||
cc->last_migrated_pfn = isolate_start_pfn;
|
||||
|
||||
/*
|
||||
* Either we isolated something and proceed with migration. Or
|
||||
* we failed and compact_zone should decide if we should
|
||||
@ -1127,12 +1191,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
|
||||
}
|
||||
|
||||
acct_isolated(zone, cc);
|
||||
/*
|
||||
* Record where migration scanner will be restarted. If we end up in
|
||||
* the same pageblock as the free scanner, make the scanners fully
|
||||
* meet so that compact_finished() terminates compaction.
|
||||
*/
|
||||
cc->migrate_pfn = (end_pfn <= cc->free_pfn) ? low_pfn : cc->free_pfn;
|
||||
/* Record where migration scanner will be restarted. */
|
||||
cc->migrate_pfn = low_pfn;
|
||||
|
||||
return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
|
||||
}
|
||||
@ -1147,11 +1207,9 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
|
||||
return COMPACT_PARTIAL;
|
||||
|
||||
/* Compaction run completes if the migrate and free scanner meet */
|
||||
if (cc->free_pfn <= cc->migrate_pfn) {
|
||||
if (compact_scanners_met(cc)) {
|
||||
/* Let the next compaction start anew. */
|
||||
zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
|
||||
zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
|
||||
zone->compact_cached_free_pfn = zone_end_pfn(zone);
|
||||
reset_cached_positions(zone);
|
||||
|
||||
/*
|
||||
* Mark that the PG_migrate_skip information should be cleared
|
||||
@ -1295,7 +1353,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
|
||||
unsigned long end_pfn = zone_end_pfn(zone);
|
||||
const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
|
||||
const bool sync = cc->mode != MIGRATE_ASYNC;
|
||||
unsigned long last_migrated_pfn = 0;
|
||||
|
||||
ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
|
||||
cc->classzone_idx);
|
||||
@ -1333,6 +1390,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
|
||||
zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
|
||||
zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
|
||||
}
|
||||
cc->last_migrated_pfn = 0;
|
||||
|
||||
trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
|
||||
cc->free_pfn, end_pfn, sync);
|
||||
@ -1342,7 +1400,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
|
||||
while ((ret = compact_finished(zone, cc, migratetype)) ==
|
||||
COMPACT_CONTINUE) {
|
||||
int err;
|
||||
unsigned long isolate_start_pfn = cc->migrate_pfn;
|
||||
|
||||
switch (isolate_migratepages(zone, cc)) {
|
||||
case ISOLATE_ABORT:
|
||||
@ -1376,22 +1433,12 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
|
||||
* migrate_pages() may return -ENOMEM when scanners meet
|
||||
* and we want compact_finished() to detect it
|
||||
*/
|
||||
if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
|
||||
if (err == -ENOMEM && !compact_scanners_met(cc)) {
|
||||
ret = COMPACT_PARTIAL;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Record where we could have freed pages by migration and not
|
||||
* yet flushed them to buddy allocator. We use the pfn that
|
||||
* isolate_migratepages() started from in this loop iteration
|
||||
* - this is the lowest page that could have been isolated and
|
||||
* then freed by migration.
|
||||
*/
|
||||
if (!last_migrated_pfn)
|
||||
last_migrated_pfn = isolate_start_pfn;
|
||||
|
||||
check_drain:
|
||||
/*
|
||||
* Has the migration scanner moved away from the previous
|
||||
@ -1400,18 +1447,18 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
|
||||
* compact_finished() can detect immediately if allocation
|
||||
* would succeed.
|
||||
*/
|
||||
if (cc->order > 0 && last_migrated_pfn) {
|
||||
if (cc->order > 0 && cc->last_migrated_pfn) {
|
||||
int cpu;
|
||||
unsigned long current_block_start =
|
||||
cc->migrate_pfn & ~((1UL << cc->order) - 1);
|
||||
|
||||
if (last_migrated_pfn < current_block_start) {
|
||||
if (cc->last_migrated_pfn < current_block_start) {
|
||||
cpu = get_cpu();
|
||||
lru_add_drain_cpu(cpu);
|
||||
drain_local_pages(zone);
|
||||
put_cpu();
|
||||
/* No more flushing until we migrate again */
|
||||
last_migrated_pfn = 0;
|
||||
cc->last_migrated_pfn = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
12
mm/dmapool.c
12
mm/dmapool.c
@ -271,6 +271,9 @@ void dma_pool_destroy(struct dma_pool *pool)
|
||||
{
|
||||
bool empty = false;
|
||||
|
||||
if (unlikely(!pool))
|
||||
return;
|
||||
|
||||
mutex_lock(&pools_reg_lock);
|
||||
mutex_lock(&pools_lock);
|
||||
list_del(&pool->pools);
|
||||
@ -334,7 +337,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
|
||||
/* pool_alloc_page() might sleep, so temporarily drop &pool->lock */
|
||||
spin_unlock_irqrestore(&pool->lock, flags);
|
||||
|
||||
page = pool_alloc_page(pool, mem_flags);
|
||||
page = pool_alloc_page(pool, mem_flags & (~__GFP_ZERO));
|
||||
if (!page)
|
||||
return NULL;
|
||||
|
||||
@ -372,9 +375,14 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
|
||||
break;
|
||||
}
|
||||
}
|
||||
memset(retval, POOL_POISON_ALLOCATED, pool->size);
|
||||
if (!(mem_flags & __GFP_ZERO))
|
||||
memset(retval, POOL_POISON_ALLOCATED, pool->size);
|
||||
#endif
|
||||
spin_unlock_irqrestore(&pool->lock, flags);
|
||||
|
||||
if (mem_flags & __GFP_ZERO)
|
||||
memset(retval, 0, pool->size);
|
||||
|
||||
return retval;
|
||||
}
|
||||
EXPORT_SYMBOL(dma_pool_alloc);
|
||||
|
@ -224,6 +224,28 @@ early_memremap_ro(resource_size_t phys_addr, unsigned long size)
|
||||
return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO);
|
||||
}
|
||||
#endif
|
||||
|
||||
#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
|
||||
|
||||
/*
 * Copy @size bytes from physical address @src into the buffer at @dest.
 *
 * The source is reached through a temporary early_memremap() window that is
 * set up and torn down once per iteration.  A window always starts on a page
 * boundary and spans at most MAX_MAP_CHUNK bytes, counting the offset of
 * @src inside its first page.
 */
void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size)
{
	while (size) {
		/* Offset of @src within its page; the mapping itself is page aligned. */
		unsigned long page_off = src & ~PAGE_MASK;
		/* Largest payload that still fits in one window after the offset. */
		unsigned long chunk = MAX_MAP_CHUNK - page_off;
		char *window;

		if (chunk > size)
			chunk = size;

		window = early_memremap(src & PAGE_MASK, chunk + page_off);
		memcpy(dest, window + page_off, chunk);
		early_memunmap(window, chunk + page_off);

		dest += chunk;
		src += chunk;
		size -= chunk;
	}
}
|
||||
|
||||
#else /* CONFIG_MMU */
|
||||
|
||||
void __init __iomem *
|
||||
|
36
mm/filemap.c
36
mm/filemap.c
@ -674,7 +674,7 @@ struct page *__page_cache_alloc(gfp_t gfp)
|
||||
do {
|
||||
cpuset_mems_cookie = read_mems_allowed_begin();
|
||||
n = cpuset_mem_spread_node();
|
||||
page = alloc_pages_exact_node(n, gfp, 0);
|
||||
page = __alloc_pages_node(n, gfp, 0);
|
||||
} while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
|
||||
|
||||
return page;
|
||||
@ -2473,21 +2473,6 @@ ssize_t generic_perform_write(struct file *file,
|
||||
iov_iter_count(i));
|
||||
|
||||
again:
|
||||
/*
|
||||
* Bring in the user page that we will copy from _first_.
|
||||
* Otherwise there's a nasty deadlock on copying from the
|
||||
* same page as we're writing to, without it being marked
|
||||
* up-to-date.
|
||||
*
|
||||
* Not only is this an optimisation, but it is also required
|
||||
* to check that the address is actually valid, when atomic
|
||||
* usercopies are used, below.
|
||||
*/
|
||||
if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
|
||||
status = -EFAULT;
|
||||
break;
|
||||
}
|
||||
|
||||
status = a_ops->write_begin(file, mapping, pos, bytes, flags,
|
||||
&page, &fsdata);
|
||||
if (unlikely(status < 0))
|
||||
@ -2495,8 +2480,17 @@ ssize_t generic_perform_write(struct file *file,
|
||||
|
||||
if (mapping_writably_mapped(mapping))
|
||||
flush_dcache_page(page);
|
||||
|
||||
/*
|
||||
* 'page' is now locked. If we are trying to copy from a
|
||||
* mapping of 'page' in userspace, the copy might fault and
|
||||
* would need PageUptodate() to complete. But, page can not be
|
||||
* made Uptodate without acquiring the page lock, which we hold.
|
||||
* Deadlock. Avoid with pagefault_disable(). Fix up below with
|
||||
* iov_iter_fault_in_readable().
|
||||
*/
|
||||
pagefault_disable();
|
||||
copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
|
||||
pagefault_enable();
|
||||
flush_dcache_page(page);
|
||||
|
||||
status = a_ops->write_end(file, mapping, pos, bytes, copied,
|
||||
@ -2519,6 +2513,14 @@ ssize_t generic_perform_write(struct file *file,
|
||||
*/
|
||||
bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
|
||||
iov_iter_single_seg_count(i));
|
||||
/*
|
||||
* This is the fallback to recover if the copy from
|
||||
* userspace above faults.
|
||||
*/
|
||||
if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
|
||||
status = -EFAULT;
|
||||
break;
|
||||
}
|
||||
goto again;
|
||||
}
|
||||
pos += copied;
|
||||
|
163
mm/huge_memory.c
163
mm/huge_memory.c
@ -16,6 +16,7 @@
|
||||
#include <linux/swap.h>
|
||||
#include <linux/shrinker.h>
|
||||
#include <linux/mm_inline.h>
|
||||
#include <linux/dax.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/khugepaged.h>
|
||||
#include <linux/freezer.h>
|
||||
@ -105,7 +106,7 @@ static struct khugepaged_scan khugepaged_scan = {
|
||||
};
|
||||
|
||||
|
||||
static int set_recommended_min_free_kbytes(void)
|
||||
static void set_recommended_min_free_kbytes(void)
|
||||
{
|
||||
struct zone *zone;
|
||||
int nr_zones = 0;
|
||||
@ -140,7 +141,6 @@ static int set_recommended_min_free_kbytes(void)
|
||||
min_free_kbytes = recommended_min;
|
||||
}
|
||||
setup_per_zone_wmarks();
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int start_stop_khugepaged(void)
|
||||
@ -172,12 +172,7 @@ static int start_stop_khugepaged(void)
|
||||
static atomic_t huge_zero_refcount;
|
||||
struct page *huge_zero_page __read_mostly;
|
||||
|
||||
static inline bool is_huge_zero_pmd(pmd_t pmd)
|
||||
{
|
||||
return is_huge_zero_page(pmd_page(pmd));
|
||||
}
|
||||
|
||||
static struct page *get_huge_zero_page(void)
|
||||
struct page *get_huge_zero_page(void)
|
||||
{
|
||||
struct page *zero_page;
|
||||
retry:
|
||||
@ -794,16 +789,19 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
|
||||
}
|
||||
|
||||
/* Caller must hold page table lock. */
|
||||
static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
|
||||
static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
|
||||
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
|
||||
struct page *zero_page)
|
||||
{
|
||||
pmd_t entry;
|
||||
if (!pmd_none(*pmd))
|
||||
return false;
|
||||
entry = mk_pmd(zero_page, vma->vm_page_prot);
|
||||
entry = pmd_mkhuge(entry);
|
||||
pgtable_trans_huge_deposit(mm, pmd, pgtable);
|
||||
set_pmd_at(mm, haddr, pmd, entry);
|
||||
atomic_long_inc(&mm->nr_ptes);
|
||||
return true;
|
||||
}
|
||||
|
||||
int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
@ -870,6 +868,49 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
flags);
|
||||
}
|
||||
|
||||
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
|
||||
pmd_t *pmd, unsigned long pfn, pgprot_t prot, bool write)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
pmd_t entry;
|
||||
spinlock_t *ptl;
|
||||
|
||||
ptl = pmd_lock(mm, pmd);
|
||||
if (pmd_none(*pmd)) {
|
||||
entry = pmd_mkhuge(pfn_pmd(pfn, prot));
|
||||
if (write) {
|
||||
entry = pmd_mkyoung(pmd_mkdirty(entry));
|
||||
entry = maybe_pmd_mkwrite(entry, vma);
|
||||
}
|
||||
set_pmd_at(mm, addr, pmd, entry);
|
||||
update_mmu_cache_pmd(vma, addr, pmd);
|
||||
}
|
||||
spin_unlock(ptl);
|
||||
}
|
||||
|
||||
int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
|
||||
pmd_t *pmd, unsigned long pfn, bool write)
|
||||
{
|
||||
pgprot_t pgprot = vma->vm_page_prot;
|
||||
/*
|
||||
* If we had pmd_special, we could avoid all these restrictions,
|
||||
* but we need to be consistent with PTEs and architectures that
|
||||
* can't support a 'special' bit.
|
||||
*/
|
||||
BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
|
||||
BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
|
||||
(VM_PFNMAP|VM_MIXEDMAP));
|
||||
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
|
||||
BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
|
||||
|
||||
if (addr < vma->vm_start || addr >= vma->vm_end)
|
||||
return VM_FAULT_SIGBUS;
|
||||
if (track_pfn_insert(vma, &pgprot, pfn))
|
||||
return VM_FAULT_SIGBUS;
|
||||
insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write);
|
||||
return VM_FAULT_NOPAGE;
|
||||
}
|
||||
|
||||
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
|
||||
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
|
||||
struct vm_area_struct *vma)
|
||||
@ -1414,41 +1455,41 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
|
||||
pmd_t *pmd, unsigned long addr)
|
||||
{
|
||||
pmd_t orig_pmd;
|
||||
spinlock_t *ptl;
|
||||
int ret = 0;
|
||||
|
||||
if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
|
||||
struct page *page;
|
||||
pgtable_t pgtable;
|
||||
pmd_t orig_pmd;
|
||||
/*
|
||||
* For architectures like ppc64 we look at deposited pgtable
|
||||
* when calling pmdp_huge_get_and_clear. So do the
|
||||
* pgtable_trans_huge_withdraw after finishing pmdp related
|
||||
* operations.
|
||||
*/
|
||||
orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
|
||||
tlb->fullmm);
|
||||
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
|
||||
pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
|
||||
if (is_huge_zero_pmd(orig_pmd)) {
|
||||
atomic_long_dec(&tlb->mm->nr_ptes);
|
||||
spin_unlock(ptl);
|
||||
if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1)
|
||||
return 0;
|
||||
/*
|
||||
* For architectures like ppc64 we look at deposited pgtable
|
||||
* when calling pmdp_huge_get_and_clear. So do the
|
||||
* pgtable_trans_huge_withdraw after finishing pmdp related
|
||||
* operations.
|
||||
*/
|
||||
orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
|
||||
tlb->fullmm);
|
||||
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
|
||||
if (vma_is_dax(vma)) {
|
||||
spin_unlock(ptl);
|
||||
if (is_huge_zero_pmd(orig_pmd))
|
||||
put_huge_zero_page();
|
||||
} else {
|
||||
page = pmd_page(orig_pmd);
|
||||
page_remove_rmap(page);
|
||||
VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
|
||||
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
|
||||
VM_BUG_ON_PAGE(!PageHead(page), page);
|
||||
atomic_long_dec(&tlb->mm->nr_ptes);
|
||||
spin_unlock(ptl);
|
||||
tlb_remove_page(tlb, page);
|
||||
}
|
||||
pte_free(tlb->mm, pgtable);
|
||||
ret = 1;
|
||||
} else if (is_huge_zero_pmd(orig_pmd)) {
|
||||
pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
|
||||
atomic_long_dec(&tlb->mm->nr_ptes);
|
||||
spin_unlock(ptl);
|
||||
put_huge_zero_page();
|
||||
} else {
|
||||
struct page *page = pmd_page(orig_pmd);
|
||||
page_remove_rmap(page);
|
||||
VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
|
||||
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
|
||||
VM_BUG_ON_PAGE(!PageHead(page), page);
|
||||
pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
|
||||
atomic_long_dec(&tlb->mm->nr_ptes);
|
||||
spin_unlock(ptl);
|
||||
tlb_remove_page(tlb, page);
|
||||
}
|
||||
return ret;
|
||||
return 1;
|
||||
}
|
||||
|
||||
int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
|
||||
@ -2285,8 +2326,12 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
|
||||
|
||||
static void khugepaged_alloc_sleep(void)
|
||||
{
|
||||
wait_event_freezable_timeout(khugepaged_wait, false,
|
||||
msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
|
||||
DEFINE_WAIT(wait);
|
||||
|
||||
add_wait_queue(&khugepaged_wait, &wait);
|
||||
freezable_schedule_timeout_interruptible(
|
||||
msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
|
||||
remove_wait_queue(&khugepaged_wait, &wait);
|
||||
}
|
||||
|
||||
static int khugepaged_node_load[MAX_NUMNODES];
|
||||
@ -2373,7 +2418,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
|
||||
*/
|
||||
up_read(&mm->mmap_sem);
|
||||
|
||||
*hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER);
|
||||
*hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
|
||||
if (unlikely(!*hpage)) {
|
||||
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
|
||||
*hpage = ERR_PTR(-ENOMEM);
|
||||
@ -2911,7 +2956,7 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
|
||||
pmd_t *pmd)
|
||||
{
|
||||
spinlock_t *ptl;
|
||||
struct page *page;
|
||||
struct page *page = NULL;
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
unsigned long haddr = address & HPAGE_PMD_MASK;
|
||||
unsigned long mmun_start; /* For mmu_notifiers */
|
||||
@ -2924,25 +2969,27 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
|
||||
again:
|
||||
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
|
||||
ptl = pmd_lock(mm, pmd);
|
||||
if (unlikely(!pmd_trans_huge(*pmd))) {
|
||||
spin_unlock(ptl);
|
||||
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
|
||||
return;
|
||||
}
|
||||
if (is_huge_zero_pmd(*pmd)) {
|
||||
if (unlikely(!pmd_trans_huge(*pmd)))
|
||||
goto unlock;
|
||||
if (vma_is_dax(vma)) {
|
||||
pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
|
||||
if (is_huge_zero_pmd(_pmd))
|
||||
put_huge_zero_page();
|
||||
} else if (is_huge_zero_pmd(*pmd)) {
|
||||
__split_huge_zero_page_pmd(vma, haddr, pmd);
|
||||
spin_unlock(ptl);
|
||||
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
|
||||
return;
|
||||
} else {
|
||||
page = pmd_page(*pmd);
|
||||
VM_BUG_ON_PAGE(!page_count(page), page);
|
||||
get_page(page);
|
||||
}
|
||||
page = pmd_page(*pmd);
|
||||
VM_BUG_ON_PAGE(!page_count(page), page);
|
||||
get_page(page);
|
||||
unlock:
|
||||
spin_unlock(ptl);
|
||||
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
|
||||
|
||||
split_huge_page(page);
|
||||
if (!page)
|
||||
return;
|
||||
|
||||
split_huge_page(page);
|
||||
put_page(page);
|
||||
|
||||
/*
|
||||
@ -2991,7 +3038,7 @@ static void split_huge_page_address(struct mm_struct *mm,
|
||||
split_huge_page_pmd_mm(mm, address, pmd);
|
||||
}
|
||||
|
||||
void __vma_adjust_trans_huge(struct vm_area_struct *vma,
|
||||
void vma_adjust_trans_huge(struct vm_area_struct *vma,
|
||||
unsigned long start,
|
||||
unsigned long end,
|
||||
long adjust_next)
|
||||
|
436
mm/hugetlb.c
436
mm/hugetlb.c
@ -64,7 +64,7 @@ DEFINE_SPINLOCK(hugetlb_lock);
|
||||
* prevent spurious OOMs when the hugepage pool is fully utilized.
|
||||
*/
|
||||
static int num_fault_mutexes;
|
||||
static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
|
||||
struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
|
||||
|
||||
/* Forward declaration */
|
||||
static int hugetlb_acct_memory(struct hstate *h, long delta);
|
||||
@ -240,11 +240,14 @@ struct file_region {
|
||||
|
||||
/*
|
||||
* Add the huge page range represented by [f, t) to the reserve
|
||||
* map. Existing regions will be expanded to accommodate the
|
||||
* specified range. We know only existing regions need to be
|
||||
* expanded, because region_add is only called after region_chg
|
||||
* with the same range. If a new file_region structure must
|
||||
* be allocated, it is done in region_chg.
|
||||
* map. In the normal case, existing regions will be expanded
|
||||
* to accommodate the specified range. Sufficient regions should
|
||||
* exist for expansion due to the previous call to region_chg
|
||||
* with the same range. However, it is possible that region_del
|
||||
* could have been called after region_chg and modified the map
|
||||
* in such a way that no region exists to be expanded. In this
|
||||
* case, pull a region descriptor from the cache associated with
|
||||
* the map and use that for the new range.
|
||||
*
|
||||
* Return the number of new huge pages added to the map. This
|
||||
* number is greater than or equal to zero.
|
||||
@ -261,6 +264,28 @@ static long region_add(struct resv_map *resv, long f, long t)
|
||||
if (f <= rg->to)
|
||||
break;
|
||||
|
||||
/*
|
||||
* If no region exists which can be expanded to include the
|
||||
* specified range, the list must have been modified by an
|
||||
* interleaving call to region_del().  Pull a region descriptor
|
||||
* from the cache and use it for this range.
|
||||
*/
|
||||
if (&rg->link == head || t < rg->from) {
|
||||
VM_BUG_ON(resv->region_cache_count <= 0);
|
||||
|
||||
resv->region_cache_count--;
|
||||
nrg = list_first_entry(&resv->region_cache, struct file_region,
|
||||
link);
|
||||
list_del(&nrg->link);
|
||||
|
||||
nrg->from = f;
|
||||
nrg->to = t;
|
||||
list_add(&nrg->link, rg->link.prev);
|
||||
|
||||
add += t - f;
|
||||
goto out_locked;
|
||||
}
|
||||
|
||||
/* Round our left edge to the current segment if it encloses us. */
|
||||
if (f > rg->from)
|
||||
f = rg->from;
|
||||
@ -294,6 +319,8 @@ static long region_add(struct resv_map *resv, long f, long t)
|
||||
add += t - nrg->to; /* Added to end of region */
|
||||
nrg->to = t;
|
||||
|
||||
out_locked:
|
||||
resv->adds_in_progress--;
|
||||
spin_unlock(&resv->lock);
|
||||
VM_BUG_ON(add < 0);
|
||||
return add;
|
||||
@ -312,11 +339,14 @@ static long region_add(struct resv_map *resv, long f, long t)
|
||||
* so that the subsequent region_add call will have all the
|
||||
* regions it needs and will not fail.
|
||||
*
|
||||
* Returns the number of huge pages that need to be added
|
||||
* to the existing reservation map for the range [f, t).
|
||||
* This number is greater or equal to zero. -ENOMEM is
|
||||
* returned if a new file_region structure is needed and can
|
||||
* not be allocated.
|
||||
* Upon entry, region_chg will also examine the cache of region descriptors
|
||||
* associated with the map. If there are not enough descriptors cached, one
|
||||
* will be allocated for the in progress add operation.
|
||||
*
|
||||
* Returns the number of huge pages that need to be added to the existing
|
||||
* reservation map for the range [f, t). This number is greater or equal to
|
||||
* zero. -ENOMEM is returned if a new file_region structure or cache entry
|
||||
* is needed and can not be allocated.
|
||||
*/
|
||||
static long region_chg(struct resv_map *resv, long f, long t)
|
||||
{
|
||||
@ -326,6 +356,31 @@ static long region_chg(struct resv_map *resv, long f, long t)
|
||||
|
||||
retry:
|
||||
spin_lock(&resv->lock);
|
||||
retry_locked:
|
||||
resv->adds_in_progress++;
|
||||
|
||||
/*
|
||||
* Check for sufficient descriptors in the cache to accommodate
|
||||
* the number of in progress add operations.
|
||||
*/
|
||||
if (resv->adds_in_progress > resv->region_cache_count) {
|
||||
struct file_region *trg;
|
||||
|
||||
VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
|
||||
/* Must drop lock to allocate a new descriptor. */
|
||||
resv->adds_in_progress--;
|
||||
spin_unlock(&resv->lock);
|
||||
|
||||
trg = kmalloc(sizeof(*trg), GFP_KERNEL);
|
||||
if (!trg)
|
||||
return -ENOMEM;
|
||||
|
||||
spin_lock(&resv->lock);
|
||||
list_add(&trg->link, &resv->region_cache);
|
||||
resv->region_cache_count++;
|
||||
goto retry_locked;
|
||||
}
|
||||
|
||||
/* Locate the region we are before or in. */
|
||||
list_for_each_entry(rg, head, link)
|
||||
if (f <= rg->to)
|
||||
@ -336,6 +391,7 @@ static long region_chg(struct resv_map *resv, long f, long t)
|
||||
* size such that we can guarantee to record the reservation. */
|
||||
if (&rg->link == head || t < rg->from) {
|
||||
if (!nrg) {
|
||||
resv->adds_in_progress--;
|
||||
spin_unlock(&resv->lock);
|
||||
nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
|
||||
if (!nrg)
|
||||
@ -385,43 +441,131 @@ static long region_chg(struct resv_map *resv, long f, long t)
|
||||
}
|
||||
|
||||
/*
|
||||
* Truncate the reserve map at index 'end'. Modify/truncate any
|
||||
* region which contains end. Delete any regions past end.
|
||||
* Return the number of huge pages removed from the map.
|
||||
* Abort the in progress add operation. The adds_in_progress field
|
||||
* of the resv_map keeps track of the operations in progress between
|
||||
* calls to region_chg and region_add. Operations are sometimes
|
||||
* aborted after the call to region_chg. In such cases, region_abort
|
||||
* is called to decrement the adds_in_progress counter.
|
||||
*
|
||||
* NOTE: The range arguments [f, t) are not needed or used in this
|
||||
* routine. They are kept to make reading the calling code easier as
|
||||
* arguments will match the associated region_chg call.
|
||||
*/
|
||||
static long region_truncate(struct resv_map *resv, long end)
|
||||
static void region_abort(struct resv_map *resv, long f, long t)
|
||||
{
|
||||
spin_lock(&resv->lock);
|
||||
VM_BUG_ON(!resv->region_cache_count);
|
||||
resv->adds_in_progress--;
|
||||
spin_unlock(&resv->lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Delete the specified range [f, t) from the reserve map. If the
|
||||
* t parameter is LONG_MAX, this indicates that ALL regions after f
|
||||
* should be deleted. Locate the regions which intersect [f, t)
|
||||
* and either trim, delete or split the existing regions.
|
||||
*
|
||||
* Returns the number of huge pages deleted from the reserve map.
|
||||
* In the normal case, the return value is zero or more. In the
|
||||
* case where a region must be split, a new region descriptor must
|
||||
* be allocated. If the allocation fails, -ENOMEM will be returned.
|
||||
* NOTE: If the parameter t == LONG_MAX, then we will never split
|
||||
* a region and possibly return -ENOMEM. Callers specifying
|
||||
* t == LONG_MAX do not need to check for -ENOMEM error.
|
||||
*/
|
||||
static long region_del(struct resv_map *resv, long f, long t)
|
||||
{
|
||||
struct list_head *head = &resv->regions;
|
||||
struct file_region *rg, *trg;
|
||||
long chg = 0;
|
||||
struct file_region *nrg = NULL;
|
||||
long del = 0;
|
||||
|
||||
retry:
|
||||
spin_lock(&resv->lock);
|
||||
/* Locate the region we are either in or before. */
|
||||
list_for_each_entry(rg, head, link)
|
||||
if (end <= rg->to)
|
||||
list_for_each_entry_safe(rg, trg, head, link) {
|
||||
if (rg->to <= f)
|
||||
continue;
|
||||
if (rg->from >= t)
|
||||
break;
|
||||
if (&rg->link == head)
|
||||
goto out;
|
||||
|
||||
/* If we are in the middle of a region then adjust it. */
|
||||
if (end > rg->from) {
|
||||
chg = rg->to - end;
|
||||
rg->to = end;
|
||||
rg = list_entry(rg->link.next, typeof(*rg), link);
|
||||
if (f > rg->from && t < rg->to) { /* Must split region */
|
||||
/*
|
||||
* Check for an entry in the cache before dropping
|
||||
* lock and attempting allocation.
|
||||
*/
|
||||
if (!nrg &&
|
||||
resv->region_cache_count > resv->adds_in_progress) {
|
||||
nrg = list_first_entry(&resv->region_cache,
|
||||
struct file_region,
|
||||
link);
|
||||
list_del(&nrg->link);
|
||||
resv->region_cache_count--;
|
||||
}
|
||||
|
||||
if (!nrg) {
|
||||
spin_unlock(&resv->lock);
|
||||
nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
|
||||
if (!nrg)
|
||||
return -ENOMEM;
|
||||
goto retry;
|
||||
}
|
||||
|
||||
del += t - f;
|
||||
|
||||
/* New entry for end of split region */
|
||||
nrg->from = t;
|
||||
nrg->to = rg->to;
|
||||
INIT_LIST_HEAD(&nrg->link);
|
||||
|
||||
/* Original entry is trimmed */
|
||||
rg->to = f;
|
||||
|
||||
list_add(&nrg->link, &rg->link);
|
||||
nrg = NULL;
|
||||
break;
|
||||
}
|
||||
|
||||
if (f <= rg->from && t >= rg->to) { /* Remove entire region */
|
||||
del += rg->to - rg->from;
|
||||
list_del(&rg->link);
|
||||
kfree(rg);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (f <= rg->from) { /* Trim beginning of region */
|
||||
del += t - rg->from;
|
||||
rg->from = t;
|
||||
} else { /* Trim end of region */
|
||||
del += rg->to - f;
|
||||
rg->to = f;
|
||||
}
|
||||
}
|
||||
|
||||
/* Drop any remaining regions. */
|
||||
list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
|
||||
if (&rg->link == head)
|
||||
break;
|
||||
chg += rg->to - rg->from;
|
||||
list_del(&rg->link);
|
||||
kfree(rg);
|
||||
}
|
||||
|
||||
out:
|
||||
spin_unlock(&resv->lock);
|
||||
return chg;
|
||||
kfree(nrg);
|
||||
return del;
|
||||
}
|
||||
|
||||
/*
|
||||
* A rare out of memory error was encountered which prevented removal of
|
||||
* the reserve map region for a page. The huge page itself was free'ed
|
||||
* and removed from the page cache. This routine will adjust the subpool
|
||||
* usage count, and the global reserve count if needed. By incrementing
|
||||
* these counts, the reserve map entry which could not be deleted will
|
||||
* appear as a "reserved" entry instead of simply dangling with incorrect
|
||||
* counts.
|
||||
*/
|
||||
void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve)
{
	long rsv_adjust;

	/* Always take one page from the subpool that backs @inode. */
	rsv_adjust = hugepage_subpool_get_pages(subpool_inode(inode), 1);

	/*
	 * The global accounting is only bumped when the caller asked for the
	 * reserve to be restored and the subpool call returned non-zero.
	 */
	if (!restore_reserve || !rsv_adjust)
		return;

	hugetlb_acct_memory(hstate_inode(inode), 1);
}
|
||||
|
||||
/*
|
||||
@ -544,22 +688,44 @@ static void set_vma_private_data(struct vm_area_struct *vma,
|
||||
struct resv_map *resv_map_alloc(void)
|
||||
{
|
||||
struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
|
||||
if (!resv_map)
|
||||
struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
|
||||
|
||||
if (!resv_map || !rg) {
|
||||
kfree(resv_map);
|
||||
kfree(rg);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
kref_init(&resv_map->refs);
|
||||
spin_lock_init(&resv_map->lock);
|
||||
INIT_LIST_HEAD(&resv_map->regions);
|
||||
|
||||
resv_map->adds_in_progress = 0;
|
||||
|
||||
INIT_LIST_HEAD(&resv_map->region_cache);
|
||||
list_add(&rg->link, &resv_map->region_cache);
|
||||
resv_map->region_cache_count = 1;
|
||||
|
||||
return resv_map;
|
||||
}
|
||||
|
||||
void resv_map_release(struct kref *ref)
|
||||
{
|
||||
struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
|
||||
struct list_head *head = &resv_map->region_cache;
|
||||
struct file_region *rg, *trg;
|
||||
|
||||
/* Clear out any active regions before we release the map. */
|
||||
region_truncate(resv_map, 0);
|
||||
region_del(resv_map, 0, LONG_MAX);
|
||||
|
||||
/* ... and any entries left in the cache */
|
||||
list_for_each_entry_safe(rg, trg, head, link) {
|
||||
list_del(&rg->link);
|
||||
kfree(rg);
|
||||
}
|
||||
|
||||
VM_BUG_ON(resv_map->adds_in_progress);
|
||||
|
||||
kfree(resv_map);
|
||||
}
|
||||
|
||||
@ -635,8 +801,19 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
|
||||
}
|
||||
|
||||
/* Shared mappings always use reserves */
|
||||
if (vma->vm_flags & VM_MAYSHARE)
|
||||
return true;
|
||||
if (vma->vm_flags & VM_MAYSHARE) {
|
||||
/*
|
||||
* We know VM_NORESERVE is not set. Therefore, there SHOULD
|
||||
* be a region map for all pages. The only situation where
|
||||
* there is no region map is if a hole was punched via
|
||||
* fallocate. In this case, there really are no reverves to
|
||||
* use. This situation is indicated if chg != 0.
|
||||
*/
|
||||
if (chg)
|
||||
return false;
|
||||
else
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Only the process that called mmap() has reserves for
|
||||
@ -1154,7 +1331,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
|
||||
{
|
||||
struct page *page;
|
||||
|
||||
page = alloc_pages_exact_node(nid,
|
||||
page = __alloc_pages_node(nid,
|
||||
htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
|
||||
__GFP_REPEAT|__GFP_NOWARN,
|
||||
huge_page_order(h));
|
||||
@ -1306,7 +1483,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
|
||||
__GFP_REPEAT|__GFP_NOWARN,
|
||||
huge_page_order(h));
|
||||
else
|
||||
page = alloc_pages_exact_node(nid,
|
||||
page = __alloc_pages_node(nid,
|
||||
htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
|
||||
__GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
|
||||
|
||||
@ -1473,16 +1650,19 @@ static void return_unused_surplus_pages(struct hstate *h,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* vma_needs_reservation and vma_commit_reservation are used by the huge
|
||||
* page allocation routines to manage reservations.
|
||||
* vma_needs_reservation, vma_commit_reservation and vma_end_reservation
|
||||
* are used by the huge page allocation routines to manage reservations.
|
||||
*
|
||||
* vma_needs_reservation is called to determine if the huge page at addr
|
||||
* within the vma has an associated reservation. If a reservation is
|
||||
* needed, the value 1 is returned. The caller is then responsible for
|
||||
* managing the global reservation and subpool usage counts. After
|
||||
* the huge page has been allocated, vma_commit_reservation is called
|
||||
* to add the page to the reservation map.
|
||||
* to add the page to the reservation map. If the page allocation fails,
|
||||
* the reservation must be ended instead of committed. vma_end_reservation
|
||||
* is called in such cases.
|
||||
*
|
||||
* In the normal case, vma_commit_reservation returns the same value
|
||||
* as the preceding vma_needs_reservation call. The only time this
|
||||
@ -1490,9 +1670,14 @@ static void return_unused_surplus_pages(struct hstate *h,
|
||||
* is the responsibility of the caller to notice the difference and
|
||||
* take appropriate action.
|
||||
*/
|
||||
enum vma_resv_mode {
|
||||
VMA_NEEDS_RESV,
|
||||
VMA_COMMIT_RESV,
|
||||
VMA_END_RESV,
|
||||
};
|
||||
static long __vma_reservation_common(struct hstate *h,
|
||||
struct vm_area_struct *vma, unsigned long addr,
|
||||
bool commit)
|
||||
enum vma_resv_mode mode)
|
||||
{
|
||||
struct resv_map *resv;
|
||||
pgoff_t idx;
|
||||
@ -1503,10 +1688,20 @@ static long __vma_reservation_common(struct hstate *h,
|
||||
return 1;
|
||||
|
||||
idx = vma_hugecache_offset(h, vma, addr);
|
||||
if (commit)
|
||||
ret = region_add(resv, idx, idx + 1);
|
||||
else
|
||||
switch (mode) {
|
||||
case VMA_NEEDS_RESV:
|
||||
ret = region_chg(resv, idx, idx + 1);
|
||||
break;
|
||||
case VMA_COMMIT_RESV:
|
||||
ret = region_add(resv, idx, idx + 1);
|
||||
break;
|
||||
case VMA_END_RESV:
|
||||
region_abort(resv, idx, idx + 1);
|
||||
ret = 0;
|
||||
break;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
|
||||
if (vma->vm_flags & VM_MAYSHARE)
|
||||
return ret;
|
||||
@ -1517,47 +1712,79 @@ static long __vma_reservation_common(struct hstate *h,
|
||||
static long vma_needs_reservation(struct hstate *h,
|
||||
struct vm_area_struct *vma, unsigned long addr)
|
||||
{
|
||||
return __vma_reservation_common(h, vma, addr, false);
|
||||
return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
|
||||
}
|
||||
|
||||
static long vma_commit_reservation(struct hstate *h,
|
||||
struct vm_area_struct *vma, unsigned long addr)
|
||||
{
|
||||
return __vma_reservation_common(h, vma, addr, true);
|
||||
return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
|
||||
}
|
||||
|
||||
static struct page *alloc_huge_page(struct vm_area_struct *vma,
|
||||
static void vma_end_reservation(struct hstate *h,
|
||||
struct vm_area_struct *vma, unsigned long addr)
|
||||
{
|
||||
(void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
|
||||
}
|
||||
|
||||
struct page *alloc_huge_page(struct vm_area_struct *vma,
|
||||
unsigned long addr, int avoid_reserve)
|
||||
{
|
||||
struct hugepage_subpool *spool = subpool_vma(vma);
|
||||
struct hstate *h = hstate_vma(vma);
|
||||
struct page *page;
|
||||
long chg, commit;
|
||||
long map_chg, map_commit;
|
||||
long gbl_chg;
|
||||
int ret, idx;
|
||||
struct hugetlb_cgroup *h_cg;
|
||||
|
||||
idx = hstate_index(h);
|
||||
/*
|
||||
* Processes that did not create the mapping will have no
|
||||
* reserves and will not have accounted against subpool
|
||||
* limit. Check that the subpool limit can be made before
|
||||
* satisfying the allocation MAP_NORESERVE mappings may also
|
||||
* need pages and subpool limit allocated if no reserve
|
||||
* mapping overlaps.
|
||||
* Examine the region/reserve map to determine if the process
|
||||
* has a reservation for the page to be allocated. A return
|
||||
* code of zero indicates a reservation exists (no change).
|
||||
*/
|
||||
chg = vma_needs_reservation(h, vma, addr);
|
||||
if (chg < 0)
|
||||
map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
|
||||
if (map_chg < 0)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
if (chg || avoid_reserve)
|
||||
if (hugepage_subpool_get_pages(spool, 1) < 0)
|
||||
|
||||
/*
|
||||
* Processes that did not create the mapping will have no
|
||||
* reserves as indicated by the region/reserve map. Check
|
||||
* that the allocation will not exceed the subpool limit.
|
||||
* Allocations for MAP_NORESERVE mappings also need to be
|
||||
* checked against any subpool limit.
|
||||
*/
|
||||
if (map_chg || avoid_reserve) {
|
||||
gbl_chg = hugepage_subpool_get_pages(spool, 1);
|
||||
if (gbl_chg < 0) {
|
||||
vma_end_reservation(h, vma, addr);
|
||||
return ERR_PTR(-ENOSPC);
|
||||
}
|
||||
|
||||
/*
|
||||
* Even though there was no reservation in the region/reserve
|
||||
* map, there could be reservations associated with the
|
||||
* subpool that can be used. This would be indicated if the
|
||||
* return value of hugepage_subpool_get_pages() is zero.
|
||||
* However, if avoid_reserve is specified we still avoid even
|
||||
* the subpool reservations.
|
||||
*/
|
||||
if (avoid_reserve)
|
||||
gbl_chg = 1;
|
||||
}
|
||||
|
||||
ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
|
||||
if (ret)
|
||||
goto out_subpool_put;
|
||||
|
||||
spin_lock(&hugetlb_lock);
|
||||
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);
|
||||
/*
|
||||
* gbl_chg is passed to indicate whether or not a page must be taken
|
||||
* from the global free pool (global change). gbl_chg == 0 indicates
|
||||
* a reservation exists for the allocation.
|
||||
*/
|
||||
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
|
||||
if (!page) {
|
||||
spin_unlock(&hugetlb_lock);
|
||||
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
|
||||
@ -1573,8 +1800,8 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
|
||||
|
||||
set_page_private(page, (unsigned long)spool);
|
||||
|
||||
commit = vma_commit_reservation(h, vma, addr);
|
||||
if (unlikely(chg > commit)) {
|
||||
map_commit = vma_commit_reservation(h, vma, addr);
|
||||
if (unlikely(map_chg > map_commit)) {
|
||||
/*
|
||||
* The page was added to the reservation map between
|
||||
* vma_needs_reservation and vma_commit_reservation.
|
||||
@ -1594,8 +1821,9 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
|
||||
out_uncharge_cgroup:
|
||||
hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
|
||||
out_subpool_put:
|
||||
if (chg || avoid_reserve)
|
||||
if (map_chg || avoid_reserve)
|
||||
hugepage_subpool_put_pages(spool, 1);
|
||||
vma_end_reservation(h, vma, addr);
|
||||
return ERR_PTR(-ENOSPC);
|
||||
}
|
||||
|
||||
@ -2311,7 +2539,7 @@ static void __exit hugetlb_exit(void)
|
||||
}
|
||||
|
||||
kobject_put(hugepages_kobj);
|
||||
kfree(htlb_fault_mutex_table);
|
||||
kfree(hugetlb_fault_mutex_table);
|
||||
}
|
||||
module_exit(hugetlb_exit);
|
||||
|
||||
@ -2344,12 +2572,12 @@ static int __init hugetlb_init(void)
|
||||
#else
|
||||
num_fault_mutexes = 1;
|
||||
#endif
|
||||
htlb_fault_mutex_table =
|
||||
hugetlb_fault_mutex_table =
|
||||
kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL);
|
||||
BUG_ON(!htlb_fault_mutex_table);
|
||||
BUG_ON(!hugetlb_fault_mutex_table);
|
||||
|
||||
for (i = 0; i < num_fault_mutexes; i++)
|
||||
mutex_init(&htlb_fault_mutex_table[i]);
|
||||
mutex_init(&hugetlb_fault_mutex_table[i]);
|
||||
return 0;
|
||||
}
|
||||
module_init(hugetlb_init);
|
||||
@ -3147,6 +3375,23 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
|
||||
return page != NULL;
|
||||
}
|
||||
|
||||
int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
|
||||
pgoff_t idx)
|
||||
{
|
||||
struct inode *inode = mapping->host;
|
||||
struct hstate *h = hstate_inode(inode);
|
||||
int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
|
||||
|
||||
if (err)
|
||||
return err;
|
||||
ClearPagePrivate(page);
|
||||
|
||||
spin_lock(&inode->i_lock);
|
||||
inode->i_blocks += blocks_per_huge_page(h);
|
||||
spin_unlock(&inode->i_lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
struct address_space *mapping, pgoff_t idx,
|
||||
unsigned long address, pte_t *ptep, unsigned int flags)
|
||||
@ -3194,21 +3439,13 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
set_page_huge_active(page);
|
||||
|
||||
if (vma->vm_flags & VM_MAYSHARE) {
|
||||
int err;
|
||||
struct inode *inode = mapping->host;
|
||||
|
||||
err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
|
||||
int err = huge_add_to_page_cache(page, mapping, idx);
|
||||
if (err) {
|
||||
put_page(page);
|
||||
if (err == -EEXIST)
|
||||
goto retry;
|
||||
goto out;
|
||||
}
|
||||
ClearPagePrivate(page);
|
||||
|
||||
spin_lock(&inode->i_lock);
|
||||
inode->i_blocks += blocks_per_huge_page(h);
|
||||
spin_unlock(&inode->i_lock);
|
||||
} else {
|
||||
lock_page(page);
|
||||
if (unlikely(anon_vma_prepare(vma))) {
|
||||
@ -3236,11 +3473,14 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
* any allocations necessary to record that reservation occur outside
|
||||
* the spinlock.
|
||||
*/
|
||||
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
|
||||
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
|
||||
if (vma_needs_reservation(h, vma, address) < 0) {
|
||||
ret = VM_FAULT_OOM;
|
||||
goto backout_unlocked;
|
||||
}
|
||||
/* Just decrements count, does not deallocate */
|
||||
vma_end_reservation(h, vma, address);
|
||||
}
|
||||
|
||||
ptl = huge_pte_lockptr(h, mm, ptep);
|
||||
spin_lock(ptl);
|
||||
@ -3280,7 +3520,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
|
||||
u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
|
||||
struct vm_area_struct *vma,
|
||||
struct address_space *mapping,
|
||||
pgoff_t idx, unsigned long address)
|
||||
@ -3305,7 +3545,7 @@ static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
|
||||
* For uniprocessor systems we always use a single mutex, so just
|
||||
* return 0 and avoid the hashing overhead.
|
||||
*/
|
||||
static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
|
||||
u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
|
||||
struct vm_area_struct *vma,
|
||||
struct address_space *mapping,
|
||||
pgoff_t idx, unsigned long address)
|
||||
@ -3353,8 +3593,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
* get spurious allocation failures if two CPUs race to instantiate
|
||||
* the same page in the page cache.
|
||||
*/
|
||||
hash = fault_mutex_hash(h, mm, vma, mapping, idx, address);
|
||||
mutex_lock(&htlb_fault_mutex_table[hash]);
|
||||
hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address);
|
||||
mutex_lock(&hugetlb_fault_mutex_table[hash]);
|
||||
|
||||
entry = huge_ptep_get(ptep);
|
||||
if (huge_pte_none(entry)) {
|
||||
@ -3387,6 +3627,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
ret = VM_FAULT_OOM;
|
||||
goto out_mutex;
|
||||
}
|
||||
/* Just decrements count, does not deallocate */
|
||||
vma_end_reservation(h, vma, address);
|
||||
|
||||
if (!(vma->vm_flags & VM_MAYSHARE))
|
||||
pagecache_page = hugetlbfs_pagecache_page(h,
|
||||
@ -3437,7 +3679,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
put_page(pagecache_page);
|
||||
}
|
||||
out_mutex:
|
||||
mutex_unlock(&htlb_fault_mutex_table[hash]);
|
||||
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
|
||||
/*
|
||||
* Generally it's safe to hold refcount during waiting page lock. But
|
||||
* here we just wait to defer the next page fault to avoid busy loop and
|
||||
@ -3726,12 +3968,15 @@ int hugetlb_reserve_pages(struct inode *inode,
|
||||
}
|
||||
return 0;
|
||||
out_err:
|
||||
if (!vma || vma->vm_flags & VM_MAYSHARE)
|
||||
region_abort(resv_map, from, to);
|
||||
if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
|
||||
kref_put(&resv_map->refs, resv_map_release);
|
||||
return ret;
|
||||
}
|
||||
|
||||
void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
|
||||
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
|
||||
long freed)
|
||||
{
|
||||
struct hstate *h = hstate_inode(inode);
|
||||
struct resv_map *resv_map = inode_resv_map(inode);
|
||||
@ -3739,8 +3984,17 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
|
||||
struct hugepage_subpool *spool = subpool_inode(inode);
|
||||
long gbl_reserve;
|
||||
|
||||
if (resv_map)
|
||||
chg = region_truncate(resv_map, offset);
|
||||
if (resv_map) {
|
||||
chg = region_del(resv_map, start, end);
|
||||
/*
|
||||
* region_del() can fail in the rare case where a region
|
||||
* must be split and another region descriptor can not be
|
||||
* allocated. If end == LONG_MAX, it will not fail.
|
||||
*/
|
||||
if (chg < 0)
|
||||
return chg;
|
||||
}
|
||||
|
||||
spin_lock(&inode->i_lock);
|
||||
inode->i_blocks -= (blocks_per_huge_page(h) * freed);
|
||||
spin_unlock(&inode->i_lock);
|
||||
@ -3751,6 +4005,8 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
|
||||
*/
|
||||
gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
|
||||
hugetlb_acct_memory(h, -gbl_reserve);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
|
||||
|
@ -58,7 +58,7 @@ static int hwpoison_inject(void *data, u64 val)
|
||||
pr_info("Injecting memory failure at pfn %#lx\n", pfn);
|
||||
return memory_failure(pfn, 18, MF_COUNT_INCREASED);
|
||||
put_out:
|
||||
put_page(p);
|
||||
put_hwpoison_page(p);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -182,6 +182,7 @@ struct compact_control {
|
||||
unsigned long nr_migratepages; /* Number of pages to migrate */
|
||||
unsigned long free_pfn; /* isolate_freepages search base */
|
||||
unsigned long migrate_pfn; /* isolate_migratepages search base */
|
||||
unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
|
||||
enum migrate_mode mode; /* Async or sync migration mode */
|
||||
bool ignore_skip_hint; /* Scan blocks even if marked skip */
|
||||
int order; /* order a direct compactor needs */
|
||||
|
@ -838,6 +838,7 @@ static void __init log_early(int op_type, const void *ptr, size_t size,
|
||||
}
|
||||
|
||||
if (crt_early_log >= ARRAY_SIZE(early_log)) {
|
||||
crt_early_log++;
|
||||
kmemleak_disable();
|
||||
return;
|
||||
}
|
||||
@ -1882,7 +1883,7 @@ void __init kmemleak_init(void)
|
||||
object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
|
||||
scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
|
||||
|
||||
if (crt_early_log >= ARRAY_SIZE(early_log))
|
||||
if (crt_early_log > ARRAY_SIZE(early_log))
|
||||
pr_warning("Early log buffer exceeded (%d), please increase "
|
||||
"DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", crt_early_log);
|
||||
|
||||
|
@ -99,8 +99,8 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item)
|
||||
struct list_lru_one *l;
|
||||
|
||||
spin_lock(&nlru->lock);
|
||||
l = list_lru_from_kmem(nlru, item);
|
||||
if (list_empty(item)) {
|
||||
l = list_lru_from_kmem(nlru, item);
|
||||
list_add_tail(item, &l->list);
|
||||
l->nr_items++;
|
||||
spin_unlock(&nlru->lock);
|
||||
@ -118,8 +118,8 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
|
||||
struct list_lru_one *l;
|
||||
|
||||
spin_lock(&nlru->lock);
|
||||
l = list_lru_from_kmem(nlru, item);
|
||||
if (!list_empty(item)) {
|
||||
l = list_lru_from_kmem(nlru, item);
|
||||
list_del_init(item);
|
||||
l->nr_items--;
|
||||
spin_unlock(&nlru->lock);
|
||||
|
@ -301,7 +301,7 @@ static long madvise_remove(struct vm_area_struct *vma,
|
||||
|
||||
*prev = NULL; /* tell sys_madvise we drop mmap_sem */
|
||||
|
||||
if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB))
|
||||
if (vma->vm_flags & VM_LOCKED)
|
||||
return -EINVAL;
|
||||
|
||||
f = vma->vm_file;
|
||||
|
@ -91,7 +91,7 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p
|
||||
return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
|
||||
}
|
||||
|
||||
static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
|
||||
bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
|
||||
phys_addr_t base, phys_addr_t size)
|
||||
{
|
||||
unsigned long i;
|
||||
@ -103,7 +103,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
|
||||
break;
|
||||
}
|
||||
|
||||
return (i < type->cnt) ? i : -1;
|
||||
return i < type->cnt;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -569,6 +569,7 @@ int __init_memblock memblock_add_range(struct memblock_type *type,
|
||||
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
|
||||
WARN_ON(nid != memblock_get_region_node(rgn));
|
||||
#endif
|
||||
WARN_ON(flags != rgn->flags);
|
||||
nr_new++;
|
||||
if (insert)
|
||||
memblock_insert_region(type, i++, base,
|
||||
@ -614,14 +615,14 @@ static int __init_memblock memblock_add_region(phys_addr_t base,
|
||||
int nid,
|
||||
unsigned long flags)
|
||||
{
|
||||
struct memblock_type *_rgn = &memblock.memory;
|
||||
struct memblock_type *type = &memblock.memory;
|
||||
|
||||
memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
|
||||
(unsigned long long)base,
|
||||
(unsigned long long)base + size - 1,
|
||||
flags, (void *)_RET_IP_);
|
||||
|
||||
return memblock_add_range(_rgn, base, size, nid, flags);
|
||||
return memblock_add_range(type, base, size, nid, flags);
|
||||
}
|
||||
|
||||
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
|
||||
@ -761,7 +762,7 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
|
||||
*
|
||||
* This function isolates region [@base, @base + @size), and sets/clears flag
|
||||
*
|
||||
* Return 0 on succees, -errno on failure.
|
||||
* Return 0 on success, -errno on failure.
|
||||
*/
|
||||
static int __init_memblock memblock_setclr_flag(phys_addr_t base,
|
||||
phys_addr_t size, int set, int flag)
|
||||
@ -788,7 +789,7 @@ static int __init_memblock memblock_setclr_flag(phys_addr_t base,
|
||||
* @base: the base phys addr of the region
|
||||
* @size: the size of the region
|
||||
*
|
||||
* Return 0 on succees, -errno on failure.
|
||||
* Return 0 on success, -errno on failure.
|
||||
*/
|
||||
int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
|
||||
{
|
||||
@ -800,7 +801,7 @@ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
|
||||
* @base: the base phys addr of the region
|
||||
* @size: the size of the region
|
||||
*
|
||||
* Return 0 on succees, -errno on failure.
|
||||
* Return 0 on success, -errno on failure.
|
||||
*/
|
||||
int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
|
||||
{
|
||||
@ -812,7 +813,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
|
||||
* @base: the base phys addr of the region
|
||||
* @size: the size of the region
|
||||
*
|
||||
* Return 0 on succees, -errno on failure.
|
||||
* Return 0 on success, -errno on failure.
|
||||
*/
|
||||
int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
|
||||
{
|
||||
@ -834,10 +835,10 @@ void __init_memblock __next_reserved_mem_region(u64 *idx,
|
||||
phys_addr_t *out_start,
|
||||
phys_addr_t *out_end)
|
||||
{
|
||||
struct memblock_type *rsv = &memblock.reserved;
|
||||
struct memblock_type *type = &memblock.reserved;
|
||||
|
||||
if (*idx >= 0 && *idx < rsv->cnt) {
|
||||
struct memblock_region *r = &rsv->regions[*idx];
|
||||
if (*idx >= 0 && *idx < type->cnt) {
|
||||
struct memblock_region *r = &type->regions[*idx];
|
||||
phys_addr_t base = r->base;
|
||||
phys_addr_t size = r->size;
|
||||
|
||||
@ -975,7 +976,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
|
||||
* in type_b.
|
||||
*
|
||||
* @idx: pointer to u64 loop variable
|
||||
* @nid: nid: node selector, %NUMA_NO_NODE for all nodes
|
||||
* @nid: node selector, %NUMA_NO_NODE for all nodes
|
||||
* @flags: pick from blocks based on memory attributes
|
||||
* @type_a: pointer to memblock_type from where the range is taken
|
||||
* @type_b: pointer to memblock_type which excludes memory from being taken
|
||||
@ -1565,12 +1566,12 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size
|
||||
* Check if the region [@base, @base+@size) intersects a reserved memory block.
|
||||
*
|
||||
* RETURNS:
|
||||
* 0 if false, non-zero if true
|
||||
* True if they intersect, false if not.
|
||||
*/
|
||||
int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
|
||||
bool __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
|
||||
{
|
||||
memblock_cap_size(base, &size);
|
||||
return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
|
||||
return memblock_overlaps_region(&memblock.reserved, base, size);
|
||||
}
|
||||
|
||||
void __init_memblock memblock_trim_memory(phys_addr_t align)
|
||||
|
390
mm/memcontrol.c
390
mm/memcontrol.c
@ -111,56 +111,10 @@ static const char * const mem_cgroup_lru_names[] = {
|
||||
"unevictable",
|
||||
};
|
||||
|
||||
/*
|
||||
* Per memcg event counter is incremented at every pagein/pageout. With THP,
|
||||
* it will be incremented by the number of pages. This counter is used
|
||||
* to trigger some periodic events. This is straightforward and better
|
||||
* than using jiffies etc. to handle periodic memcg event.
|
||||
*/
|
||||
enum mem_cgroup_events_target {
|
||||
MEM_CGROUP_TARGET_THRESH,
|
||||
MEM_CGROUP_TARGET_SOFTLIMIT,
|
||||
MEM_CGROUP_TARGET_NUMAINFO,
|
||||
MEM_CGROUP_NTARGETS,
|
||||
};
|
||||
#define THRESHOLDS_EVENTS_TARGET 128
|
||||
#define SOFTLIMIT_EVENTS_TARGET 1024
|
||||
#define NUMAINFO_EVENTS_TARGET 1024
|
||||
|
||||
struct mem_cgroup_stat_cpu {
|
||||
long count[MEM_CGROUP_STAT_NSTATS];
|
||||
unsigned long events[MEMCG_NR_EVENTS];
|
||||
unsigned long nr_page_events;
|
||||
unsigned long targets[MEM_CGROUP_NTARGETS];
|
||||
};
|
||||
|
||||
struct reclaim_iter {
|
||||
struct mem_cgroup *position;
|
||||
/* scan generation, increased every round-trip */
|
||||
unsigned int generation;
|
||||
};
|
||||
|
||||
/*
|
||||
* per-zone information in memory controller.
|
||||
*/
|
||||
struct mem_cgroup_per_zone {
|
||||
struct lruvec lruvec;
|
||||
unsigned long lru_size[NR_LRU_LISTS];
|
||||
|
||||
struct reclaim_iter iter[DEF_PRIORITY + 1];
|
||||
|
||||
struct rb_node tree_node; /* RB tree node */
|
||||
unsigned long usage_in_excess;/* Set to the value by which */
|
||||
/* the soft limit is exceeded*/
|
||||
bool on_tree;
|
||||
struct mem_cgroup *memcg; /* Back pointer, we cannot */
|
||||
/* use container_of */
|
||||
};
|
||||
|
||||
struct mem_cgroup_per_node {
|
||||
struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
|
||||
};
|
||||
|
||||
/*
|
||||
* Cgroups above their limits are maintained in a RB-Tree, independent of
|
||||
* their hierarchy representation
|
||||
@ -181,32 +135,6 @@ struct mem_cgroup_tree {
|
||||
|
||||
static struct mem_cgroup_tree soft_limit_tree __read_mostly;
|
||||
|
||||
struct mem_cgroup_threshold {
|
||||
struct eventfd_ctx *eventfd;
|
||||
unsigned long threshold;
|
||||
};
|
||||
|
||||
/* For threshold */
|
||||
struct mem_cgroup_threshold_ary {
|
||||
/* An array index points to threshold just below or equal to usage. */
|
||||
int current_threshold;
|
||||
/* Size of entries[] */
|
||||
unsigned int size;
|
||||
/* Array of thresholds */
|
||||
struct mem_cgroup_threshold entries[0];
|
||||
};
|
||||
|
||||
struct mem_cgroup_thresholds {
|
||||
/* Primary thresholds array */
|
||||
struct mem_cgroup_threshold_ary *primary;
|
||||
/*
|
||||
* Spare threshold array.
|
||||
* This is needed to make mem_cgroup_unregister_event() "never fail".
|
||||
* It must be able to store at least primary->size - 1 entries.
|
||||
*/
|
||||
struct mem_cgroup_threshold_ary *spare;
|
||||
};
|
||||
|
||||
/* for OOM */
|
||||
struct mem_cgroup_eventfd_list {
|
||||
struct list_head list;
|
||||
@ -256,113 +184,6 @@ struct mem_cgroup_event {
|
||||
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
|
||||
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
|
||||
|
||||
/*
|
||||
* The memory controller data structure. The memory controller controls both
|
||||
* page cache and RSS per cgroup. We would eventually like to provide
|
||||
* statistics based on the statistics developed by Rik Van Riel for clock-pro,
|
||||
* to help the administrator determine what knobs to tune.
|
||||
*/
|
||||
struct mem_cgroup {
|
||||
struct cgroup_subsys_state css;
|
||||
|
||||
/* Accounted resources */
|
||||
struct page_counter memory;
|
||||
struct page_counter memsw;
|
||||
struct page_counter kmem;
|
||||
|
||||
/* Normal memory consumption range */
|
||||
unsigned long low;
|
||||
unsigned long high;
|
||||
|
||||
unsigned long soft_limit;
|
||||
|
||||
/* vmpressure notifications */
|
||||
struct vmpressure vmpressure;
|
||||
|
||||
/* css_online() has been completed */
|
||||
int initialized;
|
||||
|
||||
/*
|
||||
* Should the accounting and control be hierarchical, per subtree?
|
||||
*/
|
||||
bool use_hierarchy;
|
||||
|
||||
/* protected by memcg_oom_lock */
|
||||
bool oom_lock;
|
||||
int under_oom;
|
||||
|
||||
int swappiness;
|
||||
/* OOM-Killer disable */
|
||||
int oom_kill_disable;
|
||||
|
||||
/* protect arrays of thresholds */
|
||||
struct mutex thresholds_lock;
|
||||
|
||||
/* thresholds for memory usage. RCU-protected */
|
||||
struct mem_cgroup_thresholds thresholds;
|
||||
|
||||
/* thresholds for mem+swap usage. RCU-protected */
|
||||
struct mem_cgroup_thresholds memsw_thresholds;
|
||||
|
||||
/* For oom notifier event fd */
|
||||
struct list_head oom_notify;
|
||||
|
||||
/*
|
||||
* Should we move charges of a task when a task is moved into this
|
||||
* mem_cgroup ? And what type of charges should we move ?
|
||||
*/
|
||||
unsigned long move_charge_at_immigrate;
|
||||
/*
|
||||
* set > 0 if pages under this cgroup are moving to other cgroup.
|
||||
*/
|
||||
atomic_t moving_account;
|
||||
/* taken only while moving_account > 0 */
|
||||
spinlock_t move_lock;
|
||||
struct task_struct *move_lock_task;
|
||||
unsigned long move_lock_flags;
|
||||
/*
|
||||
* percpu counter.
|
||||
*/
|
||||
struct mem_cgroup_stat_cpu __percpu *stat;
|
||||
spinlock_t pcp_counter_lock;
|
||||
|
||||
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
|
||||
struct cg_proto tcp_mem;
|
||||
#endif
|
||||
#if defined(CONFIG_MEMCG_KMEM)
|
||||
/* Index in the kmem_cache->memcg_params.memcg_caches array */
|
||||
int kmemcg_id;
|
||||
bool kmem_acct_activated;
|
||||
bool kmem_acct_active;
|
||||
#endif
|
||||
|
||||
int last_scanned_node;
|
||||
#if MAX_NUMNODES > 1
|
||||
nodemask_t scan_nodes;
|
||||
atomic_t numainfo_events;
|
||||
atomic_t numainfo_updating;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_CGROUP_WRITEBACK
|
||||
struct list_head cgwb_list;
|
||||
struct wb_domain cgwb_domain;
|
||||
#endif
|
||||
|
||||
/* List of events which userspace want to receive */
|
||||
struct list_head event_list;
|
||||
spinlock_t event_list_lock;
|
||||
|
||||
struct mem_cgroup_per_node *nodeinfo[0];
|
||||
/* WARNING: nodeinfo must be the last member here */
|
||||
};
|
||||
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
bool memcg_kmem_is_active(struct mem_cgroup *memcg)
|
||||
{
|
||||
return memcg->kmem_acct_active;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Stuff for moving charges at task migration. */
|
||||
/*
|
||||
* Types of charges to be moved.
|
||||
@ -423,11 +244,6 @@ enum res_type {
|
||||
*/
|
||||
static DEFINE_MUTEX(memcg_create_mutex);
|
||||
|
||||
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
|
||||
{
|
||||
return s ? container_of(s, struct mem_cgroup, css) : NULL;
|
||||
}
|
||||
|
||||
/* Some nice accessors for the vmpressure. */
|
||||
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
|
||||
{
|
||||
@ -499,8 +315,7 @@ void sock_update_memcg(struct sock *sk)
|
||||
rcu_read_lock();
|
||||
memcg = mem_cgroup_from_task(current);
|
||||
cg_proto = sk->sk_prot->proto_cgroup(memcg);
|
||||
if (!mem_cgroup_is_root(memcg) &&
|
||||
memcg_proto_active(cg_proto) &&
|
||||
if (cg_proto && test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags) &&
|
||||
css_tryget_online(&memcg->css)) {
|
||||
sk->sk_cgrp = cg_proto;
|
||||
}
|
||||
@ -593,11 +408,6 @@ mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
|
||||
return &memcg->nodeinfo[nid]->zoneinfo[zid];
|
||||
}
|
||||
|
||||
struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
|
||||
{
|
||||
return &memcg->css;
|
||||
}
|
||||
|
||||
/**
|
||||
* mem_cgroup_css_from_page - css of the memcg associated with a page
|
||||
* @page: page of interest
|
||||
@ -876,14 +686,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
|
||||
__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
|
||||
}
|
||||
|
||||
unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
|
||||
{
|
||||
struct mem_cgroup_per_zone *mz;
|
||||
|
||||
mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
|
||||
return mz->lru_size[lru];
|
||||
}
|
||||
|
||||
static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
|
||||
int nid,
|
||||
unsigned int lru_mask)
|
||||
@ -986,6 +788,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
|
||||
|
||||
return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
|
||||
}
|
||||
EXPORT_SYMBOL(mem_cgroup_from_task);
|
||||
|
||||
static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
|
||||
{
|
||||
@ -1031,7 +834,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
|
||||
struct mem_cgroup *prev,
|
||||
struct mem_cgroup_reclaim_cookie *reclaim)
|
||||
{
|
||||
struct reclaim_iter *uninitialized_var(iter);
|
||||
struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
|
||||
struct cgroup_subsys_state *css = NULL;
|
||||
struct mem_cgroup *memcg = NULL;
|
||||
struct mem_cgroup *pos = NULL;
|
||||
@ -1173,30 +976,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
|
||||
iter != NULL; \
|
||||
iter = mem_cgroup_iter(NULL, iter, NULL))
|
||||
|
||||
void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
|
||||
{
|
||||
struct mem_cgroup *memcg;
|
||||
|
||||
rcu_read_lock();
|
||||
memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
|
||||
if (unlikely(!memcg))
|
||||
goto out;
|
||||
|
||||
switch (idx) {
|
||||
case PGFAULT:
|
||||
this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
|
||||
break;
|
||||
case PGMAJFAULT:
|
||||
this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
|
||||
break;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
out:
|
||||
rcu_read_unlock();
|
||||
}
|
||||
EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
|
||||
|
||||
/**
|
||||
* mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
|
||||
* @zone: zone of the wanted lruvec
|
||||
@ -1295,15 +1074,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
|
||||
VM_BUG_ON((long)(*lru_size) < 0);
|
||||
}
|
||||
|
||||
bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root)
|
||||
{
|
||||
if (root == memcg)
|
||||
return true;
|
||||
if (!root->use_hierarchy)
|
||||
return false;
|
||||
return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
|
||||
}
|
||||
|
||||
bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
|
||||
{
|
||||
struct mem_cgroup *task_memcg;
|
||||
@ -1330,39 +1100,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
|
||||
return ret;
|
||||
}
|
||||
|
||||
int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
|
||||
{
|
||||
unsigned long inactive_ratio;
|
||||
unsigned long inactive;
|
||||
unsigned long active;
|
||||
unsigned long gb;
|
||||
|
||||
inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
|
||||
active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
|
||||
|
||||
gb = (inactive + active) >> (30 - PAGE_SHIFT);
|
||||
if (gb)
|
||||
inactive_ratio = int_sqrt(10 * gb);
|
||||
else
|
||||
inactive_ratio = 1;
|
||||
|
||||
return inactive * inactive_ratio < active;
|
||||
}
|
||||
|
||||
bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
|
||||
{
|
||||
struct mem_cgroup_per_zone *mz;
|
||||
struct mem_cgroup *memcg;
|
||||
|
||||
if (mem_cgroup_disabled())
|
||||
return true;
|
||||
|
||||
mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
|
||||
memcg = mz->memcg;
|
||||
|
||||
return !!(memcg->css.flags & CSS_ONLINE);
|
||||
}
|
||||
|
||||
#define mem_cgroup_from_counter(counter, member) \
|
||||
container_of(counter, struct mem_cgroup, member)
|
||||
|
||||
@ -1394,15 +1131,6 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
|
||||
return margin;
|
||||
}
|
||||
|
||||
int mem_cgroup_swappiness(struct mem_cgroup *memcg)
|
||||
{
|
||||
/* root ? */
|
||||
if (mem_cgroup_disabled() || !memcg->css.parent)
|
||||
return vm_swappiness;
|
||||
|
||||
return memcg->swappiness;
|
||||
}
|
||||
|
||||
/*
|
||||
* A routine for checking "mem" is under move_account() or not.
|
||||
*
|
||||
@ -1545,6 +1273,12 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
|
||||
static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
|
||||
int order)
|
||||
{
|
||||
struct oom_control oc = {
|
||||
.zonelist = NULL,
|
||||
.nodemask = NULL,
|
||||
.gfp_mask = gfp_mask,
|
||||
.order = order,
|
||||
};
|
||||
struct mem_cgroup *iter;
|
||||
unsigned long chosen_points = 0;
|
||||
unsigned long totalpages;
|
||||
@ -1563,7 +1297,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
|
||||
check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg);
|
||||
totalpages = mem_cgroup_get_limit(memcg) ? : 1;
|
||||
for_each_mem_cgroup_tree(iter, memcg) {
|
||||
struct css_task_iter it;
|
||||
@ -1571,8 +1305,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
|
||||
|
||||
css_task_iter_start(&iter->css, &it);
|
||||
while ((task = css_task_iter_next(&it))) {
|
||||
switch (oom_scan_process_thread(task, totalpages, NULL,
|
||||
false)) {
|
||||
switch (oom_scan_process_thread(&oc, task, totalpages)) {
|
||||
case OOM_SCAN_SELECT:
|
||||
if (chosen)
|
||||
put_task_struct(chosen);
|
||||
@ -1610,8 +1343,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
|
||||
|
||||
if (chosen) {
|
||||
points = chosen_points * 1000 / totalpages;
|
||||
oom_kill_process(chosen, gfp_mask, order, points, totalpages,
|
||||
memcg, NULL, "Memory cgroup out of memory");
|
||||
oom_kill_process(&oc, chosen, points, totalpages, memcg,
|
||||
"Memory cgroup out of memory");
|
||||
}
|
||||
unlock:
|
||||
mutex_unlock(&oom_lock);
|
||||
@ -2062,23 +1795,6 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
|
||||
}
|
||||
EXPORT_SYMBOL(mem_cgroup_end_page_stat);
|
||||
|
||||
/**
|
||||
* mem_cgroup_update_page_stat - update page state statistics
|
||||
* @memcg: memcg to account against
|
||||
* @idx: page state item to account
|
||||
* @val: number of pages (positive or negative)
|
||||
*
|
||||
* See mem_cgroup_begin_page_stat() for locking requirements.
|
||||
*/
|
||||
void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
|
||||
enum mem_cgroup_stat_index idx, int val)
|
||||
{
|
||||
VM_BUG_ON(!rcu_read_lock_held());
|
||||
|
||||
if (memcg)
|
||||
this_cpu_add(memcg->stat->count[idx], val);
|
||||
}
|
||||
|
||||
/*
|
||||
* size of first charge trial. "32" comes from vmscan.c's magic value.
|
||||
* TODO: maybe necessary to use big numbers in big irons.
|
||||
@ -2504,16 +2220,6 @@ void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
|
||||
css_put_many(&memcg->css, nr_pages);
|
||||
}
|
||||
|
||||
/*
|
||||
* helper for acessing a memcg's index. It will be used as an index in the
|
||||
* child cache array in kmem_cache, and also to derive its name. This function
|
||||
* will return -1 when this is not a kmem-limited memcg.
|
||||
*/
|
||||
int memcg_cache_id(struct mem_cgroup *memcg)
|
||||
{
|
||||
return memcg ? memcg->kmemcg_id : -1;
|
||||
}
|
||||
|
||||
static int memcg_alloc_cache_id(void)
|
||||
{
|
||||
int id, size;
|
||||
@ -5127,10 +4833,12 @@ static void mem_cgroup_clear_mc(void)
|
||||
static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
|
||||
struct cgroup_taskset *tset)
|
||||
{
|
||||
struct task_struct *p = cgroup_taskset_first(tset);
|
||||
int ret = 0;
|
||||
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
||||
struct mem_cgroup *from;
|
||||
struct task_struct *p;
|
||||
struct mm_struct *mm;
|
||||
unsigned long move_flags;
|
||||
int ret = 0;
|
||||
|
||||
/*
|
||||
* We are now commited to this value whatever it is. Changes in this
|
||||
@ -5138,36 +4846,37 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
|
||||
* So we need to save it, and keep it going.
|
||||
*/
|
||||
move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
|
||||
if (move_flags) {
|
||||
struct mm_struct *mm;
|
||||
struct mem_cgroup *from = mem_cgroup_from_task(p);
|
||||
if (!move_flags)
|
||||
return 0;
|
||||
|
||||
VM_BUG_ON(from == memcg);
|
||||
p = cgroup_taskset_first(tset);
|
||||
from = mem_cgroup_from_task(p);
|
||||
|
||||
mm = get_task_mm(p);
|
||||
if (!mm)
|
||||
return 0;
|
||||
/* We move charges only when we move a owner of the mm */
|
||||
if (mm->owner == p) {
|
||||
VM_BUG_ON(mc.from);
|
||||
VM_BUG_ON(mc.to);
|
||||
VM_BUG_ON(mc.precharge);
|
||||
VM_BUG_ON(mc.moved_charge);
|
||||
VM_BUG_ON(mc.moved_swap);
|
||||
VM_BUG_ON(from == memcg);
|
||||
|
||||
spin_lock(&mc.lock);
|
||||
mc.from = from;
|
||||
mc.to = memcg;
|
||||
mc.flags = move_flags;
|
||||
spin_unlock(&mc.lock);
|
||||
/* We set mc.moving_task later */
|
||||
mm = get_task_mm(p);
|
||||
if (!mm)
|
||||
return 0;
|
||||
/* We move charges only when we move a owner of the mm */
|
||||
if (mm->owner == p) {
|
||||
VM_BUG_ON(mc.from);
|
||||
VM_BUG_ON(mc.to);
|
||||
VM_BUG_ON(mc.precharge);
|
||||
VM_BUG_ON(mc.moved_charge);
|
||||
VM_BUG_ON(mc.moved_swap);
|
||||
|
||||
ret = mem_cgroup_precharge_mc(mm);
|
||||
if (ret)
|
||||
mem_cgroup_clear_mc();
|
||||
}
|
||||
mmput(mm);
|
||||
spin_lock(&mc.lock);
|
||||
mc.from = from;
|
||||
mc.to = memcg;
|
||||
mc.flags = move_flags;
|
||||
spin_unlock(&mc.lock);
|
||||
/* We set mc.moving_task later */
|
||||
|
||||
ret = mem_cgroup_precharge_mc(mm);
|
||||
if (ret)
|
||||
mem_cgroup_clear_mc();
|
||||
}
|
||||
mmput(mm);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -5520,19 +5229,6 @@ struct cgroup_subsys memory_cgrp_subsys = {
|
||||
.early_init = 0,
|
||||
};
|
||||
|
||||
/**
|
||||
* mem_cgroup_events - count memory events against a cgroup
|
||||
* @memcg: the memory cgroup
|
||||
* @idx: the event index
|
||||
* @nr: the number of events to account for
|
||||
*/
|
||||
void mem_cgroup_events(struct mem_cgroup *memcg,
|
||||
enum mem_cgroup_events_index idx,
|
||||
unsigned int nr)
|
||||
{
|
||||
this_cpu_add(memcg->stat->events[idx], nr);
|
||||
}
|
||||
|
||||
/**
|
||||
* mem_cgroup_low - check if memory consumption is below the normal range
|
||||
* @root: the highest ancestor to consider
|
||||
|
@ -146,7 +146,7 @@ static int hwpoison_filter_task(struct page *p)
|
||||
if (!mem)
|
||||
return -EINVAL;
|
||||
|
||||
css = mem_cgroup_css(mem);
|
||||
css = &mem->css;
|
||||
ino = cgroup_ino(css->cgroup);
|
||||
css_put(css);
|
||||
|
||||
@ -934,6 +934,27 @@ int get_hwpoison_page(struct page *page)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(get_hwpoison_page);
|
||||
|
||||
/**
|
||||
* put_hwpoison_page() - Put refcount for memory error handling:
|
||||
* @page: raw error page (hit by memory error)
|
||||
*/
|
||||
void put_hwpoison_page(struct page *page)
|
||||
{
|
||||
struct page *head = compound_head(page);
|
||||
|
||||
if (PageHuge(head)) {
|
||||
put_page(head);
|
||||
return;
|
||||
}
|
||||
|
||||
if (PageTransHuge(head))
|
||||
if (page != head)
|
||||
put_page(head);
|
||||
|
||||
put_page(page);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(put_hwpoison_page);
|
||||
|
||||
/*
|
||||
* Do all that is necessary to remove user space mappings. Unmap
|
||||
* the pages and send SIGBUS to the processes if the data was dirty.
|
||||
@ -1100,7 +1121,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
|
||||
nr_pages = 1 << compound_order(hpage);
|
||||
else /* normal page or thp */
|
||||
nr_pages = 1;
|
||||
atomic_long_add(nr_pages, &num_poisoned_pages);
|
||||
num_poisoned_pages_add(nr_pages);
|
||||
|
||||
/*
|
||||
* We need/can do nothing about count=0 pages.
|
||||
@ -1128,7 +1149,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
|
||||
if (PageHWPoison(hpage)) {
|
||||
if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
|
||||
|| (p != hpage && TestSetPageHWPoison(hpage))) {
|
||||
atomic_long_sub(nr_pages, &num_poisoned_pages);
|
||||
num_poisoned_pages_sub(nr_pages);
|
||||
unlock_page(hpage);
|
||||
return 0;
|
||||
}
|
||||
@ -1152,10 +1173,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
|
||||
else
|
||||
pr_err("MCE: %#lx: thp split failed\n", pfn);
|
||||
if (TestClearPageHWPoison(p))
|
||||
atomic_long_sub(nr_pages, &num_poisoned_pages);
|
||||
put_page(p);
|
||||
if (p != hpage)
|
||||
put_page(hpage);
|
||||
num_poisoned_pages_sub(nr_pages);
|
||||
put_hwpoison_page(p);
|
||||
return -EBUSY;
|
||||
}
|
||||
VM_BUG_ON_PAGE(!page_count(p), p);
|
||||
@ -1214,16 +1233,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
|
||||
*/
|
||||
if (!PageHWPoison(p)) {
|
||||
printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
|
||||
atomic_long_sub(nr_pages, &num_poisoned_pages);
|
||||
num_poisoned_pages_sub(nr_pages);
|
||||
unlock_page(hpage);
|
||||
put_page(hpage);
|
||||
put_hwpoison_page(hpage);
|
||||
return 0;
|
||||
}
|
||||
if (hwpoison_filter(p)) {
|
||||
if (TestClearPageHWPoison(p))
|
||||
atomic_long_sub(nr_pages, &num_poisoned_pages);
|
||||
num_poisoned_pages_sub(nr_pages);
|
||||
unlock_page(hpage);
|
||||
put_page(hpage);
|
||||
put_hwpoison_page(hpage);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1237,7 +1256,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
|
||||
if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
|
||||
action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED);
|
||||
unlock_page(hpage);
|
||||
put_page(hpage);
|
||||
put_hwpoison_page(hpage);
|
||||
return 0;
|
||||
}
|
||||
/*
|
||||
@ -1426,6 +1445,22 @@ int unpoison_memory(unsigned long pfn)
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (page_count(page) > 1) {
|
||||
pr_info("MCE: Someone grabs the hwpoison page %#lx\n", pfn);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (page_mapped(page)) {
|
||||
pr_info("MCE: Someone maps the hwpoison page %#lx\n", pfn);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (page_mapping(page)) {
|
||||
pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n",
|
||||
pfn);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* unpoison_memory() can encounter thp only when the thp is being
|
||||
* worked by memory_failure() and the page lock is not held yet.
|
||||
@ -1450,7 +1485,7 @@ int unpoison_memory(unsigned long pfn)
|
||||
return 0;
|
||||
}
|
||||
if (TestClearPageHWPoison(p))
|
||||
atomic_long_dec(&num_poisoned_pages);
|
||||
num_poisoned_pages_dec();
|
||||
pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
|
||||
return 0;
|
||||
}
|
||||
@ -1464,16 +1499,16 @@ int unpoison_memory(unsigned long pfn)
|
||||
*/
|
||||
if (TestClearPageHWPoison(page)) {
|
||||
pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
|
||||
atomic_long_sub(nr_pages, &num_poisoned_pages);
|
||||
num_poisoned_pages_sub(nr_pages);
|
||||
freeit = 1;
|
||||
if (PageHuge(page))
|
||||
clear_page_hwpoison_huge_page(page);
|
||||
}
|
||||
unlock_page(page);
|
||||
|
||||
put_page(page);
|
||||
put_hwpoison_page(page);
|
||||
if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
|
||||
put_page(page);
|
||||
put_hwpoison_page(page);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -1486,7 +1521,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x)
|
||||
return alloc_huge_page_node(page_hstate(compound_head(p)),
|
||||
nid);
|
||||
else
|
||||
return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
|
||||
return __alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1533,7 +1568,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
|
||||
/*
|
||||
* Try to free it.
|
||||
*/
|
||||
put_page(page);
|
||||
put_hwpoison_page(page);
|
||||
shake_page(page, 1);
|
||||
|
||||
/*
|
||||
@ -1542,7 +1577,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
|
||||
ret = __get_any_page(page, pfn, 0);
|
||||
if (!PageLRU(page)) {
|
||||
/* Drop page reference which is from __get_any_page() */
|
||||
put_page(page);
|
||||
put_hwpoison_page(page);
|
||||
pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
|
||||
pfn, page->flags);
|
||||
return -EIO;
|
||||
@ -1565,7 +1600,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
|
||||
lock_page(hpage);
|
||||
if (PageHWPoison(hpage)) {
|
||||
unlock_page(hpage);
|
||||
put_page(hpage);
|
||||
put_hwpoison_page(hpage);
|
||||
pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
|
||||
return -EBUSY;
|
||||
}
|
||||
@ -1576,7 +1611,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
|
||||
* get_any_page() and isolate_huge_page() takes a refcount each,
|
||||
* so need to drop one here.
|
||||
*/
|
||||
put_page(hpage);
|
||||
put_hwpoison_page(hpage);
|
||||
if (!ret) {
|
||||
pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
|
||||
return -EBUSY;
|
||||
@ -1600,11 +1635,10 @@ static int soft_offline_huge_page(struct page *page, int flags)
|
||||
if (PageHuge(page)) {
|
||||
set_page_hwpoison_huge_page(hpage);
|
||||
dequeue_hwpoisoned_huge_page(hpage);
|
||||
atomic_long_add(1 << compound_order(hpage),
|
||||
&num_poisoned_pages);
|
||||
num_poisoned_pages_add(1 << compound_order(hpage));
|
||||
} else {
|
||||
SetPageHWPoison(page);
|
||||
atomic_long_inc(&num_poisoned_pages);
|
||||
num_poisoned_pages_inc();
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
@ -1625,7 +1659,7 @@ static int __soft_offline_page(struct page *page, int flags)
|
||||
wait_on_page_writeback(page);
|
||||
if (PageHWPoison(page)) {
|
||||
unlock_page(page);
|
||||
put_page(page);
|
||||
put_hwpoison_page(page);
|
||||
pr_info("soft offline: %#lx page already poisoned\n", pfn);
|
||||
return -EBUSY;
|
||||
}
|
||||
@ -1640,10 +1674,10 @@ static int __soft_offline_page(struct page *page, int flags)
|
||||
* would need to fix isolation locking first.
|
||||
*/
|
||||
if (ret == 1) {
|
||||
put_page(page);
|
||||
put_hwpoison_page(page);
|
||||
pr_info("soft_offline: %#lx: invalidated\n", pfn);
|
||||
SetPageHWPoison(page);
|
||||
atomic_long_inc(&num_poisoned_pages);
|
||||
num_poisoned_pages_inc();
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1657,14 +1691,12 @@ static int __soft_offline_page(struct page *page, int flags)
|
||||
* Drop page reference which is came from get_any_page()
|
||||
* successful isolate_lru_page() already took another one.
|
||||
*/
|
||||
put_page(page);
|
||||
put_hwpoison_page(page);
|
||||
if (!ret) {
|
||||
LIST_HEAD(pagelist);
|
||||
inc_zone_page_state(page, NR_ISOLATED_ANON +
|
||||
page_is_file_cache(page));
|
||||
list_add(&page->lru, &pagelist);
|
||||
if (!TestSetPageHWPoison(page))
|
||||
atomic_long_inc(&num_poisoned_pages);
|
||||
ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
|
||||
MIGRATE_SYNC, MR_MEMORY_FAILURE);
|
||||
if (ret) {
|
||||
@ -1679,8 +1711,6 @@ static int __soft_offline_page(struct page *page, int flags)
|
||||
pfn, ret, page->flags);
|
||||
if (ret > 0)
|
||||
ret = -EIO;
|
||||
if (TestClearPageHWPoison(page))
|
||||
atomic_long_dec(&num_poisoned_pages);
|
||||
}
|
||||
} else {
|
||||
pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
|
||||
@ -1719,12 +1749,16 @@ int soft_offline_page(struct page *page, int flags)
|
||||
|
||||
if (PageHWPoison(page)) {
|
||||
pr_info("soft offline: %#lx page already poisoned\n", pfn);
|
||||
if (flags & MF_COUNT_INCREASED)
|
||||
put_hwpoison_page(page);
|
||||
return -EBUSY;
|
||||
}
|
||||
if (!PageHuge(page) && PageTransHuge(hpage)) {
|
||||
if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
|
||||
pr_info("soft offline: %#lx: failed to split THP\n",
|
||||
pfn);
|
||||
if (flags & MF_COUNT_INCREASED)
|
||||
put_hwpoison_page(page);
|
||||
return -EBUSY;
|
||||
}
|
||||
}
|
||||
@ -1742,11 +1776,10 @@ int soft_offline_page(struct page *page, int flags)
|
||||
if (PageHuge(page)) {
|
||||
set_page_hwpoison_huge_page(hpage);
|
||||
if (!dequeue_hwpoisoned_huge_page(hpage))
|
||||
atomic_long_add(1 << compound_order(hpage),
|
||||
&num_poisoned_pages);
|
||||
num_poisoned_pages_add(1 << compound_order(hpage));
|
||||
} else {
|
||||
if (!TestSetPageHWPoison(page))
|
||||
atomic_long_inc(&num_poisoned_pages);
|
||||
num_poisoned_pages_inc();
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
|
48
mm/memory.c
48
mm/memory.c
@ -2426,8 +2426,6 @@ void unmap_mapping_range(struct address_space *mapping,
|
||||
if (details.last_index < details.first_index)
|
||||
details.last_index = ULONG_MAX;
|
||||
|
||||
|
||||
/* DAX uses i_mmap_lock to serialise file truncate vs page fault */
|
||||
i_mmap_lock_write(mapping);
|
||||
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
|
||||
unmap_mapping_range_tree(&mapping->i_mmap, &details);
|
||||
@ -3015,9 +3013,9 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
} else {
|
||||
/*
|
||||
* The fault handler has no page to lock, so it holds
|
||||
* i_mmap_lock for read to protect against truncate.
|
||||
* i_mmap_lock for write to protect against truncate.
|
||||
*/
|
||||
i_mmap_unlock_read(vma->vm_file->f_mapping);
|
||||
i_mmap_unlock_write(vma->vm_file->f_mapping);
|
||||
}
|
||||
goto uncharge_out;
|
||||
}
|
||||
@ -3031,9 +3029,9 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
} else {
|
||||
/*
|
||||
* The fault handler has no page to lock, so it holds
|
||||
* i_mmap_lock for read to protect against truncate.
|
||||
* i_mmap_lock for write to protect against truncate.
|
||||
*/
|
||||
i_mmap_unlock_read(vma->vm_file->f_mapping);
|
||||
i_mmap_unlock_write(vma->vm_file->f_mapping);
|
||||
}
|
||||
return ret;
|
||||
uncharge_out:
|
||||
@ -3232,6 +3230,27 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
unsigned long address, pmd_t *pmd, unsigned int flags)
|
||||
{
|
||||
if (!vma->vm_ops)
|
||||
return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags);
|
||||
if (vma->vm_ops->pmd_fault)
|
||||
return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
|
||||
return VM_FAULT_FALLBACK;
|
||||
}
|
||||
|
||||
static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
unsigned long address, pmd_t *pmd, pmd_t orig_pmd,
|
||||
unsigned int flags)
|
||||
{
|
||||
if (!vma->vm_ops)
|
||||
return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd);
|
||||
if (vma->vm_ops->pmd_fault)
|
||||
return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
|
||||
return VM_FAULT_FALLBACK;
|
||||
}
|
||||
|
||||
/*
|
||||
* These routines also need to handle stuff like marking pages dirty
|
||||
* and/or accessed for architectures that don't do it in hardware (most
|
||||
@ -3267,12 +3286,12 @@ static int handle_pte_fault(struct mm_struct *mm,
|
||||
barrier();
|
||||
if (!pte_present(entry)) {
|
||||
if (pte_none(entry)) {
|
||||
if (vma->vm_ops)
|
||||
if (vma_is_anonymous(vma))
|
||||
return do_anonymous_page(mm, vma, address,
|
||||
pte, pmd, flags);
|
||||
else
|
||||
return do_fault(mm, vma, address, pte, pmd,
|
||||
flags, entry);
|
||||
|
||||
return do_anonymous_page(mm, vma, address, pte, pmd,
|
||||
flags);
|
||||
}
|
||||
return do_swap_page(mm, vma, address,
|
||||
pte, pmd, flags, entry);
|
||||
@ -3334,10 +3353,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
if (!pmd)
|
||||
return VM_FAULT_OOM;
|
||||
if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
|
||||
int ret = VM_FAULT_FALLBACK;
|
||||
if (!vma->vm_ops)
|
||||
ret = do_huge_pmd_anonymous_page(mm, vma, address,
|
||||
pmd, flags);
|
||||
int ret = create_huge_pmd(mm, vma, address, pmd, flags);
|
||||
if (!(ret & VM_FAULT_FALLBACK))
|
||||
return ret;
|
||||
} else {
|
||||
@ -3361,8 +3377,8 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
orig_pmd, pmd);
|
||||
|
||||
if (dirty && !pmd_write(orig_pmd)) {
|
||||
ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
|
||||
orig_pmd);
|
||||
ret = wp_huge_pmd(mm, vma, address, pmd,
|
||||
orig_pmd, flags);
|
||||
if (!(ret & VM_FAULT_FALLBACK))
|
||||
return ret;
|
||||
} else {
|
||||
|
@ -608,9 +608,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
|
||||
|
||||
qp->prev = vma;
|
||||
|
||||
if (vma->vm_flags & VM_PFNMAP)
|
||||
return 1;
|
||||
|
||||
if (flags & MPOL_MF_LAZY) {
|
||||
/* Similar to task_numa_work, skip inaccessible VMAs */
|
||||
if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
|
||||
@ -945,7 +942,7 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x
|
||||
return alloc_huge_page_node(page_hstate(compound_head(page)),
|
||||
node);
|
||||
else
|
||||
return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE |
|
||||
return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
|
||||
__GFP_THISNODE, 0);
|
||||
}
|
||||
|
||||
@ -2001,7 +1998,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
|
||||
nmask = policy_nodemask(gfp, pol);
|
||||
if (!nmask || node_isset(hpage_node, *nmask)) {
|
||||
mpol_cond_put(pol);
|
||||
page = alloc_pages_exact_node(hpage_node,
|
||||
page = __alloc_pages_node(hpage_node,
|
||||
gfp | __GFP_THISNODE, order);
|
||||
goto out;
|
||||
}
|
||||
|
@ -150,6 +150,9 @@ static void *remove_element(mempool_t *pool)
|
||||
*/
|
||||
void mempool_destroy(mempool_t *pool)
|
||||
{
|
||||
if (unlikely(!pool))
|
||||
return;
|
||||
|
||||
while (pool->curr_nr) {
|
||||
void *element = remove_element(pool);
|
||||
pool->free(element, pool->pool_data);
|
||||
|
27
mm/memtest.c
27
mm/memtest.c
@ -1,11 +1,6 @@
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/smp.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/pfn.h>
|
||||
#include <linux/memblock.h>
|
||||
|
||||
static u64 patterns[] __initdata = {
|
||||
@ -31,10 +26,8 @@ static u64 patterns[] __initdata = {
|
||||
|
||||
static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad)
|
||||
{
|
||||
printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n",
|
||||
(unsigned long long) pattern,
|
||||
(unsigned long long) start_bad,
|
||||
(unsigned long long) end_bad);
|
||||
pr_info(" %016llx bad mem addr %pa - %pa reserved\n",
|
||||
cpu_to_be64(pattern), &start_bad, &end_bad);
|
||||
memblock_reserve(start_bad, end_bad - start_bad);
|
||||
}
|
||||
|
||||
@ -79,26 +72,26 @@ static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end)
|
||||
this_start = clamp(this_start, start, end);
|
||||
this_end = clamp(this_end, start, end);
|
||||
if (this_start < this_end) {
|
||||
printk(KERN_INFO " %010llx - %010llx pattern %016llx\n",
|
||||
(unsigned long long)this_start,
|
||||
(unsigned long long)this_end,
|
||||
(unsigned long long)cpu_to_be64(pattern));
|
||||
pr_info(" %pa - %pa pattern %016llx\n",
|
||||
&this_start, &this_end, cpu_to_be64(pattern));
|
||||
memtest(pattern, this_start, this_end - this_start);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* default is disabled */
|
||||
static int memtest_pattern __initdata;
|
||||
static unsigned int memtest_pattern __initdata;
|
||||
|
||||
static int __init parse_memtest(char *arg)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
if (arg)
|
||||
memtest_pattern = simple_strtoul(arg, NULL, 0);
|
||||
ret = kstrtouint(arg, 0, &memtest_pattern);
|
||||
else
|
||||
memtest_pattern = ARRAY_SIZE(patterns);
|
||||
|
||||
return 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
early_param("memtest", parse_memtest);
|
||||
@ -111,7 +104,7 @@ void __init early_memtest(phys_addr_t start, phys_addr_t end)
|
||||
if (!memtest_pattern)
|
||||
return;
|
||||
|
||||
printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern);
|
||||
pr_info("early_memtest: # of tests: %u\n", memtest_pattern);
|
||||
for (i = memtest_pattern-1; i < UINT_MAX; --i) {
|
||||
idx = i % ARRAY_SIZE(patterns);
|
||||
do_one_pass(patterns[idx], start, end);
|
||||
|
13
mm/migrate.c
13
mm/migrate.c
@ -880,8 +880,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
|
||||
/* Establish migration ptes or remove ptes */
|
||||
if (page_mapped(page)) {
|
||||
try_to_unmap(page,
|
||||
TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
|
||||
TTU_IGNORE_HWPOISON);
|
||||
TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
|
||||
page_was_mapped = 1;
|
||||
}
|
||||
|
||||
@ -952,9 +951,11 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
|
||||
dec_zone_page_state(page, NR_ISOLATED_ANON +
|
||||
page_is_file_cache(page));
|
||||
/* Soft-offlined page shouldn't go through lru cache list */
|
||||
if (reason == MR_MEMORY_FAILURE)
|
||||
if (reason == MR_MEMORY_FAILURE) {
|
||||
put_page(page);
|
||||
else
|
||||
if (!test_set_page_hwpoison(page))
|
||||
num_poisoned_pages_inc();
|
||||
} else
|
||||
putback_lru_page(page);
|
||||
}
|
||||
|
||||
@ -1194,7 +1195,7 @@ static struct page *new_page_node(struct page *p, unsigned long private,
|
||||
return alloc_huge_page_node(page_hstate(compound_head(p)),
|
||||
pm->node);
|
||||
else
|
||||
return alloc_pages_exact_node(pm->node,
|
||||
return __alloc_pages_node(pm->node,
|
||||
GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
|
||||
}
|
||||
|
||||
@ -1554,7 +1555,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
|
||||
int nid = (int) data;
|
||||
struct page *newpage;
|
||||
|
||||
newpage = alloc_pages_exact_node(nid,
|
||||
newpage = __alloc_pages_node(nid,
|
||||
(GFP_HIGHUSER_MOVABLE |
|
||||
__GFP_THISNODE | __GFP_NOMEMALLOC |
|
||||
__GFP_NORETRY | __GFP_NOWARN) &
|
||||
|
71
mm/mmap.c
71
mm/mmap.c
@ -2455,7 +2455,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
unsigned long addr, int new_below)
|
||||
{
|
||||
struct vm_area_struct *new;
|
||||
int err = -ENOMEM;
|
||||
int err;
|
||||
|
||||
if (is_vm_hugetlb_page(vma) && (addr &
|
||||
~(huge_page_mask(hstate_vma(vma)))))
|
||||
@ -2463,7 +2463,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
|
||||
new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
|
||||
if (!new)
|
||||
goto out_err;
|
||||
return -ENOMEM;
|
||||
|
||||
/* most fields are the same, copy all, and then fixup */
|
||||
*new = *vma;
|
||||
@ -2511,7 +2511,6 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
mpol_put(vma_policy(new));
|
||||
out_free_vma:
|
||||
kmem_cache_free(vm_area_cachep, new);
|
||||
out_err:
|
||||
return err;
|
||||
}
|
||||
|
||||
@ -2872,6 +2871,13 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
|
||||
struct vm_area_struct *prev;
|
||||
struct rb_node **rb_link, *rb_parent;
|
||||
|
||||
if (find_vma_links(mm, vma->vm_start, vma->vm_end,
|
||||
&prev, &rb_link, &rb_parent))
|
||||
return -ENOMEM;
|
||||
if ((vma->vm_flags & VM_ACCOUNT) &&
|
||||
security_vm_enough_memory_mm(mm, vma_pages(vma)))
|
||||
return -ENOMEM;
|
||||
|
||||
/*
|
||||
* The vm_pgoff of a purely anonymous vma should be irrelevant
|
||||
* until its first write fault, when page's anon_vma and index
|
||||
@ -2884,16 +2890,10 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
|
||||
* using the existing file pgoff checks and manipulations.
|
||||
* Similarly in do_mmap_pgoff and in do_brk.
|
||||
*/
|
||||
if (!vma->vm_file) {
|
||||
if (vma_is_anonymous(vma)) {
|
||||
BUG_ON(vma->anon_vma);
|
||||
vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
|
||||
}
|
||||
if (find_vma_links(mm, vma->vm_start, vma->vm_end,
|
||||
&prev, &rb_link, &rb_parent))
|
||||
return -ENOMEM;
|
||||
if ((vma->vm_flags & VM_ACCOUNT) &&
|
||||
security_vm_enough_memory_mm(mm, vma_pages(vma)))
|
||||
return -ENOMEM;
|
||||
|
||||
vma_link(mm, vma, prev, rb_link, rb_parent);
|
||||
return 0;
|
||||
@ -2918,7 +2918,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
|
||||
* If anonymous vma has not yet been faulted, update new pgoff
|
||||
* to match new location, to increase its chance of merging.
|
||||
*/
|
||||
if (unlikely(!vma->vm_file && !vma->anon_vma)) {
|
||||
if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
|
||||
pgoff = addr >> PAGE_SHIFT;
|
||||
faulted_in_anon_vma = false;
|
||||
}
|
||||
@ -2952,30 +2952,31 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
|
||||
*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
|
||||
} else {
|
||||
new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
|
||||
if (new_vma) {
|
||||
*new_vma = *vma;
|
||||
new_vma->vm_start = addr;
|
||||
new_vma->vm_end = addr + len;
|
||||
new_vma->vm_pgoff = pgoff;
|
||||
if (vma_dup_policy(vma, new_vma))
|
||||
goto out_free_vma;
|
||||
INIT_LIST_HEAD(&new_vma->anon_vma_chain);
|
||||
if (anon_vma_clone(new_vma, vma))
|
||||
goto out_free_mempol;
|
||||
if (new_vma->vm_file)
|
||||
get_file(new_vma->vm_file);
|
||||
if (new_vma->vm_ops && new_vma->vm_ops->open)
|
||||
new_vma->vm_ops->open(new_vma);
|
||||
vma_link(mm, new_vma, prev, rb_link, rb_parent);
|
||||
*need_rmap_locks = false;
|
||||
}
|
||||
if (!new_vma)
|
||||
goto out;
|
||||
*new_vma = *vma;
|
||||
new_vma->vm_start = addr;
|
||||
new_vma->vm_end = addr + len;
|
||||
new_vma->vm_pgoff = pgoff;
|
||||
if (vma_dup_policy(vma, new_vma))
|
||||
goto out_free_vma;
|
||||
INIT_LIST_HEAD(&new_vma->anon_vma_chain);
|
||||
if (anon_vma_clone(new_vma, vma))
|
||||
goto out_free_mempol;
|
||||
if (new_vma->vm_file)
|
||||
get_file(new_vma->vm_file);
|
||||
if (new_vma->vm_ops && new_vma->vm_ops->open)
|
||||
new_vma->vm_ops->open(new_vma);
|
||||
vma_link(mm, new_vma, prev, rb_link, rb_parent);
|
||||
*need_rmap_locks = false;
|
||||
}
|
||||
return new_vma;
|
||||
|
||||
out_free_mempol:
|
||||
out_free_mempol:
|
||||
mpol_put(vma_policy(new_vma));
|
||||
out_free_vma:
|
||||
out_free_vma:
|
||||
kmem_cache_free(vm_area_cachep, new_vma);
|
||||
out:
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -3027,21 +3028,13 @@ static int special_mapping_fault(struct vm_area_struct *vma,
|
||||
pgoff_t pgoff;
|
||||
struct page **pages;
|
||||
|
||||
/*
|
||||
* special mappings have no vm_file, and in that case, the mm
|
||||
* uses vm_pgoff internally. So we have to subtract it from here.
|
||||
* We are allowed to do this because we are the mm; do not copy
|
||||
* this code into drivers!
|
||||
*/
|
||||
pgoff = vmf->pgoff - vma->vm_pgoff;
|
||||
|
||||
if (vma->vm_ops == &legacy_special_mapping_vmops)
|
||||
pages = vma->vm_private_data;
|
||||
else
|
||||
pages = ((struct vm_special_mapping *)vma->vm_private_data)->
|
||||
pages;
|
||||
|
||||
for (; pgoff && *pages; ++pages)
|
||||
for (pgoff = vmf->pgoff; pgoff && *pages; ++pages)
|
||||
pgoff--;
|
||||
|
||||
if (*pages) {
|
||||
|
142
mm/oom_kill.c
142
mm/oom_kill.c
@ -196,27 +196,26 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
|
||||
* Determine the type of allocation constraint.
|
||||
*/
|
||||
#ifdef CONFIG_NUMA
|
||||
static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
|
||||
gfp_t gfp_mask, nodemask_t *nodemask,
|
||||
unsigned long *totalpages)
|
||||
static enum oom_constraint constrained_alloc(struct oom_control *oc,
|
||||
unsigned long *totalpages)
|
||||
{
|
||||
struct zone *zone;
|
||||
struct zoneref *z;
|
||||
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
|
||||
enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask);
|
||||
bool cpuset_limited = false;
|
||||
int nid;
|
||||
|
||||
/* Default to all available memory */
|
||||
*totalpages = totalram_pages + total_swap_pages;
|
||||
|
||||
if (!zonelist)
|
||||
if (!oc->zonelist)
|
||||
return CONSTRAINT_NONE;
|
||||
/*
|
||||
* Reach here only when __GFP_NOFAIL is used. So, we should avoid
|
||||
* killing current. We have to kill a random task in this case.
|
||||
* Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
|
||||
*/
|
||||
if (gfp_mask & __GFP_THISNODE)
|
||||
if (oc->gfp_mask & __GFP_THISNODE)
|
||||
return CONSTRAINT_NONE;
|
||||
|
||||
/*
|
||||
@ -224,17 +223,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
|
||||
* the page allocator means a mempolicy is in effect. Cpuset policy
|
||||
* is enforced in get_page_from_freelist().
|
||||
*/
|
||||
if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) {
|
||||
if (oc->nodemask &&
|
||||
!nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
|
||||
*totalpages = total_swap_pages;
|
||||
for_each_node_mask(nid, *nodemask)
|
||||
for_each_node_mask(nid, *oc->nodemask)
|
||||
*totalpages += node_spanned_pages(nid);
|
||||
return CONSTRAINT_MEMORY_POLICY;
|
||||
}
|
||||
|
||||
/* Check this allocation failure is caused by cpuset's wall function */
|
||||
for_each_zone_zonelist_nodemask(zone, z, zonelist,
|
||||
high_zoneidx, nodemask)
|
||||
if (!cpuset_zone_allowed(zone, gfp_mask))
|
||||
for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
|
||||
high_zoneidx, oc->nodemask)
|
||||
if (!cpuset_zone_allowed(zone, oc->gfp_mask))
|
||||
cpuset_limited = true;
|
||||
|
||||
if (cpuset_limited) {
|
||||
@ -246,20 +246,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
|
||||
return CONSTRAINT_NONE;
|
||||
}
|
||||
#else
|
||||
static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
|
||||
gfp_t gfp_mask, nodemask_t *nodemask,
|
||||
unsigned long *totalpages)
|
||||
static enum oom_constraint constrained_alloc(struct oom_control *oc,
|
||||
unsigned long *totalpages)
|
||||
{
|
||||
*totalpages = totalram_pages + total_swap_pages;
|
||||
return CONSTRAINT_NONE;
|
||||
}
|
||||
#endif
|
||||
|
||||
enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
|
||||
unsigned long totalpages, const nodemask_t *nodemask,
|
||||
bool force_kill)
|
||||
enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
|
||||
struct task_struct *task, unsigned long totalpages)
|
||||
{
|
||||
if (oom_unkillable_task(task, NULL, nodemask))
|
||||
if (oom_unkillable_task(task, NULL, oc->nodemask))
|
||||
return OOM_SCAN_CONTINUE;
|
||||
|
||||
/*
|
||||
@ -267,7 +265,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
|
||||
* Don't allow any other task to have access to the reserves.
|
||||
*/
|
||||
if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
|
||||
if (!force_kill)
|
||||
if (oc->order != -1)
|
||||
return OOM_SCAN_ABORT;
|
||||
}
|
||||
if (!task->mm)
|
||||
@ -280,7 +278,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
|
||||
if (oom_task_origin(task))
|
||||
return OOM_SCAN_SELECT;
|
||||
|
||||
if (task_will_free_mem(task) && !force_kill)
|
||||
if (task_will_free_mem(task) && oc->order != -1)
|
||||
return OOM_SCAN_ABORT;
|
||||
|
||||
return OOM_SCAN_OK;
|
||||
@ -289,12 +287,9 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
|
||||
/*
|
||||
* Simple selection loop. We chose the process with the highest
|
||||
* number of 'points'. Returns -1 on scan abort.
|
||||
*
|
||||
* (not docbooked, we don't want this one cluttering up the manual)
|
||||
*/
|
||||
static struct task_struct *select_bad_process(unsigned int *ppoints,
|
||||
unsigned long totalpages, const nodemask_t *nodemask,
|
||||
bool force_kill)
|
||||
static struct task_struct *select_bad_process(struct oom_control *oc,
|
||||
unsigned int *ppoints, unsigned long totalpages)
|
||||
{
|
||||
struct task_struct *g, *p;
|
||||
struct task_struct *chosen = NULL;
|
||||
@ -304,8 +299,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
|
||||
for_each_process_thread(g, p) {
|
||||
unsigned int points;
|
||||
|
||||
switch (oom_scan_process_thread(p, totalpages, nodemask,
|
||||
force_kill)) {
|
||||
switch (oom_scan_process_thread(oc, p, totalpages)) {
|
||||
case OOM_SCAN_SELECT:
|
||||
chosen = p;
|
||||
chosen_points = ULONG_MAX;
|
||||
@ -318,7 +312,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
|
||||
case OOM_SCAN_OK:
|
||||
break;
|
||||
};
|
||||
points = oom_badness(p, NULL, nodemask, totalpages);
|
||||
points = oom_badness(p, NULL, oc->nodemask, totalpages);
|
||||
if (!points || points < chosen_points)
|
||||
continue;
|
||||
/* Prefer thread group leaders for display purposes */
|
||||
@ -380,13 +374,13 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
|
||||
struct mem_cgroup *memcg, const nodemask_t *nodemask)
|
||||
static void dump_header(struct oom_control *oc, struct task_struct *p,
|
||||
struct mem_cgroup *memcg)
|
||||
{
|
||||
task_lock(current);
|
||||
pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
|
||||
"oom_score_adj=%hd\n",
|
||||
current->comm, gfp_mask, order,
|
||||
current->comm, oc->gfp_mask, oc->order,
|
||||
current->signal->oom_score_adj);
|
||||
cpuset_print_task_mems_allowed(current);
|
||||
task_unlock(current);
|
||||
@ -396,7 +390,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
|
||||
else
|
||||
show_mem(SHOW_MEM_FILTER_NODES);
|
||||
if (sysctl_oom_dump_tasks)
|
||||
dump_tasks(memcg, nodemask);
|
||||
dump_tasks(memcg, oc->nodemask);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -487,10 +481,9 @@ void oom_killer_enable(void)
|
||||
* Must be called while holding a reference to p, which will be released upon
|
||||
* returning.
|
||||
*/
|
||||
void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
|
||||
void oom_kill_process(struct oom_control *oc, struct task_struct *p,
|
||||
unsigned int points, unsigned long totalpages,
|
||||
struct mem_cgroup *memcg, nodemask_t *nodemask,
|
||||
const char *message)
|
||||
struct mem_cgroup *memcg, const char *message)
|
||||
{
|
||||
struct task_struct *victim = p;
|
||||
struct task_struct *child;
|
||||
@ -514,7 +507,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
|
||||
task_unlock(p);
|
||||
|
||||
if (__ratelimit(&oom_rs))
|
||||
dump_header(p, gfp_mask, order, memcg, nodemask);
|
||||
dump_header(oc, p, memcg);
|
||||
|
||||
task_lock(p);
|
||||
pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
|
||||
@ -537,7 +530,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
|
||||
/*
|
||||
* oom_badness() returns 0 if the thread is unkillable
|
||||
*/
|
||||
child_points = oom_badness(child, memcg, nodemask,
|
||||
child_points = oom_badness(child, memcg, oc->nodemask,
|
||||
totalpages);
|
||||
if (child_points > victim_points) {
|
||||
put_task_struct(victim);
|
||||
@ -600,8 +593,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
|
||||
/*
|
||||
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
|
||||
*/
|
||||
void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
|
||||
int order, const nodemask_t *nodemask,
|
||||
void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
|
||||
struct mem_cgroup *memcg)
|
||||
{
|
||||
if (likely(!sysctl_panic_on_oom))
|
||||
@ -615,7 +607,10 @@ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
|
||||
if (constraint != CONSTRAINT_NONE)
|
||||
return;
|
||||
}
|
||||
dump_header(NULL, gfp_mask, order, memcg, nodemask);
|
||||
/* Do not panic for oom kills triggered by sysrq */
|
||||
if (oc->order == -1)
|
||||
return;
|
||||
dump_header(oc, NULL, memcg);
|
||||
panic("Out of memory: %s panic_on_oom is enabled\n",
|
||||
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
|
||||
}
|
||||
@ -635,28 +630,21 @@ int unregister_oom_notifier(struct notifier_block *nb)
|
||||
EXPORT_SYMBOL_GPL(unregister_oom_notifier);
|
||||
|
||||
/**
|
||||
* __out_of_memory - kill the "best" process when we run out of memory
|
||||
* @zonelist: zonelist pointer
|
||||
* @gfp_mask: memory allocation flags
|
||||
* @order: amount of memory being requested as a power of 2
|
||||
* @nodemask: nodemask passed to page allocator
|
||||
* @force_kill: true if a task must be killed, even if others are exiting
|
||||
* out_of_memory - kill the "best" process when we run out of memory
|
||||
* @oc: pointer to struct oom_control
|
||||
*
|
||||
* If we run out of memory, we have the choice between either
|
||||
* killing a random task (bad), letting the system crash (worse)
|
||||
* OR try to be smart about which process to kill. Note that we
|
||||
* don't have to be perfect here, we just have to be good.
|
||||
*/
|
||||
bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
|
||||
int order, nodemask_t *nodemask, bool force_kill)
|
||||
bool out_of_memory(struct oom_control *oc)
|
||||
{
|
||||
const nodemask_t *mpol_mask;
|
||||
struct task_struct *p;
|
||||
unsigned long totalpages;
|
||||
unsigned long freed = 0;
|
||||
unsigned int uninitialized_var(points);
|
||||
enum oom_constraint constraint = CONSTRAINT_NONE;
|
||||
int killed = 0;
|
||||
|
||||
if (oom_killer_disabled)
|
||||
return false;
|
||||
@ -664,7 +652,7 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
|
||||
blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
|
||||
if (freed > 0)
|
||||
/* Got some memory back in the last second. */
|
||||
goto out;
|
||||
return true;
|
||||
|
||||
/*
|
||||
* If current has a pending SIGKILL or is exiting, then automatically
|
||||
@ -677,47 +665,42 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
|
||||
if (current->mm &&
|
||||
(fatal_signal_pending(current) || task_will_free_mem(current))) {
|
||||
mark_oom_victim(current);
|
||||
goto out;
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if there were limitations on the allocation (only relevant for
|
||||
* NUMA) that may require different handling.
|
||||
*/
|
||||
constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
|
||||
&totalpages);
|
||||
mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
|
||||
check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL);
|
||||
constraint = constrained_alloc(oc, &totalpages);
|
||||
if (constraint != CONSTRAINT_MEMORY_POLICY)
|
||||
oc->nodemask = NULL;
|
||||
check_panic_on_oom(oc, constraint, NULL);
|
||||
|
||||
if (sysctl_oom_kill_allocating_task && current->mm &&
|
||||
!oom_unkillable_task(current, NULL, nodemask) &&
|
||||
!oom_unkillable_task(current, NULL, oc->nodemask) &&
|
||||
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
|
||||
get_task_struct(current);
|
||||
oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
|
||||
nodemask,
|
||||
oom_kill_process(oc, current, 0, totalpages, NULL,
|
||||
"Out of memory (oom_kill_allocating_task)");
|
||||
goto out;
|
||||
return true;
|
||||
}
|
||||
|
||||
p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
|
||||
p = select_bad_process(oc, &points, totalpages);
|
||||
/* Found nothing?!?! Either we hang forever, or we panic. */
|
||||
if (!p) {
|
||||
dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
|
||||
if (!p && oc->order != -1) {
|
||||
dump_header(oc, NULL, NULL);
|
||||
panic("Out of memory and no killable processes...\n");
|
||||
}
|
||||
if (p != (void *)-1UL) {
|
||||
oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
|
||||
nodemask, "Out of memory");
|
||||
killed = 1;
|
||||
}
|
||||
out:
|
||||
/*
|
||||
* Give the killed threads a good chance of exiting before trying to
|
||||
* allocate memory again.
|
||||
*/
|
||||
if (killed)
|
||||
if (p && p != (void *)-1UL) {
|
||||
oom_kill_process(oc, p, points, totalpages, NULL,
|
||||
"Out of memory");
|
||||
/*
|
||||
* Give the killed process a good chance to exit before trying
|
||||
* to allocate memory again.
|
||||
*/
|
||||
schedule_timeout_killable(1);
|
||||
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -728,13 +711,20 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
|
||||
*/
|
||||
void pagefault_out_of_memory(void)
|
||||
{
|
||||
struct oom_control oc = {
|
||||
.zonelist = NULL,
|
||||
.nodemask = NULL,
|
||||
.gfp_mask = 0,
|
||||
.order = 0,
|
||||
};
|
||||
|
||||
if (mem_cgroup_oom_synchronize(true))
|
||||
return;
|
||||
|
||||
if (!mutex_trylock(&oom_lock))
|
||||
return;
|
||||
|
||||
if (!out_of_memory(NULL, 0, 0, NULL, false)) {
|
||||
if (!out_of_memory(&oc)) {
|
||||
/*
|
||||
* There shouldn't be any user tasks runnable while the
|
||||
* OOM killer is disabled, so the current task has to
|
||||
|
@ -125,6 +125,24 @@ unsigned long dirty_balance_reserve __read_mostly;
|
||||
int percpu_pagelist_fraction;
|
||||
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
|
||||
|
||||
/*
|
||||
* A cached value of the page's pageblock's migratetype, used when the page is
|
||||
* put on a pcplist. Used to avoid the pageblock migratetype lookup when
|
||||
* freeing from pcplists in most cases, at the cost of possibly becoming stale.
|
||||
* Also the migratetype set in the page does not necessarily match the pcplist
|
||||
* index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
|
||||
* other index - this ensures that it will be put on the correct CMA freelist.
|
||||
*/
|
||||
static inline int get_pcppage_migratetype(struct page *page)
|
||||
{
|
||||
return page->index;
|
||||
}
|
||||
|
||||
static inline void set_pcppage_migratetype(struct page *page, int migratetype)
|
||||
{
|
||||
page->index = migratetype;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PM_SLEEP
|
||||
/*
|
||||
* The following functions are used by the suspend/hibernate code to temporarily
|
||||
@ -791,7 +809,11 @@ static void free_pcppages_bulk(struct zone *zone, int count,
|
||||
page = list_entry(list->prev, struct page, lru);
|
||||
/* must delete as __free_one_page list manipulates */
|
||||
list_del(&page->lru);
|
||||
mt = get_freepage_migratetype(page);
|
||||
|
||||
mt = get_pcppage_migratetype(page);
|
||||
/* MIGRATE_ISOLATE page should not go to pcplists */
|
||||
VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
|
||||
/* Pageblock could have been isolated meanwhile */
|
||||
if (unlikely(has_isolate_pageblock(zone)))
|
||||
mt = get_pageblock_migratetype(page);
|
||||
|
||||
@ -955,7 +977,6 @@ static void __free_pages_ok(struct page *page, unsigned int order)
|
||||
migratetype = get_pfnblock_migratetype(page, pfn);
|
||||
local_irq_save(flags);
|
||||
__count_vm_events(PGFREE, 1 << order);
|
||||
set_freepage_migratetype(page, migratetype);
|
||||
free_one_page(page_zone(page), page, pfn, order, migratetype);
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
@ -1383,7 +1404,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
|
||||
rmv_page_order(page);
|
||||
area->nr_free--;
|
||||
expand(zone, page, order, current_order, area, migratetype);
|
||||
set_freepage_migratetype(page, migratetype);
|
||||
set_pcppage_migratetype(page, migratetype);
|
||||
return page;
|
||||
}
|
||||
|
||||
@ -1460,7 +1481,6 @@ int move_freepages(struct zone *zone,
|
||||
order = page_order(page);
|
||||
list_move(&page->lru,
|
||||
&zone->free_area[order].free_list[migratetype]);
|
||||
set_freepage_migratetype(page, migratetype);
|
||||
page += 1 << order;
|
||||
pages_moved += 1 << order;
|
||||
}
|
||||
@ -1630,14 +1650,13 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
|
||||
expand(zone, page, order, current_order, area,
|
||||
start_migratetype);
|
||||
/*
|
||||
* The freepage_migratetype may differ from pageblock's
|
||||
* The pcppage_migratetype may differ from pageblock's
|
||||
* migratetype depending on the decisions in
|
||||
* try_to_steal_freepages(). This is OK as long as it
|
||||
* does not differ for MIGRATE_CMA pageblocks. For CMA
|
||||
* we need to make sure unallocated pages flushed from
|
||||
* pcp lists are returned to the correct freelist.
|
||||
* find_suitable_fallback(). This is OK as long as it does not
|
||||
* differ for MIGRATE_CMA pageblocks. Those can be used as
|
||||
* fallback only via special __rmqueue_cma_fallback() function
|
||||
*/
|
||||
set_freepage_migratetype(page, start_migratetype);
|
||||
set_pcppage_migratetype(page, start_migratetype);
|
||||
|
||||
trace_mm_page_alloc_extfrag(page, order, current_order,
|
||||
start_migratetype, fallback_mt);
|
||||
@ -1713,7 +1732,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
|
||||
else
|
||||
list_add_tail(&page->lru, list);
|
||||
list = &page->lru;
|
||||
if (is_migrate_cma(get_freepage_migratetype(page)))
|
||||
if (is_migrate_cma(get_pcppage_migratetype(page)))
|
||||
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
|
||||
-(1 << order));
|
||||
}
|
||||
@ -1910,7 +1929,7 @@ void free_hot_cold_page(struct page *page, bool cold)
|
||||
return;
|
||||
|
||||
migratetype = get_pfnblock_migratetype(page, pfn);
|
||||
set_freepage_migratetype(page, migratetype);
|
||||
set_pcppage_migratetype(page, migratetype);
|
||||
local_irq_save(flags);
|
||||
__count_vm_event(PGFREE);
|
||||
|
||||
@ -2115,7 +2134,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
|
||||
if (!page)
|
||||
goto failed;
|
||||
__mod_zone_freepage_state(zone, -(1 << order),
|
||||
get_freepage_migratetype(page));
|
||||
get_pcppage_migratetype(page));
|
||||
}
|
||||
|
||||
__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
|
||||
@ -2696,6 +2715,12 @@ static inline struct page *
|
||||
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
|
||||
const struct alloc_context *ac, unsigned long *did_some_progress)
|
||||
{
|
||||
struct oom_control oc = {
|
||||
.zonelist = ac->zonelist,
|
||||
.nodemask = ac->nodemask,
|
||||
.gfp_mask = gfp_mask,
|
||||
.order = order,
|
||||
};
|
||||
struct page *page;
|
||||
|
||||
*did_some_progress = 0;
|
||||
@ -2747,8 +2772,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
|
||||
goto out;
|
||||
}
|
||||
/* Exhausted what can be done so it's blamo time */
|
||||
if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)
|
||||
|| WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
|
||||
if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
|
||||
*did_some_progress = 1;
|
||||
out:
|
||||
mutex_unlock(&oom_lock);
|
||||
@ -3490,8 +3514,6 @@ EXPORT_SYMBOL(alloc_pages_exact);
|
||||
*
|
||||
* Like alloc_pages_exact(), but try to allocate on node nid first before falling
|
||||
* back.
|
||||
* Note this is not alloc_pages_exact_node() which allocates on a specific node,
|
||||
* but is not exact.
|
||||
*/
|
||||
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
|
||||
{
|
||||
@ -5066,7 +5088,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
|
||||
{
|
||||
unsigned long zone_start_pfn, zone_end_pfn;
|
||||
|
||||
/* When hotadd a new node, the node should be empty */
|
||||
/* When hot-adding a new node from cpu_up(), the node should be empty */
|
||||
if (!node_start_pfn && !node_end_pfn)
|
||||
return 0;
|
||||
|
||||
@ -5133,7 +5155,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
|
||||
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
|
||||
unsigned long zone_start_pfn, zone_end_pfn;
|
||||
|
||||
/* When hotadd a new node, the node should be empty */
|
||||
/* When hot-adding a new node from cpu_up(), the node should be empty */
|
||||
if (!node_start_pfn && !node_end_pfn)
|
||||
return 0;
|
||||
|
||||
@ -5306,8 +5328,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
|
||||
*
|
||||
* NOTE: pgdat should get zeroed by caller.
|
||||
*/
|
||||
static void __paginginit free_area_init_core(struct pglist_data *pgdat,
|
||||
unsigned long node_start_pfn, unsigned long node_end_pfn)
|
||||
static void __paginginit free_area_init_core(struct pglist_data *pgdat)
|
||||
{
|
||||
enum zone_type j;
|
||||
int nid = pgdat->node_id;
|
||||
@ -5458,7 +5479,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
|
||||
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
|
||||
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
|
||||
pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
|
||||
(u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1);
|
||||
(u64)start_pfn << PAGE_SHIFT,
|
||||
end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
|
||||
#endif
|
||||
calculate_node_totalpages(pgdat, start_pfn, end_pfn,
|
||||
zones_size, zholes_size);
|
||||
@ -5470,7 +5492,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
|
||||
(unsigned long)pgdat->node_mem_map);
|
||||
#endif
|
||||
|
||||
free_area_init_core(pgdat, start_pfn, end_pfn);
|
||||
free_area_init_core(pgdat);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
|
||||
@ -5481,11 +5503,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
|
||||
*/
|
||||
void __init setup_nr_node_ids(void)
|
||||
{
|
||||
unsigned int node;
|
||||
unsigned int highest = 0;
|
||||
unsigned int highest;
|
||||
|
||||
for_each_node_mask(node, node_possible_map)
|
||||
highest = node;
|
||||
highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
|
||||
nr_node_ids = highest + 1;
|
||||
}
|
||||
#endif
|
||||
@ -6006,7 +6026,7 @@ void __init mem_init_print_info(const char *str)
|
||||
* set_dma_reserve - set the specified number of pages reserved in the first zone
|
||||
* @new_dma_reserve: The number of pages to mark reserved
|
||||
*
|
||||
* The per-cpu batchsize and zone watermarks are determined by present_pages.
|
||||
* The per-cpu batchsize and zone watermarks are determined by managed_pages.
|
||||
* In the DMA zone, a significant percentage may be consumed by kernel image
|
||||
* and other unfreeable allocations which can skew the watermarks badly. This
|
||||
* function may optionally be used to account for unfreeable pages in the
|
||||
@ -6059,7 +6079,7 @@ void __init page_alloc_init(void)
|
||||
}
|
||||
|
||||
/*
|
||||
* calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
|
||||
* calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
|
||||
* or min_free_kbytes changes.
|
||||
*/
|
||||
static void calculate_totalreserve_pages(void)
|
||||
@ -6103,7 +6123,7 @@ static void calculate_totalreserve_pages(void)
|
||||
|
||||
/*
|
||||
* setup_per_zone_lowmem_reserve - called whenever
|
||||
* sysctl_lower_zone_reserve_ratio changes. Ensures that each zone
|
||||
* sysctl_lowmem_reserve_ratio changes. Ensures that each zone
|
||||
* has a correct pages reserved value, so an adequate number of
|
||||
* pages are left in the zone after a successful __alloc_pages().
|
||||
*/
|
||||
|
@ -9,7 +9,8 @@
|
||||
#include <linux/hugetlb.h>
|
||||
#include "internal.h"
|
||||
|
||||
int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages)
|
||||
static int set_migratetype_isolate(struct page *page,
|
||||
bool skip_hwpoisoned_pages)
|
||||
{
|
||||
struct zone *zone;
|
||||
unsigned long flags, pfn;
|
||||
@ -72,7 +73,7 @@ int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages)
|
||||
return ret;
|
||||
}
|
||||
|
||||
void unset_migratetype_isolate(struct page *page, unsigned migratetype)
|
||||
static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
|
||||
{
|
||||
struct zone *zone;
|
||||
unsigned long flags, nr_pages;
|
||||
@ -223,34 +224,16 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
|
||||
continue;
|
||||
}
|
||||
page = pfn_to_page(pfn);
|
||||
if (PageBuddy(page)) {
|
||||
if (PageBuddy(page))
|
||||
/*
|
||||
* If race between isolatation and allocation happens,
|
||||
* some free pages could be in MIGRATE_MOVABLE list
|
||||
* although pageblock's migratation type of the page
|
||||
* is MIGRATE_ISOLATE. Catch it and move the page into
|
||||
* MIGRATE_ISOLATE list.
|
||||
* If the page is on a free list, it has to be on
|
||||
* the correct MIGRATE_ISOLATE freelist. There is no
|
||||
* simple way to verify that as VM_BUG_ON(), though.
|
||||
*/
|
||||
if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) {
|
||||
struct page *end_page;
|
||||
|
||||
end_page = page + (1 << page_order(page)) - 1;
|
||||
move_freepages(page_zone(page), page, end_page,
|
||||
MIGRATE_ISOLATE);
|
||||
}
|
||||
pfn += 1 << page_order(page);
|
||||
}
|
||||
else if (page_count(page) == 0 &&
|
||||
get_freepage_migratetype(page) == MIGRATE_ISOLATE)
|
||||
pfn += 1;
|
||||
else if (skip_hwpoisoned_pages && PageHWPoison(page)) {
|
||||
/*
|
||||
* The HWPoisoned page may be not in buddy
|
||||
* system, and page_count() is not 0.
|
||||
*/
|
||||
else if (skip_hwpoisoned_pages && PageHWPoison(page))
|
||||
/* A HWPoisoned page cannot be also PageBuddy */
|
||||
pfn++;
|
||||
continue;
|
||||
}
|
||||
else
|
||||
break;
|
||||
}
|
||||
|
16
mm/shmem.c
16
mm/shmem.c
@ -542,6 +542,21 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(shmem_truncate_range);
|
||||
|
||||
static int shmem_getattr(struct vfsmount *mnt, struct dentry *dentry,
|
||||
struct kstat *stat)
|
||||
{
|
||||
struct inode *inode = dentry->d_inode;
|
||||
struct shmem_inode_info *info = SHMEM_I(inode);
|
||||
|
||||
spin_lock(&info->lock);
|
||||
shmem_recalc_inode(inode);
|
||||
spin_unlock(&info->lock);
|
||||
|
||||
generic_fillattr(inode, stat);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
|
||||
{
|
||||
struct inode *inode = d_inode(dentry);
|
||||
@ -3122,6 +3137,7 @@ static const struct file_operations shmem_file_operations = {
|
||||
};
|
||||
|
||||
static const struct inode_operations shmem_inode_operations = {
|
||||
.getattr = shmem_getattr,
|
||||
.setattr = shmem_setattr,
|
||||
#ifdef CONFIG_TMPFS_XATTR
|
||||
.setxattr = shmem_setxattr,
|
||||
|
@ -1595,7 +1595,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
|
||||
if (memcg_charge_slab(cachep, flags, cachep->gfporder))
|
||||
return NULL;
|
||||
|
||||
page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
|
||||
page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
|
||||
if (!page) {
|
||||
memcg_uncharge_slab(cachep, cachep->gfporder);
|
||||
slab_out_of_memory(cachep, flags, nodeid);
|
||||
|
@ -500,7 +500,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
|
||||
struct kmem_cache *root_cache)
|
||||
{
|
||||
static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
|
||||
struct cgroup_subsys_state *css = mem_cgroup_css(memcg);
|
||||
struct cgroup_subsys_state *css = &memcg->css;
|
||||
struct memcg_cache_array *arr;
|
||||
struct kmem_cache *s = NULL;
|
||||
char *cache_name;
|
||||
@ -640,6 +640,9 @@ void kmem_cache_destroy(struct kmem_cache *s)
|
||||
bool need_rcu_barrier = false;
|
||||
bool busy = false;
|
||||
|
||||
if (unlikely(!s))
|
||||
return;
|
||||
|
||||
BUG_ON(!is_root_cache(s));
|
||||
|
||||
get_online_cpus();
|
||||
|
@ -45,7 +45,7 @@
|
||||
* NUMA support in SLOB is fairly simplistic, pushing most of the real
|
||||
* logic down to the page allocator, and simply doing the node accounting
|
||||
* on the upper levels. In the event that a node id is explicitly
|
||||
* provided, alloc_pages_exact_node() with the specified node id is used
|
||||
* provided, __alloc_pages_node() with the specified node id is used
|
||||
* instead. The common case (or when the node id isn't explicitly provided)
|
||||
* will default to the current node, as per numa_node_id().
|
||||
*
|
||||
@ -193,7 +193,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
|
||||
|
||||
#ifdef CONFIG_NUMA
|
||||
if (node != NUMA_NO_NODE)
|
||||
page = alloc_pages_exact_node(node, gfp, order);
|
||||
page = __alloc_pages_node(node, gfp, order);
|
||||
else
|
||||
#endif
|
||||
page = alloc_pages(gfp, order);
|
||||
|
@ -1334,7 +1334,7 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
|
||||
if (node == NUMA_NO_NODE)
|
||||
page = alloc_pages(flags, order);
|
||||
else
|
||||
page = alloc_pages_exact_node(node, flags, order);
|
||||
page = __alloc_pages_node(node, flags, order);
|
||||
|
||||
if (!page)
|
||||
memcg_uncharge_slab(s, order);
|
||||
|
@ -288,17 +288,14 @@ struct page * lookup_swap_cache(swp_entry_t entry)
|
||||
return page;
|
||||
}
|
||||
|
||||
/*
|
||||
* Locate a page of swap in physical memory, reserving swap cache space
|
||||
* and reading the disk if it is not already cached.
|
||||
* A failure return means that either the page allocation failed or that
|
||||
* the swap entry is no longer in use.
|
||||
*/
|
||||
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
|
||||
struct vm_area_struct *vma, unsigned long addr)
|
||||
struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
|
||||
struct vm_area_struct *vma, unsigned long addr,
|
||||
bool *new_page_allocated)
|
||||
{
|
||||
struct page *found_page, *new_page = NULL;
|
||||
struct address_space *swapper_space = swap_address_space(entry);
|
||||
int err;
|
||||
*new_page_allocated = false;
|
||||
|
||||
do {
|
||||
/*
|
||||
@ -306,8 +303,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
|
||||
* called after lookup_swap_cache() failed, re-calling
|
||||
* that would confuse statistics.
|
||||
*/
|
||||
found_page = find_get_page(swap_address_space(entry),
|
||||
entry.val);
|
||||
found_page = find_get_page(swapper_space, entry.val);
|
||||
if (found_page)
|
||||
break;
|
||||
|
||||
@ -366,7 +362,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
|
||||
* Initiate read into locked page and return.
|
||||
*/
|
||||
lru_cache_add_anon(new_page);
|
||||
swap_readpage(new_page);
|
||||
*new_page_allocated = true;
|
||||
return new_page;
|
||||
}
|
||||
radix_tree_preload_end();
|
||||
@ -384,6 +380,25 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
|
||||
return found_page;
|
||||
}
|
||||
|
||||
/*
|
||||
* Locate a page of swap in physical memory, reserving swap cache space
|
||||
* and reading the disk if it is not already cached.
|
||||
* A failure return means that either the page allocation failed or that
|
||||
* the swap entry is no longer in use.
|
||||
*/
|
||||
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
|
||||
struct vm_area_struct *vma, unsigned long addr)
|
||||
{
|
||||
bool page_was_allocated;
|
||||
struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
|
||||
vma, addr, &page_was_allocated);
|
||||
|
||||
if (page_was_allocated)
|
||||
swap_readpage(retpage);
|
||||
|
||||
return retpage;
|
||||
}
|
||||
|
||||
static unsigned long swapin_nr_pages(unsigned long offset)
|
||||
{
|
||||
static unsigned long prev_offset;
|
||||
|
@ -874,6 +874,48 @@ int page_swapcount(struct page *page)
|
||||
return count;
|
||||
}
|
||||
|
||||
/*
|
||||
* How many references to @entry are currently swapped out?
|
||||
* This considers COUNT_CONTINUED so it returns exact answer.
|
||||
*/
|
||||
int swp_swapcount(swp_entry_t entry)
|
||||
{
|
||||
int count, tmp_count, n;
|
||||
struct swap_info_struct *p;
|
||||
struct page *page;
|
||||
pgoff_t offset;
|
||||
unsigned char *map;
|
||||
|
||||
p = swap_info_get(entry);
|
||||
if (!p)
|
||||
return 0;
|
||||
|
||||
count = swap_count(p->swap_map[swp_offset(entry)]);
|
||||
if (!(count & COUNT_CONTINUED))
|
||||
goto out;
|
||||
|
||||
count &= ~COUNT_CONTINUED;
|
||||
n = SWAP_MAP_MAX + 1;
|
||||
|
||||
offset = swp_offset(entry);
|
||||
page = vmalloc_to_page(p->swap_map + offset);
|
||||
offset &= ~PAGE_MASK;
|
||||
VM_BUG_ON(page_private(page) != SWP_CONTINUED);
|
||||
|
||||
do {
|
||||
page = list_entry(page->lru.next, struct page, lru);
|
||||
map = kmap_atomic(page);
|
||||
tmp_count = map[offset];
|
||||
kunmap_atomic(map);
|
||||
|
||||
count += (tmp_count & ~COUNT_CONTINUED) * n;
|
||||
n *= (SWAP_CONT_MAX + 1);
|
||||
} while (tmp_count & COUNT_CONTINUED);
|
||||
out:
|
||||
spin_unlock(&p->lock);
|
||||
return count;
|
||||
}
|
||||
|
||||
/*
|
||||
* We can write to an anon page without COW if there are no other references
|
||||
* to it. And as a side-effect, free up its swap: because the old content
|
||||
|
14
mm/vmscan.c
14
mm/vmscan.c
@ -175,7 +175,7 @@ static bool sane_reclaim(struct scan_control *sc)
|
||||
if (!memcg)
|
||||
return true;
|
||||
#ifdef CONFIG_CGROUP_WRITEBACK
|
||||
if (cgroup_on_dfl(mem_cgroup_css(memcg)->cgroup))
|
||||
if (memcg->css.cgroup)
|
||||
return true;
|
||||
#endif
|
||||
return false;
|
||||
@ -985,7 +985,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
|
||||
* __GFP_IO|__GFP_FS for this reason); but more thought
|
||||
* would probably show more reasons.
|
||||
*
|
||||
* 3) Legacy memcg encounters a page that is not already marked
|
||||
* 3) Legacy memcg encounters a page that is already marked
|
||||
* PageReclaim. memcg does not have any dirty pages
|
||||
* throttling so we could easily OOM just because too many
|
||||
* pages are in writeback and there is nothing else to
|
||||
@ -1015,12 +1015,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
|
||||
*/
|
||||
SetPageReclaim(page);
|
||||
nr_writeback++;
|
||||
|
||||
goto keep_locked;
|
||||
|
||||
/* Case 3 above */
|
||||
} else {
|
||||
unlock_page(page);
|
||||
wait_on_page_writeback(page);
|
||||
/* then go back and try same page again */
|
||||
list_add_tail(&page->lru, page_list);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1196,7 +1199,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
|
||||
if (PageSwapCache(page))
|
||||
try_to_free_swap(page);
|
||||
unlock_page(page);
|
||||
putback_lru_page(page);
|
||||
list_add(&page->lru, &ret_pages);
|
||||
continue;
|
||||
|
||||
activate_locked:
|
||||
@ -1359,7 +1362,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
|
||||
unsigned long nr_taken = 0;
|
||||
unsigned long scan;
|
||||
|
||||
for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
|
||||
for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
|
||||
!list_empty(src); scan++) {
|
||||
struct page *page;
|
||||
int nr_pages;
|
||||
|
||||
|
10
mm/zbud.c
10
mm/zbud.c
@ -96,10 +96,10 @@ struct zbud_pool {
|
||||
struct list_head buddied;
|
||||
struct list_head lru;
|
||||
u64 pages_nr;
|
||||
struct zbud_ops *ops;
|
||||
const struct zbud_ops *ops;
|
||||
#ifdef CONFIG_ZPOOL
|
||||
struct zpool *zpool;
|
||||
struct zpool_ops *zpool_ops;
|
||||
const struct zpool_ops *zpool_ops;
|
||||
#endif
|
||||
};
|
||||
|
||||
@ -133,12 +133,12 @@ static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle)
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
static struct zbud_ops zbud_zpool_ops = {
|
||||
static const struct zbud_ops zbud_zpool_ops = {
|
||||
.evict = zbud_zpool_evict
|
||||
};
|
||||
|
||||
static void *zbud_zpool_create(char *name, gfp_t gfp,
|
||||
struct zpool_ops *zpool_ops,
|
||||
const struct zpool_ops *zpool_ops,
|
||||
struct zpool *zpool)
|
||||
{
|
||||
struct zbud_pool *pool;
|
||||
@ -302,7 +302,7 @@ static int num_free_chunks(struct zbud_header *zhdr)
|
||||
* Return: pointer to the new zbud pool or NULL if the metadata allocation
|
||||
* failed.
|
||||
*/
|
||||
struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops)
|
||||
struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops)
|
||||
{
|
||||
struct zbud_pool *pool;
|
||||
int i;
|
||||
|
18
mm/zpool.c
18
mm/zpool.c
@ -22,7 +22,7 @@ struct zpool {
|
||||
|
||||
struct zpool_driver *driver;
|
||||
void *pool;
|
||||
struct zpool_ops *ops;
|
||||
const struct zpool_ops *ops;
|
||||
|
||||
struct list_head list;
|
||||
};
|
||||
@ -115,7 +115,7 @@ static void zpool_put_driver(struct zpool_driver *driver)
|
||||
* Returns: New zpool on success, NULL on failure.
|
||||
*/
|
||||
struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
|
||||
struct zpool_ops *ops)
|
||||
const struct zpool_ops *ops)
|
||||
{
|
||||
struct zpool_driver *driver;
|
||||
struct zpool *zpool;
|
||||
@ -320,20 +320,6 @@ u64 zpool_get_total_size(struct zpool *zpool)
|
||||
return zpool->driver->total_size(zpool->pool);
|
||||
}
|
||||
|
||||
static int __init init_zpool(void)
|
||||
{
|
||||
pr_info("loaded\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void __exit exit_zpool(void)
|
||||
{
|
||||
pr_info("unloaded\n");
|
||||
}
|
||||
|
||||
module_init(init_zpool);
|
||||
module_exit(exit_zpool);
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
|
||||
MODULE_DESCRIPTION("Common API for compressed memory storage");
|
||||
|
235
mm/zsmalloc.c
235
mm/zsmalloc.c
@ -169,14 +169,12 @@ enum zs_stat_type {
|
||||
NR_ZS_STAT_TYPE,
|
||||
};
|
||||
|
||||
#ifdef CONFIG_ZSMALLOC_STAT
|
||||
|
||||
static struct dentry *zs_stat_root;
|
||||
|
||||
struct zs_size_stat {
|
||||
unsigned long objs[NR_ZS_STAT_TYPE];
|
||||
};
|
||||
|
||||
#ifdef CONFIG_ZSMALLOC_STAT
|
||||
static struct dentry *zs_stat_root;
|
||||
#endif
|
||||
|
||||
/*
|
||||
@ -201,6 +199,8 @@ static int zs_size_classes;
|
||||
static const int fullness_threshold_frac = 4;
|
||||
|
||||
struct size_class {
|
||||
spinlock_t lock;
|
||||
struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
|
||||
/*
|
||||
* Size of objects stored in this class. Must be multiple
|
||||
* of ZS_ALIGN.
|
||||
@ -210,16 +210,10 @@ struct size_class {
|
||||
|
||||
/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
|
||||
int pages_per_zspage;
|
||||
struct zs_size_stat stats;
|
||||
|
||||
/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
|
||||
bool huge;
|
||||
|
||||
#ifdef CONFIG_ZSMALLOC_STAT
|
||||
struct zs_size_stat stats;
|
||||
#endif
|
||||
|
||||
spinlock_t lock;
|
||||
|
||||
struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
|
||||
};
|
||||
|
||||
/*
|
||||
@ -251,6 +245,15 @@ struct zs_pool {
|
||||
gfp_t flags; /* allocation flags used when growing pool */
|
||||
atomic_long_t pages_allocated;
|
||||
|
||||
struct zs_pool_stats stats;
|
||||
|
||||
/* Compact classes */
|
||||
struct shrinker shrinker;
|
||||
/*
|
||||
* To signify that register_shrinker() was successful
|
||||
* and unregister_shrinker() will not Oops.
|
||||
*/
|
||||
bool shrinker_enabled;
|
||||
#ifdef CONFIG_ZSMALLOC_STAT
|
||||
struct dentry *stat_dentry;
|
||||
#endif
|
||||
@ -285,8 +288,7 @@ static int create_handle_cache(struct zs_pool *pool)
|
||||
|
||||
static void destroy_handle_cache(struct zs_pool *pool)
|
||||
{
|
||||
if (pool->handle_cachep)
|
||||
kmem_cache_destroy(pool->handle_cachep);
|
||||
kmem_cache_destroy(pool->handle_cachep);
|
||||
}
|
||||
|
||||
static unsigned long alloc_handle(struct zs_pool *pool)
|
||||
@ -309,7 +311,8 @@ static void record_obj(unsigned long handle, unsigned long obj)
|
||||
|
||||
#ifdef CONFIG_ZPOOL
|
||||
|
||||
static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops,
|
||||
static void *zs_zpool_create(char *name, gfp_t gfp,
|
||||
const struct zpool_ops *zpool_ops,
|
||||
struct zpool *zpool)
|
||||
{
|
||||
return zs_create_pool(name, gfp);
|
||||
@ -441,8 +444,6 @@ static int get_size_class_index(int size)
|
||||
return min(zs_size_classes - 1, idx);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_ZSMALLOC_STAT
|
||||
|
||||
static inline void zs_stat_inc(struct size_class *class,
|
||||
enum zs_stat_type type, unsigned long cnt)
|
||||
{
|
||||
@ -461,6 +462,8 @@ static inline unsigned long zs_stat_get(struct size_class *class,
|
||||
return class->stats.objs[type];
|
||||
}
|
||||
|
||||
#ifdef CONFIG_ZSMALLOC_STAT
|
||||
|
||||
static int __init zs_stat_init(void)
|
||||
{
|
||||
if (!debugfs_initialized())
|
||||
@ -576,23 +579,6 @@ static void zs_pool_stat_destroy(struct zs_pool *pool)
|
||||
}
|
||||
|
||||
#else /* CONFIG_ZSMALLOC_STAT */
|
||||
|
||||
static inline void zs_stat_inc(struct size_class *class,
|
||||
enum zs_stat_type type, unsigned long cnt)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void zs_stat_dec(struct size_class *class,
|
||||
enum zs_stat_type type, unsigned long cnt)
|
||||
{
|
||||
}
|
||||
|
||||
static inline unsigned long zs_stat_get(struct size_class *class,
|
||||
enum zs_stat_type type)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __init zs_stat_init(void)
|
||||
{
|
||||
return 0;
|
||||
@ -610,7 +596,6 @@ static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
|
||||
static inline void zs_pool_stat_destroy(struct zs_pool *pool)
|
||||
{
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@ -658,13 +643,22 @@ static void insert_zspage(struct page *page, struct size_class *class,
|
||||
if (fullness >= _ZS_NR_FULLNESS_GROUPS)
|
||||
return;
|
||||
|
||||
head = &class->fullness_list[fullness];
|
||||
if (*head)
|
||||
list_add_tail(&page->lru, &(*head)->lru);
|
||||
|
||||
*head = page;
|
||||
zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
|
||||
CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
|
||||
|
||||
head = &class->fullness_list[fullness];
|
||||
if (!*head) {
|
||||
*head = page;
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* We want to see more ZS_FULL pages and fewer almost
|
||||
* empty/full. Put pages with higher ->inuse first.
|
||||
*/
|
||||
list_add_tail(&page->lru, &(*head)->lru);
|
||||
if (page->inuse >= (*head)->inuse)
|
||||
*head = page;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1495,7 +1489,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(zs_free);
|
||||
|
||||
static void zs_object_copy(unsigned long src, unsigned long dst,
|
||||
static void zs_object_copy(unsigned long dst, unsigned long src,
|
||||
struct size_class *class)
|
||||
{
|
||||
struct page *s_page, *d_page;
|
||||
@ -1602,8 +1596,6 @@ struct zs_compact_control {
|
||||
/* Starting object index within @s_page which is used for live object
|
||||
* in the subpage. */
|
||||
int index;
|
||||
/* how many of objects are migrated */
|
||||
int nr_migrated;
|
||||
};
|
||||
|
||||
static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
|
||||
@ -1614,7 +1606,6 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
|
||||
struct page *s_page = cc->s_page;
|
||||
struct page *d_page = cc->d_page;
|
||||
unsigned long index = cc->index;
|
||||
int nr_migrated = 0;
|
||||
int ret = 0;
|
||||
|
||||
while (1) {
|
||||
@ -1636,23 +1627,21 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
|
||||
|
||||
used_obj = handle_to_obj(handle);
|
||||
free_obj = obj_malloc(d_page, class, handle);
|
||||
zs_object_copy(used_obj, free_obj, class);
|
||||
zs_object_copy(free_obj, used_obj, class);
|
||||
index++;
|
||||
record_obj(handle, free_obj);
|
||||
unpin_tag(handle);
|
||||
obj_free(pool, class, used_obj);
|
||||
nr_migrated++;
|
||||
}
|
||||
|
||||
/* Remember last position in this iteration */
|
||||
cc->s_page = s_page;
|
||||
cc->index = index;
|
||||
cc->nr_migrated = nr_migrated;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct page *alloc_target_page(struct size_class *class)
|
||||
static struct page *isolate_target_page(struct size_class *class)
|
||||
{
|
||||
int i;
|
||||
struct page *page;
|
||||
@ -1668,8 +1657,17 @@ static struct page *alloc_target_page(struct size_class *class)
|
||||
return page;
|
||||
}
|
||||
|
||||
static void putback_zspage(struct zs_pool *pool, struct size_class *class,
|
||||
struct page *first_page)
|
||||
/*
|
||||
* putback_zspage - add @first_page into right class's fullness list
|
||||
* @pool: target pool
|
||||
* @class: destination class
|
||||
* @first_page: target page
|
||||
*
|
||||
* Return @first_page's fullness_group
|
||||
*/
|
||||
static enum fullness_group putback_zspage(struct zs_pool *pool,
|
||||
struct size_class *class,
|
||||
struct page *first_page)
|
||||
{
|
||||
enum fullness_group fullness;
|
||||
|
||||
@ -1687,50 +1685,72 @@ static void putback_zspage(struct zs_pool *pool, struct size_class *class,
|
||||
|
||||
free_zspage(first_page);
|
||||
}
|
||||
|
||||
return fullness;
|
||||
}
|
||||
|
||||
static struct page *isolate_source_page(struct size_class *class)
|
||||
{
|
||||
struct page *page;
|
||||
int i;
|
||||
struct page *page = NULL;
|
||||
|
||||
page = class->fullness_list[ZS_ALMOST_EMPTY];
|
||||
if (page)
|
||||
remove_zspage(page, class, ZS_ALMOST_EMPTY);
|
||||
for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) {
|
||||
page = class->fullness_list[i];
|
||||
if (!page)
|
||||
continue;
|
||||
|
||||
remove_zspage(page, class, i);
|
||||
break;
|
||||
}
|
||||
|
||||
return page;
|
||||
}
|
||||
|
||||
static unsigned long __zs_compact(struct zs_pool *pool,
|
||||
struct size_class *class)
|
||||
/*
|
||||
*
|
||||
* Based on the number of unused allocated objects calculate
|
||||
* and return the number of pages that we can free.
|
||||
*/
|
||||
static unsigned long zs_can_compact(struct size_class *class)
|
||||
{
|
||||
unsigned long obj_wasted;
|
||||
|
||||
obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) -
|
||||
zs_stat_get(class, OBJ_USED);
|
||||
|
||||
obj_wasted /= get_maxobj_per_zspage(class->size,
|
||||
class->pages_per_zspage);
|
||||
|
||||
return obj_wasted * class->pages_per_zspage;
|
||||
}
|
||||
|
||||
static void __zs_compact(struct zs_pool *pool, struct size_class *class)
|
||||
{
|
||||
int nr_to_migrate;
|
||||
struct zs_compact_control cc;
|
||||
struct page *src_page;
|
||||
struct page *dst_page = NULL;
|
||||
unsigned long nr_total_migrated = 0;
|
||||
|
||||
spin_lock(&class->lock);
|
||||
while ((src_page = isolate_source_page(class))) {
|
||||
|
||||
BUG_ON(!is_first_page(src_page));
|
||||
|
||||
/* The goal is to migrate all live objects in source page */
|
||||
nr_to_migrate = src_page->inuse;
|
||||
if (!zs_can_compact(class))
|
||||
break;
|
||||
|
||||
cc.index = 0;
|
||||
cc.s_page = src_page;
|
||||
|
||||
while ((dst_page = alloc_target_page(class))) {
|
||||
while ((dst_page = isolate_target_page(class))) {
|
||||
cc.d_page = dst_page;
|
||||
/*
|
||||
* If there is no more space in dst_page, try to
|
||||
* allocate another zspage.
|
||||
* If there is no more space in dst_page, resched
|
||||
* and see if anyone had allocated another zspage.
|
||||
*/
|
||||
if (!migrate_zspage(pool, class, &cc))
|
||||
break;
|
||||
|
||||
putback_zspage(pool, class, dst_page);
|
||||
nr_total_migrated += cc.nr_migrated;
|
||||
nr_to_migrate -= cc.nr_migrated;
|
||||
}
|
||||
|
||||
/* Stop if we couldn't find slot */
|
||||
@ -1738,9 +1758,9 @@ static unsigned long __zs_compact(struct zs_pool *pool,
|
||||
break;
|
||||
|
||||
putback_zspage(pool, class, dst_page);
|
||||
putback_zspage(pool, class, src_page);
|
||||
if (putback_zspage(pool, class, src_page) == ZS_EMPTY)
|
||||
pool->stats.pages_compacted += class->pages_per_zspage;
|
||||
spin_unlock(&class->lock);
|
||||
nr_total_migrated += cc.nr_migrated;
|
||||
cond_resched();
|
||||
spin_lock(&class->lock);
|
||||
}
|
||||
@ -1749,14 +1769,11 @@ static unsigned long __zs_compact(struct zs_pool *pool,
|
||||
putback_zspage(pool, class, src_page);
|
||||
|
||||
spin_unlock(&class->lock);
|
||||
|
||||
return nr_total_migrated;
|
||||
}
|
||||
|
||||
unsigned long zs_compact(struct zs_pool *pool)
|
||||
{
|
||||
int i;
|
||||
unsigned long nr_migrated = 0;
|
||||
struct size_class *class;
|
||||
|
||||
for (i = zs_size_classes - 1; i >= 0; i--) {
|
||||
@ -1765,13 +1782,80 @@ unsigned long zs_compact(struct zs_pool *pool)
|
||||
continue;
|
||||
if (class->index != i)
|
||||
continue;
|
||||
nr_migrated += __zs_compact(pool, class);
|
||||
__zs_compact(pool, class);
|
||||
}
|
||||
|
||||
return nr_migrated;
|
||||
return pool->stats.pages_compacted;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(zs_compact);
|
||||
|
||||
void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats)
|
||||
{
|
||||
memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats));
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(zs_pool_stats);
|
||||
|
||||
static unsigned long zs_shrinker_scan(struct shrinker *shrinker,
|
||||
struct shrink_control *sc)
|
||||
{
|
||||
unsigned long pages_freed;
|
||||
struct zs_pool *pool = container_of(shrinker, struct zs_pool,
|
||||
shrinker);
|
||||
|
||||
pages_freed = pool->stats.pages_compacted;
|
||||
/*
|
||||
* Compact classes and calculate compaction delta.
|
||||
* Can run concurrently with a manually triggered
|
||||
* (by user) compaction.
|
||||
*/
|
||||
pages_freed = zs_compact(pool) - pages_freed;
|
||||
|
||||
return pages_freed ? pages_freed : SHRINK_STOP;
|
||||
}
|
||||
|
||||
static unsigned long zs_shrinker_count(struct shrinker *shrinker,
|
||||
struct shrink_control *sc)
|
||||
{
|
||||
int i;
|
||||
struct size_class *class;
|
||||
unsigned long pages_to_free = 0;
|
||||
struct zs_pool *pool = container_of(shrinker, struct zs_pool,
|
||||
shrinker);
|
||||
|
||||
if (!pool->shrinker_enabled)
|
||||
return 0;
|
||||
|
||||
for (i = zs_size_classes - 1; i >= 0; i--) {
|
||||
class = pool->size_class[i];
|
||||
if (!class)
|
||||
continue;
|
||||
if (class->index != i)
|
||||
continue;
|
||||
|
||||
pages_to_free += zs_can_compact(class);
|
||||
}
|
||||
|
||||
return pages_to_free;
|
||||
}
|
||||
|
||||
static void zs_unregister_shrinker(struct zs_pool *pool)
|
||||
{
|
||||
if (pool->shrinker_enabled) {
|
||||
unregister_shrinker(&pool->shrinker);
|
||||
pool->shrinker_enabled = false;
|
||||
}
|
||||
}
|
||||
|
||||
static int zs_register_shrinker(struct zs_pool *pool)
|
||||
{
|
||||
pool->shrinker.scan_objects = zs_shrinker_scan;
|
||||
pool->shrinker.count_objects = zs_shrinker_count;
|
||||
pool->shrinker.batch = 0;
|
||||
pool->shrinker.seeks = DEFAULT_SEEKS;
|
||||
|
||||
return register_shrinker(&pool->shrinker);
|
||||
}
|
||||
|
||||
/**
|
||||
* zs_create_pool - Creates an allocation pool to work from.
|
||||
* @flags: allocation flags used to allocate pool metadata
|
||||
@ -1857,6 +1941,12 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags)
|
||||
if (zs_pool_stat_create(name, pool))
|
||||
goto err;
|
||||
|
||||
/*
|
||||
* Not critical, we still can use the pool
|
||||
* and user can trigger compaction manually.
|
||||
*/
|
||||
if (zs_register_shrinker(pool) == 0)
|
||||
pool->shrinker_enabled = true;
|
||||
return pool;
|
||||
|
||||
err:
|
||||
@ -1869,6 +1959,7 @@ void zs_destroy_pool(struct zs_pool *pool)
|
||||
{
|
||||
int i;
|
||||
|
||||
zs_unregister_shrinker(pool);
|
||||
zs_pool_stat_destroy(pool);
|
||||
|
||||
for (i = 0; i < zs_size_classes; i++) {
|
||||
|
75
mm/zswap.c
75
mm/zswap.c
@ -446,75 +446,14 @@ enum zswap_get_swap_ret {
|
||||
static int zswap_get_swap_cache_page(swp_entry_t entry,
|
||||
struct page **retpage)
|
||||
{
|
||||
struct page *found_page, *new_page = NULL;
|
||||
struct address_space *swapper_space = swap_address_space(entry);
|
||||
int err;
|
||||
bool page_was_allocated;
|
||||
|
||||
*retpage = NULL;
|
||||
do {
|
||||
/*
|
||||
* First check the swap cache. Since this is normally
|
||||
* called after lookup_swap_cache() failed, re-calling
|
||||
* that would confuse statistics.
|
||||
*/
|
||||
found_page = find_get_page(swapper_space, entry.val);
|
||||
if (found_page)
|
||||
break;
|
||||
|
||||
/*
|
||||
* Get a new page to read into from swap.
|
||||
*/
|
||||
if (!new_page) {
|
||||
new_page = alloc_page(GFP_KERNEL);
|
||||
if (!new_page)
|
||||
break; /* Out of memory */
|
||||
}
|
||||
|
||||
/*
|
||||
* call radix_tree_preload() while we can wait.
|
||||
*/
|
||||
err = radix_tree_preload(GFP_KERNEL);
|
||||
if (err)
|
||||
break;
|
||||
|
||||
/*
|
||||
* Swap entry may have been freed since our caller observed it.
|
||||
*/
|
||||
err = swapcache_prepare(entry);
|
||||
if (err == -EEXIST) { /* seems racy */
|
||||
radix_tree_preload_end();
|
||||
continue;
|
||||
}
|
||||
if (err) { /* swp entry is obsolete ? */
|
||||
radix_tree_preload_end();
|
||||
break;
|
||||
}
|
||||
|
||||
/* May fail (-ENOMEM) if radix-tree node allocation failed. */
|
||||
__set_page_locked(new_page);
|
||||
SetPageSwapBacked(new_page);
|
||||
err = __add_to_swap_cache(new_page, entry);
|
||||
if (likely(!err)) {
|
||||
radix_tree_preload_end();
|
||||
lru_cache_add_anon(new_page);
|
||||
*retpage = new_page;
|
||||
return ZSWAP_SWAPCACHE_NEW;
|
||||
}
|
||||
radix_tree_preload_end();
|
||||
ClearPageSwapBacked(new_page);
|
||||
__clear_page_locked(new_page);
|
||||
/*
|
||||
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
|
||||
* clear SWAP_HAS_CACHE flag.
|
||||
*/
|
||||
swapcache_free(entry);
|
||||
} while (err != -ENOMEM);
|
||||
|
||||
if (new_page)
|
||||
page_cache_release(new_page);
|
||||
if (!found_page)
|
||||
*retpage = __read_swap_cache_async(entry, GFP_KERNEL,
|
||||
NULL, 0, &page_was_allocated);
|
||||
if (page_was_allocated)
|
||||
return ZSWAP_SWAPCACHE_NEW;
|
||||
if (!*retpage)
|
||||
return ZSWAP_SWAPCACHE_FAIL;
|
||||
*retpage = found_page;
|
||||
return ZSWAP_SWAPCACHE_EXIST;
|
||||
}
|
||||
|
||||
@ -816,7 +755,7 @@ static void zswap_frontswap_invalidate_area(unsigned type)
|
||||
zswap_trees[type] = NULL;
|
||||
}
|
||||
|
||||
static struct zpool_ops zswap_zpool_ops = {
|
||||
static const struct zpool_ops zswap_zpool_ops = {
|
||||
.evict = zswap_writeback_entry
|
||||
};
|
||||
|
||||
|
84
scripts/coccinelle/api/alloc/pool_zalloc-simple.cocci
Normal file
84
scripts/coccinelle/api/alloc/pool_zalloc-simple.cocci
Normal file
@ -0,0 +1,84 @@
|
||||
///
|
||||
/// Use *_pool_zalloc rather than *_pool_alloc followed by memset with 0
|
||||
///
|
||||
// Copyright: (C) 2015 Intel Corp. GPLv2.
|
||||
// Options: --no-includes --include-headers
|
||||
//
|
||||
// Keywords: dma_pool_zalloc, pci_pool_zalloc
|
||||
//
|
||||
|
||||
virtual context
|
||||
virtual patch
|
||||
virtual org
|
||||
virtual report
|
||||
|
||||
//----------------------------------------------------------
|
||||
// For context mode
|
||||
//----------------------------------------------------------
|
||||
|
||||
@depends on context@
|
||||
expression x;
|
||||
statement S;
|
||||
@@
|
||||
|
||||
* x = \(dma_pool_alloc\|pci_pool_alloc\)(...);
|
||||
if ((x==NULL) || ...) S
|
||||
* memset(x,0, ...);
|
||||
|
||||
//----------------------------------------------------------
|
||||
// For patch mode
|
||||
//----------------------------------------------------------
|
||||
|
||||
@depends on patch@
|
||||
expression x;
|
||||
expression a,b,c;
|
||||
statement S;
|
||||
@@
|
||||
|
||||
- x = dma_pool_alloc(a,b,c);
|
||||
+ x = dma_pool_zalloc(a,b,c);
|
||||
if ((x==NULL) || ...) S
|
||||
- memset(x,0,...);
|
||||
|
||||
@depends on patch@
|
||||
expression x;
|
||||
expression a,b,c;
|
||||
statement S;
|
||||
@@
|
||||
|
||||
- x = pci_pool_alloc(a,b,c);
|
||||
+ x = pci_pool_zalloc(a,b,c);
|
||||
if ((x==NULL) || ...) S
|
||||
- memset(x,0,...);
|
||||
|
||||
//----------------------------------------------------------
|
||||
// For org and report mode
|
||||
//----------------------------------------------------------
|
||||
|
||||
@r depends on org || report@
|
||||
expression x;
|
||||
expression a,b,c;
|
||||
statement S;
|
||||
position p;
|
||||
@@
|
||||
|
||||
x = @p\(dma_pool_alloc\|pci_pool_alloc\)(a,b,c);
|
||||
if ((x==NULL) || ...) S
|
||||
memset(x,0, ...);
|
||||
|
||||
@script:python depends on org@
|
||||
p << r.p;
|
||||
x << r.x;
|
||||
@@
|
||||
|
||||
msg="%s" % (x)
|
||||
msg_safe=msg.replace("[","@(").replace("]",")")
|
||||
coccilib.org.print_todo(p[0], msg_safe)
|
||||
|
||||
@script:python depends on report@
|
||||
p << r.p;
|
||||
x << r.x;
|
||||
@@
|
||||
|
||||
msg="WARNING: *_pool_zalloc should be used for %s, instead of *_pool_alloc/memset" % (x)
|
||||
coccilib.report.print_report(p[0], msg)
|
@ -4,7 +4,6 @@ CFLAGS = -Wall
|
||||
BINARIES = compaction_test
|
||||
BINARIES += hugepage-mmap
|
||||
BINARIES += hugepage-shm
|
||||
BINARIES += hugetlbfstest
|
||||
BINARIES += map_hugetlb
|
||||
BINARIES += thuge-gen
|
||||
BINARIES += transhuge-stress
|
||||
|
@ -1,86 +0,0 @@
|
||||
#define _GNU_SOURCE
|
||||
#include <assert.h>
|
||||
#include <fcntl.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
|
||||
typedef unsigned long long u64;
|
||||
|
||||
static size_t length = 1 << 24;
|
||||
|
||||
static u64 read_rss(void)
|
||||
{
|
||||
char buf[4096], *s = buf;
|
||||
int i, fd;
|
||||
u64 rss;
|
||||
|
||||
fd = open("/proc/self/statm", O_RDONLY);
|
||||
assert(fd > 2);
|
||||
memset(buf, 0, sizeof(buf));
|
||||
read(fd, buf, sizeof(buf) - 1);
|
||||
for (i = 0; i < 1; i++)
|
||||
s = strchr(s, ' ') + 1;
|
||||
rss = strtoull(s, NULL, 10);
|
||||
return rss << 12; /* assumes 4k pagesize */
|
||||
}
|
||||
|
||||
static void do_mmap(int fd, int extra_flags, int unmap)
|
||||
{
|
||||
int *p;
|
||||
int flags = MAP_PRIVATE | MAP_POPULATE | extra_flags;
|
||||
u64 before, after;
|
||||
int ret;
|
||||
|
||||
before = read_rss();
|
||||
p = mmap(NULL, length, PROT_READ | PROT_WRITE, flags, fd, 0);
|
||||
assert(p != MAP_FAILED ||
|
||||
!"mmap returned an unexpected error");
|
||||
after = read_rss();
|
||||
assert(llabs(after - before - length) < 0x40000 ||
|
||||
!"rss didn't grow as expected");
|
||||
if (!unmap)
|
||||
return;
|
||||
ret = munmap(p, length);
|
||||
assert(!ret || !"munmap returned an unexpected error");
|
||||
after = read_rss();
|
||||
assert(llabs(after - before) < 0x40000 ||
|
||||
!"rss didn't shrink as expected");
|
||||
}
|
||||
|
||||
static int open_file(const char *path)
|
||||
{
|
||||
int fd, err;
|
||||
|
||||
unlink(path);
|
||||
fd = open(path, O_CREAT | O_RDWR | O_TRUNC | O_EXCL
|
||||
| O_LARGEFILE | O_CLOEXEC, 0600);
|
||||
assert(fd > 2);
|
||||
unlink(path);
|
||||
err = ftruncate(fd, length);
|
||||
assert(!err);
|
||||
return fd;
|
||||
}
|
||||
|
||||
int main(void)
|
||||
{
|
||||
int hugefd, fd;
|
||||
|
||||
fd = open_file("/dev/shm/hugetlbhog");
|
||||
hugefd = open_file("/hugepages/hugetlbhog");
|
||||
|
||||
system("echo 100 > /proc/sys/vm/nr_hugepages");
|
||||
do_mmap(-1, MAP_ANONYMOUS, 1);
|
||||
do_mmap(fd, 0, 1);
|
||||
do_mmap(-1, MAP_ANONYMOUS | MAP_HUGETLB, 1);
|
||||
do_mmap(hugefd, 0, 1);
|
||||
do_mmap(hugefd, MAP_HUGETLB, 1);
|
||||
/* Leak the last one to test do_exit() */
|
||||
do_mmap(-1, MAP_ANONYMOUS | MAP_HUGETLB, 0);
|
||||
printf("oll korrekt.\n");
|
||||
return 0;
|
||||
}
|
@ -75,16 +75,9 @@ else
|
||||
echo "[PASS]"
|
||||
fi
|
||||
|
||||
echo "--------------------"
|
||||
echo "running hugetlbfstest"
|
||||
echo "--------------------"
|
||||
./hugetlbfstest
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "[FAIL]"
|
||||
exitcode=1
|
||||
else
|
||||
echo "[PASS]"
|
||||
fi
|
||||
echo "NOTE: The above hugetlb tests provide minimal coverage. Use"
|
||||
echo " https://github.com/libhugetlbfs/libhugetlbfs.git for"
|
||||
echo " hugetlb regression testing."
|
||||
|
||||
echo "--------------------"
|
||||
echo "running userfaultfd"
|
||||
|
@ -147,7 +147,8 @@ static void *locking_thread(void *arg)
|
||||
if (sizeof(page_nr) > sizeof(rand_nr)) {
|
||||
if (random_r(&rand, &rand_nr))
|
||||
fprintf(stderr, "random_r 2 error\n"), exit(1);
|
||||
page_nr |= ((unsigned long) rand_nr) << 32;
|
||||
page_nr |= (((unsigned long) rand_nr) << 16) <<
|
||||
16;
|
||||
}
|
||||
} else
|
||||
page_nr += 1;
|
||||
@ -290,7 +291,8 @@ static void *uffd_poll_thread(void *arg)
|
||||
msg.event), exit(1);
|
||||
if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
|
||||
fprintf(stderr, "unexpected write fault\n"), exit(1);
|
||||
offset = (char *)msg.arg.pagefault.address - area_dst;
|
||||
offset = (char *)(unsigned long)msg.arg.pagefault.address -
|
||||
area_dst;
|
||||
offset &= ~(page_size-1);
|
||||
if (copy_page(offset))
|
||||
userfaults++;
|
||||
@ -327,7 +329,8 @@ static void *uffd_read_thread(void *arg)
|
||||
if (bounces & BOUNCE_VERIFY &&
|
||||
msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
|
||||
fprintf(stderr, "unexpected write fault\n"), exit(1);
|
||||
offset = (char *)msg.arg.pagefault.address - area_dst;
|
||||
offset = (char *)(unsigned long)msg.arg.pagefault.address -
|
||||
area_dst;
|
||||
offset &= ~(page_size-1);
|
||||
if (copy_page(offset))
|
||||
(*this_cpu_userfaults)++;
|
||||
|
@ -57,23 +57,15 @@
|
||||
* pagemap kernel ABI bits
|
||||
*/
|
||||
|
||||
#define PM_ENTRY_BYTES sizeof(uint64_t)
|
||||
#define PM_STATUS_BITS 3
|
||||
#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
|
||||
#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
|
||||
#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
|
||||
#define PM_PSHIFT_BITS 6
|
||||
#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
|
||||
#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
|
||||
#define __PM_PSHIFT(x) (((uint64_t) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
|
||||
#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
|
||||
#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
|
||||
|
||||
#define __PM_SOFT_DIRTY (1LL)
|
||||
#define PM_PRESENT PM_STATUS(4LL)
|
||||
#define PM_SWAP PM_STATUS(2LL)
|
||||
#define PM_SOFT_DIRTY __PM_PSHIFT(__PM_SOFT_DIRTY)
|
||||
|
||||
#define PM_ENTRY_BYTES 8
|
||||
#define PM_PFRAME_BITS 55
|
||||
#define PM_PFRAME_MASK ((1LL << PM_PFRAME_BITS) - 1)
|
||||
#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
|
||||
#define PM_SOFT_DIRTY (1ULL << 55)
|
||||
#define PM_MMAP_EXCLUSIVE (1ULL << 56)
|
||||
#define PM_FILE (1ULL << 61)
|
||||
#define PM_SWAP (1ULL << 62)
|
||||
#define PM_PRESENT (1ULL << 63)
|
||||
|
||||
/*
|
||||
* kernel page flags
|
||||
@ -100,6 +92,8 @@
|
||||
#define KPF_SLOB_FREE 49
|
||||
#define KPF_SLUB_FROZEN 50
|
||||
#define KPF_SLUB_DEBUG 51
|
||||
#define KPF_FILE 62
|
||||
#define KPF_MMAP_EXCLUSIVE 63
|
||||
|
||||
#define KPF_ALL_BITS ((uint64_t)~0ULL)
|
||||
#define KPF_HACKERS_BITS (0xffffULL << 32)
|
||||
@ -149,6 +143,9 @@ static const char * const page_flag_names[] = {
|
||||
[KPF_SLOB_FREE] = "P:slob_free",
|
||||
[KPF_SLUB_FROZEN] = "A:slub_frozen",
|
||||
[KPF_SLUB_DEBUG] = "E:slub_debug",
|
||||
|
||||
[KPF_FILE] = "F:file",
|
||||
[KPF_MMAP_EXCLUSIVE] = "1:mmap_exclusive",
|
||||
};
|
||||
|
||||
|
||||
@ -452,6 +449,10 @@ static uint64_t expand_overloaded_flags(uint64_t flags, uint64_t pme)
|
||||
|
||||
if (pme & PM_SOFT_DIRTY)
|
||||
flags |= BIT(SOFTDIRTY);
|
||||
if (pme & PM_FILE)
|
||||
flags |= BIT(FILE);
|
||||
if (pme & PM_MMAP_EXCLUSIVE)
|
||||
flags |= BIT(MMAP_EXCLUSIVE);
|
||||
|
||||
return flags;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user