mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-04 04:02:26 +00:00
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton: - various misc bits - DAX updates - OCFS2 - most of MM * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (119 commits) mm,fork: introduce MADV_WIPEONFORK x86,mpx: make mpx depend on x86-64 to free up VMA flag mm: add /proc/pid/smaps_rollup mm: hugetlb: clear target sub-page last when clearing huge page mm: oom: let oom_reap_task and exit_mmap run concurrently swap: choose swap device according to numa node mm: replace TIF_MEMDIE checks by tsk_is_oom_victim mm, oom: do not rely on TIF_MEMDIE for memory reserves access z3fold: use per-cpu unbuddied lists mm, swap: don't use VMA based swap readahead if HDD is used as swap mm, swap: add sysfs interface for VMA based swap readahead mm, swap: VMA based swap readahead mm, swap: fix swap readahead marking mm, swap: add swap readahead hit statistics mm/vmalloc.c: don't reinvent the wheel but use existing llist API mm/vmstat.c: fix wrong comment selftests/memfd: add memfd_create hugetlbfs selftest mm/shmem: add hugetlbfs support to memfd_create() mm, devm_memremap_pages: use multi-order radix for ZONE_DEVICE lookups mm/vmalloc.c: halve the number of comparisons performed in pcpu_get_vm_areas() ...
This commit is contained in:
commit
d34fc1adf0
31
Documentation/ABI/testing/procfs-smaps_rollup
Normal file
31
Documentation/ABI/testing/procfs-smaps_rollup
Normal file
@ -0,0 +1,31 @@
|
||||
What: /proc/pid/smaps_rollup
|
||||
Date: August 2017
|
||||
Contact: Daniel Colascione <dancol@google.com>
|
||||
Description:
|
||||
This file provides pre-summed memory information for a
|
||||
process. The format is identical to /proc/pid/smaps,
|
||||
except instead of an entry for each VMA in a process,
|
||||
smaps_rollup has a single entry (tagged "[rollup]")
|
||||
for which each field is the sum of the corresponding
|
||||
fields from all the maps in /proc/pid/smaps.
|
||||
For more details, see the procfs man page.
|
||||
|
||||
Typical output looks like this:
|
||||
|
||||
00100000-ff709000 ---p 00000000 00:00 0 [rollup]
|
||||
Rss: 884 kB
|
||||
Pss: 385 kB
|
||||
Shared_Clean: 696 kB
|
||||
Shared_Dirty: 0 kB
|
||||
Private_Clean: 120 kB
|
||||
Private_Dirty: 68 kB
|
||||
Referenced: 884 kB
|
||||
Anonymous: 68 kB
|
||||
LazyFree: 0 kB
|
||||
AnonHugePages: 0 kB
|
||||
ShmemPmdMapped: 0 kB
|
||||
Shared_Hugetlb: 0 kB
|
||||
Private_Hugetlb: 0 kB
|
||||
Swap: 0 kB
|
||||
SwapPss: 0 kB
|
||||
Locked: 385 kB
|
@ -90,3 +90,11 @@ Description:
|
||||
device's debugging info useful for kernel developers. Its
|
||||
format is not documented intentionally and may change
|
||||
anytime without any notice.
|
||||
|
||||
What: /sys/block/zram<id>/backing_dev
|
||||
Date: June 2017
|
||||
Contact: Minchan Kim <minchan@kernel.org>
|
||||
Description:
|
||||
The backing_dev file is read-write and sets up the backing
|
||||
device for zram to write incompressible pages.
|
||||
To use it, the user should enable CONFIG_ZRAM_WRITEBACK.
|
||||
|
26
Documentation/ABI/testing/sysfs-kernel-mm-swap
Normal file
26
Documentation/ABI/testing/sysfs-kernel-mm-swap
Normal file
@ -0,0 +1,26 @@
|
||||
What: /sys/kernel/mm/swap/
|
||||
Date: August 2017
|
||||
Contact: Linux memory management mailing list <linux-mm@kvack.org>
|
||||
Description: Interface for swapping
|
||||
|
||||
What: /sys/kernel/mm/swap/vma_ra_enabled
|
||||
Date: August 2017
|
||||
Contact: Linux memory management mailing list <linux-mm@kvack.org>
|
||||
Description: Enable/disable VMA based swap readahead.
|
||||
|
||||
If set to true, the VMA based swap readahead algorithm
|
||||
will be used for swappable anonymous pages mapped in a
|
||||
VMA, and the global swap readahead algorithm will be
|
||||
still used for tmpfs etc. other users. If set to
|
||||
false, the global swap readahead algorithm will be
|
||||
used for all swappable pages.
|
||||
|
||||
What: /sys/kernel/mm/swap/vma_ra_max_order
|
||||
Date: August 2017
|
||||
Contact: Linux memory management mailing list <linux-mm@kvack.org>
|
||||
Description: The max readahead size in order for VMA based swap readahead
|
||||
|
||||
VMA based swap readahead algorithm will readahead at
|
||||
most 1 << max_order pages for each readahead. The
|
||||
real readahead size for each readahead will be scaled
|
||||
according to the estimation algorithm.
|
@ -2783,7 +2783,7 @@
|
||||
Allowed values are enable and disable
|
||||
|
||||
numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA.
|
||||
one of ['zone', 'node', 'default'] can be specified
|
||||
'node', 'default' can be specified
|
||||
This can be set from sysctl after boot.
|
||||
See Documentation/sysctl/vm.txt for details.
|
||||
|
||||
|
@ -168,6 +168,7 @@ max_comp_streams RW the number of possible concurrent compress operations
|
||||
comp_algorithm RW show and change the compression algorithm
|
||||
compact WO trigger memory compaction
|
||||
debug_stat RO this file is used for zram debugging purposes
|
||||
backing_dev RW set up backend storage for zram to write out
|
||||
|
||||
|
||||
User space is advised to use the following files to read the device statistics.
|
||||
@ -231,5 +232,15 @@ line of text and contains the following stats separated by whitespace:
|
||||
resets the disksize to zero. You must set the disksize again
|
||||
before reusing the device.
|
||||
|
||||
* Optional Feature
|
||||
|
||||
= writeback
|
||||
|
||||
With incompressible pages, there is no memory saving with zram.
|
||||
Instead, with CONFIG_ZRAM_WRITEBACK, zram can write incompressible page
|
||||
to backing storage rather than keeping it in memory.
|
||||
User should set up backing device via /sys/block/zramX/backing_dev
|
||||
before disksize setting.
|
||||
|
||||
Nitin Gupta
|
||||
ngupta@vflare.org
|
||||
|
@ -151,8 +151,6 @@ To define an object, a structure of the following type should be filled out:
|
||||
void (*mark_pages_cached)(void *cookie_netfs_data,
|
||||
struct address_space *mapping,
|
||||
struct pagevec *cached_pvec);
|
||||
|
||||
void (*now_uncached)(void *cookie_netfs_data);
|
||||
};
|
||||
|
||||
This has the following fields:
|
||||
|
@ -63,9 +63,8 @@ Filesystem support consists of
|
||||
- implementing an mmap file operation for DAX files which sets the
|
||||
VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to
|
||||
include handlers for fault, pmd_fault, page_mkwrite, pfn_mkwrite. These
|
||||
handlers should probably call dax_iomap_fault() (for fault and page_mkwrite
|
||||
handlers), dax_iomap_pmd_fault(), dax_pfn_mkwrite() passing the appropriate
|
||||
iomap operations.
|
||||
handlers should probably call dax_iomap_fault() passing the appropriate
|
||||
fault size and iomap operations.
|
||||
- calling iomap_zero_range() passing appropriate iomap operations instead of
|
||||
block_truncate_page() for DAX files
|
||||
- ensuring that there is sufficient locking between reads, writes,
|
||||
|
@ -572,7 +572,9 @@ See Documentation/nommu-mmap.txt for more information.
|
||||
|
||||
numa_zonelist_order
|
||||
|
||||
This sysctl is only for NUMA.
|
||||
This sysctl is only for NUMA and it is deprecated. Anything but
|
||||
Node order will fail!
|
||||
|
||||
'where the memory is allocated from' is controlled by zonelists.
|
||||
(This documentation ignores ZONE_HIGHMEM/ZONE_DMA32 for simple explanation.
|
||||
you may be able to read ZONE_DMA as ZONE_DMA32...)
|
||||
|
@ -79,11 +79,8 @@ memory, Linux must decide whether to order the zonelists such that allocations
|
||||
fall back to the same zone type on a different node, or to a different zone
|
||||
type on the same node. This is an important consideration because some zones,
|
||||
such as DMA or DMA32, represent relatively scarce resources. Linux chooses
|
||||
a default zonelist order based on the sizes of the various zone types relative
|
||||
to the total memory of the node and the total memory of the system. The
|
||||
default zonelist order may be overridden using the numa_zonelist_order kernel
|
||||
boot parameter or sysctl. [see Documentation/admin-guide/kernel-parameters.rst and
|
||||
Documentation/sysctl/vm.txt]
|
||||
a default Node ordered zonelist. This means it tries to fallback to other zones
|
||||
from the same node before using remote nodes which are ordered by NUMA distance.
|
||||
|
||||
By default, Linux will attempt to satisfy memory allocation requests from the
|
||||
node to which the CPU that executes the request is assigned. Specifically,
|
||||
|
69
Documentation/vm/swap_numa.txt
Normal file
69
Documentation/vm/swap_numa.txt
Normal file
@ -0,0 +1,69 @@
|
||||
Automatically bind swap device to numa node
|
||||
-------------------------------------------
|
||||
|
||||
If the system has more than one swap device and swap device has the node
|
||||
information, we can make use of this information to decide which swap
|
||||
device to use in get_swap_pages() to get better performance.
|
||||
|
||||
|
||||
How to use this feature
|
||||
-----------------------
|
||||
|
||||
Swap device has priority and that decides the order of it to be used. To make
|
||||
use of automatically binding, there is no need to manipulate priority settings
|
||||
for swap devices. e.g. on a 2 node machine, assume 2 swap devices swapA and
|
||||
swapB, with swapA attached to node 0 and swapB attached to node 1, are going
|
||||
to be swapped on. Simply swapping them on by doing:
|
||||
# swapon /dev/swapA
|
||||
# swapon /dev/swapB
|
||||
|
||||
Then node 0 will use the two swap devices in the order of swapA then swapB and
|
||||
node 1 will use the two swap devices in the order of swapB then swapA. Note
|
||||
that the order of them being swapped on doesn't matter.
|
||||
|
||||
A more complex example on a 4 node machine. Assume 6 swap devices are going to
|
||||
be swapped on: swapA and swapB are attached to node 0, swapC is attached to
|
||||
node 1, swapD and swapE are attached to node 2 and swapF is attached to node 3.
|
||||
The way to swap them on is the same as above:
|
||||
# swapon /dev/swapA
|
||||
# swapon /dev/swapB
|
||||
# swapon /dev/swapC
|
||||
# swapon /dev/swapD
|
||||
# swapon /dev/swapE
|
||||
# swapon /dev/swapF
|
||||
|
||||
Then node 0 will use them in the order of:
|
||||
swapA/swapB -> swapC -> swapD -> swapE -> swapF
|
||||
swapA and swapB will be used in a round robin mode before any other swap device.
|
||||
|
||||
node 1 will use them in the order of:
|
||||
swapC -> swapA -> swapB -> swapD -> swapE -> swapF
|
||||
|
||||
node 2 will use them in the order of:
|
||||
swapD/swapE -> swapA -> swapB -> swapC -> swapF
|
||||
Similarly, swapD and swapE will be used in a round robin mode before any
|
||||
other swap devices.
|
||||
|
||||
node 3 will use them in the order of:
|
||||
swapF -> swapA -> swapB -> swapC -> swapD -> swapE
|
||||
|
||||
|
||||
Implementation details
|
||||
----------------------
|
||||
|
||||
The current code uses a priority based list, swap_avail_list, to decide
|
||||
which swap device to use and if multiple swap devices share the same
|
||||
priority, they are used round robin. This change here replaces the single
|
||||
global swap_avail_list with a per-numa-node list, i.e. for each numa node,
|
||||
it sees its own priority based list of available swap devices. Swap
|
||||
device's priority can be promoted on its matching node's swap_avail_list.
|
||||
|
||||
The current swap device's priority is set as: user can set a >=0 value,
|
||||
or the system will pick one starting from -1 then downwards. The priority
|
||||
value in the swap_avail_list is the negated value of the swap device's priority
|
||||
due to plist being sorted from low to high. The new policy doesn't change
|
||||
the semantics for priority >=0 cases, the previous starting from -1 then
|
||||
downwards now becomes starting from -2 then downwards and -1 is reserved
|
||||
as the promoted value. So if multiple swap devices are attached to the same
|
||||
node, they will all be promoted to priority -1 on that node's plist and will
|
||||
be used round robin before any other swap devices.
|
@ -64,20 +64,12 @@
|
||||
overrides the coredump filter bits */
|
||||
#define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */
|
||||
|
||||
#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */
|
||||
#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */
|
||||
|
||||
/* compatibility flags */
|
||||
#define MAP_FILE 0
|
||||
|
||||
/*
|
||||
* When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
|
||||
* This gives us 6 bits, which is enough until someone invents 128 bit address
|
||||
* spaces.
|
||||
*
|
||||
* Assume these are all power of twos.
|
||||
* When 0 use the default page size.
|
||||
*/
|
||||
#define MAP_HUGE_SHIFT 26
|
||||
#define MAP_HUGE_MASK 0x3f
|
||||
|
||||
#define PKEY_DISABLE_ACCESS 0x1
|
||||
#define PKEY_DISABLE_WRITE 0x2
|
||||
#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
|
||||
|
@ -4,7 +4,6 @@
|
||||
#ifdef CONFIG_NUMA
|
||||
|
||||
#define cpu_to_node(cpu) ((void)(cpu), 0)
|
||||
#define parent_node(node) ((void)(node), 0)
|
||||
|
||||
#define cpumask_of_node(node) ((void)node, cpu_online_mask)
|
||||
|
||||
|
@ -91,20 +91,12 @@
|
||||
overrides the coredump filter bits */
|
||||
#define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */
|
||||
|
||||
#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */
|
||||
#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */
|
||||
|
||||
/* compatibility flags */
|
||||
#define MAP_FILE 0
|
||||
|
||||
/*
|
||||
* When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
|
||||
* This gives us 6 bits, which is enough until someone invents 128 bit address
|
||||
* spaces.
|
||||
*
|
||||
* Assume these are all power of twos.
|
||||
* When 0 use the default page size.
|
||||
*/
|
||||
#define MAP_HUGE_SHIFT 26
|
||||
#define MAP_HUGE_MASK 0x3f
|
||||
|
||||
#define PKEY_DISABLE_ACCESS 0x1
|
||||
#define PKEY_DISABLE_WRITE 0x2
|
||||
#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
|
||||
|
@ -57,6 +57,9 @@
|
||||
overrides the coredump filter bits */
|
||||
#define MADV_DODUMP 70 /* Clear the MADV_NODUMP flag */
|
||||
|
||||
#define MADV_WIPEONFORK 71 /* Zero memory on fork, child only */
|
||||
#define MADV_KEEPONFORK 72 /* Undo MADV_WIPEONFORK */
|
||||
|
||||
#define MADV_HWPOISON 100 /* poison a page for testing */
|
||||
#define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */
|
||||
|
||||
@ -64,17 +67,6 @@
|
||||
#define MAP_FILE 0
|
||||
#define MAP_VARIABLE 0
|
||||
|
||||
/*
|
||||
* When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
|
||||
* This gives us 6 bits, which is enough until someone invents 128 bit address
|
||||
* spaces.
|
||||
*
|
||||
* Assume these are all power of twos.
|
||||
* When 0 use the default page size.
|
||||
*/
|
||||
#define MAP_HUGE_SHIFT 26
|
||||
#define MAP_HUGE_MASK 0x3f
|
||||
|
||||
#define PKEY_DISABLE_ACCESS 0x1
|
||||
#define PKEY_DISABLE_WRITE 0x2
|
||||
#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
|
||||
|
@ -29,20 +29,4 @@
|
||||
#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */
|
||||
#define MAP_HUGETLB 0x40000 /* create a huge page mapping */
|
||||
|
||||
/*
|
||||
* When MAP_HUGETLB is set, bits [26:31] of the flags argument to mmap(2),
|
||||
* encode the log2 of the huge page size. A value of zero indicates that the
|
||||
* default huge page size should be used. To use a non-default huge page size,
|
||||
* one of these defines can be used, or the size can be encoded by hand. Note
|
||||
* that on most systems only a subset, or possibly none, of these sizes will be
|
||||
* available.
|
||||
*/
|
||||
#define MAP_HUGE_512KB (19 << MAP_HUGE_SHIFT) /* 512KB HugeTLB Page */
|
||||
#define MAP_HUGE_1MB (20 << MAP_HUGE_SHIFT) /* 1MB HugeTLB Page */
|
||||
#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) /* 2MB HugeTLB Page */
|
||||
#define MAP_HUGE_8MB (23 << MAP_HUGE_SHIFT) /* 8MB HugeTLB Page */
|
||||
#define MAP_HUGE_16MB (24 << MAP_HUGE_SHIFT) /* 16MB HugeTLB Page */
|
||||
#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) /* 1GB HugeTLB Page */
|
||||
#define MAP_HUGE_16GB (34 << MAP_HUGE_SHIFT) /* 16GB HugeTLB Page */
|
||||
|
||||
#endif /* _UAPI_ASM_POWERPC_MMAN_H */
|
||||
|
@ -1806,7 +1806,9 @@ config X86_SMAP
|
||||
config X86_INTEL_MPX
|
||||
prompt "Intel MPX (Memory Protection Extensions)"
|
||||
def_bool n
|
||||
depends on CPU_SUP_INTEL
|
||||
# Note: only available in 64-bit mode due to VMA flags shortage
|
||||
depends on CPU_SUP_INTEL && X86_64
|
||||
select ARCH_USES_HIGH_VMA_FLAGS
|
||||
---help---
|
||||
MPX provides hardware features that can be used in
|
||||
conjunction with compiler-instrumented code to check
|
||||
|
@ -3,9 +3,6 @@
|
||||
|
||||
#define MAP_32BIT 0x40 /* only give out 32bit addresses */
|
||||
|
||||
#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT)
|
||||
#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)
|
||||
|
||||
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
|
||||
/*
|
||||
* Take the 4 protection key bits out of the vma->vm_flags
|
||||
|
@ -103,20 +103,12 @@
|
||||
overrides the coredump filter bits */
|
||||
#define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */
|
||||
|
||||
#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */
|
||||
#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */
|
||||
|
||||
/* compatibility flags */
|
||||
#define MAP_FILE 0
|
||||
|
||||
/*
|
||||
* When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
|
||||
* This gives us 6 bits, which is enough until someone invents 128 bit address
|
||||
* spaces.
|
||||
*
|
||||
* Assume these are all power of twos.
|
||||
* When 0 use the default page size.
|
||||
*/
|
||||
#define MAP_HUGE_SHIFT 26
|
||||
#define MAP_HUGE_MASK 0x3f
|
||||
|
||||
#define PKEY_DISABLE_ACCESS 0x1
|
||||
#define PKEY_DISABLE_WRITE 0x2
|
||||
#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
|
||||
|
@ -388,6 +388,19 @@ static ssize_t show_phys_device(struct device *dev,
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTREMOVE
|
||||
static void print_allowed_zone(char *buf, int nid, unsigned long start_pfn,
|
||||
unsigned long nr_pages, int online_type,
|
||||
struct zone *default_zone)
|
||||
{
|
||||
struct zone *zone;
|
||||
|
||||
zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
|
||||
if (zone != default_zone) {
|
||||
strcat(buf, " ");
|
||||
strcat(buf, zone->name);
|
||||
}
|
||||
}
|
||||
|
||||
static ssize_t show_valid_zones(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
@ -395,7 +408,7 @@ static ssize_t show_valid_zones(struct device *dev,
|
||||
unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
|
||||
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
|
||||
unsigned long valid_start_pfn, valid_end_pfn;
|
||||
bool append = false;
|
||||
struct zone *default_zone;
|
||||
int nid;
|
||||
|
||||
/*
|
||||
@ -418,16 +431,13 @@ static ssize_t show_valid_zones(struct device *dev,
|
||||
}
|
||||
|
||||
nid = pfn_to_nid(start_pfn);
|
||||
if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL)) {
|
||||
strcat(buf, default_zone_for_pfn(nid, start_pfn, nr_pages)->name);
|
||||
append = true;
|
||||
}
|
||||
default_zone = zone_for_pfn_range(MMOP_ONLINE_KEEP, nid, start_pfn, nr_pages);
|
||||
strcat(buf, default_zone->name);
|
||||
|
||||
if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE)) {
|
||||
if (append)
|
||||
strcat(buf, " ");
|
||||
strcat(buf, NODE_DATA(nid)->node_zones[ZONE_MOVABLE].name);
|
||||
}
|
||||
print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL,
|
||||
default_zone);
|
||||
print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE,
|
||||
default_zone);
|
||||
out:
|
||||
strcat(buf, "\n");
|
||||
|
||||
|
@ -326,7 +326,11 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
|
||||
struct page *page, bool is_write)
|
||||
{
|
||||
struct brd_device *brd = bdev->bd_disk->private_data;
|
||||
int err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector);
|
||||
int err;
|
||||
|
||||
if (PageTransHuge(page))
|
||||
return -ENOTSUPP;
|
||||
err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector);
|
||||
page_endio(page, is_write, err);
|
||||
return err;
|
||||
}
|
||||
|
@ -13,3 +13,15 @@ config ZRAM
|
||||
disks and maybe many more.
|
||||
|
||||
See zram.txt for more information.
|
||||
|
||||
config ZRAM_WRITEBACK
|
||||
bool "Write back incompressible page to backing device"
|
||||
depends on ZRAM
|
||||
default n
|
||||
help
|
||||
With incompressible page, there is no memory saving to keep it
|
||||
in memory. Instead, write it out to backing device.
|
||||
For this feature, admin should set up backing device via
|
||||
/sys/block/zramX/backing_dev.
|
||||
|
||||
See zram.txt for more information.
|
||||
|
@ -270,6 +270,349 @@ static ssize_t mem_used_max_store(struct device *dev,
|
||||
return len;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_ZRAM_WRITEBACK
|
||||
static bool zram_wb_enabled(struct zram *zram)
|
||||
{
|
||||
return zram->backing_dev;
|
||||
}
|
||||
|
||||
static void reset_bdev(struct zram *zram)
|
||||
{
|
||||
struct block_device *bdev;
|
||||
|
||||
if (!zram_wb_enabled(zram))
|
||||
return;
|
||||
|
||||
bdev = zram->bdev;
|
||||
if (zram->old_block_size)
|
||||
set_blocksize(bdev, zram->old_block_size);
|
||||
blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
|
||||
/* hope filp_close flush all of IO */
|
||||
filp_close(zram->backing_dev, NULL);
|
||||
zram->backing_dev = NULL;
|
||||
zram->old_block_size = 0;
|
||||
zram->bdev = NULL;
|
||||
|
||||
kvfree(zram->bitmap);
|
||||
zram->bitmap = NULL;
|
||||
}
|
||||
|
||||
static ssize_t backing_dev_show(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
struct zram *zram = dev_to_zram(dev);
|
||||
struct file *file = zram->backing_dev;
|
||||
char *p;
|
||||
ssize_t ret;
|
||||
|
||||
down_read(&zram->init_lock);
|
||||
if (!zram_wb_enabled(zram)) {
|
||||
memcpy(buf, "none\n", 5);
|
||||
up_read(&zram->init_lock);
|
||||
return 5;
|
||||
}
|
||||
|
||||
p = file_path(file, buf, PAGE_SIZE - 1);
|
||||
if (IS_ERR(p)) {
|
||||
ret = PTR_ERR(p);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = strlen(p);
|
||||
memmove(buf, p, ret);
|
||||
buf[ret++] = '\n';
|
||||
out:
|
||||
up_read(&zram->init_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static ssize_t backing_dev_store(struct device *dev,
|
||||
struct device_attribute *attr, const char *buf, size_t len)
|
||||
{
|
||||
char *file_name;
|
||||
struct file *backing_dev = NULL;
|
||||
struct inode *inode;
|
||||
struct address_space *mapping;
|
||||
unsigned int bitmap_sz, old_block_size = 0;
|
||||
unsigned long nr_pages, *bitmap = NULL;
|
||||
struct block_device *bdev = NULL;
|
||||
int err;
|
||||
struct zram *zram = dev_to_zram(dev);
|
||||
|
||||
file_name = kmalloc(PATH_MAX, GFP_KERNEL);
|
||||
if (!file_name)
|
||||
return -ENOMEM;
|
||||
|
||||
down_write(&zram->init_lock);
|
||||
if (init_done(zram)) {
|
||||
pr_info("Can't setup backing device for initialized device\n");
|
||||
err = -EBUSY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
strlcpy(file_name, buf, len);
|
||||
|
||||
backing_dev = filp_open(file_name, O_RDWR|O_LARGEFILE, 0);
|
||||
if (IS_ERR(backing_dev)) {
|
||||
err = PTR_ERR(backing_dev);
|
||||
backing_dev = NULL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
mapping = backing_dev->f_mapping;
|
||||
inode = mapping->host;
|
||||
|
||||
/* Support only block device in this moment */
|
||||
if (!S_ISBLK(inode->i_mode)) {
|
||||
err = -ENOTBLK;
|
||||
goto out;
|
||||
}
|
||||
|
||||
bdev = bdgrab(I_BDEV(inode));
|
||||
err = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, zram);
|
||||
if (err < 0)
|
||||
goto out;
|
||||
|
||||
nr_pages = i_size_read(inode) >> PAGE_SHIFT;
|
||||
bitmap_sz = BITS_TO_LONGS(nr_pages) * sizeof(long);
|
||||
bitmap = kvzalloc(bitmap_sz, GFP_KERNEL);
|
||||
if (!bitmap) {
|
||||
err = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
old_block_size = block_size(bdev);
|
||||
err = set_blocksize(bdev, PAGE_SIZE);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
reset_bdev(zram);
|
||||
spin_lock_init(&zram->bitmap_lock);
|
||||
|
||||
zram->old_block_size = old_block_size;
|
||||
zram->bdev = bdev;
|
||||
zram->backing_dev = backing_dev;
|
||||
zram->bitmap = bitmap;
|
||||
zram->nr_pages = nr_pages;
|
||||
up_write(&zram->init_lock);
|
||||
|
||||
pr_info("setup backing device %s\n", file_name);
|
||||
kfree(file_name);
|
||||
|
||||
return len;
|
||||
out:
|
||||
if (bitmap)
|
||||
kvfree(bitmap);
|
||||
|
||||
if (bdev)
|
||||
blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
|
||||
|
||||
if (backing_dev)
|
||||
filp_close(backing_dev, NULL);
|
||||
|
||||
up_write(&zram->init_lock);
|
||||
|
||||
kfree(file_name);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static unsigned long get_entry_bdev(struct zram *zram)
|
||||
{
|
||||
unsigned long entry;
|
||||
|
||||
spin_lock(&zram->bitmap_lock);
|
||||
/* skip 0 bit to confuse zram.handle = 0 */
|
||||
entry = find_next_zero_bit(zram->bitmap, zram->nr_pages, 1);
|
||||
if (entry == zram->nr_pages) {
|
||||
spin_unlock(&zram->bitmap_lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
set_bit(entry, zram->bitmap);
|
||||
spin_unlock(&zram->bitmap_lock);
|
||||
|
||||
return entry;
|
||||
}
|
||||
|
||||
static void put_entry_bdev(struct zram *zram, unsigned long entry)
|
||||
{
|
||||
int was_set;
|
||||
|
||||
spin_lock(&zram->bitmap_lock);
|
||||
was_set = test_and_clear_bit(entry, zram->bitmap);
|
||||
spin_unlock(&zram->bitmap_lock);
|
||||
WARN_ON_ONCE(!was_set);
|
||||
}
|
||||
|
||||
void zram_page_end_io(struct bio *bio)
|
||||
{
|
||||
struct page *page = bio->bi_io_vec[0].bv_page;
|
||||
|
||||
page_endio(page, op_is_write(bio_op(bio)),
|
||||
blk_status_to_errno(bio->bi_status));
|
||||
bio_put(bio);
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns 1 if the submission is successful.
|
||||
*/
|
||||
static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec,
|
||||
unsigned long entry, struct bio *parent)
|
||||
{
|
||||
struct bio *bio;
|
||||
|
||||
bio = bio_alloc(GFP_ATOMIC, 1);
|
||||
if (!bio)
|
||||
return -ENOMEM;
|
||||
|
||||
bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9);
|
||||
bio->bi_bdev = zram->bdev;
|
||||
if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len, bvec->bv_offset)) {
|
||||
bio_put(bio);
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
if (!parent) {
|
||||
bio->bi_opf = REQ_OP_READ;
|
||||
bio->bi_end_io = zram_page_end_io;
|
||||
} else {
|
||||
bio->bi_opf = parent->bi_opf;
|
||||
bio_chain(bio, parent);
|
||||
}
|
||||
|
||||
submit_bio(bio);
|
||||
return 1;
|
||||
}
|
||||
|
||||
struct zram_work {
|
||||
struct work_struct work;
|
||||
struct zram *zram;
|
||||
unsigned long entry;
|
||||
struct bio *bio;
|
||||
};
|
||||
|
||||
#if PAGE_SIZE != 4096
|
||||
static void zram_sync_read(struct work_struct *work)
|
||||
{
|
||||
struct bio_vec bvec;
|
||||
struct zram_work *zw = container_of(work, struct zram_work, work);
|
||||
struct zram *zram = zw->zram;
|
||||
unsigned long entry = zw->entry;
|
||||
struct bio *bio = zw->bio;
|
||||
|
||||
read_from_bdev_async(zram, &bvec, entry, bio);
|
||||
}
|
||||
|
||||
/*
|
||||
* Block layer want one ->make_request_fn to be active at a time
|
||||
* so if we use chained IO with parent IO in same context,
|
||||
* it's a deadlock. To avoid, it, it uses worker thread context.
|
||||
*/
|
||||
static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec,
|
||||
unsigned long entry, struct bio *bio)
|
||||
{
|
||||
struct zram_work work;
|
||||
|
||||
work.zram = zram;
|
||||
work.entry = entry;
|
||||
work.bio = bio;
|
||||
|
||||
INIT_WORK_ONSTACK(&work.work, zram_sync_read);
|
||||
queue_work(system_unbound_wq, &work.work);
|
||||
flush_work(&work.work);
|
||||
destroy_work_on_stack(&work.work);
|
||||
|
||||
return 1;
|
||||
}
|
||||
#else
|
||||
static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec,
|
||||
unsigned long entry, struct bio *bio)
|
||||
{
|
||||
WARN_ON(1);
|
||||
return -EIO;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
 * Read backing-device slot @entry into @bvec, dispatching to the
 * synchronous or asynchronous helper according to @sync. The helper's
 * return value is passed through unchanged.
 */
static int read_from_bdev(struct zram *zram, struct bio_vec *bvec,
			unsigned long entry, struct bio *parent, bool sync)
{
	return sync ? read_from_bdev_sync(zram, bvec, entry, parent)
		    : read_from_bdev_async(zram, bvec, entry, parent);
}
|
||||
|
||||
static int write_to_bdev(struct zram *zram, struct bio_vec *bvec,
|
||||
u32 index, struct bio *parent,
|
||||
unsigned long *pentry)
|
||||
{
|
||||
struct bio *bio;
|
||||
unsigned long entry;
|
||||
|
||||
bio = bio_alloc(GFP_ATOMIC, 1);
|
||||
if (!bio)
|
||||
return -ENOMEM;
|
||||
|
||||
entry = get_entry_bdev(zram);
|
||||
if (!entry) {
|
||||
bio_put(bio);
|
||||
return -ENOSPC;
|
||||
}
|
||||
|
||||
bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9);
|
||||
bio->bi_bdev = zram->bdev;
|
||||
if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len,
|
||||
bvec->bv_offset)) {
|
||||
bio_put(bio);
|
||||
put_entry_bdev(zram, entry);
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
if (!parent) {
|
||||
bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
|
||||
bio->bi_end_io = zram_page_end_io;
|
||||
} else {
|
||||
bio->bi_opf = parent->bi_opf;
|
||||
bio_chain(bio, parent);
|
||||
}
|
||||
|
||||
submit_bio(bio);
|
||||
*pentry = entry;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Drop the written-back state of slot @index: clear its ZRAM_WB flag,
 * reset its element to 0 and release the backing-device slot that the
 * element referred to.
 */
static void zram_wb_clear(struct zram *zram, u32 index)
{
	unsigned long bdev_slot;

	zram_clear_flag(zram, index, ZRAM_WB);

	/* Fetch the slot number before the element is overwritten. */
	bdev_slot = zram_get_element(zram, index);
	zram_set_element(zram, index, 0);

	put_entry_bdev(zram, bdev_slot);
}
|
||||
|
||||
#else
|
||||
static bool zram_wb_enabled(struct zram *zram) { return false; }
|
||||
static inline void reset_bdev(struct zram *zram) {};
|
||||
static int write_to_bdev(struct zram *zram, struct bio_vec *bvec,
|
||||
u32 index, struct bio *parent,
|
||||
unsigned long *pentry)
|
||||
|
||||
{
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
static int read_from_bdev(struct zram *zram, struct bio_vec *bvec,
|
||||
unsigned long entry, struct bio *parent, bool sync)
|
||||
{
|
||||
return -EIO;
|
||||
}
|
||||
static void zram_wb_clear(struct zram *zram, u32 index) {}
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
* We switched to per-cpu streams and this attr is not needed anymore.
|
||||
* However, we will keep it around for some time, because:
|
||||
@ -453,30 +796,6 @@ static bool zram_same_page_read(struct zram *zram, u32 index,
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
 * Try to store @page at @index as a "same element" page.
 *
 * If page_same_filled() reports the page as same-filled, the slot's
 * previous contents are freed and the slot is marked ZRAM_SAME with the
 * fill value as its element, all under the slot lock; the same_pages and
 * pages_stored counters are then incremented.
 *
 * Returns true when the page was stored this way, false otherwise (in
 * which case nothing is modified).
 */
static bool zram_same_page_write(struct zram *zram, u32 index,
					struct page *page)
{
	unsigned long fill;
	void *kaddr = kmap_atomic(page);
	bool same = page_same_filled(kaddr, &fill);

	/* The mapping is only needed for the scan above. */
	kunmap_atomic(kaddr);
	if (!same)
		return false;

	/* Free memory associated with this sector now. */
	zram_slot_lock(zram, index);
	zram_free_page(zram, index);
	zram_set_flag(zram, index, ZRAM_SAME);
	zram_set_element(zram, index, fill);
	zram_slot_unlock(zram, index);

	atomic64_inc(&zram->stats.same_pages);
	atomic64_inc(&zram->stats.pages_stored);
	return true;
}
|
||||
|
||||
static void zram_meta_free(struct zram *zram, u64 disksize)
|
||||
{
|
||||
size_t num_pages = disksize >> PAGE_SHIFT;
|
||||
@ -515,7 +834,13 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
|
||||
*/
|
||||
static void zram_free_page(struct zram *zram, size_t index)
|
||||
{
|
||||
unsigned long handle = zram_get_handle(zram, index);
|
||||
unsigned long handle;
|
||||
|
||||
if (zram_wb_enabled(zram) && zram_test_flag(zram, index, ZRAM_WB)) {
|
||||
zram_wb_clear(zram, index);
|
||||
atomic64_dec(&zram->stats.pages_stored);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* No memory is allocated for same element filled pages.
|
||||
@ -529,6 +854,7 @@ static void zram_free_page(struct zram *zram, size_t index)
|
||||
return;
|
||||
}
|
||||
|
||||
handle = zram_get_handle(zram, index);
|
||||
if (!handle)
|
||||
return;
|
||||
|
||||
@ -542,13 +868,31 @@ static void zram_free_page(struct zram *zram, size_t index)
|
||||
zram_set_obj_size(zram, index, 0);
|
||||
}
|
||||
|
||||
static int zram_decompress_page(struct zram *zram, struct page *page, u32 index)
|
||||
static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
|
||||
struct bio *bio, bool partial_io)
|
||||
{
|
||||
int ret;
|
||||
unsigned long handle;
|
||||
unsigned int size;
|
||||
void *src, *dst;
|
||||
|
||||
if (zram_wb_enabled(zram)) {
|
||||
zram_slot_lock(zram, index);
|
||||
if (zram_test_flag(zram, index, ZRAM_WB)) {
|
||||
struct bio_vec bvec;
|
||||
|
||||
zram_slot_unlock(zram, index);
|
||||
|
||||
bvec.bv_page = page;
|
||||
bvec.bv_len = PAGE_SIZE;
|
||||
bvec.bv_offset = 0;
|
||||
return read_from_bdev(zram, &bvec,
|
||||
zram_get_element(zram, index),
|
||||
bio, partial_io);
|
||||
}
|
||||
zram_slot_unlock(zram, index);
|
||||
}
|
||||
|
||||
if (zram_same_page_read(zram, index, page, 0, PAGE_SIZE))
|
||||
return 0;
|
||||
|
||||
@ -581,7 +925,7 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index)
|
||||
}
|
||||
|
||||
static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
|
||||
u32 index, int offset)
|
||||
u32 index, int offset, struct bio *bio)
|
||||
{
|
||||
int ret;
|
||||
struct page *page;
|
||||
@ -594,7 +938,7 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
ret = zram_decompress_page(zram, page, index);
|
||||
ret = __zram_bvec_read(zram, page, index, bio, is_partial_io(bvec));
|
||||
if (unlikely(ret))
|
||||
goto out;
|
||||
|
||||
@ -613,30 +957,57 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm,
|
||||
struct page *page,
|
||||
unsigned long *out_handle, unsigned int *out_comp_len)
|
||||
static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
|
||||
u32 index, struct bio *bio)
|
||||
{
|
||||
int ret;
|
||||
unsigned int comp_len;
|
||||
void *src;
|
||||
int ret = 0;
|
||||
unsigned long alloced_pages;
|
||||
unsigned long handle = 0;
|
||||
unsigned int comp_len = 0;
|
||||
void *src, *dst, *mem;
|
||||
struct zcomp_strm *zstrm;
|
||||
struct page *page = bvec->bv_page;
|
||||
unsigned long element = 0;
|
||||
enum zram_pageflags flags = 0;
|
||||
bool allow_wb = true;
|
||||
|
||||
mem = kmap_atomic(page);
|
||||
if (page_same_filled(mem, &element)) {
|
||||
kunmap_atomic(mem);
|
||||
/* Free memory associated with this sector now. */
|
||||
flags = ZRAM_SAME;
|
||||
atomic64_inc(&zram->stats.same_pages);
|
||||
goto out;
|
||||
}
|
||||
kunmap_atomic(mem);
|
||||
|
||||
compress_again:
|
||||
zstrm = zcomp_stream_get(zram->comp);
|
||||
src = kmap_atomic(page);
|
||||
ret = zcomp_compress(*zstrm, src, &comp_len);
|
||||
ret = zcomp_compress(zstrm, src, &comp_len);
|
||||
kunmap_atomic(src);
|
||||
|
||||
if (unlikely(ret)) {
|
||||
zcomp_stream_put(zram->comp);
|
||||
pr_err("Compression failed! err=%d\n", ret);
|
||||
if (handle)
|
||||
zs_free(zram->mem_pool, handle);
|
||||
zs_free(zram->mem_pool, handle);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (unlikely(comp_len > max_zpage_size))
|
||||
if (unlikely(comp_len > max_zpage_size)) {
|
||||
if (zram_wb_enabled(zram) && allow_wb) {
|
||||
zcomp_stream_put(zram->comp);
|
||||
ret = write_to_bdev(zram, bvec, index, bio, &element);
|
||||
if (!ret) {
|
||||
flags = ZRAM_WB;
|
||||
ret = 1;
|
||||
goto out;
|
||||
}
|
||||
allow_wb = false;
|
||||
goto compress_again;
|
||||
}
|
||||
comp_len = PAGE_SIZE;
|
||||
}
|
||||
|
||||
/*
|
||||
* handle allocation has 2 paths:
|
||||
@ -663,7 +1034,6 @@ static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm,
|
||||
handle = zs_malloc(zram->mem_pool, comp_len,
|
||||
GFP_NOIO | __GFP_HIGHMEM |
|
||||
__GFP_MOVABLE);
|
||||
*zstrm = zcomp_stream_get(zram->comp);
|
||||
if (handle)
|
||||
goto compress_again;
|
||||
return -ENOMEM;
|
||||
@ -673,34 +1043,11 @@ static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm,
|
||||
update_used_max(zram, alloced_pages);
|
||||
|
||||
if (zram->limit_pages && alloced_pages > zram->limit_pages) {
|
||||
zcomp_stream_put(zram->comp);
|
||||
zs_free(zram->mem_pool, handle);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
*out_handle = handle;
|
||||
*out_comp_len = comp_len;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
|
||||
{
|
||||
int ret;
|
||||
unsigned long handle;
|
||||
unsigned int comp_len;
|
||||
void *src, *dst;
|
||||
struct zcomp_strm *zstrm;
|
||||
struct page *page = bvec->bv_page;
|
||||
|
||||
if (zram_same_page_write(zram, index, page))
|
||||
return 0;
|
||||
|
||||
zstrm = zcomp_stream_get(zram->comp);
|
||||
ret = zram_compress(zram, &zstrm, page, &handle, &comp_len);
|
||||
if (ret) {
|
||||
zcomp_stream_put(zram->comp);
|
||||
return ret;
|
||||
}
|
||||
|
||||
dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO);
|
||||
|
||||
src = zstrm->buffer;
|
||||
@ -712,25 +1059,31 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
|
||||
|
||||
zcomp_stream_put(zram->comp);
|
||||
zs_unmap_object(zram->mem_pool, handle);
|
||||
|
||||
atomic64_add(comp_len, &zram->stats.compr_data_size);
|
||||
out:
|
||||
/*
|
||||
* Free memory associated with this sector
|
||||
* before overwriting unused sectors.
|
||||
*/
|
||||
zram_slot_lock(zram, index);
|
||||
zram_free_page(zram, index);
|
||||
zram_set_handle(zram, index, handle);
|
||||
zram_set_obj_size(zram, index, comp_len);
|
||||
|
||||
if (flags) {
|
||||
zram_set_flag(zram, index, flags);
|
||||
zram_set_element(zram, index, element);
|
||||
} else {
|
||||
zram_set_handle(zram, index, handle);
|
||||
zram_set_obj_size(zram, index, comp_len);
|
||||
}
|
||||
zram_slot_unlock(zram, index);
|
||||
|
||||
/* Update stats */
|
||||
atomic64_add(comp_len, &zram->stats.compr_data_size);
|
||||
atomic64_inc(&zram->stats.pages_stored);
|
||||
return 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
|
||||
u32 index, int offset)
|
||||
u32 index, int offset, struct bio *bio)
|
||||
{
|
||||
int ret;
|
||||
struct page *page = NULL;
|
||||
@ -748,7 +1101,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
|
||||
if (!page)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = zram_decompress_page(zram, page, index);
|
||||
ret = __zram_bvec_read(zram, page, index, bio, true);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
@ -763,7 +1116,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
|
||||
vec.bv_offset = 0;
|
||||
}
|
||||
|
||||
ret = __zram_bvec_write(zram, &vec, index);
|
||||
ret = __zram_bvec_write(zram, &vec, index, bio);
|
||||
out:
|
||||
if (is_partial_io(bvec))
|
||||
__free_page(page);
|
||||
@ -808,8 +1161,13 @@ static void zram_bio_discard(struct zram *zram, u32 index,
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns errno if it has some problem. Otherwise return 0 or 1.
|
||||
* Returns 0 if IO request was done synchronously
|
||||
* Returns 1 if IO request was successfully submitted.
|
||||
*/
|
||||
static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
|
||||
int offset, bool is_write)
|
||||
int offset, bool is_write, struct bio *bio)
|
||||
{
|
||||
unsigned long start_time = jiffies;
|
||||
int rw_acct = is_write ? REQ_OP_WRITE : REQ_OP_READ;
|
||||
@ -820,16 +1178,16 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
|
||||
|
||||
if (!is_write) {
|
||||
atomic64_inc(&zram->stats.num_reads);
|
||||
ret = zram_bvec_read(zram, bvec, index, offset);
|
||||
ret = zram_bvec_read(zram, bvec, index, offset, bio);
|
||||
flush_dcache_page(bvec->bv_page);
|
||||
} else {
|
||||
atomic64_inc(&zram->stats.num_writes);
|
||||
ret = zram_bvec_write(zram, bvec, index, offset);
|
||||
ret = zram_bvec_write(zram, bvec, index, offset, bio);
|
||||
}
|
||||
|
||||
generic_end_io_acct(rw_acct, &zram->disk->part0, start_time);
|
||||
|
||||
if (unlikely(ret)) {
|
||||
if (unlikely(ret < 0)) {
|
||||
if (!is_write)
|
||||
atomic64_inc(&zram->stats.failed_reads);
|
||||
else
|
||||
@ -868,7 +1226,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
|
||||
bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset,
|
||||
unwritten);
|
||||
if (zram_bvec_rw(zram, &bv, index, offset,
|
||||
op_is_write(bio_op(bio))) < 0)
|
||||
op_is_write(bio_op(bio)), bio) < 0)
|
||||
goto out;
|
||||
|
||||
bv.bv_offset += bv.bv_len;
|
||||
@ -922,16 +1280,18 @@ static void zram_slot_free_notify(struct block_device *bdev,
|
||||
static int zram_rw_page(struct block_device *bdev, sector_t sector,
|
||||
struct page *page, bool is_write)
|
||||
{
|
||||
int offset, err = -EIO;
|
||||
int offset, ret;
|
||||
u32 index;
|
||||
struct zram *zram;
|
||||
struct bio_vec bv;
|
||||
|
||||
if (PageTransHuge(page))
|
||||
return -ENOTSUPP;
|
||||
zram = bdev->bd_disk->private_data;
|
||||
|
||||
if (!valid_io_request(zram, sector, PAGE_SIZE)) {
|
||||
atomic64_inc(&zram->stats.invalid_io);
|
||||
err = -EINVAL;
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
@ -942,7 +1302,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
|
||||
bv.bv_len = PAGE_SIZE;
|
||||
bv.bv_offset = 0;
|
||||
|
||||
err = zram_bvec_rw(zram, &bv, index, offset, is_write);
|
||||
ret = zram_bvec_rw(zram, &bv, index, offset, is_write, NULL);
|
||||
out:
|
||||
/*
|
||||
* If I/O fails, just return error(ie, non-zero) without
|
||||
@ -952,9 +1312,20 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
|
||||
* bio->bi_end_io does things to handle the error
|
||||
* (e.g., SetPageError, set_page_dirty and extra works).
|
||||
*/
|
||||
if (err == 0)
|
||||
if (unlikely(ret < 0))
|
||||
return ret;
|
||||
|
||||
switch (ret) {
|
||||
case 0:
|
||||
page_endio(page, is_write, 0);
|
||||
return err;
|
||||
break;
|
||||
case 1:
|
||||
ret = 0;
|
||||
break;
|
||||
default:
|
||||
WARN_ON(1);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void zram_reset_device(struct zram *zram)
|
||||
@ -983,6 +1354,7 @@ static void zram_reset_device(struct zram *zram)
|
||||
zram_meta_free(zram, disksize);
|
||||
memset(&zram->stats, 0, sizeof(zram->stats));
|
||||
zcomp_destroy(comp);
|
||||
reset_bdev(zram);
|
||||
}
|
||||
|
||||
static ssize_t disksize_store(struct device *dev,
|
||||
@ -1108,6 +1480,9 @@ static DEVICE_ATTR_WO(mem_limit);
|
||||
static DEVICE_ATTR_WO(mem_used_max);
|
||||
static DEVICE_ATTR_RW(max_comp_streams);
|
||||
static DEVICE_ATTR_RW(comp_algorithm);
|
||||
#ifdef CONFIG_ZRAM_WRITEBACK
|
||||
static DEVICE_ATTR_RW(backing_dev);
|
||||
#endif
|
||||
|
||||
static struct attribute *zram_disk_attrs[] = {
|
||||
&dev_attr_disksize.attr,
|
||||
@ -1118,6 +1493,9 @@ static struct attribute *zram_disk_attrs[] = {
|
||||
&dev_attr_mem_used_max.attr,
|
||||
&dev_attr_max_comp_streams.attr,
|
||||
&dev_attr_comp_algorithm.attr,
|
||||
#ifdef CONFIG_ZRAM_WRITEBACK
|
||||
&dev_attr_backing_dev.attr,
|
||||
#endif
|
||||
&dev_attr_io_stat.attr,
|
||||
&dev_attr_mm_stat.attr,
|
||||
&dev_attr_debug_stat.attr,
|
||||
|
@ -60,9 +60,10 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3;
|
||||
|
||||
/* Flags for zram pages (table[page_no].value) */
|
||||
enum zram_pageflags {
|
||||
/* Page consists entirely of zeros */
|
||||
/* Page consists the same element */
|
||||
ZRAM_SAME = ZRAM_FLAG_SHIFT,
|
||||
ZRAM_ACCESS, /* page is now accessed */
|
||||
ZRAM_WB, /* page is stored on backing_device */
|
||||
|
||||
__NR_ZRAM_PAGEFLAGS,
|
||||
};
|
||||
@ -115,5 +116,13 @@ struct zram {
|
||||
* zram is claimed so open request will be failed
|
||||
*/
|
||||
bool claim; /* Protected by bdev->bd_mutex */
|
||||
#ifdef CONFIG_ZRAM_WRITEBACK
|
||||
struct file *backing_dev;
|
||||
struct block_device *bdev;
|
||||
unsigned int old_block_size;
|
||||
unsigned long *bitmap;
|
||||
unsigned long nr_pages;
|
||||
spinlock_t bitmap_lock;
|
||||
#endif
|
||||
};
|
||||
#endif
|
||||
|
@ -4308,10 +4308,10 @@ i915_drop_caches_set(void *data, u64 val)
|
||||
|
||||
fs_reclaim_acquire(GFP_KERNEL);
|
||||
if (val & DROP_BOUND)
|
||||
i915_gem_shrink(dev_priv, LONG_MAX, I915_SHRINK_BOUND);
|
||||
i915_gem_shrink(dev_priv, LONG_MAX, NULL, I915_SHRINK_BOUND);
|
||||
|
||||
if (val & DROP_UNBOUND)
|
||||
i915_gem_shrink(dev_priv, LONG_MAX, I915_SHRINK_UNBOUND);
|
||||
i915_gem_shrink(dev_priv, LONG_MAX, NULL, I915_SHRINK_UNBOUND);
|
||||
|
||||
if (val & DROP_SHRINK_ALL)
|
||||
i915_gem_shrink_all(dev_priv);
|
||||
|
@ -3742,6 +3742,7 @@ i915_gem_object_create_internal(struct drm_i915_private *dev_priv,
|
||||
/* i915_gem_shrinker.c */
|
||||
unsigned long i915_gem_shrink(struct drm_i915_private *dev_priv,
|
||||
unsigned long target,
|
||||
unsigned long *nr_scanned,
|
||||
unsigned flags);
|
||||
#define I915_SHRINK_PURGEABLE 0x1
|
||||
#define I915_SHRINK_UNBOUND 0x2
|
||||
|
@ -2354,7 +2354,7 @@ i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
|
||||
goto err_sg;
|
||||
}
|
||||
|
||||
i915_gem_shrink(dev_priv, 2 * page_count, *s++);
|
||||
i915_gem_shrink(dev_priv, 2 * page_count, NULL, *s++);
|
||||
cond_resched();
|
||||
|
||||
/* We've tried hard to allocate the memory by reaping
|
||||
@ -5015,7 +5015,7 @@ int i915_gem_freeze_late(struct drm_i915_private *dev_priv)
|
||||
* the objects as well, see i915_gem_freeze()
|
||||
*/
|
||||
|
||||
i915_gem_shrink(dev_priv, -1UL, I915_SHRINK_UNBOUND);
|
||||
i915_gem_shrink(dev_priv, -1UL, NULL, I915_SHRINK_UNBOUND);
|
||||
i915_gem_drain_freed_objects(dev_priv);
|
||||
|
||||
mutex_lock(&dev_priv->drm.struct_mutex);
|
||||
|
@ -2062,7 +2062,7 @@ int i915_gem_gtt_prepare_pages(struct drm_i915_gem_object *obj,
|
||||
*/
|
||||
GEM_BUG_ON(obj->mm.pages == pages);
|
||||
} while (i915_gem_shrink(to_i915(obj->base.dev),
|
||||
obj->base.size >> PAGE_SHIFT,
|
||||
obj->base.size >> PAGE_SHIFT, NULL,
|
||||
I915_SHRINK_BOUND |
|
||||
I915_SHRINK_UNBOUND |
|
||||
I915_SHRINK_ACTIVE));
|
||||
|
@ -136,6 +136,7 @@ static bool unsafe_drop_pages(struct drm_i915_gem_object *obj)
|
||||
* i915_gem_shrink - Shrink buffer object caches
|
||||
* @dev_priv: i915 device
|
||||
* @target: amount of memory to make available, in pages
|
||||
* @nr_scanned: optional output for number of pages scanned (incremental)
|
||||
* @flags: control flags for selecting cache types
|
||||
*
|
||||
* This function is the main interface to the shrinker. It will try to release
|
||||
@ -158,7 +159,9 @@ static bool unsafe_drop_pages(struct drm_i915_gem_object *obj)
|
||||
*/
|
||||
unsigned long
|
||||
i915_gem_shrink(struct drm_i915_private *dev_priv,
|
||||
unsigned long target, unsigned flags)
|
||||
unsigned long target,
|
||||
unsigned long *nr_scanned,
|
||||
unsigned flags)
|
||||
{
|
||||
const struct {
|
||||
struct list_head *list;
|
||||
@ -169,6 +172,7 @@ i915_gem_shrink(struct drm_i915_private *dev_priv,
|
||||
{ NULL, 0 },
|
||||
}, *phase;
|
||||
unsigned long count = 0;
|
||||
unsigned long scanned = 0;
|
||||
bool unlock;
|
||||
|
||||
if (!shrinker_lock(dev_priv, &unlock))
|
||||
@ -249,6 +253,7 @@ i915_gem_shrink(struct drm_i915_private *dev_priv,
|
||||
count += obj->base.size >> PAGE_SHIFT;
|
||||
}
|
||||
mutex_unlock(&obj->mm.lock);
|
||||
scanned += obj->base.size >> PAGE_SHIFT;
|
||||
}
|
||||
}
|
||||
list_splice_tail(&still_in_list, phase->list);
|
||||
@ -261,6 +266,8 @@ i915_gem_shrink(struct drm_i915_private *dev_priv,
|
||||
|
||||
shrinker_unlock(dev_priv, unlock);
|
||||
|
||||
if (nr_scanned)
|
||||
*nr_scanned += scanned;
|
||||
return count;
|
||||
}
|
||||
|
||||
@ -283,7 +290,7 @@ unsigned long i915_gem_shrink_all(struct drm_i915_private *dev_priv)
|
||||
unsigned long freed;
|
||||
|
||||
intel_runtime_pm_get(dev_priv);
|
||||
freed = i915_gem_shrink(dev_priv, -1UL,
|
||||
freed = i915_gem_shrink(dev_priv, -1UL, NULL,
|
||||
I915_SHRINK_BOUND |
|
||||
I915_SHRINK_UNBOUND |
|
||||
I915_SHRINK_ACTIVE);
|
||||
@ -329,23 +336,28 @@ i915_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc)
|
||||
unsigned long freed;
|
||||
bool unlock;
|
||||
|
||||
sc->nr_scanned = 0;
|
||||
|
||||
if (!shrinker_lock(dev_priv, &unlock))
|
||||
return SHRINK_STOP;
|
||||
|
||||
freed = i915_gem_shrink(dev_priv,
|
||||
sc->nr_to_scan,
|
||||
&sc->nr_scanned,
|
||||
I915_SHRINK_BOUND |
|
||||
I915_SHRINK_UNBOUND |
|
||||
I915_SHRINK_PURGEABLE);
|
||||
if (freed < sc->nr_to_scan)
|
||||
freed += i915_gem_shrink(dev_priv,
|
||||
sc->nr_to_scan - freed,
|
||||
sc->nr_to_scan - sc->nr_scanned,
|
||||
&sc->nr_scanned,
|
||||
I915_SHRINK_BOUND |
|
||||
I915_SHRINK_UNBOUND);
|
||||
if (freed < sc->nr_to_scan && current_is_kswapd()) {
|
||||
intel_runtime_pm_get(dev_priv);
|
||||
freed += i915_gem_shrink(dev_priv,
|
||||
sc->nr_to_scan - freed,
|
||||
sc->nr_to_scan - sc->nr_scanned,
|
||||
&sc->nr_scanned,
|
||||
I915_SHRINK_ACTIVE |
|
||||
I915_SHRINK_BOUND |
|
||||
I915_SHRINK_UNBOUND);
|
||||
@ -354,7 +366,7 @@ i915_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc)
|
||||
|
||||
shrinker_unlock(dev_priv, unlock);
|
||||
|
||||
return freed;
|
||||
return sc->nr_scanned ? freed : SHRINK_STOP;
|
||||
}
|
||||
|
||||
static bool
|
||||
@ -453,7 +465,7 @@ i915_gem_shrinker_vmap(struct notifier_block *nb, unsigned long event, void *ptr
|
||||
goto out;
|
||||
|
||||
intel_runtime_pm_get(dev_priv);
|
||||
freed_pages += i915_gem_shrink(dev_priv, -1UL,
|
||||
freed_pages += i915_gem_shrink(dev_priv, -1UL, NULL,
|
||||
I915_SHRINK_BOUND |
|
||||
I915_SHRINK_UNBOUND |
|
||||
I915_SHRINK_ACTIVE |
|
||||
|
@ -1241,8 +1241,10 @@ static int btt_rw_page(struct block_device *bdev, sector_t sector,
|
||||
{
|
||||
struct btt *btt = bdev->bd_disk->private_data;
|
||||
int rc;
|
||||
unsigned int len;
|
||||
|
||||
rc = btt_do_bvec(btt, NULL, page, PAGE_SIZE, 0, is_write, sector);
|
||||
len = hpage_nr_pages(page) * PAGE_SIZE;
|
||||
rc = btt_do_bvec(btt, NULL, page, len, 0, is_write, sector);
|
||||
if (rc == 0)
|
||||
page_endio(page, is_write, 0);
|
||||
|
||||
|
@ -80,22 +80,40 @@ static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
|
||||
static void write_pmem(void *pmem_addr, struct page *page,
|
||||
unsigned int off, unsigned int len)
|
||||
{
|
||||
void *mem = kmap_atomic(page);
|
||||
unsigned int chunk;
|
||||
void *mem;
|
||||
|
||||
memcpy_flushcache(pmem_addr, mem + off, len);
|
||||
kunmap_atomic(mem);
|
||||
while (len) {
|
||||
mem = kmap_atomic(page);
|
||||
chunk = min_t(unsigned int, len, PAGE_SIZE);
|
||||
memcpy_flushcache(pmem_addr, mem + off, chunk);
|
||||
kunmap_atomic(mem);
|
||||
len -= chunk;
|
||||
off = 0;
|
||||
page++;
|
||||
pmem_addr += PAGE_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
static blk_status_t read_pmem(struct page *page, unsigned int off,
|
||||
void *pmem_addr, unsigned int len)
|
||||
{
|
||||
unsigned int chunk;
|
||||
int rc;
|
||||
void *mem = kmap_atomic(page);
|
||||
void *mem;
|
||||
|
||||
rc = memcpy_mcsafe(mem + off, pmem_addr, len);
|
||||
kunmap_atomic(mem);
|
||||
if (rc)
|
||||
return BLK_STS_IOERR;
|
||||
while (len) {
|
||||
mem = kmap_atomic(page);
|
||||
chunk = min_t(unsigned int, len, PAGE_SIZE);
|
||||
rc = memcpy_mcsafe(mem + off, pmem_addr, chunk);
|
||||
kunmap_atomic(mem);
|
||||
if (rc)
|
||||
return BLK_STS_IOERR;
|
||||
len -= chunk;
|
||||
off = 0;
|
||||
page++;
|
||||
pmem_addr += PAGE_SIZE;
|
||||
}
|
||||
return BLK_STS_OK;
|
||||
}
|
||||
|
||||
@ -188,7 +206,8 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
|
||||
struct pmem_device *pmem = bdev->bd_queue->queuedata;
|
||||
blk_status_t rc;
|
||||
|
||||
rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, is_write, sector);
|
||||
rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE,
|
||||
0, is_write, sector);
|
||||
|
||||
/*
|
||||
* The ->rw_page interface is subtle and tricky. The core
|
||||
|
@ -151,34 +151,6 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
|
||||
return FSCACHE_CHECKAUX_OKAY;
|
||||
}
|
||||
|
||||
static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data)
|
||||
{
|
||||
struct v9fs_inode *v9inode = cookie_netfs_data;
|
||||
struct pagevec pvec;
|
||||
pgoff_t first;
|
||||
int loop, nr_pages;
|
||||
|
||||
pagevec_init(&pvec, 0);
|
||||
first = 0;
|
||||
|
||||
for (;;) {
|
||||
nr_pages = pagevec_lookup(&pvec, v9inode->vfs_inode.i_mapping,
|
||||
first,
|
||||
PAGEVEC_SIZE - pagevec_count(&pvec));
|
||||
if (!nr_pages)
|
||||
break;
|
||||
|
||||
for (loop = 0; loop < nr_pages; loop++)
|
||||
ClearPageFsCache(pvec.pages[loop]);
|
||||
|
||||
first = pvec.pages[nr_pages - 1]->index + 1;
|
||||
|
||||
pvec.nr = nr_pages;
|
||||
pagevec_release(&pvec);
|
||||
cond_resched();
|
||||
}
|
||||
}
|
||||
|
||||
const struct fscache_cookie_def v9fs_cache_inode_index_def = {
|
||||
.name = "9p.inode",
|
||||
.type = FSCACHE_COOKIE_TYPE_DATAFILE,
|
||||
@ -186,7 +158,6 @@ const struct fscache_cookie_def v9fs_cache_inode_index_def = {
|
||||
.get_attr = v9fs_cache_inode_get_attr,
|
||||
.get_aux = v9fs_cache_inode_get_aux,
|
||||
.check_aux = v9fs_cache_inode_check_aux,
|
||||
.now_uncached = v9fs_cache_inode_now_uncached,
|
||||
};
|
||||
|
||||
void v9fs_cache_inode_get_cookie(struct inode *inode)
|
||||
|
@ -39,7 +39,6 @@ static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
|
||||
static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
|
||||
const void *buffer,
|
||||
uint16_t buflen);
|
||||
static void afs_vnode_cache_now_uncached(void *cookie_netfs_data);
|
||||
|
||||
struct fscache_netfs afs_cache_netfs = {
|
||||
.name = "afs",
|
||||
@ -75,7 +74,6 @@ struct fscache_cookie_def afs_vnode_cache_index_def = {
|
||||
.get_attr = afs_vnode_cache_get_attr,
|
||||
.get_aux = afs_vnode_cache_get_aux,
|
||||
.check_aux = afs_vnode_cache_check_aux,
|
||||
.now_uncached = afs_vnode_cache_now_uncached,
|
||||
};
|
||||
|
||||
/*
|
||||
@ -359,44 +357,3 @@ static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
|
||||
_leave(" = SUCCESS");
|
||||
return FSCACHE_CHECKAUX_OKAY;
|
||||
}
|
||||
|
||||
/*
|
||||
* indication the cookie is no longer uncached
|
||||
* - this function is called when the backing store currently caching a cookie
|
||||
* is removed
|
||||
* - the netfs should use this to clean up any markers indicating cached pages
|
||||
* - this is mandatory for any object that may have data
|
||||
*/
|
||||
static void afs_vnode_cache_now_uncached(void *cookie_netfs_data)
|
||||
{
|
||||
struct afs_vnode *vnode = cookie_netfs_data;
|
||||
struct pagevec pvec;
|
||||
pgoff_t first;
|
||||
int loop, nr_pages;
|
||||
|
||||
_enter("{%x,%x,%Lx}",
|
||||
vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version);
|
||||
|
||||
pagevec_init(&pvec, 0);
|
||||
first = 0;
|
||||
|
||||
for (;;) {
|
||||
/* grab a bunch of pages to clean */
|
||||
nr_pages = pagevec_lookup(&pvec, vnode->vfs_inode.i_mapping,
|
||||
first,
|
||||
PAGEVEC_SIZE - pagevec_count(&pvec));
|
||||
if (!nr_pages)
|
||||
break;
|
||||
|
||||
for (loop = 0; loop < nr_pages; loop++)
|
||||
ClearPageFsCache(pvec.pages[loop]);
|
||||
|
||||
first = pvec.pages[nr_pages - 1]->index + 1;
|
||||
|
||||
pvec.nr = nr_pages;
|
||||
pagevec_release(&pvec);
|
||||
cond_resched();
|
||||
}
|
||||
|
||||
_leave("");
|
||||
}
|
||||
|
31
fs/buffer.c
31
fs/buffer.c
@ -1627,20 +1627,17 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
|
||||
struct pagevec pvec;
|
||||
pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
|
||||
pgoff_t end;
|
||||
int i;
|
||||
int i, count;
|
||||
struct buffer_head *bh;
|
||||
struct buffer_head *head;
|
||||
|
||||
end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
|
||||
pagevec_init(&pvec, 0);
|
||||
while (index <= end && pagevec_lookup(&pvec, bd_mapping, index,
|
||||
min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
|
||||
for (i = 0; i < pagevec_count(&pvec); i++) {
|
||||
while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) {
|
||||
count = pagevec_count(&pvec);
|
||||
for (i = 0; i < count; i++) {
|
||||
struct page *page = pvec.pages[i];
|
||||
|
||||
index = page->index;
|
||||
if (index > end)
|
||||
break;
|
||||
if (!page_has_buffers(page))
|
||||
continue;
|
||||
/*
|
||||
@ -1670,7 +1667,9 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
|
||||
}
|
||||
pagevec_release(&pvec);
|
||||
cond_resched();
|
||||
index++;
|
||||
/* End of range already reached? */
|
||||
if (index > end || !index)
|
||||
break;
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(clean_bdev_aliases);
|
||||
@ -3549,10 +3548,10 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
|
||||
pagevec_init(&pvec, 0);
|
||||
|
||||
do {
|
||||
unsigned want, nr_pages, i;
|
||||
unsigned nr_pages, i;
|
||||
|
||||
want = min_t(unsigned, end - index, PAGEVEC_SIZE);
|
||||
nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want);
|
||||
nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index,
|
||||
end - 1);
|
||||
if (nr_pages == 0)
|
||||
break;
|
||||
|
||||
@ -3573,10 +3572,6 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
|
||||
lastoff < page_offset(page))
|
||||
goto check_range;
|
||||
|
||||
/* Searching done if the page index is out of range. */
|
||||
if (page->index >= end)
|
||||
goto not_found;
|
||||
|
||||
lock_page(page);
|
||||
if (likely(page->mapping == inode->i_mapping) &&
|
||||
page_has_buffers(page)) {
|
||||
@ -3589,12 +3584,6 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
|
||||
unlock_page(page);
|
||||
lastoff = page_offset(page) + PAGE_SIZE;
|
||||
}
|
||||
|
||||
/* Searching done if fewer pages returned than wanted. */
|
||||
if (nr_pages < want)
|
||||
break;
|
||||
|
||||
index = pvec.pages[i - 1]->index + 1;
|
||||
pagevec_release(&pvec);
|
||||
} while (index < end);
|
||||
|
||||
|
@ -194,36 +194,6 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux(
|
||||
return FSCACHE_CHECKAUX_OKAY;
|
||||
}
|
||||
|
||||
static void ceph_fscache_inode_now_uncached(void* cookie_netfs_data)
|
||||
{
|
||||
struct ceph_inode_info* ci = cookie_netfs_data;
|
||||
struct pagevec pvec;
|
||||
pgoff_t first;
|
||||
int loop, nr_pages;
|
||||
|
||||
pagevec_init(&pvec, 0);
|
||||
first = 0;
|
||||
|
||||
dout("ceph inode 0x%p now uncached", ci);
|
||||
|
||||
while (1) {
|
||||
nr_pages = pagevec_lookup(&pvec, ci->vfs_inode.i_mapping, first,
|
||||
PAGEVEC_SIZE - pagevec_count(&pvec));
|
||||
|
||||
if (!nr_pages)
|
||||
break;
|
||||
|
||||
for (loop = 0; loop < nr_pages; loop++)
|
||||
ClearPageFsCache(pvec.pages[loop]);
|
||||
|
||||
first = pvec.pages[nr_pages - 1]->index + 1;
|
||||
|
||||
pvec.nr = nr_pages;
|
||||
pagevec_release(&pvec);
|
||||
cond_resched();
|
||||
}
|
||||
}
|
||||
|
||||
static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
|
||||
.name = "CEPH.inode",
|
||||
.type = FSCACHE_COOKIE_TYPE_DATAFILE,
|
||||
@ -231,7 +201,6 @@ static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
|
||||
.get_attr = ceph_fscache_inode_get_attr,
|
||||
.get_aux = ceph_fscache_inode_get_aux,
|
||||
.check_aux = ceph_fscache_inode_check_aux,
|
||||
.now_uncached = ceph_fscache_inode_now_uncached,
|
||||
};
|
||||
|
||||
void ceph_fscache_register_inode_cookie(struct inode *inode)
|
||||
|
@ -292,36 +292,6 @@ fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data,
|
||||
return FSCACHE_CHECKAUX_OKAY;
|
||||
}
|
||||
|
||||
static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data)
|
||||
{
|
||||
struct cifsInodeInfo *cifsi = cookie_netfs_data;
|
||||
struct pagevec pvec;
|
||||
pgoff_t first;
|
||||
int loop, nr_pages;
|
||||
|
||||
pagevec_init(&pvec, 0);
|
||||
first = 0;
|
||||
|
||||
cifs_dbg(FYI, "%s: cifs inode 0x%p now uncached\n", __func__, cifsi);
|
||||
|
||||
for (;;) {
|
||||
nr_pages = pagevec_lookup(&pvec,
|
||||
cifsi->vfs_inode.i_mapping, first,
|
||||
PAGEVEC_SIZE - pagevec_count(&pvec));
|
||||
if (!nr_pages)
|
||||
break;
|
||||
|
||||
for (loop = 0; loop < nr_pages; loop++)
|
||||
ClearPageFsCache(pvec.pages[loop]);
|
||||
|
||||
first = pvec.pages[nr_pages - 1]->index + 1;
|
||||
|
||||
pvec.nr = nr_pages;
|
||||
pagevec_release(&pvec);
|
||||
cond_resched();
|
||||
}
|
||||
}
|
||||
|
||||
const struct fscache_cookie_def cifs_fscache_inode_object_def = {
|
||||
.name = "CIFS.uniqueid",
|
||||
.type = FSCACHE_COOKIE_TYPE_DATAFILE,
|
||||
@ -329,5 +299,4 @@ const struct fscache_cookie_def cifs_fscache_inode_object_def = {
|
||||
.get_attr = cifs_fscache_inode_get_attr,
|
||||
.get_aux = cifs_fscache_inode_get_aux,
|
||||
.check_aux = cifs_fscache_inode_check_aux,
|
||||
.now_uncached = cifs_fscache_inode_now_uncached,
|
||||
};
|
||||
|
363
fs/dax.c
363
fs/dax.c
@ -42,6 +42,9 @@
|
||||
#define DAX_WAIT_TABLE_BITS 12
|
||||
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
|
||||
|
||||
/* The 'colour' (ie low bits) within a PMD of a page offset. */
|
||||
#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
|
||||
|
||||
static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
|
||||
|
||||
static int __init init_dax_wait_table(void)
|
||||
@ -54,6 +57,40 @@ static int __init init_dax_wait_table(void)
|
||||
}
|
||||
fs_initcall(init_dax_wait_table);
|
||||
|
||||
/*
|
||||
* We use lowest available bit in exceptional entry for locking, one bit for
|
||||
* the entry size (PMD) and two more to tell us if the entry is a zero page or
|
||||
* an empty entry that is just used for locking. In total four special bits.
|
||||
*
|
||||
* If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
|
||||
* and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
|
||||
* block allocation.
|
||||
*/
|
||||
#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
|
||||
#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
|
||||
#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
|
||||
#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
|
||||
#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
|
||||
|
||||
static unsigned long dax_radix_sector(void *entry)
|
||||
{
|
||||
return (unsigned long)entry >> RADIX_DAX_SHIFT;
|
||||
}
|
||||
|
||||
static void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
|
||||
{
|
||||
return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
|
||||
((unsigned long)sector << RADIX_DAX_SHIFT) |
|
||||
RADIX_DAX_ENTRY_LOCK);
|
||||
}
|
||||
|
||||
static unsigned int dax_radix_order(void *entry)
|
||||
{
|
||||
if ((unsigned long)entry & RADIX_DAX_PMD)
|
||||
return PMD_SHIFT - PAGE_SHIFT;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int dax_is_pmd_entry(void *entry)
|
||||
{
|
||||
return (unsigned long)entry & RADIX_DAX_PMD;
|
||||
@ -66,7 +103,7 @@ static int dax_is_pte_entry(void *entry)
|
||||
|
||||
static int dax_is_zero_entry(void *entry)
|
||||
{
|
||||
return (unsigned long)entry & RADIX_DAX_HZP;
|
||||
return (unsigned long)entry & RADIX_DAX_ZERO_PAGE;
|
||||
}
|
||||
|
||||
static int dax_is_empty_entry(void *entry)
|
||||
@ -98,7 +135,7 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
|
||||
* the range covered by the PMD map to the same bit lock.
|
||||
*/
|
||||
if (dax_is_pmd_entry(entry))
|
||||
index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);
|
||||
index &= ~PG_PMD_COLOUR;
|
||||
|
||||
key->mapping = mapping;
|
||||
key->entry_start = index;
|
||||
@ -120,6 +157,31 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo
|
||||
return autoremove_wake_function(wait, mode, sync, NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* We do not necessarily hold the mapping->tree_lock when we call this
|
||||
* function so it is possible that 'entry' is no longer a valid item in the
|
||||
* radix tree. This is okay because all we really need to do is to find the
|
||||
* correct waitqueue where tasks might be waiting for that old 'entry' and
|
||||
* wake them.
|
||||
*/
|
||||
static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
|
||||
pgoff_t index, void *entry, bool wake_all)
|
||||
{
|
||||
struct exceptional_entry_key key;
|
||||
wait_queue_head_t *wq;
|
||||
|
||||
wq = dax_entry_waitqueue(mapping, index, entry, &key);
|
||||
|
||||
/*
|
||||
* Checking for locked entry and prepare_to_wait_exclusive() happens
|
||||
* under mapping->tree_lock, ditto for entry handling in our callers.
|
||||
* So at this point all tasks that could have seen our entry locked
|
||||
* must be in the waitqueue and the following check will see them.
|
||||
*/
|
||||
if (waitqueue_active(wq))
|
||||
__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
|
||||
}
|
||||
|
||||
/*
|
||||
* Check whether the given slot is locked. The function must be called with
|
||||
* mapping->tree_lock held
|
||||
@ -181,7 +243,8 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
|
||||
for (;;) {
|
||||
entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
|
||||
&slot);
|
||||
if (!entry || !radix_tree_exceptional_entry(entry) ||
|
||||
if (!entry ||
|
||||
WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
|
||||
!slot_locked(mapping, slot)) {
|
||||
if (slotp)
|
||||
*slotp = slot;
|
||||
@ -216,14 +279,9 @@ static void dax_unlock_mapping_entry(struct address_space *mapping,
|
||||
}
|
||||
|
||||
static void put_locked_mapping_entry(struct address_space *mapping,
|
||||
pgoff_t index, void *entry)
|
||||
pgoff_t index)
|
||||
{
|
||||
if (!radix_tree_exceptional_entry(entry)) {
|
||||
unlock_page(entry);
|
||||
put_page(entry);
|
||||
} else {
|
||||
dax_unlock_mapping_entry(mapping, index);
|
||||
}
|
||||
dax_unlock_mapping_entry(mapping, index);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -233,7 +291,7 @@ static void put_locked_mapping_entry(struct address_space *mapping,
|
||||
static void put_unlocked_mapping_entry(struct address_space *mapping,
|
||||
pgoff_t index, void *entry)
|
||||
{
|
||||
if (!radix_tree_exceptional_entry(entry))
|
||||
if (!entry)
|
||||
return;
|
||||
|
||||
/* We have to wake up next waiter for the radix tree entry lock */
|
||||
@ -241,15 +299,15 @@ static void put_unlocked_mapping_entry(struct address_space *mapping,
|
||||
}
|
||||
|
||||
/*
|
||||
* Find radix tree entry at given index. If it points to a page, return with
|
||||
* the page locked. If it points to the exceptional entry, return with the
|
||||
* radix tree entry locked. If the radix tree doesn't contain given index,
|
||||
* create empty exceptional entry for the index and return with it locked.
|
||||
* Find radix tree entry at given index. If it points to an exceptional entry,
|
||||
* return it with the radix tree entry locked. If the radix tree doesn't
|
||||
* contain given index, create an empty exceptional entry for the index and
|
||||
* return with it locked.
|
||||
*
|
||||
* When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
|
||||
* either return that locked entry or will return an error. This error will
|
||||
* happen if there are any 4k entries (either zero pages or DAX entries)
|
||||
* within the 2MiB range that we are requesting.
|
||||
* happen if there are any 4k entries within the 2MiB range that we are
|
||||
* requesting.
|
||||
*
|
||||
* We always favor 4k entries over 2MiB entries. There isn't a flow where we
|
||||
* evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB
|
||||
@ -276,18 +334,21 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
|
||||
spin_lock_irq(&mapping->tree_lock);
|
||||
entry = get_unlocked_mapping_entry(mapping, index, &slot);
|
||||
|
||||
if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) {
|
||||
entry = ERR_PTR(-EIO);
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
if (entry) {
|
||||
if (size_flag & RADIX_DAX_PMD) {
|
||||
if (!radix_tree_exceptional_entry(entry) ||
|
||||
dax_is_pte_entry(entry)) {
|
||||
if (dax_is_pte_entry(entry)) {
|
||||
put_unlocked_mapping_entry(mapping, index,
|
||||
entry);
|
||||
entry = ERR_PTR(-EEXIST);
|
||||
goto out_unlock;
|
||||
}
|
||||
} else { /* trying to grab a PTE entry */
|
||||
if (radix_tree_exceptional_entry(entry) &&
|
||||
dax_is_pmd_entry(entry) &&
|
||||
if (dax_is_pmd_entry(entry) &&
|
||||
(dax_is_zero_entry(entry) ||
|
||||
dax_is_empty_entry(entry))) {
|
||||
pmd_downgrade = true;
|
||||
@ -321,7 +382,7 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
|
||||
mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
|
||||
if (err) {
|
||||
if (pmd_downgrade)
|
||||
put_locked_mapping_entry(mapping, index, entry);
|
||||
put_locked_mapping_entry(mapping, index);
|
||||
return ERR_PTR(err);
|
||||
}
|
||||
spin_lock_irq(&mapping->tree_lock);
|
||||
@ -371,52 +432,12 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
|
||||
spin_unlock_irq(&mapping->tree_lock);
|
||||
return entry;
|
||||
}
|
||||
/* Normal page in radix tree? */
|
||||
if (!radix_tree_exceptional_entry(entry)) {
|
||||
struct page *page = entry;
|
||||
|
||||
get_page(page);
|
||||
spin_unlock_irq(&mapping->tree_lock);
|
||||
lock_page(page);
|
||||
/* Page got truncated? Retry... */
|
||||
if (unlikely(page->mapping != mapping)) {
|
||||
unlock_page(page);
|
||||
put_page(page);
|
||||
goto restart;
|
||||
}
|
||||
return page;
|
||||
}
|
||||
entry = lock_slot(mapping, slot);
|
||||
out_unlock:
|
||||
spin_unlock_irq(&mapping->tree_lock);
|
||||
return entry;
|
||||
}
|
||||
|
||||
/*
|
||||
* We do not necessarily hold the mapping->tree_lock when we call this
|
||||
* function so it is possible that 'entry' is no longer a valid item in the
|
||||
* radix tree. This is okay because all we really need to do is to find the
|
||||
* correct waitqueue where tasks might be waiting for that old 'entry' and
|
||||
* wake them.
|
||||
*/
|
||||
void dax_wake_mapping_entry_waiter(struct address_space *mapping,
|
||||
pgoff_t index, void *entry, bool wake_all)
|
||||
{
|
||||
struct exceptional_entry_key key;
|
||||
wait_queue_head_t *wq;
|
||||
|
||||
wq = dax_entry_waitqueue(mapping, index, entry, &key);
|
||||
|
||||
/*
|
||||
* Checking for locked entry and prepare_to_wait_exclusive() happens
|
||||
* under mapping->tree_lock, ditto for entry handling in our callers.
|
||||
* So at this point all tasks that could have seen our entry locked
|
||||
* must be in the waitqueue and the following check will see them.
|
||||
*/
|
||||
if (waitqueue_active(wq))
|
||||
__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
|
||||
}
|
||||
|
||||
static int __dax_invalidate_mapping_entry(struct address_space *mapping,
|
||||
pgoff_t index, bool trunc)
|
||||
{
|
||||
@ -426,7 +447,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping,
|
||||
|
||||
spin_lock_irq(&mapping->tree_lock);
|
||||
entry = get_unlocked_mapping_entry(mapping, index, NULL);
|
||||
if (!entry || !radix_tree_exceptional_entry(entry))
|
||||
if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
|
||||
goto out;
|
||||
if (!trunc &&
|
||||
(radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
|
||||
@ -468,50 +489,6 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
|
||||
return __dax_invalidate_mapping_entry(mapping, index, false);
|
||||
}
|
||||
|
||||
/*
|
||||
* The user has performed a load from a hole in the file. Allocating
|
||||
* a new page in the file would cause excessive storage usage for
|
||||
* workloads with sparse files. We allocate a page cache page instead.
|
||||
* We'll kick it out of the page cache if it's ever written to,
|
||||
* otherwise it will simply fall out of the page cache under memory
|
||||
* pressure without ever having been dirtied.
|
||||
*/
|
||||
static int dax_load_hole(struct address_space *mapping, void **entry,
|
||||
struct vm_fault *vmf)
|
||||
{
|
||||
struct inode *inode = mapping->host;
|
||||
struct page *page;
|
||||
int ret;
|
||||
|
||||
/* Hole page already exists? Return it... */
|
||||
if (!radix_tree_exceptional_entry(*entry)) {
|
||||
page = *entry;
|
||||
goto finish_fault;
|
||||
}
|
||||
|
||||
/* This will replace locked radix tree entry with a hole page */
|
||||
page = find_or_create_page(mapping, vmf->pgoff,
|
||||
vmf->gfp_mask | __GFP_ZERO);
|
||||
if (!page) {
|
||||
ret = VM_FAULT_OOM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
finish_fault:
|
||||
vmf->page = page;
|
||||
ret = finish_fault(vmf);
|
||||
vmf->page = NULL;
|
||||
*entry = page;
|
||||
if (!ret) {
|
||||
/* Grab reference for PTE that is now referencing the page */
|
||||
get_page(page);
|
||||
ret = VM_FAULT_NOPAGE;
|
||||
}
|
||||
out:
|
||||
trace_dax_load_hole(inode, vmf, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
|
||||
sector_t sector, size_t size, struct page *to,
|
||||
unsigned long vaddr)
|
||||
@ -552,47 +529,27 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
|
||||
unsigned long flags)
|
||||
{
|
||||
struct radix_tree_root *page_tree = &mapping->page_tree;
|
||||
int error = 0;
|
||||
bool hole_fill = false;
|
||||
void *new_entry;
|
||||
pgoff_t index = vmf->pgoff;
|
||||
|
||||
if (vmf->flags & FAULT_FLAG_WRITE)
|
||||
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
|
||||
|
||||
/* Replacing hole page with block mapping? */
|
||||
if (!radix_tree_exceptional_entry(entry)) {
|
||||
hole_fill = true;
|
||||
/*
|
||||
* Unmap the page now before we remove it from page cache below.
|
||||
* The page is locked so it cannot be faulted in again.
|
||||
*/
|
||||
unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
|
||||
PAGE_SIZE, 0);
|
||||
error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
|
||||
if (error)
|
||||
return ERR_PTR(error);
|
||||
} else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) {
|
||||
/* replacing huge zero page with PMD block mapping */
|
||||
unmap_mapping_range(mapping,
|
||||
(vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
|
||||
if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
|
||||
/* we are replacing a zero page with block mapping */
|
||||
if (dax_is_pmd_entry(entry))
|
||||
unmap_mapping_range(mapping,
|
||||
(vmf->pgoff << PAGE_SHIFT) & PMD_MASK,
|
||||
PMD_SIZE, 0);
|
||||
else /* pte entry */
|
||||
unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
|
||||
PAGE_SIZE, 0);
|
||||
}
|
||||
|
||||
spin_lock_irq(&mapping->tree_lock);
|
||||
new_entry = dax_radix_locked_entry(sector, flags);
|
||||
|
||||
if (hole_fill) {
|
||||
__delete_from_page_cache(entry, NULL);
|
||||
/* Drop pagecache reference */
|
||||
put_page(entry);
|
||||
error = __radix_tree_insert(page_tree, index,
|
||||
dax_radix_order(new_entry), new_entry);
|
||||
if (error) {
|
||||
new_entry = ERR_PTR(error);
|
||||
goto unlock;
|
||||
}
|
||||
mapping->nrexceptional++;
|
||||
} else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
|
||||
if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
|
||||
/*
|
||||
* Only swap our new entry into the radix tree if the current
|
||||
* entry is a zero page or an empty entry. If a normal PTE or
|
||||
@ -609,23 +566,14 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
|
||||
WARN_ON_ONCE(ret != entry);
|
||||
__radix_tree_replace(page_tree, node, slot,
|
||||
new_entry, NULL, NULL);
|
||||
entry = new_entry;
|
||||
}
|
||||
|
||||
if (vmf->flags & FAULT_FLAG_WRITE)
|
||||
radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
|
||||
unlock:
|
||||
|
||||
spin_unlock_irq(&mapping->tree_lock);
|
||||
if (hole_fill) {
|
||||
radix_tree_preload_end();
|
||||
/*
|
||||
* We don't need hole page anymore, it has been replaced with
|
||||
* locked radix tree entry now.
|
||||
*/
|
||||
if (mapping->a_ops->freepage)
|
||||
mapping->a_ops->freepage(entry);
|
||||
unlock_page(entry);
|
||||
put_page(entry);
|
||||
}
|
||||
return new_entry;
|
||||
return entry;
|
||||
}
|
||||
|
||||
static inline unsigned long
|
||||
@ -727,7 +675,7 @@ static int dax_writeback_one(struct block_device *bdev,
|
||||
spin_lock_irq(&mapping->tree_lock);
|
||||
entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
|
||||
/* Entry got punched out / reallocated? */
|
||||
if (!entry2 || !radix_tree_exceptional_entry(entry2))
|
||||
if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))
|
||||
goto put_unlocked;
|
||||
/*
|
||||
* Entry got reallocated elsewhere? No need to writeback. We have to
|
||||
@ -799,7 +747,7 @@ static int dax_writeback_one(struct block_device *bdev,
|
||||
trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
|
||||
dax_unlock:
|
||||
dax_read_unlock(id);
|
||||
put_locked_mapping_entry(mapping, index, entry);
|
||||
put_locked_mapping_entry(mapping, index);
|
||||
return ret;
|
||||
|
||||
put_unlocked:
|
||||
@ -874,11 +822,10 @@ EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
|
||||
|
||||
static int dax_insert_mapping(struct address_space *mapping,
|
||||
struct block_device *bdev, struct dax_device *dax_dev,
|
||||
sector_t sector, size_t size, void **entryp,
|
||||
sector_t sector, size_t size, void *entry,
|
||||
struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||
{
|
||||
unsigned long vaddr = vmf->address;
|
||||
void *entry = *entryp;
|
||||
void *ret, *kaddr;
|
||||
pgoff_t pgoff;
|
||||
int id, rc;
|
||||
@ -899,47 +846,48 @@ static int dax_insert_mapping(struct address_space *mapping,
|
||||
ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
|
||||
if (IS_ERR(ret))
|
||||
return PTR_ERR(ret);
|
||||
*entryp = ret;
|
||||
|
||||
trace_dax_insert_mapping(mapping->host, vmf, ret);
|
||||
return vm_insert_mixed(vma, vaddr, pfn);
|
||||
if (vmf->flags & FAULT_FLAG_WRITE)
|
||||
return vm_insert_mixed_mkwrite(vma, vaddr, pfn);
|
||||
else
|
||||
return vm_insert_mixed(vma, vaddr, pfn);
|
||||
}
|
||||
|
||||
/**
|
||||
* dax_pfn_mkwrite - handle first write to DAX page
|
||||
* @vmf: The description of the fault
|
||||
/*
|
||||
* The user has performed a load from a hole in the file. Allocating a new
|
||||
* page in the file would cause excessive storage usage for workloads with
|
||||
* sparse files. Instead we insert a read-only mapping of the 4k zero page.
|
||||
* If this page is ever written to we will re-fault and change the mapping to
|
||||
* point to real DAX storage instead.
|
||||
*/
|
||||
int dax_pfn_mkwrite(struct vm_fault *vmf)
|
||||
static int dax_load_hole(struct address_space *mapping, void *entry,
|
||||
struct vm_fault *vmf)
|
||||
{
|
||||
struct file *file = vmf->vma->vm_file;
|
||||
struct address_space *mapping = file->f_mapping;
|
||||
struct inode *inode = mapping->host;
|
||||
void *entry, **slot;
|
||||
pgoff_t index = vmf->pgoff;
|
||||
unsigned long vaddr = vmf->address;
|
||||
int ret = VM_FAULT_NOPAGE;
|
||||
struct page *zero_page;
|
||||
void *entry2;
|
||||
|
||||
spin_lock_irq(&mapping->tree_lock);
|
||||
entry = get_unlocked_mapping_entry(mapping, index, &slot);
|
||||
if (!entry || !radix_tree_exceptional_entry(entry)) {
|
||||
if (entry)
|
||||
put_unlocked_mapping_entry(mapping, index, entry);
|
||||
spin_unlock_irq(&mapping->tree_lock);
|
||||
trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE);
|
||||
return VM_FAULT_NOPAGE;
|
||||
zero_page = ZERO_PAGE(0);
|
||||
if (unlikely(!zero_page)) {
|
||||
ret = VM_FAULT_OOM;
|
||||
goto out;
|
||||
}
|
||||
radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
|
||||
entry = lock_slot(mapping, slot);
|
||||
spin_unlock_irq(&mapping->tree_lock);
|
||||
/*
|
||||
* If we race with somebody updating the PTE and finish_mkwrite_fault()
|
||||
* fails, we don't care. We need to return VM_FAULT_NOPAGE and retry
|
||||
* the fault in either case.
|
||||
*/
|
||||
finish_mkwrite_fault(vmf);
|
||||
put_locked_mapping_entry(mapping, index, entry);
|
||||
trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE);
|
||||
return VM_FAULT_NOPAGE;
|
||||
|
||||
entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
|
||||
RADIX_DAX_ZERO_PAGE);
|
||||
if (IS_ERR(entry2)) {
|
||||
ret = VM_FAULT_SIGBUS;
|
||||
goto out;
|
||||
}
|
||||
|
||||
vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page));
|
||||
out:
|
||||
trace_dax_load_hole(inode, vmf, ret);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
|
||||
|
||||
static bool dax_range_is_aligned(struct block_device *bdev,
|
||||
unsigned int offset, unsigned int length)
|
||||
@ -1059,6 +1007,11 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
|
||||
if (map_len > end - pos)
|
||||
map_len = end - pos;
|
||||
|
||||
/*
|
||||
* The userspace address for the memory copy has already been
|
||||
* validated via access_ok() in either vfs_read() or
|
||||
* vfs_write(), depending on which operation we are doing.
|
||||
*/
|
||||
if (iov_iter_rw(iter) == WRITE)
|
||||
map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr,
|
||||
map_len, iter);
|
||||
@ -1223,7 +1176,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
|
||||
major = VM_FAULT_MAJOR;
|
||||
}
|
||||
error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
|
||||
sector, PAGE_SIZE, &entry, vmf->vma, vmf);
|
||||
sector, PAGE_SIZE, entry, vmf->vma, vmf);
|
||||
/* -EBUSY is fine, somebody else faulted on the same PTE */
|
||||
if (error == -EBUSY)
|
||||
error = 0;
|
||||
@ -1231,7 +1184,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
|
||||
case IOMAP_UNWRITTEN:
|
||||
case IOMAP_HOLE:
|
||||
if (!(vmf->flags & FAULT_FLAG_WRITE)) {
|
||||
vmf_ret = dax_load_hole(mapping, &entry, vmf);
|
||||
vmf_ret = dax_load_hole(mapping, entry, vmf);
|
||||
goto finish_iomap;
|
||||
}
|
||||
/*FALLTHRU*/
|
||||
@ -1258,21 +1211,15 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
|
||||
ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
|
||||
}
|
||||
unlock_entry:
|
||||
put_locked_mapping_entry(mapping, vmf->pgoff, entry);
|
||||
put_locked_mapping_entry(mapping, vmf->pgoff);
|
||||
out:
|
||||
trace_dax_pte_fault_done(inode, vmf, vmf_ret);
|
||||
return vmf_ret;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_FS_DAX_PMD
|
||||
/*
|
||||
* The 'colour' (ie low bits) within a PMD of a page offset. This comes up
|
||||
* more often than one might expect in the below functions.
|
||||
*/
|
||||
#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
|
||||
|
||||
static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
|
||||
loff_t pos, void **entryp)
|
||||
loff_t pos, void *entry)
|
||||
{
|
||||
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
|
||||
const sector_t sector = dax_iomap_sector(iomap, pos);
|
||||
@ -1283,7 +1230,7 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
|
||||
void *ret = NULL, *kaddr;
|
||||
long length = 0;
|
||||
pgoff_t pgoff;
|
||||
pfn_t pfn;
|
||||
pfn_t pfn = {};
|
||||
int id;
|
||||
|
||||
if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
|
||||
@ -1303,11 +1250,10 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
|
||||
goto unlock_fallback;
|
||||
dax_read_unlock(id);
|
||||
|
||||
ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector,
|
||||
ret = dax_insert_mapping_entry(mapping, vmf, entry, sector,
|
||||
RADIX_DAX_PMD);
|
||||
if (IS_ERR(ret))
|
||||
goto fallback;
|
||||
*entryp = ret;
|
||||
|
||||
trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
|
||||
return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
|
||||
@ -1321,7 +1267,7 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
|
||||
}
|
||||
|
||||
static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
|
||||
void **entryp)
|
||||
void *entry)
|
||||
{
|
||||
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
|
||||
unsigned long pmd_addr = vmf->address & PMD_MASK;
|
||||
@ -1336,11 +1282,10 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
|
||||
if (unlikely(!zero_page))
|
||||
goto fallback;
|
||||
|
||||
ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
|
||||
RADIX_DAX_PMD | RADIX_DAX_HZP);
|
||||
ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
|
||||
RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE);
|
||||
if (IS_ERR(ret))
|
||||
goto fallback;
|
||||
*entryp = ret;
|
||||
|
||||
ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
|
||||
if (!pmd_none(*(vmf->pmd))) {
|
||||
@ -1416,10 +1361,10 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
|
||||
goto fallback;
|
||||
|
||||
/*
|
||||
* grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
|
||||
* PMD or a HZP entry. If it can't (because a 4k page is already in
|
||||
* the tree, for instance), it will return -EEXIST and we just fall
|
||||
* back to 4k entries.
|
||||
* grab_mapping_entry() will make sure we get a 2MiB empty entry, a
|
||||
* 2MiB zero page entry or a DAX PMD. If it can't (because a 4k page
|
||||
* is already in the tree, for instance), it will return -EEXIST and
|
||||
* we just fall back to 4k entries.
|
||||
*/
|
||||
entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
|
||||
if (IS_ERR(entry))
|
||||
@ -1452,13 +1397,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
|
||||
|
||||
switch (iomap.type) {
|
||||
case IOMAP_MAPPED:
|
||||
result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry);
|
||||
result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry);
|
||||
break;
|
||||
case IOMAP_UNWRITTEN:
|
||||
case IOMAP_HOLE:
|
||||
if (WARN_ON_ONCE(write))
|
||||
break;
|
||||
result = dax_pmd_load_hole(vmf, &iomap, &entry);
|
||||
result = dax_pmd_load_hole(vmf, &iomap, entry);
|
||||
break;
|
||||
default:
|
||||
WARN_ON_ONCE(1);
|
||||
@ -1481,7 +1426,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
|
||||
&iomap);
|
||||
}
|
||||
unlock_entry:
|
||||
put_locked_mapping_entry(mapping, pgoff, entry);
|
||||
put_locked_mapping_entry(mapping, pgoff);
|
||||
fallback:
|
||||
if (result == VM_FAULT_FALLBACK) {
|
||||
split_huge_pmd(vma, vmf->pmd, vmf->address);
|
||||
|
@ -107,29 +107,6 @@ static int ext2_dax_fault(struct vm_fault *vmf)
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int ext2_dax_pfn_mkwrite(struct vm_fault *vmf)
|
||||
{
|
||||
struct inode *inode = file_inode(vmf->vma->vm_file);
|
||||
struct ext2_inode_info *ei = EXT2_I(inode);
|
||||
loff_t size;
|
||||
int ret;
|
||||
|
||||
sb_start_pagefault(inode->i_sb);
|
||||
file_update_time(vmf->vma->vm_file);
|
||||
down_read(&ei->dax_sem);
|
||||
|
||||
/* check that the faulting page hasn't raced with truncate */
|
||||
size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
|
||||
if (vmf->pgoff >= size)
|
||||
ret = VM_FAULT_SIGBUS;
|
||||
else
|
||||
ret = dax_pfn_mkwrite(vmf);
|
||||
|
||||
up_read(&ei->dax_sem);
|
||||
sb_end_pagefault(inode->i_sb);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static const struct vm_operations_struct ext2_dax_vm_ops = {
|
||||
.fault = ext2_dax_fault,
|
||||
/*
|
||||
@ -138,7 +115,7 @@ static const struct vm_operations_struct ext2_dax_vm_ops = {
|
||||
* will always fail and fall back to regular faults.
|
||||
*/
|
||||
.page_mkwrite = ext2_dax_fault,
|
||||
.pfn_mkwrite = ext2_dax_pfn_mkwrite,
|
||||
.pfn_mkwrite = ext2_dax_fault,
|
||||
};
|
||||
|
||||
static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
|
||||
|
@ -324,41 +324,11 @@ static int ext4_dax_fault(struct vm_fault *vmf)
|
||||
return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
|
||||
* handler we check for races against truncate. Note that since we cycle through
|
||||
* i_mmap_sem, we are sure that also any hole punching that began before we
|
||||
* were called is finished by now and so if it included part of the file we
|
||||
* are working on, our pte will get unmapped and the check for pte_same() in
|
||||
* wp_pfn_shared() fails. Thus fault gets retried and things work out as
|
||||
* desired.
|
||||
*/
|
||||
static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf)
|
||||
{
|
||||
struct inode *inode = file_inode(vmf->vma->vm_file);
|
||||
struct super_block *sb = inode->i_sb;
|
||||
loff_t size;
|
||||
int ret;
|
||||
|
||||
sb_start_pagefault(sb);
|
||||
file_update_time(vmf->vma->vm_file);
|
||||
down_read(&EXT4_I(inode)->i_mmap_sem);
|
||||
size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
|
||||
if (vmf->pgoff >= size)
|
||||
ret = VM_FAULT_SIGBUS;
|
||||
else
|
||||
ret = dax_pfn_mkwrite(vmf);
|
||||
up_read(&EXT4_I(inode)->i_mmap_sem);
|
||||
sb_end_pagefault(sb);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static const struct vm_operations_struct ext4_dax_vm_ops = {
|
||||
.fault = ext4_dax_fault,
|
||||
.huge_fault = ext4_dax_huge_fault,
|
||||
.page_mkwrite = ext4_dax_fault,
|
||||
.pfn_mkwrite = ext4_dax_pfn_mkwrite,
|
||||
.pfn_mkwrite = ext4_dax_fault,
|
||||
};
|
||||
#else
|
||||
#define ext4_dax_vm_ops ext4_file_vm_ops
|
||||
@ -507,12 +477,11 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
|
||||
|
||||
pagevec_init(&pvec, 0);
|
||||
do {
|
||||
int i, num;
|
||||
int i;
|
||||
unsigned long nr_pages;
|
||||
|
||||
num = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1;
|
||||
nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
|
||||
(pgoff_t)num);
|
||||
nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping,
|
||||
&index, end);
|
||||
if (nr_pages == 0)
|
||||
break;
|
||||
|
||||
@ -531,9 +500,6 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (page->index > end)
|
||||
goto out;
|
||||
|
||||
lock_page(page);
|
||||
|
||||
if (unlikely(page->mapping != inode->i_mapping)) {
|
||||
@ -576,14 +542,10 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
|
||||
unlock_page(page);
|
||||
}
|
||||
|
||||
/* The no. of pages is less than our desired, we are done. */
|
||||
if (nr_pages < num)
|
||||
break;
|
||||
|
||||
index = pvec.pages[i - 1]->index + 1;
|
||||
pagevec_release(&pvec);
|
||||
} while (index <= end);
|
||||
|
||||
/* There are no pages up to endoff - that would be a hole in there. */
|
||||
if (whence == SEEK_HOLE && lastoff < endoff) {
|
||||
found = 1;
|
||||
*offset = lastoff;
|
||||
|
@ -1720,13 +1720,12 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
|
||||
|
||||
pagevec_init(&pvec, 0);
|
||||
while (index <= end) {
|
||||
nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
|
||||
nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end);
|
||||
if (nr_pages == 0)
|
||||
break;
|
||||
for (i = 0; i < nr_pages; i++) {
|
||||
struct page *page = pvec.pages[i];
|
||||
if (page->index > end)
|
||||
break;
|
||||
|
||||
BUG_ON(!PageLocked(page));
|
||||
BUG_ON(PageWriteback(page));
|
||||
if (invalidate) {
|
||||
@ -1737,7 +1736,6 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
|
||||
}
|
||||
unlock_page(page);
|
||||
}
|
||||
index = pvec.pages[nr_pages - 1]->index + 1;
|
||||
pagevec_release(&pvec);
|
||||
}
|
||||
}
|
||||
@ -2348,17 +2346,13 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
|
||||
|
||||
pagevec_init(&pvec, 0);
|
||||
while (start <= end) {
|
||||
nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start,
|
||||
PAGEVEC_SIZE);
|
||||
nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping,
|
||||
&start, end);
|
||||
if (nr_pages == 0)
|
||||
break;
|
||||
for (i = 0; i < nr_pages; i++) {
|
||||
struct page *page = pvec.pages[i];
|
||||
|
||||
if (page->index > end)
|
||||
break;
|
||||
/* Up to 'end' pages must be contiguous */
|
||||
BUG_ON(page->index != start);
|
||||
bh = head = page_buffers(page);
|
||||
do {
|
||||
if (lblk < mpd->map.m_lblk)
|
||||
@ -2403,7 +2397,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
|
||||
pagevec_release(&pvec);
|
||||
return err;
|
||||
}
|
||||
start++;
|
||||
}
|
||||
pagevec_release(&pvec);
|
||||
}
|
||||
|
@ -1178,11 +1178,10 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
|
||||
pagevec_init(&pvec, 0);
|
||||
next = 0;
|
||||
do {
|
||||
if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE))
|
||||
if (!pagevec_lookup(&pvec, mapping, &next))
|
||||
break;
|
||||
for (i = 0; i < pagevec_count(&pvec); i++) {
|
||||
struct page *page = pvec.pages[i];
|
||||
next = page->index;
|
||||
if (PageFsCache(page)) {
|
||||
__fscache_wait_on_page_write(cookie, page);
|
||||
__fscache_uncache_page(cookie, page);
|
||||
@ -1190,7 +1189,7 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
|
||||
}
|
||||
pagevec_release(&pvec);
|
||||
cond_resched();
|
||||
} while (++next);
|
||||
} while (next);
|
||||
|
||||
_leave("");
|
||||
}
|
||||
|
@ -401,9 +401,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
|
||||
const pgoff_t end = lend >> huge_page_shift(h);
|
||||
struct vm_area_struct pseudo_vma;
|
||||
struct pagevec pvec;
|
||||
pgoff_t next;
|
||||
pgoff_t next, index;
|
||||
int i, freed = 0;
|
||||
long lookup_nr = PAGEVEC_SIZE;
|
||||
bool truncate_op = (lend == LLONG_MAX);
|
||||
|
||||
memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
|
||||
@ -411,34 +410,20 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
|
||||
pagevec_init(&pvec, 0);
|
||||
next = start;
|
||||
while (next < end) {
|
||||
/*
|
||||
* Don't grab more pages than the number left in the range.
|
||||
*/
|
||||
if (end - next < lookup_nr)
|
||||
lookup_nr = end - next;
|
||||
|
||||
/*
|
||||
* When no more pages are found, we are done.
|
||||
*/
|
||||
if (!pagevec_lookup(&pvec, mapping, next, lookup_nr))
|
||||
if (!pagevec_lookup_range(&pvec, mapping, &next, end - 1))
|
||||
break;
|
||||
|
||||
for (i = 0; i < pagevec_count(&pvec); ++i) {
|
||||
struct page *page = pvec.pages[i];
|
||||
u32 hash;
|
||||
|
||||
/*
|
||||
* The page (index) could be beyond end. This is
|
||||
* only possible in the punch hole case as end is
|
||||
* max page offset in the truncate case.
|
||||
*/
|
||||
next = page->index;
|
||||
if (next >= end)
|
||||
break;
|
||||
|
||||
index = page->index;
|
||||
hash = hugetlb_fault_mutex_hash(h, current->mm,
|
||||
&pseudo_vma,
|
||||
mapping, next, 0);
|
||||
mapping, index, 0);
|
||||
mutex_lock(&hugetlb_fault_mutex_table[hash]);
|
||||
|
||||
/*
|
||||
@ -455,8 +440,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
|
||||
|
||||
i_mmap_lock_write(mapping);
|
||||
hugetlb_vmdelete_list(&mapping->i_mmap,
|
||||
next * pages_per_huge_page(h),
|
||||
(next + 1) * pages_per_huge_page(h));
|
||||
index * pages_per_huge_page(h),
|
||||
(index + 1) * pages_per_huge_page(h));
|
||||
i_mmap_unlock_write(mapping);
|
||||
}
|
||||
|
||||
@ -475,14 +460,13 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
|
||||
freed++;
|
||||
if (!truncate_op) {
|
||||
if (unlikely(hugetlb_unreserve_pages(inode,
|
||||
next, next + 1, 1)))
|
||||
index, index + 1, 1)))
|
||||
hugetlb_fix_reserve_counts(inode);
|
||||
}
|
||||
|
||||
unlock_page(page);
|
||||
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
|
||||
}
|
||||
++next;
|
||||
huge_pagevec_release(&pvec);
|
||||
cond_resched();
|
||||
}
|
||||
|
@ -251,45 +251,6 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
|
||||
return FSCACHE_CHECKAUX_OKAY;
|
||||
}
|
||||
|
||||
/*
|
||||
* Indication from FS-Cache that the cookie is no longer cached
|
||||
* - This function is called when the backing store currently caching a cookie
|
||||
* is removed
|
||||
* - The netfs should use this to clean up any markers indicating cached pages
|
||||
* - This is mandatory for any object that may have data
|
||||
*/
|
||||
static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data)
|
||||
{
|
||||
struct nfs_inode *nfsi = cookie_netfs_data;
|
||||
struct pagevec pvec;
|
||||
pgoff_t first;
|
||||
int loop, nr_pages;
|
||||
|
||||
pagevec_init(&pvec, 0);
|
||||
first = 0;
|
||||
|
||||
dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi);
|
||||
|
||||
for (;;) {
|
||||
/* grab a bunch of pages to unmark */
|
||||
nr_pages = pagevec_lookup(&pvec,
|
||||
nfsi->vfs_inode.i_mapping,
|
||||
first,
|
||||
PAGEVEC_SIZE - pagevec_count(&pvec));
|
||||
if (!nr_pages)
|
||||
break;
|
||||
|
||||
for (loop = 0; loop < nr_pages; loop++)
|
||||
ClearPageFsCache(pvec.pages[loop]);
|
||||
|
||||
first = pvec.pages[nr_pages - 1]->index + 1;
|
||||
|
||||
pvec.nr = nr_pages;
|
||||
pagevec_release(&pvec);
|
||||
cond_resched();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Get an extra reference on a read context.
|
||||
* - This function can be absent if the completion function doesn't require a
|
||||
@ -330,7 +291,6 @@ const struct fscache_cookie_def nfs_fscache_inode_object_def = {
|
||||
.get_attr = nfs_fscache_inode_get_attr,
|
||||
.get_aux = nfs_fscache_inode_get_aux,
|
||||
.check_aux = nfs_fscache_inode_check_aux,
|
||||
.now_uncached = nfs_fscache_inode_now_uncached,
|
||||
.get_context = nfs_fh_get_context,
|
||||
.put_context = nfs_fh_put_context,
|
||||
};
|
||||
|
@ -312,10 +312,9 @@ void nilfs_copy_back_pages(struct address_space *dmap,
|
||||
|
||||
pagevec_init(&pvec, 0);
|
||||
repeat:
|
||||
n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE);
|
||||
n = pagevec_lookup(&pvec, smap, &index);
|
||||
if (!n)
|
||||
return;
|
||||
index = pvec.pages[n - 1]->index + 1;
|
||||
|
||||
for (i = 0; i < pagevec_count(&pvec); i++) {
|
||||
struct page *page = pvec.pages[i], *dpage;
|
||||
|
@ -221,7 +221,7 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
|
||||
/*
|
||||
* Set the access or default ACL of an inode.
|
||||
*/
|
||||
int ocfs2_set_acl(handle_t *handle,
|
||||
static int ocfs2_set_acl(handle_t *handle,
|
||||
struct inode *inode,
|
||||
struct buffer_head *di_bh,
|
||||
int type,
|
||||
|
@ -28,13 +28,6 @@ struct ocfs2_acl_entry {
|
||||
|
||||
struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type);
|
||||
int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type);
|
||||
int ocfs2_set_acl(handle_t *handle,
|
||||
struct inode *inode,
|
||||
struct buffer_head *di_bh,
|
||||
int type,
|
||||
struct posix_acl *acl,
|
||||
struct ocfs2_alloc_context *meta_ac,
|
||||
struct ocfs2_alloc_context *data_ac);
|
||||
extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *);
|
||||
extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
|
||||
struct buffer_head *, struct buffer_head *,
|
||||
|
@ -955,8 +955,7 @@ int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
|
||||
/*
|
||||
* How many free extents have we got before we need more meta data?
|
||||
*/
|
||||
int ocfs2_num_free_extents(struct ocfs2_super *osb,
|
||||
struct ocfs2_extent_tree *et)
|
||||
int ocfs2_num_free_extents(struct ocfs2_extent_tree *et)
|
||||
{
|
||||
int retval;
|
||||
struct ocfs2_extent_list *el = NULL;
|
||||
@ -1933,14 +1932,12 @@ int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
|
||||
* the new changes.
|
||||
*
|
||||
* left_rec: the record on the left.
|
||||
* left_child_el: is the child list pointed to by left_rec
|
||||
* right_rec: the record to the right of left_rec
|
||||
* right_child_el: is the child list pointed to by right_rec
|
||||
*
|
||||
* By definition, this only works on interior nodes.
|
||||
*/
|
||||
static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
|
||||
struct ocfs2_extent_list *left_child_el,
|
||||
struct ocfs2_extent_rec *right_rec,
|
||||
struct ocfs2_extent_list *right_child_el)
|
||||
{
|
||||
@ -2003,7 +2000,7 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
|
||||
*/
|
||||
BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
|
||||
|
||||
ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
|
||||
ocfs2_adjust_adjacent_records(&root_el->l_recs[i],
|
||||
&root_el->l_recs[i + 1], right_el);
|
||||
}
|
||||
|
||||
@ -2060,8 +2057,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
|
||||
el = right_path->p_node[i].el;
|
||||
right_rec = &el->l_recs[0];
|
||||
|
||||
ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
|
||||
right_el);
|
||||
ocfs2_adjust_adjacent_records(left_rec, right_rec, right_el);
|
||||
|
||||
ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
|
||||
ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
|
||||
@ -2509,7 +2505,7 @@ static int ocfs2_rotate_tree_right(handle_t *handle,
|
||||
|
||||
static int ocfs2_update_edge_lengths(handle_t *handle,
|
||||
struct ocfs2_extent_tree *et,
|
||||
int subtree_index, struct ocfs2_path *path)
|
||||
struct ocfs2_path *path)
|
||||
{
|
||||
int i, idx, ret;
|
||||
struct ocfs2_extent_rec *rec;
|
||||
@ -2755,8 +2751,7 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
|
||||
if (del_right_subtree) {
|
||||
ocfs2_unlink_subtree(handle, et, left_path, right_path,
|
||||
subtree_index, dealloc);
|
||||
ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
|
||||
left_path);
|
||||
ret = ocfs2_update_edge_lengths(handle, et, left_path);
|
||||
if (ret) {
|
||||
mlog_errno(ret);
|
||||
goto out;
|
||||
@ -3060,8 +3055,7 @@ static int ocfs2_remove_rightmost_path(handle_t *handle,
|
||||
|
||||
ocfs2_unlink_subtree(handle, et, left_path, path,
|
||||
subtree_index, dealloc);
|
||||
ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
|
||||
left_path);
|
||||
ret = ocfs2_update_edge_lengths(handle, et, left_path);
|
||||
if (ret) {
|
||||
mlog_errno(ret);
|
||||
goto out;
|
||||
@ -4790,7 +4784,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
|
||||
if (mark_unwritten)
|
||||
flags = OCFS2_EXT_UNWRITTEN;
|
||||
|
||||
free_extents = ocfs2_num_free_extents(osb, et);
|
||||
free_extents = ocfs2_num_free_extents(et);
|
||||
if (free_extents < 0) {
|
||||
status = free_extents;
|
||||
mlog_errno(status);
|
||||
@ -5668,7 +5662,7 @@ static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
|
||||
|
||||
*ac = NULL;
|
||||
|
||||
num_free_extents = ocfs2_num_free_extents(osb, et);
|
||||
num_free_extents = ocfs2_num_free_extents(et);
|
||||
if (num_free_extents < 0) {
|
||||
ret = num_free_extents;
|
||||
mlog_errno(ret);
|
||||
|
@ -144,8 +144,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
|
||||
struct ocfs2_cached_dealloc_ctxt *dealloc,
|
||||
u64 refcount_loc, bool refcount_tree_locked);
|
||||
|
||||
int ocfs2_num_free_extents(struct ocfs2_super *osb,
|
||||
struct ocfs2_extent_tree *et);
|
||||
int ocfs2_num_free_extents(struct ocfs2_extent_tree *et);
|
||||
|
||||
/*
|
||||
* how many new metadata chunks would an allocation need at maximum?
|
||||
|
@ -505,8 +505,7 @@ static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc,
|
||||
}
|
||||
}
|
||||
|
||||
static void o2hb_wait_on_io(struct o2hb_region *reg,
|
||||
struct o2hb_bio_wait_ctxt *wc)
|
||||
static void o2hb_wait_on_io(struct o2hb_bio_wait_ctxt *wc)
|
||||
{
|
||||
o2hb_bio_wait_dec(wc, 1);
|
||||
wait_for_completion(&wc->wc_io_complete);
|
||||
@ -608,7 +607,7 @@ static int o2hb_read_slots(struct o2hb_region *reg,
|
||||
status = 0;
|
||||
|
||||
bail_and_wait:
|
||||
o2hb_wait_on_io(reg, &wc);
|
||||
o2hb_wait_on_io(&wc);
|
||||
if (wc.wc_error && !status)
|
||||
status = wc.wc_error;
|
||||
|
||||
@ -1162,7 +1161,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
|
||||
* before we can go to steady state. This ensures that
|
||||
* people we find in our steady state have seen us.
|
||||
*/
|
||||
o2hb_wait_on_io(reg, &write_wc);
|
||||
o2hb_wait_on_io(&write_wc);
|
||||
if (write_wc.wc_error) {
|
||||
/* Do not re-arm the write timeout on I/O error - we
|
||||
* can't be sure that the new block ever made it to
|
||||
@ -1275,7 +1274,7 @@ static int o2hb_thread(void *data)
|
||||
o2hb_prepare_block(reg, 0);
|
||||
ret = o2hb_issue_node_write(reg, &write_wc);
|
||||
if (ret == 0)
|
||||
o2hb_wait_on_io(reg, &write_wc);
|
||||
o2hb_wait_on_io(&write_wc);
|
||||
else
|
||||
mlog_errno(ret);
|
||||
}
|
||||
@ -2576,22 +2575,6 @@ void o2hb_unregister_callback(const char *region_uuid,
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(o2hb_unregister_callback);
|
||||
|
||||
int o2hb_check_node_heartbeating(u8 node_num)
|
||||
{
|
||||
unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
|
||||
|
||||
o2hb_fill_node_map(testing_map, sizeof(testing_map));
|
||||
if (!test_bit(node_num, testing_map)) {
|
||||
mlog(ML_HEARTBEAT,
|
||||
"node (%u) does not have heartbeating enabled.\n",
|
||||
node_num);
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
|
||||
|
||||
int o2hb_check_node_heartbeating_no_sem(u8 node_num)
|
||||
{
|
||||
unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
|
||||
@ -2626,23 +2609,6 @@ int o2hb_check_node_heartbeating_from_callback(u8 node_num)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback);
|
||||
|
||||
/* Makes sure our local node is configured with a node number, and is
|
||||
* heartbeating. */
|
||||
int o2hb_check_local_node_heartbeating(void)
|
||||
{
|
||||
u8 node_num;
|
||||
|
||||
/* if this node was set then we have networking */
|
||||
node_num = o2nm_this_node();
|
||||
if (node_num == O2NM_MAX_NODES) {
|
||||
mlog(ML_HEARTBEAT, "this node has not been configured.\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
return o2hb_check_node_heartbeating(node_num);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating);
|
||||
|
||||
/*
|
||||
* this is just a hack until we get the plumbing which flips file systems
|
||||
* read only and drops the hb ref instead of killing the node dead.
|
||||
|
@ -3249,7 +3249,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
|
||||
spin_unlock(&OCFS2_I(dir)->ip_lock);
|
||||
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir),
|
||||
parent_fe_bh);
|
||||
num_free_extents = ocfs2_num_free_extents(osb, &et);
|
||||
num_free_extents = ocfs2_num_free_extents(&et);
|
||||
if (num_free_extents < 0) {
|
||||
status = num_free_extents;
|
||||
mlog_errno(status);
|
||||
|
@ -713,13 +713,6 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
|
||||
return status;
|
||||
}
|
||||
|
||||
int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
|
||||
u32 clusters_to_add, int mark_unwritten)
|
||||
{
|
||||
return __ocfs2_extend_allocation(inode, logical_start,
|
||||
clusters_to_add, mark_unwritten);
|
||||
}
|
||||
|
||||
/*
|
||||
* While a write will already be ordering the data, a truncate will not.
|
||||
* Thus, we need to explicitly order the zeroed pages.
|
||||
|
@ -1348,7 +1348,6 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
|
||||
ocfs2_schedule_truncate_log_flush(osb, 0);
|
||||
|
||||
osb->local_alloc_copy = NULL;
|
||||
osb->dirty = 0;
|
||||
|
||||
/* queue to recover orphan slots for all offline slots */
|
||||
ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
|
||||
|
@ -175,7 +175,7 @@ static int ocfs2_lock_allocators_move_extents(struct inode *inode,
|
||||
unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
|
||||
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
|
||||
|
||||
num_free_extents = ocfs2_num_free_extents(osb, et);
|
||||
num_free_extents = ocfs2_num_free_extents(et);
|
||||
if (num_free_extents < 0) {
|
||||
ret = num_free_extents;
|
||||
mlog_errno(ret);
|
||||
|
@ -320,7 +320,6 @@ struct ocfs2_super
|
||||
u64 system_dir_blkno;
|
||||
u64 bitmap_blkno;
|
||||
u32 bitmap_cpg;
|
||||
u8 *uuid;
|
||||
char *uuid_str;
|
||||
u32 uuid_hash;
|
||||
u8 *vol_label;
|
||||
@ -388,9 +387,8 @@ struct ocfs2_super
|
||||
unsigned int osb_resv_level;
|
||||
unsigned int osb_dir_resv_level;
|
||||
|
||||
/* Next three fields are for local node slot recovery during
|
||||
/* Next two fields are for local node slot recovery during
|
||||
* mount. */
|
||||
int dirty;
|
||||
struct ocfs2_dinode *local_alloc_copy;
|
||||
struct ocfs2_quota_recovery *quota_rec;
|
||||
|
||||
|
@ -2851,7 +2851,7 @@ static int ocfs2_lock_refcount_allocators(struct super_block *sb,
|
||||
int *credits)
|
||||
{
|
||||
int ret = 0, meta_add = 0;
|
||||
int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et);
|
||||
int num_free_extents = ocfs2_num_free_extents(et);
|
||||
|
||||
if (num_free_extents < 0) {
|
||||
ret = num_free_extents;
|
||||
|
@ -2700,7 +2700,7 @@ int ocfs2_lock_allocators(struct inode *inode,
|
||||
|
||||
BUG_ON(clusters_to_add != 0 && data_ac == NULL);
|
||||
|
||||
num_free_extents = ocfs2_num_free_extents(osb, et);
|
||||
num_free_extents = ocfs2_num_free_extents(et);
|
||||
if (num_free_extents < 0) {
|
||||
ret = num_free_extents;
|
||||
mlog_errno(ret);
|
||||
|
@ -2486,7 +2486,6 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
|
||||
if (dirty) {
|
||||
/* Recovery will be completed after we've mounted the
|
||||
* rest of the volume. */
|
||||
osb->dirty = 1;
|
||||
osb->local_alloc_copy = local_alloc;
|
||||
local_alloc = NULL;
|
||||
}
|
||||
|
@ -6800,7 +6800,7 @@ static int ocfs2_lock_reflink_xattr_rec_allocators(
|
||||
*credits += 1;
|
||||
|
||||
/* count in the xattr tree change. */
|
||||
num_free_extents = ocfs2_num_free_extents(osb, xt_et);
|
||||
num_free_extents = ocfs2_num_free_extents(xt_et);
|
||||
if (num_free_extents < 0) {
|
||||
ret = num_free_extents;
|
||||
mlog_errno(ret);
|
||||
|
@ -2931,6 +2931,7 @@ static const struct pid_entry tgid_base_stuff[] = {
|
||||
#ifdef CONFIG_PROC_PAGE_MONITOR
|
||||
REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
|
||||
REG("smaps", S_IRUGO, proc_pid_smaps_operations),
|
||||
REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
|
||||
REG("pagemap", S_IRUSR, proc_pagemap_operations),
|
||||
#endif
|
||||
#ifdef CONFIG_SECURITY
|
||||
@ -3324,6 +3325,7 @@ static const struct pid_entry tid_base_stuff[] = {
|
||||
#ifdef CONFIG_PROC_PAGE_MONITOR
|
||||
REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
|
||||
REG("smaps", S_IRUGO, proc_tid_smaps_operations),
|
||||
REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
|
||||
REG("pagemap", S_IRUSR, proc_pagemap_operations),
|
||||
#endif
|
||||
#ifdef CONFIG_SECURITY
|
||||
|
@ -269,10 +269,12 @@ extern int proc_remount(struct super_block *, int *, char *);
|
||||
/*
|
||||
* task_[no]mmu.c
|
||||
*/
|
||||
struct mem_size_stats;
|
||||
struct proc_maps_private {
|
||||
struct inode *inode;
|
||||
struct task_struct *task;
|
||||
struct mm_struct *mm;
|
||||
struct mem_size_stats *rollup;
|
||||
#ifdef CONFIG_MMU
|
||||
struct vm_area_struct *tail_vma;
|
||||
#endif
|
||||
@ -288,6 +290,7 @@ extern const struct file_operations proc_tid_maps_operations;
|
||||
extern const struct file_operations proc_pid_numa_maps_operations;
|
||||
extern const struct file_operations proc_tid_numa_maps_operations;
|
||||
extern const struct file_operations proc_pid_smaps_operations;
|
||||
extern const struct file_operations proc_pid_smaps_rollup_operations;
|
||||
extern const struct file_operations proc_tid_smaps_operations;
|
||||
extern const struct file_operations proc_clear_refs_operations;
|
||||
extern const struct file_operations proc_pagemap_operations;
|
||||
|
@ -80,7 +80,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
|
||||
show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]);
|
||||
show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]);
|
||||
show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]);
|
||||
show_val_kb(m, "Mlocked: ", global_page_state(NR_MLOCK));
|
||||
show_val_kb(m, "Mlocked: ", global_zone_page_state(NR_MLOCK));
|
||||
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
show_val_kb(m, "HighTotal: ", i.totalhigh);
|
||||
@ -114,9 +114,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
|
||||
show_val_kb(m, "SUnreclaim: ",
|
||||
global_node_page_state(NR_SLAB_UNRECLAIMABLE));
|
||||
seq_printf(m, "KernelStack: %8lu kB\n",
|
||||
global_page_state(NR_KERNEL_STACK_KB));
|
||||
global_zone_page_state(NR_KERNEL_STACK_KB));
|
||||
show_val_kb(m, "PageTables: ",
|
||||
global_page_state(NR_PAGETABLE));
|
||||
global_zone_page_state(NR_PAGETABLE));
|
||||
#ifdef CONFIG_QUICKLIST
|
||||
show_val_kb(m, "Quicklists: ", quicklist_total_size());
|
||||
#endif
|
||||
@ -124,7 +124,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
|
||||
show_val_kb(m, "NFS_Unstable: ",
|
||||
global_node_page_state(NR_UNSTABLE_NFS));
|
||||
show_val_kb(m, "Bounce: ",
|
||||
global_page_state(NR_BOUNCE));
|
||||
global_zone_page_state(NR_BOUNCE));
|
||||
show_val_kb(m, "WritebackTmp: ",
|
||||
global_node_page_state(NR_WRITEBACK_TEMP));
|
||||
show_val_kb(m, "CommitLimit: ", vm_commit_limit());
|
||||
@ -151,7 +151,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
|
||||
#ifdef CONFIG_CMA
|
||||
show_val_kb(m, "CmaTotal: ", totalcma_pages);
|
||||
show_val_kb(m, "CmaFree: ",
|
||||
global_page_state(NR_FREE_CMA_PAGES));
|
||||
global_zone_page_state(NR_FREE_CMA_PAGES));
|
||||
#endif
|
||||
|
||||
hugetlb_report_meminfo(m);
|
||||
|
@ -253,6 +253,7 @@ static int proc_map_release(struct inode *inode, struct file *file)
|
||||
if (priv->mm)
|
||||
mmdrop(priv->mm);
|
||||
|
||||
kfree(priv->rollup);
|
||||
return seq_release_private(inode, file);
|
||||
}
|
||||
|
||||
@ -279,6 +280,23 @@ static int is_stack(struct proc_maps_private *priv,
|
||||
vma->vm_end >= vma->vm_mm->start_stack;
|
||||
}
|
||||
|
||||
static void show_vma_header_prefix(struct seq_file *m,
|
||||
unsigned long start, unsigned long end,
|
||||
vm_flags_t flags, unsigned long long pgoff,
|
||||
dev_t dev, unsigned long ino)
|
||||
{
|
||||
seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
|
||||
seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
|
||||
start,
|
||||
end,
|
||||
flags & VM_READ ? 'r' : '-',
|
||||
flags & VM_WRITE ? 'w' : '-',
|
||||
flags & VM_EXEC ? 'x' : '-',
|
||||
flags & VM_MAYSHARE ? 's' : 'p',
|
||||
pgoff,
|
||||
MAJOR(dev), MINOR(dev), ino);
|
||||
}
|
||||
|
||||
static void
|
||||
show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
|
||||
{
|
||||
@ -301,17 +319,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
|
||||
|
||||
start = vma->vm_start;
|
||||
end = vma->vm_end;
|
||||
|
||||
seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
|
||||
seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
|
||||
start,
|
||||
end,
|
||||
flags & VM_READ ? 'r' : '-',
|
||||
flags & VM_WRITE ? 'w' : '-',
|
||||
flags & VM_EXEC ? 'x' : '-',
|
||||
flags & VM_MAYSHARE ? 's' : 'p',
|
||||
pgoff,
|
||||
MAJOR(dev), MINOR(dev), ino);
|
||||
show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
|
||||
|
||||
/*
|
||||
* Print the dentry name for named mappings, and a
|
||||
@ -430,6 +438,7 @@ const struct file_operations proc_tid_maps_operations = {
|
||||
|
||||
#ifdef CONFIG_PROC_PAGE_MONITOR
|
||||
struct mem_size_stats {
|
||||
bool first;
|
||||
unsigned long resident;
|
||||
unsigned long shared_clean;
|
||||
unsigned long shared_dirty;
|
||||
@ -443,7 +452,9 @@ struct mem_size_stats {
|
||||
unsigned long swap;
|
||||
unsigned long shared_hugetlb;
|
||||
unsigned long private_hugetlb;
|
||||
unsigned long first_vma_start;
|
||||
u64 pss;
|
||||
u64 pss_locked;
|
||||
u64 swap_pss;
|
||||
bool check_shmem_swap;
|
||||
};
|
||||
@ -652,6 +663,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
|
||||
[ilog2(VM_NORESERVE)] = "nr",
|
||||
[ilog2(VM_HUGETLB)] = "ht",
|
||||
[ilog2(VM_ARCH_1)] = "ar",
|
||||
[ilog2(VM_WIPEONFORK)] = "wf",
|
||||
[ilog2(VM_DONTDUMP)] = "dd",
|
||||
#ifdef CONFIG_MEM_SOFT_DIRTY
|
||||
[ilog2(VM_SOFTDIRTY)] = "sd",
|
||||
@ -719,18 +731,36 @@ void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
|
||||
|
||||
static int show_smap(struct seq_file *m, void *v, int is_pid)
|
||||
{
|
||||
struct proc_maps_private *priv = m->private;
|
||||
struct vm_area_struct *vma = v;
|
||||
struct mem_size_stats mss;
|
||||
struct mem_size_stats mss_stack;
|
||||
struct mem_size_stats *mss;
|
||||
struct mm_walk smaps_walk = {
|
||||
.pmd_entry = smaps_pte_range,
|
||||
#ifdef CONFIG_HUGETLB_PAGE
|
||||
.hugetlb_entry = smaps_hugetlb_range,
|
||||
#endif
|
||||
.mm = vma->vm_mm,
|
||||
.private = &mss,
|
||||
};
|
||||
int ret = 0;
|
||||
bool rollup_mode;
|
||||
bool last_vma;
|
||||
|
||||
memset(&mss, 0, sizeof mss);
|
||||
if (priv->rollup) {
|
||||
rollup_mode = true;
|
||||
mss = priv->rollup;
|
||||
if (mss->first) {
|
||||
mss->first_vma_start = vma->vm_start;
|
||||
mss->first = false;
|
||||
}
|
||||
last_vma = !m_next_vma(priv, vma);
|
||||
} else {
|
||||
rollup_mode = false;
|
||||
memset(&mss_stack, 0, sizeof(mss_stack));
|
||||
mss = &mss_stack;
|
||||
}
|
||||
|
||||
smaps_walk.private = mss;
|
||||
|
||||
#ifdef CONFIG_SHMEM
|
||||
if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
|
||||
@ -748,9 +778,9 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
|
||||
|
||||
if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
|
||||
!(vma->vm_flags & VM_WRITE)) {
|
||||
mss.swap = shmem_swapped;
|
||||
mss->swap = shmem_swapped;
|
||||
} else {
|
||||
mss.check_shmem_swap = true;
|
||||
mss->check_shmem_swap = true;
|
||||
smaps_walk.pte_hole = smaps_pte_hole;
|
||||
}
|
||||
}
|
||||
@ -758,54 +788,71 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
|
||||
|
||||
/* mmap_sem is held in m_start */
|
||||
walk_page_vma(vma, &smaps_walk);
|
||||
if (vma->vm_flags & VM_LOCKED)
|
||||
mss->pss_locked += mss->pss;
|
||||
|
||||
show_map_vma(m, vma, is_pid);
|
||||
if (!rollup_mode) {
|
||||
show_map_vma(m, vma, is_pid);
|
||||
} else if (last_vma) {
|
||||
show_vma_header_prefix(
|
||||
m, mss->first_vma_start, vma->vm_end, 0, 0, 0, 0);
|
||||
seq_pad(m, ' ');
|
||||
seq_puts(m, "[rollup]\n");
|
||||
} else {
|
||||
ret = SEQ_SKIP;
|
||||
}
|
||||
|
||||
seq_printf(m,
|
||||
"Size: %8lu kB\n"
|
||||
"Rss: %8lu kB\n"
|
||||
"Pss: %8lu kB\n"
|
||||
"Shared_Clean: %8lu kB\n"
|
||||
"Shared_Dirty: %8lu kB\n"
|
||||
"Private_Clean: %8lu kB\n"
|
||||
"Private_Dirty: %8lu kB\n"
|
||||
"Referenced: %8lu kB\n"
|
||||
"Anonymous: %8lu kB\n"
|
||||
"LazyFree: %8lu kB\n"
|
||||
"AnonHugePages: %8lu kB\n"
|
||||
"ShmemPmdMapped: %8lu kB\n"
|
||||
"Shared_Hugetlb: %8lu kB\n"
|
||||
"Private_Hugetlb: %7lu kB\n"
|
||||
"Swap: %8lu kB\n"
|
||||
"SwapPss: %8lu kB\n"
|
||||
"KernelPageSize: %8lu kB\n"
|
||||
"MMUPageSize: %8lu kB\n"
|
||||
"Locked: %8lu kB\n",
|
||||
(vma->vm_end - vma->vm_start) >> 10,
|
||||
mss.resident >> 10,
|
||||
(unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
|
||||
mss.shared_clean >> 10,
|
||||
mss.shared_dirty >> 10,
|
||||
mss.private_clean >> 10,
|
||||
mss.private_dirty >> 10,
|
||||
mss.referenced >> 10,
|
||||
mss.anonymous >> 10,
|
||||
mss.lazyfree >> 10,
|
||||
mss.anonymous_thp >> 10,
|
||||
mss.shmem_thp >> 10,
|
||||
mss.shared_hugetlb >> 10,
|
||||
mss.private_hugetlb >> 10,
|
||||
mss.swap >> 10,
|
||||
(unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
|
||||
vma_kernel_pagesize(vma) >> 10,
|
||||
vma_mmu_pagesize(vma) >> 10,
|
||||
(vma->vm_flags & VM_LOCKED) ?
|
||||
(unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
|
||||
if (!rollup_mode)
|
||||
seq_printf(m,
|
||||
"Size: %8lu kB\n"
|
||||
"KernelPageSize: %8lu kB\n"
|
||||
"MMUPageSize: %8lu kB\n",
|
||||
(vma->vm_end - vma->vm_start) >> 10,
|
||||
vma_kernel_pagesize(vma) >> 10,
|
||||
vma_mmu_pagesize(vma) >> 10);
|
||||
|
||||
arch_show_smap(m, vma);
|
||||
show_smap_vma_flags(m, vma);
|
||||
|
||||
if (!rollup_mode || last_vma)
|
||||
seq_printf(m,
|
||||
"Rss: %8lu kB\n"
|
||||
"Pss: %8lu kB\n"
|
||||
"Shared_Clean: %8lu kB\n"
|
||||
"Shared_Dirty: %8lu kB\n"
|
||||
"Private_Clean: %8lu kB\n"
|
||||
"Private_Dirty: %8lu kB\n"
|
||||
"Referenced: %8lu kB\n"
|
||||
"Anonymous: %8lu kB\n"
|
||||
"LazyFree: %8lu kB\n"
|
||||
"AnonHugePages: %8lu kB\n"
|
||||
"ShmemPmdMapped: %8lu kB\n"
|
||||
"Shared_Hugetlb: %8lu kB\n"
|
||||
"Private_Hugetlb: %7lu kB\n"
|
||||
"Swap: %8lu kB\n"
|
||||
"SwapPss: %8lu kB\n"
|
||||
"Locked: %8lu kB\n",
|
||||
mss->resident >> 10,
|
||||
(unsigned long)(mss->pss >> (10 + PSS_SHIFT)),
|
||||
mss->shared_clean >> 10,
|
||||
mss->shared_dirty >> 10,
|
||||
mss->private_clean >> 10,
|
||||
mss->private_dirty >> 10,
|
||||
mss->referenced >> 10,
|
||||
mss->anonymous >> 10,
|
||||
mss->lazyfree >> 10,
|
||||
mss->anonymous_thp >> 10,
|
||||
mss->shmem_thp >> 10,
|
||||
mss->shared_hugetlb >> 10,
|
||||
mss->private_hugetlb >> 10,
|
||||
mss->swap >> 10,
|
||||
(unsigned long)(mss->swap_pss >> (10 + PSS_SHIFT)),
|
||||
(unsigned long)(mss->pss >> (10 + PSS_SHIFT)));
|
||||
|
||||
if (!rollup_mode) {
|
||||
arch_show_smap(m, vma);
|
||||
show_smap_vma_flags(m, vma);
|
||||
}
|
||||
m_cache_vma(m, vma);
|
||||
return 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int show_pid_smap(struct seq_file *m, void *v)
|
||||
@ -837,6 +884,25 @@ static int pid_smaps_open(struct inode *inode, struct file *file)
|
||||
return do_maps_open(inode, file, &proc_pid_smaps_op);
|
||||
}
|
||||
|
||||
static int pid_smaps_rollup_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
struct seq_file *seq;
|
||||
struct proc_maps_private *priv;
|
||||
int ret = do_maps_open(inode, file, &proc_pid_smaps_op);
|
||||
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
seq = file->private_data;
|
||||
priv = seq->private;
|
||||
priv->rollup = kzalloc(sizeof(*priv->rollup), GFP_KERNEL);
|
||||
if (!priv->rollup) {
|
||||
proc_map_release(inode, file);
|
||||
return -ENOMEM;
|
||||
}
|
||||
priv->rollup->first = true;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int tid_smaps_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
return do_maps_open(inode, file, &proc_tid_smaps_op);
|
||||
@ -849,6 +915,13 @@ const struct file_operations proc_pid_smaps_operations = {
|
||||
.release = proc_map_release,
|
||||
};
|
||||
|
||||
const struct file_operations proc_pid_smaps_rollup_operations = {
|
||||
.open = pid_smaps_rollup_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = proc_map_release,
|
||||
};
|
||||
|
||||
const struct file_operations proc_tid_smaps_operations = {
|
||||
.open = tid_smaps_open,
|
||||
.read = seq_read,
|
||||
|
@ -228,7 +228,7 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
|
||||
if (!pages)
|
||||
goto out_free;
|
||||
|
||||
nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages);
|
||||
nr = find_get_pages(inode->i_mapping, &pgoff, lpages, pages);
|
||||
if (nr != lpages)
|
||||
goto out_free_pages; /* leave if some pages were missing */
|
||||
|
||||
|
@ -335,11 +335,6 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
|
||||
goto out_put;
|
||||
|
||||
mapping = f.file->f_mapping;
|
||||
if (!mapping) {
|
||||
ret = -EINVAL;
|
||||
goto out_put;
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
|
||||
ret = file_fdatawait_range(f.file, offset, endbyte);
|
||||
|
@ -178,7 +178,8 @@ static inline void msg_init(struct uffd_msg *msg)
|
||||
|
||||
static inline struct uffd_msg userfault_msg(unsigned long address,
|
||||
unsigned int flags,
|
||||
unsigned long reason)
|
||||
unsigned long reason,
|
||||
unsigned int features)
|
||||
{
|
||||
struct uffd_msg msg;
|
||||
msg_init(&msg);
|
||||
@ -202,6 +203,8 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
|
||||
* write protect fault.
|
||||
*/
|
||||
msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
|
||||
if (features & UFFD_FEATURE_THREAD_ID)
|
||||
msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
|
||||
return msg;
|
||||
}
|
||||
|
||||
@ -370,6 +373,9 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
|
||||
VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
|
||||
VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
|
||||
|
||||
if (ctx->features & UFFD_FEATURE_SIGBUS)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* If it's already released don't get it. This avoids to loop
|
||||
* in __get_user_pages if userfaultfd_release waits on the
|
||||
@ -419,7 +425,8 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
|
||||
|
||||
init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
|
||||
uwq.wq.private = current;
|
||||
uwq.msg = userfault_msg(vmf->address, vmf->flags, reason);
|
||||
uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
|
||||
ctx->features);
|
||||
uwq.ctx = ctx;
|
||||
uwq.waken = false;
|
||||
|
||||
@ -1194,7 +1201,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
|
||||
struct uffdio_register __user *user_uffdio_register;
|
||||
unsigned long vm_flags, new_flags;
|
||||
bool found;
|
||||
bool non_anon_pages;
|
||||
bool basic_ioctls;
|
||||
unsigned long start, end, vma_end;
|
||||
|
||||
user_uffdio_register = (struct uffdio_register __user *) arg;
|
||||
@ -1260,7 +1267,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
|
||||
* Search for not compatible vmas.
|
||||
*/
|
||||
found = false;
|
||||
non_anon_pages = false;
|
||||
basic_ioctls = false;
|
||||
for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
|
||||
cond_resched();
|
||||
|
||||
@ -1299,8 +1306,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
|
||||
/*
|
||||
* Note vmas containing huge pages
|
||||
*/
|
||||
if (is_vm_hugetlb_page(cur) || vma_is_shmem(cur))
|
||||
non_anon_pages = true;
|
||||
if (is_vm_hugetlb_page(cur))
|
||||
basic_ioctls = true;
|
||||
|
||||
found = true;
|
||||
}
|
||||
@ -1371,7 +1378,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
|
||||
* userland which ioctls methods are guaranteed to
|
||||
* succeed on this range.
|
||||
*/
|
||||
if (put_user(non_anon_pages ? UFFD_API_RANGE_IOCTLS_BASIC :
|
||||
if (put_user(basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
|
||||
UFFD_API_RANGE_IOCTLS,
|
||||
&user_uffdio_register->ioctls))
|
||||
ret = -EFAULT;
|
||||
|
@ -1101,7 +1101,7 @@ xfs_filemap_pfn_mkwrite(
|
||||
if (vmf->pgoff >= size)
|
||||
ret = VM_FAULT_SIGBUS;
|
||||
else if (IS_DAX(inode))
|
||||
ret = dax_pfn_mkwrite(vmf);
|
||||
ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
|
||||
xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
|
||||
sb_end_pagefault(inode->i_sb);
|
||||
return ret;
|
||||
|
@ -38,7 +38,15 @@
|
||||
#define BIO_BUG_ON
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_THP_SWAP
|
||||
#if HPAGE_PMD_NR > 256
|
||||
#define BIO_MAX_PAGES HPAGE_PMD_NR
|
||||
#else
|
||||
#define BIO_MAX_PAGES 256
|
||||
#endif
|
||||
#else
|
||||
#define BIO_MAX_PAGES 256
|
||||
#endif
|
||||
|
||||
#define bio_prio(bio) (bio)->bi_ioprio
|
||||
#define bio_set_prio(bio, prio) ((bio)->bi_ioprio = prio)
|
||||
|
@ -89,34 +89,6 @@ void dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
|
||||
void dax_write_cache(struct dax_device *dax_dev, bool wc);
|
||||
bool dax_write_cache_enabled(struct dax_device *dax_dev);
|
||||
|
||||
/*
|
||||
* We use lowest available bit in exceptional entry for locking, one bit for
|
||||
* the entry size (PMD) and two more to tell us if the entry is a huge zero
|
||||
* page (HZP) or an empty entry that is just used for locking. In total four
|
||||
* special bits.
|
||||
*
|
||||
* If the PMD bit isn't set the entry has size PAGE_SIZE, and if the HZP and
|
||||
* EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
|
||||
* block allocation.
|
||||
*/
|
||||
#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
|
||||
#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
|
||||
#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
|
||||
#define RADIX_DAX_HZP (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
|
||||
#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
|
||||
|
||||
static inline unsigned long dax_radix_sector(void *entry)
|
||||
{
|
||||
return (unsigned long)entry >> RADIX_DAX_SHIFT;
|
||||
}
|
||||
|
||||
static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
|
||||
{
|
||||
return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
|
||||
((unsigned long)sector << RADIX_DAX_SHIFT) |
|
||||
RADIX_DAX_ENTRY_LOCK);
|
||||
}
|
||||
|
||||
ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
|
||||
const struct iomap_ops *ops);
|
||||
int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
|
||||
@ -124,8 +96,6 @@ int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
|
||||
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
|
||||
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
|
||||
pgoff_t index);
|
||||
void dax_wake_mapping_entry_waiter(struct address_space *mapping,
|
||||
pgoff_t index, void *entry, bool wake_all);
|
||||
|
||||
#ifdef CONFIG_FS_DAX
|
||||
int __dax_zero_page_range(struct block_device *bdev,
|
||||
@ -140,21 +110,6 @@ static inline int __dax_zero_page_range(struct block_device *bdev,
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_FS_DAX_PMD
|
||||
static inline unsigned int dax_radix_order(void *entry)
|
||||
{
|
||||
if ((unsigned long)entry & RADIX_DAX_PMD)
|
||||
return PMD_SHIFT - PAGE_SHIFT;
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
static inline unsigned int dax_radix_order(void *entry)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
int dax_pfn_mkwrite(struct vm_fault *vmf);
|
||||
|
||||
static inline bool dax_mapping(struct address_space *mapping)
|
||||
{
|
||||
return mapping->host && IS_DAX(mapping->host);
|
||||
|
@ -1269,8 +1269,6 @@ extern void f_delown(struct file *filp);
|
||||
extern pid_t f_getown(struct file *filp);
|
||||
extern int send_sigurg(struct fown_struct *fown);
|
||||
|
||||
struct mm_struct;
|
||||
|
||||
/*
|
||||
* Umount options
|
||||
*/
|
||||
|
@ -143,15 +143,6 @@ struct fscache_cookie_def {
|
||||
void (*mark_page_cached)(void *cookie_netfs_data,
|
||||
struct address_space *mapping,
|
||||
struct page *page);
|
||||
|
||||
/* indicate the cookie is no longer cached
|
||||
* - this function is called when the backing store currently caching
|
||||
* a cookie is removed
|
||||
* - the netfs should use this to clean up any markers indicating
|
||||
* cached pages
|
||||
* - this is mandatory for any object that may have data
|
||||
*/
|
||||
void (*now_uncached)(void *cookie_netfs_data);
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -488,8 +488,9 @@ struct mem_cgroup *lock_page_memcg(struct page *page);
|
||||
void __unlock_page_memcg(struct mem_cgroup *memcg);
|
||||
void unlock_page_memcg(struct page *page);
|
||||
|
||||
/* idx can be of type enum memcg_stat_item or node_stat_item */
|
||||
static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
|
||||
enum memcg_stat_item idx)
|
||||
int idx)
|
||||
{
|
||||
long val = 0;
|
||||
int cpu;
|
||||
@ -503,15 +504,17 @@ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
|
||||
return val;
|
||||
}
|
||||
|
||||
/* idx can be of type enum memcg_stat_item or node_stat_item */
|
||||
static inline void __mod_memcg_state(struct mem_cgroup *memcg,
|
||||
enum memcg_stat_item idx, int val)
|
||||
int idx, int val)
|
||||
{
|
||||
if (!mem_cgroup_disabled())
|
||||
__this_cpu_add(memcg->stat->count[idx], val);
|
||||
}
|
||||
|
||||
/* idx can be of type enum memcg_stat_item or node_stat_item */
|
||||
static inline void mod_memcg_state(struct mem_cgroup *memcg,
|
||||
enum memcg_stat_item idx, int val)
|
||||
int idx, int val)
|
||||
{
|
||||
if (!mem_cgroup_disabled())
|
||||
this_cpu_add(memcg->stat->count[idx], val);
|
||||
@ -535,14 +538,14 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
|
||||
* Kernel pages are an exception to this, since they'll never move.
|
||||
*/
|
||||
static inline void __mod_memcg_page_state(struct page *page,
|
||||
enum memcg_stat_item idx, int val)
|
||||
int idx, int val)
|
||||
{
|
||||
if (page->mem_cgroup)
|
||||
__mod_memcg_state(page->mem_cgroup, idx, val);
|
||||
}
|
||||
|
||||
static inline void mod_memcg_page_state(struct page *page,
|
||||
enum memcg_stat_item idx, int val)
|
||||
int idx, int val)
|
||||
{
|
||||
if (page->mem_cgroup)
|
||||
mod_memcg_state(page->mem_cgroup, idx, val);
|
||||
@ -632,8 +635,9 @@ static inline void count_memcg_events(struct mem_cgroup *memcg,
|
||||
this_cpu_add(memcg->stat->events[idx], count);
|
||||
}
|
||||
|
||||
/* idx can be of type enum memcg_stat_item or node_stat_item */
|
||||
static inline void count_memcg_page_event(struct page *page,
|
||||
enum memcg_stat_item idx)
|
||||
int idx)
|
||||
{
|
||||
if (page->mem_cgroup)
|
||||
count_memcg_events(page->mem_cgroup, idx, 1);
|
||||
@ -846,31 +850,31 @@ static inline bool mem_cgroup_oom_synchronize(bool wait)
|
||||
}
|
||||
|
||||
static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
|
||||
enum memcg_stat_item idx)
|
||||
int idx)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void __mod_memcg_state(struct mem_cgroup *memcg,
|
||||
enum memcg_stat_item idx,
|
||||
int idx,
|
||||
int nr)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void mod_memcg_state(struct mem_cgroup *memcg,
|
||||
enum memcg_stat_item idx,
|
||||
int idx,
|
||||
int nr)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void __mod_memcg_page_state(struct page *page,
|
||||
enum memcg_stat_item idx,
|
||||
int idx,
|
||||
int nr)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void mod_memcg_page_state(struct page *page,
|
||||
enum memcg_stat_item idx,
|
||||
int idx,
|
||||
int nr)
|
||||
{
|
||||
}
|
||||
@ -924,7 +928,7 @@ static inline void count_memcg_events(struct mem_cgroup *memcg,
|
||||
}
|
||||
|
||||
static inline void count_memcg_page_event(struct page *page,
|
||||
enum memcg_stat_item idx)
|
||||
int idx)
|
||||
{
|
||||
}
|
||||
|
||||
@ -934,26 +938,30 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
|
||||
}
|
||||
#endif /* CONFIG_MEMCG */
|
||||
|
||||
/* idx can be of type enum memcg_stat_item or node_stat_item */
|
||||
static inline void __inc_memcg_state(struct mem_cgroup *memcg,
|
||||
enum memcg_stat_item idx)
|
||||
int idx)
|
||||
{
|
||||
__mod_memcg_state(memcg, idx, 1);
|
||||
}
|
||||
|
||||
/* idx can be of type enum memcg_stat_item or node_stat_item */
|
||||
static inline void __dec_memcg_state(struct mem_cgroup *memcg,
|
||||
enum memcg_stat_item idx)
|
||||
int idx)
|
||||
{
|
||||
__mod_memcg_state(memcg, idx, -1);
|
||||
}
|
||||
|
||||
/* idx can be of type enum memcg_stat_item or node_stat_item */
|
||||
static inline void __inc_memcg_page_state(struct page *page,
|
||||
enum memcg_stat_item idx)
|
||||
int idx)
|
||||
{
|
||||
__mod_memcg_page_state(page, idx, 1);
|
||||
}
|
||||
|
||||
/* idx can be of type enum memcg_stat_item or node_stat_item */
|
||||
static inline void __dec_memcg_page_state(struct page *page,
|
||||
enum memcg_stat_item idx)
|
||||
int idx)
|
||||
{
|
||||
__mod_memcg_page_state(page, idx, -1);
|
||||
}
|
||||
@ -982,26 +990,30 @@ static inline void __dec_lruvec_page_state(struct page *page,
|
||||
__mod_lruvec_page_state(page, idx, -1);
|
||||
}
|
||||
|
||||
/* idx can be of type enum memcg_stat_item or node_stat_item */
|
||||
static inline void inc_memcg_state(struct mem_cgroup *memcg,
|
||||
enum memcg_stat_item idx)
|
||||
int idx)
|
||||
{
|
||||
mod_memcg_state(memcg, idx, 1);
|
||||
}
|
||||
|
||||
/* idx can be of type enum memcg_stat_item or node_stat_item */
|
||||
static inline void dec_memcg_state(struct mem_cgroup *memcg,
|
||||
enum memcg_stat_item idx)
|
||||
int idx)
|
||||
{
|
||||
mod_memcg_state(memcg, idx, -1);
|
||||
}
|
||||
|
||||
/* idx can be of type enum memcg_stat_item or node_stat_item */
|
||||
static inline void inc_memcg_page_state(struct page *page,
|
||||
enum memcg_stat_item idx)
|
||||
int idx)
|
||||
{
|
||||
mod_memcg_page_state(page, idx, 1);
|
||||
}
|
||||
|
||||
/* idx can be of type enum memcg_stat_item or node_stat_item */
|
||||
static inline void dec_memcg_page_state(struct page *page,
|
||||
enum memcg_stat_item idx)
|
||||
int idx)
|
||||
{
|
||||
mod_memcg_page_state(page, idx, -1);
|
||||
}
|
||||
|
@ -319,6 +319,6 @@ extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
|
||||
unsigned long pnum);
|
||||
extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages,
|
||||
int online_type);
|
||||
extern struct zone *default_zone_for_pfn(int nid, unsigned long pfn,
|
||||
extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
|
||||
unsigned long nr_pages);
|
||||
#endif /* __LINUX_MEMORY_HOTPLUG_H */
|
||||
|
@ -189,7 +189,7 @@ extern unsigned int kobjsize(const void *objp);
|
||||
#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */
|
||||
#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
|
||||
#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */
|
||||
#define VM_ARCH_2 0x02000000
|
||||
#define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */
|
||||
#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */
|
||||
|
||||
#ifdef CONFIG_MEM_SOFT_DIRTY
|
||||
@ -208,10 +208,12 @@ extern unsigned int kobjsize(const void *objp);
|
||||
#define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */
|
||||
#define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */
|
||||
#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */
|
||||
#define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */
|
||||
#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
|
||||
#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
|
||||
#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
|
||||
#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
|
||||
#define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4)
|
||||
#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
|
||||
|
||||
#if defined(CONFIG_X86)
|
||||
@ -235,9 +237,11 @@ extern unsigned int kobjsize(const void *objp);
|
||||
# define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */
|
||||
#endif
|
||||
|
||||
#if defined(CONFIG_X86)
|
||||
#if defined(CONFIG_X86_INTEL_MPX)
|
||||
/* MPX specific bounds table or bounds directory */
|
||||
# define VM_MPX VM_ARCH_2
|
||||
# define VM_MPX VM_HIGH_ARCH_BIT_4
|
||||
#else
|
||||
# define VM_MPX VM_NONE
|
||||
#endif
|
||||
|
||||
#ifndef VM_GROWSUP
|
||||
@ -2294,6 +2298,8 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
|
||||
unsigned long pfn, pgprot_t pgprot);
|
||||
int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
|
||||
pfn_t pfn);
|
||||
int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr,
|
||||
pfn_t pfn);
|
||||
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
|
||||
|
||||
|
||||
@ -2506,7 +2512,7 @@ enum mf_action_page_type {
|
||||
|
||||
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
|
||||
extern void clear_huge_page(struct page *page,
|
||||
unsigned long addr,
|
||||
unsigned long addr_hint,
|
||||
unsigned int pages_per_huge_page);
|
||||
extern void copy_user_huge_page(struct page *dst, struct page *src,
|
||||
unsigned long addr, struct vm_area_struct *vma,
|
||||
|
@ -335,6 +335,7 @@ struct vm_area_struct {
|
||||
struct file * vm_file; /* File we map to (can be NULL). */
|
||||
void * vm_private_data; /* was vm_pte (shared mem) */
|
||||
|
||||
atomic_long_t swap_readahead_info;
|
||||
#ifndef CONFIG_MMU
|
||||
struct vm_region *vm_region; /* NOMMU mapping region */
|
||||
#endif
|
||||
|
@ -770,8 +770,7 @@ static inline bool is_dev_zone(const struct zone *zone)
|
||||
|
||||
#include <linux/memory_hotplug.h>
|
||||
|
||||
extern struct mutex zonelists_mutex;
|
||||
void build_all_zonelists(pg_data_t *pgdat, struct zone *zone);
|
||||
void build_all_zonelists(pg_data_t *pgdat);
|
||||
void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
|
||||
bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
|
||||
int classzone_idx, unsigned int alloc_flags,
|
||||
@ -896,7 +895,7 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
|
||||
extern int numa_zonelist_order_handler(struct ctl_table *, int,
|
||||
void __user *, size_t *, loff_t *);
|
||||
extern char numa_zonelist_order[];
|
||||
#define NUMA_ZONELIST_ORDER_LEN 16 /* string buffer size */
|
||||
#define NUMA_ZONELIST_ORDER_LEN 16
|
||||
|
||||
#ifndef CONFIG_NEED_MULTIPLE_NODES
|
||||
|
||||
|
@ -303,8 +303,8 @@ PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
|
||||
* Only test-and-set exist for PG_writeback. The unconditional operators are
|
||||
* risky: they bypass page accounting.
|
||||
*/
|
||||
TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND)
|
||||
TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND)
|
||||
TESTPAGEFLAG(Writeback, writeback, PF_NO_TAIL)
|
||||
TESTSCFLAG(Writeback, writeback, PF_NO_TAIL)
|
||||
PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL)
|
||||
|
||||
/* PG_readahead is only used for reads; PG_reclaim is only for writes */
|
||||
|
@ -353,8 +353,16 @@ struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset);
|
||||
unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
|
||||
unsigned int nr_entries, struct page **entries,
|
||||
pgoff_t *indices);
|
||||
unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
|
||||
unsigned int nr_pages, struct page **pages);
|
||||
unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
|
||||
pgoff_t end, unsigned int nr_pages,
|
||||
struct page **pages);
|
||||
static inline unsigned find_get_pages(struct address_space *mapping,
|
||||
pgoff_t *start, unsigned int nr_pages,
|
||||
struct page **pages)
|
||||
{
|
||||
return find_get_pages_range(mapping, start, (pgoff_t)-1, nr_pages,
|
||||
pages);
|
||||
}
|
||||
unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
|
||||
unsigned int nr_pages, struct page **pages);
|
||||
unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
|
||||
|
@ -27,8 +27,16 @@ unsigned pagevec_lookup_entries(struct pagevec *pvec,
|
||||
pgoff_t start, unsigned nr_entries,
|
||||
pgoff_t *indices);
|
||||
void pagevec_remove_exceptionals(struct pagevec *pvec);
|
||||
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
|
||||
pgoff_t start, unsigned nr_pages);
|
||||
unsigned pagevec_lookup_range(struct pagevec *pvec,
|
||||
struct address_space *mapping,
|
||||
pgoff_t *start, pgoff_t end);
|
||||
static inline unsigned pagevec_lookup(struct pagevec *pvec,
|
||||
struct address_space *mapping,
|
||||
pgoff_t *start)
|
||||
{
|
||||
return pagevec_lookup_range(pvec, mapping, start, (pgoff_t)-1);
|
||||
}
|
||||
|
||||
unsigned pagevec_lookup_tag(struct pagevec *pvec,
|
||||
struct address_space *mapping, pgoff_t *index, int tag,
|
||||
unsigned nr_pages);
|
||||
|
@ -84,12 +84,6 @@ static inline bool mmget_not_zero(struct mm_struct *mm)
|
||||
|
||||
/* mmput gets rid of the mappings and all user-space */
|
||||
extern void mmput(struct mm_struct *);
|
||||
#ifdef CONFIG_MMU
|
||||
/* same as above but performs the slow path from the async context. Can
|
||||
* be called from the atomic context as well
|
||||
*/
|
||||
extern void mmput_async(struct mm_struct *);
|
||||
#endif
|
||||
|
||||
/* Grab a reference to a task's mm, if it is not already going away */
|
||||
extern struct mm_struct *get_task_mm(struct task_struct *task);
|
||||
|
@ -27,23 +27,6 @@ struct shmid_kernel /* private to the kernel */
|
||||
/* shm_mode upper byte flags */
|
||||
#define SHM_DEST 01000 /* segment will be destroyed on last detach */
|
||||
#define SHM_LOCKED 02000 /* segment will not be swapped */
|
||||
#define SHM_HUGETLB 04000 /* segment will use huge TLB pages */
|
||||
#define SHM_NORESERVE 010000 /* don't check for reservations */
|
||||
|
||||
/* Bits [26:31] are reserved */
|
||||
|
||||
/*
|
||||
* When SHM_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
|
||||
* This gives us 6 bits, which is enough until someone invents 128 bit address
|
||||
* spaces.
|
||||
*
|
||||
* Assume these are all power of twos.
|
||||
* When 0 use the default page size.
|
||||
*/
|
||||
#define SHM_HUGE_SHIFT 26
|
||||
#define SHM_HUGE_MASK 0x3f
|
||||
#define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT)
|
||||
#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT)
|
||||
|
||||
#ifdef CONFIG_SYSVIPC
|
||||
struct sysv_shm {
|
||||
|
@ -137,9 +137,15 @@ extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
|
||||
unsigned long dst_addr,
|
||||
unsigned long src_addr,
|
||||
struct page **pagep);
|
||||
extern int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
|
||||
pmd_t *dst_pmd,
|
||||
struct vm_area_struct *dst_vma,
|
||||
unsigned long dst_addr);
|
||||
#else
|
||||
#define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \
|
||||
src_addr, pagep) ({ BUG(); 0; })
|
||||
#define shmem_mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, \
|
||||
dst_addr) ({ BUG(); 0; })
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
@ -18,6 +18,13 @@ struct shrink_control {
|
||||
*/
|
||||
unsigned long nr_to_scan;
|
||||
|
||||
/*
|
||||
* How many objects did scan_objects process?
|
||||
* This defaults to nr_to_scan before every call, but the callee
|
||||
* should track its actual progress.
|
||||
*/
|
||||
unsigned long nr_scanned;
|
||||
|
||||
/* current node being shrunk (for NUMA aware shrinkers) */
|
||||
int nid;
|
||||
|
||||
|
@ -115,6 +115,10 @@ struct kmem_cache {
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SLAB_FREELIST_HARDENED
|
||||
unsigned long random;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_NUMA
|
||||
/*
|
||||
* Defragmentation by allocating from a remote node.
|
||||
|
@ -188,6 +188,7 @@ struct swap_cluster_info {
|
||||
};
|
||||
#define CLUSTER_FLAG_FREE 1 /* This cluster is free */
|
||||
#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
|
||||
#define CLUSTER_FLAG_HUGE 4 /* This cluster is backing a transparent huge page */
|
||||
|
||||
/*
|
||||
* We assign a cluster to each CPU, so each CPU can allocate swap entry from
|
||||
@ -211,7 +212,7 @@ struct swap_info_struct {
|
||||
unsigned long flags; /* SWP_USED etc: see above */
|
||||
signed short prio; /* swap priority of this type */
|
||||
struct plist_node list; /* entry in swap_active_head */
|
||||
struct plist_node avail_list; /* entry in swap_avail_head */
|
||||
struct plist_node avail_lists[MAX_NUMNODES];/* entry in swap_avail_heads */
|
||||
signed char type; /* strange name for an index */
|
||||
unsigned int max; /* extent of the swap_map */
|
||||
unsigned char *swap_map; /* vmalloc'ed array of usage counts */
|
||||
@ -250,6 +251,25 @@ struct swap_info_struct {
|
||||
struct swap_cluster_list discard_clusters; /* discard clusters list */
|
||||
};
|
||||
|
||||
#ifdef CONFIG_64BIT
|
||||
#define SWAP_RA_ORDER_CEILING 5
|
||||
#else
|
||||
/* Avoid stack overflow, because we need to save part of page table */
|
||||
#define SWAP_RA_ORDER_CEILING 3
|
||||
#define SWAP_RA_PTE_CACHE_SIZE (1 << SWAP_RA_ORDER_CEILING)
|
||||
#endif
|
||||
|
||||
struct vma_swap_readahead {
|
||||
unsigned short win;
|
||||
unsigned short offset;
|
||||
unsigned short nr_pte;
|
||||
#ifdef CONFIG_64BIT
|
||||
pte_t *ptes;
|
||||
#else
|
||||
pte_t ptes[SWAP_RA_PTE_CACHE_SIZE];
|
||||
#endif
|
||||
};
|
||||
|
||||
/* linux/mm/workingset.c */
|
||||
void *workingset_eviction(struct address_space *mapping, struct page *page);
|
||||
bool workingset_refault(void *shadow);
|
||||
@ -262,8 +282,8 @@ extern unsigned long totalreserve_pages;
|
||||
extern unsigned long nr_free_buffer_pages(void);
|
||||
extern unsigned long nr_free_pagecache_pages(void);
|
||||
|
||||
/* Definition of global_page_state not available yet */
|
||||
#define nr_free_pages() global_page_state(NR_FREE_PAGES)
|
||||
/* Definition of global_zone_page_state not available yet */
|
||||
#define nr_free_pages() global_zone_page_state(NR_FREE_PAGES)
|
||||
|
||||
|
||||
/* linux/mm/swap.c */
|
||||
@ -349,6 +369,7 @@ int generic_swapfile_activate(struct swap_info_struct *, struct file *,
|
||||
#define SWAP_ADDRESS_SPACE_SHIFT 14
|
||||
#define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT)
|
||||
extern struct address_space *swapper_spaces[];
|
||||
extern bool swap_vma_readahead;
|
||||
#define swap_address_space(entry) \
|
||||
(&swapper_spaces[swp_type(entry)][swp_offset(entry) \
|
||||
>> SWAP_ADDRESS_SPACE_SHIFT])
|
||||
@ -361,7 +382,9 @@ extern void __delete_from_swap_cache(struct page *);
|
||||
extern void delete_from_swap_cache(struct page *);
|
||||
extern void free_page_and_swap_cache(struct page *);
|
||||
extern void free_pages_and_swap_cache(struct page **, int);
|
||||
extern struct page *lookup_swap_cache(swp_entry_t);
|
||||
extern struct page *lookup_swap_cache(swp_entry_t entry,
|
||||
struct vm_area_struct *vma,
|
||||
unsigned long addr);
|
||||
extern struct page *read_swap_cache_async(swp_entry_t, gfp_t,
|
||||
struct vm_area_struct *vma, unsigned long addr,
|
||||
bool do_poll);
|
||||
@ -371,11 +394,23 @@ extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t,
|
||||
extern struct page *swapin_readahead(swp_entry_t, gfp_t,
|
||||
struct vm_area_struct *vma, unsigned long addr);
|
||||
|
||||
extern struct page *swap_readahead_detect(struct vm_fault *vmf,
|
||||
struct vma_swap_readahead *swap_ra);
|
||||
extern struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask,
|
||||
struct vm_fault *vmf,
|
||||
struct vma_swap_readahead *swap_ra);
|
||||
|
||||
/* linux/mm/swapfile.c */
|
||||
extern atomic_long_t nr_swap_pages;
|
||||
extern long total_swap_pages;
|
||||
extern atomic_t nr_rotate_swap;
|
||||
extern bool has_usable_swap(void);
|
||||
|
||||
static inline bool swap_use_vma_readahead(void)
|
||||
{
|
||||
return READ_ONCE(swap_vma_readahead) && !atomic_read(&nr_rotate_swap);
|
||||
}
|
||||
|
||||
/* Swap 50% full? Release swapcache more aggressively.. */
|
||||
static inline bool vm_swap_full(void)
|
||||
{
|
||||
@ -465,12 +500,32 @@ static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline bool swap_use_vma_readahead(void)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline struct page *swap_readahead_detect(
|
||||
struct vm_fault *vmf, struct vma_swap_readahead *swap_ra)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline struct page *do_swap_page_readahead(
|
||||
swp_entry_t fentry, gfp_t gfp_mask,
|
||||
struct vm_fault *vmf, struct vma_swap_readahead *swap_ra)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline struct page *lookup_swap_cache(swp_entry_t swp)
|
||||
static inline struct page *lookup_swap_cache(swp_entry_t swp,
|
||||
struct vm_area_struct *vma,
|
||||
unsigned long addr)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
@ -509,8 +564,8 @@ static inline int swp_swapcount(swp_entry_t entry)
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define reuse_swap_page(page, total_mapcount) \
|
||||
(page_trans_huge_mapcount(page, total_mapcount) == 1)
|
||||
#define reuse_swap_page(page, total_map_swapcount) \
|
||||
(page_trans_huge_mapcount(page, total_map_swapcount) == 1)
|
||||
|
||||
static inline int try_to_free_swap(struct page *page)
|
||||
{
|
||||
@ -526,6 +581,15 @@ static inline swp_entry_t get_swap_page(struct page *page)
|
||||
|
||||
#endif /* CONFIG_SWAP */
|
||||
|
||||
#ifdef CONFIG_THP_SWAP
|
||||
extern int split_swap_cluster(swp_entry_t entry);
|
||||
#else
|
||||
static inline int split_swap_cluster(swp_entry_t entry)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_MEMCG
|
||||
static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
|
||||
{
|
||||
|
@ -85,6 +85,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
|
||||
#endif
|
||||
THP_ZERO_PAGE_ALLOC,
|
||||
THP_ZERO_PAGE_ALLOC_FAILED,
|
||||
THP_SWPOUT,
|
||||
THP_SWPOUT_FALLBACK,
|
||||
#endif
|
||||
#ifdef CONFIG_MEMORY_BALLOON
|
||||
BALLOON_INFLATE,
|
||||
@ -103,6 +105,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
|
||||
VMACACHE_FIND_CALLS,
|
||||
VMACACHE_FIND_HITS,
|
||||
VMACACHE_FULL_FLUSHES,
|
||||
#endif
|
||||
#ifdef CONFIG_SWAP
|
||||
SWAP_RA,
|
||||
SWAP_RA_HIT,
|
||||
#endif
|
||||
NR_VM_EVENT_ITEMS
|
||||
};
|
||||
|
@ -123,7 +123,7 @@ static inline void node_page_state_add(long x, struct pglist_data *pgdat,
|
||||
atomic_long_add(x, &vm_node_stat[item]);
|
||||
}
|
||||
|
||||
static inline unsigned long global_page_state(enum zone_stat_item item)
|
||||
static inline unsigned long global_zone_page_state(enum zone_stat_item item)
|
||||
{
|
||||
long x = atomic_long_read(&vm_zone_stat[item]);
|
||||
#ifdef CONFIG_SMP
|
||||
@ -199,7 +199,7 @@ extern unsigned long sum_zone_node_page_state(int node,
|
||||
extern unsigned long node_page_state(struct pglist_data *pgdat,
|
||||
enum node_stat_item item);
|
||||
#else
|
||||
#define sum_zone_node_page_state(node, item) global_page_state(item)
|
||||
#define sum_zone_node_page_state(node, item) global_zone_page_state(item)
|
||||
#define node_page_state(node, item) global_node_page_state(item)
|
||||
#endif /* CONFIG_NUMA */
|
||||
|
||||
|
@ -190,8 +190,6 @@ DEFINE_EVENT(dax_pte_fault_class, name, \
|
||||
|
||||
DEFINE_PTE_FAULT_EVENT(dax_pte_fault);
|
||||
DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done);
|
||||
DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite_no_entry);
|
||||
DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite);
|
||||
DEFINE_PTE_FAULT_EVENT(dax_load_hole);
|
||||
|
||||
TRACE_EVENT(dax_insert_mapping,
|
||||
|
@ -125,12 +125,6 @@ IF_HAVE_PG_IDLE(PG_idle, "idle" )
|
||||
#define __VM_ARCH_SPECIFIC_1 {VM_ARCH_1, "arch_1" }
|
||||
#endif
|
||||
|
||||
#if defined(CONFIG_X86)
|
||||
#define __VM_ARCH_SPECIFIC_2 {VM_MPX, "mpx" }
|
||||
#else
|
||||
#define __VM_ARCH_SPECIFIC_2 {VM_ARCH_2, "arch_2" }
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_MEM_SOFT_DIRTY
|
||||
#define IF_HAVE_VM_SOFTDIRTY(flag,name) {flag, name },
|
||||
#else
|
||||
@ -162,7 +156,7 @@ IF_HAVE_PG_IDLE(PG_idle, "idle" )
|
||||
{VM_NORESERVE, "noreserve" }, \
|
||||
{VM_HUGETLB, "hugetlb" }, \
|
||||
__VM_ARCH_SPECIFIC_1 , \
|
||||
__VM_ARCH_SPECIFIC_2 , \
|
||||
{VM_WIPEONFORK, "wipeonfork" }, \
|
||||
{VM_DONTDUMP, "dontdump" }, \
|
||||
IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \
|
||||
{VM_MIXEDMAP, "mixedmap" }, \
|
||||
|
34
include/uapi/asm-generic/hugetlb_encode.h
Normal file
34
include/uapi/asm-generic/hugetlb_encode.h
Normal file
@ -0,0 +1,34 @@
|
||||
#ifndef _ASM_GENERIC_HUGETLB_ENCODE_H_
|
||||
#define _ASM_GENERIC_HUGETLB_ENCODE_H_
|
||||
|
||||
/*
|
||||
* Several system calls take a flag to request "hugetlb" huge pages.
|
||||
* Without further specification, these system calls will use the
|
||||
* system's default huge page size. If a system supports multiple
|
||||
* huge page sizes, the desired huge page size can be specified in
|
||||
* bits [26:31] of the flag arguments. The value in these 6 bits
|
||||
* will encode the log2 of the huge page size.
|
||||
*
|
||||
* The following definitions are associated with this huge page size
|
||||
* encoding in flag arguments. System call specific header files
|
||||
* that use this encoding should include this file. They can then
|
||||
* provide definitions based on these with their own specific prefix.
|
||||
* for example:
|
||||
* #define MAP_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT
|
||||
*/
|
||||
|
||||
#define HUGETLB_FLAG_ENCODE_SHIFT 26
|
||||
#define HUGETLB_FLAG_ENCODE_MASK 0x3f
|
||||
|
||||
#define HUGETLB_FLAG_ENCODE_64KB (16 << HUGETLB_FLAG_ENCODE_SHIFT)
|
||||
#define HUGETLB_FLAG_ENCODE_512KB (19 << HUGETLB_FLAG_ENCODE_SHIFT)
|
||||
#define HUGETLB_FLAG_ENCODE_1MB (20 << HUGETLB_FLAG_ENCODE_SHIFT)
|
||||
#define HUGETLB_FLAG_ENCODE_2MB (21 << HUGETLB_FLAG_ENCODE_SHIFT)
|
||||
#define HUGETLB_FLAG_ENCODE_8MB (23 << HUGETLB_FLAG_ENCODE_SHIFT)
|
||||
#define HUGETLB_FLAG_ENCODE_16MB (24 << HUGETLB_FLAG_ENCODE_SHIFT)
|
||||
#define HUGETLB_FLAG_ENCODE_256MB (28 << HUGETLB_FLAG_ENCODE_SHIFT)
|
||||
#define HUGETLB_FLAG_ENCODE_1GB (30 << HUGETLB_FLAG_ENCODE_SHIFT)
|
||||
#define HUGETLB_FLAG_ENCODE_2GB (31 << HUGETLB_FLAG_ENCODE_SHIFT)
|
||||
#define HUGETLB_FLAG_ENCODE_16GB (34 << HUGETLB_FLAG_ENCODE_SHIFT)
|
||||
|
||||
#endif /* _ASM_GENERIC_HUGETLB_ENCODE_H_ */
|
@ -58,20 +58,12 @@
|
||||
overrides the coredump filter bits */
|
||||
#define MADV_DODUMP 17 /* Clear the MADV_DONTDUMP flag */
|
||||
|
||||
#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */
|
||||
#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */
|
||||
|
||||
/* compatibility flags */
|
||||
#define MAP_FILE 0
|
||||
|
||||
/*
|
||||
* When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
|
||||
* This gives us 6 bits, which is enough until someone invents 128 bit address
|
||||
* spaces.
|
||||
*
|
||||
* Assume these are all power of twos.
|
||||
* When 0 use the default page size.
|
||||
*/
|
||||
#define MAP_HUGE_SHIFT 26
|
||||
#define MAP_HUGE_MASK 0x3f
|
||||
|
||||
#define PKEY_DISABLE_ACCESS 0x1
|
||||
#define PKEY_DISABLE_WRITE 0x2
|
||||
#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
|
||||
|
@ -1,8 +1,32 @@
|
||||
#ifndef _UAPI_LINUX_MEMFD_H
|
||||
#define _UAPI_LINUX_MEMFD_H
|
||||
|
||||
#include <asm-generic/hugetlb_encode.h>
|
||||
|
||||
/* flags for memfd_create(2) (unsigned int) */
|
||||
#define MFD_CLOEXEC 0x0001U
|
||||
#define MFD_ALLOW_SEALING 0x0002U
|
||||
#define MFD_HUGETLB 0x0004U
|
||||
|
||||
/*
|
||||
* Huge page size encoding when MFD_HUGETLB is specified, and a huge page
|
||||
* size other than the default is desired. See hugetlb_encode.h.
|
||||
* All known huge page size encodings are provided here. It is the
|
||||
* responsibility of the application to know which sizes are supported on
|
||||
* the running system. See mmap(2) man page for details.
|
||||
*/
|
||||
#define MFD_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT
|
||||
#define MFD_HUGE_MASK HUGETLB_FLAG_ENCODE_MASK
|
||||
|
||||
#define MFD_HUGE_64KB HUGETLB_FLAG_ENCODE_64KB
|
||||
#define MFD_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB
|
||||
#define MFD_HUGE_1MB HUGETLB_FLAG_ENCODE_1MB
|
||||
#define MFD_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB
|
||||
#define MFD_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB
|
||||
#define MFD_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB
|
||||
#define MFD_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB
|
||||
#define MFD_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB
|
||||
#define MFD_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB
|
||||
#define MFD_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB
|
||||
|
||||
#endif /* _UAPI_LINUX_MEMFD_H */
|
||||
|
@ -2,6 +2,7 @@
|
||||
#define _UAPI_LINUX_MMAN_H
|
||||
|
||||
#include <asm/mman.h>
|
||||
#include <asm-generic/hugetlb_encode.h>
|
||||
|
||||
#define MREMAP_MAYMOVE 1
|
||||
#define MREMAP_FIXED 2
|
||||
@ -10,4 +11,25 @@
|
||||
#define OVERCOMMIT_ALWAYS 1
|
||||
#define OVERCOMMIT_NEVER 2
|
||||
|
||||
/*
|
||||
* Huge page size encoding when MAP_HUGETLB is specified, and a huge page
|
||||
* size other than the default is desired. See hugetlb_encode.h.
|
||||
* All known huge page size encodings are provided here. It is the
|
||||
* responsibility of the application to know which sizes are supported on
|
||||
* the running system. See mmap(2) man page for details.
|
||||
*/
|
||||
#define MAP_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT
|
||||
#define MAP_HUGE_MASK HUGETLB_FLAG_ENCODE_MASK
|
||||
|
||||
#define MAP_HUGE_64KB HUGETLB_FLAG_ENCODE_64KB
|
||||
#define MAP_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB
|
||||
#define MAP_HUGE_1MB HUGETLB_FLAG_ENCODE_1MB
|
||||
#define MAP_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB
|
||||
#define MAP_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB
|
||||
#define MAP_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB
|
||||
#define MAP_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB
|
||||
#define MAP_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB
|
||||
#define MAP_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB
|
||||
#define MAP_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB
|
||||
|
||||
#endif /* _UAPI_LINUX_MMAN_H */
|
||||
|
@ -3,6 +3,7 @@
|
||||
|
||||
#include <linux/ipc.h>
|
||||
#include <linux/errno.h>
|
||||
#include <asm-generic/hugetlb_encode.h>
|
||||
#ifndef __KERNEL__
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
@ -40,11 +41,37 @@ struct shmid_ds {
|
||||
/* Include the definition of shmid64_ds and shminfo64 */
|
||||
#include <asm/shmbuf.h>
|
||||
|
||||
/* permission flag for shmget */
|
||||
/*
|
||||
* shmget() shmflg values.
|
||||
*/
|
||||
/* The bottom nine bits are the same as open(2) mode flags */
|
||||
#define SHM_R 0400 /* or S_IRUGO from <linux/stat.h> */
|
||||
#define SHM_W 0200 /* or S_IWUGO from <linux/stat.h> */
|
||||
/* Bits 9 & 10 are IPC_CREAT and IPC_EXCL */
|
||||
#define SHM_HUGETLB 04000 /* segment will use huge TLB pages */
|
||||
#define SHM_NORESERVE 010000 /* don't check for reservations */
|
||||
|
||||
/* mode for attach */
|
||||
/*
|
||||
* Huge page size encoding when SHM_HUGETLB is specified, and a huge page
|
||||
* size other than the default is desired. See hugetlb_encode.h
|
||||
*/
|
||||
#define SHM_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT
|
||||
#define SHM_HUGE_MASK HUGETLB_FLAG_ENCODE_MASK
|
||||
|
||||
#define SHM_HUGE_64KB HUGETLB_FLAG_ENCODE_64KB
|
||||
#define SHM_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB
|
||||
#define SHM_HUGE_1MB HUGETLB_FLAG_ENCODE_1MB
|
||||
#define SHM_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB
|
||||
#define SHM_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB
|
||||
#define SHM_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB
|
||||
#define SHM_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB
|
||||
#define SHM_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB
|
||||
#define SHM_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB
|
||||
#define SHM_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB
|
||||
|
||||
/*
|
||||
* shmat() shmflg values
|
||||
*/
|
||||
#define SHM_RDONLY 010000 /* read-only access */
|
||||
#define SHM_RND 020000 /* round attach address to SHMLBA boundary */
|
||||
#define SHM_REMAP 040000 /* take-over region on attach */
|
||||
|
@ -23,7 +23,9 @@
|
||||
UFFD_FEATURE_EVENT_REMOVE | \
|
||||
UFFD_FEATURE_EVENT_UNMAP | \
|
||||
UFFD_FEATURE_MISSING_HUGETLBFS | \
|
||||
UFFD_FEATURE_MISSING_SHMEM)
|
||||
UFFD_FEATURE_MISSING_SHMEM | \
|
||||
UFFD_FEATURE_SIGBUS | \
|
||||
UFFD_FEATURE_THREAD_ID)
|
||||
#define UFFD_API_IOCTLS \
|
||||
((__u64)1 << _UFFDIO_REGISTER | \
|
||||
(__u64)1 << _UFFDIO_UNREGISTER | \
|
||||
@ -78,6 +80,9 @@ struct uffd_msg {
|
||||
struct {
|
||||
__u64 flags;
|
||||
__u64 address;
|
||||
union {
|
||||
__u32 ptid;
|
||||
} feat;
|
||||
} pagefault;
|
||||
|
||||
struct {
|
||||
@ -153,6 +158,13 @@ struct uffdio_api {
|
||||
* UFFD_FEATURE_MISSING_SHMEM works the same as
|
||||
* UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem
|
||||
* (i.e. tmpfs and other shmem based APIs).
|
||||
*
|
||||
* UFFD_FEATURE_SIGBUS feature means no page-fault
|
||||
* (UFFD_EVENT_PAGEFAULT) event will be delivered, instead
|
||||
* a SIGBUS signal will be sent to the faulting process.
|
||||
*
|
||||
* UFFD_FEATURE_THREAD_ID means the pid of the page-faulting task_struct will
|
||||
* be returned; if the feature is not requested, 0 will be returned.
|
||||
*/
|
||||
#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
|
||||
#define UFFD_FEATURE_EVENT_FORK (1<<1)
|
||||
@ -161,6 +173,8 @@ struct uffdio_api {
|
||||
#define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4)
|
||||
#define UFFD_FEATURE_MISSING_SHMEM (1<<5)
|
||||
#define UFFD_FEATURE_EVENT_UNMAP (1<<6)
|
||||
#define UFFD_FEATURE_SIGBUS (1<<7)
|
||||
#define UFFD_FEATURE_THREAD_ID (1<<8)
|
||||
__u64 features;
|
||||
|
||||
__u64 ioctls;
|
||||
|
@ -1576,6 +1576,15 @@ config SLAB_FREELIST_RANDOM
|
||||
security feature reduces the predictability of the kernel slab
|
||||
allocator against heap overflows.
|
||||
|
||||
config SLAB_FREELIST_HARDENED
|
||||
bool "Harden slab freelist metadata"
|
||||
depends on SLUB
|
||||
help
|
||||
Many kernel heap attacks try to target slab cache metadata and
|
||||
other infrastructure. This option makes minor performance
|
||||
sacrifices to harden the kernel slab allocator against common
|
||||
freelist exploit methods.
|
||||
|
||||
config SLUB_CPU_PARTIAL
|
||||
default y
|
||||
depends on SLUB && SMP
|
||||
|
@ -542,7 +542,7 @@ asmlinkage __visible void __init start_kernel(void)
|
||||
boot_cpu_state_init();
|
||||
smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
|
||||
|
||||
build_all_zonelists(NULL, NULL);
|
||||
build_all_zonelists(NULL);
|
||||
page_alloc_init();
|
||||
|
||||
pr_notice("Kernel command line: %s\n", boot_command_line);
|
||||
|
@ -4100,9 +4100,6 @@ static void offline_css(struct cgroup_subsys_state *css)
|
||||
if (!(css->flags & CSS_ONLINE))
|
||||
return;
|
||||
|
||||
if (ss->css_reset)
|
||||
ss->css_reset(css);
|
||||
|
||||
if (ss->css_offline)
|
||||
ss->css_offline(css);
|
||||
|
||||
|
@ -56,6 +56,7 @@
|
||||
#include <linux/time64.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/sort.h>
|
||||
#include <linux/oom.h>
|
||||
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/atomic.h>
|
||||
@ -2500,12 +2501,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
|
||||
* If we're in interrupt, yes, we can always allocate. If @node is set in
|
||||
* current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
|
||||
* node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
|
||||
* yes. If current has access to memory reserves due to TIF_MEMDIE, yes.
|
||||
* yes. If current has access to memory reserves as an oom victim, yes.
|
||||
* Otherwise, no.
|
||||
*
|
||||
* GFP_USER allocations are marked with the __GFP_HARDWALL bit,
|
||||
* and do not allow allocations outside the current task's cpuset
|
||||
* unless the task has been OOM killed as is marked TIF_MEMDIE.
|
||||
* unless the task has been OOM killed.
|
||||
* GFP_KERNEL allocations are not so marked, so can escape to the
|
||||
* nearest enclosing hardwalled ancestor cpuset.
|
||||
*
|
||||
@ -2528,7 +2529,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
|
||||
* affect that:
|
||||
* in_interrupt - any node ok (current task context irrelevant)
|
||||
* GFP_ATOMIC - any node ok
|
||||
* TIF_MEMDIE - any node ok
|
||||
* tsk_is_oom_victim - any node ok
|
||||
* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
|
||||
* GFP_USER - only nodes in current task's mems allowed ok.
|
||||
*/
|
||||
@ -2546,7 +2547,7 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
|
||||
* Allow tasks that have access to memory reserves because they have
|
||||
* been OOM killed to get memory anywhere.
|
||||
*/
|
||||
if (unlikely(test_thread_flag(TIF_MEMDIE)))
|
||||
if (unlikely(tsk_is_oom_victim(current)))
|
||||
return true;
|
||||
if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
|
||||
return false;
|
||||
|
@ -657,7 +657,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
|
||||
retval = dup_userfaultfd(tmp, &uf);
|
||||
if (retval)
|
||||
goto fail_nomem_anon_vma_fork;
|
||||
if (anon_vma_fork(tmp, mpnt))
|
||||
if (tmp->vm_flags & VM_WIPEONFORK) {
|
||||
/* VM_WIPEONFORK gets a clean slate in the child. */
|
||||
tmp->anon_vma = NULL;
|
||||
if (anon_vma_prepare(tmp))
|
||||
goto fail_nomem_anon_vma_fork;
|
||||
} else if (anon_vma_fork(tmp, mpnt))
|
||||
goto fail_nomem_anon_vma_fork;
|
||||
tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
|
||||
tmp->vm_next = tmp->vm_prev = NULL;
|
||||
@ -701,7 +706,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
|
||||
rb_parent = &tmp->vm_rb;
|
||||
|
||||
mm->map_count++;
|
||||
retval = copy_page_range(mm, oldmm, mpnt);
|
||||
if (!(tmp->vm_flags & VM_WIPEONFORK))
|
||||
retval = copy_page_range(mm, oldmm, mpnt);
|
||||
|
||||
if (tmp->vm_ops && tmp->vm_ops->open)
|
||||
tmp->vm_ops->open(tmp);
|
||||
@ -922,7 +928,6 @@ static inline void __mmput(struct mm_struct *mm)
|
||||
}
|
||||
if (mm->binfmt)
|
||||
module_put(mm->binfmt->module);
|
||||
set_bit(MMF_OOM_SKIP, &mm->flags);
|
||||
mmdrop(mm);
|
||||
}
|
||||
|
||||
@ -938,22 +943,6 @@ void mmput(struct mm_struct *mm)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(mmput);
|
||||
|
||||
#ifdef CONFIG_MMU
|
||||
static void mmput_async_fn(struct work_struct *work)
|
||||
{
|
||||
struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
|
||||
__mmput(mm);
|
||||
}
|
||||
|
||||
void mmput_async(struct mm_struct *mm)
|
||||
{
|
||||
if (atomic_dec_and_test(&mm->mm_users)) {
|
||||
INIT_WORK(&mm->async_put_work, mmput_async_fn);
|
||||
schedule_work(&mm->async_put_work);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/**
|
||||
* set_mm_exe_file - change a reference to the mm's executable file
|
||||
*
|
||||
|
@ -194,18 +194,41 @@ struct page_map {
|
||||
struct vmem_altmap altmap;
|
||||
};
|
||||
|
||||
/*
 * order_at - compute the order to use for the page at @pgoff within @res.
 * @res:   resource whose page range is being walked
 * @pgoff: page offset, counted from the first page of @res
 *
 * Returns ULONG_MAX when @pgoff equals the number of pages in @res, or when
 * the computed mask has no bit set; foreach_order_pgoff() stops iterating on
 * that value. Otherwise returns the index of the lowest set bit of the mask
 * built from the physical page frame number and the remaining page count.
 */
static unsigned long order_at(struct resource *res, unsigned long pgoff)
|
||||
{
|
||||
unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff;
|
||||
unsigned long nr_pages, mask;
|
||||
|
||||
nr_pages = PHYS_PFN(resource_size(res));
|
||||
/* @pgoff has walked past the last page of the resource: end marker */
if (nr_pages == pgoff)
|
||||
return ULONG_MAX;
|
||||
|
||||
/*
|
||||
* What is the largest aligned power-of-2 range available from
|
||||
* this resource pgoff to the end of the resource range,
|
||||
* considering the alignment of the current pgoff?
|
||||
*/
|
||||
mask = phys_pgoff | rounddown_pow_of_two(nr_pages - pgoff);
|
||||
/* no bit set means there is no lowest-bit index to report */
if (!mask)
|
||||
return ULONG_MAX;
|
||||
|
||||
/* lowest set bit bounds both the pfn alignment and the remaining size */
return find_first_bit(&mask, BITS_PER_LONG);
|
||||
}
|
||||
|
||||
/*
 * foreach_order_pgoff - walk @res in chunks of 2^@order pages.
 *
 * Starts with @pgoff at 0, asks order_at() for the order at each position,
 * and advances @pgoff by 1UL << @order per step. The loop ends when
 * order_at() returns ULONG_MAX. @order and @pgoff must be lvalues supplied
 * by the caller; both are overwritten on every iteration.
 */
#define foreach_order_pgoff(res, order, pgoff) \
|
||||
for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \
|
||||
pgoff += 1UL << order, order = order_at((res), pgoff))
|
||||
|
||||
static void pgmap_radix_release(struct resource *res)
|
||||
{
|
||||
resource_size_t key, align_start, align_size, align_end;
|
||||
|
||||
align_start = res->start & ~(SECTION_SIZE - 1);
|
||||
align_size = ALIGN(resource_size(res), SECTION_SIZE);
|
||||
align_end = align_start + align_size - 1;
|
||||
unsigned long pgoff, order;
|
||||
|
||||
mutex_lock(&pgmap_lock);
|
||||
for (key = res->start; key <= res->end; key += SECTION_SIZE)
|
||||
radix_tree_delete(&pgmap_radix, key >> PA_SECTION_SHIFT);
|
||||
foreach_order_pgoff(res, order, pgoff)
|
||||
radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff);
|
||||
mutex_unlock(&pgmap_lock);
|
||||
|
||||
synchronize_rcu();
|
||||
}
|
||||
|
||||
static unsigned long pfn_first(struct page_map *page_map)
|
||||
@ -268,7 +291,7 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
|
||||
|
||||
WARN_ON_ONCE(!rcu_read_lock_held());
|
||||
|
||||
page_map = radix_tree_lookup(&pgmap_radix, phys >> PA_SECTION_SHIFT);
|
||||
page_map = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys));
|
||||
return page_map ? &page_map->pgmap : NULL;
|
||||
}
|
||||
|
||||
@ -293,12 +316,12 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
|
||||
void *devm_memremap_pages(struct device *dev, struct resource *res,
|
||||
struct percpu_ref *ref, struct vmem_altmap *altmap)
|
||||
{
|
||||
resource_size_t key, align_start, align_size, align_end;
|
||||
resource_size_t align_start, align_size, align_end;
|
||||
unsigned long pfn, pgoff, order;
|
||||
pgprot_t pgprot = PAGE_KERNEL;
|
||||
struct dev_pagemap *pgmap;
|
||||
struct page_map *page_map;
|
||||
int error, nid, is_ram;
|
||||
unsigned long pfn;
|
||||
|
||||
align_start = res->start & ~(SECTION_SIZE - 1);
|
||||
align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
|
||||
@ -337,11 +360,12 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
|
||||
mutex_lock(&pgmap_lock);
|
||||
error = 0;
|
||||
align_end = align_start + align_size - 1;
|
||||
for (key = align_start; key <= align_end; key += SECTION_SIZE) {
|
||||
|
||||
foreach_order_pgoff(res, order, pgoff) {
|
||||
struct dev_pagemap *dup;
|
||||
|
||||
rcu_read_lock();
|
||||
dup = find_dev_pagemap(key);
|
||||
dup = find_dev_pagemap(res->start + PFN_PHYS(pgoff));
|
||||
rcu_read_unlock();
|
||||
if (dup) {
|
||||
dev_err(dev, "%s: %pr collides with mapping for %s\n",
|
||||
@ -349,8 +373,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
|
||||
error = -EBUSY;
|
||||
break;
|
||||
}
|
||||
error = radix_tree_insert(&pgmap_radix, key >> PA_SECTION_SHIFT,
|
||||
page_map);
|
||||
error = __radix_tree_insert(&pgmap_radix,
|
||||
PHYS_PFN(res->start) + pgoff, order, page_map);
|
||||
if (error) {
|
||||
dev_err(dev, "%s: failed: %d\n", __func__, error);
|
||||
break;
|
||||
|
@ -678,6 +678,7 @@ config ZONE_DEVICE
|
||||
depends on MEMORY_HOTREMOVE
|
||||
depends on SPARSEMEM_VMEMMAP
|
||||
depends on ARCH_HAS_ZONE_DEVICE
|
||||
select RADIX_TREE_MULTIORDER
|
||||
|
||||
help
|
||||
Device memory hotplug support allows for establishing pmem,
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user