mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-01 10:42:11 +00:00
Merge branch 'akpm' (patches from Andrew)
Merge patch-bomb from Andrew Morton: - inotify tweaks - some ocfs2 updates (many more are awaiting review) - various misc bits - kernel/watchdog.c updates - Some of mm. I have a huge number of MM patches this time and quite a lot of it is quite difficult and much will be held over to next time. * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (162 commits) selftests: vm: add tests for lock on fault mm: mlock: add mlock flags to enable VM_LOCKONFAULT usage mm: introduce VM_LOCKONFAULT mm: mlock: add new mlock system call mm: mlock: refactor mlock, munlock, and munlockall code kasan: always taint kernel on report mm, slub, kasan: enable user tracking by default with KASAN=y kasan: use IS_ALIGNED in memory_is_poisoned_8() kasan: Fix a type conversion error lib: test_kasan: add some testcases kasan: update reference to kasan prototype repo kasan: move KASAN_SANITIZE in arch/x86/boot/Makefile kasan: various fixes in documentation kasan: update log messages kasan: accurately determine the type of the bad access kasan: update reported bug types for kernel memory accesses kasan: update reported bug types for not user nor kernel memory accesses mm/kasan: prevent deadlock in kasan reporting mm/kasan: don't use kasan shadow pointer in generic functions mm/kasan: MODULE_VADDR is not available on all archs ...
This commit is contained in:
commit
2e3078af2c
@ -175,6 +175,7 @@ read the file /proc/PID/status:
|
||||
VmLib: 1412 kB
|
||||
VmPTE: 20 kB
|
||||
VmSwap: 0 kB
|
||||
HugetlbPages: 0 kB
|
||||
Threads: 1
|
||||
SigQ: 0/28578
|
||||
SigPnd: 0000000000000000
|
||||
@ -238,6 +239,7 @@ Table 1-2: Contents of the status files (as of 4.1)
|
||||
VmPTE size of page table entries
|
||||
VmPMD size of second level page tables
|
||||
VmSwap size of swap usage (the number of referred swapents)
|
||||
HugetlbPages size of hugetlb memory portions
|
||||
Threads number of threads
|
||||
SigQ number of signals queued/max. number for queue
|
||||
SigPnd bitmap of pending signals for the thread
|
||||
@ -424,12 +426,15 @@ Private_Clean: 0 kB
|
||||
Private_Dirty: 0 kB
|
||||
Referenced: 892 kB
|
||||
Anonymous: 0 kB
|
||||
AnonHugePages: 0 kB
|
||||
Shared_Hugetlb: 0 kB
|
||||
Private_Hugetlb: 0 kB
|
||||
Swap: 0 kB
|
||||
SwapPss: 0 kB
|
||||
KernelPageSize: 4 kB
|
||||
MMUPageSize: 4 kB
|
||||
Locked: 374 kB
|
||||
VmFlags: rd ex mr mw me de
|
||||
Locked: 0 kB
|
||||
VmFlags: rd ex mr mw me dw
|
||||
|
||||
the first of these lines shows the same information as is displayed for the
|
||||
mapping in /proc/PID/maps. The remaining lines show the size of the mapping
|
||||
@ -449,9 +454,14 @@ accessed.
|
||||
"Anonymous" shows the amount of memory that does not belong to any file. Even
|
||||
a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE
|
||||
and a page is modified, the file page is replaced by a private anonymous copy.
|
||||
"Swap" shows how much would-be-anonymous memory is also used, but out on
|
||||
swap.
|
||||
"AnonHugePages" shows the amount of memory backed by transparent hugepage.
|
||||
"Shared_Hugetlb" and "Private_Hugetlb" show the amounts of memory backed by
|
||||
hugetlbfs page which is *not* counted in "RSS" or "PSS" field for historical
|
||||
reasons. And these are not included in {Shared,Private}_{Clean,Dirty} field.
|
||||
"Swap" shows how much would-be-anonymous memory is also used, but out on swap.
|
||||
"SwapPss" shows proportional swap share of this mapping.
|
||||
"Locked" indicates whether the mapping is locked in memory or not.
|
||||
|
||||
"VmFlags" field deserves a separate description. This member represents the kernel
|
||||
flags associated with the particular virtual memory area in two letter encoded
|
||||
manner. The codes are the following:
|
||||
@ -475,7 +485,6 @@ manner. The codes are the following:
|
||||
ac - area is accountable
|
||||
nr - swap space is not reserved for the area
|
||||
ht - area uses huge tlb pages
|
||||
nl - non-linear mapping
|
||||
ar - architecture specific flag
|
||||
dd - do not include area into core dump
|
||||
sd - soft-dirty flag
|
||||
@ -815,9 +824,6 @@ varies by architecture and compile options. The following is from a
|
||||
|
||||
> cat /proc/meminfo
|
||||
|
||||
The "Locked" indicates whether the mapping is locked in memory or not.
|
||||
|
||||
|
||||
MemTotal: 16344972 kB
|
||||
MemFree: 13634064 kB
|
||||
MemAvailable: 14836172 kB
|
||||
|
@ -1,36 +1,34 @@
|
||||
Kernel address sanitizer
|
||||
================
|
||||
KernelAddressSanitizer (KASAN)
|
||||
==============================
|
||||
|
||||
0. Overview
|
||||
===========
|
||||
|
||||
Kernel Address sanitizer (KASan) is a dynamic memory error detector. It provides
|
||||
KernelAddressSANitizer (KASAN) is a dynamic memory error detector. It provides
|
||||
a fast and comprehensive solution for finding use-after-free and out-of-bounds
|
||||
bugs.
|
||||
|
||||
KASan uses compile-time instrumentation for checking every memory access,
|
||||
therefore you will need a gcc version of 4.9.2 or later. KASan could detect out
|
||||
of bounds accesses to stack or global variables, but only if gcc 5.0 or later was
|
||||
used to build the kernel.
|
||||
KASAN uses compile-time instrumentation for checking every memory access,
|
||||
therefore you will need a GCC version 4.9.2 or later. GCC 5.0 or later is
|
||||
required for detection of out-of-bounds accesses to stack or global variables.
|
||||
|
||||
Currently KASan is supported only for x86_64 architecture and requires that the
|
||||
kernel be built with the SLUB allocator.
|
||||
Currently KASAN is supported only for x86_64 architecture and requires the
|
||||
kernel to be built with the SLUB allocator.
|
||||
|
||||
1. Usage
|
||||
=========
|
||||
========
|
||||
|
||||
To enable KASAN configure kernel with:
|
||||
|
||||
CONFIG_KASAN = y
|
||||
|
||||
and choose between CONFIG_KASAN_OUTLINE and CONFIG_KASAN_INLINE. Outline/inline
|
||||
are compiler instrumentation types. The former produces a smaller binary; the
|
||||
latter is 1.1 - 2 times faster. Inline instrumentation requires a gcc version
|
||||
of 5.0 or later.
|
||||
and choose between CONFIG_KASAN_OUTLINE and CONFIG_KASAN_INLINE. Outline and
|
||||
inline are compiler instrumentation types. The former produces a smaller binary;
|
||||
the latter is 1.1 - 2 times faster. Inline instrumentation requires a GCC
|
||||
version 5.0 or later.
|
||||
|
||||
Currently KASAN works only with the SLUB memory allocator.
|
||||
For better bug detection and nicer report, enable CONFIG_STACKTRACE and put
|
||||
at least 'slub_debug=U' in the boot cmdline.
|
||||
For better bug detection and nicer reporting, enable CONFIG_STACKTRACE.
|
||||
|
||||
To disable instrumentation for specific files or directories, add a line
|
||||
similar to the following to the respective kernel Makefile:
|
||||
@ -42,7 +40,7 @@ similar to the following to the respective kernel Makefile:
|
||||
KASAN_SANITIZE := n
|
||||
|
||||
1.1 Error reports
|
||||
==========
|
||||
=================
|
||||
|
||||
A typical out of bounds access report looks like this:
|
||||
|
||||
@ -119,14 +117,16 @@ Memory state around the buggy address:
|
||||
ffff8800693bc800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
|
||||
==================================================================
|
||||
|
||||
First sections describe slub object where bad access happened.
|
||||
See 'SLUB Debug output' section in Documentation/vm/slub.txt for details.
|
||||
The header of the report describes what kind of bug happened and what kind of
|
||||
access caused it. It's followed by the description of the accessed slub object
|
||||
(see 'SLUB Debug output' section in Documentation/vm/slub.txt for details) and
|
||||
the description of the accessed memory page.
|
||||
|
||||
In the last section the report shows memory state around the accessed address.
|
||||
Reading this part requires some more understanding of how KASAN works.
|
||||
Reading this part requires some understanding of how KASAN works.
|
||||
|
||||
Each 8 bytes of memory are encoded in one shadow byte as accessible,
|
||||
partially accessible, freed or they can be part of a redzone.
|
||||
The state of each 8 aligned bytes of memory is encoded in one shadow byte.
|
||||
Those 8 bytes can be accessible, partially accessible, freed or be a redzone.
|
||||
We use the following encoding for each shadow byte: 0 means that all 8 bytes
|
||||
of the corresponding memory region are accessible; number N (1 <= N <= 7) means
|
||||
that the first N bytes are accessible, and other (8 - N) bytes are not;
|
||||
@ -139,7 +139,7 @@ the accessed address is partially accessible.
|
||||
|
||||
|
||||
2. Implementation details
|
||||
========================
|
||||
=========================
|
||||
|
||||
From a high level, our approach to memory error detection is similar to that
|
||||
of kmemcheck: use shadow memory to record whether each byte of memory is safe
|
||||
|
@ -1275,6 +1275,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
||||
Format: <unsigned int> such that (rxsize & ~0x1fffc0) == 0.
|
||||
Default: 1024
|
||||
|
||||
hardlockup_all_cpu_backtrace=
|
||||
[KNL] Should the hard-lockup detector generate
|
||||
backtraces on all cpus.
|
||||
Format: <integer>
|
||||
|
||||
hashdist= [KNL,NUMA] Large hashes allocated during boot
|
||||
are distributed across NUMA nodes. Defaults on
|
||||
for 64-bit NUMA, off otherwise.
|
||||
|
@ -20,8 +20,9 @@ kernel mode for more than 10 seconds (see "Implementation" below for
|
||||
details), without letting other interrupts have a chance to run.
|
||||
Similarly to the softlockup case, the current stack trace is displayed
|
||||
upon detection and the system will stay locked up unless the default
|
||||
behavior is changed, which can be done through a compile time knob,
|
||||
"BOOTPARAM_HARDLOCKUP_PANIC", and a kernel parameter, "nmi_watchdog"
|
||||
behavior is changed, which can be done through a sysctl,
|
||||
'hardlockup_panic', a compile time knob, "BOOTPARAM_HARDLOCKUP_PANIC",
|
||||
and a kernel parameter, "nmi_watchdog"
|
||||
(see "Documentation/kernel-parameters.txt" for details).
|
||||
|
||||
The panic option can be used in combination with panic_timeout (this
|
||||
|
@ -33,6 +33,7 @@ show up in /proc/sys/kernel:
|
||||
- domainname
|
||||
- hostname
|
||||
- hotplug
|
||||
- hardlockup_all_cpu_backtrace
|
||||
- hung_task_panic
|
||||
- hung_task_check_count
|
||||
- hung_task_timeout_secs
|
||||
@ -292,6 +293,17 @@ Information Service) or YP (Yellow Pages) domainname. These two
|
||||
domain names are in general different. For a detailed discussion
|
||||
see the hostname(1) man page.
|
||||
|
||||
==============================================================
|
||||
hardlockup_all_cpu_backtrace:
|
||||
|
||||
This value controls the hard lockup detector behavior when a hard
|
||||
lockup condition is detected as to whether or not to gather further
|
||||
debug information. If enabled, arch-specific all-CPU stack dumping
|
||||
will be initiated.
|
||||
|
||||
0: do nothing. This is the default behavior.
|
||||
|
||||
1: on detection capture more debug information.
|
||||
==============================================================
|
||||
|
||||
hotplug:
|
||||
|
@ -92,29 +92,26 @@ Steps:
|
||||
|
||||
2. Ensure that writeback is complete.
|
||||
|
||||
3. Prep the new page that we want to move to. It is locked
|
||||
and set to not being uptodate so that all accesses to the new
|
||||
page immediately lock while the move is in progress.
|
||||
3. Lock the new page that we want to move to. It is locked so that accesses to
|
||||
this (not yet uptodate) page immediately lock while the move is in progress.
|
||||
|
||||
4. The new page is prepped with some settings from the old page so that
|
||||
accesses to the new page will discover a page with the correct settings.
|
||||
4. All the page table references to the page are converted to migration
|
||||
entries. This decreases the mapcount of a page. If the resulting
|
||||
mapcount is not zero then we do not migrate the page. All user space
|
||||
processes that attempt to access the page will now wait on the page lock.
|
||||
|
||||
5. All the page table references to the page are converted
|
||||
to migration entries or dropped (nonlinear vmas).
|
||||
This decreases the mapcount of a page. If the resulting
|
||||
mapcount is not zero then we do not migrate the page.
|
||||
All user space processes that attempt to access the page
|
||||
will now wait on the page lock.
|
||||
|
||||
6. The radix tree lock is taken. This will cause all processes trying
|
||||
5. The radix tree lock is taken. This will cause all processes trying
|
||||
to access the page via the mapping to block on the radix tree spinlock.
|
||||
|
||||
7. The refcount of the page is examined and we back out if references remain
|
||||
6. The refcount of the page is examined and we back out if references remain
|
||||
otherwise we know that we are the only one referencing this page.
|
||||
|
||||
8. The radix tree is checked and if it does not contain the pointer to this
|
||||
7. The radix tree is checked and if it does not contain the pointer to this
|
||||
page then we back out because someone else modified the radix tree.
|
||||
|
||||
8. The new page is prepped with some settings from the old page so that
|
||||
accesses to the new page will discover a page with the correct settings.
|
||||
|
||||
9. The radix tree is changed to point to the new page.
|
||||
|
||||
10. The reference count of the old page is dropped because the radix tree
|
||||
|
@ -170,6 +170,16 @@ A lower value leads to gain less thp performance. Value of
|
||||
max_ptes_none can waste cpu time very little, you can
|
||||
ignore it.
|
||||
|
||||
max_ptes_swap specifies how many pages can be brought in from
|
||||
swap when collapsing a group of pages into a transparent huge page.
|
||||
|
||||
/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_swap
|
||||
|
||||
A higher value can cause excessive swap IO and waste
|
||||
memory. A lower value can prevent THPs from being
|
||||
collapsed, resulting in fewer pages being collapsed into
|
||||
THPs, and lower memory access performance.
|
||||
|
||||
== Boot parameter ==
|
||||
|
||||
You can change the sysfs boot time defaults of Transparent Hugepage
|
||||
|
@ -531,83 +531,20 @@ map.
|
||||
|
||||
try_to_unmap() is always called, by either vmscan for reclaim or for page
|
||||
migration, with the argument page locked and isolated from the LRU. Separate
|
||||
functions handle anonymous and mapped file pages, as these types of pages have
|
||||
different reverse map mechanisms.
|
||||
functions handle anonymous and mapped file and KSM pages, as these types of
|
||||
pages have different reverse map lookup mechanisms, with different locking.
|
||||
In each case, whether rmap_walk_anon() or rmap_walk_file() or rmap_walk_ksm(),
|
||||
it will call try_to_unmap_one() for every VMA which might contain the page.
|
||||
|
||||
(*) try_to_unmap_anon()
|
||||
When trying to reclaim, if try_to_unmap_one() finds the page in a VM_LOCKED
|
||||
VMA, it will then mlock the page via mlock_vma_page() instead of unmapping it,
|
||||
and return SWAP_MLOCK to indicate that the page is unevictable: and the scan
|
||||
stops there.
|
||||
|
||||
To unmap anonymous pages, each VMA in the list anchored in the anon_vma
|
||||
must be visited - at least until a VM_LOCKED VMA is encountered. If the
|
||||
page is being unmapped for migration, VM_LOCKED VMAs do not stop the
|
||||
process because mlocked pages are migratable. However, for reclaim, if
|
||||
the page is mapped into a VM_LOCKED VMA, the scan stops.
|
||||
|
||||
try_to_unmap_anon() attempts to acquire in read mode the mmap semaphore of
|
||||
the mm_struct to which the VMA belongs. If this is successful, it will
|
||||
mlock the page via mlock_vma_page() - we wouldn't have gotten to
|
||||
try_to_unmap_anon() if the page were already mlocked - and will return
|
||||
SWAP_MLOCK, indicating that the page is unevictable.
|
||||
|
||||
If the mmap semaphore cannot be acquired, we are not sure whether the page
|
||||
is really unevictable or not. In this case, try_to_unmap_anon() will
|
||||
return SWAP_AGAIN.
|
||||
|
||||
(*) try_to_unmap_file() - linear mappings
|
||||
|
||||
Unmapping of a mapped file page works the same as for anonymous mappings,
|
||||
except that the scan visits all VMAs that map the page's index/page offset
|
||||
in the page's mapping's reverse map priority search tree. It also visits
|
||||
each VMA in the page's mapping's non-linear list, if the list is
|
||||
non-empty.
|
||||
|
||||
As for anonymous pages, on encountering a VM_LOCKED VMA for a mapped file
|
||||
page, try_to_unmap_file() will attempt to acquire the associated
|
||||
mm_struct's mmap semaphore to mlock the page, returning SWAP_MLOCK if this
|
||||
is successful, and SWAP_AGAIN, if not.
|
||||
|
||||
(*) try_to_unmap_file() - non-linear mappings
|
||||
|
||||
If a page's mapping contains a non-empty non-linear mapping VMA list, then
|
||||
try_to_un{map|lock}() must also visit each VMA in that list to determine
|
||||
whether the page is mapped in a VM_LOCKED VMA. Again, the scan must visit
|
||||
all VMAs in the non-linear list to ensure that the page is not/should not
|
||||
be mlocked.
|
||||
|
||||
If a VM_LOCKED VMA is found in the list, the scan could terminate.
|
||||
However, there is no easy way to determine whether the page is actually
|
||||
mapped in a given VMA - either for unmapping or testing whether the
|
||||
VM_LOCKED VMA actually pins the page.
|
||||
|
||||
try_to_unmap_file() handles non-linear mappings by scanning a certain
|
||||
number of pages - a "cluster" - in each non-linear VMA associated with the
|
||||
page's mapping, for each file mapped page that vmscan tries to unmap. If
|
||||
this happens to unmap the page we're trying to unmap, try_to_unmap() will
|
||||
notice this on return (page_mapcount(page) will be 0) and return
|
||||
SWAP_SUCCESS. Otherwise, it will return SWAP_AGAIN, causing vmscan to
|
||||
recirculate this page. We take advantage of the cluster scan in
|
||||
try_to_unmap_cluster() as follows:
|
||||
|
||||
For each non-linear VMA, try_to_unmap_cluster() attempts to acquire the
|
||||
mmap semaphore of the associated mm_struct for read without blocking.
|
||||
|
||||
If this attempt is successful and the VMA is VM_LOCKED,
|
||||
try_to_unmap_cluster() will retain the mmap semaphore for the scan;
|
||||
otherwise it drops it here.
|
||||
|
||||
Then, for each page in the cluster, if we're holding the mmap semaphore
|
||||
for a locked VMA, try_to_unmap_cluster() calls mlock_vma_page() to
|
||||
mlock the page. This call is a no-op if the page is already locked,
|
||||
but will mlock any pages in the non-linear mapping that happen to be
|
||||
unlocked.
|
||||
|
||||
If one of the pages so mlocked is the page passed in to try_to_unmap(),
|
||||
try_to_unmap_cluster() will return SWAP_MLOCK, rather than the default
|
||||
SWAP_AGAIN. This will allow vmscan to cull the page, rather than
|
||||
recirculating it on the inactive list.
|
||||
|
||||
Again, if try_to_unmap_cluster() cannot acquire the VMA's mmap sem, it
|
||||
returns SWAP_AGAIN, indicating that the page is mapped by a VM_LOCKED
|
||||
VMA, but couldn't be mlocked.
|
||||
mlock_vma_page() is called while holding the page table's lock (in addition
|
||||
to the page lock, and the rmap lock): to serialize against concurrent mlock or
|
||||
munlock or munmap system calls, mm teardown (munlock_vma_pages_all), reclaim,
|
||||
holepunching, and truncation of file pages and their anonymous COWed pages.
|
||||
|
||||
|
||||
try_to_munlock() REVERSE MAP SCAN
|
||||
@ -623,29 +560,15 @@ all PTEs from the page. For this purpose, the unevictable/mlock infrastructure
|
||||
introduced a variant of try_to_unmap() called try_to_munlock().
|
||||
|
||||
try_to_munlock() calls the same functions as try_to_unmap() for anonymous and
|
||||
mapped file pages with an additional argument specifying unlock versus unmap
|
||||
mapped file and KSM pages with a flag argument specifying unlock versus unmap
|
||||
processing. Again, these functions walk the respective reverse maps looking
|
||||
for VM_LOCKED VMAs. When such a VMA is found for anonymous pages and file
|
||||
pages mapped in linear VMAs, as in the try_to_unmap() case, the functions
|
||||
attempt to acquire the associated mmap semaphore, mlock the page via
|
||||
mlock_vma_page() and return SWAP_MLOCK. This effectively undoes the
|
||||
pre-clearing of the page's PG_mlocked done by munlock_vma_page.
|
||||
|
||||
If try_to_unmap() is unable to acquire a VM_LOCKED VMA's associated mmap
|
||||
semaphore, it will return SWAP_AGAIN. This will allow shrink_page_list() to
|
||||
recycle the page on the inactive list and hope that it has better luck with the
|
||||
page next time.
|
||||
|
||||
For file pages mapped into non-linear VMAs, the try_to_munlock() logic works
|
||||
slightly differently. On encountering a VM_LOCKED non-linear VMA that might
|
||||
map the page, try_to_munlock() returns SWAP_AGAIN without actually mlocking the
|
||||
page. munlock_vma_page() will just leave the page unlocked and let vmscan deal
|
||||
with it - the usual fallback position.
|
||||
for VM_LOCKED VMAs. When such a VMA is found, as in the try_to_unmap() case,
|
||||
the functions mlock the page via mlock_vma_page() and return SWAP_MLOCK. This
|
||||
undoes the pre-clearing of the page's PG_mlocked done by munlock_vma_page.
|
||||
|
||||
Note that try_to_munlock()'s reverse map walk must visit every VMA in a page's
|
||||
reverse map to determine that a page is NOT mapped into any VM_LOCKED VMA.
|
||||
However, the scan can terminate when it encounters a VM_LOCKED VMA and can
|
||||
successfully acquire the VMA's mmap semaphore for read and mlock the page.
|
||||
However, the scan can terminate when it encounters a VM_LOCKED VMA.
|
||||
Although try_to_munlock() might be called a great many times when munlocking a
|
||||
large region or tearing down a large address space that has been mlocked via
|
||||
mlockall(), overall this is a fairly rare event.
|
||||
@ -673,11 +596,6 @@ Some examples of these unevictable pages on the LRU lists are:
|
||||
(3) mlocked pages that could not be isolated from the LRU and moved to the
|
||||
unevictable list in mlock_vma_page().
|
||||
|
||||
(4) Pages mapped into multiple VM_LOCKED VMAs, but try_to_munlock() couldn't
|
||||
acquire the VMA's mmap semaphore to test the flags and set PageMlocked.
|
||||
munlock_vma_page() was forced to let the page back on to the normal LRU
|
||||
list for vmscan to handle.
|
||||
|
||||
shrink_inactive_list() also diverts any unevictable pages that it finds on the
|
||||
inactive lists to the appropriate zone's unevictable list.
|
||||
|
||||
|
@ -37,6 +37,9 @@
|
||||
|
||||
#define MCL_CURRENT 8192 /* lock all currently mapped pages */
|
||||
#define MCL_FUTURE 16384 /* lock all additions to address space */
|
||||
#define MCL_ONFAULT 32768 /* lock all pages that are faulted in */
|
||||
|
||||
#define MLOCK_ONFAULT 0x01 /* Lock pages in range after they are faulted in, do not prefault */
|
||||
|
||||
#define MADV_NORMAL 0 /* no further special treatment */
|
||||
#define MADV_RANDOM 1 /* expect random page references */
|
||||
|
@ -803,7 +803,7 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
fault = probe_kernel_address(instrptr, instr);
|
||||
fault = probe_kernel_address((void *)instrptr, instr);
|
||||
instr = __mem_to_opcode_arm(instr);
|
||||
}
|
||||
|
||||
|
@ -61,6 +61,12 @@
|
||||
*/
|
||||
#define MCL_CURRENT 1 /* lock all current mappings */
|
||||
#define MCL_FUTURE 2 /* lock all future mappings */
|
||||
#define MCL_ONFAULT 4 /* lock all pages that are faulted in */
|
||||
|
||||
/*
|
||||
* Flags for mlock
|
||||
*/
|
||||
#define MLOCK_ONFAULT 0x01 /* Lock pages in range after they are faulted in, do not prefault */
|
||||
|
||||
#define MADV_NORMAL 0 /* no further special treatment */
|
||||
#define MADV_RANDOM 1 /* expect random page references */
|
||||
|
@ -31,6 +31,9 @@
|
||||
|
||||
#define MCL_CURRENT 1 /* lock all current mappings */
|
||||
#define MCL_FUTURE 2 /* lock all future mappings */
|
||||
#define MCL_ONFAULT 4 /* lock all pages that are faulted in */
|
||||
|
||||
#define MLOCK_ONFAULT 0x01 /* Lock pages in range after they are faulted in, do not prefault */
|
||||
|
||||
#define MADV_NORMAL 0 /* no further special treatment */
|
||||
#define MADV_RANDOM 1 /* expect random page references */
|
||||
|
@ -22,6 +22,7 @@
|
||||
|
||||
#define MCL_CURRENT 0x2000 /* lock all currently mapped pages */
|
||||
#define MCL_FUTURE 0x4000 /* lock all additions to address space */
|
||||
#define MCL_ONFAULT 0x8000 /* lock all pages that are faulted in */
|
||||
|
||||
#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */
|
||||
#define MAP_NONBLOCK 0x10000 /* do not block on IO */
|
||||
|
@ -80,7 +80,7 @@ static void __init setup_node_to_cpumask_map(void)
|
||||
setup_nr_node_ids();
|
||||
|
||||
/* allocate the map */
|
||||
for (node = 0; node < nr_node_ids; node++)
|
||||
for_each_node(node)
|
||||
alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
|
||||
|
||||
/* cpumask_of_node() will now work */
|
||||
|
@ -999,7 +999,7 @@ int fsl_pci_mcheck_exception(struct pt_regs *regs)
|
||||
ret = get_user(regs->nip, &inst);
|
||||
pagefault_enable();
|
||||
} else {
|
||||
ret = probe_kernel_address(regs->nip, inst);
|
||||
ret = probe_kernel_address((void *)regs->nip, inst);
|
||||
}
|
||||
|
||||
if (mcheck_handle_load(regs, inst)) {
|
||||
|
@ -17,6 +17,7 @@
|
||||
|
||||
#define MCL_CURRENT 0x2000 /* lock all currently mapped pages */
|
||||
#define MCL_FUTURE 0x4000 /* lock all additions to address space */
|
||||
#define MCL_ONFAULT 0x8000 /* lock all pages that are faulted in */
|
||||
|
||||
#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */
|
||||
#define MAP_NONBLOCK 0x10000 /* do not block on IO */
|
||||
|
@ -36,6 +36,7 @@
|
||||
*/
|
||||
#define MCL_CURRENT 1 /* lock all current mappings */
|
||||
#define MCL_FUTURE 2 /* lock all future mappings */
|
||||
#define MCL_ONFAULT 4 /* lock all pages that are faulted in */
|
||||
|
||||
|
||||
#endif /* _ASM_TILE_MMAN_H */
|
||||
|
@ -9,13 +9,13 @@
|
||||
# Changed by many, many contributors over the years.
|
||||
#
|
||||
|
||||
KASAN_SANITIZE := n
|
||||
|
||||
# If you want to preset the SVGA mode, uncomment the next line and
|
||||
# set SVGA_MODE to whatever number you want.
|
||||
# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
|
||||
# The number is the same as you would ordinarily press at bootup.
|
||||
|
||||
KASAN_SANITIZE := n
|
||||
|
||||
SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
|
||||
|
||||
targets := vmlinux.bin setup.bin setup.elf bzImage
|
||||
|
@ -382,3 +382,4 @@
|
||||
373 i386 shutdown sys_shutdown
|
||||
374 i386 userfaultfd sys_userfaultfd
|
||||
375 i386 membarrier sys_membarrier
|
||||
376 i386 mlock2 sys_mlock2
|
||||
|
@ -331,6 +331,7 @@
|
||||
322 64 execveat stub_execveat
|
||||
323 common userfaultfd sys_userfaultfd
|
||||
324 common membarrier sys_membarrier
|
||||
325 common mlock2 sys_mlock2
|
||||
|
||||
#
|
||||
# x32-specific system call numbers start at 512 to avoid cache impact
|
||||
|
@ -126,5 +126,5 @@ void __init kasan_init(void)
|
||||
__flush_tlb_all();
|
||||
init_task.kasan_depth = 0;
|
||||
|
||||
pr_info("Kernel address sanitizer initialized\n");
|
||||
pr_info("KernelAddressSanitizer initialized\n");
|
||||
}
|
||||
|
@ -74,6 +74,12 @@
|
||||
*/
|
||||
#define MCL_CURRENT 1 /* lock all current mappings */
|
||||
#define MCL_FUTURE 2 /* lock all future mappings */
|
||||
#define MCL_ONFAULT 4 /* lock all pages that are faulted in */
|
||||
|
||||
/*
|
||||
* Flags for mlock
|
||||
*/
|
||||
#define MLOCK_ONFAULT 0x01 /* Lock pages in range after they are faulted in, do not prefault */
|
||||
|
||||
#define MADV_NORMAL 0 /* no further special treatment */
|
||||
#define MADV_RANDOM 1 /* expect random page references */
|
||||
|
@ -231,7 +231,8 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
|
||||
if (res < 0 && fl->fl_type != F_UNLCK) {
|
||||
fl_type = fl->fl_type;
|
||||
fl->fl_type = F_UNLCK;
|
||||
res = locks_lock_file_wait(filp, fl);
|
||||
/* Even if this fails we want to return the remote error */
|
||||
locks_lock_file_wait(filp, fl);
|
||||
fl->fl_type = fl_type;
|
||||
}
|
||||
out:
|
||||
|
@ -2149,7 +2149,12 @@ static void wait_sb_inodes(struct super_block *sb)
|
||||
iput(old_inode);
|
||||
old_inode = inode;
|
||||
|
||||
filemap_fdatawait(mapping);
|
||||
/*
|
||||
* We keep the error status of individual mapping so that
|
||||
* applications can catch the writeback error using fsync(2).
|
||||
* See filemap_fdatawait_keep_errors() for details.
|
||||
*/
|
||||
filemap_fdatawait_keep_errors(mapping);
|
||||
|
||||
cond_resched();
|
||||
|
||||
|
@ -81,7 +81,7 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
|
||||
unsigned int max_pages;
|
||||
int i;
|
||||
|
||||
max_pages = min(nr_pages, BIO_MAX_PAGES);
|
||||
max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES);
|
||||
|
||||
bio = bio_alloc(GFP_NOFS, max_pages);
|
||||
BUG_ON(!bio);
|
||||
@ -171,7 +171,7 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
|
||||
unsigned int max_pages;
|
||||
int i;
|
||||
|
||||
max_pages = min(nr_pages, BIO_MAX_PAGES);
|
||||
max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES);
|
||||
|
||||
bio = bio_alloc(GFP_NOFS, max_pages);
|
||||
BUG_ON(!bio);
|
||||
|
@ -83,9 +83,16 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
|
||||
inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
|
||||
inode = igrab(mark->inode);
|
||||
if (inode) {
|
||||
/*
|
||||
* IN_ALL_EVENTS represents all of the mask bits
|
||||
* that we expose to userspace. There is at
|
||||
* least one bit (FS_EVENT_ON_CHILD) which is
|
||||
* used only internally to the kernel.
|
||||
*/
|
||||
u32 mask = mark->mask & IN_ALL_EVENTS;
|
||||
seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ",
|
||||
inode_mark->wd, inode->i_ino, inode->i_sb->s_dev,
|
||||
mark->mask, mark->ignored_mask);
|
||||
mask, mark->ignored_mask);
|
||||
show_mark_fhandle(m, inode);
|
||||
seq_putc(m, '\n');
|
||||
iput(inode);
|
||||
|
@ -706,7 +706,19 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
|
||||
int ret;
|
||||
unsigned flags = 0;
|
||||
|
||||
/* don't allow invalid bits: we don't want flags set */
|
||||
/*
|
||||
* We share a lot of code with fs/dnotify. We also share
|
||||
* the bit layout between inotify's IN_* and the fsnotify
|
||||
* FS_*. This check ensures that only the inotify IN_*
|
||||
* bits get passed in and set in watches/events.
|
||||
*/
|
||||
if (unlikely(mask & ~ALL_INOTIFY_BITS))
|
||||
return -EINVAL;
|
||||
/*
|
||||
* Require at least one valid bit set in the mask.
|
||||
* Without _something_ set, we would have no events to
|
||||
* watch for.
|
||||
*/
|
||||
if (unlikely(!(mask & ALL_INOTIFY_BITS)))
|
||||
return -EINVAL;
|
||||
|
||||
|
@ -589,6 +589,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
|
||||
ret = -EIO;
|
||||
goto bail;
|
||||
}
|
||||
set_buffer_new(bh_result);
|
||||
up_write(&OCFS2_I(inode)->ip_alloc_sem);
|
||||
}
|
||||
|
||||
@ -864,6 +865,7 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
|
||||
is_overwrite = ocfs2_is_overwrite(osb, inode, offset);
|
||||
if (is_overwrite < 0) {
|
||||
mlog_errno(is_overwrite);
|
||||
ret = is_overwrite;
|
||||
ocfs2_inode_unlock(inode, 1);
|
||||
goto clean_orphan;
|
||||
}
|
||||
|
@ -219,7 +219,8 @@ struct o2hb_region {
|
||||
unsigned hr_unclean_stop:1,
|
||||
hr_aborted_start:1,
|
||||
hr_item_pinned:1,
|
||||
hr_item_dropped:1;
|
||||
hr_item_dropped:1,
|
||||
hr_node_deleted:1;
|
||||
|
||||
/* protected by the hr_callback_sem */
|
||||
struct task_struct *hr_task;
|
||||
@ -1078,7 +1079,13 @@ static int o2hb_thread(void *data)
|
||||
set_user_nice(current, MIN_NICE);
|
||||
|
||||
/* Pin node */
|
||||
o2nm_depend_this_node();
|
||||
ret = o2nm_depend_this_node();
|
||||
if (ret) {
|
||||
mlog(ML_ERROR, "Node has been deleted, ret = %d\n", ret);
|
||||
reg->hr_node_deleted = 1;
|
||||
wake_up(&o2hb_steady_queue);
|
||||
return 0;
|
||||
}
|
||||
|
||||
while (!kthread_should_stop() &&
|
||||
!reg->hr_unclean_stop && !reg->hr_aborted_start) {
|
||||
@ -1787,7 +1794,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
|
||||
spin_unlock(&o2hb_live_lock);
|
||||
|
||||
ret = wait_event_interruptible(o2hb_steady_queue,
|
||||
atomic_read(®->hr_steady_iterations) == 0);
|
||||
atomic_read(®->hr_steady_iterations) == 0 ||
|
||||
reg->hr_node_deleted);
|
||||
if (ret) {
|
||||
atomic_set(®->hr_steady_iterations, 0);
|
||||
reg->hr_aborted_start = 1;
|
||||
@ -1798,6 +1806,11 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
|
||||
goto out3;
|
||||
}
|
||||
|
||||
if (reg->hr_node_deleted) {
|
||||
ret = -EINVAL;
|
||||
goto out3;
|
||||
}
|
||||
|
||||
/* Ok, we were woken. Make sure it wasn't by drop_item() */
|
||||
spin_lock(&o2hb_live_lock);
|
||||
hb_task = reg->hr_task;
|
||||
|
@ -1866,6 +1866,7 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
|
||||
int status;
|
||||
unsigned int backoff;
|
||||
unsigned int total_backoff = 0;
|
||||
char wq_name[O2NM_MAX_NAME_LEN];
|
||||
|
||||
BUG_ON(!dlm);
|
||||
|
||||
@ -1895,7 +1896,8 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
|
||||
goto bail;
|
||||
}
|
||||
|
||||
dlm->dlm_worker = create_singlethread_workqueue("dlm_wq");
|
||||
snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name);
|
||||
dlm->dlm_worker = create_singlethread_workqueue(wq_name);
|
||||
if (!dlm->dlm_worker) {
|
||||
status = -ENOMEM;
|
||||
mlog_errno(status);
|
||||
|
@ -205,7 +205,7 @@ int dlm_launch_recovery_thread(struct dlm_ctxt *dlm)
|
||||
mlog(0, "starting dlm recovery thread...\n");
|
||||
|
||||
dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm,
|
||||
"dlm_reco_thread");
|
||||
"dlm_reco-%s", dlm->name);
|
||||
if (IS_ERR(dlm->dlm_reco_thread_task)) {
|
||||
mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task));
|
||||
dlm->dlm_reco_thread_task = NULL;
|
||||
|
@ -493,7 +493,8 @@ int dlm_launch_thread(struct dlm_ctxt *dlm)
|
||||
{
|
||||
mlog(0, "Starting dlm_thread...\n");
|
||||
|
||||
dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread");
|
||||
dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm-%s",
|
||||
dlm->name);
|
||||
if (IS_ERR(dlm->dlm_thread_task)) {
|
||||
mlog_errno(PTR_ERR(dlm->dlm_thread_task));
|
||||
dlm->dlm_thread_task = NULL;
|
||||
|
@ -2998,7 +2998,8 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
|
||||
}
|
||||
|
||||
/* launch downconvert thread */
|
||||
osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc");
|
||||
osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc-%s",
|
||||
osb->uuid_str);
|
||||
if (IS_ERR(osb->dc_task)) {
|
||||
status = PTR_ERR(osb->dc_task);
|
||||
osb->dc_task = NULL;
|
||||
|
@ -112,6 +112,8 @@ struct ocfs2_inode_info
|
||||
#define OCFS2_INODE_OPEN_DIRECT 0x00000020
|
||||
/* Tell the inode wipe code it's not in orphan dir */
|
||||
#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000040
|
||||
/* Entry in orphan dir with 'dio-' prefix */
|
||||
#define OCFS2_INODE_DIO_ORPHAN_ENTRY 0x00000080
|
||||
|
||||
static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
|
||||
{
|
||||
|
@ -1090,7 +1090,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
|
||||
/* Launch the commit thread */
|
||||
if (!local) {
|
||||
osb->commit_task = kthread_run(ocfs2_commit_thread, osb,
|
||||
"ocfs2cmt");
|
||||
"ocfs2cmt-%s", osb->uuid_str);
|
||||
if (IS_ERR(osb->commit_task)) {
|
||||
status = PTR_ERR(osb->commit_task);
|
||||
osb->commit_task = NULL;
|
||||
@ -1507,7 +1507,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
|
||||
goto out;
|
||||
|
||||
osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb,
|
||||
"ocfs2rec");
|
||||
"ocfs2rec-%s", osb->uuid_str);
|
||||
if (IS_ERR(osb->recovery_thread_task)) {
|
||||
mlog_errno((int)PTR_ERR(osb->recovery_thread_task));
|
||||
osb->recovery_thread_task = NULL;
|
||||
@ -2021,6 +2021,7 @@ struct ocfs2_orphan_filldir_priv {
|
||||
struct dir_context ctx;
|
||||
struct inode *head;
|
||||
struct ocfs2_super *osb;
|
||||
enum ocfs2_orphan_reco_type orphan_reco_type;
|
||||
};
|
||||
|
||||
static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
|
||||
@ -2036,12 +2037,22 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
|
||||
if (name_len == 2 && !strncmp("..", name, 2))
|
||||
return 0;
|
||||
|
||||
/* do not include dio entry in case of orphan scan */
|
||||
if ((p->orphan_reco_type == ORPHAN_NO_NEED_TRUNCATE) &&
|
||||
(!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX,
|
||||
OCFS2_DIO_ORPHAN_PREFIX_LEN)))
|
||||
return 0;
|
||||
|
||||
/* Skip bad inodes so that recovery can continue */
|
||||
iter = ocfs2_iget(p->osb, ino,
|
||||
OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0);
|
||||
if (IS_ERR(iter))
|
||||
return 0;
|
||||
|
||||
if (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX,
|
||||
OCFS2_DIO_ORPHAN_PREFIX_LEN))
|
||||
OCFS2_I(iter)->ip_flags |= OCFS2_INODE_DIO_ORPHAN_ENTRY;
|
||||
|
||||
/* Skip inodes which are already added to recover list, since dio may
|
||||
* happen concurrently with unlink/rename */
|
||||
if (OCFS2_I(iter)->ip_next_orphan) {
|
||||
@ -2060,14 +2071,16 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
|
||||
|
||||
static int ocfs2_queue_orphans(struct ocfs2_super *osb,
|
||||
int slot,
|
||||
struct inode **head)
|
||||
struct inode **head,
|
||||
enum ocfs2_orphan_reco_type orphan_reco_type)
|
||||
{
|
||||
int status;
|
||||
struct inode *orphan_dir_inode = NULL;
|
||||
struct ocfs2_orphan_filldir_priv priv = {
|
||||
.ctx.actor = ocfs2_orphan_filldir,
|
||||
.osb = osb,
|
||||
.head = *head
|
||||
.head = *head,
|
||||
.orphan_reco_type = orphan_reco_type
|
||||
};
|
||||
|
||||
orphan_dir_inode = ocfs2_get_system_file_inode(osb,
|
||||
@ -2170,7 +2183,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
|
||||
trace_ocfs2_recover_orphans(slot);
|
||||
|
||||
ocfs2_mark_recovering_orphan_dir(osb, slot);
|
||||
ret = ocfs2_queue_orphans(osb, slot, &inode);
|
||||
ret = ocfs2_queue_orphans(osb, slot, &inode, orphan_reco_type);
|
||||
ocfs2_clear_recovering_orphan_dir(osb, slot);
|
||||
|
||||
/* Error here should be noted, but we want to continue with as
|
||||
@ -2186,25 +2199,51 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
|
||||
iter = oi->ip_next_orphan;
|
||||
oi->ip_next_orphan = NULL;
|
||||
|
||||
mutex_lock(&inode->i_mutex);
|
||||
ret = ocfs2_rw_lock(inode, 1);
|
||||
if (ret < 0) {
|
||||
mlog_errno(ret);
|
||||
goto next;
|
||||
}
|
||||
/*
|
||||
* We need to take and drop the inode lock to
|
||||
* force read inode from disk.
|
||||
*/
|
||||
ret = ocfs2_inode_lock(inode, &di_bh, 1);
|
||||
if (ret) {
|
||||
mlog_errno(ret);
|
||||
goto unlock_rw;
|
||||
}
|
||||
if (oi->ip_flags & OCFS2_INODE_DIO_ORPHAN_ENTRY) {
|
||||
mutex_lock(&inode->i_mutex);
|
||||
ret = ocfs2_rw_lock(inode, 1);
|
||||
if (ret < 0) {
|
||||
mlog_errno(ret);
|
||||
goto unlock_mutex;
|
||||
}
|
||||
/*
|
||||
* We need to take and drop the inode lock to
|
||||
* force read inode from disk.
|
||||
*/
|
||||
ret = ocfs2_inode_lock(inode, &di_bh, 1);
|
||||
if (ret) {
|
||||
mlog_errno(ret);
|
||||
goto unlock_rw;
|
||||
}
|
||||
|
||||
di = (struct ocfs2_dinode *)di_bh->b_data;
|
||||
di = (struct ocfs2_dinode *)di_bh->b_data;
|
||||
|
||||
if (inode->i_nlink == 0) {
|
||||
if (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)) {
|
||||
ret = ocfs2_truncate_file(inode, di_bh,
|
||||
i_size_read(inode));
|
||||
if (ret < 0) {
|
||||
if (ret != -ENOSPC)
|
||||
mlog_errno(ret);
|
||||
goto unlock_inode;
|
||||
}
|
||||
|
||||
ret = ocfs2_del_inode_from_orphan(osb, inode,
|
||||
di_bh, 0, 0);
|
||||
if (ret)
|
||||
mlog_errno(ret);
|
||||
}
|
||||
unlock_inode:
|
||||
ocfs2_inode_unlock(inode, 1);
|
||||
brelse(di_bh);
|
||||
di_bh = NULL;
|
||||
unlock_rw:
|
||||
ocfs2_rw_unlock(inode, 1);
|
||||
unlock_mutex:
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
|
||||
/* clear dio flag in ocfs2_inode_info */
|
||||
oi->ip_flags &= ~OCFS2_INODE_DIO_ORPHAN_ENTRY;
|
||||
} else {
|
||||
spin_lock(&oi->ip_lock);
|
||||
/* Set the proper information to get us going into
|
||||
* ocfs2_delete_inode. */
|
||||
@ -2212,28 +2251,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
|
||||
spin_unlock(&oi->ip_lock);
|
||||
}
|
||||
|
||||
if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) &&
|
||||
(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
|
||||
ret = ocfs2_truncate_file(inode, di_bh,
|
||||
i_size_read(inode));
|
||||
if (ret < 0) {
|
||||
if (ret != -ENOSPC)
|
||||
mlog_errno(ret);
|
||||
goto unlock_inode;
|
||||
}
|
||||
|
||||
ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0);
|
||||
if (ret)
|
||||
mlog_errno(ret);
|
||||
} /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */
|
||||
unlock_inode:
|
||||
ocfs2_inode_unlock(inode, 1);
|
||||
brelse(di_bh);
|
||||
di_bh = NULL;
|
||||
unlock_rw:
|
||||
ocfs2_rw_unlock(inode, 1);
|
||||
next:
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
iput(inode);
|
||||
inode = iter;
|
||||
}
|
||||
|
@ -106,8 +106,6 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
|
||||
static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2);
|
||||
/* An orphan dir name is an 8 byte value, printed as a hex string */
|
||||
#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
|
||||
#define OCFS2_DIO_ORPHAN_PREFIX "dio-"
|
||||
#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4
|
||||
|
||||
static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
|
||||
unsigned int flags)
|
||||
@ -657,9 +655,18 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
|
||||
return status;
|
||||
}
|
||||
|
||||
return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
|
||||
status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
|
||||
parent_fe_bh, handle, inode_ac,
|
||||
fe_blkno, suballoc_loc, suballoc_bit);
|
||||
if (status < 0) {
|
||||
u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit);
|
||||
int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode,
|
||||
inode_ac->ac_bh, suballoc_bit, bg_blkno, 1);
|
||||
if (tmp)
|
||||
mlog_errno(tmp);
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
static int ocfs2_mkdir(struct inode *dir,
|
||||
|
@ -26,6 +26,9 @@
|
||||
#ifndef OCFS2_NAMEI_H
|
||||
#define OCFS2_NAMEI_H
|
||||
|
||||
#define OCFS2_DIO_ORPHAN_PREFIX "dio-"
|
||||
#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4
|
||||
|
||||
extern const struct inode_operations ocfs2_dir_iops;
|
||||
|
||||
struct dentry *ocfs2_get_parent(struct dentry *child);
|
||||
|
@ -2920,16 +2920,13 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
|
||||
u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
|
||||
struct page *page;
|
||||
pgoff_t page_index;
|
||||
unsigned int from, to, readahead_pages;
|
||||
unsigned int from, to;
|
||||
loff_t offset, end, map_end;
|
||||
struct address_space *mapping = inode->i_mapping;
|
||||
|
||||
trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
|
||||
new_cluster, new_len);
|
||||
|
||||
readahead_pages =
|
||||
(ocfs2_cow_contig_clusters(sb) <<
|
||||
OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT;
|
||||
offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
|
||||
end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
|
||||
/*
|
||||
|
@ -1920,7 +1920,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
|
||||
status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
|
||||
res, &bits_left);
|
||||
if (!status) {
|
||||
hint = ocfs2_group_from_res(res);
|
||||
if (ocfs2_is_cluster_bitmap(ac->ac_inode))
|
||||
hint = res->sr_bg_blkno;
|
||||
else
|
||||
hint = ocfs2_group_from_res(res);
|
||||
goto set_hint;
|
||||
}
|
||||
if (status < 0 && status != -ENOSPC) {
|
||||
|
@ -1032,6 +1032,16 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
|
||||
return simple_read_from_buffer(buf, count, ppos, buffer, len);
|
||||
}
|
||||
|
||||
/*
|
||||
* /proc/pid/oom_adj exists solely for backwards compatibility with previous
|
||||
* kernels. The effective policy is defined by oom_score_adj, which has a
|
||||
* different scale: oom_adj grew exponentially and oom_score_adj grows linearly.
|
||||
* Values written to oom_adj are simply mapped linearly to oom_score_adj.
|
||||
* Processes that become oom disabled via oom_adj will still be oom disabled
|
||||
* with this implementation.
|
||||
*
|
||||
* oom_adj cannot be removed since existing userspace binaries use it.
|
||||
*/
|
||||
static ssize_t oom_adj_write(struct file *file, const char __user *buf,
|
||||
size_t count, loff_t *ppos)
|
||||
{
|
||||
|
@ -70,6 +70,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
|
||||
ptes >> 10,
|
||||
pmds >> 10,
|
||||
swap << (PAGE_SHIFT-10));
|
||||
hugetlb_report_usage(m, mm);
|
||||
}
|
||||
|
||||
unsigned long task_vsize(struct mm_struct *mm)
|
||||
@ -446,6 +447,8 @@ struct mem_size_stats {
|
||||
unsigned long anonymous;
|
||||
unsigned long anonymous_thp;
|
||||
unsigned long swap;
|
||||
unsigned long shared_hugetlb;
|
||||
unsigned long private_hugetlb;
|
||||
u64 pss;
|
||||
u64 swap_pss;
|
||||
};
|
||||
@ -625,12 +628,44 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
|
||||
seq_putc(m, '\n');
|
||||
}
|
||||
|
||||
#ifdef CONFIG_HUGETLB_PAGE
|
||||
static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
|
||||
unsigned long addr, unsigned long end,
|
||||
struct mm_walk *walk)
|
||||
{
|
||||
struct mem_size_stats *mss = walk->private;
|
||||
struct vm_area_struct *vma = walk->vma;
|
||||
struct page *page = NULL;
|
||||
|
||||
if (pte_present(*pte)) {
|
||||
page = vm_normal_page(vma, addr, *pte);
|
||||
} else if (is_swap_pte(*pte)) {
|
||||
swp_entry_t swpent = pte_to_swp_entry(*pte);
|
||||
|
||||
if (is_migration_entry(swpent))
|
||||
page = migration_entry_to_page(swpent);
|
||||
}
|
||||
if (page) {
|
||||
int mapcount = page_mapcount(page);
|
||||
|
||||
if (mapcount >= 2)
|
||||
mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
|
||||
else
|
||||
mss->private_hugetlb += huge_page_size(hstate_vma(vma));
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
#endif /* CONFIG_HUGETLB_PAGE */
|
||||
|
||||
static int show_smap(struct seq_file *m, void *v, int is_pid)
|
||||
{
|
||||
struct vm_area_struct *vma = v;
|
||||
struct mem_size_stats mss;
|
||||
struct mm_walk smaps_walk = {
|
||||
.pmd_entry = smaps_pte_range,
|
||||
#ifdef CONFIG_HUGETLB_PAGE
|
||||
.hugetlb_entry = smaps_hugetlb_range,
|
||||
#endif
|
||||
.mm = vma->vm_mm,
|
||||
.private = &mss,
|
||||
};
|
||||
@ -652,6 +687,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
|
||||
"Referenced: %8lu kB\n"
|
||||
"Anonymous: %8lu kB\n"
|
||||
"AnonHugePages: %8lu kB\n"
|
||||
"Shared_Hugetlb: %8lu kB\n"
|
||||
"Private_Hugetlb: %7lu kB\n"
|
||||
"Swap: %8lu kB\n"
|
||||
"SwapPss: %8lu kB\n"
|
||||
"KernelPageSize: %8lu kB\n"
|
||||
@ -667,6 +704,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
|
||||
mss.referenced >> 10,
|
||||
mss.anonymous >> 10,
|
||||
mss.anonymous_thp >> 10,
|
||||
mss.shared_hugetlb >> 10,
|
||||
mss.private_hugetlb >> 10,
|
||||
mss.swap >> 10,
|
||||
(unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
|
||||
vma_kernel_pagesize(vma) >> 10,
|
||||
@ -753,19 +792,27 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
|
||||
pte_t ptent = *pte;
|
||||
|
||||
if (pte_present(ptent)) {
|
||||
ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte);
|
||||
ptent = pte_wrprotect(ptent);
|
||||
ptent = pte_clear_soft_dirty(ptent);
|
||||
ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent);
|
||||
} else if (is_swap_pte(ptent)) {
|
||||
ptent = pte_swp_clear_soft_dirty(ptent);
|
||||
set_pte_at(vma->vm_mm, addr, pte, ptent);
|
||||
}
|
||||
|
||||
set_pte_at(vma->vm_mm, addr, pte, ptent);
|
||||
}
|
||||
#else
|
||||
static inline void clear_soft_dirty(struct vm_area_struct *vma,
|
||||
unsigned long addr, pte_t *pte)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
|
||||
static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
|
||||
unsigned long addr, pmd_t *pmdp)
|
||||
{
|
||||
pmd_t pmd = *pmdp;
|
||||
pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
|
||||
|
||||
pmd = pmd_wrprotect(pmd);
|
||||
pmd = pmd_clear_soft_dirty(pmd);
|
||||
@ -775,14 +822,7 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
|
||||
|
||||
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static inline void clear_soft_dirty(struct vm_area_struct *vma,
|
||||
unsigned long addr, pte_t *pte)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
|
||||
unsigned long addr, pmd_t *pmdp)
|
||||
{
|
||||
|
@ -86,7 +86,12 @@ static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
|
||||
|
||||
static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
|
||||
{
|
||||
filemap_fdatawait(bdev->bd_inode->i_mapping);
|
||||
/*
|
||||
* We keep the error status of individual mapping so that
|
||||
* applications can catch the writeback error using fsync(2).
|
||||
* See filemap_fdatawait_keep_errors() for details.
|
||||
*/
|
||||
filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -15,7 +15,8 @@
|
||||
/* For more detailed tracepoint output */
|
||||
#define COMPACT_NO_SUITABLE_PAGE 5
|
||||
#define COMPACT_NOT_SUITABLE_ZONE 6
|
||||
/* When adding new state, please change compaction_status_string, too */
|
||||
#define COMPACT_CONTENDED 7
|
||||
/* When adding new states, please adjust include/trace/events/compaction.h */
|
||||
|
||||
/* Used to signal whether compaction detected need_sched() or lock contention */
|
||||
/* No contention detected */
|
||||
|
@ -210,6 +210,23 @@
|
||||
#define __visible __attribute__((externally_visible))
|
||||
#endif
|
||||
|
||||
|
||||
#if GCC_VERSION >= 40900 && !defined(__CHECKER__)
|
||||
/*
|
||||
* __assume_aligned(n, k): Tell the optimizer that the returned
|
||||
* pointer can be assumed to be k modulo n. The second argument is
|
||||
* optional (default 0), so we use a variadic macro to make the
|
||||
* shorthand.
|
||||
*
|
||||
* Beware: Do not apply this to functions which may return
|
||||
* ERR_PTRs. Also, it is probably unwise to apply it to functions
|
||||
* returning extra information in the low bits (but in that case the
|
||||
* compiler should see some alignment anyway, when the return value is
|
||||
* massaged by 'flags = ptr & 3; ptr &= ~3;').
|
||||
*/
|
||||
#define __assume_aligned(a, ...) __attribute__((__assume_aligned__(a, ## __VA_ARGS__)))
|
||||
#endif
|
||||
|
||||
/*
|
||||
* GCC 'asm goto' miscompiles certain code sequences:
|
||||
*
|
||||
|
@ -417,6 +417,14 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
|
||||
#define __visible
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Assume alignment of return value.
|
||||
*/
|
||||
#ifndef __assume_aligned
|
||||
#define __assume_aligned(a, ...)
|
||||
#endif
|
||||
|
||||
|
||||
/* Are two types/vars the same type (ignoring qualifiers)? */
|
||||
#ifndef __same_type
|
||||
# define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
|
||||
|
@ -93,7 +93,7 @@ extern int current_cpuset_is_being_rebound(void);
|
||||
|
||||
extern void rebuild_sched_domains(void);
|
||||
|
||||
extern void cpuset_print_task_mems_allowed(struct task_struct *p);
|
||||
extern void cpuset_print_current_mems_allowed(void);
|
||||
|
||||
/*
|
||||
* read_mems_allowed_begin is required when making decisions involving
|
||||
@ -219,7 +219,7 @@ static inline void rebuild_sched_domains(void)
|
||||
partition_sched_domains(1, NULL, NULL);
|
||||
}
|
||||
|
||||
static inline void cpuset_print_task_mems_allowed(struct task_struct *p)
|
||||
static inline void cpuset_print_current_mems_allowed(void)
|
||||
{
|
||||
}
|
||||
|
||||
|
@ -2409,6 +2409,7 @@ extern int write_inode_now(struct inode *, int);
|
||||
extern int filemap_fdatawrite(struct address_space *);
|
||||
extern int filemap_flush(struct address_space *);
|
||||
extern int filemap_fdatawait(struct address_space *);
|
||||
extern void filemap_fdatawait_keep_errors(struct address_space *);
|
||||
extern int filemap_fdatawait_range(struct address_space *, loff_t lstart,
|
||||
loff_t lend);
|
||||
extern int filemap_write_and_wait(struct address_space *mapping);
|
||||
|
@ -483,6 +483,17 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
|
||||
#define hugepages_supported() (HPAGE_SHIFT != 0)
|
||||
#endif
|
||||
|
||||
void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm);
|
||||
|
||||
static inline void hugetlb_count_add(long l, struct mm_struct *mm)
|
||||
{
|
||||
atomic_long_add(l, &mm->hugetlb_usage);
|
||||
}
|
||||
|
||||
static inline void hugetlb_count_sub(long l, struct mm_struct *mm)
|
||||
{
|
||||
atomic_long_sub(l, &mm->hugetlb_usage);
|
||||
}
|
||||
#else /* CONFIG_HUGETLB_PAGE */
|
||||
struct hstate {};
|
||||
#define alloc_huge_page(v, a, r) NULL
|
||||
@ -519,6 +530,14 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
|
||||
{
|
||||
return &mm->page_table_lock;
|
||||
}
|
||||
|
||||
static inline void hugetlb_report_usage(struct seq_file *f, struct mm_struct *m)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void hugetlb_count_sub(long l, struct mm_struct *mm)
|
||||
{
|
||||
}
|
||||
#endif /* CONFIG_HUGETLB_PAGE */
|
||||
|
||||
static inline spinlock_t *huge_pte_lock(struct hstate *h,
|
||||
|
@ -89,10 +89,6 @@ int memblock_add_range(struct memblock_type *type,
|
||||
phys_addr_t base, phys_addr_t size,
|
||||
int nid, unsigned long flags);
|
||||
|
||||
int memblock_remove_range(struct memblock_type *type,
|
||||
phys_addr_t base,
|
||||
phys_addr_t size);
|
||||
|
||||
void __next_mem_range(u64 *idx, int nid, ulong flags,
|
||||
struct memblock_type *type_a,
|
||||
struct memblock_type *type_b, phys_addr_t *out_start,
|
||||
|
@ -301,8 +301,7 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg);
|
||||
void mem_cgroup_uncharge(struct page *page);
|
||||
void mem_cgroup_uncharge_list(struct list_head *page_list);
|
||||
|
||||
void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
|
||||
bool lrucare);
|
||||
void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage);
|
||||
|
||||
struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
|
||||
struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
|
||||
@ -384,7 +383,7 @@ unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
|
||||
return mz->lru_size[lru];
|
||||
}
|
||||
|
||||
static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
|
||||
static inline bool mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
|
||||
{
|
||||
unsigned long inactive_ratio;
|
||||
unsigned long inactive;
|
||||
@ -403,24 +402,26 @@ static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
|
||||
return inactive * inactive_ratio < active;
|
||||
}
|
||||
|
||||
void mem_cgroup_handle_over_high(void);
|
||||
|
||||
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
|
||||
struct task_struct *p);
|
||||
|
||||
static inline void mem_cgroup_oom_enable(void)
|
||||
{
|
||||
WARN_ON(current->memcg_oom.may_oom);
|
||||
current->memcg_oom.may_oom = 1;
|
||||
WARN_ON(current->memcg_may_oom);
|
||||
current->memcg_may_oom = 1;
|
||||
}
|
||||
|
||||
static inline void mem_cgroup_oom_disable(void)
|
||||
{
|
||||
WARN_ON(!current->memcg_oom.may_oom);
|
||||
current->memcg_oom.may_oom = 0;
|
||||
WARN_ON(!current->memcg_may_oom);
|
||||
current->memcg_may_oom = 0;
|
||||
}
|
||||
|
||||
static inline bool task_in_memcg_oom(struct task_struct *p)
|
||||
{
|
||||
return p->memcg_oom.memcg;
|
||||
return p->memcg_in_oom;
|
||||
}
|
||||
|
||||
bool mem_cgroup_oom_synchronize(bool wait);
|
||||
@ -537,9 +538,7 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void mem_cgroup_migrate(struct page *oldpage,
|
||||
struct page *newpage,
|
||||
bool lrucare)
|
||||
static inline void mem_cgroup_replace_page(struct page *old, struct page *new)
|
||||
{
|
||||
}
|
||||
|
||||
@ -585,10 +584,10 @@ static inline bool mem_cgroup_disabled(void)
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline int
|
||||
static inline bool
|
||||
mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
|
||||
{
|
||||
return 1;
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
|
||||
@ -622,6 +621,10 @@ static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void mem_cgroup_handle_over_high(void)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void mem_cgroup_oom_enable(void)
|
||||
{
|
||||
}
|
||||
@ -748,11 +751,10 @@ static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg)
|
||||
* conditions, but because they are pretty simple, they are expected to be
|
||||
* fast.
|
||||
*/
|
||||
bool __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg,
|
||||
int order);
|
||||
void __memcg_kmem_commit_charge(struct page *page,
|
||||
struct mem_cgroup *memcg, int order);
|
||||
void __memcg_kmem_uncharge_pages(struct page *page, int order);
|
||||
int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
|
||||
struct mem_cgroup *memcg);
|
||||
int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order);
|
||||
void __memcg_kmem_uncharge(struct page *page, int order);
|
||||
|
||||
/*
|
||||
* helper for accessing a memcg's index. It will be used as an index in the
|
||||
@ -767,77 +769,42 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
|
||||
struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep);
|
||||
void __memcg_kmem_put_cache(struct kmem_cache *cachep);
|
||||
|
||||
struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr);
|
||||
|
||||
int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
|
||||
unsigned long nr_pages);
|
||||
void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages);
|
||||
|
||||
/**
|
||||
* memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed.
|
||||
* @gfp: the gfp allocation flags.
|
||||
* @memcg: a pointer to the memcg this was charged against.
|
||||
* @order: allocation order.
|
||||
*
|
||||
* returns true if the memcg where the current task belongs can hold this
|
||||
* allocation.
|
||||
*
|
||||
* We return true automatically if this allocation is not to be accounted to
|
||||
* any memcg.
|
||||
*/
|
||||
static inline bool
|
||||
memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
|
||||
static inline bool __memcg_kmem_bypass(gfp_t gfp)
|
||||
{
|
||||
if (!memcg_kmem_enabled())
|
||||
return true;
|
||||
|
||||
if (gfp & __GFP_NOACCOUNT)
|
||||
return true;
|
||||
/*
|
||||
* __GFP_NOFAIL allocations will move on even if charging is not
|
||||
* possible. Therefore we don't even try, and have this allocation
|
||||
* unaccounted. We could in theory charge it forcibly, but we hope
|
||||
* those allocations are rare, and won't be worth the trouble.
|
||||
*/
|
||||
if (gfp & __GFP_NOFAIL)
|
||||
return true;
|
||||
if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
|
||||
return true;
|
||||
|
||||
/* If the task is dying, just let it go. */
|
||||
if (unlikely(fatal_signal_pending(current)))
|
||||
return true;
|
||||
|
||||
return __memcg_kmem_newpage_charge(gfp, memcg, order);
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* memcg_kmem_uncharge_pages: uncharge pages from memcg
|
||||
* @page: pointer to struct page being freed
|
||||
* @order: allocation order.
|
||||
* memcg_kmem_charge: charge a kmem page
|
||||
* @page: page to charge
|
||||
* @gfp: reclaim mode
|
||||
* @order: allocation order
|
||||
*
|
||||
* Returns 0 on success, an error code on failure.
|
||||
*/
|
||||
static inline void
|
||||
memcg_kmem_uncharge_pages(struct page *page, int order)
|
||||
static __always_inline int memcg_kmem_charge(struct page *page,
|
||||
gfp_t gfp, int order)
|
||||
{
|
||||
if (__memcg_kmem_bypass(gfp))
|
||||
return 0;
|
||||
return __memcg_kmem_charge(page, gfp, order);
|
||||
}
|
||||
|
||||
/**
|
||||
* memcg_kmem_uncharge: uncharge a kmem page
|
||||
* @page: page to uncharge
|
||||
* @order: allocation order
|
||||
*/
|
||||
static __always_inline void memcg_kmem_uncharge(struct page *page, int order)
|
||||
{
|
||||
if (memcg_kmem_enabled())
|
||||
__memcg_kmem_uncharge_pages(page, order);
|
||||
}
|
||||
|
||||
/**
|
||||
* memcg_kmem_commit_charge: embeds correct memcg in a page
|
||||
* @page: pointer to struct page recently allocated
|
||||
* @memcg: the memcg structure we charged against
|
||||
* @order: allocation order.
|
||||
*
|
||||
* Needs to be called after memcg_kmem_newpage_charge, regardless of success or
|
||||
* failure of the allocation. if @page is NULL, this function will revert the
|
||||
* charges. Otherwise, it will commit @page to @memcg.
|
||||
*/
|
||||
static inline void
|
||||
memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
|
||||
{
|
||||
if (memcg_kmem_enabled() && memcg)
|
||||
__memcg_kmem_commit_charge(page, memcg, order);
|
||||
__memcg_kmem_uncharge(page, order);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -850,17 +817,8 @@ memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
|
||||
static __always_inline struct kmem_cache *
|
||||
memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
|
||||
{
|
||||
if (!memcg_kmem_enabled())
|
||||
if (__memcg_kmem_bypass(gfp))
|
||||
return cachep;
|
||||
if (gfp & __GFP_NOACCOUNT)
|
||||
return cachep;
|
||||
if (gfp & __GFP_NOFAIL)
|
||||
return cachep;
|
||||
if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
|
||||
return cachep;
|
||||
if (unlikely(fatal_signal_pending(current)))
|
||||
return cachep;
|
||||
|
||||
return __memcg_kmem_get_cache(cachep);
|
||||
}
|
||||
|
||||
@ -869,13 +827,6 @@ static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
|
||||
if (memcg_kmem_enabled())
|
||||
__memcg_kmem_put_cache(cachep);
|
||||
}
|
||||
|
||||
static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
|
||||
{
|
||||
if (!memcg_kmem_enabled())
|
||||
return NULL;
|
||||
return __mem_cgroup_from_kmem(ptr);
|
||||
}
|
||||
#else
|
||||
#define for_each_memcg_cache_index(_idx) \
|
||||
for (; NULL; )
|
||||
@ -890,18 +841,12 @@ static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg)
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline bool
|
||||
memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
|
||||
static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
|
||||
{
|
||||
return true;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void memcg_kmem_uncharge_pages(struct page *page, int order)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void
|
||||
memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
|
||||
static inline void memcg_kmem_uncharge(struct page *page, int order)
|
||||
{
|
||||
}
|
||||
|
||||
@ -927,11 +872,5 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
|
||||
static inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
|
||||
{
|
||||
}
|
||||
|
||||
static inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
#endif /* CONFIG_MEMCG_KMEM */
|
||||
#endif /* _LINUX_MEMCONTROL_H */
|
||||
|
||||
|
@ -139,6 +139,7 @@ extern unsigned int kobjsize(const void *objp);
|
||||
|
||||
#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */
|
||||
#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */
|
||||
#define VM_LOCKONFAULT 0x00080000 /* Lock the pages covered when they are faulted in */
|
||||
#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */
|
||||
#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */
|
||||
#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
|
||||
@ -202,6 +203,9 @@ extern unsigned int kobjsize(const void *objp);
|
||||
/* This mask defines which mm->def_flags a process can inherit from its parent */
|
||||
#define VM_INIT_DEF_MASK VM_NOHUGEPAGE
|
||||
|
||||
/* This mask is used to clear all the VMA flags used by mlock */
|
||||
#define VM_LOCKED_CLEAR_MASK (~(VM_LOCKED | VM_LOCKONFAULT))
|
||||
|
||||
/*
|
||||
* mapping from the currently active vm_flags protection bits (the
|
||||
* low four bits) to a page protection mask..
|
||||
@ -1606,8 +1610,10 @@ static inline void pgtable_init(void)
|
||||
|
||||
static inline bool pgtable_page_ctor(struct page *page)
|
||||
{
|
||||
if (!ptlock_init(page))
|
||||
return false;
|
||||
inc_zone_page_state(page, NR_PAGETABLE);
|
||||
return ptlock_init(page);
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline void pgtable_page_dtor(struct page *page)
|
||||
@ -2036,8 +2042,6 @@ void page_cache_async_readahead(struct address_space *mapping,
|
||||
pgoff_t offset,
|
||||
unsigned long size);
|
||||
|
||||
unsigned long max_sane_readahead(unsigned long nr);
|
||||
|
||||
/* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
|
||||
extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
|
||||
|
||||
@ -2137,6 +2141,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma,
|
||||
#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */
|
||||
#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
|
||||
#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */
|
||||
#define FOLL_MLOCK 0x1000 /* lock present pages */
|
||||
|
||||
typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
|
||||
void *data);
|
||||
|
@ -486,6 +486,9 @@ struct mm_struct {
|
||||
/* address of the bounds directory */
|
||||
void __user *bd_addr;
|
||||
#endif
|
||||
#ifdef CONFIG_HUGETLB_PAGE
|
||||
atomic_long_t hugetlb_usage;
|
||||
#endif
|
||||
};
|
||||
|
||||
static inline void mm_init_cpumask(struct mm_struct *mm)
|
||||
|
@ -823,8 +823,7 @@ enum memmap_context {
|
||||
MEMMAP_HOTPLUG,
|
||||
};
|
||||
extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
|
||||
unsigned long size,
|
||||
enum memmap_context context);
|
||||
unsigned long size);
|
||||
|
||||
extern void lruvec_init(struct lruvec *lruvec);
|
||||
|
||||
|
@ -73,6 +73,7 @@ extern int watchdog_user_enabled;
|
||||
extern int watchdog_thresh;
|
||||
extern unsigned long *watchdog_cpumask_bits;
|
||||
extern int sysctl_softlockup_all_cpu_backtrace;
|
||||
extern int sysctl_hardlockup_all_cpu_backtrace;
|
||||
struct ctl_table;
|
||||
extern int proc_watchdog(struct ctl_table *, int ,
|
||||
void __user *, size_t *, loff_t *);
|
||||
|
@ -256,7 +256,7 @@ PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim)
|
||||
* Must use a macro here due to header dependency issues. page_zone() is not
|
||||
* available at this point.
|
||||
*/
|
||||
#define PageHighMem(__p) is_highmem(page_zone(__p))
|
||||
#define PageHighMem(__p) is_highmem_idx(page_zonenum(__p))
|
||||
#else
|
||||
PAGEFLAG_FALSE(HighMem)
|
||||
#endif
|
||||
|
@ -36,9 +36,9 @@ static inline unsigned long page_counter_read(struct page_counter *counter)
|
||||
|
||||
void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages);
|
||||
void page_counter_charge(struct page_counter *counter, unsigned long nr_pages);
|
||||
int page_counter_try_charge(struct page_counter *counter,
|
||||
unsigned long nr_pages,
|
||||
struct page_counter **fail);
|
||||
bool page_counter_try_charge(struct page_counter *counter,
|
||||
unsigned long nr_pages,
|
||||
struct page_counter **fail);
|
||||
void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages);
|
||||
int page_counter_limit(struct page_counter *counter, unsigned long limit);
|
||||
int page_counter_memparse(const char *buf, const char *max,
|
||||
|
@ -384,6 +384,7 @@ extern int proc_dowatchdog_thresh(struct ctl_table *table, int write,
|
||||
void __user *buffer,
|
||||
size_t *lenp, loff_t *ppos);
|
||||
extern unsigned int softlockup_panic;
|
||||
extern unsigned int hardlockup_panic;
|
||||
void lockup_detector_init(void);
|
||||
#else
|
||||
static inline void touch_softlockup_watchdog(void)
|
||||
@ -1460,7 +1461,9 @@ struct task_struct {
|
||||
unsigned sched_reset_on_fork:1;
|
||||
unsigned sched_contributes_to_load:1;
|
||||
unsigned sched_migrated:1;
|
||||
|
||||
#ifdef CONFIG_MEMCG
|
||||
unsigned memcg_may_oom:1;
|
||||
#endif
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
unsigned memcg_kmem_skip_account:1;
|
||||
#endif
|
||||
@ -1791,12 +1794,12 @@ struct task_struct {
|
||||
unsigned long trace_recursion;
|
||||
#endif /* CONFIG_TRACING */
|
||||
#ifdef CONFIG_MEMCG
|
||||
struct memcg_oom_info {
|
||||
struct mem_cgroup *memcg;
|
||||
gfp_t gfp_mask;
|
||||
int order;
|
||||
unsigned int may_oom:1;
|
||||
} memcg_oom;
|
||||
struct mem_cgroup *memcg_in_oom;
|
||||
gfp_t memcg_oom_gfp_mask;
|
||||
int memcg_oom_order;
|
||||
|
||||
/* number of pages to reclaim on returning to userland */
|
||||
unsigned int memcg_nr_pages_over_high;
|
||||
#endif
|
||||
#ifdef CONFIG_UPROBES
|
||||
struct uprobe_task *utask;
|
||||
|
@ -111,7 +111,7 @@ struct mem_cgroup;
|
||||
* struct kmem_cache related prototypes
|
||||
*/
|
||||
void __init kmem_cache_init(void);
|
||||
int slab_is_available(void);
|
||||
bool slab_is_available(void);
|
||||
|
||||
struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
|
||||
unsigned long,
|
||||
|
@ -887,4 +887,6 @@ asmlinkage long sys_execveat(int dfd, const char __user *filename,
|
||||
|
||||
asmlinkage long sys_membarrier(int cmd, int flags);
|
||||
|
||||
asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags);
|
||||
|
||||
#endif
|
||||
|
@ -50,6 +50,7 @@
|
||||
#include <linux/ptrace.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/task_work.h>
|
||||
#include <linux/memcontrol.h>
|
||||
struct linux_binprm;
|
||||
|
||||
/*
|
||||
@ -188,6 +189,8 @@ static inline void tracehook_notify_resume(struct pt_regs *regs)
|
||||
smp_mb__after_atomic();
|
||||
if (unlikely(current->task_works))
|
||||
task_work_run();
|
||||
|
||||
mem_cgroup_handle_over_high();
|
||||
}
|
||||
|
||||
#endif /* <linux/tracehook.h> */
|
||||
|
@ -205,11 +205,25 @@ struct ustat {
|
||||
* struct callback_head - callback structure for use with RCU and task_work
|
||||
* @next: next update requests in a list
|
||||
* @func: actual update function to call after the grace period.
|
||||
*
|
||||
* The struct is aligned to size of pointer. On most architectures it happens
|
||||
* naturally due to ABI requirements, but some architectures (like CRIS) have
|
||||
* a weird ABI and we need to ask for it explicitly.
|
||||
*
|
||||
* The alignment is required to guarantee that bits 0 and 1 of @next will be
|
||||
* clear under normal conditions -- as long as we use call_rcu(),
|
||||
* call_rcu_bh(), call_rcu_sched(), or call_srcu() to queue callback.
|
||||
*
|
||||
* This guarantee is important for a few reasons:
|
||||
* - future call_rcu_lazy() will make use of lower bits in the pointer;
|
||||
* - the structure shares storage space in struct page with @compound_head,
|
||||
* which encodes PageTail() in bit 0. The guarantee is needed to avoid
|
||||
* false-positive PageTail().
|
||||
*/
|
||||
struct callback_head {
|
||||
struct callback_head *next;
|
||||
void (*func)(struct callback_head *head);
|
||||
};
|
||||
} __attribute__((aligned(sizeof(void *))));
|
||||
#define rcu_head callback_head
|
||||
|
||||
typedef void (*rcu_callback_t)(struct rcu_head *head);
|
||||
|
@ -75,36 +75,6 @@ static inline unsigned long __copy_from_user_nocache(void *to,
|
||||
|
||||
#endif /* ARCH_HAS_NOCACHE_UACCESS */
|
||||
|
||||
/**
|
||||
* probe_kernel_address(): safely attempt to read from a location
|
||||
* @addr: address to read from - its type is typeof(retval)*
|
||||
* @retval: read into this variable
|
||||
*
|
||||
* Safely read from address @addr into variable @retval. If a kernel fault
|
||||
* happens, handle that and return -EFAULT.
|
||||
* We ensure that the __get_user() is executed in atomic context so that
|
||||
* do_page_fault() doesn't attempt to take mmap_sem. This makes
|
||||
* probe_kernel_address() suitable for use within regions where the caller
|
||||
* already holds mmap_sem, or other locks which nest inside mmap_sem.
|
||||
* This must be a macro because __get_user() needs to know the types of the
|
||||
* args.
|
||||
*
|
||||
* We don't include enough header files to be able to do the set_fs(). We
|
||||
* require that the probe_kernel_address() caller will do that.
|
||||
*/
|
||||
#define probe_kernel_address(addr, retval) \
|
||||
({ \
|
||||
long ret; \
|
||||
mm_segment_t old_fs = get_fs(); \
|
||||
\
|
||||
set_fs(KERNEL_DS); \
|
||||
pagefault_disable(); \
|
||||
ret = __copy_from_user_inatomic(&(retval), (__force typeof(retval) __user *)(addr), sizeof(retval)); \
|
||||
pagefault_enable(); \
|
||||
set_fs(old_fs); \
|
||||
ret; \
|
||||
})
|
||||
|
||||
/*
|
||||
* probe_kernel_read(): safely attempt to read from a location
|
||||
* @dst: pointer to the buffer that shall take the data
|
||||
@ -131,4 +101,14 @@ extern long notrace __probe_kernel_write(void *dst, const void *src, size_t size
|
||||
|
||||
extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count);
|
||||
|
||||
/**
|
||||
* probe_kernel_address(): safely attempt to read from a location
|
||||
* @addr: address to read from
|
||||
* @retval: read into this variable
|
||||
*
|
||||
* Expands to probe_kernel_read(&retval, addr, sizeof(retval)), so @retval
* must be an lvalue and sizeof(@retval) is the number of bytes requested.
*
* Returns 0 on success, or -EFAULT.
|
||||
*/
|
||||
#define probe_kernel_address(addr, retval) \
|
||||
probe_kernel_read(&retval, addr, sizeof(retval))
|
||||
|
||||
#endif /* __LINUX_UACCESS_H__ */
|
||||
|
@ -14,12 +14,12 @@
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
#define HIGHMEM_ZONE(xx) , xx##_HIGH
|
||||
#define HIGHMEM_ZONE(xx) xx##_HIGH,
|
||||
#else
|
||||
#define HIGHMEM_ZONE(xx)
|
||||
#endif
|
||||
|
||||
#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL HIGHMEM_ZONE(xx) , xx##_MOVABLE
|
||||
#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, HIGHMEM_ZONE(xx) xx##_MOVABLE
|
||||
|
||||
enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
|
||||
FOR_ALL_ZONES(PGALLOC),
|
||||
|
@ -161,30 +161,8 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NUMA
|
||||
/*
|
||||
* Determine the per node value of a stat item. This function
|
||||
* is called frequently in a NUMA machine, so try to be as
|
||||
* frugal as possible.
|
||||
*/
|
||||
static inline unsigned long node_page_state(int node,
|
||||
enum zone_stat_item item)
|
||||
{
|
||||
struct zone *zones = NODE_DATA(node)->node_zones;
|
||||
|
||||
return
|
||||
#ifdef CONFIG_ZONE_DMA
|
||||
zone_page_state(&zones[ZONE_DMA], item) +
|
||||
#endif
|
||||
#ifdef CONFIG_ZONE_DMA32
|
||||
zone_page_state(&zones[ZONE_DMA32], item) +
|
||||
#endif
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
zone_page_state(&zones[ZONE_HIGHMEM], item) +
|
||||
#endif
|
||||
zone_page_state(&zones[ZONE_NORMAL], item) +
|
||||
zone_page_state(&zones[ZONE_MOVABLE], item);
|
||||
}
|
||||
|
||||
extern unsigned long node_page_state(int node, enum zone_stat_item item);
|
||||
extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp);
|
||||
|
||||
#else
|
||||
@ -269,7 +247,6 @@ static inline void __dec_zone_page_state(struct page *page,
|
||||
|
||||
#define set_pgdat_percpu_threshold(pgdat, callback) { }
|
||||
|
||||
static inline void refresh_cpu_vm_stats(int cpu) { }
|
||||
static inline void refresh_zone_stat_thresholds(void) { }
|
||||
static inline void cpu_vm_stats_fold(int cpu) { }
|
||||
|
||||
|
@ -9,6 +9,62 @@
|
||||
#include <linux/tracepoint.h>
|
||||
#include <trace/events/gfpflags.h>
|
||||
|
||||
/*
* List of compaction status values, each paired with the string printed for
* it. Every entry is wrapped in EM() (EMe() for the final entry); this file
* defines those two macros twice further down, first as TRACE_DEFINE_ENUM(a)
* and then as a {value, string} initializer consumed by __print_symbolic().
*/
#define COMPACTION_STATUS \
|
||||
EM( COMPACT_DEFERRED, "deferred") \
|
||||
EM( COMPACT_SKIPPED, "skipped") \
|
||||
EM( COMPACT_CONTINUE, "continue") \
|
||||
EM( COMPACT_PARTIAL, "partial") \
|
||||
EM( COMPACT_COMPLETE, "complete") \
|
||||
EM( COMPACT_NO_SUITABLE_PAGE, "no_suitable_page") \
|
||||
EM( COMPACT_NOT_SUITABLE_ZONE, "not_suitable_zone") \
|
||||
EMe(COMPACT_CONTENDED, "contended")
|
||||
|
||||
/*
* IFDEF_ZONE_DMA(X), IFDEF_ZONE_DMA32(X) and IFDEF_ZONE_HIGHMEM(X) expand to X
* when CONFIG_ZONE_DMA, CONFIG_ZONE_DMA32 or CONFIG_HIGHMEM respectively is
* defined, and to nothing otherwise. ZONE_TYPE uses them so that only the
* zones enabled in the configuration appear in its list.
*/
#ifdef CONFIG_ZONE_DMA
|
||||
#define IFDEF_ZONE_DMA(X) X
|
||||
#else
|
||||
#define IFDEF_ZONE_DMA(X)
|
||||
#endif
|
||||
|
|
||||
#ifdef CONFIG_ZONE_DMA32
|
||||
#define IFDEF_ZONE_DMA32(X) X
|
||||
#else
|
||||
#define IFDEF_ZONE_DMA32(X)
|
||||
#endif
|
||||
|
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
#define IFDEF_ZONE_HIGHMEM(X) X
|
||||
#else
|
||||
#define IFDEF_ZONE_HIGHMEM(X)
|
||||
#endif
|
||||
|
||||
/*
* List of zone indices paired with the zone name printed for each. The DMA,
* DMA32 and HighMem entries are wrapped in the IFDEF_ZONE_*() helpers, so they
* are present only when the matching config option is defined. Entries use
* EM(), with EMe() for the final one, like COMPACTION_STATUS.
*/
#define ZONE_TYPE \
|
||||
IFDEF_ZONE_DMA( EM (ZONE_DMA, "DMA")) \
|
||||
IFDEF_ZONE_DMA32( EM (ZONE_DMA32, "DMA32")) \
|
||||
EM (ZONE_NORMAL, "Normal") \
|
||||
IFDEF_ZONE_HIGHMEM( EM (ZONE_HIGHMEM,"HighMem")) \
|
||||
EMe(ZONE_MOVABLE,"Movable")
|
||||
|
||||
/*
|
||||
* First define the enums in the above macros to be exported to userspace
|
||||
* via TRACE_DEFINE_ENUM().
|
||||
*/
|
||||
#undef EM
|
||||
#undef EMe
|
||||
#define EM(a, b) TRACE_DEFINE_ENUM(a);
|
||||
#define EMe(a, b) TRACE_DEFINE_ENUM(a);
|
||||
|
||||
COMPACTION_STATUS
|
||||
ZONE_TYPE
|
||||
|
||||
/*
|
||||
* Now redefine the EM() and EMe() macros to map the enums to the strings
|
||||
* that will be printed in the output.
|
||||
*/
|
||||
#undef EM
|
||||
#undef EMe
|
||||
#define EM(a, b) {a, b},
|
||||
#define EMe(a, b) {a, b}
|
||||
|
||||
DECLARE_EVENT_CLASS(mm_compaction_isolate_template,
|
||||
|
||||
TP_PROTO(
|
||||
@ -161,7 +217,7 @@ TRACE_EVENT(mm_compaction_end,
|
||||
__entry->free_pfn,
|
||||
__entry->zone_end,
|
||||
__entry->sync ? "sync" : "async",
|
||||
compaction_status_string[__entry->status])
|
||||
__print_symbolic(__entry->status, COMPACTION_STATUS))
|
||||
);
|
||||
|
||||
TRACE_EVENT(mm_compaction_try_to_compact_pages,
|
||||
@ -201,23 +257,23 @@ DECLARE_EVENT_CLASS(mm_compaction_suitable_template,
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(int, nid)
|
||||
__field(char *, name)
|
||||
__field(enum zone_type, idx)
|
||||
__field(int, order)
|
||||
__field(int, ret)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->nid = zone_to_nid(zone);
|
||||
__entry->name = (char *)zone->name;
|
||||
__entry->idx = zone_idx(zone);
|
||||
__entry->order = order;
|
||||
__entry->ret = ret;
|
||||
),
|
||||
|
||||
TP_printk("node=%d zone=%-8s order=%d ret=%s",
|
||||
__entry->nid,
|
||||
__entry->name,
|
||||
__print_symbolic(__entry->idx, ZONE_TYPE),
|
||||
__entry->order,
|
||||
compaction_status_string[__entry->ret])
|
||||
__print_symbolic(__entry->ret, COMPACTION_STATUS))
|
||||
);
|
||||
|
||||
DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_finished,
|
||||
@ -247,7 +303,7 @@ DECLARE_EVENT_CLASS(mm_compaction_defer_template,
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(int, nid)
|
||||
__field(char *, name)
|
||||
__field(enum zone_type, idx)
|
||||
__field(int, order)
|
||||
__field(unsigned int, considered)
|
||||
__field(unsigned int, defer_shift)
|
||||
@ -256,7 +312,7 @@ DECLARE_EVENT_CLASS(mm_compaction_defer_template,
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->nid = zone_to_nid(zone);
|
||||
__entry->name = (char *)zone->name;
|
||||
__entry->idx = zone_idx(zone);
|
||||
__entry->order = order;
|
||||
__entry->considered = zone->compact_considered;
|
||||
__entry->defer_shift = zone->compact_defer_shift;
|
||||
@ -265,7 +321,7 @@ DECLARE_EVENT_CLASS(mm_compaction_defer_template,
|
||||
|
||||
TP_printk("node=%d zone=%-8s order=%d order_failed=%d consider=%u limit=%lu",
|
||||
__entry->nid,
|
||||
__entry->name,
|
||||
__print_symbolic(__entry->idx, ZONE_TYPE),
|
||||
__entry->order,
|
||||
__entry->order_failed,
|
||||
__entry->considered,
|
||||
|
@ -25,6 +25,11 @@
|
||||
# define MAP_UNINITIALIZED 0x0 /* Don't support this flag */
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Flags for mlock
|
||||
*/
|
||||
#define MLOCK_ONFAULT 0x01 /* Lock pages in range after they are faulted in, do not prefault */
|
||||
|
||||
#define MS_ASYNC 1 /* sync memory asynchronously */
|
||||
#define MS_INVALIDATE 2 /* invalidate the caches */
|
||||
#define MS_SYNC 4 /* synchronous memory sync */
|
||||
|
@ -17,5 +17,6 @@
|
||||
|
||||
#define MCL_CURRENT 1 /* lock all current mappings */
|
||||
#define MCL_FUTURE 2 /* lock all future mappings */
|
||||
#define MCL_ONFAULT 4 /* lock all pages that are faulted in */
|
||||
|
||||
#endif /* __ASM_GENERIC_MMAN_H */
|
||||
|
@ -713,9 +713,11 @@ __SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
|
||||
__SYSCALL(__NR_userfaultfd, sys_userfaultfd)
|
||||
#define __NR_membarrier 283
|
||||
__SYSCALL(__NR_membarrier, sys_membarrier)
|
||||
#define __NR_mlock2 284
|
||||
__SYSCALL(__NR_mlock2, sys_mlock2)
|
||||
|
||||
#undef __NR_syscalls
|
||||
#define __NR_syscalls 284
|
||||
#define __NR_syscalls 285
|
||||
|
||||
/*
|
||||
* All syscalls below here should go away really,
|
||||
|
@ -2598,22 +2598,22 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
|
||||
}
|
||||
|
||||
/**
|
||||
* cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
|
||||
* @tsk: pointer to task_struct of some task.
|
||||
* cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
|
||||
*
|
||||
* Description: Prints @task's name, cpuset name, and cached copy of its
|
||||
* Description: Prints current's name, cpuset name, and cached copy of its
|
||||
* mems_allowed to the kernel log.
|
||||
*/
|
||||
void cpuset_print_task_mems_allowed(struct task_struct *tsk)
|
||||
void cpuset_print_current_mems_allowed(void)
|
||||
{
|
||||
struct cgroup *cgrp;
|
||||
|
||||
rcu_read_lock();
|
||||
|
||||
cgrp = task_cs(tsk)->css.cgroup;
|
||||
pr_info("%s cpuset=", tsk->comm);
|
||||
cgrp = task_cs(current)->css.cgroup;
|
||||
pr_info("%s cpuset=", current->comm);
|
||||
pr_cont_cgroup_name(cgrp);
|
||||
pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed));
|
||||
pr_cont(" mems_allowed=%*pbl\n",
|
||||
nodemask_pr_args(&current->mems_allowed));
|
||||
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
@ -455,7 +455,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
|
||||
tmp->vm_mm = mm;
|
||||
if (anon_vma_fork(tmp, mpnt))
|
||||
goto fail_nomem_anon_vma_fork;
|
||||
tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP);
|
||||
tmp->vm_flags &=
|
||||
~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP);
|
||||
tmp->vm_next = tmp->vm_prev = NULL;
|
||||
tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
|
||||
file = tmp->vm_file;
|
||||
|
@ -194,6 +194,7 @@ cond_syscall(sys_mlock);
|
||||
cond_syscall(sys_munlock);
|
||||
cond_syscall(sys_mlockall);
|
||||
cond_syscall(sys_munlockall);
|
||||
cond_syscall(sys_mlock2);
|
||||
cond_syscall(sys_mincore);
|
||||
cond_syscall(sys_madvise);
|
||||
cond_syscall(sys_mremap);
|
||||
|
@ -888,6 +888,17 @@ static struct ctl_table kern_table[] = {
|
||||
.extra1 = &zero,
|
||||
.extra2 = &one,
|
||||
},
|
||||
#ifdef CONFIG_HARDLOCKUP_DETECTOR
|
||||
{
|
||||
.procname = "hardlockup_panic",
|
||||
.data = &hardlockup_panic,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = &zero,
|
||||
.extra2 = &one,
|
||||
},
|
||||
#endif
|
||||
#ifdef CONFIG_SMP
|
||||
{
|
||||
.procname = "softlockup_all_cpu_backtrace",
|
||||
@ -898,6 +909,15 @@ static struct ctl_table kern_table[] = {
|
||||
.extra1 = &zero,
|
||||
.extra2 = &one,
|
||||
},
|
||||
{
|
||||
.procname = "hardlockup_all_cpu_backtrace",
|
||||
.data = &sysctl_hardlockup_all_cpu_backtrace,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = &zero,
|
||||
.extra2 = &one,
|
||||
},
|
||||
#endif /* CONFIG_SMP */
|
||||
#endif
|
||||
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
|
||||
|
@ -57,8 +57,10 @@ int __read_mostly watchdog_thresh = 10;
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
int __read_mostly sysctl_softlockup_all_cpu_backtrace;
|
||||
int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
|
||||
#else
|
||||
#define sysctl_softlockup_all_cpu_backtrace 0
|
||||
#define sysctl_hardlockup_all_cpu_backtrace 0
|
||||
#endif
|
||||
static struct cpumask watchdog_cpumask __read_mostly;
|
||||
unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
|
||||
@ -110,8 +112,9 @@ static unsigned long soft_lockup_nmi_warn;
|
||||
* Should we panic when a soft-lockup or hard-lockup occurs:
|
||||
*/
|
||||
#ifdef CONFIG_HARDLOCKUP_DETECTOR
|
||||
static int hardlockup_panic =
|
||||
unsigned int __read_mostly hardlockup_panic =
|
||||
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
|
||||
static unsigned long hardlockup_allcpu_dumped;
|
||||
/*
|
||||
* We may not want to enable hard lockup detection by default in all cases,
|
||||
* for example when running the kernel as a guest on a hypervisor. In these
|
||||
@ -173,6 +176,13 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str)
|
||||
return 1;
|
||||
}
|
||||
__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
|
||||
/*
* Handler registered through __setup() for the "hardlockup_all_cpu_backtrace="
* option. @str is converted with simple_strtol() (base argument 0) and the
* result, normalised to 0 or 1 by the double negation, is stored in
* sysctl_hardlockup_all_cpu_backtrace. Always returns 1.
*/
static int __init hardlockup_all_cpu_backtrace_setup(char *str)
|
||||
{
|
||||
sysctl_hardlockup_all_cpu_backtrace =
|
||||
!!simple_strtol(str, NULL, 0);
|
||||
return 1;
|
||||
}
|
||||
__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
|
||||
#endif
|
||||
|
||||
/*
|
||||
@ -263,15 +273,15 @@ void touch_softlockup_watchdog_sync(void)
|
||||
|
||||
#ifdef CONFIG_HARDLOCKUP_DETECTOR
|
||||
/* watchdog detector functions */
|
||||
static int is_hardlockup(void)
|
||||
static bool is_hardlockup(void)
|
||||
{
|
||||
unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
|
||||
|
||||
if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
|
||||
return 1;
|
||||
return true;
|
||||
|
||||
__this_cpu_write(hrtimer_interrupts_saved, hrint);
|
||||
return 0;
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -279,7 +289,7 @@ static int is_softlockup(unsigned long touch_ts)
|
||||
{
|
||||
unsigned long now = get_timestamp();
|
||||
|
||||
if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) {
|
||||
if ((watchdog_enabled & SOFT_WATCHDOG_ENABLED) && watchdog_thresh){
|
||||
/* Warn about unreasonable delays. */
|
||||
if (time_after(now, touch_ts + get_softlockup_thresh()))
|
||||
return now - touch_ts;
|
||||
@ -318,17 +328,30 @@ static void watchdog_overflow_callback(struct perf_event *event,
|
||||
*/
|
||||
if (is_hardlockup()) {
|
||||
int this_cpu = smp_processor_id();
|
||||
struct pt_regs *regs = get_irq_regs();
|
||||
|
||||
/* only print hardlockups once */
|
||||
if (__this_cpu_read(hard_watchdog_warn) == true)
|
||||
return;
|
||||
|
||||
if (hardlockup_panic)
|
||||
panic("Watchdog detected hard LOCKUP on cpu %d",
|
||||
this_cpu);
|
||||
pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
|
||||
print_modules();
|
||||
print_irqtrace_events(current);
|
||||
if (regs)
|
||||
show_regs(regs);
|
||||
else
|
||||
WARN(1, "Watchdog detected hard LOCKUP on cpu %d",
|
||||
this_cpu);
|
||||
dump_stack();
|
||||
|
||||
/*
|
||||
* Perform all-CPU dump only once to avoid multiple hardlockups
|
||||
* generating interleaving traces
|
||||
*/
|
||||
if (sysctl_hardlockup_all_cpu_backtrace &&
|
||||
!test_and_set_bit(0, &hardlockup_allcpu_dumped))
|
||||
trigger_allbutself_cpu_backtrace();
|
||||
|
||||
if (hardlockup_panic)
|
||||
panic("Hard LOCKUP");
|
||||
|
||||
__this_cpu_write(hard_watchdog_warn, true);
|
||||
return;
|
||||
@ -347,6 +370,9 @@ static void watchdog_interrupt_count(void)
|
||||
static int watchdog_nmi_enable(unsigned int cpu);
|
||||
static void watchdog_nmi_disable(unsigned int cpu);
|
||||
|
||||
static int watchdog_enable_all_cpus(void);
|
||||
static void watchdog_disable_all_cpus(void);
|
||||
|
||||
/* watchdog kicker functions */
|
||||
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
|
||||
{
|
||||
@ -651,37 +677,41 @@ static struct smp_hotplug_thread watchdog_threads = {
|
||||
|
||||
/*
|
||||
* park all watchdog threads that are specified in 'watchdog_cpumask'
|
||||
*
|
||||
* This function returns an error if kthread_park() of a watchdog thread
|
||||
* fails. In this situation, the watchdog threads of some CPUs can already
|
||||
* be parked and the watchdog threads of other CPUs can still be runnable.
|
||||
* Callers are expected to handle this special condition as appropriate in
|
||||
* their context.
|
||||
*
|
||||
* This function may only be called in a context that is protected against
|
||||
* races with CPU hotplug - for example, via get_online_cpus().
|
||||
*/
|
||||
static int watchdog_park_threads(void)
|
||||
{
|
||||
int cpu, ret = 0;
|
||||
|
||||
get_online_cpus();
|
||||
for_each_watchdog_cpu(cpu) {
|
||||
ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
if (ret) {
|
||||
for_each_watchdog_cpu(cpu)
|
||||
kthread_unpark(per_cpu(softlockup_watchdog, cpu));
|
||||
}
|
||||
put_online_cpus();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* unpark all watchdog threads that are specified in 'watchdog_cpumask'
|
||||
*
|
||||
* This function may only be called in a context that is protected against
|
||||
* races with CPU hotplug - for example, via get_online_cpus().
|
||||
*/
|
||||
static void watchdog_unpark_threads(void)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
get_online_cpus();
|
||||
for_each_watchdog_cpu(cpu)
|
||||
kthread_unpark(per_cpu(softlockup_watchdog, cpu));
|
||||
put_online_cpus();
|
||||
}
|
||||
|
||||
/*
|
||||
@ -691,6 +721,7 @@ int lockup_detector_suspend(void)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
get_online_cpus();
|
||||
mutex_lock(&watchdog_proc_mutex);
|
||||
/*
|
||||
* Multiple suspend requests can be active in parallel (counted by
|
||||
@ -704,6 +735,11 @@ int lockup_detector_suspend(void)
|
||||
|
||||
if (ret == 0)
|
||||
watchdog_suspended++;
|
||||
else {
|
||||
watchdog_disable_all_cpus();
|
||||
pr_err("Failed to suspend lockup detectors, disabled\n");
|
||||
watchdog_enabled = 0;
|
||||
}
|
||||
|
||||
mutex_unlock(&watchdog_proc_mutex);
|
||||
|
||||
@ -726,12 +762,20 @@ void lockup_detector_resume(void)
|
||||
watchdog_unpark_threads();
|
||||
|
||||
mutex_unlock(&watchdog_proc_mutex);
|
||||
put_online_cpus();
|
||||
}
|
||||
|
||||
static void update_watchdog_all_cpus(void)
|
||||
static int update_watchdog_all_cpus(void)
|
||||
{
|
||||
watchdog_park_threads();
|
||||
int ret;
|
||||
|
||||
ret = watchdog_park_threads();
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
watchdog_unpark_threads();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int watchdog_enable_all_cpus(void)
|
||||
@ -750,15 +794,20 @@ static int watchdog_enable_all_cpus(void)
|
||||
* Enable/disable the lockup detectors or
|
||||
* change the sample period 'on the fly'.
|
||||
*/
|
||||
update_watchdog_all_cpus();
|
||||
err = update_watchdog_all_cpus();
|
||||
|
||||
if (err) {
|
||||
watchdog_disable_all_cpus();
|
||||
pr_err("Failed to update lockup detectors, disabled\n");
|
||||
}
|
||||
}
|
||||
|
||||
if (err)
|
||||
watchdog_enabled = 0;
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
/* prepare/enable/disable routines */
|
||||
/* sysctl functions */
|
||||
#ifdef CONFIG_SYSCTL
|
||||
static void watchdog_disable_all_cpus(void)
|
||||
{
|
||||
if (watchdog_running) {
|
||||
@ -767,6 +816,8 @@ static void watchdog_disable_all_cpus(void)
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SYSCTL
|
||||
|
||||
/*
|
||||
* Update the run state of the lockup detectors.
|
||||
*/
|
||||
@ -808,6 +859,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
|
||||
int err, old, new;
|
||||
int *watchdog_param = (int *)table->data;
|
||||
|
||||
get_online_cpus();
|
||||
mutex_lock(&watchdog_proc_mutex);
|
||||
|
||||
if (watchdog_suspended) {
|
||||
@ -849,15 +901,17 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
|
||||
} while (cmpxchg(&watchdog_enabled, old, new) != old);
|
||||
|
||||
/*
|
||||
* Update the run state of the lockup detectors.
|
||||
* Restore 'watchdog_enabled' on failure.
|
||||
* Update the run state of the lockup detectors. There is _no_
|
||||
* need to check the value returned by proc_watchdog_update()
|
||||
* and to restore the previous value of 'watchdog_enabled' as
|
||||
* both lockup detectors are disabled if proc_watchdog_update()
|
||||
* returns an error.
|
||||
*/
|
||||
err = proc_watchdog_update();
|
||||
if (err)
|
||||
watchdog_enabled = old;
|
||||
}
|
||||
out:
|
||||
mutex_unlock(&watchdog_proc_mutex);
|
||||
put_online_cpus();
|
||||
return err;
|
||||
}
|
||||
|
||||
@ -899,6 +953,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
|
||||
{
|
||||
int err, old;
|
||||
|
||||
get_online_cpus();
|
||||
mutex_lock(&watchdog_proc_mutex);
|
||||
|
||||
if (watchdog_suspended) {
|
||||
@ -914,15 +969,17 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* Update the sample period.
|
||||
* Restore 'watchdog_thresh' on failure.
|
||||
* Update the sample period. Restore on failure.
|
||||
*/
|
||||
set_sample_period();
|
||||
err = proc_watchdog_update();
|
||||
if (err)
|
||||
if (err) {
|
||||
watchdog_thresh = old;
|
||||
set_sample_period();
|
||||
}
|
||||
out:
|
||||
mutex_unlock(&watchdog_proc_mutex);
|
||||
put_online_cpus();
|
||||
return err;
|
||||
}
|
||||
|
||||
@ -937,6 +994,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
|
||||
{
|
||||
int err;
|
||||
|
||||
get_online_cpus();
|
||||
mutex_lock(&watchdog_proc_mutex);
|
||||
|
||||
if (watchdog_suspended) {
|
||||
@ -964,6 +1022,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
|
||||
}
|
||||
out:
|
||||
mutex_unlock(&watchdog_proc_mutex);
|
||||
put_online_cpus();
|
||||
return err;
|
||||
}
|
||||
|
||||
|
@ -15,8 +15,7 @@ config KASAN
|
||||
global variables requires gcc 5.0 or later.
|
||||
This feature consumes about 1/8 of available memory and brings about
|
||||
~x3 performance slowdown.
|
||||
For better error detection enable CONFIG_STACKTRACE,
|
||||
and add slub_debug=U to boot cmdline.
|
||||
For better error detection enable CONFIG_STACKTRACE.
|
||||
|
||||
choice
|
||||
prompt "Instrumentation type"
|
||||
|
@ -138,6 +138,71 @@ static noinline void __init kmalloc_oob_16(void)
|
||||
kfree(ptr2);
|
||||
}
|
||||
|
||||
/*
 * Allocate an 8-byte buffer and memset() 2 bytes starting at offset 7, so
 * the write runs one byte past the end of the allocation.  The overrun is
 * intentional (presumably so that KASAN reports it); do not "fix" it.
 * Logs an error and returns if the allocation fails.
 */
static noinline void __init kmalloc_oob_memset_2(void)
{
	char *ptr;
	size_t size = 8;

	pr_info("out-of-bounds in memset2\n");
	ptr = kmalloc(size, GFP_KERNEL);
	if (!ptr) {
		pr_err("Allocation failed\n");
		return;
	}

	/* bytes 7..8 of an 8-byte object: last byte is out of bounds */
	memset(ptr+7, 0, 2);
	kfree(ptr);
}
|
||||
|
||||
/*
 * Allocate an 8-byte buffer and memset() 4 bytes starting at offset 5, so
 * the write runs one byte past the end of the allocation.  The overrun is
 * intentional (presumably so that KASAN reports it); do not "fix" it.
 * Logs an error and returns if the allocation fails.
 */
static noinline void __init kmalloc_oob_memset_4(void)
{
	char *ptr;
	size_t size = 8;

	pr_info("out-of-bounds in memset4\n");
	ptr = kmalloc(size, GFP_KERNEL);
	if (!ptr) {
		pr_err("Allocation failed\n");
		return;
	}

	/* bytes 5..8 of an 8-byte object: last byte is out of bounds */
	memset(ptr+5, 0, 4);
	kfree(ptr);
}
|
||||
|
||||
|
||||
/*
 * Allocate an 8-byte buffer and memset() 8 bytes starting at offset 1, so
 * the write runs one byte past the end of the allocation.  The overrun is
 * intentional (presumably so that KASAN reports it); do not "fix" it.
 * Logs an error and returns if the allocation fails.
 */
static noinline void __init kmalloc_oob_memset_8(void)
{
	char *ptr;
	size_t size = 8;

	pr_info("out-of-bounds in memset8\n");
	ptr = kmalloc(size, GFP_KERNEL);
	if (!ptr) {
		pr_err("Allocation failed\n");
		return;
	}

	/* bytes 1..8 of an 8-byte object: last byte is out of bounds */
	memset(ptr+1, 0, 8);
	kfree(ptr);
}
|
||||
|
||||
/*
 * Allocate a 16-byte buffer and memset() 16 bytes starting at offset 1, so
 * the write runs one byte past the end of the allocation.  The overrun is
 * intentional (presumably so that KASAN reports it); do not "fix" it.
 * Logs an error and returns if the allocation fails.
 */
static noinline void __init kmalloc_oob_memset_16(void)
{
	char *ptr;
	size_t size = 16;

	pr_info("out-of-bounds in memset16\n");
	ptr = kmalloc(size, GFP_KERNEL);
	if (!ptr) {
		pr_err("Allocation failed\n");
		return;
	}

	/* bytes 1..16 of a 16-byte object: last byte is out of bounds */
	memset(ptr+1, 0, 16);
	kfree(ptr);
}
|
||||
|
||||
static noinline void __init kmalloc_oob_in_memset(void)
|
||||
{
|
||||
char *ptr;
|
||||
@ -264,6 +329,10 @@ static int __init kmalloc_tests_init(void)
|
||||
kmalloc_oob_krealloc_less();
|
||||
kmalloc_oob_16();
|
||||
kmalloc_oob_in_memset();
|
||||
kmalloc_oob_memset_2();
|
||||
kmalloc_oob_memset_4();
|
||||
kmalloc_oob_memset_8();
|
||||
kmalloc_oob_memset_16();
|
||||
kmalloc_uaf();
|
||||
kmalloc_uaf_memset();
|
||||
kmalloc_uaf2();
|
||||
|
@ -199,23 +199,17 @@ int balloon_page_migrate(struct page *newpage,
|
||||
struct balloon_dev_info *balloon = balloon_page_device(page);
|
||||
int rc = -EAGAIN;
|
||||
|
||||
/*
|
||||
* Block others from accessing the 'newpage' when we get around to
|
||||
* establishing additional references. We should be the only one
|
||||
* holding a reference to the 'newpage' at this point.
|
||||
*/
|
||||
BUG_ON(!trylock_page(newpage));
|
||||
VM_BUG_ON_PAGE(!PageLocked(page), page);
|
||||
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
|
||||
|
||||
if (WARN_ON(!__is_movable_balloon_page(page))) {
|
||||
dump_page(page, "not movable balloon page");
|
||||
unlock_page(newpage);
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (balloon && balloon->migratepage)
|
||||
rc = balloon->migratepage(balloon, newpage, page, mode);
|
||||
|
||||
unlock_page(newpage);
|
||||
return rc;
|
||||
}
|
||||
#endif /* CONFIG_BALLOON_COMPACTION */
|
||||
|
6
mm/cma.c
6
mm/cma.c
@ -363,7 +363,9 @@ int __init cma_declare_contiguous(phys_addr_t base,
|
||||
*/
|
||||
struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align)
|
||||
{
|
||||
unsigned long mask, offset, pfn, start = 0;
|
||||
unsigned long mask, offset;
|
||||
unsigned long pfn = -1;
|
||||
unsigned long start = 0;
|
||||
unsigned long bitmap_maxno, bitmap_no, bitmap_count;
|
||||
struct page *page = NULL;
|
||||
int ret;
|
||||
@ -418,7 +420,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align)
|
||||
start = bitmap_no + mask + 1;
|
||||
}
|
||||
|
||||
trace_cma_alloc(page ? pfn : -1UL, page, count, align);
|
||||
trace_cma_alloc(pfn, page, count, align);
|
||||
|
||||
pr_debug("%s(): returned %p\n", __func__, page);
|
||||
return page;
|
||||
|
@ -35,17 +35,6 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
|
||||
#endif
|
||||
|
||||
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
|
||||
#ifdef CONFIG_TRACEPOINTS
|
||||
static const char *const compaction_status_string[] = {
|
||||
"deferred",
|
||||
"skipped",
|
||||
"continue",
|
||||
"partial",
|
||||
"complete",
|
||||
"no_suitable_page",
|
||||
"not_suitable_zone",
|
||||
};
|
||||
#endif
|
||||
|
||||
#define CREATE_TRACE_POINTS
|
||||
#include <trace/events/compaction.h>
|
||||
@ -1197,6 +1186,15 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
|
||||
return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
|
||||
}
|
||||
|
||||
/*
 * order == -1 is expected when compacting via
 * /proc/sys/vm/compact_memory
 *
 * Returns true exactly when @order is that -1 sentinel, false for any
 * other order.
 */
static inline bool is_via_compact_memory(int order)
{
	return order == -1;
}
|
||||
|
||||
static int __compact_finished(struct zone *zone, struct compact_control *cc,
|
||||
const int migratetype)
|
||||
{
|
||||
@ -1204,7 +1202,7 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
|
||||
unsigned long watermark;
|
||||
|
||||
if (cc->contended || fatal_signal_pending(current))
|
||||
return COMPACT_PARTIAL;
|
||||
return COMPACT_CONTENDED;
|
||||
|
||||
/* Compaction run completes if the migrate and free scanner meet */
|
||||
if (compact_scanners_met(cc)) {
|
||||
@ -1223,11 +1221,7 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
|
||||
return COMPACT_COMPLETE;
|
||||
}
|
||||
|
||||
/*
|
||||
* order == -1 is expected when compacting via
|
||||
* /proc/sys/vm/compact_memory
|
||||
*/
|
||||
if (cc->order == -1)
|
||||
if (is_via_compact_memory(cc->order))
|
||||
return COMPACT_CONTINUE;
|
||||
|
||||
/* Compaction run is not finished if the watermark is not met */
|
||||
@ -1290,11 +1284,7 @@ static unsigned long __compaction_suitable(struct zone *zone, int order,
|
||||
int fragindex;
|
||||
unsigned long watermark;
|
||||
|
||||
/*
|
||||
* order == -1 is expected when compacting via
|
||||
* /proc/sys/vm/compact_memory
|
||||
*/
|
||||
if (order == -1)
|
||||
if (is_via_compact_memory(order))
|
||||
return COMPACT_CONTINUE;
|
||||
|
||||
watermark = low_wmark_pages(zone);
|
||||
@ -1403,7 +1393,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
|
||||
|
||||
switch (isolate_migratepages(zone, cc)) {
|
||||
case ISOLATE_ABORT:
|
||||
ret = COMPACT_PARTIAL;
|
||||
ret = COMPACT_CONTENDED;
|
||||
putback_movable_pages(&cc->migratepages);
|
||||
cc->nr_migratepages = 0;
|
||||
goto out;
|
||||
@ -1434,7 +1424,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
|
||||
* and we want compact_finished() to detect it
|
||||
*/
|
||||
if (err == -ENOMEM && !compact_scanners_met(cc)) {
|
||||
ret = COMPACT_PARTIAL;
|
||||
ret = COMPACT_CONTENDED;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
@ -1487,6 +1477,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
|
||||
trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
|
||||
cc->free_pfn, end_pfn, sync, ret);
|
||||
|
||||
if (ret == COMPACT_CONTENDED)
|
||||
ret = COMPACT_PARTIAL;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -1658,10 +1651,11 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
|
||||
* this makes sure we compact the whole zone regardless of
|
||||
* cached scanner positions.
|
||||
*/
|
||||
if (cc->order == -1)
|
||||
if (is_via_compact_memory(cc->order))
|
||||
__reset_isolation_suitable(zone);
|
||||
|
||||
if (cc->order == -1 || !compaction_deferred(zone, cc->order))
|
||||
if (is_via_compact_memory(cc->order) ||
|
||||
!compaction_deferred(zone, cc->order))
|
||||
compact_zone(zone, cc);
|
||||
|
||||
if (cc->order > 0) {
|
||||
|
@ -125,6 +125,7 @@ static const struct trace_print_flags vmaflags_names[] = {
|
||||
{VM_GROWSDOWN, "growsdown" },
|
||||
{VM_PFNMAP, "pfnmap" },
|
||||
{VM_DENYWRITE, "denywrite" },
|
||||
{VM_LOCKONFAULT, "lockonfault" },
|
||||
{VM_LOCKED, "locked" },
|
||||
{VM_IO, "io" },
|
||||
{VM_SEQ_READ, "seqread" },
|
||||
|
@ -126,7 +126,7 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
|
||||
/*
|
||||
* Mappings have to be page-aligned
|
||||
*/
|
||||
offset = phys_addr & ~PAGE_MASK;
|
||||
offset = offset_in_page(phys_addr);
|
||||
phys_addr &= PAGE_MASK;
|
||||
size = PAGE_ALIGN(last_addr + 1) - phys_addr;
|
||||
|
||||
@ -189,7 +189,7 @@ void __init early_iounmap(void __iomem *addr, unsigned long size)
|
||||
if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)))
|
||||
return;
|
||||
|
||||
offset = virt_addr & ~PAGE_MASK;
|
||||
offset = offset_in_page(virt_addr);
|
||||
nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT;
|
||||
|
||||
idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
|
||||
@ -234,7 +234,7 @@ void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size)
|
||||
char *p;
|
||||
|
||||
while (size) {
|
||||
slop = src & ~PAGE_MASK;
|
||||
slop = offset_in_page(src);
|
||||
clen = size;
|
||||
if (clen > MAX_MAP_CHUNK - slop)
|
||||
clen = MAX_MAP_CHUNK - slop;
|
||||
|
77
mm/filemap.c
77
mm/filemap.c
@ -331,23 +331,14 @@ int filemap_flush(struct address_space *mapping)
|
||||
}
|
||||
EXPORT_SYMBOL(filemap_flush);
|
||||
|
||||
/**
|
||||
* filemap_fdatawait_range - wait for writeback to complete
|
||||
* @mapping: address space structure to wait for
|
||||
* @start_byte: offset in bytes where the range starts
|
||||
* @end_byte: offset in bytes where the range ends (inclusive)
|
||||
*
|
||||
* Walk the list of under-writeback pages of the given address space
|
||||
* in the given range and wait for all of them.
|
||||
*/
|
||||
int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
|
||||
loff_t end_byte)
|
||||
static int __filemap_fdatawait_range(struct address_space *mapping,
|
||||
loff_t start_byte, loff_t end_byte)
|
||||
{
|
||||
pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
|
||||
pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
|
||||
struct pagevec pvec;
|
||||
int nr_pages;
|
||||
int ret2, ret = 0;
|
||||
int ret = 0;
|
||||
|
||||
if (end_byte < start_byte)
|
||||
goto out;
|
||||
@ -374,6 +365,29 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
|
||||
cond_resched();
|
||||
}
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* filemap_fdatawait_range - wait for writeback to complete
|
||||
* @mapping: address space structure to wait for
|
||||
* @start_byte: offset in bytes where the range starts
|
||||
* @end_byte: offset in bytes where the range ends (inclusive)
|
||||
*
|
||||
* Walk the list of under-writeback pages of the given address space
|
||||
* in the given range and wait for all of them. Check error status of
|
||||
* the address space and return it.
|
||||
*
|
||||
* Since the error status of the address space is cleared by this function,
|
||||
* callers are responsible for checking the return value and handling and/or
|
||||
* reporting the error.
|
||||
*/
|
||||
int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
|
||||
loff_t end_byte)
|
||||
{
|
||||
int ret, ret2;
|
||||
|
||||
ret = __filemap_fdatawait_range(mapping, start_byte, end_byte);
|
||||
ret2 = filemap_check_errors(mapping);
|
||||
if (!ret)
|
||||
ret = ret2;
|
||||
@ -382,12 +396,39 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
|
||||
}
|
||||
EXPORT_SYMBOL(filemap_fdatawait_range);
|
||||
|
||||
/**
|
||||
* filemap_fdatawait_keep_errors - wait for writeback without clearing errors
|
||||
* @mapping: address space structure to wait for
|
||||
*
|
||||
* Walk the list of under-writeback pages of the given address space
|
||||
* and wait for all of them. Unlike filemap_fdatawait(), this function
|
||||
* does not clear error status of the address space.
|
||||
*
|
||||
* Use this function if callers don't handle errors themselves. Expected
|
||||
* call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
|
||||
* fsfreeze(8)
|
||||
*/
|
||||
void filemap_fdatawait_keep_errors(struct address_space *mapping)
|
||||
{
|
||||
loff_t i_size = i_size_read(mapping->host);
|
||||
|
||||
if (i_size == 0)
|
||||
return;
|
||||
|
||||
__filemap_fdatawait_range(mapping, 0, i_size - 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* filemap_fdatawait - wait for all under-writeback pages to complete
|
||||
* @mapping: address space structure to wait for
|
||||
*
|
||||
* Walk the list of under-writeback pages of the given address space
|
||||
* and wait for all of them.
|
||||
* and wait for all of them. Check error status of the address space
|
||||
* and return it.
|
||||
*
|
||||
* Since the error status of the address space is cleared by this function,
|
||||
* callers are responsible for checking the return value and handling and/or
|
||||
* reporting the error.
|
||||
*/
|
||||
int filemap_fdatawait(struct address_space *mapping)
|
||||
{
|
||||
@ -510,7 +551,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
|
||||
__inc_zone_page_state(new, NR_SHMEM);
|
||||
spin_unlock_irqrestore(&mapping->tree_lock, flags);
|
||||
mem_cgroup_end_page_stat(memcg);
|
||||
mem_cgroup_migrate(old, new, true);
|
||||
mem_cgroup_replace_page(old, new);
|
||||
radix_tree_preload_end();
|
||||
if (freepage)
|
||||
freepage(old);
|
||||
@ -1807,7 +1848,6 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
|
||||
struct file *file,
|
||||
pgoff_t offset)
|
||||
{
|
||||
unsigned long ra_pages;
|
||||
struct address_space *mapping = file->f_mapping;
|
||||
|
||||
/* If we don't want any read-ahead, don't bother */
|
||||
@ -1836,10 +1876,9 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
|
||||
/*
|
||||
* mmap read-around
|
||||
*/
|
||||
ra_pages = max_sane_readahead(ra->ra_pages);
|
||||
ra->start = max_t(long, 0, offset - ra_pages / 2);
|
||||
ra->size = ra_pages;
|
||||
ra->async_size = ra_pages / 4;
|
||||
ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
|
||||
ra->size = ra->ra_pages;
|
||||
ra->async_size = ra->ra_pages / 4;
|
||||
ra_submit(ra, mapping, file);
|
||||
}
|
||||
|
||||
|
@ -7,7 +7,7 @@
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/sched.h>
|
||||
|
||||
/*
|
||||
/**
|
||||
* get_vaddr_frames() - map virtual addresses to pfns
|
||||
* @start: starting user address
|
||||
* @nr_frames: number of pages / pfns from start to map
|
||||
|
10
mm/gup.c
10
mm/gup.c
@ -129,7 +129,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
|
||||
*/
|
||||
mark_page_accessed(page);
|
||||
}
|
||||
if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
|
||||
if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
|
||||
/*
|
||||
* The preliminary mapping check is mainly to avoid the
|
||||
* pointless overhead of lock_page on the ZERO_PAGE
|
||||
@ -299,6 +299,9 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
|
||||
unsigned int fault_flags = 0;
|
||||
int ret;
|
||||
|
||||
/* mlock all present pages, but do not fault in new pages */
|
||||
if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
|
||||
return -ENOENT;
|
||||
/* For mm_populate(), just skip the stack guard page. */
|
||||
if ((*flags & FOLL_POPULATE) &&
|
||||
(stack_guard_page_start(vma, address) ||
|
||||
@ -890,7 +893,10 @@ long populate_vma_page_range(struct vm_area_struct *vma,
|
||||
VM_BUG_ON_VMA(end > vma->vm_end, vma);
|
||||
VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
|
||||
|
||||
gup_flags = FOLL_TOUCH | FOLL_POPULATE;
|
||||
gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
|
||||
if (vma->vm_flags & VM_LOCKONFAULT)
|
||||
gup_flags &= ~FOLL_POPULATE;
|
||||
|
||||
/*
|
||||
* We want to touch writable mappings with a write fault in order
|
||||
* to break COW, except for shared mappings because these don't COW
|
||||
|
@ -1307,7 +1307,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
|
||||
pmd, _pmd, 1))
|
||||
update_mmu_cache_pmd(vma, addr, pmd);
|
||||
}
|
||||
if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
|
||||
if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
|
||||
if (page->mapping && trylock_page(page)) {
|
||||
lru_add_drain();
|
||||
if (page->mapping)
|
||||
|
139
mm/hugetlb.c
139
mm/hugetlb.c
@ -1437,7 +1437,82 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
|
||||
dissolve_free_huge_page(pfn_to_page(pfn));
|
||||
}
|
||||
|
||||
static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
|
||||
/*
|
||||
* There are 3 ways this can get called:
|
||||
* 1. With vma+addr: we use the VMA's memory policy
|
||||
* 2. With !vma, but nid=NUMA_NO_NODE: We try to allocate a huge
|
||||
* page from any node, and let the buddy allocator itself figure
|
||||
* it out.
|
||||
* 3. With !vma, but nid!=NUMA_NO_NODE. We allocate a huge page
|
||||
* strictly from 'nid'
|
||||
*/
|
||||
static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
|
||||
struct vm_area_struct *vma, unsigned long addr, int nid)
|
||||
{
|
||||
int order = huge_page_order(h);
|
||||
gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN;
|
||||
unsigned int cpuset_mems_cookie;
|
||||
|
||||
/*
|
||||
* We need a VMA to get a memory policy. If we do not
|
||||
* have one, we use the 'nid' argument.
|
||||
*
|
||||
* The mempolicy stuff below has some non-inlined bits
|
||||
* and calls ->vm_ops. That makes it hard to optimize at
|
||||
* compile-time, even when NUMA is off and it does
|
||||
* nothing. This helps the compiler optimize it out.
|
||||
*/
|
||||
if (!IS_ENABLED(CONFIG_NUMA) || !vma) {
|
||||
/*
|
||||
* If a specific node is requested, make sure to
|
||||
* get memory from there, but only when a node
|
||||
* is explicitly specified.
|
||||
*/
|
||||
if (nid != NUMA_NO_NODE)
|
||||
gfp |= __GFP_THISNODE;
|
||||
/*
|
||||
* Make sure to call something that can handle
|
||||
* nid=NUMA_NO_NODE
|
||||
*/
|
||||
return alloc_pages_node(nid, gfp, order);
|
||||
}
|
||||
|
||||
/*
|
||||
* OK, so we have a VMA. Fetch the mempolicy and try to
|
||||
* allocate a huge page with it. We will only reach this
|
||||
* when CONFIG_NUMA=y.
|
||||
*/
|
||||
do {
|
||||
struct page *page;
|
||||
struct mempolicy *mpol;
|
||||
struct zonelist *zl;
|
||||
nodemask_t *nodemask;
|
||||
|
||||
cpuset_mems_cookie = read_mems_allowed_begin();
|
||||
zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask);
|
||||
mpol_cond_put(mpol);
|
||||
page = __alloc_pages_nodemask(gfp, order, zl, nodemask);
|
||||
if (page)
|
||||
return page;
|
||||
} while (read_mems_allowed_retry(cpuset_mems_cookie));
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* There are two ways to allocate a huge page:
|
||||
* 1. When you have a VMA and an address (like a fault)
|
||||
* 2. When you have no VMA (like when setting /proc/.../nr_hugepages)
|
||||
*
|
||||
* 'vma' and 'addr' are only for (1). 'nid' is always NUMA_NO_NODE in
|
||||
* this case which signifies that the allocation should be done with
|
||||
* respect for the VMA's memory policy.
|
||||
*
|
||||
* For (2), we ignore 'vma' and 'addr' and use 'nid' exclusively. This
|
||||
* implies that memory policies will not be taken into account.
|
||||
*/
|
||||
static struct page *__alloc_buddy_huge_page(struct hstate *h,
|
||||
struct vm_area_struct *vma, unsigned long addr, int nid)
|
||||
{
|
||||
struct page *page;
|
||||
unsigned int r_nid;
|
||||
@ -1445,6 +1520,15 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
|
||||
if (hstate_is_gigantic(h))
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* Make sure that anyone specifying 'nid' is not also specifying a VMA.
|
||||
* This makes sure the caller is picking _one_ of the modes with which
|
||||
* we can call this function, not both.
|
||||
*/
|
||||
if (vma || (addr != -1)) {
|
||||
VM_WARN_ON_ONCE(addr == -1);
|
||||
VM_WARN_ON_ONCE(nid != NUMA_NO_NODE);
|
||||
}
|
||||
/*
|
||||
* Assume we will successfully allocate the surplus page to
|
||||
* prevent racing processes from causing the surplus to exceed
|
||||
@ -1478,14 +1562,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
|
||||
}
|
||||
spin_unlock(&hugetlb_lock);
|
||||
|
||||
if (nid == NUMA_NO_NODE)
|
||||
page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP|
|
||||
__GFP_REPEAT|__GFP_NOWARN,
|
||||
huge_page_order(h));
|
||||
else
|
||||
page = __alloc_pages_node(nid,
|
||||
htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
|
||||
__GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
|
||||
page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid);
|
||||
|
||||
spin_lock(&hugetlb_lock);
|
||||
if (page) {
|
||||
@ -1509,6 +1586,29 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
|
||||
return page;
|
||||
}
|
||||
|
||||
/*
 * Allocate a huge page from 'nid'.  Note, 'nid' may be
 * NUMA_NO_NODE, which means that it may be allocated
 * anywhere.
 *
 * Thin wrapper: calls __alloc_buddy_huge_page() with no VMA and an
 * address of -1, i.e. without any memory policy.
 */
static
struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid)
{
	/* -1 is the "no address" marker paired with a NULL vma. */
	unsigned long addr = -1;

	return __alloc_buddy_huge_page(h, NULL, addr, nid);
}
|
||||
|
||||
/*
|
||||
* Use the VMA's mpolicy to allocate a huge page from the buddy.
|
||||
*/
|
||||
static
|
||||
struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
|
||||
struct vm_area_struct *vma, unsigned long addr)
|
||||
{
|
||||
return __alloc_buddy_huge_page(h, vma, addr, NUMA_NO_NODE);
|
||||
}
|
||||
|
||||
/*
|
||||
* This allocation function is useful in the context where vma is irrelevant.
|
||||
* E.g. soft-offlining uses this function because it only cares physical
|
||||
@ -1524,7 +1624,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
|
||||
spin_unlock(&hugetlb_lock);
|
||||
|
||||
if (!page)
|
||||
page = alloc_buddy_huge_page(h, nid);
|
||||
page = __alloc_buddy_huge_page_no_mpol(h, nid);
|
||||
|
||||
return page;
|
||||
}
|
||||
@ -1554,7 +1654,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
|
||||
retry:
|
||||
spin_unlock(&hugetlb_lock);
|
||||
for (i = 0; i < needed; i++) {
|
||||
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
|
||||
page = __alloc_buddy_huge_page_no_mpol(h, NUMA_NO_NODE);
|
||||
if (!page) {
|
||||
alloc_ok = false;
|
||||
break;
|
||||
@ -1787,7 +1887,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
|
||||
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
|
||||
if (!page) {
|
||||
spin_unlock(&hugetlb_lock);
|
||||
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
|
||||
page = __alloc_buddy_huge_page_with_mpol(h, vma, addr);
|
||||
if (!page)
|
||||
goto out_uncharge_cgroup;
|
||||
|
||||
@ -2376,7 +2476,7 @@ struct node_hstate {
|
||||
struct kobject *hugepages_kobj;
|
||||
struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
|
||||
};
|
||||
struct node_hstate node_hstates[MAX_NUMNODES];
|
||||
static struct node_hstate node_hstates[MAX_NUMNODES];
|
||||
|
||||
/*
|
||||
* A subset of global hstate attributes for node devices
|
||||
@ -2790,6 +2890,12 @@ void hugetlb_show_meminfo(void)
|
||||
1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
|
||||
}
|
||||
|
||||
void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
|
||||
{
|
||||
seq_printf(m, "HugetlbPages:\t%8lu kB\n",
|
||||
atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10));
|
||||
}
|
||||
|
||||
/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
|
||||
unsigned long hugetlb_total_pages(void)
|
||||
{
|
||||
@ -3025,6 +3131,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
|
||||
get_page(ptepage);
|
||||
page_dup_rmap(ptepage);
|
||||
set_huge_pte_at(dst, addr, dst_pte, entry);
|
||||
hugetlb_count_add(pages_per_huge_page(h), dst);
|
||||
}
|
||||
spin_unlock(src_ptl);
|
||||
spin_unlock(dst_ptl);
|
||||
@ -3105,6 +3212,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
|
||||
if (huge_pte_dirty(pte))
|
||||
set_page_dirty(page);
|
||||
|
||||
hugetlb_count_sub(pages_per_huge_page(h), mm);
|
||||
page_remove_rmap(page);
|
||||
force_flush = !__tlb_remove_page(tlb, page);
|
||||
if (force_flush) {
|
||||
@ -3509,6 +3617,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
&& (vma->vm_flags & VM_SHARED)));
|
||||
set_huge_pte_at(mm, address, ptep, new_pte);
|
||||
|
||||
hugetlb_count_add(pages_per_huge_page(h), mm);
|
||||
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
|
||||
/* Optimization, do the COW without a second fault */
|
||||
ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
|
||||
@ -4028,8 +4137,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
|
||||
unsigned long s_end = sbase + PUD_SIZE;
|
||||
|
||||
/* Allow segments to share if only one is marked locked */
|
||||
unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
|
||||
unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
|
||||
unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
|
||||
unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
|
||||
|
||||
/*
|
||||
* match the virtual addresses, permission and the alignment of the
|
||||
|
@ -186,7 +186,8 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
ret = page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter);
|
||||
if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter))
|
||||
ret = -ENOMEM;
|
||||
css_put(&h_cg->css);
|
||||
done:
|
||||
*ptr = h_cg;
|
||||
|
@ -271,20 +271,19 @@ extern unsigned int munlock_vma_page(struct page *page);
|
||||
extern void clear_page_mlock(struct page *page);
|
||||
|
||||
/*
|
||||
* mlock_migrate_page - called only from migrate_page_copy() to
|
||||
* migrate the Mlocked page flag; update statistics.
|
||||
* mlock_migrate_page - called only from migrate_misplaced_transhuge_page()
|
||||
* (because that does not go through the full procedure of migration ptes):
|
||||
* to migrate the Mlocked page flag; update statistics.
|
||||
*/
|
||||
static inline void mlock_migrate_page(struct page *newpage, struct page *page)
|
||||
{
|
||||
if (TestClearPageMlocked(page)) {
|
||||
unsigned long flags;
|
||||
int nr_pages = hpage_nr_pages(page);
|
||||
|
||||
local_irq_save(flags);
|
||||
/* Holding pmd lock, no change in irq context: __mod is safe */
|
||||
__mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
|
||||
SetPageMlocked(newpage);
|
||||
__mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -4,7 +4,7 @@
|
||||
* Copyright (c) 2014 Samsung Electronics Co., Ltd.
|
||||
* Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
|
||||
*
|
||||
* Some of code borrowed from https://github.com/xairy/linux by
|
||||
* Some code borrowed from https://github.com/xairy/kasan-prototype by
|
||||
* Andrey Konovalov <adech.fo@gmail.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -86,6 +86,11 @@ static __always_inline bool memory_is_poisoned_2(unsigned long addr)
|
||||
if (memory_is_poisoned_1(addr + 1))
|
||||
return true;
|
||||
|
||||
/*
|
||||
* If single shadow byte covers 2-byte access, we don't
|
||||
* need to do anything more. Otherwise, test the first
|
||||
* shadow byte.
|
||||
*/
|
||||
if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0))
|
||||
return false;
|
||||
|
||||
@ -103,6 +108,11 @@ static __always_inline bool memory_is_poisoned_4(unsigned long addr)
|
||||
if (memory_is_poisoned_1(addr + 3))
|
||||
return true;
|
||||
|
||||
/*
|
||||
* If single shadow byte covers 4-byte access, we don't
|
||||
* need to do anything more. Otherwise, test the first
|
||||
* shadow byte.
|
||||
*/
|
||||
if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3))
|
||||
return false;
|
||||
|
||||
@ -120,7 +130,12 @@ static __always_inline bool memory_is_poisoned_8(unsigned long addr)
|
||||
if (memory_is_poisoned_1(addr + 7))
|
||||
return true;
|
||||
|
||||
if (likely(((addr + 7) & KASAN_SHADOW_MASK) >= 7))
|
||||
/*
|
||||
* If single shadow byte covers 8-byte access, we don't
|
||||
* need to do anything more. Otherwise, test the first
|
||||
* shadow byte.
|
||||
*/
|
||||
if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
|
||||
return false;
|
||||
|
||||
return unlikely(*(u8 *)shadow_addr);
|
||||
@ -139,7 +154,12 @@ static __always_inline bool memory_is_poisoned_16(unsigned long addr)
|
||||
if (unlikely(shadow_first_bytes))
|
||||
return true;
|
||||
|
||||
if (likely(IS_ALIGNED(addr, 8)))
|
||||
/*
|
||||
* If two shadow bytes cover 16-byte access, we don't
|
||||
* need to do anything more. Otherwise, test the last
|
||||
* shadow byte.
|
||||
*/
|
||||
if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
|
||||
return false;
|
||||
|
||||
return memory_is_poisoned_1(addr + 15);
|
||||
@ -203,7 +223,7 @@ static __always_inline bool memory_is_poisoned_n(unsigned long addr,
|
||||
s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte);
|
||||
|
||||
if (unlikely(ret != (unsigned long)last_shadow ||
|
||||
((last_byte & KASAN_SHADOW_MASK) >= *last_shadow)))
|
||||
((long)(last_byte & KASAN_SHADOW_MASK) >= *last_shadow)))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
@ -235,18 +255,12 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
|
||||
static __always_inline void check_memory_region(unsigned long addr,
|
||||
size_t size, bool write)
|
||||
{
|
||||
struct kasan_access_info info;
|
||||
|
||||
if (unlikely(size == 0))
|
||||
return;
|
||||
|
||||
if (unlikely((void *)addr <
|
||||
kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
|
||||
info.access_addr = (void *)addr;
|
||||
info.access_size = size;
|
||||
info.is_write = write;
|
||||
info.ip = _RET_IP_;
|
||||
kasan_report_user_access(&info);
|
||||
kasan_report(addr, size, write, _RET_IP_);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -524,7 +538,7 @@ static int kasan_mem_notifier(struct notifier_block *nb,
|
||||
|
||||
static int __init kasan_memhotplug_init(void)
|
||||
{
|
||||
pr_err("WARNING: KASan doesn't support memory hot-add\n");
|
||||
pr_err("WARNING: KASAN doesn't support memory hot-add\n");
|
||||
pr_err("Memory hot-add will be disabled\n");
|
||||
|
||||
hotplug_memory_notifier(kasan_mem_notifier, 0);
|
||||
|
@ -54,16 +54,13 @@ struct kasan_global {
|
||||
#endif
|
||||
};
|
||||
|
||||
void kasan_report_error(struct kasan_access_info *info);
|
||||
void kasan_report_user_access(struct kasan_access_info *info);
|
||||
|
||||
static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
|
||||
{
|
||||
return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET)
|
||||
<< KASAN_SHADOW_SCALE_SHIFT);
|
||||
}
|
||||
|
||||
static inline bool kasan_enabled(void)
|
||||
static inline bool kasan_report_enabled(void)
|
||||
{
|
||||
return !current->kasan_depth;
|
||||
}
|
||||
|
@ -4,7 +4,7 @@
|
||||
* Copyright (c) 2014 Samsung Electronics Co., Ltd.
|
||||
* Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
|
||||
*
|
||||
* Some of code borrowed from https://github.com/xairy/linux by
|
||||
* Some code borrowed from https://github.com/xairy/kasan-prototype by
|
||||
* Andrey Konovalov <adech.fo@gmail.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -22,6 +22,7 @@
|
||||
#include <linux/string.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kasan.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
#include <asm/sections.h>
|
||||
|
||||
@ -48,34 +49,49 @@ static const void *find_first_bad_addr(const void *addr, size_t size)
|
||||
|
||||
static void print_error_description(struct kasan_access_info *info)
|
||||
{
|
||||
const char *bug_type = "unknown crash";
|
||||
u8 shadow_val;
|
||||
const char *bug_type = "unknown-crash";
|
||||
u8 *shadow_addr;
|
||||
|
||||
info->first_bad_addr = find_first_bad_addr(info->access_addr,
|
||||
info->access_size);
|
||||
|
||||
shadow_val = *(u8 *)kasan_mem_to_shadow(info->first_bad_addr);
|
||||
shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr);
|
||||
|
||||
switch (shadow_val) {
|
||||
case KASAN_FREE_PAGE:
|
||||
case KASAN_KMALLOC_FREE:
|
||||
bug_type = "use after free";
|
||||
/*
|
||||
* If shadow byte value is in (0, KASAN_SHADOW_SCALE_SIZE) we can look
|
||||
* at the next shadow byte to determine the type of the bad access.
|
||||
*/
|
||||
if (*shadow_addr > 0 && *shadow_addr <= KASAN_SHADOW_SCALE_SIZE - 1)
|
||||
shadow_addr++;
|
||||
|
||||
switch (*shadow_addr) {
|
||||
case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
|
||||
/*
|
||||
* In theory it's still possible to see these shadow values
|
||||
* due to a data race in the kernel code.
|
||||
*/
|
||||
bug_type = "out-of-bounds";
|
||||
break;
|
||||
case KASAN_PAGE_REDZONE:
|
||||
case KASAN_KMALLOC_REDZONE:
|
||||
bug_type = "slab-out-of-bounds";
|
||||
break;
|
||||
case KASAN_GLOBAL_REDZONE:
|
||||
case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
|
||||
bug_type = "out of bounds access";
|
||||
bug_type = "global-out-of-bounds";
|
||||
break;
|
||||
case KASAN_STACK_LEFT:
|
||||
case KASAN_STACK_MID:
|
||||
case KASAN_STACK_RIGHT:
|
||||
case KASAN_STACK_PARTIAL:
|
||||
bug_type = "out of bounds on stack";
|
||||
bug_type = "stack-out-of-bounds";
|
||||
break;
|
||||
case KASAN_FREE_PAGE:
|
||||
case KASAN_KMALLOC_FREE:
|
||||
bug_type = "use-after-free";
|
||||
break;
|
||||
}
|
||||
|
||||
pr_err("BUG: KASan: %s in %pS at addr %p\n",
|
||||
pr_err("BUG: KASAN: %s in %pS at addr %p\n",
|
||||
bug_type, (void *)info->ip,
|
||||
info->access_addr);
|
||||
pr_err("%s of size %zu by task %s/%d\n",
|
||||
@ -85,9 +101,11 @@ static void print_error_description(struct kasan_access_info *info)
|
||||
|
||||
static inline bool kernel_or_module_addr(const void *addr)
|
||||
{
|
||||
return (addr >= (void *)_stext && addr < (void *)_end)
|
||||
|| (addr >= (void *)MODULES_VADDR
|
||||
&& addr < (void *)MODULES_END);
|
||||
if (addr >= (void *)_stext && addr < (void *)_end)
|
||||
return true;
|
||||
if (is_module_address((unsigned long)addr))
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline bool init_task_stack_addr(const void *addr)
|
||||
@ -161,15 +179,19 @@ static void print_shadow_for_address(const void *addr)
|
||||
for (i = -SHADOW_ROWS_AROUND_ADDR; i <= SHADOW_ROWS_AROUND_ADDR; i++) {
|
||||
const void *kaddr = kasan_shadow_to_mem(shadow_row);
|
||||
char buffer[4 + (BITS_PER_LONG/8)*2];
|
||||
char shadow_buf[SHADOW_BYTES_PER_ROW];
|
||||
|
||||
snprintf(buffer, sizeof(buffer),
|
||||
(i == 0) ? ">%p: " : " %p: ", kaddr);
|
||||
|
||||
kasan_disable_current();
|
||||
/*
|
||||
* We should not pass a shadow pointer to generic
|
||||
* function, because generic functions may try to
|
||||
* access kasan mapping for the passed address.
|
||||
*/
|
||||
memcpy(shadow_buf, shadow_row, SHADOW_BYTES_PER_ROW);
|
||||
print_hex_dump(KERN_ERR, buffer,
|
||||
DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1,
|
||||
shadow_row, SHADOW_BYTES_PER_ROW, 0);
|
||||
kasan_enable_current();
|
||||
shadow_buf, SHADOW_BYTES_PER_ROW, 0);
|
||||
|
||||
if (row_is_guilty(shadow_row, shadow))
|
||||
pr_err("%*c\n",
|
||||
@ -182,37 +204,43 @@ static void print_shadow_for_address(const void *addr)
|
||||
|
||||
static DEFINE_SPINLOCK(report_lock);
|
||||
|
||||
void kasan_report_error(struct kasan_access_info *info)
|
||||
static void kasan_report_error(struct kasan_access_info *info)
|
||||
{
|
||||
unsigned long flags;
|
||||
const char *bug_type;
|
||||
|
||||
/*
|
||||
* Make sure we don't end up in a loop.
|
||||
*/
|
||||
kasan_disable_current();
|
||||
spin_lock_irqsave(&report_lock, flags);
|
||||
pr_err("================================="
|
||||
"=================================\n");
|
||||
print_error_description(info);
|
||||
print_address_description(info);
|
||||
print_shadow_for_address(info->first_bad_addr);
|
||||
pr_err("================================="
|
||||
"=================================\n");
|
||||
spin_unlock_irqrestore(&report_lock, flags);
|
||||
}
|
||||
|
||||
void kasan_report_user_access(struct kasan_access_info *info)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&report_lock, flags);
|
||||
pr_err("================================="
|
||||
"=================================\n");
|
||||
pr_err("BUG: KASan: user-memory-access on address %p\n",
|
||||
info->access_addr);
|
||||
pr_err("%s of size %zu by task %s/%d\n",
|
||||
info->is_write ? "Write" : "Read",
|
||||
info->access_size, current->comm, task_pid_nr(current));
|
||||
dump_stack();
|
||||
if (info->access_addr <
|
||||
kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) {
|
||||
if ((unsigned long)info->access_addr < PAGE_SIZE)
|
||||
bug_type = "null-ptr-deref";
|
||||
else if ((unsigned long)info->access_addr < TASK_SIZE)
|
||||
bug_type = "user-memory-access";
|
||||
else
|
||||
bug_type = "wild-memory-access";
|
||||
pr_err("BUG: KASAN: %s on address %p\n",
|
||||
bug_type, info->access_addr);
|
||||
pr_err("%s of size %zu by task %s/%d\n",
|
||||
info->is_write ? "Write" : "Read",
|
||||
info->access_size, current->comm,
|
||||
task_pid_nr(current));
|
||||
dump_stack();
|
||||
} else {
|
||||
print_error_description(info);
|
||||
print_address_description(info);
|
||||
print_shadow_for_address(info->first_bad_addr);
|
||||
}
|
||||
pr_err("================================="
|
||||
"=================================\n");
|
||||
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
|
||||
spin_unlock_irqrestore(&report_lock, flags);
|
||||
kasan_enable_current();
|
||||
}
|
||||
|
||||
void kasan_report(unsigned long addr, size_t size,
|
||||
@ -220,13 +248,14 @@ void kasan_report(unsigned long addr, size_t size,
|
||||
{
|
||||
struct kasan_access_info info;
|
||||
|
||||
if (likely(!kasan_enabled()))
|
||||
if (likely(!kasan_report_enabled()))
|
||||
return;
|
||||
|
||||
info.access_addr = (void *)addr;
|
||||
info.access_size = size;
|
||||
info.is_write = is_write;
|
||||
info.ip = ip;
|
||||
|
||||
kasan_report_error(&info);
|
||||
}
|
||||
|
||||
|
@ -479,7 +479,7 @@ static void put_object(struct kmemleak_object *object)
|
||||
static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
|
||||
{
|
||||
unsigned long flags;
|
||||
struct kmemleak_object *object = NULL;
|
||||
struct kmemleak_object *object;
|
||||
|
||||
rcu_read_lock();
|
||||
read_lock_irqsave(&kmemleak_lock, flags);
|
||||
|
49
mm/ksm.c
49
mm/ksm.c
@ -475,7 +475,8 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
|
||||
flush_dcache_page(page);
|
||||
} else {
|
||||
put_page(page);
|
||||
out: page = NULL;
|
||||
out:
|
||||
page = NULL;
|
||||
}
|
||||
up_read(&mm->mmap_sem);
|
||||
return page;
|
||||
@ -625,7 +626,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
|
||||
unlock_page(page);
|
||||
put_page(page);
|
||||
|
||||
if (stable_node->hlist.first)
|
||||
if (!hlist_empty(&stable_node->hlist))
|
||||
ksm_pages_sharing--;
|
||||
else
|
||||
ksm_pages_shared--;
|
||||
@ -1021,8 +1022,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
|
||||
if (page == kpage) /* ksm page forked */
|
||||
return 0;
|
||||
|
||||
if (!(vma->vm_flags & VM_MERGEABLE))
|
||||
goto out;
|
||||
if (PageTransCompound(page) && page_trans_compound_anon_split(page))
|
||||
goto out;
|
||||
BUG_ON(PageTransCompound(page));
|
||||
@ -1087,10 +1086,8 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
|
||||
int err = -EFAULT;
|
||||
|
||||
down_read(&mm->mmap_sem);
|
||||
if (ksm_test_exit(mm))
|
||||
goto out;
|
||||
vma = find_vma(mm, rmap_item->address);
|
||||
if (!vma || vma->vm_start > rmap_item->address)
|
||||
vma = find_mergeable_vma(mm, rmap_item->address);
|
||||
if (!vma)
|
||||
goto out;
|
||||
|
||||
err = try_to_merge_one_page(vma, page, kpage);
|
||||
@ -1177,8 +1174,18 @@ static struct page *stable_tree_search(struct page *page)
|
||||
cond_resched();
|
||||
stable_node = rb_entry(*new, struct stable_node, node);
|
||||
tree_page = get_ksm_page(stable_node, false);
|
||||
if (!tree_page)
|
||||
return NULL;
|
||||
if (!tree_page) {
|
||||
/*
|
||||
* If we walked over a stale stable_node,
|
||||
* get_ksm_page() will call rb_erase() and it
|
||||
* may rebalance the tree from under us. So
|
||||
* restart the search from scratch. Returning
|
||||
* NULL would be safe too, but we'd generate
|
||||
* false negative insertions just because some
|
||||
* stable_node was stale.
|
||||
*/
|
||||
goto again;
|
||||
}
|
||||
|
||||
ret = memcmp_pages(page, tree_page);
|
||||
put_page(tree_page);
|
||||
@ -1254,12 +1261,14 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
|
||||
unsigned long kpfn;
|
||||
struct rb_root *root;
|
||||
struct rb_node **new;
|
||||
struct rb_node *parent = NULL;
|
||||
struct rb_node *parent;
|
||||
struct stable_node *stable_node;
|
||||
|
||||
kpfn = page_to_pfn(kpage);
|
||||
nid = get_kpfn_nid(kpfn);
|
||||
root = root_stable_tree + nid;
|
||||
again:
|
||||
parent = NULL;
|
||||
new = &root->rb_node;
|
||||
|
||||
while (*new) {
|
||||
@ -1269,8 +1278,18 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
|
||||
cond_resched();
|
||||
stable_node = rb_entry(*new, struct stable_node, node);
|
||||
tree_page = get_ksm_page(stable_node, false);
|
||||
if (!tree_page)
|
||||
return NULL;
|
||||
if (!tree_page) {
|
||||
/*
|
||||
* If we walked over a stale stable_node,
|
||||
* get_ksm_page() will call rb_erase() and it
|
||||
* may rebalance the tree from under us. So
|
||||
* restart the search from scratch. Returning
|
||||
* NULL would be safe too, but we'd generate
|
||||
* false negative insertions just because some
|
||||
* stable_node was stale.
|
||||
*/
|
||||
goto again;
|
||||
}
|
||||
|
||||
ret = memcmp_pages(kpage, tree_page);
|
||||
put_page(tree_page);
|
||||
@ -1340,7 +1359,7 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
|
||||
cond_resched();
|
||||
tree_rmap_item = rb_entry(*new, struct rmap_item, node);
|
||||
tree_page = get_mergeable_page(tree_rmap_item);
|
||||
if (IS_ERR_OR_NULL(tree_page))
|
||||
if (!tree_page)
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
@ -1914,9 +1933,11 @@ int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
|
||||
struct anon_vma_chain *vmac;
|
||||
struct vm_area_struct *vma;
|
||||
|
||||
cond_resched();
|
||||
anon_vma_lock_read(anon_vma);
|
||||
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
|
||||
0, ULONG_MAX) {
|
||||
cond_resched();
|
||||
vma = vmac->vma;
|
||||
if (rmap_item->address < vma->vm_start ||
|
||||
rmap_item->address >= vma->vm_end)
|
||||
|
@ -42,6 +42,10 @@ static void list_lru_unregister(struct list_lru *lru)
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
static inline bool list_lru_memcg_aware(struct list_lru *lru)
|
||||
{
|
||||
/*
|
||||
* This needs node 0 to be always present, even
|
||||
* on systems supporting sparse NUMA ids.
|
||||
*/
|
||||
return !!lru->node[0].memcg_lrus;
|
||||
}
|
||||
|
||||
@ -59,6 +63,16 @@ list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
|
||||
return &nlru->lru;
|
||||
}
|
||||
|
||||
static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
|
||||
{
|
||||
struct page *page;
|
||||
|
||||
if (!memcg_kmem_enabled())
|
||||
return NULL;
|
||||
page = virt_to_head_page(ptr);
|
||||
return page->mem_cgroup;
|
||||
}
|
||||
|
||||
static inline struct list_lru_one *
|
||||
list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
|
||||
{
|
||||
@ -377,16 +391,20 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < nr_node_ids; i++) {
|
||||
if (!memcg_aware)
|
||||
lru->node[i].memcg_lrus = NULL;
|
||||
else if (memcg_init_list_lru_node(&lru->node[i]))
|
||||
if (!memcg_aware)
|
||||
return 0;
|
||||
|
||||
for_each_node(i) {
|
||||
if (memcg_init_list_lru_node(&lru->node[i]))
|
||||
goto fail;
|
||||
}
|
||||
return 0;
|
||||
fail:
|
||||
for (i = i - 1; i >= 0; i--)
|
||||
for (i = i - 1; i >= 0; i--) {
|
||||
if (!lru->node[i].memcg_lrus)
|
||||
continue;
|
||||
memcg_destroy_list_lru_node(&lru->node[i]);
|
||||
}
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
@ -397,7 +415,7 @@ static void memcg_destroy_list_lru(struct list_lru *lru)
|
||||
if (!list_lru_memcg_aware(lru))
|
||||
return;
|
||||
|
||||
for (i = 0; i < nr_node_ids; i++)
|
||||
for_each_node(i)
|
||||
memcg_destroy_list_lru_node(&lru->node[i]);
|
||||
}
|
||||
|
||||
@ -409,16 +427,20 @@ static int memcg_update_list_lru(struct list_lru *lru,
|
||||
if (!list_lru_memcg_aware(lru))
|
||||
return 0;
|
||||
|
||||
for (i = 0; i < nr_node_ids; i++) {
|
||||
for_each_node(i) {
|
||||
if (memcg_update_list_lru_node(&lru->node[i],
|
||||
old_size, new_size))
|
||||
goto fail;
|
||||
}
|
||||
return 0;
|
||||
fail:
|
||||
for (i = i - 1; i >= 0; i--)
|
||||
for (i = i - 1; i >= 0; i--) {
|
||||
if (!lru->node[i].memcg_lrus)
|
||||
continue;
|
||||
|
||||
memcg_cancel_update_list_lru_node(&lru->node[i],
|
||||
old_size, new_size);
|
||||
}
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
@ -430,7 +452,7 @@ static void memcg_cancel_update_list_lru(struct list_lru *lru,
|
||||
if (!list_lru_memcg_aware(lru))
|
||||
return;
|
||||
|
||||
for (i = 0; i < nr_node_ids; i++)
|
||||
for_each_node(i)
|
||||
memcg_cancel_update_list_lru_node(&lru->node[i],
|
||||
old_size, new_size);
|
||||
}
|
||||
@ -485,7 +507,7 @@ static void memcg_drain_list_lru(struct list_lru *lru,
|
||||
if (!list_lru_memcg_aware(lru))
|
||||
return;
|
||||
|
||||
for (i = 0; i < nr_node_ids; i++)
|
||||
for_each_node(i)
|
||||
memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx);
|
||||
}
|
||||
|
||||
@ -522,7 +544,7 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
|
||||
if (!lru->node)
|
||||
goto out;
|
||||
|
||||
for (i = 0; i < nr_node_ids; i++) {
|
||||
for_each_node(i) {
|
||||
spin_lock_init(&lru->node[i].lock);
|
||||
if (key)
|
||||
lockdep_set_class(&lru->node[i].lock, key);
|
||||
|
@ -13,6 +13,11 @@
|
||||
*
|
||||
* Safely read from address @src to the buffer at @dst. If a kernel fault
|
||||
* happens, handle that and return -EFAULT.
|
||||
*
|
||||
* We ensure that the copy_from_user is executed in atomic context so that
|
||||
* do_page_fault() doesn't attempt to take mmap_sem. This makes
|
||||
* probe_kernel_read() suitable for use within regions where the caller
|
||||
* already holds mmap_sem, or other locks which nest inside mmap_sem.
|
||||
*/
|
||||
|
||||
long __weak probe_kernel_read(void *dst, const void *src, size_t size)
|
||||
@ -99,5 +104,5 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
|
||||
pagefault_enable();
|
||||
set_fs(old_fs);
|
||||
|
||||
return ret < 0 ? ret : src - unsafe_addr;
|
||||
return ret ? -EFAULT : src - unsafe_addr;
|
||||
}
|
||||
|
@ -706,7 +706,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
|
||||
return 0;
|
||||
}
|
||||
|
||||
int __init_memblock memblock_remove_range(struct memblock_type *type,
|
||||
static int __init_memblock memblock_remove_range(struct memblock_type *type,
|
||||
phys_addr_t base, phys_addr_t size)
|
||||
{
|
||||
int start_rgn, end_rgn;
|
||||
|
307
mm/memcontrol.c
307
mm/memcontrol.c
@ -62,6 +62,7 @@
|
||||
#include <linux/oom.h>
|
||||
#include <linux/lockdep.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/tracehook.h>
|
||||
#include "internal.h"
|
||||
#include <net/sock.h>
|
||||
#include <net/ip.h>
|
||||
@ -1661,7 +1662,7 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
|
||||
|
||||
static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
|
||||
{
|
||||
if (!current->memcg_oom.may_oom)
|
||||
if (!current->memcg_may_oom)
|
||||
return;
|
||||
/*
|
||||
* We are in the middle of the charge context here, so we
|
||||
@ -1678,9 +1679,9 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
|
||||
* and when we know whether the fault was overall successful.
|
||||
*/
|
||||
css_get(&memcg->css);
|
||||
current->memcg_oom.memcg = memcg;
|
||||
current->memcg_oom.gfp_mask = mask;
|
||||
current->memcg_oom.order = order;
|
||||
current->memcg_in_oom = memcg;
|
||||
current->memcg_oom_gfp_mask = mask;
|
||||
current->memcg_oom_order = order;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -1702,7 +1703,7 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
|
||||
*/
|
||||
bool mem_cgroup_oom_synchronize(bool handle)
|
||||
{
|
||||
struct mem_cgroup *memcg = current->memcg_oom.memcg;
|
||||
struct mem_cgroup *memcg = current->memcg_in_oom;
|
||||
struct oom_wait_info owait;
|
||||
bool locked;
|
||||
|
||||
@ -1730,8 +1731,8 @@ bool mem_cgroup_oom_synchronize(bool handle)
|
||||
if (locked && !memcg->oom_kill_disable) {
|
||||
mem_cgroup_unmark_under_oom(memcg);
|
||||
finish_wait(&memcg_oom_waitq, &owait.wait);
|
||||
mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
|
||||
current->memcg_oom.order);
|
||||
mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
|
||||
current->memcg_oom_order);
|
||||
} else {
|
||||
schedule();
|
||||
mem_cgroup_unmark_under_oom(memcg);
|
||||
@ -1748,7 +1749,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
|
||||
memcg_oom_recover(memcg);
|
||||
}
|
||||
cleanup:
|
||||
current->memcg_oom.memcg = NULL;
|
||||
current->memcg_in_oom = NULL;
|
||||
css_put(&memcg->css);
|
||||
return true;
|
||||
}
|
||||
@ -1972,6 +1973,31 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
|
||||
return NOTIFY_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
* Scheduled by try_charge() to be executed from the userland return path
|
||||
* and reclaims memory over the high limit.
|
||||
*/
|
||||
void mem_cgroup_handle_over_high(void)
|
||||
{
|
||||
unsigned int nr_pages = current->memcg_nr_pages_over_high;
|
||||
struct mem_cgroup *memcg, *pos;
|
||||
|
||||
if (likely(!nr_pages))
|
||||
return;
|
||||
|
||||
pos = memcg = get_mem_cgroup_from_mm(current->mm);
|
||||
|
||||
do {
|
||||
if (page_counter_read(&pos->memory) <= pos->high)
|
||||
continue;
|
||||
mem_cgroup_events(pos, MEMCG_HIGH, 1);
|
||||
try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true);
|
||||
} while ((pos = parent_mem_cgroup(pos)));
|
||||
|
||||
css_put(&memcg->css);
|
||||
current->memcg_nr_pages_over_high = 0;
|
||||
}
|
||||
|
||||
static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
|
||||
unsigned int nr_pages)
|
||||
{
|
||||
@ -1982,17 +2008,16 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
|
||||
unsigned long nr_reclaimed;
|
||||
bool may_swap = true;
|
||||
bool drained = false;
|
||||
int ret = 0;
|
||||
|
||||
if (mem_cgroup_is_root(memcg))
|
||||
goto done;
|
||||
return 0;
|
||||
retry:
|
||||
if (consume_stock(memcg, nr_pages))
|
||||
goto done;
|
||||
return 0;
|
||||
|
||||
if (!do_swap_account ||
|
||||
!page_counter_try_charge(&memcg->memsw, batch, &counter)) {
|
||||
if (!page_counter_try_charge(&memcg->memory, batch, &counter))
|
||||
page_counter_try_charge(&memcg->memsw, batch, &counter)) {
|
||||
if (page_counter_try_charge(&memcg->memory, batch, &counter))
|
||||
goto done_restock;
|
||||
if (do_swap_account)
|
||||
page_counter_uncharge(&memcg->memsw, batch);
|
||||
@ -2016,7 +2041,7 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
|
||||
if (unlikely(test_thread_flag(TIF_MEMDIE) ||
|
||||
fatal_signal_pending(current) ||
|
||||
current->flags & PF_EXITING))
|
||||
goto bypass;
|
||||
goto force;
|
||||
|
||||
if (unlikely(task_in_memcg_oom(current)))
|
||||
goto nomem;
|
||||
@ -2062,38 +2087,54 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
|
||||
goto retry;
|
||||
|
||||
if (gfp_mask & __GFP_NOFAIL)
|
||||
goto bypass;
|
||||
goto force;
|
||||
|
||||
if (fatal_signal_pending(current))
|
||||
goto bypass;
|
||||
goto force;
|
||||
|
||||
mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);
|
||||
|
||||
mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));
|
||||
mem_cgroup_oom(mem_over_limit, gfp_mask,
|
||||
get_order(nr_pages * PAGE_SIZE));
|
||||
nomem:
|
||||
if (!(gfp_mask & __GFP_NOFAIL))
|
||||
return -ENOMEM;
|
||||
bypass:
|
||||
return -EINTR;
|
||||
force:
|
||||
/*
|
||||
* The allocation either can't fail or will lead to more memory
|
||||
* being freed very soon. Allow memory usage to go over the limit
|
||||
* temporarily by force charging it.
|
||||
*/
|
||||
page_counter_charge(&memcg->memory, nr_pages);
|
||||
if (do_swap_account)
|
||||
page_counter_charge(&memcg->memsw, nr_pages);
|
||||
css_get_many(&memcg->css, nr_pages);
|
||||
|
||||
return 0;
|
||||
|
||||
done_restock:
|
||||
css_get_many(&memcg->css, batch);
|
||||
if (batch > nr_pages)
|
||||
refill_stock(memcg, batch - nr_pages);
|
||||
if (!(gfp_mask & __GFP_WAIT))
|
||||
goto done;
|
||||
|
||||
/*
|
||||
* If the hierarchy is above the normal consumption range,
|
||||
* make the charging task trim their excess contribution.
|
||||
* If the hierarchy is above the normal consumption range, schedule
|
||||
* reclaim on returning to userland. We can perform reclaim here
|
||||
* if __GFP_WAIT is set, but let's always punt for simplicity and so that
|
||||
* GFP_KERNEL can consistently be used during reclaim. @memcg is
|
||||
* not recorded as it most likely matches current's and won't
|
||||
* change in the meantime. As high limit is checked again before
|
||||
* reclaim, the cost of mismatch is negligible.
|
||||
*/
|
||||
do {
|
||||
if (page_counter_read(&memcg->memory) <= memcg->high)
|
||||
continue;
|
||||
mem_cgroup_events(memcg, MEMCG_HIGH, 1);
|
||||
try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
|
||||
if (page_counter_read(&memcg->memory) > memcg->high) {
|
||||
current->memcg_nr_pages_over_high += nr_pages;
|
||||
set_notify_resume(current);
|
||||
break;
|
||||
}
|
||||
} while ((memcg = parent_mem_cgroup(memcg)));
|
||||
done:
|
||||
return ret;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
|
||||
@ -2174,55 +2215,6 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
|
||||
unsigned long nr_pages)
|
||||
{
|
||||
struct page_counter *counter;
|
||||
int ret = 0;
|
||||
|
||||
ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
ret = try_charge(memcg, gfp, nr_pages);
|
||||
if (ret == -EINTR) {
|
||||
/*
|
||||
* try_charge() chose to bypass to root due to OOM kill or
|
||||
* fatal signal. Since our only options are to either fail
|
||||
* the allocation or charge it to this cgroup, do it as a
|
||||
* temporary condition. But we can't fail. From a kmem/slab
|
||||
* perspective, the cache has already been selected, by
|
||||
* mem_cgroup_kmem_get_cache(), so it is too late to change
|
||||
* our minds.
|
||||
*
|
||||
* This condition will only trigger if the task entered
|
||||
* memcg_charge_kmem in a sane state, but was OOM-killed
|
||||
* during try_charge() above. Tasks that were already dying
|
||||
* when the allocation triggers should have been already
|
||||
* directed to the root cgroup in memcontrol.h
|
||||
*/
|
||||
page_counter_charge(&memcg->memory, nr_pages);
|
||||
if (do_swap_account)
|
||||
page_counter_charge(&memcg->memsw, nr_pages);
|
||||
css_get_many(&memcg->css, nr_pages);
|
||||
ret = 0;
|
||||
} else if (ret)
|
||||
page_counter_uncharge(&memcg->kmem, nr_pages);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
|
||||
{
|
||||
page_counter_uncharge(&memcg->memory, nr_pages);
|
||||
if (do_swap_account)
|
||||
page_counter_uncharge(&memcg->memsw, nr_pages);
|
||||
|
||||
page_counter_uncharge(&memcg->kmem, nr_pages);
|
||||
|
||||
css_put_many(&memcg->css, nr_pages);
|
||||
}
|
||||
|
||||
static int memcg_alloc_cache_id(void)
|
||||
{
|
||||
int id, size;
|
||||
@ -2384,85 +2376,58 @@ void __memcg_kmem_put_cache(struct kmem_cache *cachep)
|
||||
css_put(&cachep->memcg_params.memcg->css);
|
||||
}
|
||||
|
||||
/*
|
||||
* We need to verify if the allocation against current->mm->owner's memcg is
|
||||
* possible for the given order. But the page is not allocated yet, so we'll
|
||||
* need a further commit step to do the final arrangements.
|
||||
*
|
||||
* It is possible for the task to switch cgroups in this mean time, so at
|
||||
* commit time, we can't rely on task conversion any longer. We'll then use
|
||||
* the handle argument to return to the caller which cgroup we should commit
|
||||
* against. We could also return the memcg directly and avoid the pointer
|
||||
* passing, but a boolean return value gives better semantics considering
|
||||
* the compiled-out case as well.
|
||||
*
|
||||
* Returning true means the allocation is possible.
|
||||
*/
|
||||
bool
|
||||
__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
|
||||
int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
|
||||
struct mem_cgroup *memcg)
|
||||
{
|
||||
unsigned int nr_pages = 1 << order;
|
||||
struct page_counter *counter;
|
||||
int ret;
|
||||
|
||||
if (!memcg_kmem_is_active(memcg))
|
||||
return 0;
|
||||
|
||||
if (!page_counter_try_charge(&memcg->kmem, nr_pages, &counter))
|
||||
return -ENOMEM;
|
||||
|
||||
ret = try_charge(memcg, gfp, nr_pages);
|
||||
if (ret) {
|
||||
page_counter_uncharge(&memcg->kmem, nr_pages);
|
||||
return ret;
|
||||
}
|
||||
|
||||
page->mem_cgroup = memcg;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
|
||||
{
|
||||
struct mem_cgroup *memcg;
|
||||
int ret;
|
||||
|
||||
*_memcg = NULL;
|
||||
|
||||
memcg = get_mem_cgroup_from_mm(current->mm);
|
||||
|
||||
if (!memcg_kmem_is_active(memcg)) {
|
||||
css_put(&memcg->css);
|
||||
return true;
|
||||
}
|
||||
|
||||
ret = memcg_charge_kmem(memcg, gfp, 1 << order);
|
||||
if (!ret)
|
||||
*_memcg = memcg;
|
||||
|
||||
ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
|
||||
css_put(&memcg->css);
|
||||
return (ret == 0);
|
||||
return ret;
|
||||
}
|
||||
|
||||
void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
|
||||
int order)
|
||||
{
|
||||
VM_BUG_ON(mem_cgroup_is_root(memcg));
|
||||
|
||||
/* The page allocation failed. Revert */
|
||||
if (!page) {
|
||||
memcg_uncharge_kmem(memcg, 1 << order);
|
||||
return;
|
||||
}
|
||||
page->mem_cgroup = memcg;
|
||||
}
|
||||
|
||||
void __memcg_kmem_uncharge_pages(struct page *page, int order)
|
||||
void __memcg_kmem_uncharge(struct page *page, int order)
|
||||
{
|
||||
struct mem_cgroup *memcg = page->mem_cgroup;
|
||||
unsigned int nr_pages = 1 << order;
|
||||
|
||||
if (!memcg)
|
||||
return;
|
||||
|
||||
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
|
||||
|
||||
memcg_uncharge_kmem(memcg, 1 << order);
|
||||
page_counter_uncharge(&memcg->kmem, nr_pages);
|
||||
page_counter_uncharge(&memcg->memory, nr_pages);
|
||||
if (do_swap_account)
|
||||
page_counter_uncharge(&memcg->memsw, nr_pages);
|
||||
|
||||
page->mem_cgroup = NULL;
|
||||
}
|
||||
|
||||
struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
|
||||
{
|
||||
struct mem_cgroup *memcg = NULL;
|
||||
struct kmem_cache *cachep;
|
||||
struct page *page;
|
||||
|
||||
page = virt_to_head_page(ptr);
|
||||
if (PageSlab(page)) {
|
||||
cachep = page->slab_cache;
|
||||
if (!is_root_cache(cachep))
|
||||
memcg = cachep->memcg_params.memcg;
|
||||
} else
|
||||
/* page allocated by alloc_kmem_pages */
|
||||
memcg = page->mem_cgroup;
|
||||
|
||||
return memcg;
|
||||
css_put_many(&memcg->css, nr_pages);
|
||||
}
|
||||
#endif /* CONFIG_MEMCG_KMEM */
|
||||
|
||||
@ -2836,9 +2801,9 @@ static unsigned long tree_stat(struct mem_cgroup *memcg,
|
||||
return val;
|
||||
}
|
||||
|
||||
static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
|
||||
static inline unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
|
||||
{
|
||||
u64 val;
|
||||
unsigned long val;
|
||||
|
||||
if (mem_cgroup_is_root(memcg)) {
|
||||
val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
|
||||
@ -2851,7 +2816,7 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
|
||||
else
|
||||
val = page_counter_read(&memcg->memsw);
|
||||
}
|
||||
return val << PAGE_SHIFT;
|
||||
return val;
|
||||
}
|
||||
|
||||
enum {
|
||||
@ -2885,9 +2850,9 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
|
||||
switch (MEMFILE_ATTR(cft->private)) {
|
||||
case RES_USAGE:
|
||||
if (counter == &memcg->memory)
|
||||
return mem_cgroup_usage(memcg, false);
|
||||
return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
|
||||
if (counter == &memcg->memsw)
|
||||
return mem_cgroup_usage(memcg, true);
|
||||
return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
|
||||
return (u64)page_counter_read(counter) * PAGE_SIZE;
|
||||
case RES_LIMIT:
|
||||
return (u64)counter->limit * PAGE_SIZE;
|
||||
@ -3387,7 +3352,6 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
|
||||
ret = page_counter_memparse(args, "-1", &threshold);
|
||||
if (ret)
|
||||
return ret;
|
||||
threshold <<= PAGE_SHIFT;
|
||||
|
||||
mutex_lock(&memcg->thresholds_lock);
|
||||
|
||||
@ -4406,22 +4370,10 @@ static int mem_cgroup_do_precharge(unsigned long count)
|
||||
mc.precharge += count;
|
||||
return ret;
|
||||
}
|
||||
if (ret == -EINTR) {
|
||||
cancel_charge(root_mem_cgroup, count);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Try charges one by one with reclaim */
|
||||
while (count--) {
|
||||
ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
|
||||
/*
|
||||
* In case of failure, any residual charges against
|
||||
* mc.to will be dropped by mem_cgroup_clear_mc()
|
||||
* later on. However, cancel any charges that are
|
||||
* bypassed to root right away or they'll be lost.
|
||||
*/
|
||||
if (ret == -EINTR)
|
||||
cancel_charge(root_mem_cgroup, 1);
|
||||
if (ret)
|
||||
return ret;
|
||||
mc.precharge++;
|
||||
@ -4576,9 +4528,8 @@ static int mem_cgroup_move_account(struct page *page,
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
|
||||
* of its source page while we change it: page migration takes
|
||||
* both pages off the LRU, but page cache replacement doesn't.
|
||||
* Prevent mem_cgroup_replace_page() from looking at
|
||||
* page->mem_cgroup of its source page while we change it.
|
||||
*/
|
||||
if (!trylock_page(page))
|
||||
goto out;
|
||||
@ -5085,7 +5036,9 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
|
||||
static u64 memory_current_read(struct cgroup_subsys_state *css,
|
||||
struct cftype *cft)
|
||||
{
|
||||
return mem_cgroup_usage(mem_cgroup_from_css(css), false);
|
||||
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
||||
|
||||
return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
|
||||
}
|
||||
|
||||
static int memory_low_show(struct seq_file *m, void *v)
|
||||
@ -5197,6 +5150,7 @@ static int memory_events_show(struct seq_file *m, void *v)
|
||||
static struct cftype memory_files[] = {
|
||||
{
|
||||
.name = "current",
|
||||
.flags = CFTYPE_NOT_ON_ROOT,
|
||||
.read_u64 = memory_current_read,
|
||||
},
|
||||
{
|
||||
@ -5340,11 +5294,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
|
||||
ret = try_charge(memcg, gfp_mask, nr_pages);
|
||||
|
||||
css_put(&memcg->css);
|
||||
|
||||
if (ret == -EINTR) {
|
||||
memcg = root_mem_cgroup;
|
||||
ret = 0;
|
||||
}
|
||||
out:
|
||||
*memcgp = memcg;
|
||||
return ret;
|
||||
@ -5559,7 +5508,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
|
||||
}
|
||||
|
||||
/**
|
||||
* mem_cgroup_migrate - migrate a charge to another page
|
||||
* mem_cgroup_replace_page - migrate a charge to another page
|
||||
* @oldpage: currently charged page
|
||||
* @newpage: page to transfer the charge to
|
||||
* @lrucare: either or both pages might be on the LRU already
|
||||
@ -5568,16 +5517,13 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
|
||||
*
|
||||
* Both pages must be locked, @newpage->mapping must be set up.
|
||||
*/
|
||||
void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
|
||||
bool lrucare)
|
||||
void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
|
||||
{
|
||||
struct mem_cgroup *memcg;
|
||||
int isolated;
|
||||
|
||||
VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
|
||||
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
|
||||
VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage);
|
||||
VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage);
|
||||
VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
|
||||
VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
|
||||
newpage);
|
||||
@ -5589,25 +5535,16 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
|
||||
if (newpage->mem_cgroup)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Swapcache readahead pages can get migrated before being
|
||||
* charged, and migration from compaction can happen to an
|
||||
* uncharged page when the PFN walker finds a page that
|
||||
* reclaim just put back on the LRU but has not released yet.
|
||||
*/
|
||||
/* Swapcache readahead pages can get replaced before being charged */
|
||||
memcg = oldpage->mem_cgroup;
|
||||
if (!memcg)
|
||||
return;
|
||||
|
||||
if (lrucare)
|
||||
lock_page_lru(oldpage, &isolated);
|
||||
|
||||
lock_page_lru(oldpage, &isolated);
|
||||
oldpage->mem_cgroup = NULL;
|
||||
unlock_page_lru(oldpage, isolated);
|
||||
|
||||
if (lrucare)
|
||||
unlock_page_lru(oldpage, isolated);
|
||||
|
||||
commit_charge(newpage, memcg, lrucare);
|
||||
commit_charge(newpage, memcg, true);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -56,6 +56,7 @@
|
||||
#include <linux/memory_hotplug.h>
|
||||
#include <linux/mm_inline.h>
|
||||
#include <linux/kfifo.h>
|
||||
#include <linux/ratelimit.h>
|
||||
#include "internal.h"
|
||||
#include "ras/ras_event.h"
|
||||
|
||||
@ -1403,6 +1404,12 @@ static int __init memory_failure_init(void)
|
||||
}
|
||||
core_initcall(memory_failure_init);
|
||||
|
||||
#define unpoison_pr_info(fmt, pfn, rs) \
|
||||
({ \
|
||||
if (__ratelimit(rs)) \
|
||||
pr_info(fmt, pfn); \
|
||||
})
|
||||
|
||||
/**
|
||||
* unpoison_memory - Unpoison a previously poisoned page
|
||||
* @pfn: Page number of the page to be unpoisoned
|
||||
@ -1421,6 +1428,8 @@ int unpoison_memory(unsigned long pfn)
|
||||
struct page *p;
|
||||
int freeit = 0;
|
||||
unsigned int nr_pages;
|
||||
static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
|
||||
DEFAULT_RATELIMIT_BURST);
|
||||
|
||||
if (!pfn_valid(pfn))
|
||||
return -ENXIO;
|
||||
@ -1429,23 +1438,26 @@ int unpoison_memory(unsigned long pfn)
|
||||
page = compound_head(p);
|
||||
|
||||
if (!PageHWPoison(p)) {
|
||||
pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
|
||||
unpoison_pr_info("MCE: Page was already unpoisoned %#lx\n",
|
||||
pfn, &unpoison_rs);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (page_count(page) > 1) {
|
||||
pr_info("MCE: Someone grabs the hwpoison page %#lx\n", pfn);
|
||||
unpoison_pr_info("MCE: Someone grabs the hwpoison page %#lx\n",
|
||||
pfn, &unpoison_rs);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (page_mapped(page)) {
|
||||
pr_info("MCE: Someone maps the hwpoison page %#lx\n", pfn);
|
||||
unpoison_pr_info("MCE: Someone maps the hwpoison page %#lx\n",
|
||||
pfn, &unpoison_rs);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (page_mapping(page)) {
|
||||
pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n",
|
||||
pfn);
|
||||
unpoison_pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n",
|
||||
pfn, &unpoison_rs);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1455,7 +1467,8 @@ int unpoison_memory(unsigned long pfn)
|
||||
* In such case, we yield to memory_failure() and make unpoison fail.
|
||||
*/
|
||||
if (!PageHuge(page) && PageTransHuge(page)) {
|
||||
pr_info("MCE: Memory failure is now running on %#lx\n", pfn);
|
||||
unpoison_pr_info("MCE: Memory failure is now running on %#lx\n",
|
||||
pfn, &unpoison_rs);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1469,12 +1482,14 @@ int unpoison_memory(unsigned long pfn)
|
||||
* to the end.
|
||||
*/
|
||||
if (PageHuge(page)) {
|
||||
pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
|
||||
unpoison_pr_info("MCE: Memory failure is now running on free hugepage %#lx\n",
|
||||
pfn, &unpoison_rs);
|
||||
return 0;
|
||||
}
|
||||
if (TestClearPageHWPoison(p))
|
||||
num_poisoned_pages_dec();
|
||||
pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
|
||||
unpoison_pr_info("MCE: Software-unpoisoned free page %#lx\n",
|
||||
pfn, &unpoison_rs);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1486,7 +1501,8 @@ int unpoison_memory(unsigned long pfn)
|
||||
* the free buddy page pool.
|
||||
*/
|
||||
if (TestClearPageHWPoison(page)) {
|
||||
pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
|
||||
unpoison_pr_info("MCE: Software-unpoisoned page %#lx\n",
|
||||
pfn, &unpoison_rs);
|
||||
num_poisoned_pages_sub(nr_pages);
|
||||
freeit = 1;
|
||||
if (PageHuge(page))
|
||||
|
@ -339,8 +339,8 @@ static int __ref ensure_zone_is_initialized(struct zone *zone,
|
||||
unsigned long start_pfn, unsigned long num_pages)
|
||||
{
|
||||
if (!zone_is_initialized(zone))
|
||||
return init_currently_empty_zone(zone, start_pfn, num_pages,
|
||||
MEMMAP_HOTPLUG);
|
||||
return init_currently_empty_zone(zone, start_pfn, num_pages);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
247
mm/migrate.c
247
mm/migrate.c
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Memory Migration functionality - linux/mm/migration.c
|
||||
* Memory Migration functionality - linux/mm/migrate.c
|
||||
*
|
||||
* Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
|
||||
*
|
||||
@ -30,7 +30,7 @@
|
||||
#include <linux/mempolicy.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/memcontrol.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/hugetlb.h>
|
||||
#include <linux/hugetlb_cgroup.h>
|
||||
@ -171,6 +171,9 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
|
||||
else
|
||||
page_add_file_rmap(new);
|
||||
|
||||
if (vma->vm_flags & VM_LOCKED)
|
||||
mlock_vma_page(new);
|
||||
|
||||
/* No need to invalidate - it was non-present before */
|
||||
update_mmu_cache(vma, addr, ptep);
|
||||
unlock:
|
||||
@ -311,6 +314,8 @@ int migrate_page_move_mapping(struct address_space *mapping,
|
||||
struct buffer_head *head, enum migrate_mode mode,
|
||||
int extra_count)
|
||||
{
|
||||
struct zone *oldzone, *newzone;
|
||||
int dirty;
|
||||
int expected_count = 1 + extra_count;
|
||||
void **pslot;
|
||||
|
||||
@ -318,9 +323,20 @@ int migrate_page_move_mapping(struct address_space *mapping,
|
||||
/* Anonymous page without mapping */
|
||||
if (page_count(page) != expected_count)
|
||||
return -EAGAIN;
|
||||
|
||||
/* No turning back from here */
|
||||
set_page_memcg(newpage, page_memcg(page));
|
||||
newpage->index = page->index;
|
||||
newpage->mapping = page->mapping;
|
||||
if (PageSwapBacked(page))
|
||||
SetPageSwapBacked(newpage);
|
||||
|
||||
return MIGRATEPAGE_SUCCESS;
|
||||
}
|
||||
|
||||
oldzone = page_zone(page);
|
||||
newzone = page_zone(newpage);
|
||||
|
||||
spin_lock_irq(&mapping->tree_lock);
|
||||
|
||||
pslot = radix_tree_lookup_slot(&mapping->page_tree,
|
||||
@ -353,14 +369,28 @@ int migrate_page_move_mapping(struct address_space *mapping,
|
||||
}
|
||||
|
||||
/*
|
||||
* Now we know that no one else is looking at the page.
|
||||
* Now we know that no one else is looking at the page:
|
||||
* no turning back from here.
|
||||
*/
|
||||
set_page_memcg(newpage, page_memcg(page));
|
||||
newpage->index = page->index;
|
||||
newpage->mapping = page->mapping;
|
||||
if (PageSwapBacked(page))
|
||||
SetPageSwapBacked(newpage);
|
||||
|
||||
get_page(newpage); /* add cache reference */
|
||||
if (PageSwapCache(page)) {
|
||||
SetPageSwapCache(newpage);
|
||||
set_page_private(newpage, page_private(page));
|
||||
}
|
||||
|
||||
/* Move dirty while page refs frozen and newpage not yet exposed */
|
||||
dirty = PageDirty(page);
|
||||
if (dirty) {
|
||||
ClearPageDirty(page);
|
||||
SetPageDirty(newpage);
|
||||
}
|
||||
|
||||
radix_tree_replace_slot(pslot, newpage);
|
||||
|
||||
/*
|
||||
@ -370,6 +400,9 @@ int migrate_page_move_mapping(struct address_space *mapping,
|
||||
*/
|
||||
page_unfreeze_refs(page, expected_count - 1);
|
||||
|
||||
spin_unlock(&mapping->tree_lock);
|
||||
/* Leave irq disabled to prevent preemption while updating stats */
|
||||
|
||||
/*
|
||||
* If moved to a different zone then also account
|
||||
* the page for that zone. Other VM counters will be
|
||||
@ -380,13 +413,19 @@ int migrate_page_move_mapping(struct address_space *mapping,
|
||||
* via NR_FILE_PAGES and NR_ANON_PAGES if they
|
||||
* are mapped to swap space.
|
||||
*/
|
||||
__dec_zone_page_state(page, NR_FILE_PAGES);
|
||||
__inc_zone_page_state(newpage, NR_FILE_PAGES);
|
||||
if (!PageSwapCache(page) && PageSwapBacked(page)) {
|
||||
__dec_zone_page_state(page, NR_SHMEM);
|
||||
__inc_zone_page_state(newpage, NR_SHMEM);
|
||||
if (newzone != oldzone) {
|
||||
__dec_zone_state(oldzone, NR_FILE_PAGES);
|
||||
__inc_zone_state(newzone, NR_FILE_PAGES);
|
||||
if (PageSwapBacked(page) && !PageSwapCache(page)) {
|
||||
__dec_zone_state(oldzone, NR_SHMEM);
|
||||
__inc_zone_state(newzone, NR_SHMEM);
|
||||
}
|
||||
if (dirty && mapping_cap_account_dirty(mapping)) {
|
||||
__dec_zone_state(oldzone, NR_FILE_DIRTY);
|
||||
__inc_zone_state(newzone, NR_FILE_DIRTY);
|
||||
}
|
||||
}
|
||||
spin_unlock_irq(&mapping->tree_lock);
|
||||
local_irq_enable();
|
||||
|
||||
return MIGRATEPAGE_SUCCESS;
|
||||
}
|
||||
@ -401,12 +440,6 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
|
||||
int expected_count;
|
||||
void **pslot;
|
||||
|
||||
if (!mapping) {
|
||||
if (page_count(page) != 1)
|
||||
return -EAGAIN;
|
||||
return MIGRATEPAGE_SUCCESS;
|
||||
}
|
||||
|
||||
spin_lock_irq(&mapping->tree_lock);
|
||||
|
||||
pslot = radix_tree_lookup_slot(&mapping->page_tree,
|
||||
@ -424,6 +457,9 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
|
||||
return -EAGAIN;
|
||||
}
|
||||
|
||||
set_page_memcg(newpage, page_memcg(page));
|
||||
newpage->index = page->index;
|
||||
newpage->mapping = page->mapping;
|
||||
get_page(newpage);
|
||||
|
||||
radix_tree_replace_slot(pslot, newpage);
|
||||
@ -510,20 +546,9 @@ void migrate_page_copy(struct page *newpage, struct page *page)
|
||||
if (PageMappedToDisk(page))
|
||||
SetPageMappedToDisk(newpage);
|
||||
|
||||
if (PageDirty(page)) {
|
||||
clear_page_dirty_for_io(page);
|
||||
/*
|
||||
* Want to mark the page and the radix tree as dirty, and
|
||||
* redo the accounting that clear_page_dirty_for_io undid,
|
||||
* but we can't use set_page_dirty because that function
|
||||
* is actually a signal that all of the page has become dirty.
|
||||
* Whereas only part of our page may be dirty.
|
||||
*/
|
||||
if (PageSwapBacked(page))
|
||||
SetPageDirty(newpage);
|
||||
else
|
||||
__set_page_dirty_nobuffers(newpage);
|
||||
}
|
||||
/* Move dirty on pages not done by migrate_page_move_mapping() */
|
||||
if (PageDirty(page))
|
||||
SetPageDirty(newpage);
|
||||
|
||||
if (page_is_young(page))
|
||||
set_page_young(newpage);
|
||||
@ -537,7 +562,6 @@ void migrate_page_copy(struct page *newpage, struct page *page)
|
||||
cpupid = page_cpupid_xchg_last(page, -1);
|
||||
page_cpupid_xchg_last(newpage, cpupid);
|
||||
|
||||
mlock_migrate_page(newpage, page);
|
||||
ksm_migrate_page(newpage, page);
|
||||
/*
|
||||
* Please do not reorder this without considering how mm/ksm.c's
|
||||
@ -721,33 +745,13 @@ static int fallback_migrate_page(struct address_space *mapping,
|
||||
* MIGRATEPAGE_SUCCESS - success
|
||||
*/
|
||||
static int move_to_new_page(struct page *newpage, struct page *page,
|
||||
int page_was_mapped, enum migrate_mode mode)
|
||||
enum migrate_mode mode)
|
||||
{
|
||||
struct address_space *mapping;
|
||||
int rc;
|
||||
|
||||
/*
|
||||
* Block others from accessing the page when we get around to
|
||||
* establishing additional references. We are the only one
|
||||
* holding a reference to the new page at this point.
|
||||
*/
|
||||
if (!trylock_page(newpage))
|
||||
BUG();
|
||||
|
||||
/* Prepare mapping for the new page.*/
|
||||
newpage->index = page->index;
|
||||
newpage->mapping = page->mapping;
|
||||
if (PageSwapBacked(page))
|
||||
SetPageSwapBacked(newpage);
|
||||
|
||||
/*
|
||||
* Indirectly called below, migrate_page_copy() copies PG_dirty and thus
|
||||
* needs newpage's memcg set to transfer memcg dirty page accounting.
|
||||
* So perform memcg migration in two steps:
|
||||
* 1. set newpage->mem_cgroup (here)
|
||||
* 2. clear page->mem_cgroup (below)
|
||||
*/
|
||||
set_page_memcg(newpage, page_memcg(page));
|
||||
VM_BUG_ON_PAGE(!PageLocked(page), page);
|
||||
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
|
||||
|
||||
mapping = page_mapping(page);
|
||||
if (!mapping)
|
||||
@ -759,23 +763,19 @@ static int move_to_new_page(struct page *newpage, struct page *page,
|
||||
* space which also has its own migratepage callback. This
|
||||
* is the most common path for page migration.
|
||||
*/
|
||||
rc = mapping->a_ops->migratepage(mapping,
|
||||
newpage, page, mode);
|
||||
rc = mapping->a_ops->migratepage(mapping, newpage, page, mode);
|
||||
else
|
||||
rc = fallback_migrate_page(mapping, newpage, page, mode);
|
||||
|
||||
if (rc != MIGRATEPAGE_SUCCESS) {
|
||||
set_page_memcg(newpage, NULL);
|
||||
newpage->mapping = NULL;
|
||||
} else {
|
||||
/*
|
||||
* When successful, old pagecache page->mapping must be cleared before
|
||||
* page is freed; but stats require that PageAnon be left as PageAnon.
|
||||
*/
|
||||
if (rc == MIGRATEPAGE_SUCCESS) {
|
||||
set_page_memcg(page, NULL);
|
||||
if (page_was_mapped)
|
||||
remove_migration_ptes(page, newpage);
|
||||
page->mapping = NULL;
|
||||
if (!PageAnon(page))
|
||||
page->mapping = NULL;
|
||||
}
|
||||
|
||||
unlock_page(newpage);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
@ -824,6 +824,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
|
||||
goto out_unlock;
|
||||
wait_on_page_writeback(page);
|
||||
}
|
||||
|
||||
/*
|
||||
* By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
|
||||
* we cannot notice that anon_vma is freed while we migrate a page.
|
||||
@ -831,34 +832,26 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
|
||||
* of migration. File cache pages are no problem because of page_lock()
|
||||
* File Caches may use write_page() or lock_page() in migration, then,
|
||||
* so we only need to care about Anon pages here.
|
||||
*
|
||||
* Only page_get_anon_vma() understands the subtleties of
|
||||
* getting a hold on an anon_vma from outside one of its mms.
|
||||
* But if we cannot get anon_vma, then we won't need it anyway,
|
||||
* because that implies that the anon page is no longer mapped
|
||||
* (and cannot be remapped so long as we hold the page lock).
|
||||
*/
|
||||
if (PageAnon(page) && !PageKsm(page)) {
|
||||
/*
|
||||
* Only page_lock_anon_vma_read() understands the subtleties of
|
||||
* getting a hold on an anon_vma from outside one of its mms.
|
||||
*/
|
||||
if (PageAnon(page) && !PageKsm(page))
|
||||
anon_vma = page_get_anon_vma(page);
|
||||
if (anon_vma) {
|
||||
/*
|
||||
* Anon page
|
||||
*/
|
||||
} else if (PageSwapCache(page)) {
|
||||
/*
|
||||
* We cannot be sure that the anon_vma of an unmapped
|
||||
* swapcache page is safe to use because we don't
|
||||
* know in advance if the VMA that this page belonged
|
||||
* to still exists. If the VMA and others sharing the
|
||||
* data have been freed, then the anon_vma could
|
||||
* already be invalid.
|
||||
*
|
||||
* To avoid this possibility, swapcache pages get
|
||||
* migrated but are not remapped when migration
|
||||
* completes
|
||||
*/
|
||||
} else {
|
||||
goto out_unlock;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Block others from accessing the new page when we get around to
|
||||
* establishing additional references. We are usually the only one
|
||||
* holding a reference to newpage at this point. We used to have a BUG
|
||||
* here if trylock_page(newpage) fails, but would like to allow for
|
||||
* cases where there might be a race with the previous use of newpage.
|
||||
* This is much like races on refcount of oldpage: just don't BUG().
|
||||
*/
|
||||
if (unlikely(!trylock_page(newpage)))
|
||||
goto out_unlock;
|
||||
|
||||
if (unlikely(isolated_balloon_page(page))) {
|
||||
/*
|
||||
@ -869,7 +862,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
|
||||
* the page migration right away (protected by page lock).
|
||||
*/
|
||||
rc = balloon_page_migrate(newpage, page, mode);
|
||||
goto out_unlock;
|
||||
goto out_unlock_both;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -888,30 +881,30 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
|
||||
VM_BUG_ON_PAGE(PageAnon(page), page);
|
||||
if (page_has_private(page)) {
|
||||
try_to_free_buffers(page);
|
||||
goto out_unlock;
|
||||
goto out_unlock_both;
|
||||
}
|
||||
goto skip_unmap;
|
||||
}
|
||||
|
||||
/* Establish migration ptes or remove ptes */
|
||||
if (page_mapped(page)) {
|
||||
} else if (page_mapped(page)) {
|
||||
/* Establish migration ptes */
|
||||
VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
|
||||
page);
|
||||
try_to_unmap(page,
|
||||
TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
|
||||
page_was_mapped = 1;
|
||||
}
|
||||
|
||||
skip_unmap:
|
||||
if (!page_mapped(page))
|
||||
rc = move_to_new_page(newpage, page, page_was_mapped, mode);
|
||||
rc = move_to_new_page(newpage, page, mode);
|
||||
|
||||
if (rc && page_was_mapped)
|
||||
remove_migration_ptes(page, page);
|
||||
if (page_was_mapped)
|
||||
remove_migration_ptes(page,
|
||||
rc == MIGRATEPAGE_SUCCESS ? newpage : page);
|
||||
|
||||
out_unlock_both:
|
||||
unlock_page(newpage);
|
||||
out_unlock:
|
||||
/* Drop an anon_vma reference if we took one */
|
||||
if (anon_vma)
|
||||
put_anon_vma(anon_vma);
|
||||
|
||||
out_unlock:
|
||||
unlock_page(page);
|
||||
out:
|
||||
return rc;
|
||||
@ -937,10 +930,11 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
|
||||
int force, enum migrate_mode mode,
|
||||
enum migrate_reason reason)
|
||||
{
|
||||
int rc = 0;
|
||||
int rc = MIGRATEPAGE_SUCCESS;
|
||||
int *result = NULL;
|
||||
struct page *newpage = get_new_page(page, private, &result);
|
||||
struct page *newpage;
|
||||
|
||||
newpage = get_new_page(page, private, &result);
|
||||
if (!newpage)
|
||||
return -ENOMEM;
|
||||
|
||||
@ -954,6 +948,8 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
|
||||
goto out;
|
||||
|
||||
rc = __unmap_and_move(page, newpage, force, mode);
|
||||
if (rc == MIGRATEPAGE_SUCCESS)
|
||||
put_new_page = NULL;
|
||||
|
||||
out:
|
||||
if (rc != -EAGAIN) {
|
||||
@ -980,10 +976,9 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
|
||||
* it. Otherwise, putback_lru_page() will drop the reference grabbed
|
||||
* during isolation.
|
||||
*/
|
||||
if (rc != MIGRATEPAGE_SUCCESS && put_new_page) {
|
||||
ClearPageSwapBacked(newpage);
|
||||
if (put_new_page)
|
||||
put_new_page(newpage, private);
|
||||
} else if (unlikely(__is_movable_balloon_page(newpage))) {
|
||||
else if (unlikely(__is_movable_balloon_page(newpage))) {
|
||||
/* drop our reference, page already in the balloon */
|
||||
put_page(newpage);
|
||||
} else
|
||||
@ -1021,7 +1016,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
|
||||
struct page *hpage, int force,
|
||||
enum migrate_mode mode)
|
||||
{
|
||||
int rc = 0;
|
||||
int rc = -EAGAIN;
|
||||
int *result = NULL;
|
||||
int page_was_mapped = 0;
|
||||
struct page *new_hpage;
|
||||
@ -1043,8 +1038,6 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
|
||||
if (!new_hpage)
|
||||
return -ENOMEM;
|
||||
|
||||
rc = -EAGAIN;
|
||||
|
||||
if (!trylock_page(hpage)) {
|
||||
if (!force || mode != MIGRATE_SYNC)
|
||||
goto out;
|
||||
@ -1054,6 +1047,9 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
|
||||
if (PageAnon(hpage))
|
||||
anon_vma = page_get_anon_vma(hpage);
|
||||
|
||||
if (unlikely(!trylock_page(new_hpage)))
|
||||
goto put_anon;
|
||||
|
||||
if (page_mapped(hpage)) {
|
||||
try_to_unmap(hpage,
|
||||
TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
|
||||
@ -1061,16 +1057,22 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
|
||||
}
|
||||
|
||||
if (!page_mapped(hpage))
|
||||
rc = move_to_new_page(new_hpage, hpage, page_was_mapped, mode);
|
||||
rc = move_to_new_page(new_hpage, hpage, mode);
|
||||
|
||||
if (rc != MIGRATEPAGE_SUCCESS && page_was_mapped)
|
||||
remove_migration_ptes(hpage, hpage);
|
||||
if (page_was_mapped)
|
||||
remove_migration_ptes(hpage,
|
||||
rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage);
|
||||
|
||||
unlock_page(new_hpage);
|
||||
|
||||
put_anon:
|
||||
if (anon_vma)
|
||||
put_anon_vma(anon_vma);
|
||||
|
||||
if (rc == MIGRATEPAGE_SUCCESS)
|
||||
if (rc == MIGRATEPAGE_SUCCESS) {
|
||||
hugetlb_cgroup_migrate(hpage, new_hpage);
|
||||
put_new_page = NULL;
|
||||
}
|
||||
|
||||
unlock_page(hpage);
|
||||
out:
|
||||
@ -1082,7 +1084,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
|
||||
* it. Otherwise, put_page() will drop the reference grabbed during
|
||||
* isolation.
|
||||
*/
|
||||
if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
|
||||
if (put_new_page)
|
||||
put_new_page(new_hpage, private);
|
||||
else
|
||||
putback_active_hugepage(new_hpage);
|
||||
@ -1112,7 +1114,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
|
||||
*
|
||||
* The function returns after 10 attempts or if no pages are movable any more
|
||||
* because the list has become empty or no retryable pages exist any more.
|
||||
* The caller should call putback_lru_pages() to return pages to the LRU
|
||||
* The caller should call putback_movable_pages() to return pages to the LRU
|
||||
* or free list only if ret != 0.
|
||||
*
|
||||
* Returns the number of pages that were not migrated, or an error code.
|
||||
@ -1169,7 +1171,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
|
||||
}
|
||||
}
|
||||
}
|
||||
rc = nr_failed + retry;
|
||||
nr_failed += retry;
|
||||
rc = nr_failed;
|
||||
out:
|
||||
if (nr_succeeded)
|
||||
count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
|
||||
@ -1786,7 +1789,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
|
||||
SetPageActive(page);
|
||||
if (TestClearPageUnevictable(new_page))
|
||||
SetPageUnevictable(page);
|
||||
mlock_migrate_page(page, new_page);
|
||||
|
||||
unlock_page(new_page);
|
||||
put_page(new_page); /* Free it */
|
||||
@ -1828,8 +1830,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
|
||||
goto fail_putback;
|
||||
}
|
||||
|
||||
mem_cgroup_migrate(page, new_page, false);
|
||||
|
||||
mlock_migrate_page(new_page, page);
|
||||
set_page_memcg(new_page, page_memcg(page));
|
||||
set_page_memcg(page, NULL);
|
||||
page_remove_rmap(page);
|
||||
|
||||
spin_unlock(ptl);
|
||||
|
@ -234,7 +234,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
|
||||
|
||||
/* This also avoids any overflows on PAGE_CACHE_ALIGN */
|
||||
pages = len >> PAGE_SHIFT;
|
||||
pages += (len & ~PAGE_MASK) != 0;
|
||||
pages += (offset_in_page(len)) != 0;
|
||||
|
||||
if (!access_ok(VERIFY_WRITE, vec, pages))
|
||||
return -EFAULT;
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user