mirror of https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
Merge branch 'akpm' (patches from Andrew)
Merge misc updates from Andrew Morton:
 "146 patches. Subsystems affected by this patch series: kthread, ia64,
  scripts, ntfs, squashfs, ocfs2, vfs, and mm (slab-generic, slab, kmemleak,
  dax, kasan, debug, pagecache, gup, shmem, frontswap, memremap, memcg,
  selftests, pagemap, dma, vmalloc, memory-failure, hugetlb, userfaultfd,
  vmscan, mempolicy, oom-kill, hugetlbfs, migration, thp, ksm, page-poison,
  percpu, rmap, zswap, zram, cleanups, hmm, and damon)"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (146 commits)
  mm/damon: hide kernel pointer from tracepoint event
  mm/damon/vaddr: hide kernel pointer from damon_va_three_regions() failure log
  mm/damon/vaddr: use pr_debug() for damon_va_three_regions() failure logging
  mm/damon/dbgfs: remove an unnecessary variable
  mm/damon: move the implementation of damon_insert_region to damon.h
  mm/damon: add access checking for hugetlb pages
  Docs/admin-guide/mm/damon/usage: update for schemes statistics
  mm/damon/dbgfs: support all DAMOS stats
  Docs/admin-guide/mm/damon/reclaim: document statistics parameters
  mm/damon/reclaim: provide reclamation statistics
  mm/damon/schemes: account how many times quota limit has exceeded
  mm/damon/schemes: account scheme actions that successfully applied
  mm/damon: remove a mistakenly added comment for a future feature
  Docs/admin-guide/mm/damon/usage: update for kdamond_pid and (mk|rm)_contexts
  Docs/admin-guide/mm/damon/usage: mention tracepoint at the beginning
  Docs/admin-guide/mm/damon/usage: remove redundant information
  Docs/admin-guide/mm/damon/usage: update for scheme quotas and watermarks
  mm/damon: convert macro functions to static inline functions
  mm/damon: modify damon_rand() macro to static inline function
  mm/damon: move damon_rand() definition into damon.h
  ...
This commit is contained in: commit f56caedaf9
@ -29,12 +29,14 @@ Brief summary of control files::

hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded
hugetlb.<hugepagesize>.usage_in_bytes # show current usage for "hugepagesize" hugetlb
hugetlb.<hugepagesize>.failcnt # show the number of allocation failures due to HugeTLB usage limit
hugetlb.<hugepagesize>.numa_stat # show the numa information of the hugetlb memory charged to this cgroup

For a system supporting three hugepage sizes (64k, 32M and 1G), the control
files include::

hugetlb.1GB.limit_in_bytes
hugetlb.1GB.max_usage_in_bytes
hugetlb.1GB.numa_stat
hugetlb.1GB.usage_in_bytes
hugetlb.1GB.failcnt
hugetlb.1GB.rsvd.limit_in_bytes
@ -43,6 +45,7 @@ files include::
hugetlb.1GB.rsvd.failcnt
hugetlb.64KB.limit_in_bytes
hugetlb.64KB.max_usage_in_bytes
hugetlb.64KB.numa_stat
hugetlb.64KB.usage_in_bytes
hugetlb.64KB.failcnt
hugetlb.64KB.rsvd.limit_in_bytes
@ -51,6 +54,7 @@ files include::
hugetlb.64KB.rsvd.failcnt
hugetlb.32MB.limit_in_bytes
hugetlb.32MB.max_usage_in_bytes
hugetlb.32MB.numa_stat
hugetlb.32MB.usage_in_bytes
hugetlb.32MB.failcnt
hugetlb.32MB.rsvd.limit_in_bytes
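As a quick, purely illustrative sketch of how the new per-node file could be
inspected once such a cgroup exists (the cgroup mount point, the group name
``mygroup`` and the sample output format are assumptions, not taken from the
patch)::

    # cd /sys/fs/cgroup/hugetlb/mygroup
    # cat hugetlb.1GB.numa_stat
    total=2147483648 N0=1073741824 N1=1073741824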
@ -1268,6 +1268,9 @@ PAGE_SIZE multiple when read back.
The number of processes belonging to this cgroup
killed by any kind of OOM killer.

oom_group_kill
The number of times a group OOM has occurred.

memory.events.local
Similar to memory.events but the fields in the file are local
to the cgroup i.e. not hierarchical. The file modified event
@ -1311,6 +1314,9 @@ PAGE_SIZE multiple when read back.
sock (npn)
Amount of memory used in network transmission buffers

vmalloc (npn)
Amount of memory used for vmap backed memory.

shmem
Amount of cached filesystem data that is swap-backed,
such as tmpfs, shm segments, shared anonymous mmap()s
@ -2260,6 +2266,11 @@ HugeTLB Interface Files
are local to the cgroup i.e. not hierarchical. The file modified event
generated on this file reflects only the local events.

hugetlb.<hugepagesize>.numa_stat
Similar to memory.numa_stat, it shows the numa information of the
hugetlb pages of <hugepagesize> in this cgroup. Only active in
use hugetlb pages are included. The per-node values are in bytes.

Misc
----
@ -208,6 +208,31 @@ PID of the DAMON thread.

If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread. Else,
-1.

nr_reclaim_tried_regions
------------------------

Number of memory regions that DAMON_RECLAIM has tried to reclaim.

bytes_reclaim_tried_regions
---------------------------

Total bytes of memory regions that DAMON_RECLAIM has tried to reclaim.

nr_reclaimed_regions
--------------------

Number of memory regions that DAMON_RECLAIM has successfully reclaimed.

bytes_reclaimed_regions
-----------------------

Total bytes of memory regions that DAMON_RECLAIM has successfully reclaimed.

nr_quota_exceeds
----------------

Number of times that the time/space quota limits have been exceeded.
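For a quick sketch of how these counters can be read at runtime (assuming the
DAMON_RECLAIM module parameters are exposed under
``/sys/module/damon_reclaim/parameters/``; the sample values are illustrative)::

    # cd /sys/module/damon_reclaim/parameters
    # cat nr_reclaim_tried_regions bytes_reclaim_tried_regions
    36
    45088768
    # cat nr_reclaimed_regions bytes_reclaimed_regions nr_quota_exceeds
    29
    31457280
    2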
Example
=======
@ -7,37 +7,40 @@ Detailed Usages

DAMON provides below three interfaces for different users.

- *DAMON user space tool.*
This is for privileged people such as system administrators who want a
just-working human-friendly interface. Using this, users can use the DAMON’s
major features in a human-friendly way. It may not be highly tuned for
special cases, though. It supports both virtual and physical address spaces
monitoring.
`This <https://github.com/awslabs/damo>`_ is for privileged people such as
system administrators who want a just-working human-friendly interface.
Using this, users can use the DAMON’s major features in a human-friendly way.
It may not be highly tuned for special cases, though. It supports both
virtual and physical address spaces monitoring. For more detail, please
refer to its `usage document
<https://github.com/awslabs/damo/blob/next/USAGE.md>`_.
- *debugfs interface.*
This is for privileged user space programmers who want more optimized use of
DAMON. Using this, users can use DAMON’s major features by reading
from and writing to special debugfs files. Therefore, you can write and use
your personalized DAMON debugfs wrapper programs that reads/writes the
debugfs files instead of you. The DAMON user space tool is also a reference
implementation of such programs. It supports both virtual and physical
address spaces monitoring.
:ref:`This <debugfs_interface>` is for privileged user space programmers who
want more optimized use of DAMON. Using this, users can use DAMON’s major
features by reading from and writing to special debugfs files. Therefore,
you can write and use your personalized DAMON debugfs wrapper programs that
reads/writes the debugfs files instead of you. The `DAMON user space tool
<https://github.com/awslabs/damo>`_ is one example of such programs. It
supports both virtual and physical address spaces monitoring. Note that this
interface provides only simple :ref:`statistics <damos_stats>` for the
monitoring results. For detailed monitoring results, DAMON provides a
:ref:`tracepoint <tracepoint>`.
- *Kernel Space Programming Interface.*
This is for kernel space programmers. Using this, users can utilize every
feature of DAMON most flexibly and efficiently by writing kernel space
DAMON application programs for you. You can even extend DAMON for various
address spaces.
:doc:`This </vm/damon/api>` is for kernel space programmers. Using this,
users can utilize every feature of DAMON most flexibly and efficiently by
writing kernel space DAMON application programs for you. You can even extend
DAMON for various address spaces. For detail, please refer to the interface
:doc:`document </vm/damon/api>`.

Nevertheless, you could write your own user space tool using the debugfs
interface. A reference implementation is available at
https://github.com/awslabs/damo. If you are a kernel programmer, you could
refer to :doc:`/vm/damon/api` for the kernel space programming interface. For
the reason, this document describes only the debugfs interface

.. _debugfs_interface:

debugfs Interface
=================

DAMON exports five files, ``attrs``, ``target_ids``, ``init_regions``,
``schemes`` and ``monitor_on`` under its debugfs directory,
``<debugfs>/damon/``.
DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``,
``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and
``rm_contexts`` under its debugfs directory, ``<debugfs>/damon/``.

Attributes
@ -131,24 +134,38 @@ Schemes

For usual DAMON-based data access aware memory management optimizations, users
would simply want the system to apply a memory management action to a memory
region of a specific size having a specific access frequency for a specific
time. DAMON receives such formalized operation schemes from the user and
applies those to the target processes. It also counts the total number and
size of regions that each scheme is applied. This statistics can be used for
online analysis or tuning of the schemes.
region of a specific access pattern. DAMON receives such formalized operation
schemes from the user and applies those to the target processes.

Users can get and set the schemes by reading from and writing to ``schemes``
debugfs file. Reading the file also shows the statistics of each scheme. To
the file, each of the schemes should be represented in each line in below form:
the file, each of the schemes should be represented in each line in below
form::

min-size max-size min-acc max-acc min-age max-age action
<target access pattern> <action> <quota> <watermarks>

Note that the ranges are closed interval. Bytes for the size of regions
(``min-size`` and ``max-size``), number of monitored accesses per aggregate
interval for access frequency (``min-acc`` and ``max-acc``), number of
aggregate intervals for the age of regions (``min-age`` and ``max-age``), and a
predefined integer for memory management actions should be used. The supported
numbers and their meanings are as below.
You can disable schemes by simply writing an empty string to the file.

Target Access Pattern
~~~~~~~~~~~~~~~~~~~~~

The ``<target access pattern>`` is constructed with three ranges in below
form::

min-size max-size min-acc max-acc min-age max-age

Specifically, bytes for the size of regions (``min-size`` and ``max-size``),
number of monitored accesses per aggregate interval for access frequency
(``min-acc`` and ``max-acc``), number of aggregate intervals for the age of
regions (``min-age`` and ``max-age``) are specified. Note that the ranges are
closed interval.

Action
~~~~~~

The ``<action>`` is a predefined integer for memory management actions, which
DAMON will apply to the regions having the target access pattern. The
supported numbers and their meanings are as below.

- 0: Call ``madvise()`` for the region with ``MADV_WILLNEED``
- 1: Call ``madvise()`` for the region with ``MADV_COLD``
@ -157,20 +174,82 @@ numbers and their meanings are as below.
- 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``
- 5: Do nothing but count the statistics

You can disable schemes by simply writing an empty string to the file. For
example, below commands applies a scheme saying "If a memory region of size in
[4KiB, 8KiB] is showing accesses per aggregate interval in [0, 5] for aggregate
interval in [10, 20], page out the region", check the entered scheme again, and
finally remove the scheme. ::
Quota
~~~~~

Optimal ``target access pattern`` for each ``action`` is workload dependent, so
not easy to find. Worse yet, setting a scheme of some action too aggressive
can cause severe overhead. To avoid such overhead, users can limit time and
size quota for the scheme via the ``<quota>`` in below form::

<ms> <sz> <reset interval> <priority weights>

This makes DAMON to try to use only up to ``<ms>`` milliseconds for applying
the action to memory regions of the ``target access pattern`` within the
``<reset interval>`` milliseconds, and to apply the action to only up to
``<sz>`` bytes of memory regions within the ``<reset interval>``. Setting both
``<ms>`` and ``<sz>`` zero disables the quota limits.

When the quota limit is expected to be exceeded, DAMON prioritizes found memory
regions of the ``target access pattern`` based on their size, access frequency,
and age. For personalized prioritization, users can set the weights for the
three properties in ``<priority weights>`` in below form::

<size weight> <access frequency weight> <age weight>
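A minimal sketch of how the quota and the priority weights could be composed in
shell variables before being folded into a full scheme line (the values are
purely illustrative; a complete command sequence is shown in the Example
subsection below)::

    # quota="10 $((1024*1024*1024)) 1000" # up to 10ms and 1GiB per 1000ms window
    # weights="0 0 100" # break ties by region age only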
Watermarks
~~~~~~~~~~

Some schemes would need to run based on current value of the system's specific
metrics like free memory ratio. For such cases, users can specify watermarks
for the condition.::

<metric> <check interval> <high mark> <middle mark> <low mark>

``<metric>`` is a predefined integer for the metric to be checked. The
supported numbers and their meanings are as below.

- 0: Ignore the watermarks
- 1: System's free memory rate (per thousand)

The value of the metric is checked every ``<check interval>`` microseconds.

If the value is higher than ``<high mark>`` or lower than ``<low mark>``, the
scheme is deactivated. If the value is lower than ``<mid mark>``, the scheme
is activated.

.. _damos_stats:

Statistics
~~~~~~~~~~

It also counts the total number and bytes of regions that each scheme is tried
to be applied, the two numbers for the regions that each scheme is successfully
applied, and the total number of the quota limit exceeds. This statistics can
be used for online analysis or tuning of the schemes.

The statistics can be shown by reading the ``schemes`` file. Reading the file
will show each scheme you entered in each line, and the five numbers for the
statistics will be added at the end of each line.

Example
~~~~~~~

Below commands applies a scheme saying "If a memory region of size in [4KiB,
8KiB] is showing accesses per aggregate interval in [0, 5] for aggregate
interval in [10, 20], page out the region. For the paging out, use only up to
10ms per second, and also don't page out more than 1GiB per second. Under the
limitation, page out memory regions having longer age first. Also, check the
free memory rate of the system every 5 seconds, start the monitoring and paging
out when the free memory rate becomes lower than 50%, but stop it if the free
memory rate becomes larger than 60%, or lower than 30%".::

# cd <debugfs>/damon
# echo "4096 8192 0 5 10 20 2" > schemes
# cat schemes
4096 8192 0 5 10 20 2 0 0
# echo > schemes

The last two integers in the 4th line of above example is the total number and
the total size of the regions that the scheme is applied.
# scheme="4096 8192 0 5 10 20 2" # target access pattern and action
# scheme+=" 10 $((1024*1024*1024)) 1000" # quotas
# scheme+=" 0 0 100" # prioritization weights
# scheme+=" 1 5000000 600 500 300" # watermarks
# echo "$scheme" > schemes

Turning On/Off
@ -195,6 +274,54 @@ the monitoring is turned on. If you write to the files while DAMON is running,
an error code such as ``-EBUSY`` will be returned.

Monitoring Thread PID
---------------------

DAMON does requested monitoring with a kernel thread called ``kdamond``. You
can get the pid of the thread by reading the ``kdamond_pid`` file. When the
monitoring is turned off, reading the file returns ``none``. ::

# cd <debugfs>/damon
# cat monitor_on
off
# cat kdamond_pid
none
# echo on > monitor_on
# cat kdamond_pid
18594

Using Multiple Monitoring Threads
---------------------------------

One ``kdamond`` thread is created for each monitoring context. You can create
and remove monitoring contexts for multiple ``kdamond`` required use case using
the ``mk_contexts`` and ``rm_contexts`` files.

Writing the name of the new context to the ``mk_contexts`` file creates a
directory of the name on the DAMON debugfs directory. The directory will have
DAMON debugfs files for the context. ::

# cd <debugfs>/damon
# ls foo
# ls: cannot access 'foo': No such file or directory
# echo foo > mk_contexts
# ls foo
# attrs init_regions kdamond_pid schemes target_ids

If the context is not needed anymore, you can remove it and the corresponding
directory by putting the name of the context to the ``rm_contexts`` file. ::

# echo foo > rm_contexts
# ls foo
# ls: cannot access 'foo': No such file or directory

Note that ``mk_contexts``, ``rm_contexts``, and ``monitor_on`` files are in the
root directory only.

.. _tracepoint:

Tracepoint for Monitoring Results
=================================
@ -408,7 +408,7 @@ follows:

Memory Policy APIs
==================

Linux supports 3 system calls for controlling memory policy. These APIs
Linux supports 4 system calls for controlling memory policy. These APIs
always affect only the calling task, the calling task's address space, or
some shared object mapped into the calling task's address space.

@ -460,6 +460,20 @@ requested via the 'flags' argument.

See the mbind(2) man page for more details.

Set home node for a Range of Task's Address Space::

long sys_set_mempolicy_home_node(unsigned long start, unsigned long len,
unsigned long home_node,
unsigned long flags);

sys_set_mempolicy_home_node sets the home node for a VMA policy present in the
task's address range. The system call updates the home node only for the existing
mempolicy range. Other address ranges are ignored. A home node is the NUMA node
closest to which page allocation will come from. Specifying the home node overrides
the default allocation policy to allocate memory close to the local node for an
executing CPU.

Memory Policy Command Line Interface
====================================
@ -948,7 +948,7 @@ how much memory needs to be free before kswapd goes back to sleep.

The unit is in fractions of 10,000. The default value of 10 means the
distances between watermarks are 0.1% of the available memory in the
node/system. The maximum value is 1000, or 10% of memory.
node/system. The maximum value is 3000, or 30% of memory.
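The knob itself is the usual sysctl file, so the raised ceiling can be
exercised directly (the shown values are only an illustration)::

    # cat /proc/sys/vm/watermark_scale_factor
    10
    # echo 3000 > /proc/sys/vm/watermark_scale_factor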
A high rate of threads entering direct reclaim (allocstall) or kswapd
going to sleep prematurely (kswapd_low_wmark_hit_quickly) can indicate
@ -426,12 +426,14 @@ with the memory region, as the case would be with BSS (uninitialized data).
The "pathname" shows the name associated file for this mapping. If the mapping
is not associated with a file:

======= ====================================
============= ====================================
[heap] the heap of the program
[stack] the stack of the main process
[vdso] the "virtual dynamic shared object",
the kernel system call handler
======= ====================================
[anon:<name>] an anonymous mapping that has been
named by userspace
============= ====================================

or if empty, the mapping is anonymous.
@ -66,9 +66,11 @@ PTE Page Table Helpers
+---------------------------+--------------------------------------------------+
| pte_mknotpresent | Invalidates a mapped PTE |
+---------------------------+--------------------------------------------------+
| ptep_get_and_clear | Clears a PTE |
| ptep_clear | Clears a PTE |
+---------------------------+--------------------------------------------------+
| ptep_get_and_clear_full | Clears a PTE |
| ptep_get_and_clear | Clears and returns PTE |
+---------------------------+--------------------------------------------------+
| ptep_get_and_clear_full | Clears and returns PTE (batched PTE unmap) |
+---------------------------+--------------------------------------------------+
| ptep_test_and_clear_young | Clears young from a PTE |
+---------------------------+--------------------------------------------------+
@ -247,12 +249,12 @@ SWAP Page Table Helpers
| __swp_to_pmd_entry | Creates a mapped PMD from a swapped entry (arch) |
+---------------------------+--------------------------------------------------+
| is_migration_entry | Tests a migration (read or write) swapped entry |
+---------------------------+--------------------------------------------------+
| is_write_migration_entry | Tests a write migration swapped entry |
+---------------------------+--------------------------------------------------+
| make_migration_entry_read | Converts into read migration swapped entry |
+---------------------------+--------------------------------------------------+
| make_migration_entry | Creates a migration swapped entry (read or write)|
+---------------------------+--------------------------------------------------+
+-------------------------------+----------------------------------------------+
| is_writable_migration_entry | Tests a write migration swapped entry |
+-------------------------------+----------------------------------------------+
| make_readable_migration_entry | Creates a read migration swapped entry |
+-------------------------------+----------------------------------------------+
| make_writable_migration_entry | Creates a write migration swapped entry |
+-------------------------------+----------------------------------------------+

[1] https://lore.kernel.org/linux-mm/20181017020930.GN30832@redhat.com/
@ -31,10 +31,12 @@ algorithms. If you are looking for advice on simply allocating memory, see the
page_migration
page_frags
page_owner
page_table_check
remap_file_pages
slub
split_page_table_lock
transhuge
unevictable-lru
vmalloced-kernel-stacks
z3fold
zsmalloc
@ -263,15 +263,15 @@ Monitoring Migration

The following events (counters) can be used to monitor page migration.

1. PGMIGRATE_SUCCESS: Normal page migration success. Each count means that a
page was migrated. If the page was a non-THP page, then this counter is
increased by one. If the page was a THP, then this counter is increased by
the number of THP subpages. For example, migration of a single 2MB THP that
has 4KB-size base pages (subpages) will cause this counter to increase by
512.
page was migrated. If the page was a non-THP and non-hugetlb page, then
this counter is increased by one. If the page was a THP or hugetlb, then
this counter is increased by the number of THP or hugetlb subpages.
For example, migration of a single 2MB THP that has 4KB-size base pages
(subpages) will cause this counter to increase by 512.

2. PGMIGRATE_FAIL: Normal page migration failure. Same counting rules as for
PGMIGRATE_SUCCESS, above: this will be increased by the number of subpages,
if it was a THP.
if it was a THP or hugetlb.

3. THP_MIGRATION_SUCCESS: A THP was migrated without being split.
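These counters are exported through ``/proc/vmstat``, so a quick way to watch
them on a running system is simply (the values below are illustrative)::

    # grep -E 'pgmigrate|thp_migration' /proc/vmstat
    pgmigrate_success 12744
    pgmigrate_fail 31
    thp_migration_success 6
    thp_migration_fail 0
    thp_migration_split 1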
Documentation/vm/page_table_check.rst (new file, 56 lines)
@ -0,0 +1,56 @@
.. SPDX-License-Identifier: GPL-2.0

.. _page_table_check:

================
Page Table Check
================

Introduction
============

Page table check allows to harden the kernel by ensuring that some types of
the memory corruptions are prevented.

Page table check performs extra verifications at the time when new pages become
accessible from the userspace by getting their page table entries (PTEs PMDs
etc.) added into the table.

In case of detected corruption, the kernel is crashed. There is a small
performance and memory overhead associated with the page table check. Therefore,
it is disabled by default, but can be optionally enabled on systems where the
extra hardening outweighs the performance costs. Also, because page table check
is synchronous, it can help with debugging double map memory corruption issues,
by crashing the kernel at the time a wrong mapping occurs instead of later which
is often the case with memory corruption bugs.

Double mapping detection logic
==============================

+-------------------+-------------------+-------------------+------------------+
| Current Mapping | New mapping | Permissions | Rule |
+===================+===================+===================+==================+
| Anonymous | Anonymous | Read | Allow |
+-------------------+-------------------+-------------------+------------------+
| Anonymous | Anonymous | Read / Write | Prohibit |
+-------------------+-------------------+-------------------+------------------+
| Anonymous | Named | Any | Prohibit |
+-------------------+-------------------+-------------------+------------------+
| Named | Anonymous | Any | Prohibit |
+-------------------+-------------------+-------------------+------------------+
| Named | Named | Any | Allow |
+-------------------+-------------------+-------------------+------------------+

Enabling Page Table Check
=========================

Build kernel with:

- PAGE_TABLE_CHECK=y
Note, it can only be enabled on platforms where ARCH_SUPPORTS_PAGE_TABLE_CHECK
is available.

- Boot with 'page_table_check=on' kernel parameter.

Optionally, build kernel with PAGE_TABLE_CHECK_ENFORCED in order to have page
table check support without extra kernel parameter.
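A quick way to confirm the feature is actually active on a booted system (the
config file location is the usual distribution path, and the shown output is
only an illustration)::

    # grep PAGE_TABLE_CHECK /boot/config-$(uname -r)
    CONFIG_ARCH_SUPPORTS_PAGE_TABLE_CHECK=y
    CONFIG_PAGE_TABLE_CHECK=y
    # CONFIG_PAGE_TABLE_CHECK_ENFORCED is not set
    # cat /proc/cmdline
    ... page_table_check=on ...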
Documentation/vm/vmalloced-kernel-stacks.rst (new file, 153 lines)
@ -0,0 +1,153 @@
.. SPDX-License-Identifier: GPL-2.0

=====================================
Virtually Mapped Kernel Stack Support
=====================================

:Author: Shuah Khan <skhan@linuxfoundation.org>

.. contents:: :local:

Overview
--------

This is a compilation of information from the code and original patch
series that introduced the `Virtually Mapped Kernel Stacks feature
<https://lwn.net/Articles/694348/>`

Introduction
------------

Kernel stack overflows are often hard to debug and make the kernel
susceptible to exploits. Problems could show up at a later time making
it difficult to isolate and root-cause.

Virtually-mapped kernel stacks with guard pages cause kernel stack
overflows to be caught immediately rather than causing difficult to
diagnose corruptions.

HAVE_ARCH_VMAP_STACK and VMAP_STACK configuration options enable
support for virtually mapped stacks with guard pages. This feature
causes reliable faults when the stack overflows. The usability of
the stack trace after overflow and response to the overflow itself
is architecture dependent.

.. note::
As of this writing, arm64, powerpc, riscv, s390, um, and x86 have
support for VMAP_STACK.

HAVE_ARCH_VMAP_STACK
--------------------

Architectures that can support Virtually Mapped Kernel Stacks should
enable this bool configuration option. The requirements are:

- vmalloc space must be large enough to hold many kernel stacks. This
may rule out many 32-bit architectures.
- Stacks in vmalloc space need to work reliably. For example, if
vmap page tables are created on demand, either this mechanism
needs to work while the stack points to a virtual address with
unpopulated page tables or arch code (switch_to() and switch_mm(),
most likely) needs to ensure that the stack's page table entries
are populated before running on a possibly unpopulated stack.
- If the stack overflows into a guard page, something reasonable
should happen. The definition of "reasonable" is flexible, but
instantly rebooting without logging anything would be unfriendly.

VMAP_STACK
----------

VMAP_STACK bool configuration option when enabled allocates virtually
mapped task stacks. This option depends on HAVE_ARCH_VMAP_STACK.

- Enable this if you want to use virtually-mapped kernel stacks
with guard pages. This causes kernel stack overflows to be caught
immediately rather than causing difficult-to-diagnose corruption.

.. note::

Using this feature with KASAN requires architecture support
for backing virtual mappings with real shadow memory, and
KASAN_VMALLOC must be enabled.

.. note::

With VMAP_STACK enabled, it is not possible to run DMA on stack
allocated data.

Kernel configuration options and dependencies keep changing. Refer to
the latest code base:

`Kconfig <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/Kconfig>`

Allocation
-----------

When a new kernel thread is created, thread stack is allocated from
virtually contiguous memory pages from the page level allocator. These
pages are mapped into contiguous kernel virtual space with PAGE_KERNEL
protections.

alloc_thread_stack_node() calls __vmalloc_node_range() to allocate stack
with PAGE_KERNEL protections.

- Allocated stacks are cached and later reused by new threads, so memcg
accounting is performed manually on assigning/releasing stacks to tasks.
Hence, __vmalloc_node_range is called without __GFP_ACCOUNT.
- vm_struct is cached to be able to find when thread free is initiated
in interrupt context. free_thread_stack() can be called in interrupt
context.
- On arm64, all VMAP's stacks need to have the same alignment to ensure
that VMAP'd stack overflow detection works correctly. Arch specific
vmap stack allocator takes care of this detail.
- This does not address interrupt stacks - according to the original patch

Thread stack allocation is initiated from clone(), fork(), vfork(),
kernel_thread() via kernel_clone(). Leaving a few hints for searching
the code base to understand when and how thread stack is allocated.

Bulk of the code is in:
`kernel/fork.c <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/fork.c>`.

stack_vm_area pointer in task_struct keeps track of the virtually allocated
stack and a non-null stack_vm_area pointer serves as an indication that the
virtually mapped kernel stacks are enabled.

::

struct vm_struct *stack_vm_area;

Stack overflow handling
-----------------------

Leading and trailing guard pages help detect stack overflows. When stack
overflows into the guard pages, handlers have to be careful not to overflow
the stack again. When handlers are called, it is likely that very little
stack space is left.

On x86, this is done by handling the page fault indicating the kernel
stack overflow on the double-fault stack.

Testing VMAP allocation with guard pages
----------------------------------------

How do we ensure that VMAP_STACK is actually allocating with a leading
and trailing guard page? The following lkdtm tests can help detect any
regressions.

::

void lkdtm_STACK_GUARD_PAGE_LEADING()
void lkdtm_STACK_GUARD_PAGE_TRAILING()
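These lkdtm probes can also be triggered from user space, assuming the kernel
was built with CONFIG_LKDTM and debugfs is mounted; note that a successful test
intentionally faults into the guard page and will oops the running kernel, so
only do this on a disposable test machine::

    # echo STACK_GUARD_PAGE_LEADING > /sys/kernel/debug/provoke-crash/DIRECT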
Conclusions
-----------

- A percpu cache of vmalloced stacks appears to be a bit faster than a
high-order stack allocation, at least when the cache hits.
- THREAD_INFO_IN_TASK gets rid of arch-specific thread_info entirely and
simply embed the thread_info (containing only flags) and 'int cpu' into
task_struct.
- The thread stack can be free'ed as soon as the task is dead (without
waiting for RCU) and then, if vmapped stacks are in use, cache the
entire stack for reuse on the same cpu.
@ -14541,6 +14541,15 @@ F: include/net/page_pool.h
F: include/trace/events/page_pool.h
F: net/core/page_pool.c

PAGE TABLE CHECK
M: Pasha Tatashin <pasha.tatashin@soleen.com>
M: Andrew Morton <akpm@linux-foundation.org>
L: linux-mm@kvack.org
S: Maintained
F: Documentation/vm/page_table_check.rst
F: include/linux/page_table_check.h
F: mm/page_table_check.c

PANASONIC LAPTOP ACPI EXTRAS DRIVER
M: Kenneth Chan <kenneth.t.chan@gmail.com>
L: platform-driver-x86@vger.kernel.org
@ -1297,6 +1297,9 @@ config HAVE_ARCH_PFN_VALID
config ARCH_SUPPORTS_DEBUG_PAGEALLOC
bool

config ARCH_SUPPORTS_PAGE_TABLE_CHECK
bool

config ARCH_SPLIT_ARG64
bool
help
@ -489,3 +489,4 @@
# 557 reserved for memfd_secret
558 common process_mrelease sys_process_mrelease
559 common futex_waitv sys_futex_waitv
560 common set_mempolicy_home_node sys_ni_syscall
@ -165,17 +165,15 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
BUG();
}
if (flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
/* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
/* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
goto retry;
}
goto retry;
}
mmap_read_unlock(mm);
@ -149,8 +149,7 @@ void do_page_fault(unsigned long address, struct pt_regs *regs)
/*
* Fault retry nuances, mmap_lock already relinquished by core mm
*/
if (unlikely((fault & VM_FAULT_RETRY) &&
(flags & FAULT_FLAG_ALLOW_RETRY))) {
if (unlikely(fault & VM_FAULT_RETRY)) {
flags |= FAULT_FLAG_TRIED;
goto retry;
}
@ -322,7 +322,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
return 0;
}
if (!(fault & VM_FAULT_ERROR) && flags & FAULT_FLAG_ALLOW_RETRY) {
if (!(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
goto retry;
@ -463,3 +463,4 @@
# 447 reserved for memfd_secret
448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
@ -38,7 +38,7 @@
#define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5)
#define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800)
#define __NR_compat_syscalls 450
#define __NR_compat_syscalls 451
#endif
#define __ARCH_WANT_SYS_CLONE
@ -905,6 +905,8 @@ __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self)
__SYSCALL(__NR_process_mrelease, sys_process_mrelease)
#define __NR_futex_waitv 449
__SYSCALL(__NR_futex_waitv, sys_futex_waitv)
#define __NR_set_mempolicy_home_node 450
__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
/*
* Please add new compat syscalls above this comment and update
@ -36,7 +36,7 @@ void *module_alloc(unsigned long size)
module_alloc_end = MODULES_END;
p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
module_alloc_end, gfp_mask, PAGE_KERNEL, 0,
module_alloc_end, gfp_mask, PAGE_KERNEL, VM_DEFER_KMEMLEAK,
NUMA_NO_NODE, __builtin_return_address(0));
if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
@ -58,7 +58,7 @@ void *module_alloc(unsigned long size)
PAGE_KERNEL, 0, NUMA_NO_NODE,
__builtin_return_address(0));
if (p && (kasan_module_alloc(p, size) < 0)) {
if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) {
vfree(p);
return NULL;
}
@ -608,10 +608,8 @@ static int __kprobes do_page_fault(unsigned long far, unsigned int esr,
}
if (fault & VM_FAULT_RETRY) {
if (mm_flags & FAULT_FLAG_ALLOW_RETRY) {
mm_flags |= FAULT_FLAG_TRIED;
goto retry;
}
mm_flags |= FAULT_FLAG_TRIED;
goto retry;
}
mmap_read_unlock(mm);
@ -98,11 +98,9 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs)
/* The most common case -- we are done. */
if (likely(!(fault & VM_FAULT_ERROR))) {
if (flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
goto retry;
}
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
goto retry;
}
mmap_read_unlock(mm);
@ -848,7 +848,7 @@ register_unwind_table (struct module *mod)
{
struct unw_table_entry *start = (void *) mod->arch.unwind->sh_addr;
struct unw_table_entry *end = start + mod->arch.unwind->sh_size / sizeof (*start);
struct unw_table_entry tmp, *e1, *e2, *core, *init;
struct unw_table_entry *e1, *e2, *core, *init;
unsigned long num_init = 0, num_core = 0;
/* First, count how many init and core unwind-table entries there are. */
@ -865,9 +865,7 @@ register_unwind_table (struct module *mod)
for (e1 = start; e1 < end; ++e1) {
for (e2 = e1 + 1; e2 < end; ++e2) {
if (e2->start_offset < e1->start_offset) {
tmp = *e1;
*e1 = *e2;
*e2 = tmp;
swap(*e1, *e2);
}
}
}
@ -208,10 +208,7 @@ sort_regions (struct rsvd_region *rsvd_region, int max)
while (max--) {
for (j = 0; j < max; ++j) {
if (rsvd_region[j].start > rsvd_region[j+1].start) {
struct rsvd_region tmp;
tmp = rsvd_region[j];
rsvd_region[j] = rsvd_region[j + 1];
rsvd_region[j + 1] = tmp;
swap(rsvd_region[j], rsvd_region[j + 1]);
}
}
}
@ -370,3 +370,4 @@
# 447 reserved for memfd_secret
448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
@ -264,6 +264,7 @@ static struct attribute * cache_default_attrs[] = {
&shared_cpu_map.attr,
NULL
};
ATTRIBUTE_GROUPS(cache_default);
#define to_object(k) container_of(k, struct cache_info, kobj)
#define to_attr(a) container_of(a, struct cache_attr, attr)
@ -284,7 +285,7 @@ static const struct sysfs_ops cache_sysfs_ops = {
static struct kobj_type cache_ktype = {
.sysfs_ops = &cache_sysfs_ops,
.default_attrs = cache_default_attrs,
.default_groups = cache_default_groups,
};
static struct kobj_type cache_ktype_percpu_entry = {
@ -171,7 +171,7 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid)
* @n_pages: number of contiguous pages to allocate
*
* Allocate the specified number of contiguous uncached pages on the
* the requested node. If not enough contiguous uncached pages are available
* requested node. If not enough contiguous uncached pages are available
* on the requested node, roundrobin starting with the next higher node.
*/
unsigned long uncached_alloc_page(int starting_nid, int n_pages)
@ -156,17 +156,15 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
BUG();
}
if (flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
/* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
/* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
goto retry;
}
goto retry;
}
mmap_read_unlock(mm);
@ -449,3 +449,4 @@
# 447 reserved for memfd_secret
448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
@ -153,18 +153,16 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
BUG();
}
if (flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
/*
* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
/*
* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
goto retry;
}
goto retry;
}
mmap_read_unlock(mm);
@ -455,3 +455,4 @@
# 447 reserved for memfd_secret
448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
@ -232,18 +232,16 @@ void do_page_fault(struct pt_regs *regs, unsigned long address,
BUG();
}
if (flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
/*
* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
/*
* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
goto retry;
}
goto retry;
}
mmap_read_unlock(mm);
@ -388,3 +388,4 @@
# 447 reserved for memfd_secret
448 n32 process_mrelease sys_process_mrelease
449 n32 futex_waitv sys_futex_waitv
450 n32 set_mempolicy_home_node sys_set_mempolicy_home_node
@ -364,3 +364,4 @@
# 447 reserved for memfd_secret
448 n64 process_mrelease sys_process_mrelease
449 n64 futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
@ -437,3 +437,4 @@
# 447 reserved for memfd_secret
448 o32 process_mrelease sys_process_mrelease
449 o32 futex_waitv sys_futex_waitv
450 o32 set_mempolicy_home_node sys_set_mempolicy_home_node
@ -171,18 +171,17 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write,
goto do_sigbus;
BUG();
}
if (flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
/*
* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
goto retry;
}
/*
* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
goto retry;
}
mmap_read_unlock(mm);
@ -230,16 +230,14 @@ void do_page_fault(unsigned long entry, unsigned long addr,
goto bad_area;
}
if (flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
/* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
goto retry;
}
/* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
goto retry;
}
mmap_read_unlock(mm);
@ -149,18 +149,16 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long cause,
BUG();
}
if (flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
/*
* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
/*
* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
goto retry;
}
goto retry;
}
mmap_read_unlock(mm);
@ -177,18 +177,16 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address,
BUG();
}
if (flags & FAULT_FLAG_ALLOW_RETRY) {
/*RGD modeled on Cris */
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
/*RGD modeled on Cris */
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
/* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
/* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
goto retry;
}
goto retry;
}
mmap_read_unlock(mm);
@ -447,3 +447,4 @@
# 447 reserved for memfd_secret
448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
@ -324,16 +324,14 @@ void do_page_fault(struct pt_regs *regs, unsigned long code,
goto bad_area;
BUG();
}
if (flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_RETRY) {
/*
* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
flags |= FAULT_FLAG_TRIED;
goto retry;
}
if (fault & VM_FAULT_RETRY) {
/*
* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
flags |= FAULT_FLAG_TRIED;
goto retry;
}
mmap_read_unlock(mm);
return;
@ -529,3 +529,4 @@
# 447 reserved for memfd_secret
448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv
450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node
@ -517,10 +517,8 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
* case.
*/
if (unlikely(fault & VM_FAULT_RETRY)) {
if (flags & FAULT_FLAG_ALLOW_RETRY) {
flags |= FAULT_FLAG_TRIED;
goto retry;
}
flags |= FAULT_FLAG_TRIED;
goto retry;
}
mmap_read_unlock(current->mm);
@ -330,7 +330,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs)
if (fault_signal_pending(fault, regs))
return;
if (unlikely((fault & VM_FAULT_RETRY) && (flags & FAULT_FLAG_ALLOW_RETRY))) {
if (unlikely(fault & VM_FAULT_RETRY)) {
flags |= FAULT_FLAG_TRIED;
/*
@ -37,14 +37,15 @@
void *module_alloc(unsigned long size)
{
gfp_t gfp_mask = GFP_KERNEL;
void *p;
if (PAGE_ALIGN(size) > MODULES_LEN)
return NULL;
p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END,
GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
gfp_mask, PAGE_KERNEL_EXEC, VM_DEFER_KMEMLEAK, NUMA_NO_NODE,
__builtin_return_address(0));
if (p && (kasan_module_alloc(p, size) < 0)) {
if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) {
vfree(p);
return NULL;
}
@ -452,3 +452,4 @@
# 447 reserved for memfd_secret
448 common process_mrelease sys_process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node
@ -452,21 +452,21 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
if (unlikely(fault & VM_FAULT_ERROR))
goto out_up;
if (flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_RETRY) {
if (IS_ENABLED(CONFIG_PGSTE) && gmap &&
(flags & FAULT_FLAG_RETRY_NOWAIT)) {
/* FAULT_FLAG_RETRY_NOWAIT has been set,
* mmap_lock has not been released */
current->thread.gmap_pfault = 1;
fault = VM_FAULT_PFAULT;
goto out_up;
}
flags &= ~FAULT_FLAG_RETRY_NOWAIT;
flags |= FAULT_FLAG_TRIED;
mmap_read_lock(mm);
goto retry;
if (fault & VM_FAULT_RETRY) {
if (IS_ENABLED(CONFIG_PGSTE) && gmap &&
(flags & FAULT_FLAG_RETRY_NOWAIT)) {
/*
* FAULT_FLAG_RETRY_NOWAIT has been set, mmap_lock has
* not been released
*/
current->thread.gmap_pfault = 1;
fault = VM_FAULT_PFAULT;
goto out_up;
}
flags &= ~FAULT_FLAG_RETRY_NOWAIT;
flags |= FAULT_FLAG_TRIED;
mmap_read_lock(mm);
goto retry;
}
if (IS_ENABLED(CONFIG_PGSTE) && gmap) {
address = __gmap_link(gmap, current->thread.gmap_addr,
@ -452,3 +452,4 @@
# 447 reserved for memfd_secret
448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
@ -485,17 +485,15 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
if (mm_fault_error(regs, error_code, address, fault))
return;
if (flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
/*
* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
goto retry;
}
/*
* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
goto retry;
}
mmap_read_unlock(mm);
@ -495,3 +495,4 @@
# 447 reserved for memfd_secret
448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
@ -200,17 +200,15 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write,
BUG();
}
if (flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
/* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
/* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
goto retry;
}
goto retry;
}
mmap_read_unlock(mm);
@ -437,17 +437,15 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
BUG();
}
if (flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
/* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
/* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
goto retry;
}
goto retry;
}
mmap_read_unlock(mm);
@ -87,12 +87,10 @@ int handle_page_fault(unsigned long address, unsigned long ip,
}
BUG();
}
if (flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
goto retry;
}
goto retry;
}
pmd = pmd_off(mm, address);
@ -104,6 +104,7 @@ config X86
select ARCH_SUPPORTS_ACPI
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_DEBUG_PAGEALLOC
select ARCH_SUPPORTS_PAGE_TABLE_CHECK if X86_64
select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096
select ARCH_SUPPORTS_LTO_CLANG
@ -454,3 +454,4 @@
447 i386 memfd_secret sys_memfd_secret
448 i386 process_mrelease sys_process_mrelease
449 i386 futex_waitv sys_futex_waitv
450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node
@ -371,6 +371,7 @@
447 common memfd_secret sys_memfd_secret
448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
#
# Due to a historical design error, certain syscalls are numbered differently
@ -27,6 +27,7 @@
|
||||
#include <asm/pkru.h>
|
||||
#include <asm/fpu/api.h>
|
||||
#include <asm-generic/pgtable_uffd.h>
|
||||
#include <linux/page_table_check.h>
|
||||
|
||||
extern pgd_t early_top_pgt[PTRS_PER_PGD];
|
||||
bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
|
||||
@ -753,7 +754,7 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
|
||||
return true;
|
||||
|
||||
if ((pte_flags(a) & _PAGE_PROTNONE) &&
|
||||
mm_tlb_flush_pending(mm))
|
||||
atomic_read(&mm->tlb_flush_pending))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
@ -1007,18 +1008,21 @@ static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp)
|
||||
static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
|
||||
pte_t *ptep, pte_t pte)
|
||||
{
|
||||
page_table_check_pte_set(mm, addr, ptep, pte);
|
||||
set_pte(ptep, pte);
|
||||
}
|
||||
|
||||
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
|
||||
pmd_t *pmdp, pmd_t pmd)
|
||||
{
|
||||
page_table_check_pmd_set(mm, addr, pmdp, pmd);
|
||||
set_pmd(pmdp, pmd);
|
||||
}
|
||||
|
||||
static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
|
||||
pud_t *pudp, pud_t pud)
|
||||
{
|
||||
page_table_check_pud_set(mm, addr, pudp, pud);
|
||||
native_set_pud(pudp, pud);
|
||||
}
|
||||
|
||||
@ -1049,6 +1053,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
|
||||
pte_t *ptep)
|
||||
{
|
||||
pte_t pte = native_ptep_get_and_clear(ptep);
|
||||
page_table_check_pte_clear(mm, addr, pte);
|
||||
return pte;
|
||||
}
|
||||
|
||||
@ -1064,12 +1069,23 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
|
||||
* care about updates and native needs no locking
|
||||
*/
|
||||
pte = native_local_ptep_get_and_clear(ptep);
|
||||
page_table_check_pte_clear(mm, addr, pte);
|
||||
} else {
|
||||
pte = ptep_get_and_clear(mm, addr, ptep);
|
||||
}
|
||||
return pte;
|
||||
}
|
||||
|
||||
#define __HAVE_ARCH_PTEP_CLEAR
|
||||
static inline void ptep_clear(struct mm_struct *mm, unsigned long addr,
|
||||
pte_t *ptep)
|
||||
{
|
||||
if (IS_ENABLED(CONFIG_PAGE_TABLE_CHECK))
|
||||
ptep_get_and_clear(mm, addr, ptep);
|
||||
else
|
||||
pte_clear(mm, addr, ptep);
|
||||
}
|
||||
|
||||
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
|
||||
static inline void ptep_set_wrprotect(struct mm_struct *mm,
|
||||
unsigned long addr, pte_t *ptep)
|
||||
@ -1110,14 +1126,22 @@ static inline int pmd_write(pmd_t pmd)
|
||||
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
|
||||
pmd_t *pmdp)
|
||||
{
|
||||
return native_pmdp_get_and_clear(pmdp);
|
||||
pmd_t pmd = native_pmdp_get_and_clear(pmdp);
|
||||
|
||||
page_table_check_pmd_clear(mm, addr, pmd);
|
||||
|
||||
return pmd;
|
||||
}
|
||||
|
||||
#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
|
||||
static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
|
||||
unsigned long addr, pud_t *pudp)
|
||||
{
|
||||
return native_pudp_get_and_clear(pudp);
|
||||
pud_t pud = native_pudp_get_and_clear(pudp);
|
||||
|
||||
page_table_check_pud_clear(mm, addr, pud);
|
||||
|
||||
return pud;
|
||||
}
|
||||
|
||||
#define __HAVE_ARCH_PMDP_SET_WRPROTECT
|
||||
@ -1138,6 +1162,7 @@ static inline int pud_write(pud_t pud)
|
||||
static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
|
||||
unsigned long address, pmd_t *pmdp, pmd_t pmd)
|
||||
{
|
||||
page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd);
|
||||
if (IS_ENABLED(CONFIG_SMP)) {
|
||||
return xchg(pmdp, pmd);
|
||||
} else {
|
||||
|
@ -67,6 +67,7 @@ static unsigned long int get_module_load_offset(void)
|
||||
|
||||
void *module_alloc(unsigned long size)
|
||||
{
|
||||
gfp_t gfp_mask = GFP_KERNEL;
|
||||
void *p;
|
||||
|
||||
if (PAGE_ALIGN(size) > MODULES_LEN)
|
||||
@ -74,10 +75,10 @@ void *module_alloc(unsigned long size)
|
||||
|
||||
p = __vmalloc_node_range(size, MODULE_ALIGN,
|
||||
MODULES_VADDR + get_module_load_offset(),
|
||||
MODULES_END, GFP_KERNEL,
|
||||
PAGE_KERNEL, 0, NUMA_NO_NODE,
|
||||
MODULES_END, gfp_mask,
|
||||
PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE,
|
||||
__builtin_return_address(0));
|
||||
if (p && (kasan_module_alloc(p, size) < 0)) {
|
||||
if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) {
|
||||
vfree(p);
|
||||
return NULL;
|
||||
}
|
||||
|
@@ -1413,8 +1413,7 @@ void do_user_addr_fault(struct pt_regs *regs,
* and if there is a fatal signal pending there is no guarantee
* that we made any progress. Handle this case first.
*/
if (unlikely((fault & VM_FAULT_RETRY) &&
(flags & FAULT_FLAG_ALLOW_RETRY))) {
if (unlikely(fault & VM_FAULT_RETRY)) {
flags |= FAULT_FLAG_TRIED;
goto retry;
}

@@ -420,3 +420,4 @@
# 447 reserved for memfd_secret
448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node

@@ -127,17 +127,16 @@ void do_page_fault(struct pt_regs *regs)
goto do_sigbus;
BUG();
}
if (flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;

/* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;

goto retry;
}
/* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/

goto retry;
}

mmap_read_unlock(mm);
@@ -1903,14 +1903,7 @@ static struct attribute *zram_disk_attrs[] = {
NULL,
};

static const struct attribute_group zram_disk_attr_group = {
.attrs = zram_disk_attrs,
};

static const struct attribute_group *zram_disk_attr_groups[] = {
&zram_disk_attr_group,
NULL,
};
ATTRIBUTE_GROUPS(zram_disk);

/*
* Allocate and initialize new zram device. the function returns

@@ -1983,7 +1976,7 @@ static int zram_add(void)
blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX);

blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue);
ret = device_add_disk(NULL, zram->disk, zram_disk_attr_groups);
ret = device_add_disk(NULL, zram->disk, zram_disk_groups);
if (ret)
goto out_cleanup_disk;
@@ -127,11 +127,35 @@ ATTRIBUTE_GROUPS(dax_drv);

static int dax_bus_match(struct device *dev, struct device_driver *drv);

/*
* Static dax regions are regions created by an external subsystem
* nvdimm where a single range is assigned. Its boundaries are by the external
* subsystem and are usually limited to one physical memory range. For example,
* for PMEM it is usually defined by NVDIMM Namespace boundaries (i.e. a
* single contiguous range)
*
* On dynamic dax regions, the assigned region can be partitioned by dax core
* into multiple subdivisions. A subdivision is represented into one
* /dev/daxN.M device composed by one or more potentially discontiguous ranges.
*
* When allocating a dax region, drivers must set whether it's static
* (IORESOURCE_DAX_STATIC). On static dax devices, the @pgmap is pre-assigned
* to dax core when calling devm_create_dev_dax(), whereas in dynamic dax
* devices it is NULL but afterwards allocated by dax core on device ->probe().
* Care is needed to make sure that dynamic dax devices are torn down with a
* cleared @pgmap field (see kill_dev_dax()).
*/
static bool is_static(struct dax_region *dax_region)
{
return (dax_region->res.flags & IORESOURCE_DAX_STATIC) != 0;
}

bool static_dev_dax(struct dev_dax *dev_dax)
{
return is_static(dev_dax->region);
}
EXPORT_SYMBOL_GPL(static_dev_dax);

static u64 dev_dax_size(struct dev_dax *dev_dax)
{
u64 size = 0;

@@ -361,6 +385,14 @@ void kill_dev_dax(struct dev_dax *dev_dax)

kill_dax(dax_dev);
unmap_mapping_range(inode->i_mapping, 0, 0, 1);

/*
* Dynamic dax region have the pgmap allocated via dev_kzalloc()
* and thus freed by devm. Clear the pgmap to not have stale pgmap
* ranges on probe() from previous reconfigurations of region devices.
*/
if (!static_dev_dax(dev_dax))
dev_dax->pgmap = NULL;
}
EXPORT_SYMBOL_GPL(kill_dev_dax);

@@ -39,6 +39,7 @@ int __dax_driver_register(struct dax_device_driver *dax_drv,
__dax_driver_register(driver, THIS_MODULE, KBUILD_MODNAME)
void dax_driver_unregister(struct dax_device_driver *dax_drv);
void kill_dev_dax(struct dev_dax *dev_dax);
bool static_dev_dax(struct dev_dax *dev_dax);

/*
* While run_dax() is potentially a generic operation that could be

@@ -73,11 +73,39 @@ __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
return -1;
}

static void dax_set_mapping(struct vm_fault *vmf, pfn_t pfn,
unsigned long fault_size)
{
unsigned long i, nr_pages = fault_size / PAGE_SIZE;
struct file *filp = vmf->vma->vm_file;
struct dev_dax *dev_dax = filp->private_data;
pgoff_t pgoff;

/* mapping is only set on the head */
if (dev_dax->pgmap->vmemmap_shift)
nr_pages = 1;

pgoff = linear_page_index(vmf->vma,
ALIGN(vmf->address, fault_size));

for (i = 0; i < nr_pages; i++) {
struct page *page = pfn_to_page(pfn_t_to_pfn(pfn) + i);

page = compound_head(page);
if (page->mapping)
continue;

page->mapping = filp->f_mapping;
page->index = pgoff + i;
}
}

static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
struct vm_fault *vmf, pfn_t *pfn)
struct vm_fault *vmf)
{
struct device *dev = &dev_dax->dev;
phys_addr_t phys;
pfn_t pfn;
unsigned int fault_size = PAGE_SIZE;

if (check_vma(dev_dax, vmf->vma, __func__))

@@ -98,18 +126,21 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
return VM_FAULT_SIGBUS;
}

*pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);

return vmf_insert_mixed(vmf->vma, vmf->address, *pfn);
dax_set_mapping(vmf, pfn, fault_size);

return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
}

static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
struct vm_fault *vmf, pfn_t *pfn)
struct vm_fault *vmf)
{
unsigned long pmd_addr = vmf->address & PMD_MASK;
struct device *dev = &dev_dax->dev;
phys_addr_t phys;
pgoff_t pgoff;
pfn_t pfn;
unsigned int fault_size = PMD_SIZE;

if (check_vma(dev_dax, vmf->vma, __func__))

@@ -138,19 +169,22 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
return VM_FAULT_SIGBUS;
}

*pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);

return vmf_insert_pfn_pmd(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE);
dax_set_mapping(vmf, pfn, fault_size);

return vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
struct vm_fault *vmf, pfn_t *pfn)
struct vm_fault *vmf)
{
unsigned long pud_addr = vmf->address & PUD_MASK;
struct device *dev = &dev_dax->dev;
phys_addr_t phys;
pgoff_t pgoff;
pfn_t pfn;
unsigned int fault_size = PUD_SIZE;

@@ -180,13 +214,15 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
return VM_FAULT_SIGBUS;
}

*pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);

return vmf_insert_pfn_pud(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE);
dax_set_mapping(vmf, pfn, fault_size);

return vmf_insert_pfn_pud(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
}
#else
static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
struct vm_fault *vmf, pfn_t *pfn)
struct vm_fault *vmf)
{
return VM_FAULT_FALLBACK;
}

@@ -196,10 +232,8 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf,
enum page_entry_size pe_size)
{
struct file *filp = vmf->vma->vm_file;
unsigned long fault_size;
vm_fault_t rc = VM_FAULT_SIGBUS;
int id;
pfn_t pfn;
struct dev_dax *dev_dax = filp->private_data;

dev_dbg(&dev_dax->dev, "%s: %s (%#lx - %#lx) size = %d\n", current->comm,

@@ -209,43 +243,18 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf,
id = dax_read_lock();
switch (pe_size) {
case PE_SIZE_PTE:
fault_size = PAGE_SIZE;
rc = __dev_dax_pte_fault(dev_dax, vmf, &pfn);
rc = __dev_dax_pte_fault(dev_dax, vmf);
break;
case PE_SIZE_PMD:
fault_size = PMD_SIZE;
rc = __dev_dax_pmd_fault(dev_dax, vmf, &pfn);
rc = __dev_dax_pmd_fault(dev_dax, vmf);
break;
case PE_SIZE_PUD:
fault_size = PUD_SIZE;
rc = __dev_dax_pud_fault(dev_dax, vmf, &pfn);
rc = __dev_dax_pud_fault(dev_dax, vmf);
break;
default:
rc = VM_FAULT_SIGBUS;
}

if (rc == VM_FAULT_NOPAGE) {
unsigned long i;
pgoff_t pgoff;

/*
* In the device-dax case the only possibility for a
* VM_FAULT_NOPAGE result is when device-dax capacity is
* mapped. No need to consider the zero page, or racing
* conflicting mappings.
*/
pgoff = linear_page_index(vmf->vma, vmf->address
& ~(fault_size - 1));
for (i = 0; i < fault_size / PAGE_SIZE; i++) {
struct page *page;

page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
if (page->mapping)
continue;
page->mapping = filp->f_mapping;
page->index = pgoff + i;
}
}
dax_read_unlock(id);

return rc;

@@ -398,17 +407,34 @@ int dev_dax_probe(struct dev_dax *dev_dax)
void *addr;
int rc, i;

pgmap = dev_dax->pgmap;
if (dev_WARN_ONCE(dev, pgmap && dev_dax->nr_range > 1,
"static pgmap / multi-range device conflict\n"))
return -EINVAL;
if (static_dev_dax(dev_dax)) {
if (dev_dax->nr_range > 1) {
dev_warn(dev,
"static pgmap / multi-range device conflict\n");
return -EINVAL;
}

if (!pgmap) {
pgmap = devm_kzalloc(dev, sizeof(*pgmap) + sizeof(struct range)
* (dev_dax->nr_range - 1), GFP_KERNEL);
pgmap = dev_dax->pgmap;
} else {
if (dev_dax->pgmap) {
dev_warn(dev,
"dynamic-dax with pre-populated page map\n");
return -EINVAL;
}

pgmap = devm_kzalloc(dev,
struct_size(pgmap, ranges, dev_dax->nr_range - 1),
GFP_KERNEL);
if (!pgmap)
return -ENOMEM;

pgmap->nr_range = dev_dax->nr_range;
dev_dax->pgmap = pgmap;

for (i = 0; i < dev_dax->nr_range; i++) {
struct range *range = &dev_dax->ranges[i].range;
pgmap->ranges[i] = *range;
}
}

for (i = 0; i < dev_dax->nr_range; i++) {

@@ -420,12 +446,12 @@ int dev_dax_probe(struct dev_dax *dev_dax)
i, range->start, range->end);
return -EBUSY;
}
/* don't update the range for static pgmap */
if (!dev_dax->pgmap)
pgmap->ranges[i] = *range;
}

pgmap->type = MEMORY_DEVICE_GENERIC;
if (dev_dax->align > PAGE_SIZE)
pgmap->vmemmap_shift =
order_base_2(dev_dax->align >> PAGE_SHIFT);
addr = devm_memremap_pages(dev, pgmap);
if (IS_ERR(addr))
return PTR_ERR(addr);
@@ -98,15 +98,14 @@ static int siw_create_tx_threads(void)
continue;

siw_tx_thread[cpu] =
kthread_create(siw_run_sq, (unsigned long *)(long)cpu,
"siw_tx/%d", cpu);
kthread_run_on_cpu(siw_run_sq,
(unsigned long *)(long)cpu,
cpu, "siw_tx/%u");
if (IS_ERR(siw_tx_thread[cpu])) {
siw_tx_thread[cpu] = NULL;
continue;
}
kthread_bind(siw_tx_thread[cpu], cpu);

wake_up_process(siw_tx_thread[cpu]);
assigned++;
}
return assigned;

@@ -26,6 +26,7 @@
#include <linux/serial_core.h>
#include <linux/sysfs.h>
#include <linux/random.h>
#include <linux/kmemleak.h>

#include <asm/setup.h> /* for COMMAND_LINE_SIZE */
#include <asm/page.h>

@@ -524,9 +525,12 @@ static int __init __reserved_mem_reserve_reg(unsigned long node,
size = dt_mem_next_cell(dt_root_size_cells, &prop);

if (size &&
early_init_dt_reserve_memory_arch(base, size, nomap) == 0)
early_init_dt_reserve_memory_arch(base, size, nomap) == 0) {
pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n",
uname, &base, (unsigned long)(size / SZ_1M));
if (!nomap)
kmemleak_alloc_phys(base, size, 0, 0);
}
else
pr_info("Reserved memory: failed to reserve memory for node '%s': base %pa, size %lu MiB\n",
uname, &base, (unsigned long)(size / SZ_1M));
@@ -27,8 +27,8 @@
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/fiemap.h>
#include <linux/backing-dev.h>
#include <linux/iomap.h>
#include <linux/sched/mm.h>
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "xattr.h"

@@ -4404,8 +4404,7 @@ int ext4_ext_truncate(handle_t *handle, struct inode *inode)
err = ext4_es_remove_extent(inode, last_block,
EXT_MAX_BLOCKS - last_block);
if (err == -ENOMEM) {
cond_resched();
congestion_wait(BLK_RW_ASYNC, HZ/50);
memalloc_retry_wait(GFP_ATOMIC);
goto retry;
}
if (err)

@@ -4413,8 +4412,7 @@ int ext4_ext_truncate(handle_t *handle, struct inode *inode)
retry_remove_space:
err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
if (err == -ENOMEM) {
cond_resched();
congestion_wait(BLK_RW_ASYNC, HZ/50);
memalloc_retry_wait(GFP_ATOMIC);
goto retry_remove_space;
}
return err;

@@ -7,7 +7,7 @@
#include <linux/iomap.h>
#include <linux/fiemap.h>
#include <linux/iversion.h>
#include <linux/backing-dev.h>
#include <linux/sched/mm.h>

#include "ext4_jbd2.h"
#include "ext4.h"

@@ -1929,8 +1929,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
retry:
err = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
if (err == -ENOMEM) {
cond_resched();
congestion_wait(BLK_RW_ASYNC, HZ/50);
memalloc_retry_wait(GFP_ATOMIC);
goto retry;
}
if (err)

@@ -24,7 +24,7 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/backing-dev.h>
#include <linux/sched/mm.h>

#include "ext4_jbd2.h"
#include "xattr.h"

@@ -523,12 +523,13 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
ret = PTR_ERR(bounce_page);
if (ret == -ENOMEM &&
(io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) {
gfp_flags = GFP_NOFS;
gfp_t new_gfp_flags = GFP_NOFS;
if (io->io_bio)
ext4_io_submit(io);
else
gfp_flags |= __GFP_NOFAIL;
congestion_wait(BLK_RW_ASYNC, HZ/50);
new_gfp_flags |= __GFP_NOFAIL;
memalloc_retry_wait(gfp_flags);
gfp_flags = new_gfp_flags;
goto retry_encrypt;
}

@@ -8,9 +8,9 @@
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
#include <linux/buffer_head.h>
#include <linux/sched/mm.h>
#include <linux/mpage.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/bio.h>

@@ -2542,7 +2542,7 @@ int f2fs_encrypt_one_page(struct f2fs_io_info *fio)
/* flush pending IOs and wait for a while in the ENOMEM case */
if (PTR_ERR(fio->encrypted_page) == -ENOMEM) {
f2fs_flush_merged_writes(fio->sbi);
congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
memalloc_retry_wait(GFP_NOFS);
gfp_flags |= __GFP_NOFAIL;
goto retry_encrypt;
}

@@ -7,7 +7,6 @@
*/
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/f2fs_fs.h>
#include <linux/kthread.h>

@@ -15,6 +14,7 @@
#include <linux/freezer.h>
#include <linux/sched/signal.h>
#include <linux/random.h>
#include <linux/sched/mm.h>

#include "f2fs.h"
#include "node.h"

@@ -1375,8 +1375,7 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type,
if (err) {
clear_page_private_gcing(page);
if (err == -ENOMEM) {
congestion_wait(BLK_RW_ASYNC,
DEFAULT_IO_TIMEOUT);
memalloc_retry_wait(GFP_NOFS);
goto retry;
}
if (is_dirty)

@@ -8,8 +8,8 @@
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
#include <linux/buffer_head.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/sched/mm.h>

#include "f2fs.h"
#include "node.h"

@@ -562,7 +562,7 @@ struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino)
inode = f2fs_iget(sb, ino);
if (IS_ERR(inode)) {
if (PTR_ERR(inode) == -ENOMEM) {
congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
memalloc_retry_wait(GFP_NOFS);
goto retry;
}
}

@@ -8,7 +8,7 @@
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
#include <linux/mpage.h>
#include <linux/backing-dev.h>
#include <linux/sched/mm.h>
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/swap.h>

@@ -2750,7 +2750,7 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
retry:
ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false);
if (!ipage) {
congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
memalloc_retry_wait(GFP_NOFS);
goto retry;
}

@@ -8,6 +8,7 @@
#include <asm/unaligned.h>
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
#include <linux/sched/mm.h>
#include "f2fs.h"
#include "node.h"
#include "segment.h"

@@ -587,7 +588,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
err = f2fs_get_dnode_of_data(&dn, start, ALLOC_NODE);
if (err) {
if (err == -ENOMEM) {
congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
memalloc_retry_wait(GFP_NOFS);
goto retry_dn;
}
goto out;

@@ -670,8 +671,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
err = check_index_in_prev_nodes(sbi, dest, &dn);
if (err) {
if (err == -ENOMEM) {
congestion_wait(BLK_RW_ASYNC,
DEFAULT_IO_TIMEOUT);
memalloc_retry_wait(GFP_NOFS);
goto retry_prev;
}
goto err;

@@ -9,6 +9,7 @@
#include <linux/f2fs_fs.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/sched/mm.h>
#include <linux/prefetch.h>
#include <linux/kthread.h>
#include <linux/swap.h>

@@ -245,9 +246,7 @@ static int __revoke_inmem_pages(struct inode *inode,
LOOKUP_NODE);
if (err) {
if (err == -ENOMEM) {
congestion_wait(BLK_RW_ASYNC,
DEFAULT_IO_TIMEOUT);
cond_resched();
memalloc_retry_wait(GFP_NOFS);
goto retry;
}
err = -EAGAIN;

@@ -424,9 +423,7 @@ static int __f2fs_commit_inmem_pages(struct inode *inode)
err = f2fs_do_write_data_page(&fio);
if (err) {
if (err == -ENOMEM) {
congestion_wait(BLK_RW_ASYNC,
DEFAULT_IO_TIMEOUT);
cond_resched();
memalloc_retry_wait(GFP_NOFS);
goto retry;
}
unlock_page(page);

@@ -8,9 +8,9 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/sched/mm.h>
#include <linux/statfs.h>
#include <linux/buffer_head.h>
#include <linux/backing-dev.h>
#include <linux/kthread.h>
#include <linux/parser.h>
#include <linux/mount.h>

@@ -2415,8 +2415,7 @@ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data,
page = read_cache_page_gfp(mapping, blkidx, GFP_NOFS);
if (IS_ERR(page)) {
if (PTR_ERR(page) == -ENOMEM) {
congestion_wait(BLK_RW_ASYNC,
DEFAULT_IO_TIMEOUT);
memalloc_retry_wait(GFP_NOFS);
goto repeat;
}
set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
@@ -409,10 +409,11 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
struct vm_area_struct *vma;

/*
* end == 0 indicates that the entire range after
* start should be unmapped.
* end == 0 indicates that the entire range after start should be
* unmapped. Note, end is exclusive, whereas the interval tree takes
* an inclusive "last".
*/
vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) {
unsigned long v_offset;
unsigned long v_end;
fs/inode.c
@@ -526,6 +526,55 @@ void __remove_inode_hash(struct inode *inode)
}
EXPORT_SYMBOL(__remove_inode_hash);

void dump_mapping(const struct address_space *mapping)
{
struct inode *host;
const struct address_space_operations *a_ops;
struct hlist_node *dentry_first;
struct dentry *dentry_ptr;
struct dentry dentry;
unsigned long ino;

/*
* If mapping is an invalid pointer, we don't want to crash
* accessing it, so probe everything depending on it carefully.
*/
if (get_kernel_nofault(host, &mapping->host) ||
get_kernel_nofault(a_ops, &mapping->a_ops)) {
pr_warn("invalid mapping:%px\n", mapping);
return;
}

if (!host) {
pr_warn("aops:%ps\n", a_ops);
return;
}

if (get_kernel_nofault(dentry_first, &host->i_dentry.first) ||
get_kernel_nofault(ino, &host->i_ino)) {
pr_warn("aops:%ps invalid inode:%px\n", a_ops, host);
return;
}

if (!dentry_first) {
pr_warn("aops:%ps ino:%lx\n", a_ops, ino);
return;
}

dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias);
if (get_kernel_nofault(dentry, dentry_ptr)) {
pr_warn("aops:%ps ino:%lx invalid dentry:%px\n",
a_ops, ino, dentry_ptr);
return;
}

/*
* if dentry is corrupted, the %pd handler may still crash,
* but it's unlikely that we reach here with a corrupt mapping
*/
pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n", a_ops, ino, &dentry);
}

void clear_inode(struct inode *inode)
{
/*
@@ -430,7 +430,7 @@ static int ioctl_file_dedupe_range(struct file *file,
goto out;
}

size = offsetof(struct file_dedupe_range __user, info[count]);
size = offsetof(struct file_dedupe_range, info[count]);
if (size > PAGE_SIZE) {
ret = -ENOMEM;
goto out;

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/**
/*
* attrib.c - NTFS attribute operations. Part of the Linux-NTFS project.
*
* Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.

@@ -2040,7 +2040,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
int i, idx;
struct ocfs2_extent_list *el, *left_el, *right_el;
struct ocfs2_extent_rec *left_rec, *right_rec;
struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
struct buffer_head *root_bh;

/*
* Update the counts and position values within all the

@@ -1799,20 +1799,20 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
*/
ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
cluster_of_pages, mmap_page);
if (ret && ret != -EAGAIN) {
mlog_errno(ret);
goto out_quota;
}
if (ret) {
/*
* ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock
* the target page. In this case, we exit with no error and no target
* page. This will trigger the caller, page_mkwrite(), to re-try
* the operation.
*/
if (type == OCFS2_WRITE_MMAP && ret == -EAGAIN) {
BUG_ON(wc->w_target_page);
ret = 0;
goto out_quota;
}

/*
* ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock
* the target page. In this case, we exit with no error and no target
* page. This will trigger the caller, page_mkwrite(), to re-try
* the operation.
*/
if (ret == -EAGAIN) {
BUG_ON(wc->w_target_page);
ret = 0;
mlog_errno(ret);
goto out_quota;
}

@@ -120,7 +120,8 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
define_mask(KTHREAD),
};

static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
static struct attribute *mlog_default_attrs[MLOG_MAX_BITS] = {NULL, };
ATTRIBUTE_GROUPS(mlog_default);

static ssize_t mlog_show(struct kobject *obj, struct attribute *attr,
char *buf)

@@ -144,8 +145,8 @@ static const struct sysfs_ops mlog_attr_ops = {
};

static struct kobj_type mlog_ktype = {
.default_attrs = mlog_attr_ptrs,
.sysfs_ops = &mlog_attr_ops,
.default_groups = mlog_default_groups,
.sysfs_ops = &mlog_attr_ops,
};

static struct kset mlog_kset = {

@@ -157,10 +158,10 @@ int mlog_sys_init(struct kset *o2cb_kset)
int i = 0;

while (mlog_attrs[i].attr.mode) {
mlog_attr_ptrs[i] = &mlog_attrs[i].attr;
mlog_default_attrs[i] = &mlog_attrs[i].attr;
i++;
}
mlog_attr_ptrs[i] = NULL;
mlog_default_attrs[i] = NULL;

kobject_set_name(&mlog_kset.kobj, "logmask");
mlog_kset.kobj.kset = o2cb_kset;

@@ -3343,7 +3343,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
struct ocfs2_dir_entry *de, *last_de = NULL;
char *de_buf, *limit;
unsigned long offset = 0;
unsigned int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize;
unsigned int rec_len, new_rec_len, free_space;

/*
* This calculates how many free bytes we'd have in block zero, should

@@ -94,6 +94,7 @@ static struct attribute *ocfs2_filecheck_attrs[] = {
&ocfs2_filecheck_attr_set.attr,
NULL
};
ATTRIBUTE_GROUPS(ocfs2_filecheck);

static void ocfs2_filecheck_release(struct kobject *kobj)
{

@@ -138,7 +139,7 @@ static const struct sysfs_ops ocfs2_filecheck_ops = {
};

static struct kobj_type ocfs2_ktype_filecheck = {
.default_attrs = ocfs2_filecheck_attrs,
.default_groups = ocfs2_filecheck_groups,
.sysfs_ops = &ocfs2_filecheck_ops,
.release = ocfs2_filecheck_release,
};

@@ -1669,8 +1669,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
status = jbd2_journal_load(journal);
if (status < 0) {
mlog_errno(status);
if (!igrab(inode))
BUG();
BUG_ON(!igrab(inode));
jbd2_journal_destroy(journal);
goto done;
}

@@ -1699,8 +1698,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
if (status < 0)
mlog_errno(status);

if (!igrab(inode))
BUG();
BUG_ON(!igrab(inode));

jbd2_journal_destroy(journal);
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/vmacache.h>
#include <linux/mm_inline.h>
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
#include <linux/mount.h>

@@ -308,6 +309,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)

name = arch_vma_name(vma);
if (!name) {
const char *anon_name;

if (!mm) {
name = "[vdso]";
goto done;

@@ -319,8 +322,16 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
goto done;
}

if (is_stack(vma))
if (is_stack(vma)) {
name = "[stack]";
goto done;
}

anon_name = vma_anon_name(vma);
if (anon_name) {
seq_pad(m, ' ');
seq_printf(m, "[anon:%s]", anon_name);
}
}

done:

@@ -29,6 +29,7 @@
#include <linux/module.h>
#include <linux/magic.h>
#include <linux/xattr.h>
#include <linux/backing-dev.h>

#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"

@@ -112,6 +113,24 @@ static const struct squashfs_decompressor *supported_squashfs_filesystem(
return decompressor;
}

static int squashfs_bdi_init(struct super_block *sb)
{
int err;
unsigned int major = MAJOR(sb->s_dev);
unsigned int minor = MINOR(sb->s_dev);

bdi_put(sb->s_bdi);
sb->s_bdi = &noop_backing_dev_info;

err = super_setup_bdi_name(sb, "squashfs_%u_%u", major, minor);
if (err)
return err;

sb->s_bdi->ra_pages = 0;
sb->s_bdi->io_pages = 0;

return 0;
}

static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
{

@@ -127,6 +146,20 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)

TRACE("Entered squashfs_fill_superblock\n");

/*
* squashfs provides 'backing_dev_info' in order to disable read-ahead. For
* squashfs, I/O is not deferred, it is done immediately in readpage,
* which means the user would always have to wait their own I/O. So the effect
* of readahead is very weak for squashfs. squashfs_bdi_init will set
* sb->s_bdi->ra_pages and sb->s_bdi->io_pages to 0 and close readahead for
* squashfs.
*/
err = squashfs_bdi_init(sb);
if (err) {
errorf(fc, "squashfs init bdi failed");
return err;
}

sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL);
if (sb->s_fs_info == NULL) {
ERROR("Failed to allocate squashfs_sb_info\n");
@@ -15,6 +15,7 @@
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/poll.h>
#include <linux/slab.h>

@@ -877,7 +878,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
new_flags, vma->anon_vma,
vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
NULL_VM_UFFD_CTX);
NULL_VM_UFFD_CTX, vma_anon_name(vma));
if (prev)
vma = prev;
else

@@ -1436,7 +1437,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
prev = vma_merge(mm, prev, start, vma_end, new_flags,
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
((struct vm_userfaultfd_ctx){ ctx }));
((struct vm_userfaultfd_ctx){ ctx }),
vma_anon_name(vma));
if (prev) {
vma = prev;
goto next;

@@ -1613,7 +1615,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
prev = vma_merge(mm, prev, start, vma_end, new_flags,
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
NULL_VM_UFFD_CTX);
NULL_VM_UFFD_CTX, vma_anon_name(vma));
if (prev) {
vma = prev;
goto next;

@@ -4,7 +4,6 @@
* All Rights Reserved.
*/
#include "xfs.h"
#include <linux/backing-dev.h>
#include "xfs_message.h"
#include "xfs_trace.h"

@@ -26,6 +25,6 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
"%s(%u) possible memory allocation deadlock size %u in %s (mode:0x%x)",
current->comm, current->pid,
(unsigned int)size, __func__, lflags);
congestion_wait(BLK_RW_ASYNC, HZ/50);
memalloc_retry_wait(lflags);
} while (1);
}

@@ -394,7 +394,7 @@ xfs_buf_alloc_pages(
}

XFS_STATS_INC(bp->b_mount, xb_page_retries);
congestion_wait(BLK_RW_ASYNC, HZ / 50);
memalloc_retry_wait(gfp_mask);
}
return 0;
}

@@ -295,7 +295,6 @@ extern bool libceph_compatible(void *data);

extern const char *ceph_msg_type_name(int type);
extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
extern void *ceph_kvmalloc(size_t size, gfp_t flags);

struct fs_parameter;
struct fc_log;
@@ -11,12 +11,19 @@
#include <linux/mutex.h>
#include <linux/time64.h>
#include <linux/types.h>
#include <linux/random.h>

/* Minimal region size. Every damon_region is aligned by this. */
#define DAMON_MIN_REGION PAGE_SIZE
/* Max priority score for DAMON-based operation schemes */
#define DAMOS_MAX_SCORE (99)

/* Get a random number in [l, r) */
static inline unsigned long damon_rand(unsigned long l, unsigned long r)
{
return l + prandom_u32_max(r - l);
}

/**
* struct damon_addr_range - Represents an address region of [@start, @end).
* @start: Start address of the region (inclusive).

@@ -185,6 +192,22 @@ struct damos_watermarks {
bool activated;
};

/**
* struct damos_stat - Statistics on a given scheme.
* @nr_tried: Total number of regions that the scheme is tried to be applied.
* @sz_tried: Total size of regions that the scheme is tried to be applied.
* @nr_applied: Total number of regions that the scheme is applied.
* @sz_applied: Total size of regions that the scheme is applied.
* @qt_exceeds: Total number of times the quota of the scheme has exceeded.
*/
struct damos_stat {
unsigned long nr_tried;
unsigned long sz_tried;
unsigned long nr_applied;
unsigned long sz_applied;
unsigned long qt_exceeds;
};

/**
* struct damos - Represents a Data Access Monitoring-based Operation Scheme.
* @min_sz_region: Minimum size of target regions.

@@ -196,8 +219,7 @@ struct damos_watermarks {
* @action: &damo_action to be applied to the target regions.
* @quota: Control the aggressiveness of this scheme.
* @wmarks: Watermarks for automated (in)activation of this scheme.
* @stat_count: Total number of regions that this scheme is applied.
* @stat_sz: Total size of regions that this scheme is applied.
* @stat: Statistics of this scheme.
* @list: List head for siblings.
*
* For each aggregation interval, DAMON finds regions which fit in the

@@ -228,8 +250,7 @@ struct damos {
enum damos_action action;
struct damos_quota quota;
struct damos_watermarks wmarks;
unsigned long stat_count;
unsigned long stat_sz;
struct damos_stat stat;
struct list_head list;
};

@@ -274,7 +295,8 @@ struct damon_ctx;
* as an integer in [0, &DAMOS_MAX_SCORE].
* @apply_scheme is called from @kdamond when a region for user provided
* DAMON-based operation scheme is found. It should apply the scheme's action
* to the region. This is not used for &DAMON_ARBITRARY_TARGET case.
* to the region and return bytes of the region that the action is successfully
* applied.
* @target_valid should check whether the target is still valid for the
* monitoring.
* @cleanup is called from @kdamond just before its termination.

@@ -288,8 +310,9 @@ struct damon_primitive {
int (*get_scheme_score)(struct damon_ctx *context,
struct damon_target *t, struct damon_region *r,
struct damos *scheme);
int (*apply_scheme)(struct damon_ctx *context, struct damon_target *t,
struct damon_region *r, struct damos *scheme);
unsigned long (*apply_scheme)(struct damon_ctx *context,
struct damon_target *t, struct damon_region *r,
struct damos *scheme);
bool (*target_valid)(void *target);
void (*cleanup)(struct damon_ctx *context);
};

@@ -392,14 +415,20 @@ struct damon_ctx {
struct list_head schemes;
};

#define damon_next_region(r) \
(container_of(r->list.next, struct damon_region, list))
static inline struct damon_region *damon_next_region(struct damon_region *r)
{
return container_of(r->list.next, struct damon_region, list);
}

#define damon_prev_region(r) \
(container_of(r->list.prev, struct damon_region, list))
static inline struct damon_region *damon_prev_region(struct damon_region *r)
{
return container_of(r->list.prev, struct damon_region, list);
}

#define damon_last_region(t) \
(list_last_entry(&t->regions_list, struct damon_region, list))
static inline struct damon_region *damon_last_region(struct damon_target *t)
{
return list_last_entry(&t->regions_list, struct damon_region, list);
}

#define damon_for_each_region(r, t) \
list_for_each_entry(r, &t->regions_list, list)

@@ -422,9 +451,18 @@ struct damon_ctx {
#ifdef CONFIG_DAMON

struct damon_region *damon_new_region(unsigned long start, unsigned long end);
inline void damon_insert_region(struct damon_region *r,

/*
* Add a region between two other regions
*/
static inline void damon_insert_region(struct damon_region *r,
struct damon_region *prev, struct damon_region *next,
struct damon_target *t);
struct damon_target *t)
{
__list_add(&r->list, &prev->list, &next->list);
t->nr_regions++;
}

void damon_add_region(struct damon_region *r, struct damon_target *t);
void damon_destroy_region(struct damon_region *r, struct damon_target *t);

@@ -461,34 +499,13 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs);
#endif /* CONFIG_DAMON */

#ifdef CONFIG_DAMON_VADDR

/* Monitoring primitives for virtual memory address spaces */
void damon_va_init(struct damon_ctx *ctx);
void damon_va_update(struct damon_ctx *ctx);
void damon_va_prepare_access_checks(struct damon_ctx *ctx);
unsigned int damon_va_check_accesses(struct damon_ctx *ctx);
bool damon_va_target_valid(void *t);
void damon_va_cleanup(struct damon_ctx *ctx);
int damon_va_apply_scheme(struct damon_ctx *context, struct damon_target *t,
struct damon_region *r, struct damos *scheme);
int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t,
struct damon_region *r, struct damos *scheme);
void damon_va_set_primitives(struct damon_ctx *ctx);

#endif /* CONFIG_DAMON_VADDR */

#ifdef CONFIG_DAMON_PADDR

/* Monitoring primitives for the physical memory address space */
void damon_pa_prepare_access_checks(struct damon_ctx *ctx);
unsigned int damon_pa_check_accesses(struct damon_ctx *ctx);
bool damon_pa_target_valid(void *t);
int damon_pa_apply_scheme(struct damon_ctx *context, struct damon_target *t,
struct damon_region *r, struct damos *scheme);
int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t,
struct damon_region *r, struct damos *scheme);
void damon_pa_set_primitives(struct damon_ctx *ctx);

#endif /* CONFIG_DAMON_PADDR */

#endif /* _DAMON_H */
@@ -3093,6 +3093,7 @@ extern void unlock_new_inode(struct inode *);
extern void discard_new_inode(struct inode *);
extern unsigned int get_next_ino(void);
extern void evict_inodes(struct super_block *sb);
void dump_mapping(const struct address_space *);

/*
* Userspace may rely on the the inode number being non-zero. For example, glibc

@@ -302,7 +302,9 @@ struct vm_area_struct;
* lowest zone as a type of emergency reserve.
*
* %GFP_DMA32 is similar to %GFP_DMA except that the caller requires a 32-bit
* address.
* address. Note that kmalloc(..., GFP_DMA32) does not return DMA32 memory
* because the DMA32 kmalloc cache array is not implemented.
* (Reason: there is no such user in kernel).
*
* %GFP_HIGHUSER is for userspace allocations that may be mapped to userspace,
* do not need to be directly accessible by the kernel but that cannot

@@ -598,9 +600,9 @@ struct page *alloc_pages(gfp_t gfp, unsigned int order);
struct folio *folio_alloc(gfp_t gfp, unsigned order);
extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
struct vm_area_struct *vma, unsigned long addr,
int node, bool hugepage);
bool hugepage);
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true)
alloc_pages_vma(gfp_mask, order, vma, addr, true)
#else
static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
{

@@ -610,14 +612,14 @@ static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order)
{
return __folio_alloc_node(gfp, order, numa_node_id());
}
#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\
#define alloc_pages_vma(gfp_mask, order, vma, addr, false)\
alloc_pages(gfp_mask, order)
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
alloc_pages(gfp_mask, order)
#endif
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
#define alloc_page_vma(gfp_mask, vma, addr) \
alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false)
alloc_pages_vma(gfp_mask, 0, vma, addr, false)

extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
extern unsigned long get_zeroed_page(gfp_t gfp_mask);

@@ -622,8 +622,8 @@ struct hstate {
#endif
#ifdef CONFIG_CGROUP_HUGETLB
/* cgroup control files */
struct cftype cgroup_files_dfl[7];
struct cftype cgroup_files_legacy[9];
struct cftype cgroup_files_dfl[8];
struct cftype cgroup_files_legacy[10];
#endif
char name[HSTATE_NAME_LEN];
};

@@ -36,6 +36,11 @@ enum hugetlb_memory_event {
HUGETLB_NR_MEMORY_EVENTS,
};

struct hugetlb_cgroup_per_node {
/* hugetlb usage in pages over all hstates. */
unsigned long usage[HUGE_MAX_HSTATE];
};

struct hugetlb_cgroup {
struct cgroup_subsys_state css;

@@ -57,6 +62,8 @@ struct hugetlb_cgroup {

/* Handle for "hugetlb.events.local" */
struct cgroup_file events_local_file[HUGE_MAX_HSTATE];

struct hugetlb_cgroup_per_node *nodeinfo[];
};

static inline struct hugetlb_cgroup *