Merge branch 'akpm' (patches from Andrew)

Merge misc updates from Andrew Morton:
 "146 patches.

  Subsystems affected by this patch series: kthread, ia64, scripts,
  ntfs, squashfs, ocfs2, vfs, and mm (slab-generic, slab, kmemleak,
  dax, kasan, debug, pagecache, gup, shmem, frontswap, memremap,
  memcg, selftests, pagemap, dma, vmalloc, memory-failure, hugetlb,
  userfaultfd, vmscan, mempolicy, oom-kill, hugetlbfs, migration, thp,
  ksm, page-poison, percpu, rmap, zswap, zram, cleanups, hmm, and
  damon)"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (146 commits)
  mm/damon: hide kernel pointer from tracepoint event
  mm/damon/vaddr: hide kernel pointer from damon_va_three_regions() failure log
  mm/damon/vaddr: use pr_debug() for damon_va_three_regions() failure logging
  mm/damon/dbgfs: remove an unnecessary variable
  mm/damon: move the implementation of damon_insert_region to damon.h
  mm/damon: add access checking for hugetlb pages
  Docs/admin-guide/mm/damon/usage: update for schemes statistics
  mm/damon/dbgfs: support all DAMOS stats
  Docs/admin-guide/mm/damon/reclaim: document statistics parameters
  mm/damon/reclaim: provide reclamation statistics
  mm/damon/schemes: account how many times quota limit has exceeded
  mm/damon/schemes: account scheme actions that successfully applied
  mm/damon: remove a mistakenly added comment for a future feature
  Docs/admin-guide/mm/damon/usage: update for kdamond_pid and (mk|rm)_contexts
  Docs/admin-guide/mm/damon/usage: mention tracepoint at the beginning
  Docs/admin-guide/mm/damon/usage: remove redundant information
  Docs/admin-guide/mm/damon/usage: update for scheme quotas and watermarks
  mm/damon: convert macro functions to static inline functions
  mm/damon: modify damon_rand() macro to static inline function
  mm/damon: move damon_rand() definition into damon.h
  ...
This commit is contained in:
Linus Torvalds 2022-01-15 20:37:06 +02:00
commit f56caedaf9
211 changed files with 3825 additions and 1604 deletions

View File

@ -29,12 +29,14 @@ Brief summary of control files::
hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded
hugetlb.<hugepagesize>.usage_in_bytes # show current usage for "hugepagesize" hugetlb hugetlb.<hugepagesize>.usage_in_bytes # show current usage for "hugepagesize" hugetlb
hugetlb.<hugepagesize>.failcnt # show the number of allocation failure due to HugeTLB usage limit hugetlb.<hugepagesize>.failcnt # show the number of allocation failure due to HugeTLB usage limit
hugetlb.<hugepagesize>.numa_stat # show the numa information of the hugetlb memory charged to this cgroup
For a system supporting three hugepage sizes (64k, 32M and 1G), the control For a system supporting three hugepage sizes (64k, 32M and 1G), the control
files include:: files include::
hugetlb.1GB.limit_in_bytes hugetlb.1GB.limit_in_bytes
hugetlb.1GB.max_usage_in_bytes hugetlb.1GB.max_usage_in_bytes
hugetlb.1GB.numa_stat
hugetlb.1GB.usage_in_bytes hugetlb.1GB.usage_in_bytes
hugetlb.1GB.failcnt hugetlb.1GB.failcnt
hugetlb.1GB.rsvd.limit_in_bytes hugetlb.1GB.rsvd.limit_in_bytes
@ -43,6 +45,7 @@ files include::
hugetlb.1GB.rsvd.failcnt hugetlb.1GB.rsvd.failcnt
hugetlb.64KB.limit_in_bytes hugetlb.64KB.limit_in_bytes
hugetlb.64KB.max_usage_in_bytes hugetlb.64KB.max_usage_in_bytes
hugetlb.64KB.numa_stat
hugetlb.64KB.usage_in_bytes hugetlb.64KB.usage_in_bytes
hugetlb.64KB.failcnt hugetlb.64KB.failcnt
hugetlb.64KB.rsvd.limit_in_bytes hugetlb.64KB.rsvd.limit_in_bytes
@ -51,6 +54,7 @@ files include::
hugetlb.64KB.rsvd.failcnt hugetlb.64KB.rsvd.failcnt
hugetlb.32MB.limit_in_bytes hugetlb.32MB.limit_in_bytes
hugetlb.32MB.max_usage_in_bytes hugetlb.32MB.max_usage_in_bytes
hugetlb.32MB.numa_stat
hugetlb.32MB.usage_in_bytes hugetlb.32MB.usage_in_bytes
hugetlb.32MB.failcnt hugetlb.32MB.failcnt
hugetlb.32MB.rsvd.limit_in_bytes hugetlb.32MB.rsvd.limit_in_bytes

View File

@ -1268,6 +1268,9 @@ PAGE_SIZE multiple when read back.
The number of processes belonging to this cgroup The number of processes belonging to this cgroup
killed by any kind of OOM killer. killed by any kind of OOM killer.
oom_group_kill
The number of times a group OOM has occurred.
memory.events.local memory.events.local
Similar to memory.events but the fields in the file are local Similar to memory.events but the fields in the file are local
to the cgroup i.e. not hierarchical. The file modified event to the cgroup i.e. not hierarchical. The file modified event
@ -1311,6 +1314,9 @@ PAGE_SIZE multiple when read back.
sock (npn) sock (npn)
Amount of memory used in network transmission buffers Amount of memory used in network transmission buffers
vmalloc (npn)
Amount of memory used for vmap backed memory.
shmem shmem
Amount of cached filesystem data that is swap-backed, Amount of cached filesystem data that is swap-backed,
such as tmpfs, shm segments, shared anonymous mmap()s such as tmpfs, shm segments, shared anonymous mmap()s
@ -2260,6 +2266,11 @@ HugeTLB Interface Files
are local to the cgroup i.e. not hierarchical. The file modified event are local to the cgroup i.e. not hierarchical. The file modified event
generated on this file reflects only the local events. generated on this file reflects only the local events.
hugetlb.<hugepagesize>.numa_stat
Similar to memory.numa_stat, it shows the numa information of the
hugetlb pages of <hugepagesize> in this cgroup. Only active, in-use
hugetlb pages are included. The per-node values are in bytes.
Misc Misc
---- ----

View File

@ -208,6 +208,31 @@ PID of the DAMON thread.
If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread. Else, If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread. Else,
-1. -1.
nr_reclaim_tried_regions
------------------------
Number of memory regions that DAMON_RECLAIM tried to reclaim.
bytes_reclaim_tried_regions
---------------------------
Total bytes of memory regions that DAMON_RECLAIM tried to reclaim.
nr_reclaimed_regions
--------------------
Number of memory regions that were successfully reclaimed by DAMON_RECLAIM.
bytes_reclaimed_regions
-----------------------
Total bytes of memory regions that were successfully reclaimed by DAMON_RECLAIM.
nr_quota_exceeds
----------------
Number of times that the time/space quota limits have been exceeded.
Example Example
======= =======

View File

@ -7,37 +7,40 @@ Detailed Usages
DAMON provides below three interfaces for different users. DAMON provides below three interfaces for different users.
- *DAMON user space tool.* - *DAMON user space tool.*
This is for privileged people such as system administrators who want a `This <https://github.com/awslabs/damo>`_ is for privileged people such as
just-working human-friendly interface. Using this, users can use the DAMONs system administrators who want a just-working human-friendly interface.
major features in a human-friendly way. It may not be highly tuned for Using this, users can use the DAMONs major features in a human-friendly way.
special cases, though. It supports both virtual and physical address spaces It may not be highly tuned for special cases, though. It supports both
monitoring. virtual and physical address spaces monitoring. For more detail, please
refer to its `usage document
<https://github.com/awslabs/damo/blob/next/USAGE.md>`_.
- *debugfs interface.* - *debugfs interface.*
This is for privileged user space programmers who want more optimized use of :ref:`This <debugfs_interface>` is for privileged user space programmers who
DAMON. Using this, users can use DAMONs major features by reading want more optimized use of DAMON. Using this, users can use DAMONs major
from and writing to special debugfs files. Therefore, you can write and use features by reading from and writing to special debugfs files. Therefore,
your personalized DAMON debugfs wrapper programs that reads/writes the you can write and use your personalized DAMON debugfs wrapper programs that
debugfs files instead of you. The DAMON user space tool is also a reference reads/writes the debugfs files instead of you. The `DAMON user space tool
implementation of such programs. It supports both virtual and physical <https://github.com/awslabs/damo>`_ is one example of such programs. It
address spaces monitoring. supports both virtual and physical address spaces monitoring. Note that this
interface provides only simple :ref:`statistics <damos_stats>` for the
monitoring results. For detailed monitoring results, DAMON provides a
:ref:`tracepoint <tracepoint>`.
- *Kernel Space Programming Interface.* - *Kernel Space Programming Interface.*
This is for kernel space programmers. Using this, users can utilize every :doc:`This </vm/damon/api>` is for kernel space programmers. Using this,
feature of DAMON most flexibly and efficiently by writing kernel space users can utilize every feature of DAMON most flexibly and efficiently by
DAMON application programs for you. You can even extend DAMON for various writing kernel space DAMON application programs for you. You can even extend
address spaces. DAMON for various address spaces. For detail, please refer to the interface
:doc:`document </vm/damon/api>`.
Nevertheless, you could write your own user space tool using the debugfs
interface. A reference implementation is available at .. _debugfs_interface:
https://github.com/awslabs/damo. If you are a kernel programmer, you could
refer to :doc:`/vm/damon/api` for the kernel space programming interface. For
the reason, this document describes only the debugfs interface
debugfs Interface debugfs Interface
================= =================
DAMON exports five files, ``attrs``, ``target_ids``, ``init_regions``, DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``,
``schemes`` and ``monitor_on`` under its debugfs directory, ``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and
``<debugfs>/damon/``. ``rm_contexts`` under its debugfs directory, ``<debugfs>/damon/``.
Attributes Attributes
@ -131,24 +134,38 @@ Schemes
For usual DAMON-based data access aware memory management optimizations, users For usual DAMON-based data access aware memory management optimizations, users
would simply want the system to apply a memory management action to a memory would simply want the system to apply a memory management action to a memory
region of a specific size having a specific access frequency for a specific region of a specific access pattern. DAMON receives such formalized operation
time. DAMON receives such formalized operation schemes from the user and schemes from the user and applies those to the target processes.
applies those to the target processes. It also counts the total number and
size of regions that each scheme is applied. This statistics can be used for
online analysis or tuning of the schemes.
Users can get and set the schemes by reading from and writing to ``schemes`` Users can get and set the schemes by reading from and writing to ``schemes``
debugfs file. Reading the file also shows the statistics of each scheme. To debugfs file. Reading the file also shows the statistics of each scheme. To
the file, each of the schemes should be represented in each line in below form: the file, each of the schemes should be represented in each line in below
form::
min-size max-size min-acc max-acc min-age max-age action <target access pattern> <action> <quota> <watermarks>
Note that the ranges are closed interval. Bytes for the size of regions You can disable schemes by simply writing an empty string to the file.
(``min-size`` and ``max-size``), number of monitored accesses per aggregate
interval for access frequency (``min-acc`` and ``max-acc``), number of Target Access Pattern
aggregate intervals for the age of regions (``min-age`` and ``max-age``), and a ~~~~~~~~~~~~~~~~~~~~~
predefined integer for memory management actions should be used. The supported
numbers and their meanings are as below. The ``<target access pattern>`` is constructed with three ranges in below
form::
min-size max-size min-acc max-acc min-age max-age
Specifically, bytes for the size of regions (``min-size`` and ``max-size``),
number of monitored accesses per aggregate interval for access frequency
(``min-acc`` and ``max-acc``), number of aggregate intervals for the age of
regions (``min-age`` and ``max-age``) are specified. Note that the ranges are
closed interval.
Action
~~~~~~
The ``<action>`` is a predefined integer for memory management actions, which
DAMON will apply to the regions having the target access pattern. The
supported numbers and their meanings are as below.
- 0: Call ``madvise()`` for the region with ``MADV_WILLNEED`` - 0: Call ``madvise()`` for the region with ``MADV_WILLNEED``
- 1: Call ``madvise()`` for the region with ``MADV_COLD`` - 1: Call ``madvise()`` for the region with ``MADV_COLD``
@ -157,20 +174,82 @@ numbers and their meanings are as below.
- 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE`` - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``
- 5: Do nothing but count the statistics - 5: Do nothing but count the statistics
You can disable schemes by simply writing an empty string to the file. For Quota
example, below commands applies a scheme saying "If a memory region of size in ~~~~~
[4KiB, 8KiB] is showing accesses per aggregate interval in [0, 5] for aggregate
interval in [10, 20], page out the region", check the entered scheme again, and Optimal ``target access pattern`` for each ``action`` is workload dependent, so
finally remove the scheme. :: not easy to find. Worse yet, setting a scheme of some action too aggressive
can cause severe overhead. To avoid such overhead, users can limit time and
size quota for the scheme via the ``<quota>`` in below form::
<ms> <sz> <reset interval> <priority weights>
This makes DAMON try to use only up to ``<ms>`` milliseconds for applying
the action to memory regions of the ``target access pattern`` within the
``<reset interval>`` milliseconds, and to apply the action to only up to
``<sz>`` bytes of memory regions within the ``<reset interval>``. Setting both
``<ms>`` and ``<sz>`` zero disables the quota limits.
When the quota limit is expected to be exceeded, DAMON prioritizes found memory
regions of the ``target access pattern`` based on their size, access frequency,
and age. For personalized prioritization, users can set the weights for the
three properties in ``<priority weights>`` in below form::
<size weight> <access frequency weight> <age weight>
Watermarks
~~~~~~~~~~
Some schemes would need to run based on current value of the system's specific
metrics like free memory ratio. For such cases, users can specify watermarks
for the condition.::
<metric> <check interval> <high mark> <middle mark> <low mark>
``<metric>`` is a predefined integer for the metric to be checked. The
supported numbers and their meanings are as below.
- 0: Ignore the watermarks
- 1: System's free memory rate (per thousand)
The value of the metric is checked every ``<check interval>`` microseconds.
If the value is higher than ``<high mark>`` or lower than ``<low mark>``, the
scheme is deactivated. If the value is lower than ``<middle mark>``, the scheme
is activated.
.. _damos_stats:
Statistics
~~~~~~~~~~
DAMON also counts the total number and bytes of regions that each scheme has
tried to be applied to, the two numbers for the regions that each scheme was
successfully applied to, and the total number of times the quota limit was
exceeded. These statistics can be used for online analysis or tuning of the schemes.
The statistics can be shown by reading the ``schemes`` file. Reading the file
will show each scheme you entered in each line, and the five numbers for the
statistics will be added at the end of each line.
Example
~~~~~~~
Below commands apply a scheme saying "If a memory region of size in [4KiB,
8KiB] is showing accesses per aggregate interval in [0, 5] for aggregate
interval in [10, 20], page out the region. For the paging out, use only up to
10ms per second, and also don't page out more than 1GiB per second. Under the
limitation, page out memory regions having longer age first. Also, check the
free memory rate of the system every 5 seconds, start the monitoring and paging
out when the free memory rate becomes lower than 50%, but stop it if the free
memory rate becomes larger than 60%, or lower than 30%".::
# cd <debugfs>/damon # cd <debugfs>/damon
# echo "4096 8192 0 5 10 20 2" > schemes # scheme="4096 8192 0 5 10 20 2" # target access pattern and action
# cat schemes # scheme+=" 10 $((1024*1024*1024)) 1000" # quotas
4096 8192 0 5 10 20 2 0 0 # scheme+=" 0 0 100" # prioritization weights
# echo > schemes # scheme+=" 1 5000000 600 500 300" # watermarks
# echo "$scheme" > schemes
The last two integers in the 4th line of above example is the total number and
the total size of the regions that the scheme is applied.
Turning On/Off Turning On/Off
@ -195,6 +274,54 @@ the monitoring is turned on. If you write to the files while DAMON is running,
an error code such as ``-EBUSY`` will be returned. an error code such as ``-EBUSY`` will be returned.
Monitoring Thread PID
---------------------
DAMON does requested monitoring with a kernel thread called ``kdamond``. You
can get the pid of the thread by reading the ``kdamond_pid`` file. When the
monitoring is turned off, reading the file returns ``none``. ::
# cd <debugfs>/damon
# cat monitor_on
off
# cat kdamond_pid
none
# echo on > monitor_on
# cat kdamond_pid
18594
Using Multiple Monitoring Threads
---------------------------------
One ``kdamond`` thread is created for each monitoring context. For use cases
that require multiple ``kdamond`` threads, you can create and remove monitoring
contexts using the ``mk_contexts`` and ``rm_contexts`` files.
Writing the name of the new context to the ``mk_contexts`` file creates a
directory of the name on the DAMON debugfs directory. The directory will have
DAMON debugfs files for the context. ::
# cd <debugfs>/damon
# ls foo
# ls: cannot access 'foo': No such file or directory
# echo foo > mk_contexts
# ls foo
# attrs init_regions kdamond_pid schemes target_ids
If the context is not needed anymore, you can remove it and the corresponding
directory by putting the name of the context to the ``rm_contexts`` file. ::
# echo foo > rm_contexts
# ls foo
# ls: cannot access 'foo': No such file or directory
Note that ``mk_contexts``, ``rm_contexts``, and ``monitor_on`` files are in the
root directory only.
.. _tracepoint:
Tracepoint for Monitoring Results Tracepoint for Monitoring Results
================================= =================================

View File

@ -408,7 +408,7 @@ follows:
Memory Policy APIs Memory Policy APIs
================== ==================
Linux supports 3 system calls for controlling memory policy. These APIS Linux supports 4 system calls for controlling memory policy. These APIS
always affect only the calling task, the calling task's address space, or always affect only the calling task, the calling task's address space, or
some shared object mapped into the calling task's address space. some shared object mapped into the calling task's address space.
@ -460,6 +460,20 @@ requested via the 'flags' argument.
See the mbind(2) man page for more details. See the mbind(2) man page for more details.
Set home node for a Range of Task's Address Space::
long sys_set_mempolicy_home_node(unsigned long start, unsigned long len,
unsigned long home_node,
unsigned long flags);
sys_set_mempolicy_home_node sets the home node for a VMA policy present in the
task's address range. The system call updates the home node only for the existing
mempolicy range. Other address ranges are ignored. A home node is the NUMA node
from which, or closest to which, pages will be allocated. Specifying the home node
overrides the default allocation policy of allocating memory close to the local
node of the executing CPU.
Memory Policy Command Line Interface Memory Policy Command Line Interface
==================================== ====================================

View File

@ -948,7 +948,7 @@ how much memory needs to be free before kswapd goes back to sleep.
The unit is in fractions of 10,000. The default value of 10 means the The unit is in fractions of 10,000. The default value of 10 means the
distances between watermarks are 0.1% of the available memory in the distances between watermarks are 0.1% of the available memory in the
node/system. The maximum value is 1000, or 10% of memory. node/system. The maximum value is 3000, or 30% of memory.
A high rate of threads entering direct reclaim (allocstall) or kswapd A high rate of threads entering direct reclaim (allocstall) or kswapd
going to sleep prematurely (kswapd_low_wmark_hit_quickly) can indicate going to sleep prematurely (kswapd_low_wmark_hit_quickly) can indicate

View File

@ -426,12 +426,14 @@ with the memory region, as the case would be with BSS (uninitialized data).
The "pathname" shows the name associated file for this mapping. If the mapping The "pathname" shows the name associated file for this mapping. If the mapping
is not associated with a file: is not associated with a file:
======= ==================================== ============= ====================================
[heap] the heap of the program [heap] the heap of the program
[stack] the stack of the main process [stack] the stack of the main process
[vdso] the "virtual dynamic shared object", [vdso] the "virtual dynamic shared object",
the kernel system call handler the kernel system call handler
======= ==================================== [anon:<name>] an anonymous mapping that has been
named by userspace
============= ====================================
or if empty, the mapping is anonymous. or if empty, the mapping is anonymous.

View File

@ -66,9 +66,11 @@ PTE Page Table Helpers
+---------------------------+--------------------------------------------------+ +---------------------------+--------------------------------------------------+
| pte_mknotpresent | Invalidates a mapped PTE | | pte_mknotpresent | Invalidates a mapped PTE |
+---------------------------+--------------------------------------------------+ +---------------------------+--------------------------------------------------+
| ptep_get_and_clear | Clears a PTE | | ptep_clear | Clears a PTE |
+---------------------------+--------------------------------------------------+ +---------------------------+--------------------------------------------------+
| ptep_get_and_clear_full | Clears a PTE | | ptep_get_and_clear | Clears and returns PTE |
+---------------------------+--------------------------------------------------+
| ptep_get_and_clear_full | Clears and returns PTE (batched PTE unmap) |
+---------------------------+--------------------------------------------------+ +---------------------------+--------------------------------------------------+
| ptep_test_and_clear_young | Clears young from a PTE | | ptep_test_and_clear_young | Clears young from a PTE |
+---------------------------+--------------------------------------------------+ +---------------------------+--------------------------------------------------+
@ -247,12 +249,12 @@ SWAP Page Table Helpers
| __swp_to_pmd_entry | Creates a mapped PMD from a swapped entry (arch) | | __swp_to_pmd_entry | Creates a mapped PMD from a swapped entry (arch) |
+---------------------------+--------------------------------------------------+ +---------------------------+--------------------------------------------------+
| is_migration_entry | Tests a migration (read or write) swapped entry | | is_migration_entry | Tests a migration (read or write) swapped entry |
+---------------------------+--------------------------------------------------+ +-------------------------------+----------------------------------------------+
| is_write_migration_entry | Tests a write migration swapped entry | | is_writable_migration_entry | Tests a write migration swapped entry |
+---------------------------+--------------------------------------------------+ +-------------------------------+----------------------------------------------+
| make_migration_entry_read | Converts into read migration swapped entry | | make_readable_migration_entry | Creates a read migration swapped entry |
+---------------------------+--------------------------------------------------+ +-------------------------------+----------------------------------------------+
| make_migration_entry | Creates a migration swapped entry (read or write)| | make_writable_migration_entry | Creates a write migration swapped entry |
+---------------------------+--------------------------------------------------+ +-------------------------------+----------------------------------------------+
[1] https://lore.kernel.org/linux-mm/20181017020930.GN30832@redhat.com/ [1] https://lore.kernel.org/linux-mm/20181017020930.GN30832@redhat.com/

View File

@ -31,10 +31,12 @@ algorithms. If you are looking for advice on simply allocating memory, see the
page_migration page_migration
page_frags page_frags
page_owner page_owner
page_table_check
remap_file_pages remap_file_pages
slub slub
split_page_table_lock split_page_table_lock
transhuge transhuge
unevictable-lru unevictable-lru
vmalloced-kernel-stacks
z3fold z3fold
zsmalloc zsmalloc

View File

@ -263,15 +263,15 @@ Monitoring Migration
The following events (counters) can be used to monitor page migration. The following events (counters) can be used to monitor page migration.
1. PGMIGRATE_SUCCESS: Normal page migration success. Each count means that a 1. PGMIGRATE_SUCCESS: Normal page migration success. Each count means that a
page was migrated. If the page was a non-THP page, then this counter is page was migrated. If the page was a non-THP and non-hugetlb page, then
increased by one. If the page was a THP, then this counter is increased by this counter is increased by one. If the page was a THP or hugetlb, then
the number of THP subpages. For example, migration of a single 2MB THP that this counter is increased by the number of THP or hugetlb subpages.
has 4KB-size base pages (subpages) will cause this counter to increase by For example, migration of a single 2MB THP that has 4KB-size base pages
512. (subpages) will cause this counter to increase by 512.
2. PGMIGRATE_FAIL: Normal page migration failure. Same counting rules as for 2. PGMIGRATE_FAIL: Normal page migration failure. Same counting rules as for
PGMIGRATE_SUCCESS, above: this will be increased by the number of subpages, PGMIGRATE_SUCCESS, above: this will be increased by the number of subpages,
if it was a THP. if it was a THP or hugetlb.
3. THP_MIGRATION_SUCCESS: A THP was migrated without being split. 3. THP_MIGRATION_SUCCESS: A THP was migrated without being split.

View File

@ -0,0 +1,56 @@
.. SPDX-License-Identifier: GPL-2.0
.. _page_table_check:
================
Page Table Check
================
Introduction
============
Page table check allows hardening the kernel by ensuring that some types of
memory corruption are prevented.
Page table check performs extra verifications at the time when new pages become
accessible from the userspace by getting their page table entries (PTEs, PMDs,
etc.) added into the table.
In case of detected corruption, the kernel is crashed. There is a small
performance and memory overhead associated with the page table check. Therefore,
it is disabled by default, but can be optionally enabled on systems where the
extra hardening outweighs the performance costs. Also, because page table check
is synchronous, it can help with debugging double map memory corruption issues,
by crashing kernel at the time wrong mapping occurs instead of later which is
often the case with memory corruption bugs.
Double mapping detection logic
==============================
+-------------------+-------------------+-------------------+------------------+
| Current Mapping | New mapping | Permissions | Rule |
+===================+===================+===================+==================+
| Anonymous | Anonymous | Read | Allow |
+-------------------+-------------------+-------------------+------------------+
| Anonymous | Anonymous | Read / Write | Prohibit |
+-------------------+-------------------+-------------------+------------------+
| Anonymous | Named | Any | Prohibit |
+-------------------+-------------------+-------------------+------------------+
| Named | Anonymous | Any | Prohibit |
+-------------------+-------------------+-------------------+------------------+
| Named | Named | Any | Allow |
+-------------------+-------------------+-------------------+------------------+
Enabling Page Table Check
=========================
Build kernel with:
- PAGE_TABLE_CHECK=y
Note, it can only be enabled on platforms where ARCH_SUPPORTS_PAGE_TABLE_CHECK
is available.
- Boot with 'page_table_check=on' kernel parameter.
Optionally, build kernel with PAGE_TABLE_CHECK_ENFORCED in order to have page
table check enabled without an extra kernel parameter.

View File

@ -0,0 +1,153 @@
.. SPDX-License-Identifier: GPL-2.0
=====================================
Virtually Mapped Kernel Stack Support
=====================================
:Author: Shuah Khan <skhan@linuxfoundation.org>
.. contents:: :local:
Overview
--------
This is a compilation of information from the code and original patch
series that introduced the `Virtually Mapped Kernel Stacks feature
<https://lwn.net/Articles/694348/>`_.
Introduction
------------
Kernel stack overflows are often hard to debug and make the kernel
susceptible to exploits. Problems could show up at a later time making
it difficult to isolate and root-cause.
Virtually-mapped kernel stacks with guard pages cause kernel stack
overflows to be caught immediately rather than causing difficult to
diagnose corruptions.
HAVE_ARCH_VMAP_STACK and VMAP_STACK configuration options enable
support for virtually mapped stacks with guard pages. This feature
causes reliable faults when the stack overflows. The usability of
the stack trace after overflow and response to the overflow itself
is architecture dependent.
.. note::
As of this writing, arm64, powerpc, riscv, s390, um, and x86 have
support for VMAP_STACK.
HAVE_ARCH_VMAP_STACK
--------------------
Architectures that can support Virtually Mapped Kernel Stacks should
enable this bool configuration option. The requirements are:
- vmalloc space must be large enough to hold many kernel stacks. This
may rule out many 32-bit architectures.
- Stacks in vmalloc space need to work reliably. For example, if
vmap page tables are created on demand, either this mechanism
needs to work while the stack points to a virtual address with
unpopulated page tables or arch code (switch_to() and switch_mm(),
most likely) needs to ensure that the stack's page table entries
are populated before running on a possibly unpopulated stack.
- If the stack overflows into a guard page, something reasonable
should happen. The definition of "reasonable" is flexible, but
instantly rebooting without logging anything would be unfriendly.
VMAP_STACK
----------
VMAP_STACK bool configuration option when enabled allocates virtually
mapped task stacks. This option depends on HAVE_ARCH_VMAP_STACK.
- Enable this if you want to use virtually-mapped kernel stacks
with guard pages. This causes kernel stack overflows to be caught
immediately rather than causing difficult-to-diagnose corruption.
.. note::
Using this feature with KASAN requires architecture support
for backing virtual mappings with real shadow memory, and
KASAN_VMALLOC must be enabled.
.. note::
When VMAP_STACK is enabled, it is not possible to run DMA on stack
allocated data.
Kernel configuration options and dependencies keep changing. Refer to
the latest code base:
`Kconfig <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/Kconfig>`_
Allocation
-----------
When a new kernel thread is created, thread stack is allocated from
virtually contiguous memory pages from the page level allocator. These
pages are mapped into contiguous kernel virtual space with PAGE_KERNEL
protections.
alloc_thread_stack_node() calls __vmalloc_node_range() to allocate stack
with PAGE_KERNEL protections.
- Allocated stacks are cached and later reused by new threads, so memcg
accounting is performed manually on assigning/releasing stacks to tasks.
Hence, __vmalloc_node_range is called without __GFP_ACCOUNT.
- vm_struct is cached to be able to find when thread free is initiated
in interrupt context. free_thread_stack() can be called in interrupt
context.
- On arm64, all VMAP'd stacks need to have the same alignment to ensure
that VMAP'd stack overflow detection works correctly. Arch specific
vmap stack allocator takes care of this detail.
- This does not address interrupt stacks - according to the original patch
Thread stack allocation is initiated from clone(), fork(), vfork(),
kernel_thread() via kernel_clone(). Leaving a few hints for searching
the code base to understand when and how thread stack is allocated.
Bulk of the code is in:
`kernel/fork.c <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/fork.c>`_.
stack_vm_area pointer in task_struct keeps track of the virtually allocated
stack and a non-null stack_vm_area pointer serves as an indication that the
virtually mapped kernel stacks are enabled.
::
struct vm_struct *stack_vm_area;
Stack overflow handling
-----------------------
Leading and trailing guard pages help detect stack overflows. When the stack
overflows into the guard pages, handlers have to be careful not to overflow
the stack again. When handlers are called, it is likely that very little
stack space is left.
On x86, this is done by handling the page fault indicating the kernel
stack overflow on the double-fault stack.
Testing VMAP allocation with guard pages
----------------------------------------
How do we ensure that VMAP_STACK is actually allocating with a leading
and trailing guard page? The following lkdtm tests can help detect any
regressions.
::
void lkdtm_STACK_GUARD_PAGE_LEADING()
void lkdtm_STACK_GUARD_PAGE_TRAILING()
Conclusions
-----------
- A percpu cache of vmalloced stacks appears to be a bit faster than a
high-order stack allocation, at least when the cache hits.
- THREAD_INFO_IN_TASK gets rid of arch-specific thread_info entirely and
simply embeds the thread_info (containing only flags) and 'int cpu' into
task_struct.
- The thread stack can be freed as soon as the task is dead (without
waiting for RCU) and then, if vmapped stacks are in use, the entire
stack can be cached for reuse on the same cpu.

View File

@ -14541,6 +14541,15 @@ F: include/net/page_pool.h
F: include/trace/events/page_pool.h F: include/trace/events/page_pool.h
F: net/core/page_pool.c F: net/core/page_pool.c
PAGE TABLE CHECK
M: Pasha Tatashin <pasha.tatashin@soleen.com>
M: Andrew Morton <akpm@linux-foundation.org>
L: linux-mm@kvack.org
S: Maintained
F: Documentation/vm/page_table_check.rst
F: include/linux/page_table_check.h
F: mm/page_table_check.c
PANASONIC LAPTOP ACPI EXTRAS DRIVER PANASONIC LAPTOP ACPI EXTRAS DRIVER
M: Kenneth Chan <kenneth.t.chan@gmail.com> M: Kenneth Chan <kenneth.t.chan@gmail.com>
L: platform-driver-x86@vger.kernel.org L: platform-driver-x86@vger.kernel.org

View File

@ -1297,6 +1297,9 @@ config HAVE_ARCH_PFN_VALID
config ARCH_SUPPORTS_DEBUG_PAGEALLOC config ARCH_SUPPORTS_DEBUG_PAGEALLOC
bool bool
config ARCH_SUPPORTS_PAGE_TABLE_CHECK
bool
config ARCH_SPLIT_ARG64 config ARCH_SPLIT_ARG64
bool bool
help help

View File

@ -489,3 +489,4 @@
# 557 reserved for memfd_secret # 557 reserved for memfd_secret
558 common process_mrelease sys_process_mrelease 558 common process_mrelease sys_process_mrelease
559 common futex_waitv sys_futex_waitv 559 common futex_waitv sys_futex_waitv
560 common set_mempolicy_home_node sys_ni_syscall

View File

@ -165,17 +165,15 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
BUG(); BUG();
} }
if (flags & FAULT_FLAG_ALLOW_RETRY) { if (fault & VM_FAULT_RETRY) {
if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED;
flags |= FAULT_FLAG_TRIED;
/* No need to mmap_read_unlock(mm) as we would /* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry * have already released it in __lock_page_or_retry
* in mm/filemap.c. * in mm/filemap.c.
*/ */
goto retry; goto retry;
}
} }
mmap_read_unlock(mm); mmap_read_unlock(mm);

View File

@ -149,8 +149,7 @@ void do_page_fault(unsigned long address, struct pt_regs *regs)
/* /*
* Fault retry nuances, mmap_lock already relinquished by core mm * Fault retry nuances, mmap_lock already relinquished by core mm
*/ */
if (unlikely((fault & VM_FAULT_RETRY) && if (unlikely(fault & VM_FAULT_RETRY)) {
(flags & FAULT_FLAG_ALLOW_RETRY))) {
flags |= FAULT_FLAG_TRIED; flags |= FAULT_FLAG_TRIED;
goto retry; goto retry;
} }

View File

@ -322,7 +322,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
return 0; return 0;
} }
if (!(fault & VM_FAULT_ERROR) && flags & FAULT_FLAG_ALLOW_RETRY) { if (!(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_RETRY) { if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED; flags |= FAULT_FLAG_TRIED;
goto retry; goto retry;

View File

@ -463,3 +463,4 @@
# 447 reserved for memfd_secret # 447 reserved for memfd_secret
448 common process_mrelease sys_process_mrelease 448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv 449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node

View File

@ -38,7 +38,7 @@
#define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5)
#define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800)
#define __NR_compat_syscalls 450 #define __NR_compat_syscalls 451
#endif #endif
#define __ARCH_WANT_SYS_CLONE #define __ARCH_WANT_SYS_CLONE

View File

@ -905,6 +905,8 @@ __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self)
__SYSCALL(__NR_process_mrelease, sys_process_mrelease) __SYSCALL(__NR_process_mrelease, sys_process_mrelease)
#define __NR_futex_waitv 449 #define __NR_futex_waitv 449
__SYSCALL(__NR_futex_waitv, sys_futex_waitv) __SYSCALL(__NR_futex_waitv, sys_futex_waitv)
#define __NR_set_mempolicy_home_node 450
__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
/* /*
* Please add new compat syscalls above this comment and update * Please add new compat syscalls above this comment and update

View File

@ -36,7 +36,7 @@ void *module_alloc(unsigned long size)
module_alloc_end = MODULES_END; module_alloc_end = MODULES_END;
p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
module_alloc_end, gfp_mask, PAGE_KERNEL, 0, module_alloc_end, gfp_mask, PAGE_KERNEL, VM_DEFER_KMEMLEAK,
NUMA_NO_NODE, __builtin_return_address(0)); NUMA_NO_NODE, __builtin_return_address(0));
if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
@ -58,7 +58,7 @@ void *module_alloc(unsigned long size)
PAGE_KERNEL, 0, NUMA_NO_NODE, PAGE_KERNEL, 0, NUMA_NO_NODE,
__builtin_return_address(0)); __builtin_return_address(0));
if (p && (kasan_module_alloc(p, size) < 0)) { if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) {
vfree(p); vfree(p);
return NULL; return NULL;
} }

View File

@ -608,10 +608,8 @@ static int __kprobes do_page_fault(unsigned long far, unsigned int esr,
} }
if (fault & VM_FAULT_RETRY) { if (fault & VM_FAULT_RETRY) {
if (mm_flags & FAULT_FLAG_ALLOW_RETRY) { mm_flags |= FAULT_FLAG_TRIED;
mm_flags |= FAULT_FLAG_TRIED; goto retry;
goto retry;
}
} }
mmap_read_unlock(mm); mmap_read_unlock(mm);

View File

@ -98,11 +98,9 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs)
/* The most common case -- we are done. */ /* The most common case -- we are done. */
if (likely(!(fault & VM_FAULT_ERROR))) { if (likely(!(fault & VM_FAULT_ERROR))) {
if (flags & FAULT_FLAG_ALLOW_RETRY) { if (fault & VM_FAULT_RETRY) {
if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED;
flags |= FAULT_FLAG_TRIED; goto retry;
goto retry;
}
} }
mmap_read_unlock(mm); mmap_read_unlock(mm);

View File

@ -848,7 +848,7 @@ register_unwind_table (struct module *mod)
{ {
struct unw_table_entry *start = (void *) mod->arch.unwind->sh_addr; struct unw_table_entry *start = (void *) mod->arch.unwind->sh_addr;
struct unw_table_entry *end = start + mod->arch.unwind->sh_size / sizeof (*start); struct unw_table_entry *end = start + mod->arch.unwind->sh_size / sizeof (*start);
struct unw_table_entry tmp, *e1, *e2, *core, *init; struct unw_table_entry *e1, *e2, *core, *init;
unsigned long num_init = 0, num_core = 0; unsigned long num_init = 0, num_core = 0;
/* First, count how many init and core unwind-table entries there are. */ /* First, count how many init and core unwind-table entries there are. */
@ -865,9 +865,7 @@ register_unwind_table (struct module *mod)
for (e1 = start; e1 < end; ++e1) { for (e1 = start; e1 < end; ++e1) {
for (e2 = e1 + 1; e2 < end; ++e2) { for (e2 = e1 + 1; e2 < end; ++e2) {
if (e2->start_offset < e1->start_offset) { if (e2->start_offset < e1->start_offset) {
tmp = *e1; swap(*e1, *e2);
*e1 = *e2;
*e2 = tmp;
} }
} }
} }

View File

@ -208,10 +208,7 @@ sort_regions (struct rsvd_region *rsvd_region, int max)
while (max--) { while (max--) {
for (j = 0; j < max; ++j) { for (j = 0; j < max; ++j) {
if (rsvd_region[j].start > rsvd_region[j+1].start) { if (rsvd_region[j].start > rsvd_region[j+1].start) {
struct rsvd_region tmp; swap(rsvd_region[j], rsvd_region[j + 1]);
tmp = rsvd_region[j];
rsvd_region[j] = rsvd_region[j + 1];
rsvd_region[j + 1] = tmp;
} }
} }
} }

View File

@ -370,3 +370,4 @@
# 447 reserved for memfd_secret # 447 reserved for memfd_secret
448 common process_mrelease sys_process_mrelease 448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv 449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node

View File

@ -264,6 +264,7 @@ static struct attribute * cache_default_attrs[] = {
&shared_cpu_map.attr, &shared_cpu_map.attr,
NULL NULL
}; };
ATTRIBUTE_GROUPS(cache_default);
#define to_object(k) container_of(k, struct cache_info, kobj) #define to_object(k) container_of(k, struct cache_info, kobj)
#define to_attr(a) container_of(a, struct cache_attr, attr) #define to_attr(a) container_of(a, struct cache_attr, attr)
@ -284,7 +285,7 @@ static const struct sysfs_ops cache_sysfs_ops = {
static struct kobj_type cache_ktype = { static struct kobj_type cache_ktype = {
.sysfs_ops = &cache_sysfs_ops, .sysfs_ops = &cache_sysfs_ops,
.default_attrs = cache_default_attrs, .default_groups = cache_default_groups,
}; };
static struct kobj_type cache_ktype_percpu_entry = { static struct kobj_type cache_ktype_percpu_entry = {

View File

@ -171,7 +171,7 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid)
* @n_pages: number of contiguous pages to allocate * @n_pages: number of contiguous pages to allocate
* *
* Allocate the specified number of contiguous uncached pages on the * Allocate the specified number of contiguous uncached pages on the
* the requested node. If not enough contiguous uncached pages are available * requested node. If not enough contiguous uncached pages are available
* on the requested node, roundrobin starting with the next higher node. * on the requested node, roundrobin starting with the next higher node.
*/ */
unsigned long uncached_alloc_page(int starting_nid, int n_pages) unsigned long uncached_alloc_page(int starting_nid, int n_pages)

View File

@ -156,17 +156,15 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
BUG(); BUG();
} }
if (flags & FAULT_FLAG_ALLOW_RETRY) { if (fault & VM_FAULT_RETRY) {
if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED;
flags |= FAULT_FLAG_TRIED;
/* No need to mmap_read_unlock(mm) as we would /* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry * have already released it in __lock_page_or_retry
* in mm/filemap.c. * in mm/filemap.c.
*/ */
goto retry; goto retry;
}
} }
mmap_read_unlock(mm); mmap_read_unlock(mm);

View File

@ -449,3 +449,4 @@
# 447 reserved for memfd_secret # 447 reserved for memfd_secret
448 common process_mrelease sys_process_mrelease 448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv 449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node

View File

@ -153,18 +153,16 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
BUG(); BUG();
} }
if (flags & FAULT_FLAG_ALLOW_RETRY) { if (fault & VM_FAULT_RETRY) {
if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED;
flags |= FAULT_FLAG_TRIED;
/* /*
* No need to mmap_read_unlock(mm) as we would * No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry * have already released it in __lock_page_or_retry
* in mm/filemap.c. * in mm/filemap.c.
*/ */
goto retry; goto retry;
}
} }
mmap_read_unlock(mm); mmap_read_unlock(mm);

View File

@ -455,3 +455,4 @@
# 447 reserved for memfd_secret # 447 reserved for memfd_secret
448 common process_mrelease sys_process_mrelease 448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv 449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node

View File

@ -232,18 +232,16 @@ void do_page_fault(struct pt_regs *regs, unsigned long address,
BUG(); BUG();
} }
if (flags & FAULT_FLAG_ALLOW_RETRY) { if (fault & VM_FAULT_RETRY) {
if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED;
flags |= FAULT_FLAG_TRIED;
/* /*
* No need to mmap_read_unlock(mm) as we would * No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry * have already released it in __lock_page_or_retry
* in mm/filemap.c. * in mm/filemap.c.
*/ */
goto retry; goto retry;
}
} }
mmap_read_unlock(mm); mmap_read_unlock(mm);

View File

@ -388,3 +388,4 @@
# 447 reserved for memfd_secret # 447 reserved for memfd_secret
448 n32 process_mrelease sys_process_mrelease 448 n32 process_mrelease sys_process_mrelease
449 n32 futex_waitv sys_futex_waitv 449 n32 futex_waitv sys_futex_waitv
450 n32 set_mempolicy_home_node sys_set_mempolicy_home_node

View File

@ -364,3 +364,4 @@
# 447 reserved for memfd_secret # 447 reserved for memfd_secret
448 n64 process_mrelease sys_process_mrelease 448 n64 process_mrelease sys_process_mrelease
449 n64 futex_waitv sys_futex_waitv 449 n64 futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node

View File

@ -437,3 +437,4 @@
# 447 reserved for memfd_secret # 447 reserved for memfd_secret
448 o32 process_mrelease sys_process_mrelease 448 o32 process_mrelease sys_process_mrelease
449 o32 futex_waitv sys_futex_waitv 449 o32 futex_waitv sys_futex_waitv
450 o32 set_mempolicy_home_node sys_set_mempolicy_home_node

View File

@ -171,18 +171,17 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write,
goto do_sigbus; goto do_sigbus;
BUG(); BUG();
} }
if (flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
/* if (fault & VM_FAULT_RETRY) {
* No need to mmap_read_unlock(mm) as we would flags |= FAULT_FLAG_TRIED;
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
goto retry; /*
} * No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
goto retry;
} }
mmap_read_unlock(mm); mmap_read_unlock(mm);

View File

@ -230,16 +230,14 @@ void do_page_fault(unsigned long entry, unsigned long addr,
goto bad_area; goto bad_area;
} }
if (flags & FAULT_FLAG_ALLOW_RETRY) { if (fault & VM_FAULT_RETRY) {
if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED;
flags |= FAULT_FLAG_TRIED;
/* No need to mmap_read_unlock(mm) as we would /* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry * have already released it in __lock_page_or_retry
* in mm/filemap.c. * in mm/filemap.c.
*/ */
goto retry; goto retry;
}
} }
mmap_read_unlock(mm); mmap_read_unlock(mm);

View File

@ -149,18 +149,16 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long cause,
BUG(); BUG();
} }
if (flags & FAULT_FLAG_ALLOW_RETRY) { if (fault & VM_FAULT_RETRY) {
if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED;
flags |= FAULT_FLAG_TRIED;
/* /*
* No need to mmap_read_unlock(mm) as we would * No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry * have already released it in __lock_page_or_retry
* in mm/filemap.c. * in mm/filemap.c.
*/ */
goto retry; goto retry;
}
} }
mmap_read_unlock(mm); mmap_read_unlock(mm);

View File

@ -177,18 +177,16 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address,
BUG(); BUG();
} }
if (flags & FAULT_FLAG_ALLOW_RETRY) { /*RGD modeled on Cris */
/*RGD modeled on Cris */ if (fault & VM_FAULT_RETRY) {
if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED;
flags |= FAULT_FLAG_TRIED;
/* No need to mmap_read_unlock(mm) as we would /* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry * have already released it in __lock_page_or_retry
* in mm/filemap.c. * in mm/filemap.c.
*/ */
goto retry; goto retry;
}
} }
mmap_read_unlock(mm); mmap_read_unlock(mm);

View File

@ -447,3 +447,4 @@
# 447 reserved for memfd_secret # 447 reserved for memfd_secret
448 common process_mrelease sys_process_mrelease 448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv 449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node

View File

@ -324,16 +324,14 @@ void do_page_fault(struct pt_regs *regs, unsigned long code,
goto bad_area; goto bad_area;
BUG(); BUG();
} }
if (flags & FAULT_FLAG_ALLOW_RETRY) { if (fault & VM_FAULT_RETRY) {
if (fault & VM_FAULT_RETRY) { /*
/* * No need to mmap_read_unlock(mm) as we would
* No need to mmap_read_unlock(mm) as we would * have already released it in __lock_page_or_retry
* have already released it in __lock_page_or_retry * in mm/filemap.c.
* in mm/filemap.c. */
*/ flags |= FAULT_FLAG_TRIED;
flags |= FAULT_FLAG_TRIED; goto retry;
goto retry;
}
} }
mmap_read_unlock(mm); mmap_read_unlock(mm);
return; return;

View File

@ -529,3 +529,4 @@
# 447 reserved for memfd_secret # 447 reserved for memfd_secret
448 common process_mrelease sys_process_mrelease 448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv 449 common futex_waitv sys_futex_waitv
450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node

View File

@ -517,10 +517,8 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
* case. * case.
*/ */
if (unlikely(fault & VM_FAULT_RETRY)) { if (unlikely(fault & VM_FAULT_RETRY)) {
if (flags & FAULT_FLAG_ALLOW_RETRY) { flags |= FAULT_FLAG_TRIED;
flags |= FAULT_FLAG_TRIED; goto retry;
goto retry;
}
} }
mmap_read_unlock(current->mm); mmap_read_unlock(current->mm);

View File

@ -330,7 +330,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs)
if (fault_signal_pending(fault, regs)) if (fault_signal_pending(fault, regs))
return; return;
if (unlikely((fault & VM_FAULT_RETRY) && (flags & FAULT_FLAG_ALLOW_RETRY))) { if (unlikely(fault & VM_FAULT_RETRY)) {
flags |= FAULT_FLAG_TRIED; flags |= FAULT_FLAG_TRIED;
/* /*

View File

@ -37,14 +37,15 @@
void *module_alloc(unsigned long size) void *module_alloc(unsigned long size)
{ {
gfp_t gfp_mask = GFP_KERNEL;
void *p; void *p;
if (PAGE_ALIGN(size) > MODULES_LEN) if (PAGE_ALIGN(size) > MODULES_LEN)
return NULL; return NULL;
p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END, p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END,
GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, gfp_mask, PAGE_KERNEL_EXEC, VM_DEFER_KMEMLEAK, NUMA_NO_NODE,
__builtin_return_address(0)); __builtin_return_address(0));
if (p && (kasan_module_alloc(p, size) < 0)) { if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) {
vfree(p); vfree(p);
return NULL; return NULL;
} }

View File

@ -452,3 +452,4 @@
# 447 reserved for memfd_secret # 447 reserved for memfd_secret
448 common process_mrelease sys_process_mrelease sys_process_mrelease 448 common process_mrelease sys_process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv sys_futex_waitv 449 common futex_waitv sys_futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node

View File

@ -452,21 +452,21 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
if (unlikely(fault & VM_FAULT_ERROR)) if (unlikely(fault & VM_FAULT_ERROR))
goto out_up; goto out_up;
if (flags & FAULT_FLAG_ALLOW_RETRY) { if (fault & VM_FAULT_RETRY) {
if (fault & VM_FAULT_RETRY) { if (IS_ENABLED(CONFIG_PGSTE) && gmap &&
if (IS_ENABLED(CONFIG_PGSTE) && gmap && (flags & FAULT_FLAG_RETRY_NOWAIT)) {
(flags & FAULT_FLAG_RETRY_NOWAIT)) { /*
/* FAULT_FLAG_RETRY_NOWAIT has been set, * FAULT_FLAG_RETRY_NOWAIT has been set, mmap_lock has
* mmap_lock has not been released */ * not been released
current->thread.gmap_pfault = 1; */
fault = VM_FAULT_PFAULT; current->thread.gmap_pfault = 1;
goto out_up; fault = VM_FAULT_PFAULT;
} goto out_up;
flags &= ~FAULT_FLAG_RETRY_NOWAIT;
flags |= FAULT_FLAG_TRIED;
mmap_read_lock(mm);
goto retry;
} }
flags &= ~FAULT_FLAG_RETRY_NOWAIT;
flags |= FAULT_FLAG_TRIED;
mmap_read_lock(mm);
goto retry;
} }
if (IS_ENABLED(CONFIG_PGSTE) && gmap) { if (IS_ENABLED(CONFIG_PGSTE) && gmap) {
address = __gmap_link(gmap, current->thread.gmap_addr, address = __gmap_link(gmap, current->thread.gmap_addr,

View File

@ -452,3 +452,4 @@
# 447 reserved for memfd_secret # 447 reserved for memfd_secret
448 common process_mrelease sys_process_mrelease 448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv 449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node

View File

@ -485,17 +485,15 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
if (mm_fault_error(regs, error_code, address, fault)) if (mm_fault_error(regs, error_code, address, fault))
return; return;
if (flags & FAULT_FLAG_ALLOW_RETRY) { if (fault & VM_FAULT_RETRY) {
if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED;
flags |= FAULT_FLAG_TRIED;
/* /*
* No need to mmap_read_unlock(mm) as we would * No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry * have already released it in __lock_page_or_retry
* in mm/filemap.c. * in mm/filemap.c.
*/ */
goto retry; goto retry;
}
} }
mmap_read_unlock(mm); mmap_read_unlock(mm);

View File

@ -495,3 +495,4 @@
# 447 reserved for memfd_secret # 447 reserved for memfd_secret
448 common process_mrelease sys_process_mrelease 448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv 449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node

View File

@ -200,17 +200,15 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write,
BUG(); BUG();
} }
if (flags & FAULT_FLAG_ALLOW_RETRY) { if (fault & VM_FAULT_RETRY) {
if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED;
flags |= FAULT_FLAG_TRIED;
/* No need to mmap_read_unlock(mm) as we would /* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry * have already released it in __lock_page_or_retry
* in mm/filemap.c. * in mm/filemap.c.
*/ */
goto retry; goto retry;
}
} }
mmap_read_unlock(mm); mmap_read_unlock(mm);

View File

@ -437,17 +437,15 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
BUG(); BUG();
} }
if (flags & FAULT_FLAG_ALLOW_RETRY) { if (fault & VM_FAULT_RETRY) {
if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED;
flags |= FAULT_FLAG_TRIED;
/* No need to mmap_read_unlock(mm) as we would /* No need to mmap_read_unlock(mm) as we would
* have already released it in __lock_page_or_retry * have already released it in __lock_page_or_retry
* in mm/filemap.c. * in mm/filemap.c.
*/ */
goto retry; goto retry;
}
} }
mmap_read_unlock(mm); mmap_read_unlock(mm);

View File

@ -87,12 +87,10 @@ int handle_page_fault(unsigned long address, unsigned long ip,
} }
BUG(); BUG();
} }
if (flags & FAULT_FLAG_ALLOW_RETRY) { if (fault & VM_FAULT_RETRY) {
if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED;
flags |= FAULT_FLAG_TRIED;
goto retry; goto retry;
}
} }
pmd = pmd_off(mm, address); pmd = pmd_off(mm, address);

View File

@ -104,6 +104,7 @@ config X86
select ARCH_SUPPORTS_ACPI select ARCH_SUPPORTS_ACPI
select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_DEBUG_PAGEALLOC select ARCH_SUPPORTS_DEBUG_PAGEALLOC
select ARCH_SUPPORTS_PAGE_TABLE_CHECK if X86_64
select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096
select ARCH_SUPPORTS_LTO_CLANG select ARCH_SUPPORTS_LTO_CLANG

View File

@ -454,3 +454,4 @@
447 i386 memfd_secret sys_memfd_secret 447 i386 memfd_secret sys_memfd_secret
448 i386 process_mrelease sys_process_mrelease 448 i386 process_mrelease sys_process_mrelease
449 i386 futex_waitv sys_futex_waitv 449 i386 futex_waitv sys_futex_waitv
450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node

View File

@ -371,6 +371,7 @@
447 common memfd_secret sys_memfd_secret 447 common memfd_secret sys_memfd_secret
448 common process_mrelease sys_process_mrelease 448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv 449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
# #
# Due to a historical design error, certain syscalls are numbered differently # Due to a historical design error, certain syscalls are numbered differently

View File

@ -27,6 +27,7 @@
#include <asm/pkru.h> #include <asm/pkru.h>
#include <asm/fpu/api.h> #include <asm/fpu/api.h>
#include <asm-generic/pgtable_uffd.h> #include <asm-generic/pgtable_uffd.h>
#include <linux/page_table_check.h>
extern pgd_t early_top_pgt[PTRS_PER_PGD]; extern pgd_t early_top_pgt[PTRS_PER_PGD];
bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd); bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
@ -753,7 +754,7 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
return true; return true;
if ((pte_flags(a) & _PAGE_PROTNONE) && if ((pte_flags(a) & _PAGE_PROTNONE) &&
mm_tlb_flush_pending(mm)) atomic_read(&mm->tlb_flush_pending))
return true; return true;
return false; return false;
@ -1007,18 +1008,21 @@ static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp)
static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte) pte_t *ptep, pte_t pte)
{ {
page_table_check_pte_set(mm, addr, ptep, pte);
set_pte(ptep, pte); set_pte(ptep, pte);
} }
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd) pmd_t *pmdp, pmd_t pmd)
{ {
page_table_check_pmd_set(mm, addr, pmdp, pmd);
set_pmd(pmdp, pmd); set_pmd(pmdp, pmd);
} }
static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
pud_t *pudp, pud_t pud) pud_t *pudp, pud_t pud)
{ {
page_table_check_pud_set(mm, addr, pudp, pud);
native_set_pud(pudp, pud); native_set_pud(pudp, pud);
} }
@ -1049,6 +1053,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep) pte_t *ptep)
{ {
pte_t pte = native_ptep_get_and_clear(ptep); pte_t pte = native_ptep_get_and_clear(ptep);
page_table_check_pte_clear(mm, addr, pte);
return pte; return pte;
} }
@ -1064,12 +1069,23 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
* care about updates and native needs no locking * care about updates and native needs no locking
*/ */
pte = native_local_ptep_get_and_clear(ptep); pte = native_local_ptep_get_and_clear(ptep);
page_table_check_pte_clear(mm, addr, pte);
} else { } else {
pte = ptep_get_and_clear(mm, addr, ptep); pte = ptep_get_and_clear(mm, addr, ptep);
} }
return pte; return pte;
} }
#define __HAVE_ARCH_PTEP_CLEAR
/*
 * Clear the page table entry at @ptep for address @addr in @mm.
 *
 * When CONFIG_PAGE_TABLE_CHECK is enabled the entry is cleared through
 * ptep_get_and_clear() and the returned old pte is discarded; otherwise
 * the plain pte_clear() is used.
 *
 * NOTE(review): presumably the ptep_get_and_clear() path is chosen so that
 * the page_table_check_pte_clear() hook it invokes sees the old entry -
 * confirm against mm/page_table_check.c.
 */
static inline void ptep_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
if (IS_ENABLED(CONFIG_PAGE_TABLE_CHECK))
ptep_get_and_clear(mm, addr, ptep);
else
pte_clear(mm, addr, ptep);
}
#define __HAVE_ARCH_PTEP_SET_WRPROTECT #define __HAVE_ARCH_PTEP_SET_WRPROTECT
static inline void ptep_set_wrprotect(struct mm_struct *mm, static inline void ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep) unsigned long addr, pte_t *ptep)
@ -1110,14 +1126,22 @@ static inline int pmd_write(pmd_t pmd)
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp) pmd_t *pmdp)
{ {
return native_pmdp_get_and_clear(pmdp); pmd_t pmd = native_pmdp_get_and_clear(pmdp);
page_table_check_pmd_clear(mm, addr, pmd);
return pmd;
} }
#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR #define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
unsigned long addr, pud_t *pudp) unsigned long addr, pud_t *pudp)
{ {
return native_pudp_get_and_clear(pudp); pud_t pud = native_pudp_get_and_clear(pudp);
page_table_check_pud_clear(mm, addr, pud);
return pud;
} }
#define __HAVE_ARCH_PMDP_SET_WRPROTECT #define __HAVE_ARCH_PMDP_SET_WRPROTECT
@ -1138,6 +1162,7 @@ static inline int pud_write(pud_t pud)
static inline pmd_t pmdp_establish(struct vm_area_struct *vma, static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp, pmd_t pmd) unsigned long address, pmd_t *pmdp, pmd_t pmd)
{ {
page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd);
if (IS_ENABLED(CONFIG_SMP)) { if (IS_ENABLED(CONFIG_SMP)) {
return xchg(pmdp, pmd); return xchg(pmdp, pmd);
} else { } else {

View File

@ -67,6 +67,7 @@ static unsigned long int get_module_load_offset(void)
void *module_alloc(unsigned long size) void *module_alloc(unsigned long size)
{ {
gfp_t gfp_mask = GFP_KERNEL;
void *p; void *p;
if (PAGE_ALIGN(size) > MODULES_LEN) if (PAGE_ALIGN(size) > MODULES_LEN)
@ -74,10 +75,10 @@ void *module_alloc(unsigned long size)
p = __vmalloc_node_range(size, MODULE_ALIGN, p = __vmalloc_node_range(size, MODULE_ALIGN,
MODULES_VADDR + get_module_load_offset(), MODULES_VADDR + get_module_load_offset(),
MODULES_END, GFP_KERNEL, MODULES_END, gfp_mask,
PAGE_KERNEL, 0, NUMA_NO_NODE, PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE,
__builtin_return_address(0)); __builtin_return_address(0));
if (p && (kasan_module_alloc(p, size) < 0)) { if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) {
vfree(p); vfree(p);
return NULL; return NULL;
} }

View File

@ -1413,8 +1413,7 @@ void do_user_addr_fault(struct pt_regs *regs,
* and if there is a fatal signal pending there is no guarantee * and if there is a fatal signal pending there is no guarantee
* that we made any progress. Handle this case first. * that we made any progress. Handle this case first.
*/ */
if (unlikely((fault & VM_FAULT_RETRY) && if (unlikely(fault & VM_FAULT_RETRY)) {
(flags & FAULT_FLAG_ALLOW_RETRY))) {
flags |= FAULT_FLAG_TRIED; flags |= FAULT_FLAG_TRIED;
goto retry; goto retry;
} }

View File

@ -420,3 +420,4 @@
# 447 reserved for memfd_secret # 447 reserved for memfd_secret
448 common process_mrelease sys_process_mrelease 448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv 449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node

View File

@ -127,17 +127,16 @@ void do_page_fault(struct pt_regs *regs)
goto do_sigbus; goto do_sigbus;
BUG(); BUG();
} }
if (flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
/* No need to mmap_read_unlock(mm) as we would if (fault & VM_FAULT_RETRY) {
* have already released it in __lock_page_or_retry flags |= FAULT_FLAG_TRIED;
* in mm/filemap.c.
*/
goto retry; /* No need to mmap_read_unlock(mm) as we would
} * have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
goto retry;
} }
mmap_read_unlock(mm); mmap_read_unlock(mm);

View File

@ -1903,14 +1903,7 @@ static struct attribute *zram_disk_attrs[] = {
NULL, NULL,
}; };
static const struct attribute_group zram_disk_attr_group = { ATTRIBUTE_GROUPS(zram_disk);
.attrs = zram_disk_attrs,
};
static const struct attribute_group *zram_disk_attr_groups[] = {
&zram_disk_attr_group,
NULL,
};
/* /*
* Allocate and initialize new zram device. the function returns * Allocate and initialize new zram device. the function returns
@ -1983,7 +1976,7 @@ static int zram_add(void)
blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX); blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX);
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue); blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue);
ret = device_add_disk(NULL, zram->disk, zram_disk_attr_groups); ret = device_add_disk(NULL, zram->disk, zram_disk_groups);
if (ret) if (ret)
goto out_cleanup_disk; goto out_cleanup_disk;

View File

@ -127,11 +127,35 @@ ATTRIBUTE_GROUPS(dax_drv);
static int dax_bus_match(struct device *dev, struct device_driver *drv); static int dax_bus_match(struct device *dev, struct device_driver *drv);
/*
* Static dax regions are regions created by an external subsystem (such as
* nvdimm) where a single range is assigned. Their boundaries are set by the
* external subsystem and are usually limited to one physical memory range.
* For example, for PMEM they are usually defined by NVDIMM Namespace
* boundaries (i.e. a single contiguous range).
*
* On dynamic dax regions, the assigned region can be partitioned by dax core
* into multiple subdivisions. A subdivision is represented by one
* /dev/daxN.M device composed of one or more potentially discontiguous ranges.
*
* When allocating a dax region, drivers must set whether it's static
* (IORESOURCE_DAX_STATIC). On static dax devices, the @pgmap is pre-assigned
* to dax core when calling devm_create_dev_dax(), whereas in dynamic dax
* devices it is NULL but afterwards allocated by dax core on device ->probe().
* Care is needed to make sure that dynamic dax devices are torn down with a
* cleared @pgmap field (see kill_dev_dax()).
*/
static bool is_static(struct dax_region *dax_region) static bool is_static(struct dax_region *dax_region)
{ {
return (dax_region->res.flags & IORESOURCE_DAX_STATIC) != 0; return (dax_region->res.flags & IORESOURCE_DAX_STATIC) != 0;
} }
/*
 * Report whether the region referenced by @dev_dax->region is a static dax
 * region, as decided by is_static().
 */
bool static_dev_dax(struct dev_dax *dev_dax)
{
	struct dax_region *region = dev_dax->region;

	return is_static(region);
}
EXPORT_SYMBOL_GPL(static_dev_dax);
static u64 dev_dax_size(struct dev_dax *dev_dax) static u64 dev_dax_size(struct dev_dax *dev_dax)
{ {
u64 size = 0; u64 size = 0;
@ -361,6 +385,14 @@ void kill_dev_dax(struct dev_dax *dev_dax)
kill_dax(dax_dev); kill_dax(dax_dev);
unmap_mapping_range(inode->i_mapping, 0, 0, 1); unmap_mapping_range(inode->i_mapping, 0, 0, 1);
/*
* Dynamic dax regions have the pgmap allocated via devm_kzalloc()
* and thus freed by devm. Clear the pgmap to not have stale pgmap
* ranges on probe() from previous reconfigurations of region devices.
*/
if (!static_dev_dax(dev_dax))
dev_dax->pgmap = NULL;
} }
EXPORT_SYMBOL_GPL(kill_dev_dax); EXPORT_SYMBOL_GPL(kill_dev_dax);

View File

@ -39,6 +39,7 @@ int __dax_driver_register(struct dax_device_driver *dax_drv,
__dax_driver_register(driver, THIS_MODULE, KBUILD_MODNAME) __dax_driver_register(driver, THIS_MODULE, KBUILD_MODNAME)
void dax_driver_unregister(struct dax_device_driver *dax_drv); void dax_driver_unregister(struct dax_device_driver *dax_drv);
void kill_dev_dax(struct dev_dax *dev_dax); void kill_dev_dax(struct dev_dax *dev_dax);
bool static_dev_dax(struct dev_dax *dev_dax);
/* /*
* While run_dax() is potentially a generic operation that could be * While run_dax() is potentially a generic operation that could be

View File

@ -73,11 +73,39 @@ __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
return -1; return -1;
} }
/*
 * Record the file's address_space and the file offset on the struct pages
 * that back a device-dax fault.
 *
 * @vmf:        fault being handled; supplies the vma, the file and the
 *              faulting address
 * @pfn:        pfn from which the pages are looked up (pfn + i)
 * @fault_size: size in bytes of the mapping being installed; must be a
 *              power of two for the mask below to be valid (callers in
 *              this file pass PAGE_SIZE, PMD_SIZE or PUD_SIZE)
 *
 * Pages that already have ->mapping set are left untouched.
 */
static void dax_set_mapping(struct vm_fault *vmf, pfn_t pfn,
			    unsigned long fault_size)
{
	unsigned long i, nr_pages = fault_size / PAGE_SIZE;
	struct file *filp = vmf->vma->vm_file;
	struct dev_dax *dev_dax = filp->private_data;
	pgoff_t pgoff;

	/* mapping is only set on the head */
	if (dev_dax->pgmap->vmemmap_shift)
		nr_pages = 1;

	/*
	 * Round the faulting address DOWN to the start of the fault_size
	 * region. ALIGN() rounds up, which for an address that is not
	 * already aligned yields the offset of the *next* region and so a
	 * wrong page->index.
	 */
	pgoff = linear_page_index(vmf->vma,
			vmf->address & ~(fault_size - 1));

	for (i = 0; i < nr_pages; i++) {
		struct page *page = pfn_to_page(pfn_t_to_pfn(pfn) + i);

		page = compound_head(page);
		if (page->mapping)
			continue;

		page->mapping = filp->f_mapping;
		page->index = pgoff + i;
	}
}
static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax, static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
struct vm_fault *vmf, pfn_t *pfn) struct vm_fault *vmf)
{ {
struct device *dev = &dev_dax->dev; struct device *dev = &dev_dax->dev;
phys_addr_t phys; phys_addr_t phys;
pfn_t pfn;
unsigned int fault_size = PAGE_SIZE; unsigned int fault_size = PAGE_SIZE;
if (check_vma(dev_dax, vmf->vma, __func__)) if (check_vma(dev_dax, vmf->vma, __func__))
@ -98,18 +126,21 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
return VM_FAULT_SIGBUS; return VM_FAULT_SIGBUS;
} }
*pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
return vmf_insert_mixed(vmf->vma, vmf->address, *pfn); dax_set_mapping(vmf, pfn, fault_size);
return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
} }
static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax, static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
struct vm_fault *vmf, pfn_t *pfn) struct vm_fault *vmf)
{ {
unsigned long pmd_addr = vmf->address & PMD_MASK; unsigned long pmd_addr = vmf->address & PMD_MASK;
struct device *dev = &dev_dax->dev; struct device *dev = &dev_dax->dev;
phys_addr_t phys; phys_addr_t phys;
pgoff_t pgoff; pgoff_t pgoff;
pfn_t pfn;
unsigned int fault_size = PMD_SIZE; unsigned int fault_size = PMD_SIZE;
if (check_vma(dev_dax, vmf->vma, __func__)) if (check_vma(dev_dax, vmf->vma, __func__))
@ -138,19 +169,22 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
return VM_FAULT_SIGBUS; return VM_FAULT_SIGBUS;
} }
*pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
return vmf_insert_pfn_pmd(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE); dax_set_mapping(vmf, pfn, fault_size);
return vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
} }
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
struct vm_fault *vmf, pfn_t *pfn) struct vm_fault *vmf)
{ {
unsigned long pud_addr = vmf->address & PUD_MASK; unsigned long pud_addr = vmf->address & PUD_MASK;
struct device *dev = &dev_dax->dev; struct device *dev = &dev_dax->dev;
phys_addr_t phys; phys_addr_t phys;
pgoff_t pgoff; pgoff_t pgoff;
pfn_t pfn;
unsigned int fault_size = PUD_SIZE; unsigned int fault_size = PUD_SIZE;
@ -180,13 +214,15 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
return VM_FAULT_SIGBUS; return VM_FAULT_SIGBUS;
} }
*pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
return vmf_insert_pfn_pud(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE); dax_set_mapping(vmf, pfn, fault_size);
return vmf_insert_pfn_pud(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
} }
#else #else
static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
struct vm_fault *vmf, pfn_t *pfn) struct vm_fault *vmf)
{ {
return VM_FAULT_FALLBACK; return VM_FAULT_FALLBACK;
} }
@ -196,10 +232,8 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf,
enum page_entry_size pe_size) enum page_entry_size pe_size)
{ {
struct file *filp = vmf->vma->vm_file; struct file *filp = vmf->vma->vm_file;
unsigned long fault_size;
vm_fault_t rc = VM_FAULT_SIGBUS; vm_fault_t rc = VM_FAULT_SIGBUS;
int id; int id;
pfn_t pfn;
struct dev_dax *dev_dax = filp->private_data; struct dev_dax *dev_dax = filp->private_data;
dev_dbg(&dev_dax->dev, "%s: %s (%#lx - %#lx) size = %d\n", current->comm, dev_dbg(&dev_dax->dev, "%s: %s (%#lx - %#lx) size = %d\n", current->comm,
@ -209,43 +243,18 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf,
id = dax_read_lock(); id = dax_read_lock();
switch (pe_size) { switch (pe_size) {
case PE_SIZE_PTE: case PE_SIZE_PTE:
fault_size = PAGE_SIZE; rc = __dev_dax_pte_fault(dev_dax, vmf);
rc = __dev_dax_pte_fault(dev_dax, vmf, &pfn);
break; break;
case PE_SIZE_PMD: case PE_SIZE_PMD:
fault_size = PMD_SIZE; rc = __dev_dax_pmd_fault(dev_dax, vmf);
rc = __dev_dax_pmd_fault(dev_dax, vmf, &pfn);
break; break;
case PE_SIZE_PUD: case PE_SIZE_PUD:
fault_size = PUD_SIZE; rc = __dev_dax_pud_fault(dev_dax, vmf);
rc = __dev_dax_pud_fault(dev_dax, vmf, &pfn);
break; break;
default: default:
rc = VM_FAULT_SIGBUS; rc = VM_FAULT_SIGBUS;
} }
if (rc == VM_FAULT_NOPAGE) {
unsigned long i;
pgoff_t pgoff;
/*
* In the device-dax case the only possibility for a
* VM_FAULT_NOPAGE result is when device-dax capacity is
* mapped. No need to consider the zero page, or racing
* conflicting mappings.
*/
pgoff = linear_page_index(vmf->vma, vmf->address
& ~(fault_size - 1));
for (i = 0; i < fault_size / PAGE_SIZE; i++) {
struct page *page;
page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
if (page->mapping)
continue;
page->mapping = filp->f_mapping;
page->index = pgoff + i;
}
}
dax_read_unlock(id); dax_read_unlock(id);
return rc; return rc;
@ -398,17 +407,34 @@ int dev_dax_probe(struct dev_dax *dev_dax)
void *addr; void *addr;
int rc, i; int rc, i;
pgmap = dev_dax->pgmap; if (static_dev_dax(dev_dax)) {
if (dev_WARN_ONCE(dev, pgmap && dev_dax->nr_range > 1, if (dev_dax->nr_range > 1) {
"static pgmap / multi-range device conflict\n")) dev_warn(dev,
return -EINVAL; "static pgmap / multi-range device conflict\n");
return -EINVAL;
}
if (!pgmap) { pgmap = dev_dax->pgmap;
pgmap = devm_kzalloc(dev, sizeof(*pgmap) + sizeof(struct range) } else {
* (dev_dax->nr_range - 1), GFP_KERNEL); if (dev_dax->pgmap) {
dev_warn(dev,
"dynamic-dax with pre-populated page map\n");
return -EINVAL;
}
pgmap = devm_kzalloc(dev,
struct_size(pgmap, ranges, dev_dax->nr_range - 1),
GFP_KERNEL);
if (!pgmap) if (!pgmap)
return -ENOMEM; return -ENOMEM;
pgmap->nr_range = dev_dax->nr_range; pgmap->nr_range = dev_dax->nr_range;
dev_dax->pgmap = pgmap;
for (i = 0; i < dev_dax->nr_range; i++) {
struct range *range = &dev_dax->ranges[i].range;
pgmap->ranges[i] = *range;
}
} }
for (i = 0; i < dev_dax->nr_range; i++) { for (i = 0; i < dev_dax->nr_range; i++) {
@ -420,12 +446,12 @@ int dev_dax_probe(struct dev_dax *dev_dax)
i, range->start, range->end); i, range->start, range->end);
return -EBUSY; return -EBUSY;
} }
/* don't update the range for static pgmap */
if (!dev_dax->pgmap)
pgmap->ranges[i] = *range;
} }
pgmap->type = MEMORY_DEVICE_GENERIC; pgmap->type = MEMORY_DEVICE_GENERIC;
if (dev_dax->align > PAGE_SIZE)
pgmap->vmemmap_shift =
order_base_2(dev_dax->align >> PAGE_SHIFT);
addr = devm_memremap_pages(dev, pgmap); addr = devm_memremap_pages(dev, pgmap);
if (IS_ERR(addr)) if (IS_ERR(addr))
return PTR_ERR(addr); return PTR_ERR(addr);

View File

@ -98,15 +98,14 @@ static int siw_create_tx_threads(void)
continue; continue;
siw_tx_thread[cpu] = siw_tx_thread[cpu] =
kthread_create(siw_run_sq, (unsigned long *)(long)cpu, kthread_run_on_cpu(siw_run_sq,
"siw_tx/%d", cpu); (unsigned long *)(long)cpu,
cpu, "siw_tx/%u");
if (IS_ERR(siw_tx_thread[cpu])) { if (IS_ERR(siw_tx_thread[cpu])) {
siw_tx_thread[cpu] = NULL; siw_tx_thread[cpu] = NULL;
continue; continue;
} }
kthread_bind(siw_tx_thread[cpu], cpu);
wake_up_process(siw_tx_thread[cpu]);
assigned++; assigned++;
} }
return assigned; return assigned;

View File

@ -26,6 +26,7 @@
#include <linux/serial_core.h> #include <linux/serial_core.h>
#include <linux/sysfs.h> #include <linux/sysfs.h>
#include <linux/random.h> #include <linux/random.h>
#include <linux/kmemleak.h>
#include <asm/setup.h> /* for COMMAND_LINE_SIZE */ #include <asm/setup.h> /* for COMMAND_LINE_SIZE */
#include <asm/page.h> #include <asm/page.h>
@ -524,9 +525,12 @@ static int __init __reserved_mem_reserve_reg(unsigned long node,
size = dt_mem_next_cell(dt_root_size_cells, &prop); size = dt_mem_next_cell(dt_root_size_cells, &prop);
if (size && if (size &&
early_init_dt_reserve_memory_arch(base, size, nomap) == 0) early_init_dt_reserve_memory_arch(base, size, nomap) == 0) {
pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n", pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n",
uname, &base, (unsigned long)(size / SZ_1M)); uname, &base, (unsigned long)(size / SZ_1M));
if (!nomap)
kmemleak_alloc_phys(base, size, 0, 0);
}
else else
pr_info("Reserved memory: failed to reserve memory for node '%s': base %pa, size %lu MiB\n", pr_info("Reserved memory: failed to reserve memory for node '%s': base %pa, size %lu MiB\n",
uname, &base, (unsigned long)(size / SZ_1M)); uname, &base, (unsigned long)(size / SZ_1M));

View File

@ -27,8 +27,8 @@
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/uaccess.h> #include <linux/uaccess.h>
#include <linux/fiemap.h> #include <linux/fiemap.h>
#include <linux/backing-dev.h>
#include <linux/iomap.h> #include <linux/iomap.h>
#include <linux/sched/mm.h>
#include "ext4_jbd2.h" #include "ext4_jbd2.h"
#include "ext4_extents.h" #include "ext4_extents.h"
#include "xattr.h" #include "xattr.h"
@ -4404,8 +4404,7 @@ int ext4_ext_truncate(handle_t *handle, struct inode *inode)
err = ext4_es_remove_extent(inode, last_block, err = ext4_es_remove_extent(inode, last_block,
EXT_MAX_BLOCKS - last_block); EXT_MAX_BLOCKS - last_block);
if (err == -ENOMEM) { if (err == -ENOMEM) {
cond_resched(); memalloc_retry_wait(GFP_ATOMIC);
congestion_wait(BLK_RW_ASYNC, HZ/50);
goto retry; goto retry;
} }
if (err) if (err)
@ -4413,8 +4412,7 @@ int ext4_ext_truncate(handle_t *handle, struct inode *inode)
retry_remove_space: retry_remove_space:
err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
if (err == -ENOMEM) { if (err == -ENOMEM) {
cond_resched(); memalloc_retry_wait(GFP_ATOMIC);
congestion_wait(BLK_RW_ASYNC, HZ/50);
goto retry_remove_space; goto retry_remove_space;
} }
return err; return err;

View File

@ -7,7 +7,7 @@
#include <linux/iomap.h> #include <linux/iomap.h>
#include <linux/fiemap.h> #include <linux/fiemap.h>
#include <linux/iversion.h> #include <linux/iversion.h>
#include <linux/backing-dev.h> #include <linux/sched/mm.h>
#include "ext4_jbd2.h" #include "ext4_jbd2.h"
#include "ext4.h" #include "ext4.h"
@ -1929,8 +1929,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
retry: retry:
err = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); err = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
if (err == -ENOMEM) { if (err == -ENOMEM) {
cond_resched(); memalloc_retry_wait(GFP_ATOMIC);
congestion_wait(BLK_RW_ASYNC, HZ/50);
goto retry; goto retry;
} }
if (err) if (err)

View File

@ -24,7 +24,7 @@
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/mm.h> #include <linux/mm.h>
#include <linux/backing-dev.h> #include <linux/sched/mm.h>
#include "ext4_jbd2.h" #include "ext4_jbd2.h"
#include "xattr.h" #include "xattr.h"
@ -523,12 +523,13 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
ret = PTR_ERR(bounce_page); ret = PTR_ERR(bounce_page);
if (ret == -ENOMEM && if (ret == -ENOMEM &&
(io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) { (io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) {
gfp_flags = GFP_NOFS; gfp_t new_gfp_flags = GFP_NOFS;
if (io->io_bio) if (io->io_bio)
ext4_io_submit(io); ext4_io_submit(io);
else else
gfp_flags |= __GFP_NOFAIL; new_gfp_flags |= __GFP_NOFAIL;
congestion_wait(BLK_RW_ASYNC, HZ/50); memalloc_retry_wait(gfp_flags);
gfp_flags = new_gfp_flags;
goto retry_encrypt; goto retry_encrypt;
} }

View File

@ -8,9 +8,9 @@
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/f2fs_fs.h> #include <linux/f2fs_fs.h>
#include <linux/buffer_head.h> #include <linux/buffer_head.h>
#include <linux/sched/mm.h>
#include <linux/mpage.h> #include <linux/mpage.h>
#include <linux/writeback.h> #include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h> #include <linux/pagevec.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/bio.h> #include <linux/bio.h>
@ -2542,7 +2542,7 @@ int f2fs_encrypt_one_page(struct f2fs_io_info *fio)
/* flush pending IOs and wait for a while in the ENOMEM case */ /* flush pending IOs and wait for a while in the ENOMEM case */
if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { if (PTR_ERR(fio->encrypted_page) == -ENOMEM) {
f2fs_flush_merged_writes(fio->sbi); f2fs_flush_merged_writes(fio->sbi);
congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); memalloc_retry_wait(GFP_NOFS);
gfp_flags |= __GFP_NOFAIL; gfp_flags |= __GFP_NOFAIL;
goto retry_encrypt; goto retry_encrypt;
} }

View File

@ -7,7 +7,6 @@
*/ */
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/init.h> #include <linux/init.h>
#include <linux/f2fs_fs.h> #include <linux/f2fs_fs.h>
#include <linux/kthread.h> #include <linux/kthread.h>
@ -15,6 +14,7 @@
#include <linux/freezer.h> #include <linux/freezer.h>
#include <linux/sched/signal.h> #include <linux/sched/signal.h>
#include <linux/random.h> #include <linux/random.h>
#include <linux/sched/mm.h>
#include "f2fs.h" #include "f2fs.h"
#include "node.h" #include "node.h"
@ -1375,8 +1375,7 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type,
if (err) { if (err) {
clear_page_private_gcing(page); clear_page_private_gcing(page);
if (err == -ENOMEM) { if (err == -ENOMEM) {
congestion_wait(BLK_RW_ASYNC, memalloc_retry_wait(GFP_NOFS);
DEFAULT_IO_TIMEOUT);
goto retry; goto retry;
} }
if (is_dirty) if (is_dirty)

View File

@ -8,8 +8,8 @@
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/f2fs_fs.h> #include <linux/f2fs_fs.h>
#include <linux/buffer_head.h> #include <linux/buffer_head.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h> #include <linux/writeback.h>
#include <linux/sched/mm.h>
#include "f2fs.h" #include "f2fs.h"
#include "node.h" #include "node.h"
@ -562,7 +562,7 @@ struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino)
inode = f2fs_iget(sb, ino); inode = f2fs_iget(sb, ino);
if (IS_ERR(inode)) { if (IS_ERR(inode)) {
if (PTR_ERR(inode) == -ENOMEM) { if (PTR_ERR(inode) == -ENOMEM) {
congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); memalloc_retry_wait(GFP_NOFS);
goto retry; goto retry;
} }
} }

View File

@ -8,7 +8,7 @@
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/f2fs_fs.h> #include <linux/f2fs_fs.h>
#include <linux/mpage.h> #include <linux/mpage.h>
#include <linux/backing-dev.h> #include <linux/sched/mm.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/pagevec.h> #include <linux/pagevec.h>
#include <linux/swap.h> #include <linux/swap.h>
@ -2750,7 +2750,7 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
retry: retry:
ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false); ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false);
if (!ipage) { if (!ipage) {
congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); memalloc_retry_wait(GFP_NOFS);
goto retry; goto retry;
} }

View File

@ -8,6 +8,7 @@
#include <asm/unaligned.h> #include <asm/unaligned.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/f2fs_fs.h> #include <linux/f2fs_fs.h>
#include <linux/sched/mm.h>
#include "f2fs.h" #include "f2fs.h"
#include "node.h" #include "node.h"
#include "segment.h" #include "segment.h"
@ -587,7 +588,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
err = f2fs_get_dnode_of_data(&dn, start, ALLOC_NODE); err = f2fs_get_dnode_of_data(&dn, start, ALLOC_NODE);
if (err) { if (err) {
if (err == -ENOMEM) { if (err == -ENOMEM) {
congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); memalloc_retry_wait(GFP_NOFS);
goto retry_dn; goto retry_dn;
} }
goto out; goto out;
@ -670,8 +671,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
err = check_index_in_prev_nodes(sbi, dest, &dn); err = check_index_in_prev_nodes(sbi, dest, &dn);
if (err) { if (err) {
if (err == -ENOMEM) { if (err == -ENOMEM) {
congestion_wait(BLK_RW_ASYNC, memalloc_retry_wait(GFP_NOFS);
DEFAULT_IO_TIMEOUT);
goto retry_prev; goto retry_prev;
} }
goto err; goto err;

View File

@ -9,6 +9,7 @@
#include <linux/f2fs_fs.h> #include <linux/f2fs_fs.h>
#include <linux/bio.h> #include <linux/bio.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/sched/mm.h>
#include <linux/prefetch.h> #include <linux/prefetch.h>
#include <linux/kthread.h> #include <linux/kthread.h>
#include <linux/swap.h> #include <linux/swap.h>
@ -245,9 +246,7 @@ static int __revoke_inmem_pages(struct inode *inode,
LOOKUP_NODE); LOOKUP_NODE);
if (err) { if (err) {
if (err == -ENOMEM) { if (err == -ENOMEM) {
congestion_wait(BLK_RW_ASYNC, memalloc_retry_wait(GFP_NOFS);
DEFAULT_IO_TIMEOUT);
cond_resched();
goto retry; goto retry;
} }
err = -EAGAIN; err = -EAGAIN;
@ -424,9 +423,7 @@ static int __f2fs_commit_inmem_pages(struct inode *inode)
err = f2fs_do_write_data_page(&fio); err = f2fs_do_write_data_page(&fio);
if (err) { if (err) {
if (err == -ENOMEM) { if (err == -ENOMEM) {
congestion_wait(BLK_RW_ASYNC, memalloc_retry_wait(GFP_NOFS);
DEFAULT_IO_TIMEOUT);
cond_resched();
goto retry; goto retry;
} }
unlock_page(page); unlock_page(page);

View File

@ -8,9 +8,9 @@
#include <linux/module.h> #include <linux/module.h>
#include <linux/init.h> #include <linux/init.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/sched/mm.h>
#include <linux/statfs.h> #include <linux/statfs.h>
#include <linux/buffer_head.h> #include <linux/buffer_head.h>
#include <linux/backing-dev.h>
#include <linux/kthread.h> #include <linux/kthread.h>
#include <linux/parser.h> #include <linux/parser.h>
#include <linux/mount.h> #include <linux/mount.h>
@ -2415,8 +2415,7 @@ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data,
page = read_cache_page_gfp(mapping, blkidx, GFP_NOFS); page = read_cache_page_gfp(mapping, blkidx, GFP_NOFS);
if (IS_ERR(page)) { if (IS_ERR(page)) {
if (PTR_ERR(page) == -ENOMEM) { if (PTR_ERR(page) == -ENOMEM) {
congestion_wait(BLK_RW_ASYNC, memalloc_retry_wait(GFP_NOFS);
DEFAULT_IO_TIMEOUT);
goto repeat; goto repeat;
} }
set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);

View File

@ -409,10 +409,11 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
struct vm_area_struct *vma; struct vm_area_struct *vma;
/* /*
* end == 0 indicates that the entire range after * end == 0 indicates that the entire range after start should be
* start should be unmapped. * unmapped. Note, end is exclusive, whereas the interval tree takes
* an inclusive "last".
*/ */
vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) { vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) {
unsigned long v_offset; unsigned long v_offset;
unsigned long v_end; unsigned long v_end;

View File

@ -526,6 +526,55 @@ void __remove_inode_hash(struct inode *inode)
} }
EXPORT_SYMBOL(__remove_inode_hash); EXPORT_SYMBOL(__remove_inode_hash);
void dump_mapping(const struct address_space *mapping)
{
struct inode *host;
const struct address_space_operations *a_ops;
struct hlist_node *dentry_first;
struct dentry *dentry_ptr;
struct dentry dentry;
unsigned long ino;
/*
* If mapping is an invalid pointer, we don't want to crash
* accessing it, so probe everything depending on it carefully.
*/
if (get_kernel_nofault(host, &mapping->host) ||
get_kernel_nofault(a_ops, &mapping->a_ops)) {
pr_warn("invalid mapping:%px\n", mapping);
return;
}
if (!host) {
pr_warn("aops:%ps\n", a_ops);
return;
}
if (get_kernel_nofault(dentry_first, &host->i_dentry.first) ||
get_kernel_nofault(ino, &host->i_ino)) {
pr_warn("aops:%ps invalid inode:%px\n", a_ops, host);
return;
}
if (!dentry_first) {
pr_warn("aops:%ps ino:%lx\n", a_ops, ino);
return;
}
dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias);
if (get_kernel_nofault(dentry, dentry_ptr)) {
pr_warn("aops:%ps ino:%lx invalid dentry:%px\n",
a_ops, ino, dentry_ptr);
return;
}
/*
* if dentry is corrupted, the %pd handler may still crash,
* but it's unlikely that we reach here with a corrupt mapping
*/
pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n", a_ops, ino, &dentry);
}
void clear_inode(struct inode *inode) void clear_inode(struct inode *inode)
{ {
/* /*

View File

@ -430,7 +430,7 @@ static int ioctl_file_dedupe_range(struct file *file,
goto out; goto out;
} }
size = offsetof(struct file_dedupe_range __user, info[count]); size = offsetof(struct file_dedupe_range, info[count]);
if (size > PAGE_SIZE) { if (size > PAGE_SIZE) {
ret = -ENOMEM; ret = -ENOMEM;
goto out; goto out;

View File

@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later // SPDX-License-Identifier: GPL-2.0-or-later
/** /*
* attrib.c - NTFS attribute operations. Part of the Linux-NTFS project. * attrib.c - NTFS attribute operations. Part of the Linux-NTFS project.
* *
* Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.

View File

@ -2040,7 +2040,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
int i, idx; int i, idx;
struct ocfs2_extent_list *el, *left_el, *right_el; struct ocfs2_extent_list *el, *left_el, *right_el;
struct ocfs2_extent_rec *left_rec, *right_rec; struct ocfs2_extent_rec *left_rec, *right_rec;
struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; struct buffer_head *root_bh;
/* /*
* Update the counts and position values within all the * Update the counts and position values within all the

View File

@ -1799,20 +1799,20 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
*/ */
ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
cluster_of_pages, mmap_page); cluster_of_pages, mmap_page);
if (ret && ret != -EAGAIN) { if (ret) {
mlog_errno(ret); /*
goto out_quota; * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock
} * the target page. In this case, we exit with no error and no target
* page. This will trigger the caller, page_mkwrite(), to re-try
* the operation.
*/
if (type == OCFS2_WRITE_MMAP && ret == -EAGAIN) {
BUG_ON(wc->w_target_page);
ret = 0;
goto out_quota;
}
/* mlog_errno(ret);
* ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock
* the target page. In this case, we exit with no error and no target
* page. This will trigger the caller, page_mkwrite(), to re-try
* the operation.
*/
if (ret == -EAGAIN) {
BUG_ON(wc->w_target_page);
ret = 0;
goto out_quota; goto out_quota;
} }

View File

@ -120,7 +120,8 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
define_mask(KTHREAD), define_mask(KTHREAD),
}; };
static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; static struct attribute *mlog_default_attrs[MLOG_MAX_BITS] = {NULL, };
ATTRIBUTE_GROUPS(mlog_default);
static ssize_t mlog_show(struct kobject *obj, struct attribute *attr, static ssize_t mlog_show(struct kobject *obj, struct attribute *attr,
char *buf) char *buf)
@ -144,8 +145,8 @@ static const struct sysfs_ops mlog_attr_ops = {
}; };
static struct kobj_type mlog_ktype = { static struct kobj_type mlog_ktype = {
.default_attrs = mlog_attr_ptrs, .default_groups = mlog_default_groups,
.sysfs_ops = &mlog_attr_ops, .sysfs_ops = &mlog_attr_ops,
}; };
static struct kset mlog_kset = { static struct kset mlog_kset = {
@ -157,10 +158,10 @@ int mlog_sys_init(struct kset *o2cb_kset)
int i = 0; int i = 0;
while (mlog_attrs[i].attr.mode) { while (mlog_attrs[i].attr.mode) {
mlog_attr_ptrs[i] = &mlog_attrs[i].attr; mlog_default_attrs[i] = &mlog_attrs[i].attr;
i++; i++;
} }
mlog_attr_ptrs[i] = NULL; mlog_default_attrs[i] = NULL;
kobject_set_name(&mlog_kset.kobj, "logmask"); kobject_set_name(&mlog_kset.kobj, "logmask");
mlog_kset.kobj.kset = o2cb_kset; mlog_kset.kobj.kset = o2cb_kset;

View File

@ -3343,7 +3343,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
struct ocfs2_dir_entry *de, *last_de = NULL; struct ocfs2_dir_entry *de, *last_de = NULL;
char *de_buf, *limit; char *de_buf, *limit;
unsigned long offset = 0; unsigned long offset = 0;
unsigned int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize; unsigned int rec_len, new_rec_len, free_space;
/* /*
* This calculates how many free bytes we'd have in block zero, should * This calculates how many free bytes we'd have in block zero, should

View File

@ -94,6 +94,7 @@ static struct attribute *ocfs2_filecheck_attrs[] = {
&ocfs2_filecheck_attr_set.attr, &ocfs2_filecheck_attr_set.attr,
NULL NULL
}; };
ATTRIBUTE_GROUPS(ocfs2_filecheck);
static void ocfs2_filecheck_release(struct kobject *kobj) static void ocfs2_filecheck_release(struct kobject *kobj)
{ {
@ -138,7 +139,7 @@ static const struct sysfs_ops ocfs2_filecheck_ops = {
}; };
static struct kobj_type ocfs2_ktype_filecheck = { static struct kobj_type ocfs2_ktype_filecheck = {
.default_attrs = ocfs2_filecheck_attrs, .default_groups = ocfs2_filecheck_groups,
.sysfs_ops = &ocfs2_filecheck_ops, .sysfs_ops = &ocfs2_filecheck_ops,
.release = ocfs2_filecheck_release, .release = ocfs2_filecheck_release,
}; };

View File

@ -1669,8 +1669,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
status = jbd2_journal_load(journal); status = jbd2_journal_load(journal);
if (status < 0) { if (status < 0) {
mlog_errno(status); mlog_errno(status);
if (!igrab(inode)) BUG_ON(!igrab(inode));
BUG();
jbd2_journal_destroy(journal); jbd2_journal_destroy(journal);
goto done; goto done;
} }
@ -1699,8 +1698,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
if (status < 0) if (status < 0)
mlog_errno(status); mlog_errno(status);
if (!igrab(inode)) BUG_ON(!igrab(inode));
BUG();
jbd2_journal_destroy(journal); jbd2_journal_destroy(journal);

View File

@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0 // SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h> #include <linux/pagewalk.h>
#include <linux/vmacache.h> #include <linux/vmacache.h>
#include <linux/mm_inline.h>
#include <linux/hugetlb.h> #include <linux/hugetlb.h>
#include <linux/huge_mm.h> #include <linux/huge_mm.h>
#include <linux/mount.h> #include <linux/mount.h>
@ -308,6 +309,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
name = arch_vma_name(vma); name = arch_vma_name(vma);
if (!name) { if (!name) {
const char *anon_name;
if (!mm) { if (!mm) {
name = "[vdso]"; name = "[vdso]";
goto done; goto done;
@ -319,8 +322,16 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
goto done; goto done;
} }
if (is_stack(vma)) if (is_stack(vma)) {
name = "[stack]"; name = "[stack]";
goto done;
}
anon_name = vma_anon_name(vma);
if (anon_name) {
seq_pad(m, ' ');
seq_printf(m, "[anon:%s]", anon_name);
}
} }
done: done:

View File

@ -29,6 +29,7 @@
#include <linux/module.h> #include <linux/module.h>
#include <linux/magic.h> #include <linux/magic.h>
#include <linux/xattr.h> #include <linux/xattr.h>
#include <linux/backing-dev.h>
#include "squashfs_fs.h" #include "squashfs_fs.h"
#include "squashfs_fs_sb.h" #include "squashfs_fs_sb.h"
@ -112,6 +113,24 @@ static const struct squashfs_decompressor *supported_squashfs_filesystem(
return decompressor; return decompressor;
} }
/*
 * Give @sb its own backing_dev_info named "squashfs_<major>_<minor>" and zero
 * that bdi's ra_pages and io_pages.
 *
 * Returns 0 on success, or the error from super_setup_bdi_name().
 */
static int squashfs_bdi_init(struct super_block *sb)
{
	int ret;

	/*
	 * Drop the reference on the current bdi and point s_bdi at the noop
	 * bdi before asking for a new one.
	 */
	bdi_put(sb->s_bdi);
	sb->s_bdi = &noop_backing_dev_info;

	ret = super_setup_bdi_name(sb, "squashfs_%u_%u",
				   MAJOR(sb->s_dev), MINOR(sb->s_dev));
	if (!ret) {
		sb->s_bdi->ra_pages = 0;
		sb->s_bdi->io_pages = 0;
	}

	return ret;
}
static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
{ {
@ -127,6 +146,20 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
TRACE("Entered squashfs_fill_superblock\n"); TRACE("Entered squashfs_fill_superblock\n");
/*
* squashfs provides 'backing_dev_info' in order to disable read-ahead. For
* squashfs, I/O is not deferred, it is done immediately in readpage,
* which means the user would always have to wait for their own I/O. So the
* effect of readahead is very weak for squashfs. squashfs_bdi_init will set
* sb->s_bdi->ra_pages and sb->s_bdi->io_pages to 0 and disable readahead for
* squashfs.
*/
err = squashfs_bdi_init(sb);
if (err) {
errorf(fc, "squashfs init bdi failed");
return err;
}
sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL); sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL);
if (sb->s_fs_info == NULL) { if (sb->s_fs_info == NULL) {
ERROR("Failed to allocate squashfs_sb_info\n"); ERROR("Failed to allocate squashfs_sb_info\n");

View File

@ -15,6 +15,7 @@
#include <linux/sched/signal.h> #include <linux/sched/signal.h>
#include <linux/sched/mm.h> #include <linux/sched/mm.h>
#include <linux/mm.h> #include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h> #include <linux/mmu_notifier.h>
#include <linux/poll.h> #include <linux/poll.h>
#include <linux/slab.h> #include <linux/slab.h>
@ -877,7 +878,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
new_flags, vma->anon_vma, new_flags, vma->anon_vma,
vma->vm_file, vma->vm_pgoff, vma->vm_file, vma->vm_pgoff,
vma_policy(vma), vma_policy(vma),
NULL_VM_UFFD_CTX); NULL_VM_UFFD_CTX, vma_anon_name(vma));
if (prev) if (prev)
vma = prev; vma = prev;
else else
@ -1436,7 +1437,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
prev = vma_merge(mm, prev, start, vma_end, new_flags, prev = vma_merge(mm, prev, start, vma_end, new_flags,
vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma->anon_vma, vma->vm_file, vma->vm_pgoff,
vma_policy(vma), vma_policy(vma),
((struct vm_userfaultfd_ctx){ ctx })); ((struct vm_userfaultfd_ctx){ ctx }),
vma_anon_name(vma));
if (prev) { if (prev) {
vma = prev; vma = prev;
goto next; goto next;
@ -1613,7 +1615,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
prev = vma_merge(mm, prev, start, vma_end, new_flags, prev = vma_merge(mm, prev, start, vma_end, new_flags,
vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma->anon_vma, vma->vm_file, vma->vm_pgoff,
vma_policy(vma), vma_policy(vma),
NULL_VM_UFFD_CTX); NULL_VM_UFFD_CTX, vma_anon_name(vma));
if (prev) { if (prev) {
vma = prev; vma = prev;
goto next; goto next;

View File

@ -4,7 +4,6 @@
* All Rights Reserved. * All Rights Reserved.
*/ */
#include "xfs.h" #include "xfs.h"
#include <linux/backing-dev.h>
#include "xfs_message.h" #include "xfs_message.h"
#include "xfs_trace.h" #include "xfs_trace.h"
@ -26,6 +25,6 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
"%s(%u) possible memory allocation deadlock size %u in %s (mode:0x%x)", "%s(%u) possible memory allocation deadlock size %u in %s (mode:0x%x)",
current->comm, current->pid, current->comm, current->pid,
(unsigned int)size, __func__, lflags); (unsigned int)size, __func__, lflags);
congestion_wait(BLK_RW_ASYNC, HZ/50); memalloc_retry_wait(lflags);
} while (1); } while (1);
} }

View File

@ -394,7 +394,7 @@ xfs_buf_alloc_pages(
} }
XFS_STATS_INC(bp->b_mount, xb_page_retries); XFS_STATS_INC(bp->b_mount, xb_page_retries);
congestion_wait(BLK_RW_ASYNC, HZ / 50); memalloc_retry_wait(gfp_mask);
} }
return 0; return 0;
} }

View File

@ -295,7 +295,6 @@ extern bool libceph_compatible(void *data);
extern const char *ceph_msg_type_name(int type); extern const char *ceph_msg_type_name(int type);
extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
extern void *ceph_kvmalloc(size_t size, gfp_t flags);
struct fs_parameter; struct fs_parameter;
struct fc_log; struct fc_log;

View File

@ -11,12 +11,19 @@
#include <linux/mutex.h> #include <linux/mutex.h>
#include <linux/time64.h> #include <linux/time64.h>
#include <linux/types.h> #include <linux/types.h>
#include <linux/random.h>
/* Minimal region size. Every damon_region is aligned by this. */ /* Minimal region size. Every damon_region is aligned by this. */
#define DAMON_MIN_REGION PAGE_SIZE #define DAMON_MIN_REGION PAGE_SIZE
/* Max priority score for DAMON-based operation schemes */ /* Max priority score for DAMON-based operation schemes */
#define DAMOS_MAX_SCORE (99) #define DAMOS_MAX_SCORE (99)
/*
 * Return a random number in the half-open range [l, r).
 *
 * NOTE(review): the span (r - l) is handed to prandom_u32_max(); presumably
 * that helper works on 32-bit values, so wider spans may be truncated -
 * confirm against its definition.
 */
static inline unsigned long damon_rand(unsigned long l, unsigned long r)
{
	unsigned long span = r - l;

	return l + prandom_u32_max(span);
}
/** /**
* struct damon_addr_range - Represents an address region of [@start, @end). * struct damon_addr_range - Represents an address region of [@start, @end).
* @start: Start address of the region (inclusive). * @start: Start address of the region (inclusive).
@ -185,6 +192,22 @@ struct damos_watermarks {
bool activated; bool activated;
}; };
/**
* struct damos_stat - Statistics on a given scheme.
* @nr_tried: Total number of regions the scheme's action was attempted on.
* @sz_tried: Total size of the regions the scheme's action was attempted on.
* @nr_applied: Total number of regions the scheme's action was successfully
*		applied to.
* @sz_applied: Total size of the regions the scheme's action was successfully
*		applied to.
* @qt_exceeds: Total number of times the quota of the scheme has been exceeded.
*/
struct damos_stat {
unsigned long nr_tried;
unsigned long sz_tried;
unsigned long nr_applied;
unsigned long sz_applied;
unsigned long qt_exceeds;
};
/** /**
* struct damos - Represents a Data Access Monitoring-based Operation Scheme. * struct damos - Represents a Data Access Monitoring-based Operation Scheme.
* @min_sz_region: Minimum size of target regions. * @min_sz_region: Minimum size of target regions.
@ -196,8 +219,7 @@ struct damos_watermarks {
* @action: &damo_action to be applied to the target regions. * @action: &damo_action to be applied to the target regions.
* @quota: Control the aggressiveness of this scheme. * @quota: Control the aggressiveness of this scheme.
* @wmarks: Watermarks for automated (in)activation of this scheme. * @wmarks: Watermarks for automated (in)activation of this scheme.
* @stat_count: Total number of regions that this scheme is applied. * @stat: Statistics of this scheme.
* @stat_sz: Total size of regions that this scheme is applied.
* @list: List head for siblings. * @list: List head for siblings.
* *
* For each aggregation interval, DAMON finds regions which fit in the * For each aggregation interval, DAMON finds regions which fit in the
@ -228,8 +250,7 @@ struct damos {
enum damos_action action; enum damos_action action;
struct damos_quota quota; struct damos_quota quota;
struct damos_watermarks wmarks; struct damos_watermarks wmarks;
unsigned long stat_count; struct damos_stat stat;
unsigned long stat_sz;
struct list_head list; struct list_head list;
}; };
@ -274,7 +295,8 @@ struct damon_ctx;
* as an integer in [0, &DAMOS_MAX_SCORE]. * as an integer in [0, &DAMOS_MAX_SCORE].
* @apply_scheme is called from @kdamond when a region for user provided * @apply_scheme is called from @kdamond when a region for user provided
* DAMON-based operation scheme is found. It should apply the scheme's action * DAMON-based operation scheme is found. It should apply the scheme's action
* to the region. This is not used for &DAMON_ARBITRARY_TARGET case. * to the region and return bytes of the region that the action is successfully
* applied.
* @target_valid should check whether the target is still valid for the * @target_valid should check whether the target is still valid for the
* monitoring. * monitoring.
* @cleanup is called from @kdamond just before its termination. * @cleanup is called from @kdamond just before its termination.
@ -288,8 +310,9 @@ struct damon_primitive {
int (*get_scheme_score)(struct damon_ctx *context, int (*get_scheme_score)(struct damon_ctx *context,
struct damon_target *t, struct damon_region *r, struct damon_target *t, struct damon_region *r,
struct damos *scheme); struct damos *scheme);
int (*apply_scheme)(struct damon_ctx *context, struct damon_target *t, unsigned long (*apply_scheme)(struct damon_ctx *context,
struct damon_region *r, struct damos *scheme); struct damon_target *t, struct damon_region *r,
struct damos *scheme);
bool (*target_valid)(void *target); bool (*target_valid)(void *target);
void (*cleanup)(struct damon_ctx *context); void (*cleanup)(struct damon_ctx *context);
}; };
@ -392,14 +415,20 @@ struct damon_ctx {
struct list_head schemes; struct list_head schemes;
}; };
#define damon_next_region(r) \ static inline struct damon_region *damon_next_region(struct damon_region *r)
(container_of(r->list.next, struct damon_region, list)) {
return container_of(r->list.next, struct damon_region, list);
}
#define damon_prev_region(r) \ static inline struct damon_region *damon_prev_region(struct damon_region *r)
(container_of(r->list.prev, struct damon_region, list)) {
return container_of(r->list.prev, struct damon_region, list);
}
#define damon_last_region(t) \ static inline struct damon_region *damon_last_region(struct damon_target *t)
(list_last_entry(&t->regions_list, struct damon_region, list)) {
return list_last_entry(&t->regions_list, struct damon_region, list);
}
#define damon_for_each_region(r, t) \ #define damon_for_each_region(r, t) \
list_for_each_entry(r, &t->regions_list, list) list_for_each_entry(r, &t->regions_list, list)
@ -422,9 +451,18 @@ struct damon_ctx {
#ifdef CONFIG_DAMON #ifdef CONFIG_DAMON
struct damon_region *damon_new_region(unsigned long start, unsigned long end); struct damon_region *damon_new_region(unsigned long start, unsigned long end);
inline void damon_insert_region(struct damon_region *r,
/*
* Add a region between two other regions
*/
static inline void damon_insert_region(struct damon_region *r,
struct damon_region *prev, struct damon_region *next, struct damon_region *prev, struct damon_region *next,
struct damon_target *t); struct damon_target *t)
{
__list_add(&r->list, &prev->list, &next->list);
t->nr_regions++;
}
void damon_add_region(struct damon_region *r, struct damon_target *t); void damon_add_region(struct damon_region *r, struct damon_target *t);
void damon_destroy_region(struct damon_region *r, struct damon_target *t); void damon_destroy_region(struct damon_region *r, struct damon_target *t);
@ -461,34 +499,13 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs);
#endif /* CONFIG_DAMON */ #endif /* CONFIG_DAMON */
#ifdef CONFIG_DAMON_VADDR #ifdef CONFIG_DAMON_VADDR
/* Monitoring primitives for virtual memory address spaces */
void damon_va_init(struct damon_ctx *ctx);
void damon_va_update(struct damon_ctx *ctx);
void damon_va_prepare_access_checks(struct damon_ctx *ctx);
unsigned int damon_va_check_accesses(struct damon_ctx *ctx);
bool damon_va_target_valid(void *t); bool damon_va_target_valid(void *t);
void damon_va_cleanup(struct damon_ctx *ctx);
int damon_va_apply_scheme(struct damon_ctx *context, struct damon_target *t,
struct damon_region *r, struct damos *scheme);
int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t,
struct damon_region *r, struct damos *scheme);
void damon_va_set_primitives(struct damon_ctx *ctx); void damon_va_set_primitives(struct damon_ctx *ctx);
#endif /* CONFIG_DAMON_VADDR */ #endif /* CONFIG_DAMON_VADDR */
#ifdef CONFIG_DAMON_PADDR #ifdef CONFIG_DAMON_PADDR
/* Monitoring primitives for the physical memory address space */
void damon_pa_prepare_access_checks(struct damon_ctx *ctx);
unsigned int damon_pa_check_accesses(struct damon_ctx *ctx);
bool damon_pa_target_valid(void *t); bool damon_pa_target_valid(void *t);
int damon_pa_apply_scheme(struct damon_ctx *context, struct damon_target *t,
struct damon_region *r, struct damos *scheme);
int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t,
struct damon_region *r, struct damos *scheme);
void damon_pa_set_primitives(struct damon_ctx *ctx); void damon_pa_set_primitives(struct damon_ctx *ctx);
#endif /* CONFIG_DAMON_PADDR */ #endif /* CONFIG_DAMON_PADDR */
#endif /* _DAMON_H */ #endif /* _DAMON_H */

View File

@ -3093,6 +3093,7 @@ extern void unlock_new_inode(struct inode *);
extern void discard_new_inode(struct inode *); extern void discard_new_inode(struct inode *);
extern unsigned int get_next_ino(void); extern unsigned int get_next_ino(void);
extern void evict_inodes(struct super_block *sb); extern void evict_inodes(struct super_block *sb);
void dump_mapping(const struct address_space *);
/* /*
* Userspace may rely on the the inode number being non-zero. For example, glibc * Userspace may rely on the the inode number being non-zero. For example, glibc

View File

@ -302,7 +302,9 @@ struct vm_area_struct;
* lowest zone as a type of emergency reserve. * lowest zone as a type of emergency reserve.
* *
* %GFP_DMA32 is similar to %GFP_DMA except that the caller requires a 32-bit * %GFP_DMA32 is similar to %GFP_DMA except that the caller requires a 32-bit
* address. * address. Note that kmalloc(..., GFP_DMA32) does not return DMA32 memory
* because the DMA32 kmalloc cache array is not implemented.
* (Reason: there is no such user in kernel).
* *
* %GFP_HIGHUSER is for userspace allocations that may be mapped to userspace, * %GFP_HIGHUSER is for userspace allocations that may be mapped to userspace,
* do not need to be directly accessible by the kernel but that cannot * do not need to be directly accessible by the kernel but that cannot
@ -598,9 +600,9 @@ struct page *alloc_pages(gfp_t gfp, unsigned int order);
struct folio *folio_alloc(gfp_t gfp, unsigned order); struct folio *folio_alloc(gfp_t gfp, unsigned order);
extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
struct vm_area_struct *vma, unsigned long addr, struct vm_area_struct *vma, unsigned long addr,
int node, bool hugepage); bool hugepage);
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true) alloc_pages_vma(gfp_mask, order, vma, addr, true)
#else #else
static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
{ {
@ -610,14 +612,14 @@ static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order)
{ {
return __folio_alloc_node(gfp, order, numa_node_id()); return __folio_alloc_node(gfp, order, numa_node_id());
} }
#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\ #define alloc_pages_vma(gfp_mask, order, vma, addr, false)\
alloc_pages(gfp_mask, order) alloc_pages(gfp_mask, order)
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
alloc_pages(gfp_mask, order) alloc_pages(gfp_mask, order)
#endif #endif
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
#define alloc_page_vma(gfp_mask, vma, addr) \ #define alloc_page_vma(gfp_mask, vma, addr) \
alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false) alloc_pages_vma(gfp_mask, 0, vma, addr, false)
extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
extern unsigned long get_zeroed_page(gfp_t gfp_mask); extern unsigned long get_zeroed_page(gfp_t gfp_mask);

View File

@ -622,8 +622,8 @@ struct hstate {
#endif #endif
#ifdef CONFIG_CGROUP_HUGETLB #ifdef CONFIG_CGROUP_HUGETLB
/* cgroup control files */ /* cgroup control files */
struct cftype cgroup_files_dfl[7]; struct cftype cgroup_files_dfl[8];
struct cftype cgroup_files_legacy[9]; struct cftype cgroup_files_legacy[10];
#endif #endif
char name[HSTATE_NAME_LEN]; char name[HSTATE_NAME_LEN];
}; };

View File

@ -36,6 +36,11 @@ enum hugetlb_memory_event {
HUGETLB_NR_MEMORY_EVENTS, HUGETLB_NR_MEMORY_EVENTS,
}; };
/*
* Hugetlb usage record. struct hugetlb_cgroup refers to these through its
* trailing nodeinfo[] pointer array (presumably one entry per NUMA node -
* confirm against the allocation site).
*/
struct hugetlb_cgroup_per_node {
/*
* hugetlb usage in pages over all hstates; the array has HUGE_MAX_HSTATE
* slots (presumably indexed by hstate - confirm).
*/
unsigned long usage[HUGE_MAX_HSTATE];
};
struct hugetlb_cgroup { struct hugetlb_cgroup {
struct cgroup_subsys_state css; struct cgroup_subsys_state css;
@ -57,6 +62,8 @@ struct hugetlb_cgroup {
/* Handle for "hugetlb.events.local" */ /* Handle for "hugetlb.events.local" */
struct cgroup_file events_local_file[HUGE_MAX_HSTATE]; struct cgroup_file events_local_file[HUGE_MAX_HSTATE];
struct hugetlb_cgroup_per_node *nodeinfo[];
}; };
static inline struct hugetlb_cgroup * static inline struct hugetlb_cgroup *

Some files were not shown because too many files have changed in this diff Show More