Mirror of https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
Merge tag 'mm-hotfixes-stable-2024-12-21-12-09' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull misc fixes from Andrew Morton:
 "25 hotfixes. 16 are cc:stable. 19 are MM and 6 are non-MM.

  The usual bunch of singletons and doubletons - please see the relevant
  changelogs for details"

* tag 'mm-hotfixes-stable-2024-12-21-12-09' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (25 commits)
  mm: huge_memory: handle strsep not finding delimiter
  alloc_tag: fix set_codetag_empty() when !CONFIG_MEM_ALLOC_PROFILING_DEBUG
  alloc_tag: fix module allocation tags populated area calculation
  mm/codetag: clear tags before swap
  mm/vmstat: fix a W=1 clang compiler warning
  mm: convert partially_mapped set/clear operations to be atomic
  nilfs2: fix buffer head leaks in calls to truncate_inode_pages()
  vmalloc: fix accounting with i915
  mm/page_alloc: don't call pfn_to_page() on possibly non-existent PFN in split_large_buddy()
  fork: avoid inappropriate uprobe access to invalid mm
  nilfs2: prevent use of deleted inode
  zram: fix uninitialized ZRAM not releasing backing device
  zram: refuse to use zero sized block device as backing device
  mm: use clear_user_(high)page() for arch with special user folio handling
  mm: introduce cpu_icache_is_aliasing() across all architectures
  mm: add RCU annotation to pte_offset_map(_lock)
  mm: correctly reference merged VMA
  mm: use aligned address in copy_user_gigantic_page()
  mm: use aligned address in clear_gigantic_page()
  mm: shmem: fix ShmemHugePages at swapout
  ...
This commit is contained in:
commit 4aa748dd1a

 .mailmap | 1 +

@@ -735,6 +735,7 @@ Wolfram Sang <wsa@kernel.org> <w.sang@pengutronix.de>
 Wolfram Sang <wsa@kernel.org> <wsa@the-dreams.de>
 Yakir Yang <kuankuan.y@gmail.com> <ykk@rock-chips.com>
 Yanteng Si <si.yanteng@linux.dev> <siyanteng@loongson.cn>
+Ying Huang <huang.ying.caritas@gmail.com> <ying.huang@intel.com>
 Yusuke Goda <goda.yusuke@renesas.com>
 Zack Rusin <zack.rusin@broadcom.com> <zackr@vmware.com>
 Zhu Yanjun <zyjzyj2000@gmail.com> <yanjunz@nvidia.com>

@@ -3,3 +3,853 @@

=================
Process Addresses
=================

.. toctree::
   :maxdepth: 3

Userland memory ranges are tracked by the kernel via Virtual Memory Areas or
'VMA's of type :c:struct:`!struct vm_area_struct`.

Each VMA describes a virtually contiguous memory range with identical
attributes, each described by a :c:struct:`!struct vm_area_struct`
object. Userland access outside of VMAs is invalid except in the case where an
adjacent stack VMA could be extended to contain the accessed address.

All VMAs are contained within one and only one virtual address space, described
by a :c:struct:`!struct mm_struct` object which is referenced by all tasks (that is,
threads) which share the virtual address space. We refer to this as the
:c:struct:`!mm`.

Each mm object contains a maple tree data structure which describes all VMAs
within the virtual address space.

.. note:: An exception to this is the 'gate' VMA which is provided by
          architectures which use :c:struct:`!vsyscall` and is a global static
          object which does not belong to any specific mm.

-------
Locking
-------

The kernel is designed to be highly scalable against concurrent read operations
on VMA **metadata** so a complicated set of locks are required to ensure memory
corruption does not occur.

.. note:: Locking VMAs for their metadata does not have any impact on the memory
          they describe nor the page tables that map them.

Terminology
-----------

* **mmap locks** - Each MM has a read/write semaphore :c:member:`!mmap_lock`
  which locks at a process address space granularity which can be acquired via
  :c:func:`!mmap_read_lock`, :c:func:`!mmap_write_lock` and variants.
* **VMA locks** - The VMA lock is at VMA granularity (of course) which behaves
  as a read/write semaphore in practice. A VMA read lock is obtained via
  :c:func:`!lock_vma_under_rcu` (and unlocked via :c:func:`!vma_end_read`) and a
  write lock via :c:func:`!vma_start_write` (all VMA write locks are unlocked
  automatically when the mmap write lock is released). To take a VMA write lock
  you **must** have already acquired an :c:func:`!mmap_write_lock`.
* **rmap locks** - When trying to access VMAs through the reverse mapping via a
  :c:struct:`!struct address_space` or :c:struct:`!struct anon_vma` object
  (reachable from a folio via :c:member:`!folio->mapping`), VMAs must be stabilised via
  :c:func:`!anon_vma_[try]lock_read` or :c:func:`!anon_vma_[try]lock_write` for
  anonymous memory and :c:func:`!i_mmap_[try]lock_read` or
  :c:func:`!i_mmap_[try]lock_write` for file-backed memory. We refer to these
  locks as the reverse mapping locks, or 'rmap locks' for brevity.
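
For illustration only, a minimal sketch of stabilising VMAs via the file rmap
lock - walking every VMA that maps a given range of a file. The helper name and
the page-offset range are assumptions made for this example, not kernel API
requirements:

.. code-block:: c

   #include <linux/fs.h>
   #include <linux/mm.h>

   /* Hypothetical helper: count VMAs mapping the given page offset range. */
   static unsigned long count_mappings(struct address_space *mapping,
                                       pgoff_t first, pgoff_t last)
   {
           struct vm_area_struct *vma;
           unsigned long nr = 0;

           i_mmap_lock_read(mapping);      /* rmap (i_mmap) read lock */
           vma_interval_tree_foreach(vma, &mapping->i_mmap, first, last)
                   nr++;                   /* vma is stable while the lock is held */
           i_mmap_unlock_read(mapping);

           return nr;
   }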

We discuss page table locks separately in the dedicated section below.

The first thing **any** of these locks achieve is to **stabilise** the VMA
within the MM tree. That is, guaranteeing that the VMA object will not be
deleted from under you nor modified (except for some specific fields
described below).

Stabilising a VMA also keeps the address space described by it around.

Lock usage
----------

If you want to **read** VMA metadata fields or just keep the VMA stable, you
must do one of the following:

* Obtain an mmap read lock at the MM granularity via :c:func:`!mmap_read_lock` (or a
  suitable variant), unlocking it with a matching :c:func:`!mmap_read_unlock` when
  you're done with the VMA, *or*
* Try to obtain a VMA read lock via :c:func:`!lock_vma_under_rcu`. This tries to
  acquire the lock atomically so might fail, in which case fall-back logic is
  required to instead obtain an mmap read lock if this returns :c:macro:`!NULL`,
  *or*
* Acquire an rmap lock before traversing the locked interval tree (whether
  anonymous or file-backed) to obtain the required VMA.
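
As an illustrative sketch only (the helper below is hypothetical and assumes a
fault-like path where only a single address needs to be covered), the first two
options are typically combined like so:

.. code-block:: c

   #include <linux/mm.h>

   /* Hypothetical: stabilise and return the VMA covering @addr, or NULL. */
   static struct vm_area_struct *stabilise_vma(struct mm_struct *mm,
                                               unsigned long addr,
                                               bool *vma_locked)
   {
           struct vm_area_struct *vma;

           /* Fast path: per-VMA read lock, looked up under RCU. */
           vma = lock_vma_under_rcu(mm, addr);
           if (vma) {
                   *vma_locked = true;     /* release with vma_end_read() */
                   return vma;
           }

           /* Slow path: fall back to the mmap read lock. */
           *vma_locked = false;            /* release with mmap_read_unlock() */
           mmap_read_lock(mm);
           vma = vma_lookup(mm, addr);     /* NULL if no VMA covers addr */
           if (!vma)
                   mmap_read_unlock(mm);
           return vma;
   }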

If you want to **write** VMA metadata fields, then things vary depending on the
field (we explore each VMA field in detail below). For the majority you must:

* Obtain an mmap write lock at the MM granularity via :c:func:`!mmap_write_lock` (or a
  suitable variant), unlocking it with a matching :c:func:`!mmap_write_unlock` when
  you're done with the VMA, *and*
* Obtain a VMA write lock via :c:func:`!vma_start_write` for each VMA you wish to
  modify, which will be released automatically when :c:func:`!mmap_write_unlock` is
  called.
* If you want to be able to write to **any** field, you must also hide the VMA
  from the reverse mapping by obtaining an **rmap write lock**.
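
A minimal sketch of the common case (a field for which the mmap and VMA write
locks suffice), using the VMA flag helpers as the example write; the helper
function and the flag chosen are assumptions for illustration:

.. code-block:: c

   #include <linux/mm.h>

   /* Hypothetical: mark the VMA covering @addr as sequential-read. */
   static int mark_sequential(struct mm_struct *mm, unsigned long addr)
   {
           struct vm_area_struct *vma;

           mmap_write_lock(mm);
           vma = vma_lookup(mm, addr);
           if (!vma) {
                   mmap_write_unlock(mm);
                   return -ENOMEM;
           }

           vma_start_write(vma);           /* VMA write lock */
           vm_flags_set(vma, VM_SEQ_READ); /* updates __vm_flags */

           /* Also drops all VMA write locks taken above. */
           mmap_write_unlock(mm);
           return 0;
   }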

VMA locks are special in that you must obtain an mmap **write** lock **first**
in order to obtain a VMA **write** lock. A VMA **read** lock however can be
obtained without any other lock (:c:func:`!lock_vma_under_rcu` will acquire then
release an RCU lock to lookup the VMA for you).

This constrains the impact of writers on readers, as a writer can interact with
one VMA while a reader interacts with another simultaneously.

.. note:: The primary users of VMA read locks are page fault handlers, which
          means that without a VMA write lock, page faults will run concurrent with
          whatever you are doing.

Examining all valid lock states:

.. table::

   ========= ======== ========= ======= ===== =========== ==========
   mmap lock VMA lock rmap lock Stable? Read? Write most? Write all?
   ========= ======== ========= ======= ===== =========== ==========
   \-        \-       \-        N       N     N           N
   \-        R        \-        Y       Y     N           N
   \-        \-       R/W       Y       Y     N           N
   R/W       \-/R     \-/R/W    Y       Y     N           N
   W         W        \-/R      Y       Y     Y           N
   W         W        W         Y       Y     Y           Y
   ========= ======== ========= ======= ===== =========== ==========

.. warning:: While it's possible to obtain a VMA lock while holding an mmap read lock,
             attempting to do the reverse is invalid as it can result in deadlock - if
             another task already holds an mmap write lock and attempts to acquire a VMA
             write lock that will deadlock on the VMA read lock.

All of these locks behave as read/write semaphores in practice, so you can
obtain either a read or a write lock for each of these.

.. note:: Generally speaking, a read/write semaphore is a class of lock which
          permits concurrent readers. However a write lock can only be obtained
          once all readers have left the critical region (and pending readers
          made to wait).

          This renders read locks on a read/write semaphore concurrent with other
          readers and write locks exclusive against all others holding the semaphore.

VMA fields
^^^^^^^^^^

We can subdivide :c:struct:`!struct vm_area_struct` fields by their purpose, which makes it
easier to explore their locking characteristics:

.. note:: We exclude VMA lock-specific fields here to avoid confusion, as these
          are in effect an internal implementation detail.

.. table:: Virtual layout fields

   ===================== ======================================== ===========
   Field                 Description                              Write lock
   ===================== ======================================== ===========
   :c:member:`!vm_start` Inclusive start virtual address of range mmap write,
                         VMA describes.                           VMA write,
                                                                  rmap write.
   :c:member:`!vm_end`   Exclusive end virtual address of range   mmap write,
                         VMA describes.                           VMA write,
                                                                  rmap write.
   :c:member:`!vm_pgoff` Describes the page offset into the file, mmap write,
                         the original page offset within the      VMA write,
                         virtual address space (prior to any      rmap write.
                         :c:func:`!mremap`), or PFN if a PFN map
                         and the architecture does not support
                         :c:macro:`!CONFIG_ARCH_HAS_PTE_SPECIAL`.
   ===================== ======================================== ===========

These fields describe the size, start and end of the VMA, and as such cannot be
modified without first being hidden from the reverse mapping since these fields
are used to locate VMAs within the reverse mapping interval trees.

.. table:: Core fields

   ============================ ======================================== =========================
   Field                        Description                              Write lock
   ============================ ======================================== =========================
   :c:member:`!vm_mm`           Containing mm_struct.                    None - written once on
                                                                         initial map.
   :c:member:`!vm_page_prot`    Architecture-specific page table         mmap write, VMA write.
                                protection bits determined from VMA
                                flags.
   :c:member:`!vm_flags`        Read-only access to VMA flags describing N/A
                                attributes of the VMA, in union with
                                private writable
                                :c:member:`!__vm_flags`.
   :c:member:`!__vm_flags`      Private, writable access to VMA flags    mmap write, VMA write.
                                field, updated by
                                :c:func:`!vm_flags_*` functions.
   :c:member:`!vm_file`         If the VMA is file-backed, points to a   None - written once on
                                struct file object describing the        initial map.
                                underlying file, if anonymous then
                                :c:macro:`!NULL`.
   :c:member:`!vm_ops`          If the VMA is file-backed, then either   None - Written once on
                                the driver or file-system provides a     initial map by
                                :c:struct:`!struct vm_operations_struct` :c:func:`!f_ops->mmap()`.
                                object describing callbacks to be
                                invoked on VMA lifetime events.
   :c:member:`!vm_private_data` A :c:member:`!void *` field for          Handled by driver.
                                driver-specific metadata.
   ============================ ======================================== =========================

These are the core fields which describe the MM the VMA belongs to and its attributes.

.. table:: Config-specific fields

   ================================= ===================== ======================================== ===============
   Field                             Configuration option  Description                              Write lock
   ================================= ===================== ======================================== ===============
   :c:member:`!anon_name`            CONFIG_ANON_VMA_NAME  A field for storing a                    mmap write,
                                                           :c:struct:`!struct anon_vma_name`        VMA write.
                                                           object providing a name for anonymous
                                                           mappings, or :c:macro:`!NULL` if none
                                                           is set or the VMA is file-backed. The
                                                           underlying object is reference counted
                                                           and can be shared across multiple VMAs
                                                           for scalability.
   :c:member:`!swap_readahead_info`  CONFIG_SWAP           Metadata used by the swap mechanism      mmap read,
                                                           to perform readahead. This field is      swap-specific
                                                           accessed atomically.                     lock.
   :c:member:`!vm_policy`            CONFIG_NUMA           :c:type:`!mempolicy` object which        mmap write,
                                                           describes the NUMA behaviour of the      VMA write.
                                                           VMA. The underlying object is reference
                                                           counted.
   :c:member:`!numab_state`          CONFIG_NUMA_BALANCING :c:type:`!vma_numab_state` object which  mmap read,
                                                           describes the current state of           numab-specific
                                                           NUMA balancing in relation to this VMA.  lock.
                                                           Updated under mmap read lock by
                                                           :c:func:`!task_numa_work`.
   :c:member:`!vm_userfaultfd_ctx`   CONFIG_USERFAULTFD    Userfaultfd context wrapper object of    mmap write,
                                                           type :c:type:`!vm_userfaultfd_ctx`,      VMA write.
                                                           either of zero size if userfaultfd is
                                                           disabled, or containing a pointer
                                                           to an underlying
                                                           :c:type:`!userfaultfd_ctx` object which
                                                           describes userfaultfd metadata.
   ================================= ===================== ======================================== ===============

These fields are present or not depending on whether the relevant kernel
configuration option is set.

.. table:: Reverse mapping fields

   =================================== ========================================= ============================
   Field                               Description                               Write lock
   =================================== ========================================= ============================
   :c:member:`!shared.rb`              A red/black tree node used, if the        mmap write, VMA write,
                                       mapping is file-backed, to place the VMA  i_mmap write.
                                       in the
                                       :c:member:`!struct address_space->i_mmap`
                                       red/black interval tree.
   :c:member:`!shared.rb_subtree_last` Metadata used for management of the       mmap write, VMA write,
                                       interval tree if the VMA is file-backed.  i_mmap write.
   :c:member:`!anon_vma_chain`         List of pointers to both forked/CoW'd     mmap read, anon_vma write.
                                       :c:type:`!anon_vma` objects and
                                       :c:member:`!vma->anon_vma` if it is
                                       non-:c:macro:`!NULL`.
   :c:member:`!anon_vma`               :c:type:`!anon_vma` object used by        When :c:macro:`NULL` and
                                       anonymous folios mapped exclusively to    setting non-:c:macro:`NULL`:
                                       this VMA. Initially set by                mmap read, page_table_lock.
                                       :c:func:`!anon_vma_prepare` serialised
                                       by the :c:macro:`!page_table_lock`. This  When non-:c:macro:`NULL` and
                                       is set as soon as any page is faulted in. setting :c:macro:`NULL`:
                                                                                 mmap write, VMA write,
                                                                                 anon_vma write.
   =================================== ========================================= ============================

These fields are used to both place the VMA within the reverse mapping, and for
anonymous mappings, to be able to access both related :c:struct:`!struct anon_vma` objects
and the :c:struct:`!struct anon_vma` in which folios mapped exclusively to this VMA should
reside.

.. note:: If a file-backed mapping is mapped with :c:macro:`!MAP_PRIVATE` set
          then it can be in both the :c:type:`!anon_vma` and :c:type:`!i_mmap`
          trees at the same time, so all of these fields might be utilised at
          once.

Page tables
-----------

We won't speak exhaustively on the subject but broadly speaking, page tables map
virtual addresses to physical ones through a series of page tables, each of
which contain entries with physical addresses for the next page table level
(along with flags), and at the leaf level the physical addresses of the
underlying physical data pages or a special entry such as a swap entry,
migration entry or other special marker. Offsets into these pages are provided
by the virtual address itself.

In Linux these are divided into five levels - PGD, P4D, PUD, PMD and PTE. Huge
pages might eliminate one or two of these levels, but when this is the case we
typically refer to the leaf level as the PTE level regardless.

.. note:: In instances where the architecture supports fewer page tables than
          five the kernel cleverly 'folds' page table levels, that is stubbing
          out functions related to the skipped levels. This allows us to
          conceptually act as if there were always five levels, even if the
          compiler might, in practice, eliminate any code relating to missing
          ones.

There are four key operations typically performed on page tables:

1. **Traversing** page tables - Simply reading page tables in order to traverse
   them. This only requires that the VMA is kept stable, so a lock which
   establishes this suffices for traversal (there are also lockless variants
   which eliminate even this requirement, such as :c:func:`!gup_fast`).
2. **Installing** page table mappings - Whether creating a new mapping or
   modifying an existing one in such a way as to change its identity. This
   requires that the VMA is kept stable via an mmap or VMA lock (explicitly not
   rmap locks).
3. **Zapping/unmapping** page table entries - This is what the kernel calls
   clearing page table mappings at the leaf level only, whilst leaving all page
   tables in place. This is a very common operation in the kernel performed on
   file truncation, the :c:macro:`!MADV_DONTNEED` operation via
   :c:func:`!madvise`, and others. This is performed by a number of functions
   including :c:func:`!unmap_mapping_range` and :c:func:`!unmap_mapping_pages`.
   The VMA need only be kept stable for this operation.
4. **Freeing** page tables - When finally the kernel removes page tables from a
   userland process (typically via :c:func:`!free_pgtables`) extreme care must
   be taken to ensure this is done safely, as this logic finally frees all page
   tables in the specified range, ignoring existing leaf entries (it assumes the
   caller has both zapped the range and prevented any further faults or
   modifications within it).
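
For illustration, a sketch of the zapping case as it appears on file
truncation; the surrounding truncation logic is assumed and elided, and only
the call which drops leaf entries for every mapping of the file is shown:

.. code-block:: c

   #include <linux/fs.h>
   #include <linux/mm.h>

   /*
    * Sketch: after shrinking a file to @new_size, remove every PTE mapping
    * the now out-of-range portion, in every VMA of every process mapping it.
    * Page tables themselves are left in place; this is a zap, not a free.
    */
   static void zap_truncated_range(struct inode *inode, loff_t new_size)
   {
           loff_t holebegin = round_up(new_size, PAGE_SIZE);

           /* Stabilises the VMAs via the i_mmap rmap lock internally. */
           unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
   }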

.. note:: Modifying mappings for reclaim or migration is performed under rmap
          lock as it, like zapping, does not fundamentally modify the identity
          of what is being mapped.

**Traversing** and **zapping** ranges can be performed holding any one of the
locks described in the terminology section above - that is the mmap lock, the
VMA lock or either of the reverse mapping locks.

That is - as long as you keep the relevant VMA **stable** - you are good to go
ahead and perform these operations on page tables (though internally, kernel
operations that perform writes also acquire internal page table locks to
serialise - see the page table implementation detail section for more details).
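
As a sketch of what such a stable-VMA traversal looks like in practice (the
helper below is hypothetical and ignores huge mappings at the PUD/PMD level for
brevity):

.. code-block:: c

   #include <linux/mm.h>
   #include <linux/pgtable.h>

   /*
    * Hypothetical: report whether @addr is mapped in @mm. The caller must
    * stabilise the VMA covering @addr, e.g. by holding mmap_read_lock(mm).
    */
   static bool sketch_addr_mapped(struct mm_struct *mm, unsigned long addr)
   {
           pgd_t *pgd;
           p4d_t *p4d;
           pud_t *pud;
           pmd_t *pmd;
           pte_t *pte;
           spinlock_t *ptl;
           bool mapped;

           pgd = pgd_offset(mm, addr);
           if (pgd_none(pgdp_get(pgd)))
                   return false;
           p4d = p4d_offset(pgd, addr);
           if (p4d_none(p4dp_get(p4d)))
                   return false;
           pud = pud_offset(p4d, addr);
           if (pud_none(pudp_get(pud)))
                   return false;
           pmd = pmd_offset(pud, addr);
           if (pmd_none(pmdp_get(pmd)))
                   return false;

           /* Maps the PTE page (high memory on 32-bit) and takes the PTE lock;
            * returns NULL if the PTE table went away, e.g. via THP collapse. */
           pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
           if (!pte)
                   return false;
           mapped = pte_present(ptep_get(pte));
           pte_unmap_unlock(pte, ptl);
           return mapped;
   }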

When **installing** page table entries, the mmap or VMA lock must be held to
keep the VMA stable. We explore why this is in the page table locking details
section below.

.. warning:: Page tables are normally only traversed in regions covered by VMAs.
             If you want to traverse page tables in areas that might not be
             covered by VMAs, heavier locking is required.
             See :c:func:`!walk_page_range_novma` for details.

**Freeing** page tables is an entirely internal memory management operation and
has special requirements (see the page freeing section below for more details).

.. warning:: When **freeing** page tables, it must not be possible for VMAs
             containing the ranges those page tables map to be accessible via
             the reverse mapping.

             The :c:func:`!free_pgtables` function removes the relevant VMAs
             from the reverse mappings, but no other VMAs can be permitted to be
             accessible and span the specified range.

Lock ordering
-------------

As we have multiple locks across the kernel which may or may not be taken at the
same time as explicit mm or VMA locks, we have to be wary of lock inversion, and
the **order** in which locks are acquired and released becomes very important.

.. note:: Lock inversion occurs when two threads need to acquire multiple locks,
          but in doing so inadvertently cause a mutual deadlock.

          For example, consider thread 1 which holds lock A and tries to acquire lock B,
          while thread 2 holds lock B and tries to acquire lock A.

          Both threads are now deadlocked on each other. However, had they attempted to
          acquire locks in the same order, one would have waited for the other to
          complete its work and no deadlock would have occurred.
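
For illustration only (generic kernel mutexes rather than any specific mm lock),
the safe version of the scenario above is for both threads to agree on a single
order:

.. code-block:: c

   #include <linux/mutex.h>

   static DEFINE_MUTEX(lock_a);
   static DEFINE_MUTEX(lock_b);

   /* Both threads call this; A is always taken before B, so no inversion. */
   static void do_work_locked(void)
   {
           mutex_lock(&lock_a);
           mutex_lock(&lock_b);
           /* ... critical section touching data guarded by both locks ... */
           mutex_unlock(&lock_b);
           mutex_unlock(&lock_a);
   }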

The opening comment in :c:macro:`!mm/rmap.c` describes in detail the required
ordering of locks within memory management code:

.. code-block::

  inode->i_rwsem        (while writing or truncating, not reading or faulting)
    mm->mmap_lock
      mapping->invalidate_lock (in filemap_fault)
        folio_lock
          hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
            vma_start_write
              mapping->i_mmap_rwsem
                anon_vma->rwsem
                  mm->page_table_lock or pte_lock
                    swap_lock (in swap_duplicate, swap_info_get)
                      mmlist_lock (in mmput, drain_mmlist and others)
                      mapping->private_lock (in block_dirty_folio)
                        i_pages lock (widely used)
                          lruvec->lru_lock (in folio_lruvec_lock_irq)
                      inode->i_lock (in set_page_dirty's __mark_inode_dirty)
                      bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
                        sb_lock (within inode_lock in fs/fs-writeback.c)
                        i_pages lock (widely used, in set_page_dirty,
                                      in arch-dependent flush_dcache_mmap_lock,
                                      within bdi.wb->list_lock in __sync_single_inode)

There is also a file-system specific lock ordering comment located at the top of
:c:macro:`!mm/filemap.c`:

.. code-block::

  ->i_mmap_rwsem                  (truncate_pagecache)
    ->private_lock                (__free_pte->block_dirty_folio)
      ->swap_lock                 (exclusive_swap_page, others)
        ->i_pages lock

  ->i_rwsem
    ->invalidate_lock             (acquired by fs in truncate path)
      ->i_mmap_rwsem              (truncate->unmap_mapping_range)

  ->mmap_lock
    ->i_mmap_rwsem
      ->page_table_lock or pte_lock   (various, mainly in memory.c)
        ->i_pages lock            (arch-dependent flush_dcache_mmap_lock)

  ->mmap_lock
    ->invalidate_lock             (filemap_fault)
      ->lock_page                 (filemap_fault, access_process_vm)

  ->i_rwsem                       (generic_perform_write)
    ->mmap_lock                   (fault_in_readable->do_page_fault)

  bdi->wb.list_lock
    sb_lock                       (fs/fs-writeback.c)
    ->i_pages lock                (__sync_single_inode)

  ->i_mmap_rwsem
    ->anon_vma.lock               (vma_merge)

  ->anon_vma.lock
    ->page_table_lock or pte_lock (anon_vma_prepare and various)

  ->page_table_lock or pte_lock
    ->swap_lock                   (try_to_unmap_one)
    ->private_lock                (try_to_unmap_one)
    ->i_pages lock                (try_to_unmap_one)
    ->lruvec->lru_lock            (follow_page_mask->mark_page_accessed)
    ->lruvec->lru_lock            (check_pte_range->folio_isolate_lru)
    ->private_lock                (folio_remove_rmap_pte->set_page_dirty)
    ->i_pages lock                (folio_remove_rmap_pte->set_page_dirty)
    bdi.wb->list_lock             (folio_remove_rmap_pte->set_page_dirty)
      ->inode->i_lock             (folio_remove_rmap_pte->set_page_dirty)
    bdi.wb->list_lock             (zap_pte_range->set_page_dirty)
      ->inode->i_lock             (zap_pte_range->set_page_dirty)
    ->private_lock                (zap_pte_range->block_dirty_folio)

Please check the current state of these comments which may have changed since
the time of writing of this document.

------------------------------
Locking Implementation Details
------------------------------

.. warning:: Locking rules for PTE-level page tables are very different from
             locking rules for page tables at other levels.

Page table locking details
--------------------------

In addition to the locks described in the terminology section above, we have
additional locks dedicated to page tables:

* **Higher level page table locks** - Higher level page tables, that is PGD, P4D
  and PUD each make use of the process address space granularity
  :c:member:`!mm->page_table_lock` lock when modified.
* **Fine-grained page table locks** - PMDs and PTEs each have fine-grained locks
  either kept within the folios describing the page tables or allocated
  separated and pointed at by the folios if :c:macro:`!ALLOC_SPLIT_PTLOCKS` is
  set. The PMD spin lock is obtained via :c:func:`!pmd_lock`, however PTEs are
  mapped into higher memory (if a 32-bit system) and carefully locked via
  :c:func:`!pte_offset_map_lock`.

These locks represent the minimum required to interact with each page table
level, but there are further requirements.

Importantly, note that on a **traversal** of page tables, sometimes no such
locks are taken. However, at the PTE level, at least concurrent page table
deletion must be prevented (using RCU) and the page table must be mapped into
high memory, see below.

Whether care is taken on reading the page table entries depends on the
architecture, see the section on atomicity below.

Locking rules
^^^^^^^^^^^^^

We establish basic locking rules when interacting with page tables:

* When changing a page table entry the page table lock for that page table
  **must** be held, except if you can safely assume nobody can access the page
  tables concurrently (such as on invocation of :c:func:`!free_pgtables`).
* Reads from and writes to page table entries must be *appropriately*
  atomic. See the section on atomicity below for details.
* Populating previously empty entries requires that the mmap or VMA locks are
  held (read or write), doing so with only rmap locks would be dangerous (see
  the warning below).
* As mentioned previously, zapping can be performed while simply keeping the VMA
  stable, that is holding any one of the mmap, VMA or rmap locks.

.. warning:: Populating previously empty entries is dangerous as, when unmapping
             VMAs, :c:func:`!vms_clear_ptes` has a window of time between
             zapping (via :c:func:`!unmap_vmas`) and freeing page tables (via
             :c:func:`!free_pgtables`), where the VMA is still visible in the
             rmap tree. :c:func:`!free_pgtables` assumes that the zap has
             already been performed and removes PTEs unconditionally (along with
             all other page tables in the freed range), so installing new PTE
             entries could leak memory and also cause other unexpected and
             dangerous behaviour.

There are additional rules applicable when moving page tables, which we discuss
in the section on this topic below.

PTE-level page tables are different from page tables at other levels, and there
are extra requirements for accessing them:

* On 32-bit architectures, they may be in high memory (meaning they need to be
  mapped into kernel memory to be accessible).
* When empty, they can be unlinked and RCU-freed while holding an mmap lock or
  rmap lock for reading in combination with the PTE and PMD page table locks.
  In particular, this happens in :c:func:`!retract_page_tables` when handling
  :c:macro:`!MADV_COLLAPSE`.
  So accessing PTE-level page tables requires at least holding an RCU read lock;
  but that only suffices for readers that can tolerate racing with concurrent
  page table updates such that an empty PTE is observed (in a page table that
  has actually already been detached and marked for RCU freeing) while another
  new page table has been installed in the same location and filled with
  entries. Writers normally need to take the PTE lock and revalidate that the
  PMD entry still refers to the same PTE-level page table.

To access PTE-level page tables, a helper like :c:func:`!pte_offset_map_lock` or
:c:func:`!pte_offset_map` can be used depending on stability requirements.
These map the page table into kernel memory if required, take the RCU lock, and
depending on variant, may also look up or acquire the PTE lock.
See the comment on :c:func:`!__pte_offset_map_lock`.
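
As an illustration of the read-only case, a sketch only; it assumes the caller
already holds something that keeps the VMA stable and can tolerate racing
updates to the entries it reads:

.. code-block:: c

   #include <linux/mm.h>
   #include <linux/pgtable.h>

   /* Hypothetical: peek at the PTE for @addr without taking the PTE lock. */
   static void sketch_peek_pte(pmd_t *pmd, unsigned long addr)
   {
           pte_t *pte, entry;

           pte = pte_offset_map(pmd, addr);   /* enters RCU, maps the PTE page */
           if (!pte)
                   return;                    /* no PTE-level table here */

           entry = ptep_get(pte);             /* READ_ONCE() under the hood */
           pr_debug("%#lx: present=%d\n", addr, pte_present(entry));

           pte_unmap(pte);                    /* leaves RCU, unmaps the PTE page */
   }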

Atomicity
^^^^^^^^^

Regardless of page table locks, the MMU hardware concurrently updates accessed
and dirty bits (perhaps more, depending on architecture). Additionally, page
table traversal operations run in parallel (though holding the VMA stable), and
functionality like GUP-fast locklessly traverses (that is, reads) page tables,
without even keeping the VMA stable at all.

When performing a page table traversal and keeping the VMA stable, whether a
read must be performed once and only once or not depends on the architecture
(for instance x86-64 does not require any special precautions).

If a write is being performed, or if a read informs whether a write takes place
(on an installation of a page table entry say, for instance in
:c:func:`!__pud_install`), special care must always be taken. In these cases we
can never assume that page table locks give us entirely exclusive access, and
must retrieve page table entries once and only once.

If we are reading page table entries, then we need only ensure that the compiler
does not rearrange our loads. This is achieved via :c:func:`!pXXp_get`
functions - :c:func:`!pgdp_get`, :c:func:`!p4dp_get`, :c:func:`!pudp_get`,
:c:func:`!pmdp_get`, and :c:func:`!ptep_get`.

Each of these uses :c:func:`!READ_ONCE` to guarantee that the compiler reads
the page table entry only once.

However, if we wish to manipulate an existing page table entry and care about
the previously stored data, we must go further and use a hardware atomic
operation as, for example, in :c:func:`!ptep_get_and_clear`.
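
A sketch of why this matters - clearing an entry while preserving what it held,
for example to sample the hardware dirty bit exactly once (illustrative only;
it assumes the PTE lock is already held and that TLB flushing is handled
elsewhere):

.. code-block:: c

   #include <linux/mm.h>
   #include <linux/pgtable.h>

   /* Hypothetical: clear the PTE at @ptep and report whether it was dirty. */
   static bool sketch_clear_and_test_dirty(struct mm_struct *mm,
                                           unsigned long addr, pte_t *ptep)
   {
           /* Atomic exchange: nothing (including the MMU setting the dirty bit
            * concurrently) can update the entry between the read and clear. */
           pte_t old = ptep_get_and_clear(mm, addr, ptep);

           return pte_present(old) && pte_dirty(old);
   }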

Equally, operations that do not rely on the VMA being held stable, such as
GUP-fast (see :c:func:`!gup_fast` and its various page table level handlers like
:c:func:`!gup_fast_pte_range`), must very carefully interact with page table
entries, using functions such as :c:func:`!ptep_get_lockless` and equivalent for
higher level page table levels.

Writes to page table entries must also be appropriately atomic, as established
by :c:func:`!set_pXX` functions - :c:func:`!set_pgd`, :c:func:`!set_p4d`,
:c:func:`!set_pud`, :c:func:`!set_pmd`, and :c:func:`!set_pte`.

Equally functions which clear page table entries must be appropriately atomic,
as in :c:func:`!pXX_clear` functions - :c:func:`!pgd_clear`,
:c:func:`!p4d_clear`, :c:func:`!pud_clear`, :c:func:`!pmd_clear`, and
:c:func:`!pte_clear`.

Page table installation
^^^^^^^^^^^^^^^^^^^^^^^

Page table installation is performed with the VMA held stable explicitly by an
mmap or VMA lock in read or write mode (see the warning in the locking rules
section for details as to why).

When allocating a P4D, PUD or PMD and setting the relevant entry in the above
PGD, P4D or PUD, the :c:member:`!mm->page_table_lock` must be held. This is
acquired in :c:func:`!__p4d_alloc`, :c:func:`!__pud_alloc` and
:c:func:`!__pmd_alloc` respectively.

.. note:: :c:func:`!__pmd_alloc` actually invokes :c:func:`!pud_lock` and
          :c:func:`!pud_lockptr` in turn, however at the time of writing it ultimately
          references the :c:member:`!mm->page_table_lock`.

Allocating a PTE will either use the :c:member:`!mm->page_table_lock` or, if
:c:macro:`!USE_SPLIT_PMD_PTLOCKS` is defined, a lock embedded in the PMD
physical page metadata in the form of a :c:struct:`!struct ptdesc`, acquired by
:c:func:`!pmd_ptdesc` called from :c:func:`!pmd_lock` and ultimately
:c:func:`!__pte_alloc`.

Finally, modifying the contents of the PTE requires special treatment, as the
PTE page table lock must be acquired whenever we want stable and exclusive
access to entries contained within a PTE, especially when we wish to modify
them.

This is performed via :c:func:`!pte_offset_map_lock` which carefully checks to
ensure that the PTE hasn't changed from under us, ultimately invoking
:c:func:`!pte_lockptr` to obtain a spin lock at PTE granularity contained within
the :c:struct:`!struct ptdesc` associated with the physical PTE page. The lock
must be released via :c:func:`!pte_unmap_unlock`.

.. note:: There are some variants on this, such as
          :c:func:`!pte_offset_map_rw_nolock` when we know we hold the PTE stable but
          for brevity we do not explore this. See the comment for
          :c:func:`!__pte_offset_map_lock` for more details.

When modifying data in ranges we typically only wish to allocate higher page
tables as necessary, using these locks to avoid races or overwriting anything,
and set/clear data at the PTE level as required (for instance when page faulting
or zapping).

A typical pattern taken when traversing page table entries to install a new
mapping is to optimistically determine whether the page table entry in the table
above is empty, if so, only then acquiring the page table lock and checking
again to see if it was allocated underneath us.

This allows for a traversal with page table locks only being taken when
required. An example of this is :c:func:`!__pud_alloc`.
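
In outline, that pattern looks roughly like the following sketch (not the
actual :c:func:`!__pud_alloc` implementation; the freshly allocated PMD table is
assumed to come from the usual allocator and error handling is elided):

.. code-block:: c

   #include <linux/mm.h>
   #include <linux/pgtable.h>

   /* Hypothetical: make sure @pud points to a PMD table for the range. */
   static void sketch_ensure_pmd_table(struct mm_struct *mm, pud_t *pud,
                                       pmd_t *new_pmd_table)
   {
           /* Optimistic, lockless check first... */
           if (!pud_none(pudp_get(pud)))
                   return;

           spin_lock(&mm->page_table_lock);
           /* ...then re-check under the lock: someone may have beaten us. */
           if (pud_none(pudp_get(pud)))
                   pud_populate(mm, pud, new_pmd_table);
           spin_unlock(&mm->page_table_lock);
   }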

At the leaf page table, that is the PTE, we can't entirely rely on this pattern
as we have separate PMD and PTE locks and a THP collapse for instance might have
eliminated the PMD entry as well as the PTE from under us.

This is why :c:func:`!__pte_offset_map_lock` locklessly retrieves the PMD entry
for the PTE, carefully checking it is as expected, before acquiring the
PTE-specific lock, and then *again* checking that the PMD entry is as expected.

If a THP collapse (or similar) were to occur then the lock on both pages would
be acquired, so we can ensure this is prevented while the PTE lock is held.

Installing entries this way ensures mutual exclusion on write.

Page table freeing
^^^^^^^^^^^^^^^^^^

Tearing down page tables themselves is something that requires significant
care. There must be no way that page tables designated for removal can be
traversed or referenced by concurrent tasks.

It is insufficient to simply hold an mmap write lock and VMA lock (which will
prevent racing faults, and rmap operations), as a file-backed mapping can be
truncated under the :c:struct:`!struct address_space->i_mmap_rwsem` alone.

As a result, no VMA which can be accessed via the reverse mapping (either
through the :c:struct:`!struct anon_vma->rb_root` or the :c:member:`!struct
address_space->i_mmap` interval trees) can have its page tables torn down.

The operation is typically performed via :c:func:`!free_pgtables`, which assumes
either the mmap write lock has been taken (as specified by its
:c:member:`!mm_wr_locked` parameter), or that the VMA is already unreachable.

It carefully removes the VMA from all reverse mappings, however it's important
that no new ones overlap these or any route remain to permit access to addresses
within the range whose page tables are being torn down.

Additionally, it assumes that a zap has already been performed and steps have
been taken to ensure that no further page table entries can be installed between
the zap and the invocation of :c:func:`!free_pgtables`.

Since it is assumed that all such steps have been taken, page table entries are
cleared without page table locks (in the :c:func:`!pgd_clear`, :c:func:`!p4d_clear`,
:c:func:`!pud_clear`, and :c:func:`!pmd_clear` functions).

.. note:: It is possible for leaf page tables to be torn down independent of
          the page tables above it as is done by
          :c:func:`!retract_page_tables`, which is performed under the i_mmap
          read lock, PMD, and PTE page table locks, without this level of care.

Page table moving
^^^^^^^^^^^^^^^^^

Some functions manipulate page table levels above PMD (that is PUD, P4D and PGD
page tables). Most notable of these is :c:func:`!mremap`, which is capable of
moving higher level page tables.

In these instances, it is required that **all** locks are taken, that is
the mmap lock, the VMA lock and the relevant rmap locks.

You can observe this in the :c:func:`!mremap` implementation in the functions
:c:func:`!take_rmap_locks` and :c:func:`!drop_rmap_locks` which perform the rmap
side of lock acquisition, invoked ultimately by :c:func:`!move_page_tables`.

VMA lock internals
------------------

Overview
^^^^^^^^

VMA read locking is entirely optimistic - if the lock is contended or a competing
write has started, then we do not obtain a read lock.

A VMA **read** lock is obtained by :c:func:`!lock_vma_under_rcu`, which first
calls :c:func:`!rcu_read_lock` to ensure that the VMA is looked up in an RCU
critical section, then attempts to VMA lock it via :c:func:`!vma_start_read`,
before releasing the RCU lock via :c:func:`!rcu_read_unlock`.

VMA read locks hold the read lock on the :c:member:`!vma->vm_lock` semaphore for
their duration and the caller of :c:func:`!lock_vma_under_rcu` must release it
via :c:func:`!vma_end_read`.

VMA **write** locks are acquired via :c:func:`!vma_start_write` in instances where a
VMA is about to be modified, unlike :c:func:`!vma_start_read` the lock is always
acquired. An mmap write lock **must** be held for the duration of the VMA write
lock, releasing or downgrading the mmap write lock also releases the VMA write
lock so there is no :c:func:`!vma_end_write` function.

Note that a semaphore write lock is not held across a VMA lock. Rather, a
sequence number is used for serialisation, and the write semaphore is only
acquired at the point of write lock to update this.

This ensures the semantics we require - VMA write locks provide exclusive write
access to the VMA.

Implementation details
^^^^^^^^^^^^^^^^^^^^^^

The VMA lock mechanism is designed to be a lightweight means of avoiding the use
of the heavily contended mmap lock. It is implemented using a combination of a
read/write semaphore and sequence numbers belonging to the containing
:c:struct:`!struct mm_struct` and the VMA.

Read locks are acquired via :c:func:`!vma_start_read`, which is an optimistic
operation, i.e. it tries to acquire a read lock but returns false if it is
unable to do so. At the end of the read operation, :c:func:`!vma_end_read` is
called to release the VMA read lock.

Invoking :c:func:`!vma_start_read` requires that :c:func:`!rcu_read_lock` has
been called first, establishing that we are in an RCU critical section upon VMA
read lock acquisition. Once acquired, the RCU lock can be released as it is only
required for lookup. This is abstracted by :c:func:`!lock_vma_under_rcu` which
is the interface a user should use.

Writing requires the mmap to be write-locked and the VMA lock to be acquired via
:c:func:`!vma_start_write`, however the write lock is released by the termination or
downgrade of the mmap write lock so no :c:func:`!vma_end_write` is required.

All this is achieved by the use of per-mm and per-VMA sequence counts, which are
used in order to reduce complexity, especially for operations which write-lock
multiple VMAs at once.

If the mm sequence count, :c:member:`!mm->mm_lock_seq` is equal to the VMA
sequence count :c:member:`!vma->vm_lock_seq` then the VMA is write-locked. If
they differ, then it is not.

Each time the mmap write lock is released in :c:func:`!mmap_write_unlock` or
:c:func:`!mmap_write_downgrade`, :c:func:`!vma_end_write_all` is invoked which
also increments :c:member:`!mm->mm_lock_seq` via
:c:func:`!mm_lock_seqcount_end`.

This way, we ensure that, regardless of the VMA's sequence number, a write lock
is never incorrectly indicated and that when we release an mmap write lock we
efficiently release **all** VMA write locks contained within the mmap at the
same time.

Since the mmap write lock is exclusive against others who hold it, the automatic
release of any VMA locks on its release makes sense, as you would never want to
keep VMAs locked across entirely separate write operations. It also maintains
correct lock ordering.

Each time a VMA read lock is acquired, we acquire a read lock on the
:c:member:`!vma->vm_lock` read/write semaphore and hold it, while checking that
the sequence count of the VMA does not match that of the mm.

If it does, the read lock fails. If it does not, we hold the lock, excluding
writers, but permitting other readers, who will also obtain this lock under RCU.

Importantly, maple tree operations performed in :c:func:`!lock_vma_under_rcu`
are also RCU safe, so the whole read lock operation is guaranteed to function
correctly.

On the write side, we acquire a write lock on the :c:member:`!vma->vm_lock`
read/write semaphore, before setting the VMA's sequence number under this lock,
also simultaneously holding the mmap write lock.

This way, if any read locks are in effect, :c:func:`!vma_start_write` will sleep
until these are finished and mutual exclusion is achieved.

After setting the VMA's sequence number, the lock is released, avoiding
complexity with a long-term held write lock.

This clever combination of a read/write semaphore and sequence count allows for
fast RCU-based per-VMA lock acquisition (especially on page fault, though
utilised elsewhere) with minimal complexity around lock ordering.

mmap write lock downgrading
---------------------------

When an mmap write lock is held one has exclusive access to resources within the
mmap (with the usual caveats about requiring VMA write locks to avoid races with
tasks holding VMA read locks).

It is then possible to **downgrade** from a write lock to a read lock via
:c:func:`!mmap_write_downgrade` which, similar to :c:func:`!mmap_write_unlock`,
implicitly terminates all VMA write locks via :c:func:`!vma_end_write_all`, but
importantly does not relinquish the mmap lock while downgrading, therefore
keeping the locked virtual address space stable.
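
A sketch of how this is typically used (the work performed in each phase is
hypothetical and elided):

.. code-block:: c

   #include <linux/mm.h>

   static void sketch_downgrade_pattern(struct mm_struct *mm)
   {
           mmap_write_lock(mm);
           /* ... mutate VMAs, taking vma_start_write() on each VMA touched ... */

           /* Keep readers out no longer than necessary: downgrading releases all
            * VMA write locks but keeps the address space stable for us. */
           mmap_write_downgrade(mm);

           /* ... longer read-only work, e.g. populating or faulting in pages ... */

           mmap_read_unlock(mm);
   }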

An interesting consequence of this is that downgraded locks are exclusive
against any other task possessing a downgraded lock (since a racing task would
have to acquire a write lock first to downgrade it, and the downgraded lock
prevents a new write lock from being obtained until the original lock is
released).

For clarity, we map read (R)/downgraded write (D)/write (W) locks against one
another showing which locks exclude the others:

.. list-table:: Lock exclusivity
   :widths: 5 5 5 5
   :header-rows: 1
   :stub-columns: 1

   * -
     - R
     - D
     - W
   * - R
     - N
     - N
     - Y
   * - D
     - N
     - Y
     - Y
   * - W
     - Y
     - Y
     - Y

Here a Y indicates the locks in the matching row/column are mutually exclusive,
and N indicates that they are not.

Stack expansion
---------------

Stack expansion throws up additional complexities in that we cannot permit there
to be racing page faults, as a result we invoke :c:func:`!vma_start_write` to
prevent this in :c:func:`!expand_downwards` or :c:func:`!expand_upwards`.

@@ -6,6 +6,7 @@
 config ARC
         def_bool y
         select ARC_TIMERS
+        select ARCH_HAS_CPU_CACHE_ALIASING
         select ARCH_HAS_CACHE_LINE_SIZE
         select ARCH_HAS_DEBUG_VM_PGTABLE
         select ARCH_HAS_DMA_PREP_COHERENT

arch/arc/include/asm/cachetype.h (new file, 8 lines)

@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_ARC_CACHETYPE_H
+#define __ASM_ARC_CACHETYPE_H
+
+#define cpu_dcache_is_aliasing()        false
+#define cpu_icache_is_aliasing()        true
+
+#endif

@@ -614,6 +614,12 @@ static ssize_t backing_dev_store(struct device *dev,
         }
 
         nr_pages = i_size_read(inode) >> PAGE_SHIFT;
+        /* Refuse to use zero sized device (also prevents self reference) */
+        if (!nr_pages) {
+                err = -EINVAL;
+                goto out;
+        }
+
         bitmap_sz = BITS_TO_LONGS(nr_pages) * sizeof(long);
         bitmap = kvzalloc(bitmap_sz, GFP_KERNEL);
         if (!bitmap) {

@@ -1438,12 +1444,16 @@ static void zram_meta_free(struct zram *zram, u64 disksize)
         size_t num_pages = disksize >> PAGE_SHIFT;
         size_t index;
 
+        if (!zram->table)
+                return;
+
         /* Free all pages that are still in this zram device */
         for (index = 0; index < num_pages; index++)
                 zram_free_page(zram, index);
 
         zs_destroy_pool(zram->mem_pool);
         vfree(zram->table);
+        zram->table = NULL;
 }
 
 static bool zram_meta_alloc(struct zram *zram, u64 disksize)

@@ -2320,11 +2330,6 @@ static void zram_reset_device(struct zram *zram)
 
         zram->limit_pages = 0;
 
-        if (!init_done(zram)) {
-                up_write(&zram->init_lock);
-                return;
-        }
-
         set_capacity_and_notify(zram->disk, 0);
         part_stat_set_all(zram->disk->part0, 0);
 

@@ -825,7 +825,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
                         error = PTR_ERR(folio);
                         goto out;
                 }
-                folio_zero_user(folio, ALIGN_DOWN(addr, hpage_size));
+                folio_zero_user(folio, addr);
                 __folio_mark_uptodate(folio);
                 error = hugetlb_add_to_page_cache(folio, mapping, index);
                 if (unlikely(error)) {

@@ -35,6 +35,7 @@ void nilfs_init_btnc_inode(struct inode *btnc_inode)
         ii->i_flags = 0;
         memset(&ii->i_bmap_data, 0, sizeof(struct nilfs_bmap));
         mapping_set_gfp_mask(btnc_inode->i_mapping, GFP_NOFS);
+        btnc_inode->i_mapping->a_ops = &nilfs_buffer_cache_aops;
 }
 
 void nilfs_btnode_cache_clear(struct address_space *btnc)

@@ -163,7 +163,7 @@ int nilfs_init_gcinode(struct inode *inode)
 
         inode->i_mode = S_IFREG;
         mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
-        inode->i_mapping->a_ops = &empty_aops;
+        inode->i_mapping->a_ops = &nilfs_buffer_cache_aops;
 
         ii->i_flags = 0;
         nilfs_bmap_init_gc(ii->i_bmap);

@@ -276,6 +276,10 @@ const struct address_space_operations nilfs_aops = {
         .is_partially_uptodate = block_is_partially_uptodate,
 };
 
+const struct address_space_operations nilfs_buffer_cache_aops = {
+        .invalidate_folio = block_invalidate_folio,
+};
+
 static int nilfs_insert_inode_locked(struct inode *inode,
                                      struct nilfs_root *root,
                                      unsigned long ino)

@@ -544,8 +548,14 @@ struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
         inode = nilfs_iget_locked(sb, root, ino);
         if (unlikely(!inode))
                 return ERR_PTR(-ENOMEM);
-        if (!(inode->i_state & I_NEW))
+
+        if (!(inode->i_state & I_NEW)) {
+                if (!inode->i_nlink) {
+                        iput(inode);
+                        return ERR_PTR(-ESTALE);
+                }
                 return inode;
+        }
 
         err = __nilfs_read_inode(sb, root, ino, inode);
         if (unlikely(err)) {

@@ -675,6 +685,7 @@ struct inode *nilfs_iget_for_shadow(struct inode *inode)
         NILFS_I(s_inode)->i_flags = 0;
         memset(NILFS_I(s_inode)->i_bmap, 0, sizeof(struct nilfs_bmap));
         mapping_set_gfp_mask(s_inode->i_mapping, GFP_NOFS);
+        s_inode->i_mapping->a_ops = &nilfs_buffer_cache_aops;
 
         err = nilfs_attach_btree_node_cache(s_inode);
         if (unlikely(err)) {

@@ -67,6 +67,11 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
                 inode = NULL;
         } else {
                 inode = nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino);
+                if (inode == ERR_PTR(-ESTALE)) {
+                        nilfs_error(dir->i_sb,
+                                    "deleted inode referenced: %lu", ino);
+                        return ERR_PTR(-EIO);
+                }
         }
 
         return d_splice_alias(inode, dentry);

@@ -401,6 +401,7 @@ extern const struct file_operations nilfs_dir_operations;
 extern const struct inode_operations nilfs_file_inode_operations;
 extern const struct file_operations nilfs_file_operations;
 extern const struct address_space_operations nilfs_aops;
+extern const struct address_space_operations nilfs_buffer_cache_aops;
 extern const struct inode_operations nilfs_dir_inode_operations;
 extern const struct inode_operations nilfs_special_inode_operations;
 extern const struct inode_operations nilfs_symlink_inode_operations;
@@ -971,9 +971,9 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
 	start = count = 0;
 	left = le32_to_cpu(alloc->id1.bitmap1.i_total);
 
-	while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start)) <
-	       left) {
-		if (bit_off == start) {
+	while (1) {
+		bit_off = ocfs2_find_next_zero_bit(bitmap, left, start);
+		if ((bit_off < left) && (bit_off == start)) {
 			count++;
 			start++;
 			continue;
@@ -998,29 +998,12 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
 			}
 		}
 
+		if (bit_off >= left)
+			break;
 		count = 1;
 		start = bit_off + 1;
 	}
 
-	/* clear the contiguous bits until the end boundary */
-	if (count) {
-		blkno = la_start_blk +
-			ocfs2_clusters_to_blocks(osb->sb,
-						 start - count);
-
-		trace_ocfs2_sync_local_to_main_free(
-				count, start - count,
-				(unsigned long long)la_start_blk,
-				(unsigned long long)blkno);
-
-		status = ocfs2_release_clusters(handle,
-						main_bm_inode,
-						main_bm_bh, blkno,
-						count);
-		if (status < 0)
-			mlog_errno(status);
-	}
-
 bail:
 	if (status)
 		mlog_errno(status);
@@ -63,7 +63,12 @@ static inline void set_codetag_empty(union codetag_ref *ref)
 #else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
 
 static inline bool is_codetag_empty(union codetag_ref *ref) { return false; }
-static inline void set_codetag_empty(union codetag_ref *ref) {}
+
+static inline void set_codetag_empty(union codetag_ref *ref)
+{
+	if (ref)
+		ref->ct = NULL;
+}
 
 #endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
 
@@ -135,7 +140,7 @@ static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag)
 #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
 static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag)
 {
-	WARN_ONCE(ref && ref->ct,
+	WARN_ONCE(ref && ref->ct && !is_codetag_empty(ref),
 		  "alloc_tag was not cleared (got tag for %s:%u)\n",
 		  ref->ct->filename, ref->ct->lineno);
 
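An aside, not part of the merge diff: the two alloc_tag.h hunks above are meant to work together. A minimal sketch of the interplay, assuming CONFIG_MEM_ALLOC_PROFILING_DEBUG=y and a hypothetical tag pointer some_tag:

	union codetag_ref ref;

	set_codetag_empty(&ref);              /* reference explicitly marked empty      */
	alloc_tag_add_check(&ref, some_tag);  /* no warning: is_codetag_empty() is true */

With the debug option off, the rewritten stub now really clears ref.ct instead of doing nothing, which is what the pgalloc_tag_swap() hunk further down depends on.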
@@ -155,8 +155,14 @@ static inline int get_cpu_cacheinfo_id(int cpu, int level)
 
 #ifndef CONFIG_ARCH_HAS_CPU_CACHE_ALIASING
 #define cpu_dcache_is_aliasing()	false
+#define cpu_icache_is_aliasing()	cpu_dcache_is_aliasing()
 #else
 #include <asm/cachetype.h>
+
+#ifndef cpu_icache_is_aliasing
+#define cpu_icache_is_aliasing()	cpu_dcache_is_aliasing()
+#endif
+
 #endif
 
 #endif /* _LINUX_CACHEINFO_H */
@@ -224,7 +224,13 @@ static inline
 struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
 				   unsigned long vaddr)
 {
-	return vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr);
+	struct folio *folio;
+
+	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr);
+	if (folio && user_alloc_needs_zeroing())
+		clear_user_highpage(&folio->page, vaddr);
+
+	return folio;
 }
 #endif
 
@@ -31,6 +31,7 @@
 #include <linux/kasan.h>
 #include <linux/memremap.h>
 #include <linux/slab.h>
+#include <linux/cacheinfo.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -3010,7 +3011,15 @@ static inline void pagetable_pte_dtor(struct ptdesc *ptdesc)
 	lruvec_stat_sub_folio(folio, NR_PAGETABLE);
 }
 
-pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp);
+pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp);
+static inline pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr,
+				      pmd_t *pmdvalp)
+{
+	pte_t *pte;
+
+	__cond_lock(RCU, pte = ___pte_offset_map(pmd, addr, pmdvalp));
+	return pte;
+}
 static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr)
 {
 	return __pte_offset_map(pmd, addr, NULL);
@@ -3023,7 +3032,8 @@ static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
 {
 	pte_t *pte;
 
-	__cond_lock(*ptlp, pte = __pte_offset_map_lock(mm, pmd, addr, ptlp));
+	__cond_lock(RCU, __cond_lock(*ptlp,
+			pte = __pte_offset_map_lock(mm, pmd, addr, ptlp)));
 	return pte;
 }
 
@@ -4175,6 +4185,23 @@ static inline int do_mseal(unsigned long start, size_t len_in, unsigned long fla
 }
 #endif
 
+/*
+ * user_alloc_needs_zeroing checks if a user folio from page allocator needs to
+ * be zeroed or not.
+ */
+static inline bool user_alloc_needs_zeroing(void)
+{
+	/*
+	 * for user folios, arch with cache aliasing requires cache flush and
+	 * arc changes folio->flags to make icache coherent with dcache, so
+	 * always return false to make caller use
+	 * clear_user_page()/clear_user_highpage().
+	 */
+	return cpu_dcache_is_aliasing() || cpu_icache_is_aliasing() ||
+	       !static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
+				    &init_on_alloc);
+}
+
 int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status);
 int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status);
 int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status);
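For context only, a minimal usage sketch (not taken from this merge) of the lock/unlock pairing that the __cond_lock(RCU, ...) annotations above describe to sparse; the function name example_read_pte is made up:

static int example_read_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
{
	spinlock_t *ptl;
	pte_t *pte;

	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -EAGAIN;	/* the PMD changed under us; caller retries */
	/* ... read *pte while holding ptl (and the implied RCU read lock) ... */
	pte_unmap_unlock(pte, ptl);
	return 0;
}

pte_offset_map_lock() can return NULL when the page table has gone away, so the NULL check is part of the normal calling convention rather than something added by this patch.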
@@ -862,18 +862,10 @@ static inline void ClearPageCompound(struct page *page)
 	ClearPageHead(page);
 }
 FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
-FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
-/*
- * PG_partially_mapped is protected by deferred_split split_queue_lock,
- * so its safe to use non-atomic set/clear.
- */
-__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
-__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
+FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
 #else
 FOLIO_FLAG_FALSE(large_rmappable)
-FOLIO_TEST_FLAG_FALSE(partially_mapped)
-__FOLIO_SET_FLAG_NOOP(partially_mapped)
-__FOLIO_CLEAR_FLAG_NOOP(partially_mapped)
+FOLIO_FLAG_FALSE(partially_mapped)
 #endif
 
 #define PG_head_mask			((1UL << PG_head))
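A hedged note on the hunk above: switching from FOLIO_TEST_FLAG plus the __FOLIO_SET_FLAG/__FOLIO_CLEAR_FLAG variants to plain FOLIO_FLAG means the generated set/clear helpers are the atomic ones. Roughly, the macro now provides helpers equivalent to these declarations (a sketch, not the literal expansion):

static inline bool folio_test_partially_mapped(const struct folio *folio);
static inline void folio_set_partially_mapped(struct folio *folio);	/* atomic set_bit()   */
static inline void folio_clear_partially_mapped(struct folio *folio);	/* atomic clear_bit() */

That is what lets the mm/huge_memory.c hunks further down drop the non-atomic __folio_set_partially_mapped()/__folio_clear_partially_mapped() calls.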
@@ -515,7 +515,7 @@ static inline const char *node_stat_name(enum node_stat_item item)
 
 static inline const char *lru_list_name(enum lru_list lru)
 {
-	return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
+	return node_stat_name(NR_LRU_BASE + (enum node_stat_item)lru) + 3; // skip "nr_"
 }
 
 #if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
@@ -639,11 +639,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 	LIST_HEAD(uf);
 	VMA_ITERATOR(vmi, mm, 0);
 
-	uprobe_start_dup_mmap();
-	if (mmap_write_lock_killable(oldmm)) {
-		retval = -EINTR;
-		goto fail_uprobe_end;
-	}
+	if (mmap_write_lock_killable(oldmm))
+		return -EINTR;
 	flush_cache_dup_mm(oldmm);
 	uprobe_dup_mmap(oldmm, mm);
 	/*
@@ -782,8 +779,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 		dup_userfaultfd_complete(&uf);
 	else
 		dup_userfaultfd_fail(&uf);
-fail_uprobe_end:
-	uprobe_end_dup_mmap();
 	return retval;
 
 fail_nomem_anon_vma_fork:
@@ -1692,9 +1687,11 @@ static struct mm_struct *dup_mm(struct task_struct *tsk,
 	if (!mm_init(mm, tsk, mm->user_ns))
 		goto fail_nomem;
 
+	uprobe_start_dup_mmap();
 	err = dup_mmap(mm, oldmm);
 	if (err)
 		goto free_pt;
+	uprobe_end_dup_mmap();
 
 	mm->hiwater_rss = get_mm_rss(mm);
 	mm->hiwater_vm = mm->total_vm;
@@ -1709,6 +1706,8 @@ static struct mm_struct *dup_mm(struct task_struct *tsk,
 	mm->binfmt = NULL;
 	mm_init_owner(mm, NULL);
 	mmput(mm);
+	if (err)
+		uprobe_end_dup_mmap();
 
 fail_nomem:
 	return NULL;
@@ -209,6 +209,13 @@ void pgalloc_tag_swap(struct folio *new, struct folio *old)
 		return;
 	}
 
+	/*
+	 * Clear tag references to avoid debug warning when using
+	 * __alloc_tag_ref_set() with non-empty reference.
+	 */
+	set_codetag_empty(&ref_old);
+	set_codetag_empty(&ref_new);
+
 	/* swap tags */
 	__alloc_tag_ref_set(&ref_old, tag_new);
 	update_page_tag_ref(handle_old, &ref_old);
@@ -401,28 +408,52 @@ static bool find_aligned_area(struct ma_state *mas, unsigned long section_size,
 
 static int vm_module_tags_populate(void)
 {
-	unsigned long phys_size = vm_module_tags->nr_pages << PAGE_SHIFT;
+	unsigned long phys_end = ALIGN_DOWN(module_tags.start_addr, PAGE_SIZE) +
+				 (vm_module_tags->nr_pages << PAGE_SHIFT);
+	unsigned long new_end = module_tags.start_addr + module_tags.size;
 
-	if (phys_size < module_tags.size) {
+	if (phys_end < new_end) {
 		struct page **next_page = vm_module_tags->pages + vm_module_tags->nr_pages;
-		unsigned long addr = module_tags.start_addr + phys_size;
+		unsigned long old_shadow_end = ALIGN(phys_end, MODULE_ALIGN);
+		unsigned long new_shadow_end = ALIGN(new_end, MODULE_ALIGN);
 		unsigned long more_pages;
 		unsigned long nr;
 
-		more_pages = ALIGN(module_tags.size - phys_size, PAGE_SIZE) >> PAGE_SHIFT;
+		more_pages = ALIGN(new_end - phys_end, PAGE_SIZE) >> PAGE_SHIFT;
 		nr = alloc_pages_bulk_array_node(GFP_KERNEL | __GFP_NOWARN,
 						 NUMA_NO_NODE, more_pages, next_page);
 		if (nr < more_pages ||
-		    vmap_pages_range(addr, addr + (nr << PAGE_SHIFT), PAGE_KERNEL,
+		    vmap_pages_range(phys_end, phys_end + (nr << PAGE_SHIFT), PAGE_KERNEL,
 				     next_page, PAGE_SHIFT) < 0) {
 			/* Clean up and error out */
 			for (int i = 0; i < nr; i++)
 				__free_page(next_page[i]);
 			return -ENOMEM;
 		}
 
 		vm_module_tags->nr_pages += nr;
+
+		/*
+		 * Kasan allocates 1 byte of shadow for every 8 bytes of data.
+		 * When kasan_alloc_module_shadow allocates shadow memory,
+		 * its unit of allocation is a page.
+		 * Therefore, here we need to align to MODULE_ALIGN.
+		 */
+		if (old_shadow_end < new_shadow_end)
+			kasan_alloc_module_shadow((void *)old_shadow_end,
+						  new_shadow_end - old_shadow_end,
+						  GFP_KERNEL);
 	}
 
+	/*
+	 * Mark the pages as accessible, now that they are mapped.
+	 * With hardware tag-based KASAN, marking is skipped for
+	 * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
+	 */
+	kasan_unpoison_vmalloc((void *)module_tags.start_addr,
+			       new_end - module_tags.start_addr,
+			       KASAN_VMALLOC_PROT_NORMAL);
+
 	return 0;
 }
 
@@ -1176,11 +1176,12 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
 	folio_throttle_swaprate(folio, gfp);
 
 	/*
-	 * When a folio is not zeroed during allocation (__GFP_ZERO not used),
-	 * folio_zero_user() is used to make sure that the page corresponding
-	 * to the faulting address will be hot in the cache after zeroing.
+	 * When a folio is not zeroed during allocation (__GFP_ZERO not used)
+	 * or user folios require special handling, folio_zero_user() is used to
+	 * make sure that the page corresponding to the faulting address will be
+	 * hot in the cache after zeroing.
 	 */
-	if (!alloc_zeroed())
+	if (user_alloc_needs_zeroing())
 		folio_zero_user(folio, addr);
 	/*
 	 * The memory barrier inside __folio_mark_uptodate makes sure that
@@ -3576,7 +3577,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 		    !list_empty(&folio->_deferred_list)) {
 			ds_queue->split_queue_len--;
 			if (folio_test_partially_mapped(folio)) {
-				__folio_clear_partially_mapped(folio);
+				folio_clear_partially_mapped(folio);
 				mod_mthp_stat(folio_order(folio),
 					      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
 			}
@@ -3688,7 +3689,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
 	if (!list_empty(&folio->_deferred_list)) {
 		ds_queue->split_queue_len--;
 		if (folio_test_partially_mapped(folio)) {
-			__folio_clear_partially_mapped(folio);
+			folio_clear_partially_mapped(folio);
 			mod_mthp_stat(folio_order(folio),
 				      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
 		}
@@ -3732,7 +3733,7 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
 	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
 	if (partially_mapped) {
 		if (!folio_test_partially_mapped(folio)) {
-			__folio_set_partially_mapped(folio);
+			folio_set_partially_mapped(folio);
 			if (folio_test_pmd_mappable(folio))
 				count_vm_event(THP_DEFERRED_SPLIT_PAGE);
 			count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
@@ -3825,7 +3826,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
 		} else {
 			/* We lost race with folio_put() */
 			if (folio_test_partially_mapped(folio)) {
-				__folio_clear_partially_mapped(folio);
+				folio_clear_partially_mapped(folio);
 				mod_mthp_stat(folio_order(folio),
 					      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
 			}
@@ -4168,7 +4169,7 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
 		size_t input_len = strlen(input_buf);
 
 		tok = strsep(&buf, ",");
-		if (tok) {
+		if (tok && buf) {
 			strscpy(file_path, tok);
 		} else {
 			ret = -EINVAL;
@@ -5340,7 +5340,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 				break;
 			}
 			ret = copy_user_large_folio(new_folio, pte_folio,
-						    ALIGN_DOWN(addr, sz), dst_vma);
+						    addr, dst_vma);
 			folio_put(pte_folio);
 			if (ret) {
 				folio_put(new_folio);
@@ -6643,8 +6643,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 			*foliop = NULL;
 			goto out;
 		}
-		ret = copy_user_large_folio(folio, *foliop,
-					    ALIGN_DOWN(dst_addr, size), dst_vma);
+		ret = copy_user_large_folio(folio, *foliop, dst_addr, dst_vma);
 		folio_put(*foliop);
 		*foliop = NULL;
 		if (ret) {
@@ -1285,12 +1285,6 @@ void touch_pud(struct vm_area_struct *vma, unsigned long addr,
 void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
 	       pmd_t *pmd, bool write);
 
-static inline bool alloc_zeroed(void)
-{
-	return static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
-				   &init_on_alloc);
-}
-
 /*
  * Parses a string with mem suffixes into its order. Useful to parse kernel
  * parameters.
18	mm/memory.c
@@ -4733,12 +4733,12 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 			folio_throttle_swaprate(folio, gfp);
 			/*
 			 * When a folio is not zeroed during allocation
-			 * (__GFP_ZERO not used), folio_zero_user() is used
-			 * to make sure that the page corresponding to the
-			 * faulting address will be hot in the cache after
-			 * zeroing.
+			 * (__GFP_ZERO not used) or user folios require special
+			 * handling, folio_zero_user() is used to make sure
+			 * that the page corresponding to the faulting address
+			 * will be hot in the cache after zeroing.
 			 */
-			if (!alloc_zeroed())
+			if (user_alloc_needs_zeroing())
 				folio_zero_user(folio, vmf->address);
 			return folio;
 		}
@@ -6815,9 +6815,10 @@ static inline int process_huge_page(
 	return 0;
 }
 
-static void clear_gigantic_page(struct folio *folio, unsigned long addr,
+static void clear_gigantic_page(struct folio *folio, unsigned long addr_hint,
 				unsigned int nr_pages)
 {
+	unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(folio));
 	int i;
 
 	might_sleep();
@@ -6851,13 +6852,14 @@ void folio_zero_user(struct folio *folio, unsigned long addr_hint)
 }
 
 static int copy_user_gigantic_page(struct folio *dst, struct folio *src,
-				   unsigned long addr,
+				   unsigned long addr_hint,
 				   struct vm_area_struct *vma,
 				   unsigned int nr_pages)
 {
-	int i;
+	unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(dst));
 	struct page *dst_page;
 	struct page *src_page;
+	int i;
 
 	for (i = 0; i < nr_pages; i++) {
 		dst_page = folio_page(dst, i);
@@ -1238,13 +1238,15 @@ static void split_large_buddy(struct zone *zone, struct page *page,
 	if (order > pageblock_order)
 		order = pageblock_order;
 
-	while (pfn != end) {
+	do {
 		int mt = get_pfnblock_migratetype(page, pfn);
 
 		__free_one_page(page, pfn, zone, order, mt, fpi);
 		pfn += 1 << order;
+		if (pfn == end)
+			break;
 		page = pfn_to_page(pfn);
-	}
+	} while (1);
 }
 
 static void free_one_page(struct zone *zone, struct page *page,
@@ -279,7 +279,7 @@ static unsigned long pmdp_get_lockless_start(void) { return 0; }
 static void pmdp_get_lockless_end(unsigned long irqflags) { }
 #endif
 
-pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
+pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
 {
 	unsigned long irqflags;
 	pmd_t pmdval;
22	mm/shmem.c
@@ -787,6 +787,14 @@ static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+static void shmem_update_stats(struct folio *folio, int nr_pages)
+{
+	if (folio_test_pmd_mappable(folio))
+		__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages);
+	__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
+	__lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages);
+}
+
 /*
  * Somewhat like filemap_add_folio, but error if expected item has gone.
  */
@@ -821,10 +829,7 @@ static int shmem_add_to_page_cache(struct folio *folio,
 		xas_store(&xas, folio);
 		if (xas_error(&xas))
 			goto unlock;
-		if (folio_test_pmd_mappable(folio))
-			__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr);
-		__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
-		__lruvec_stat_mod_folio(folio, NR_SHMEM, nr);
+		shmem_update_stats(folio, nr);
 		mapping->nrpages += nr;
 unlock:
 		xas_unlock_irq(&xas);
@@ -852,8 +857,7 @@ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
 	error = shmem_replace_entry(mapping, folio->index, folio, radswap);
 	folio->mapping = NULL;
 	mapping->nrpages -= nr;
-	__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
-	__lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
+	shmem_update_stats(folio, -nr);
 	xa_unlock_irq(&mapping->i_pages);
 	folio_put_refs(folio, nr);
 	BUG_ON(error);
@@ -1969,10 +1973,8 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
 	}
 	if (!error) {
 		mem_cgroup_replace_folio(old, new);
-		__lruvec_stat_mod_folio(new, NR_FILE_PAGES, nr_pages);
-		__lruvec_stat_mod_folio(new, NR_SHMEM, nr_pages);
-		__lruvec_stat_mod_folio(old, NR_FILE_PAGES, -nr_pages);
-		__lruvec_stat_mod_folio(old, NR_SHMEM, -nr_pages);
+		shmem_update_stats(new, nr_pages);
+		shmem_update_stats(old, -nr_pages);
 	}
 	xa_unlock_irq(&swap_mapping->i_pages);
 
5	mm/vma.c
@@ -2460,10 +2460,13 @@ unsigned long __mmap_region(struct file *file, unsigned long addr,
 
 	/* If flags changed, we might be able to merge, so try again. */
 	if (map.retry_merge) {
+		struct vm_area_struct *merged;
 		VMG_MMAP_STATE(vmg, &map, vma);
 
 		vma_iter_config(map.vmi, map.addr, map.end);
-		vma_merge_existing_range(&vmg);
+		merged = vma_merge_existing_range(&vmg);
+		if (merged)
+			vma = merged;
 	}
 
 	__mmap_complete(&map, vma);
@@ -3374,7 +3374,8 @@ void vfree(const void *addr)
 		struct page *page = vm->pages[i];
 
 		BUG_ON(!page);
-		mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
+		if (!(vm->flags & VM_MAP_PUT_PAGES))
+			mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
 		/*
 		 * High-order allocs for huge vmallocs are split, so
 		 * can be freed as an array of order-0 allocations
@@ -3382,7 +3383,8 @@ void vfree(const void *addr)
 		__free_page(page);
 		cond_resched();
 	}
-	atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages);
+	if (!(vm->flags & VM_MAP_PUT_PAGES))
+		atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages);
 	kvfree(vm->pages);
 	kfree(vm);
 }
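A hedged sketch of the kind of mapping the new VM_MAP_PUT_PAGES checks are about (the helper name map_caller_pages is made up): pages allocated by the caller and handed over through vmap() were never added to the MEMCG_VMALLOC or nr_vmalloc_pages counters, so vfree() must not subtract them either.

#include <linux/mm.h>
#include <linux/vmalloc.h>

static void *map_caller_pages(struct page **pages, unsigned int nr_pages)
{
	/* Ownership of @pages and of the array itself moves to vmalloc;
	 * vfree() on the returned address frees the pages as well. */
	return vmap(pages, nr_pages, VM_MAP | VM_MAP_PUT_PAGES, PAGE_KERNEL);
}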
@@ -9,6 +9,7 @@
 #include <fcntl.h>
 #include <linux/memfd.h>
 #include <sched.h>
+#include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <signal.h>
@@ -1557,6 +1558,11 @@ static void test_share_fork(char *banner, char *b_suffix)
 	close(fd);
 }
 
+static bool pid_ns_supported(void)
+{
+	return access("/proc/self/ns/pid", F_OK) == 0;
+}
+
 int main(int argc, char **argv)
 {
 	pid_t pid;
@@ -1591,8 +1597,12 @@ int main(int argc, char **argv)
 	test_seal_grow();
 	test_seal_resize();
 
-	test_sysctl_simple();
-	test_sysctl_nested();
+	if (pid_ns_supported()) {
+		test_sysctl_simple();
+		test_sysctl_nested();
+	} else {
+		printf("PID namespaces are not supported; skipping sysctl tests\n");
+	}
 
 	test_share_dup("SHARE-DUP", "");
 	test_share_mmap("SHARE-MMAP", "");