mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-16 21:35:07 +00:00
- Yu Zhao's Multi-Gen LRU patches are here. They've been under test in
linux-next for a couple of months without, to my knowledge, any negative reports (or any positive ones, come to that). - Also the Maple Tree from Liam R. Howlett. An overlapping range-based tree for vmas. It it apparently slight more efficient in its own right, but is mainly targeted at enabling work to reduce mmap_lock contention. Liam has identified a number of other tree users in the kernel which could be beneficially onverted to mapletrees. Yu Zhao has identified a hard-to-hit but "easy to fix" lockdep splat (https://lkml.kernel.org/r/CAOUHufZabH85CeUN-MEMgL8gJGzJEWUrkiM58JkTbBhh-jew0Q@mail.gmail.com). This has yet to be addressed due to Liam's unfortunately timed vacation. He is now back and we'll get this fixed up. - Dmitry Vyukov introduces KMSAN: the Kernel Memory Sanitizer. It uses clang-generated instrumentation to detect used-unintialized bugs down to the single bit level. KMSAN keeps finding bugs. New ones, as well as the legacy ones. - Yang Shi adds a userspace mechanism (madvise) to induce a collapse of memory into THPs. - Zach O'Keefe has expanded Yang Shi's madvise(MADV_COLLAPSE) to support file/shmem-backed pages. - userfaultfd updates from Axel Rasmussen - zsmalloc cleanups from Alexey Romanov - cleanups from Miaohe Lin: vmscan, hugetlb_cgroup, hugetlb and memory-failure - Huang Ying adds enhancements to NUMA balancing memory tiering mode's page promotion, with a new way of detecting hot pages. - memcg updates from Shakeel Butt: charging optimizations and reduced memory consumption. - memcg cleanups from Kairui Song. - memcg fixes and cleanups from Johannes Weiner. - Vishal Moola provides more folio conversions - Zhang Yi removed ll_rw_block() :( - migration enhancements from Peter Xu - migration error-path bugfixes from Huang Ying - Aneesh Kumar added ability for a device driver to alter the memory tiering promotion paths. For optimizations by PMEM drivers, DRM drivers, etc. - vma merging improvements from Jakub Matěn. 
- NUMA hinting cleanups from David Hildenbrand. - xu xin added aditional userspace visibility into KSM merging activity. - THP & KSM code consolidation from Qi Zheng. - more folio work from Matthew Wilcox. - KASAN updates from Andrey Konovalov. - DAMON cleanups from Kaixu Xia. - DAMON work from SeongJae Park: fixes, cleanups. - hugetlb sysfs cleanups from Muchun Song. - Mike Kravetz fixes locking issues in hugetlbfs and in hugetlb core. -----BEGIN PGP SIGNATURE----- iHUEABYKAB0WIQTTMBEPP41GrTpTJgfdBJ7gKXxAjgUCY0HaPgAKCRDdBJ7gKXxA joPjAQDZ5LlRCMWZ1oxLP2NOTp6nm63q9PWcGnmY50FjD/dNlwEAnx7OejCLWGWf bbTuk6U2+TKgJa4X7+pbbejeoqnt5QU= =xfWx -----END PGP SIGNATURE----- Merge tag 'mm-stable-2022-10-08' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm Pull MM updates from Andrew Morton: - Yu Zhao's Multi-Gen LRU patches are here. They've been under test in linux-next for a couple of months without, to my knowledge, any negative reports (or any positive ones, come to that). - Also the Maple Tree from Liam Howlett. An overlapping range-based tree for vmas. It it apparently slightly more efficient in its own right, but is mainly targeted at enabling work to reduce mmap_lock contention. Liam has identified a number of other tree users in the kernel which could be beneficially onverted to mapletrees. Yu Zhao has identified a hard-to-hit but "easy to fix" lockdep splat at [1]. This has yet to be addressed due to Liam's unfortunately timed vacation. He is now back and we'll get this fixed up. - Dmitry Vyukov introduces KMSAN: the Kernel Memory Sanitizer. It uses clang-generated instrumentation to detect used-unintialized bugs down to the single bit level. KMSAN keeps finding bugs. New ones, as well as the legacy ones. - Yang Shi adds a userspace mechanism (madvise) to induce a collapse of memory into THPs. - Zach O'Keefe has expanded Yang Shi's madvise(MADV_COLLAPSE) to support file/shmem-backed pages. 
- userfaultfd updates from Axel Rasmussen - zsmalloc cleanups from Alexey Romanov - cleanups from Miaohe Lin: vmscan, hugetlb_cgroup, hugetlb and memory-failure - Huang Ying adds enhancements to NUMA balancing memory tiering mode's page promotion, with a new way of detecting hot pages. - memcg updates from Shakeel Butt: charging optimizations and reduced memory consumption. - memcg cleanups from Kairui Song. - memcg fixes and cleanups from Johannes Weiner. - Vishal Moola provides more folio conversions - Zhang Yi removed ll_rw_block() :( - migration enhancements from Peter Xu - migration error-path bugfixes from Huang Ying - Aneesh Kumar added ability for a device driver to alter the memory tiering promotion paths. For optimizations by PMEM drivers, DRM drivers, etc. - vma merging improvements from Jakub Matěn. - NUMA hinting cleanups from David Hildenbrand. - xu xin added aditional userspace visibility into KSM merging activity. - THP & KSM code consolidation from Qi Zheng. - more folio work from Matthew Wilcox. - KASAN updates from Andrey Konovalov. - DAMON cleanups from Kaixu Xia. - DAMON work from SeongJae Park: fixes, cleanups. - hugetlb sysfs cleanups from Muchun Song. - Mike Kravetz fixes locking issues in hugetlbfs and in hugetlb core. 
Link: https://lkml.kernel.org/r/CAOUHufZabH85CeUN-MEMgL8gJGzJEWUrkiM58JkTbBhh-jew0Q@mail.gmail.com [1] * tag 'mm-stable-2022-10-08' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (555 commits) hugetlb: allocate vma lock for all sharable vmas hugetlb: take hugetlb vma_lock when clearing vma_lock->vma pointer hugetlb: fix vma lock handling during split vma and range unmapping mglru: mm/vmscan.c: fix imprecise comments mm/mglru: don't sync disk for each aging cycle mm: memcontrol: drop dead CONFIG_MEMCG_SWAP config symbol mm: memcontrol: use do_memsw_account() in a few more places mm: memcontrol: deprecate swapaccounting=0 mode mm: memcontrol: don't allocate cgroup swap arrays when memcg is disabled mm/secretmem: remove reduntant return value mm/hugetlb: add available_huge_pages() func mm: remove unused inline functions from include/linux/mm_inline.h selftests/vm: add selftest for MADV_COLLAPSE of uffd-minor memory selftests/vm: add file/shmem MADV_COLLAPSE selftest for cleared pmd selftests/vm: add thp collapse shmem testing selftests/vm: add thp collapse file and tmpfs testing selftests/vm: modularize thp collapse memory operations selftests/vm: dedup THP helpers mm/khugepaged: add tracepoint to hpage_collapse_scan_file() mm/madvise: add file and shmem support to MADV_COLLAPSE ...
This commit is contained in:
commit
27bc50fc90
25
Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers
Normal file
25
Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
What: /sys/devices/virtual/memory_tiering/
|
||||||
|
Date: August 2022
|
||||||
|
Contact: Linux memory management mailing list <linux-mm@kvack.org>
|
||||||
|
Description: A collection of all the memory tiers allocated.
|
||||||
|
|
||||||
|
Individual memory tier details are contained in subdirectories
|
||||||
|
named by the abstract distance of the memory tier.
|
||||||
|
|
||||||
|
/sys/devices/virtual/memory_tiering/memory_tierN/
|
||||||
|
|
||||||
|
|
||||||
|
What: /sys/devices/virtual/memory_tiering/memory_tierN/
|
||||||
|
/sys/devices/virtual/memory_tiering/memory_tierN/nodes
|
||||||
|
Date: August 2022
|
||||||
|
Contact: Linux memory management mailing list <linux-mm@kvack.org>
|
||||||
|
Description: Directory with details of a specific memory tier
|
||||||
|
|
||||||
|
This is the directory containing information about a particular
|
||||||
|
memory tier, memory_tierN, where N is derived based on abstract distance.
|
||||||
|
|
||||||
|
A smaller value of N implies a higher (faster) memory tier in the
|
||||||
|
hierarchy.
|
||||||
|
|
||||||
|
nodes: NUMA nodes that are part of this memory tier.
|
||||||
|
|
@ -13,7 +13,7 @@ a) waiting for a CPU (while being runnable)
|
|||||||
b) completion of synchronous block I/O initiated by the task
|
b) completion of synchronous block I/O initiated by the task
|
||||||
c) swapping in pages
|
c) swapping in pages
|
||||||
d) memory reclaim
|
d) memory reclaim
|
||||||
e) thrashing page cache
|
e) thrashing
|
||||||
f) direct compact
|
f) direct compact
|
||||||
g) write-protect copy
|
g) write-protect copy
|
||||||
|
|
||||||
|
@ -299,7 +299,7 @@ Per-node-per-memcgroup LRU (cgroup's private LRU) is guarded by
|
|||||||
lruvec->lru_lock; PG_lru bit of page->flags is cleared before
|
lruvec->lru_lock; PG_lru bit of page->flags is cleared before
|
||||||
isolating a page from its LRU under lruvec->lru_lock.
|
isolating a page from its LRU under lruvec->lru_lock.
|
||||||
|
|
||||||
2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
|
2.7 Kernel Memory Extension
|
||||||
-----------------------------------------------
|
-----------------------------------------------
|
||||||
|
|
||||||
With the Kernel memory extension, the Memory Controller is able to limit
|
With the Kernel memory extension, the Memory Controller is able to limit
|
||||||
@ -386,8 +386,6 @@ U != 0, K >= U:
|
|||||||
|
|
||||||
a. Enable CONFIG_CGROUPS
|
a. Enable CONFIG_CGROUPS
|
||||||
b. Enable CONFIG_MEMCG
|
b. Enable CONFIG_MEMCG
|
||||||
c. Enable CONFIG_MEMCG_SWAP (to use swap extension)
|
|
||||||
d. Enable CONFIG_MEMCG_KMEM (to use kmem extension)
|
|
||||||
|
|
||||||
3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
|
3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
|
||||||
-------------------------------------------------------------------
|
-------------------------------------------------------------------
|
||||||
|
@ -1469,6 +1469,14 @@
|
|||||||
Permit 'security.evm' to be updated regardless of
|
Permit 'security.evm' to be updated regardless of
|
||||||
current integrity status.
|
current integrity status.
|
||||||
|
|
||||||
|
early_page_ext [KNL] Enforces page_ext initialization to earlier
|
||||||
|
stages so as to cover more early boot allocations.
|
||||||
|
Please note that as a side effect some optimizations
|
||||||
|
might be disabled to achieve that (e.g. parallelized
|
||||||
|
memory initialization is disabled) so the boot process
|
||||||
|
might take longer, especially on systems with a lot of
|
||||||
|
memory. Available with CONFIG_PAGE_EXTENSION=y.
|
||||||
|
|
||||||
failslab=
|
failslab=
|
||||||
fail_usercopy=
|
fail_usercopy=
|
||||||
fail_page_alloc=
|
fail_page_alloc=
|
||||||
@ -6041,12 +6049,6 @@
|
|||||||
This parameter controls use of the Protected
|
This parameter controls use of the Protected
|
||||||
Execution Facility on pSeries.
|
Execution Facility on pSeries.
|
||||||
|
|
||||||
swapaccount= [KNL]
|
|
||||||
Format: [0|1]
|
|
||||||
Enable accounting of swap in memory resource
|
|
||||||
controller if no parameter or 1 is given or disable
|
|
||||||
it if 0 is given (See Documentation/admin-guide/cgroup-v1/memory.rst)
|
|
||||||
|
|
||||||
swiotlb= [ARM,IA-64,PPC,MIPS,X86]
|
swiotlb= [ARM,IA-64,PPC,MIPS,X86]
|
||||||
Format: { <int> [,<int>] | force | noforce }
|
Format: { <int> [,<int>] | force | noforce }
|
||||||
<int> -- Number of I/O TLB slabs
|
<int> -- Number of I/O TLB slabs
|
||||||
|
@ -5,10 +5,10 @@ CMA Debugfs Interface
|
|||||||
The CMA debugfs interface is useful to retrieve basic information out of the
|
The CMA debugfs interface is useful to retrieve basic information out of the
|
||||||
different CMA areas and to test allocation/release in each of the areas.
|
different CMA areas and to test allocation/release in each of the areas.
|
||||||
|
|
||||||
Each CMA zone represents a directory under <debugfs>/cma/, indexed by the
|
Each CMA area represents a directory under <debugfs>/cma/, represented by
|
||||||
kernel's CMA index. So the first CMA zone would be:
|
its CMA name like below:
|
||||||
|
|
||||||
<debugfs>/cma/cma-0
|
<debugfs>/cma/<cma_name>
|
||||||
|
|
||||||
The structure of the files created under that directory is as follows:
|
The structure of the files created under that directory is as follows:
|
||||||
|
|
||||||
@ -18,8 +18,8 @@ The structure of the files created under that directory is as follows:
|
|||||||
- [RO] bitmap: The bitmap of page states in the zone.
|
- [RO] bitmap: The bitmap of page states in the zone.
|
||||||
- [WO] alloc: Allocate N pages from that CMA area. For example::
|
- [WO] alloc: Allocate N pages from that CMA area. For example::
|
||||||
|
|
||||||
echo 5 > <debugfs>/cma/cma-2/alloc
|
echo 5 > <debugfs>/cma/<cma_name>/alloc
|
||||||
|
|
||||||
would try to allocate 5 pages from the cma-2 area.
|
would try to allocate 5 pages from the 'cma_name' area.
|
||||||
|
|
||||||
- [WO] free: Free N pages from that CMA area, similar to the above.
|
- [WO] free: Free N pages from that CMA area, similar to the above.
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
.. SPDX-License-Identifier: GPL-2.0
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
========================
|
==========================
|
||||||
Monitoring Data Accesses
|
DAMON: Data Access MONitor
|
||||||
========================
|
==========================
|
||||||
|
|
||||||
:doc:`DAMON </mm/damon/index>` allows light-weight data access monitoring.
|
:doc:`DAMON </mm/damon/index>` allows light-weight data access monitoring.
|
||||||
Using DAMON, users can analyze the memory access patterns of their systems and
|
Using DAMON, users can analyze the memory access patterns of their systems and
|
||||||
|
@ -29,16 +29,9 @@ called DAMON Operator (DAMO). It is available at
|
|||||||
https://github.com/awslabs/damo. The examples below assume that ``damo`` is on
|
https://github.com/awslabs/damo. The examples below assume that ``damo`` is on
|
||||||
your ``$PATH``. It's not mandatory, though.
|
your ``$PATH``. It's not mandatory, though.
|
||||||
|
|
||||||
Because DAMO is using the debugfs interface (refer to :doc:`usage` for the
|
Because DAMO is using the sysfs interface (refer to :doc:`usage` for the
|
||||||
detail) of DAMON, you should ensure debugfs is mounted. Mount it manually as
|
detail) of DAMON, you should ensure :doc:`sysfs </filesystems/sysfs>` is
|
||||||
below::
|
mounted.
|
||||||
|
|
||||||
# mount -t debugfs none /sys/kernel/debug/
|
|
||||||
|
|
||||||
or append the following line to your ``/etc/fstab`` file so that your system
|
|
||||||
can automatically mount debugfs upon booting::
|
|
||||||
|
|
||||||
debugfs /sys/kernel/debug debugfs defaults 0 0
|
|
||||||
|
|
||||||
|
|
||||||
Recording Data Access Patterns
|
Recording Data Access Patterns
|
||||||
|
@ -393,6 +393,11 @@ the files as above. Above is only for an example.
|
|||||||
debugfs Interface
|
debugfs Interface
|
||||||
=================
|
=================
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
The DAMON debugfs interface will be removed after the next LTS kernel is released, so
|
||||||
|
users should move to the :ref:`sysfs interface <sysfs_interface>`.
|
||||||
|
|
||||||
DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``,
|
DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``,
|
||||||
``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and
|
``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and
|
||||||
``rm_contexts`` under its debugfs directory, ``<debugfs>/damon/``.
|
``rm_contexts`` under its debugfs directory, ``<debugfs>/damon/``.
|
||||||
|
@ -32,6 +32,7 @@ the Linux memory management.
|
|||||||
idle_page_tracking
|
idle_page_tracking
|
||||||
ksm
|
ksm
|
||||||
memory-hotplug
|
memory-hotplug
|
||||||
|
multigen_lru
|
||||||
nommu-mmap
|
nommu-mmap
|
||||||
numa_memory_policy
|
numa_memory_policy
|
||||||
numaperf
|
numaperf
|
||||||
|
@ -184,6 +184,42 @@ The maximum possible ``pages_sharing/pages_shared`` ratio is limited by the
|
|||||||
``max_page_sharing`` tunable. To increase the ratio ``max_page_sharing`` must
|
``max_page_sharing`` tunable. To increase the ratio ``max_page_sharing`` must
|
||||||
be increased accordingly.
|
be increased accordingly.
|
||||||
|
|
||||||
|
Monitoring KSM profit
|
||||||
|
=====================
|
||||||
|
|
||||||
|
KSM can save memory by merging identical pages, but also can consume
|
||||||
|
additional memory, because it needs to generate a number of rmap_items to
|
||||||
|
save each scanned page's brief rmap information. Some of these pages may
|
||||||
|
be merged, but some may not be able to be merged after being checked
|
||||||
|
several times, which results in unprofitable memory consumption.
|
||||||
|
|
||||||
|
1) How to determine whether KSM saves memory or consumes memory in system-wide
|
||||||
|
range? Here is a simple approximate calculation for reference::
|
||||||
|
|
||||||
|
general_profit =~ pages_sharing * sizeof(page) - (all_rmap_items) *
|
||||||
|
sizeof(rmap_item);
|
||||||
|
|
||||||
|
where all_rmap_items can be easily obtained by summing ``pages_sharing``,
|
||||||
|
``pages_shared``, ``pages_unshared`` and ``pages_volatile``.
|
||||||
|
|
||||||
|
2) The KSM profit within a single process can be similarly obtained by the
|
||||||
|
following approximate calculation::
|
||||||
|
|
||||||
|
process_profit =~ ksm_merging_pages * sizeof(page) -
|
||||||
|
ksm_rmap_items * sizeof(rmap_item).
|
||||||
|
|
||||||
|
where ksm_merging_pages is shown under the directory ``/proc/<pid>/``,
|
||||||
|
and ksm_rmap_items is shown in ``/proc/<pid>/ksm_stat``.
|
||||||
|
|
||||||
|
From the perspective of application, a high ratio of ``ksm_rmap_items`` to
|
||||||
|
``ksm_merging_pages`` means a bad madvise-applied policy, so developers or
|
||||||
|
administrators have to rethink how to change madvise policy. Giving an example
|
||||||
|
for reference, a page's size is usually 4K, and the rmap_item's size is
|
||||||
|
32B on 32-bit CPU architecture and 64B on 64-bit CPU architecture, respectively.
|
||||||
|
So if the ``ksm_rmap_items/ksm_merging_pages`` ratio exceeds 64 on 64-bit CPU
|
||||||
|
or exceeds 128 on 32-bit CPU, then the app's madvise policy should be dropped,
|
||||||
|
because the ksm profit is approximately zero or negative.
|
||||||
|
|
||||||
Monitoring KSM events
|
Monitoring KSM events
|
||||||
=====================
|
=====================
|
||||||
|
|
||||||
|
162
Documentation/admin-guide/mm/multigen_lru.rst
Normal file
162
Documentation/admin-guide/mm/multigen_lru.rst
Normal file
@ -0,0 +1,162 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
|
=============
|
||||||
|
Multi-Gen LRU
|
||||||
|
=============
|
||||||
|
The multi-gen LRU is an alternative LRU implementation that optimizes
|
||||||
|
page reclaim and improves performance under memory pressure. Page
|
||||||
|
reclaim decides the kernel's caching policy and ability to overcommit
|
||||||
|
memory. It directly impacts the kswapd CPU usage and RAM efficiency.
|
||||||
|
|
||||||
|
Quick start
|
||||||
|
===========
|
||||||
|
Build the kernel with the following configurations.
|
||||||
|
|
||||||
|
* ``CONFIG_LRU_GEN=y``
|
||||||
|
* ``CONFIG_LRU_GEN_ENABLED=y``
|
||||||
|
|
||||||
|
All set!
|
||||||
|
|
||||||
|
Runtime options
|
||||||
|
===============
|
||||||
|
``/sys/kernel/mm/lru_gen/`` contains stable ABIs described in the
|
||||||
|
following subsections.
|
||||||
|
|
||||||
|
Kill switch
|
||||||
|
-----------
|
||||||
|
``enabled`` accepts different values to enable or disable the
|
||||||
|
following components. Its default value depends on
|
||||||
|
``CONFIG_LRU_GEN_ENABLED``. All the components should be enabled
|
||||||
|
unless some of them have unforeseen side effects. Writing to
|
||||||
|
``enabled`` has no effect when a component is not supported by the
|
||||||
|
hardware, and valid values will be accepted even when the main switch
|
||||||
|
is off.
|
||||||
|
|
||||||
|
====== ===============================================================
|
||||||
|
Values Components
|
||||||
|
====== ===============================================================
|
||||||
|
0x0001 The main switch for the multi-gen LRU.
|
||||||
|
0x0002 Clearing the accessed bit in leaf page table entries in large
|
||||||
|
batches, when MMU sets it (e.g., on x86). This behavior can
|
||||||
|
theoretically worsen lock contention (mmap_lock). If it is
|
||||||
|
disabled, the multi-gen LRU will suffer a minor performance
|
||||||
|
degradation for workloads that contiguously map hot pages,
|
||||||
|
whose accessed bits can be otherwise cleared by fewer larger
|
||||||
|
batches.
|
||||||
|
0x0004 Clearing the accessed bit in non-leaf page table entries as
|
||||||
|
well, when MMU sets it (e.g., on x86). This behavior was not
|
||||||
|
verified on x86 varieties other than Intel and AMD. If it is
|
||||||
|
disabled, the multi-gen LRU will suffer a negligible
|
||||||
|
performance degradation.
|
||||||
|
[yYnN] Apply to all the components above.
|
||||||
|
====== ===============================================================
|
||||||
|
|
||||||
|
E.g.,
|
||||||
|
::
|
||||||
|
|
||||||
|
echo y >/sys/kernel/mm/lru_gen/enabled
|
||||||
|
cat /sys/kernel/mm/lru_gen/enabled
|
||||||
|
0x0007
|
||||||
|
echo 5 >/sys/kernel/mm/lru_gen/enabled
|
||||||
|
cat /sys/kernel/mm/lru_gen/enabled
|
||||||
|
0x0005
|
||||||
|
|
||||||
|
Thrashing prevention
|
||||||
|
--------------------
|
||||||
|
Personal computers are more sensitive to thrashing because it can
|
||||||
|
cause janks (lags when rendering UI) and negatively impact user
|
||||||
|
experience. The multi-gen LRU offers thrashing prevention to the
|
||||||
|
majority of laptop and desktop users who do not have ``oomd``.
|
||||||
|
|
||||||
|
Users can write ``N`` to ``min_ttl_ms`` to prevent the working set of
|
||||||
|
``N`` milliseconds from getting evicted. The OOM killer is triggered
|
||||||
|
if this working set cannot be kept in memory. In other words, this
|
||||||
|
option works as an adjustable pressure relief valve, and when open, it
|
||||||
|
terminates applications that are hopefully not being used.
|
||||||
|
|
||||||
|
Based on the average human detectable lag (~100ms), ``N=1000`` usually
|
||||||
|
eliminates intolerable janks due to thrashing. Larger values like
|
||||||
|
``N=3000`` make janks less noticeable at the risk of premature OOM
|
||||||
|
kills.
|
||||||
|
|
||||||
|
The default value ``0`` means disabled.
|
||||||
|
|
||||||
|
Experimental features
|
||||||
|
=====================
|
||||||
|
``/sys/kernel/debug/lru_gen`` accepts commands described in the
|
||||||
|
following subsections. Multiple command lines are supported, as is
|
||||||
|
concatenation with delimiters ``,`` and ``;``.
|
||||||
|
|
||||||
|
``/sys/kernel/debug/lru_gen_full`` provides additional stats for
|
||||||
|
debugging. ``CONFIG_LRU_GEN_STATS=y`` keeps historical stats from
|
||||||
|
evicted generations in this file.
|
||||||
|
|
||||||
|
Working set estimation
|
||||||
|
----------------------
|
||||||
|
Working set estimation measures how much memory an application needs
|
||||||
|
in a given time interval, and it is usually done with little impact on
|
||||||
|
the performance of the application. E.g., data centers want to
|
||||||
|
optimize job scheduling (bin packing) to improve memory utilizations.
|
||||||
|
When a new job comes in, the job scheduler needs to find out whether
|
||||||
|
each server it manages can allocate a certain amount of memory for
|
||||||
|
this new job before it can pick a candidate. To do so, the job
|
||||||
|
scheduler needs to estimate the working sets of the existing jobs.
|
||||||
|
|
||||||
|
When it is read, ``lru_gen`` returns a histogram of numbers of pages
|
||||||
|
accessed over different time intervals for each memcg and node.
|
||||||
|
``MAX_NR_GENS`` decides the number of bins for each histogram. The
|
||||||
|
histograms are noncumulative.
|
||||||
|
::
|
||||||
|
|
||||||
|
memcg memcg_id memcg_path
|
||||||
|
node node_id
|
||||||
|
min_gen_nr age_in_ms nr_anon_pages nr_file_pages
|
||||||
|
...
|
||||||
|
max_gen_nr age_in_ms nr_anon_pages nr_file_pages
|
||||||
|
|
||||||
|
Each bin contains an estimated number of pages that have been accessed
|
||||||
|
within ``age_in_ms``. E.g., ``min_gen_nr`` contains the coldest pages
|
||||||
|
and ``max_gen_nr`` contains the hottest pages, since ``age_in_ms`` of
|
||||||
|
the former is the largest and that of the latter is the smallest.
|
||||||
|
|
||||||
|
Users can write the following command to ``lru_gen`` to create a new
|
||||||
|
generation ``max_gen_nr+1``:
|
||||||
|
|
||||||
|
``+ memcg_id node_id max_gen_nr [can_swap [force_scan]]``
|
||||||
|
|
||||||
|
``can_swap`` defaults to the swap setting and, if it is set to ``1``,
|
||||||
|
it forces the scan of anon pages when swap is off, and vice versa.
|
||||||
|
``force_scan`` defaults to ``1`` and, if it is set to ``0``, it
|
||||||
|
employs heuristics to reduce the overhead, which is likely to reduce
|
||||||
|
the coverage as well.
|
||||||
|
|
||||||
|
A typical use case is that a job scheduler runs this command at a
|
||||||
|
certain time interval to create new generations, and it ranks the
|
||||||
|
servers it manages based on the sizes of their cold pages defined by
|
||||||
|
this time interval.
|
||||||
|
|
||||||
|
Proactive reclaim
|
||||||
|
-----------------
|
||||||
|
Proactive reclaim induces page reclaim when there is no memory
|
||||||
|
pressure. It usually targets cold pages only. E.g., when a new job
|
||||||
|
comes in, the job scheduler wants to proactively reclaim cold pages on
|
||||||
|
the server it selected, to improve the chance of successfully landing
|
||||||
|
this new job.
|
||||||
|
|
||||||
|
Users can write the following command to ``lru_gen`` to evict
|
||||||
|
generations less than or equal to ``min_gen_nr``.
|
||||||
|
|
||||||
|
``- memcg_id node_id min_gen_nr [swappiness [nr_to_reclaim]]``
|
||||||
|
|
||||||
|
``min_gen_nr`` should be less than ``max_gen_nr-1``, since
|
||||||
|
``max_gen_nr`` and ``max_gen_nr-1`` are not fully aged (equivalent to
|
||||||
|
the active list) and therefore cannot be evicted. ``swappiness``
|
||||||
|
overrides the default value in ``/proc/sys/vm/swappiness``.
|
||||||
|
``nr_to_reclaim`` limits the number of pages to evict.
|
||||||
|
|
||||||
|
A typical use case is that a job scheduler runs this command before it
|
||||||
|
tries to land a new job on a server. If it fails to materialize enough
|
||||||
|
cold pages because of the overestimation, it retries on the next
|
||||||
|
server according to the ranking result obtained from the working set
|
||||||
|
estimation step. This less forceful approach limits the impacts on the
|
||||||
|
existing jobs.
|
@ -191,7 +191,14 @@ allocation failure to throttle the next allocation attempt::
|
|||||||
|
|
||||||
/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs
|
/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs
|
||||||
|
|
||||||
The khugepaged progress can be seen in the number of pages collapsed::
|
The khugepaged progress can be seen in the number of pages collapsed (note
|
||||||
|
that this counter may not be an exact count of the number of pages
|
||||||
|
collapsed, since "collapsed" could mean multiple things: (1) A PTE mapping
|
||||||
|
being replaced by a PMD mapping, or (2) All 4K physical pages replaced by
|
||||||
|
one 2M hugepage. Each may happen independently, or together, depending on
|
||||||
|
the type of memory and the failures that occur. As such, this value should
|
||||||
|
be interpreted roughly as a sign of progress, and counters in /proc/vmstat
|
||||||
|
consulted for more accurate accounting)::
|
||||||
|
|
||||||
/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
|
/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
|
||||||
|
|
||||||
@ -366,10 +373,9 @@ thp_split_pmd
|
|||||||
page table entry.
|
page table entry.
|
||||||
|
|
||||||
thp_zero_page_alloc
|
thp_zero_page_alloc
|
||||||
is incremented every time a huge zero page is
|
is incremented every time a huge zero page used for thp is
|
||||||
successfully allocated. It includes allocations which where
|
successfully allocated. Note, it doesn't count every map of
|
||||||
dropped due race with other allocation. Note, it doesn't count
|
the huge zero page, only its allocation.
|
||||||
every map of the huge zero page, only its allocation.
|
|
||||||
|
|
||||||
thp_zero_page_alloc_failed
|
thp_zero_page_alloc_failed
|
||||||
is incremented if kernel fails to allocate
|
is incremented if kernel fails to allocate
|
||||||
|
@ -17,7 +17,10 @@ of the ``PROT_NONE+SIGSEGV`` trick.
|
|||||||
Design
|
Design
|
||||||
======
|
======
|
||||||
|
|
||||||
Userfaults are delivered and resolved through the ``userfaultfd`` syscall.
|
Userspace creates a new userfaultfd, initializes it, and registers one or more
|
||||||
|
regions of virtual memory with it. Then, any page faults which occur within the
|
||||||
|
region(s) result in a message being delivered to the userfaultfd, notifying
|
||||||
|
userspace of the fault.
|
||||||
|
|
||||||
The ``userfaultfd`` (aside from registering and unregistering virtual
|
The ``userfaultfd`` (aside from registering and unregistering virtual
|
||||||
memory ranges) provides two primary functionalities:
|
memory ranges) provides two primary functionalities:
|
||||||
@ -34,12 +37,11 @@ The real advantage of userfaults if compared to regular virtual memory
|
|||||||
management of mremap/mprotect is that the userfaults in all their
|
management of mremap/mprotect is that the userfaults in all their
|
||||||
operations never involve heavyweight structures like vmas (in fact the
|
operations never involve heavyweight structures like vmas (in fact the
|
||||||
``userfaultfd`` runtime load never takes the mmap_lock for writing).
|
``userfaultfd`` runtime load never takes the mmap_lock for writing).
|
||||||
|
|
||||||
Vmas are not suitable for page- (or hugepage) granular fault tracking
|
Vmas are not suitable for page- (or hugepage) granular fault tracking
|
||||||
when dealing with virtual address spaces that could span
|
when dealing with virtual address spaces that could span
|
||||||
Terabytes. Too many vmas would be needed for that.
|
Terabytes. Too many vmas would be needed for that.
|
||||||
|
|
||||||
The ``userfaultfd`` once opened by invoking the syscall, can also be
|
The ``userfaultfd``, once created, can also be
|
||||||
passed using unix domain sockets to a manager process, so the same
|
passed using unix domain sockets to a manager process, so the same
|
||||||
manager process could handle the userfaults of a multitude of
|
manager process could handle the userfaults of a multitude of
|
||||||
different processes without them being aware about what is going on
|
different processes without them being aware about what is going on
|
||||||
@ -50,6 +52,39 @@ is a corner case that would currently return ``-EBUSY``).
|
|||||||
API
|
API
|
||||||
===
|
===
|
||||||
|
|
||||||
|
Creating a userfaultfd
|
||||||
|
----------------------
|
||||||
|
|
||||||
|
There are two ways to create a new userfaultfd, each of which provides ways to
|
||||||
|
restrict access to this functionality (since historically userfaultfds which
|
||||||
|
handle kernel page faults have been a useful tool for exploiting the kernel).
|
||||||
|
|
||||||
|
The first way, supported since userfaultfd was introduced, is the
|
||||||
|
userfaultfd(2) syscall. Access to this is controlled in several ways:
|
||||||
|
|
||||||
|
- Any user can always create a userfaultfd which traps userspace page faults
|
||||||
|
only. Such a userfaultfd can be created using the userfaultfd(2) syscall
|
||||||
|
with the flag UFFD_USER_MODE_ONLY.
|
||||||
|
|
||||||
|
- In order to also trap kernel page faults for the address space, either the
|
||||||
|
process needs the CAP_SYS_PTRACE capability, or the system must have
|
||||||
|
vm.unprivileged_userfaultfd set to 1. By default, vm.unprivileged_userfaultfd
|
||||||
|
is set to 0.
|
||||||
|
|
||||||
|
The second way, added to the kernel more recently, is by opening
|
||||||
|
/dev/userfaultfd and issuing a USERFAULTFD_IOC_NEW ioctl to it. This method
|
||||||
|
yields equivalent userfaultfds to the userfaultfd(2) syscall.
|
||||||
|
|
||||||
|
Unlike userfaultfd(2), access to /dev/userfaultfd is controlled via normal
|
||||||
|
filesystem permissions (user/group/mode), which gives fine grained access to
|
||||||
|
userfaultfd specifically, without also granting other unrelated privileges at
|
||||||
|
the same time (as e.g. granting CAP_SYS_PTRACE would do). Users who have access
|
||||||
|
to /dev/userfaultfd can always create userfaultfds that trap kernel page faults;
|
||||||
|
vm.unprivileged_userfaultfd is not considered.
|
||||||
|
|
||||||
|
Initializing a userfaultfd
|
||||||
|
--------------------------
|
||||||
|
|
||||||
When first opened the ``userfaultfd`` must be enabled invoking the
|
When first opened the ``userfaultfd`` must be enabled invoking the
|
||||||
``UFFDIO_API`` ioctl specifying a ``uffdio_api.api`` value set to ``UFFD_API`` (or
|
``UFFDIO_API`` ioctl specifying a ``uffdio_api.api`` value set to ``UFFD_API`` (or
|
||||||
a later API version) which will specify the ``read/POLLIN`` protocol
|
a later API version) which will specify the ``read/POLLIN`` protocol
|
||||||
|
@ -635,6 +635,17 @@ different types of memory (represented as different NUMA nodes) to
|
|||||||
place the hot pages in the fast memory. This is implemented based on
|
place the hot pages in the fast memory. This is implemented based on
|
||||||
unmapping and page fault too.
|
unmapping and page fault too.
|
||||||
|
|
||||||
|
numa_balancing_promote_rate_limit_MBps
|
||||||
|
======================================
|
||||||
|
|
||||||
|
Too high promotion/demotion throughput between different memory types
|
||||||
|
may hurt application latency. This can be used to rate limit the
|
||||||
|
promotion throughput. The per-node max promotion throughput in MB/s
|
||||||
|
will be limited to be no more than the set value.
|
||||||
|
|
||||||
|
A rule of thumb is to set this to less than 1/10 of the PMEM node
|
||||||
|
write bandwidth.
|
||||||
|
|
||||||
oops_all_cpu_backtrace
|
oops_all_cpu_backtrace
|
||||||
======================
|
======================
|
||||||
|
|
||||||
|
@ -926,6 +926,9 @@ calls without any restrictions.
|
|||||||
|
|
||||||
The default value is 0.
|
The default value is 0.
|
||||||
|
|
||||||
|
Another way to control permissions for userfaultfd is to use
|
||||||
|
/dev/userfaultfd instead of userfaultfd(2). See
|
||||||
|
Documentation/admin-guide/mm/userfaultfd.rst.
|
||||||
|
|
||||||
user_reserve_kbytes
|
user_reserve_kbytes
|
||||||
===================
|
===================
|
||||||
|
@ -37,6 +37,7 @@ Library functionality that is used throughout the kernel.
|
|||||||
kref
|
kref
|
||||||
assoc_array
|
assoc_array
|
||||||
xarray
|
xarray
|
||||||
|
maple_tree
|
||||||
idr
|
idr
|
||||||
circular-buffers
|
circular-buffers
|
||||||
rbtree
|
rbtree
|
||||||
|
217
Documentation/core-api/maple_tree.rst
Normal file
217
Documentation/core-api/maple_tree.rst
Normal file
@ -0,0 +1,217 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0+
|
||||||
|
|
||||||
|
|
||||||
|
==========
|
||||||
|
Maple Tree
|
||||||
|
==========
|
||||||
|
|
||||||
|
:Author: Liam R. Howlett
|
||||||
|
|
||||||
|
Overview
|
||||||
|
========
|
||||||
|
|
||||||
|
The Maple Tree is a B-Tree data type which is optimized for storing
|
||||||
|
non-overlapping ranges, including ranges of size 1. The tree was designed to
|
||||||
|
be simple to use and does not require a user written search method. It
|
||||||
|
supports iterating over a range of entries and going to the previous or next
|
||||||
|
entry in a cache-efficient manner. The tree can also be put into an RCU-safe
|
||||||
|
mode of operation which allows reading and writing concurrently. Writers must
|
||||||
|
synchronize on a lock, which can be the default spinlock, or the user can set
|
||||||
|
the lock to an external lock of a different type.
|
||||||
|
|
||||||
|
The Maple Tree maintains a small memory footprint and was designed to use
|
||||||
|
modern processor cache efficiently. The majority of the users will be able to
|
||||||
|
use the normal API. An :ref:`maple-tree-advanced-api` exists for more complex
|
||||||
|
scenarios. The most important usage of the Maple Tree is the tracking of the
|
||||||
|
virtual memory areas.
|
||||||
|
|
||||||
|
The Maple Tree can store values between ``0`` and ``ULONG_MAX``. The Maple
|
||||||
|
Tree reserves values with the bottom two bits set to '10' which are below 4096
|
||||||
|
(i.e. 2, 6, 10 .. 4094) for internal use. If the entries may use reserved
|
||||||
|
entries then the users can convert the entries using xa_mk_value() and convert
|
||||||
|
them back by calling xa_to_value(). If the user needs to use a reserved
|
||||||
|
value, then the user can convert the value when using the
|
||||||
|
:ref:`maple-tree-advanced-api`, but such values are blocked by the normal API.
|
||||||
|
|
||||||
|
The Maple Tree can also be configured to support searching for a gap of a given
|
||||||
|
size (or larger).
|
||||||
|
|
||||||
|
Pre-allocating of nodes is also supported using the
|
||||||
|
:ref:`maple-tree-advanced-api`. This is useful for users who must guarantee a
|
||||||
|
successful store operation within a given
|
||||||
|
code segment when allocating cannot be done. Allocations of nodes are
|
||||||
|
relatively small at around 256 bytes.
|
||||||
|
|
||||||
|
.. _maple-tree-normal-api:
|
||||||
|
|
||||||
|
Normal API
|
||||||
|
==========
|
||||||
|
|
||||||
|
Start by initialising a maple tree, either with DEFINE_MTREE() for statically
|
||||||
|
allocated maple trees or mt_init() for dynamically allocated ones. A
|
||||||
|
freshly-initialised maple tree contains a ``NULL`` pointer for the range ``0``
|
||||||
|
- ``ULONG_MAX``. There are currently two types of maple trees supported: the
|
||||||
|
allocation tree and the regular tree. The regular tree has a higher branching
|
||||||
|
factor for internal nodes. The allocation tree has a lower branching factor
|
||||||
|
but allows the user to search for a gap of a given size or larger from either
|
||||||
|
``0`` upwards or ``ULONG_MAX`` down. An allocation tree can be used by
|
||||||
|
passing in the ``MT_FLAGS_ALLOC_RANGE`` flag when initialising the tree.
|
||||||
|
|
||||||
|
You can then set entries using mtree_store() or mtree_store_range().
|
||||||
|
mtree_store() will overwrite any entry with the new entry and return 0 on
|
||||||
|
success or an error code otherwise. mtree_store_range() works in the same way
|
||||||
|
but takes a range. mtree_load() is used to retrieve the entry stored at a
|
||||||
|
given index. You can use mtree_erase() to erase an entire range by only
|
||||||
|
knowing one value within that range, or an mtree_store() call with an entry of
|
||||||
|
NULL may be used to partially erase a range or many ranges at once.
|
||||||
|
|
||||||
|
If you want to only store a new entry to a range (or index) if that range is
|
||||||
|
currently ``NULL``, you can use mtree_insert_range() or mtree_insert() which
|
||||||
|
return -EEXIST if the range is not empty.
|
||||||
|
|
||||||
|
You can search for an entry from an index upwards by using mt_find().
|
||||||
|
|
||||||
|
You can walk each entry within a range by calling mt_for_each(). You must
|
||||||
|
provide a temporary variable to store a cursor. If you want to walk each
|
||||||
|
element of the tree then ``0`` and ``ULONG_MAX`` may be used as the range. If
|
||||||
|
the caller is going to hold the lock for the duration of the walk then it is
|
||||||
|
worth looking at the mas_for_each() API in the :ref:`maple-tree-advanced-api`
|
||||||
|
section.
|
||||||
|
|
||||||
|
Sometimes it is necessary to ensure the next call to store to a maple tree does
|
||||||
|
not allocate memory, please see :ref:`maple-tree-advanced-api` for this use case.
|
||||||
|
|
||||||
|
Finally, you can remove all entries from a maple tree by calling
|
||||||
|
mtree_destroy(). If the maple tree entries are pointers, you may wish to free
|
||||||
|
the entries first.
|
||||||
|
|
||||||
|
Allocating Nodes
|
||||||
|
----------------
|
||||||
|
|
||||||
|
The allocations are handled by the internal tree code. See
|
||||||
|
:ref:`maple-tree-advanced-alloc` for other options.
|
||||||
|
|
||||||
|
Locking
|
||||||
|
-------
|
||||||
|
|
||||||
|
You do not have to worry about locking. See :ref:`maple-tree-advanced-locks`
|
||||||
|
for other options.
|
||||||
|
|
||||||
|
The Maple Tree uses RCU and an internal spinlock to synchronise access:
|
||||||
|
|
||||||
|
Takes RCU read lock:
|
||||||
|
* mtree_load()
|
||||||
|
* mt_find()
|
||||||
|
* mt_for_each()
|
||||||
|
* mt_next()
|
||||||
|
* mt_prev()
|
||||||
|
|
||||||
|
Takes ma_lock internally:
|
||||||
|
* mtree_store()
|
||||||
|
* mtree_store_range()
|
||||||
|
* mtree_insert()
|
||||||
|
* mtree_insert_range()
|
||||||
|
* mtree_erase()
|
||||||
|
* mtree_destroy()
|
||||||
|
* mt_set_in_rcu()
|
||||||
|
* mt_clear_in_rcu()
|
||||||
|
|
||||||
|
If you want to take advantage of the internal lock to protect the data
|
||||||
|
structures that you are storing in the Maple Tree, you can call mtree_lock()
|
||||||
|
before calling mtree_load(), then take a reference count on the object you
|
||||||
|
have found before calling mtree_unlock(). This will prevent stores from
|
||||||
|
removing the object from the tree between looking up the object and
|
||||||
|
incrementing the refcount. You can also use RCU to avoid dereferencing
|
||||||
|
freed memory, but an explanation of that is beyond the scope of this
|
||||||
|
document.
|
||||||
|
|
||||||
|
.. _maple-tree-advanced-api:
|
||||||
|
|
||||||
|
Advanced API
|
||||||
|
============
|
||||||
|
|
||||||
|
The advanced API offers more flexibility and better performance at the
|
||||||
|
cost of an interface which can be harder to use and has fewer safeguards.
|
||||||
|
You must take care of your own locking while using the advanced API.
|
||||||
|
You can use the ma_lock, RCU or an external lock for protection.
|
||||||
|
You can mix advanced and normal operations on the same tree, as long
|
||||||
|
as the locking is compatible. The :ref:`maple-tree-normal-api` is implemented
|
||||||
|
in terms of the advanced API.
|
||||||
|
|
||||||
|
The advanced API is based around the ma_state, this is where the 'mas'
|
||||||
|
prefix originates. The ma_state struct keeps track of tree operations to make
|
||||||
|
life easier for both internal and external tree users.
|
||||||
|
|
||||||
|
Initialising the maple tree is the same as in the :ref:`maple-tree-normal-api`.
|
||||||
|
Please see above.
|
||||||
|
|
||||||
|
The maple state keeps track of the range start and end in mas->index and
|
||||||
|
mas->last, respectively.
|
||||||
|
|
||||||
|
mas_walk() will walk the tree to the location of mas->index and set the
|
||||||
|
mas->index and mas->last according to the range for the entry.
|
||||||
|
|
||||||
|
You can set entries using mas_store(). mas_store() will overwrite any entry
|
||||||
|
with the new entry and return the first existing entry that is overwritten.
|
||||||
|
The range is passed in as members of the maple state: index and last.
|
||||||
|
|
||||||
|
You can use mas_erase() to erase an entire range by setting index and
|
||||||
|
last of the maple state to the desired range to erase. This will erase
|
||||||
|
the first range that is found in that range, set the maple state index
|
||||||
|
and last as the range that was erased and return the entry that existed
|
||||||
|
at that location.
|
||||||
|
|
||||||
|
You can walk each entry within a range by using mas_for_each(). If you want
|
||||||
|
to walk each element of the tree then ``0`` and ``ULONG_MAX`` may be used as
|
||||||
|
the range. If the lock needs to be periodically dropped, see the locking
|
||||||
|
section mas_pause().
|
||||||
|
|
||||||
|
Using a maple state allows mas_next() and mas_prev() to function as if the
|
||||||
|
tree was a linked list. With such a high branching factor the amortized
|
||||||
|
performance penalty is outweighed by cache optimization. mas_next() will
|
||||||
|
return the next entry which occurs after the entry at index. mas_prev()
|
||||||
|
will return the previous entry which occurs before the entry at index.
|
||||||
|
|
||||||
|
mas_find() will find the first entry which exists at or above index on
|
||||||
|
the first call, and the next entry on every subsequent call.
|
||||||
|
|
||||||
|
mas_find_rev() will find the first entry which exists at or below the last on
|
||||||
|
the first call, and the previous entry on every subsequent call.
|
||||||
|
|
||||||
|
If the user needs to yield the lock during an operation, then the maple state
|
||||||
|
must be paused using mas_pause().
|
||||||
|
|
||||||
|
There are a few extra interfaces provided when using an allocation tree.
|
||||||
|
If you wish to search for a gap within a range, then mas_empty_area()
|
||||||
|
or mas_empty_area_rev() can be used. mas_empty_area() searches for a gap
|
||||||
|
starting at the lowest index given up to the maximum of the range.
|
||||||
|
mas_empty_area_rev() searches for a gap starting at the highest index given
|
||||||
|
and continues downward to the lower bound of the range.
|
||||||
|
|
||||||
|
.. _maple-tree-advanced-alloc:
|
||||||
|
|
||||||
|
Advanced Allocating Nodes
|
||||||
|
-------------------------
|
||||||
|
|
||||||
|
Allocations are usually handled internally to the tree, however if allocations
|
||||||
|
need to occur before a write occurs then calling mas_expected_entries() will
|
||||||
|
allocate the worst-case number of needed nodes to insert the provided number of
|
||||||
|
ranges. This also causes the tree to enter mass insertion mode. Once
|
||||||
|
insertions are complete calling mas_destroy() on the maple state will free the
|
||||||
|
unused allocations.
|
||||||
|
|
||||||
|
.. _maple-tree-advanced-locks:
|
||||||
|
|
||||||
|
Advanced Locking
|
||||||
|
----------------
|
||||||
|
|
||||||
|
The maple tree uses a spinlock by default, but external locks can be used for
|
||||||
|
tree updates as well. To use an external lock, the tree must be initialized
|
||||||
|
with the ``MT_FLAGS_LOCK_EXTERN`` flag, this is usually done with the
|
||||||
|
MTREE_INIT_EXT() #define, which takes an external lock as an argument.
|
||||||
|
|
||||||
|
Functions and structures
|
||||||
|
========================
|
||||||
|
|
||||||
|
.. kernel-doc:: include/linux/maple_tree.h
|
||||||
|
.. kernel-doc:: lib/maple_tree.c
|
@ -19,9 +19,6 @@ User Space Memory Access
|
|||||||
Memory Allocation Controls
|
Memory Allocation Controls
|
||||||
==========================
|
==========================
|
||||||
|
|
||||||
.. kernel-doc:: include/linux/gfp.h
|
|
||||||
:internal:
|
|
||||||
|
|
||||||
.. kernel-doc:: include/linux/gfp_types.h
|
.. kernel-doc:: include/linux/gfp_types.h
|
||||||
:doc: Page mobility and placement hints
|
:doc: Page mobility and placement hints
|
||||||
|
|
||||||
|
@ -24,6 +24,7 @@ Documentation/dev-tools/testing-overview.rst
|
|||||||
kcov
|
kcov
|
||||||
gcov
|
gcov
|
||||||
kasan
|
kasan
|
||||||
|
kmsan
|
||||||
ubsan
|
ubsan
|
||||||
kmemleak
|
kmemleak
|
||||||
kcsan
|
kcsan
|
||||||
|
@ -111,9 +111,17 @@ parameter can be used to control panic and reporting behaviour:
|
|||||||
report or also panic the kernel (default: ``report``). The panic happens even
|
report or also panic the kernel (default: ``report``). The panic happens even
|
||||||
if ``kasan_multi_shot`` is enabled.
|
if ``kasan_multi_shot`` is enabled.
|
||||||
|
|
||||||
Hardware Tag-Based KASAN mode (see the section about various modes below) is
|
Software and Hardware Tag-Based KASAN modes (see the section about various
|
||||||
intended for use in production as a security mitigation. Therefore, it supports
|
modes below) support altering stack trace collection behavior:
|
||||||
additional boot parameters that allow disabling KASAN or controlling features:
|
|
||||||
|
- ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack
|
||||||
|
traces collection (default: ``on``).
|
||||||
|
- ``kasan.stack_ring_size=<number of entries>`` specifies the number of entries
|
||||||
|
in the stack ring (default: ``32768``).
|
||||||
|
|
||||||
|
Hardware Tag-Based KASAN mode is intended for use in production as a security
|
||||||
|
mitigation. Therefore, it supports additional boot parameters that allow
|
||||||
|
disabling KASAN altogether or controlling its features:
|
||||||
|
|
||||||
- ``kasan=off`` or ``=on`` controls whether KASAN is enabled (default: ``on``).
|
- ``kasan=off`` or ``=on`` controls whether KASAN is enabled (default: ``on``).
|
||||||
|
|
||||||
@ -132,9 +140,6 @@ additional boot parameters that allow disabling KASAN or controlling features:
|
|||||||
- ``kasan.vmalloc=off`` or ``=on`` disables or enables tagging of vmalloc
|
- ``kasan.vmalloc=off`` or ``=on`` disables or enables tagging of vmalloc
|
||||||
allocations (default: ``on``).
|
allocations (default: ``on``).
|
||||||
|
|
||||||
- ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack
|
|
||||||
traces collection (default: ``on``).
|
|
||||||
|
|
||||||
Error reports
|
Error reports
|
||||||
~~~~~~~~~~~~~
|
~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
427
Documentation/dev-tools/kmsan.rst
Normal file
427
Documentation/dev-tools/kmsan.rst
Normal file
@ -0,0 +1,427 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
.. Copyright (C) 2022, Google LLC.
|
||||||
|
|
||||||
|
===================================
|
||||||
|
The Kernel Memory Sanitizer (KMSAN)
|
||||||
|
===================================
|
||||||
|
|
||||||
|
KMSAN is a dynamic error detector aimed at finding uses of uninitialized
|
||||||
|
values. It is based on compiler instrumentation, and is quite similar to the
|
||||||
|
userspace `MemorySanitizer tool`_.
|
||||||
|
|
||||||
|
An important note is that KMSAN is not intended for production use, because it
|
||||||
|
drastically increases kernel memory footprint and slows the whole system down.
|
||||||
|
|
||||||
|
Usage
|
||||||
|
=====
|
||||||
|
|
||||||
|
Building the kernel
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
In order to build a kernel with KMSAN you will need a fresh Clang (14.0.6+).
|
||||||
|
Please refer to `LLVM documentation`_ for the instructions on how to build Clang.
|
||||||
|
|
||||||
|
Now configure and build the kernel with CONFIG_KMSAN enabled.
|
||||||
|
|
||||||
|
Example report
|
||||||
|
--------------
|
||||||
|
|
||||||
|
Here is an example of a KMSAN report::
|
||||||
|
|
||||||
|
=====================================================
|
||||||
|
BUG: KMSAN: uninit-value in test_uninit_kmsan_check_memory+0x1be/0x380 [kmsan_test]
|
||||||
|
test_uninit_kmsan_check_memory+0x1be/0x380 mm/kmsan/kmsan_test.c:273
|
||||||
|
kunit_run_case_internal lib/kunit/test.c:333
|
||||||
|
kunit_try_run_case+0x206/0x420 lib/kunit/test.c:374
|
||||||
|
kunit_generic_run_threadfn_adapter+0x6d/0xc0 lib/kunit/try-catch.c:28
|
||||||
|
kthread+0x721/0x850 kernel/kthread.c:327
|
||||||
|
ret_from_fork+0x1f/0x30 ??:?
|
||||||
|
|
||||||
|
Uninit was stored to memory at:
|
||||||
|
do_uninit_local_array+0xfa/0x110 mm/kmsan/kmsan_test.c:260
|
||||||
|
test_uninit_kmsan_check_memory+0x1a2/0x380 mm/kmsan/kmsan_test.c:271
|
||||||
|
kunit_run_case_internal lib/kunit/test.c:333
|
||||||
|
kunit_try_run_case+0x206/0x420 lib/kunit/test.c:374
|
||||||
|
kunit_generic_run_threadfn_adapter+0x6d/0xc0 lib/kunit/try-catch.c:28
|
||||||
|
kthread+0x721/0x850 kernel/kthread.c:327
|
||||||
|
ret_from_fork+0x1f/0x30 ??:?
|
||||||
|
|
||||||
|
Local variable uninit created at:
|
||||||
|
do_uninit_local_array+0x4a/0x110 mm/kmsan/kmsan_test.c:256
|
||||||
|
test_uninit_kmsan_check_memory+0x1a2/0x380 mm/kmsan/kmsan_test.c:271
|
||||||
|
|
||||||
|
Bytes 4-7 of 8 are uninitialized
|
||||||
|
Memory access of size 8 starts at ffff888083fe3da0
|
||||||
|
|
||||||
|
CPU: 0 PID: 6731 Comm: kunit_try_catch Tainted: G B E 5.16.0-rc3+ #104
|
||||||
|
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
|
||||||
|
=====================================================
|
||||||
|
|
||||||
|
The report says that the local variable ``uninit`` was created uninitialized in
|
||||||
|
``do_uninit_local_array()``. The third stack trace corresponds to the place
|
||||||
|
where this variable was created.
|
||||||
|
|
||||||
|
The first stack trace shows where the uninit value was used (in
|
||||||
|
``test_uninit_kmsan_check_memory()``). The tool shows the bytes which were left
|
||||||
|
uninitialized in the local variable, as well as the stack where the value was
|
||||||
|
copied to another memory location before use.
|
||||||
|
|
||||||
|
A use of uninitialized value ``v`` is reported by KMSAN in the following cases:
|
||||||
|
- in a condition, e.g. ``if (v) { ... }``;
|
||||||
|
- in an indexing or pointer dereferencing, e.g. ``array[v]`` or ``*v``;
|
||||||
|
- when it is copied to userspace or hardware, e.g. ``copy_to_user(..., &v, ...)``;
|
||||||
|
- when it is passed as an argument to a function, and
|
||||||
|
``CONFIG_KMSAN_CHECK_PARAM_RETVAL`` is enabled (see below).
|
||||||
|
|
||||||
|
The mentioned cases (apart from copying data to userspace or hardware, which is
|
||||||
|
a security issue) are considered undefined behavior from the C11 Standard point
|
||||||
|
of view.
|
||||||
|
|
||||||
|
Disabling the instrumentation
|
||||||
|
-----------------------------
|
||||||
|
|
||||||
|
A function can be marked with ``__no_kmsan_checks``. Doing so makes KMSAN
|
||||||
|
ignore uninitialized values in that function and mark its output as initialized.
|
||||||
|
As a result, the user will not get KMSAN reports related to that function.
|
||||||
|
|
||||||
|
Another function attribute supported by KMSAN is ``__no_sanitize_memory``.
|
||||||
|
Applying this attribute to a function will result in KMSAN not instrumenting
|
||||||
|
it, which can be helpful if we do not want the compiler to interfere with some
|
||||||
|
low-level code (e.g. that marked with ``noinstr`` which implicitly adds
|
||||||
|
``__no_sanitize_memory``).
|
||||||
|
|
||||||
|
This however comes at a cost: stack allocations from such functions will have
|
||||||
|
incorrect shadow/origin values, likely leading to false positives. Functions
|
||||||
|
called from non-instrumented code may also receive incorrect metadata for their
|
||||||
|
parameters.
|
||||||
|
|
||||||
|
As a rule of thumb, avoid using ``__no_sanitize_memory`` explicitly.
|
||||||
|
|
||||||
|
It is also possible to disable KMSAN for a single file (e.g. main.o)::
|
||||||
|
|
||||||
|
KMSAN_SANITIZE_main.o := n
|
||||||
|
|
||||||
|
or for the whole directory::
|
||||||
|
|
||||||
|
KMSAN_SANITIZE := n
|
||||||
|
|
||||||
|
in the Makefile. Think of this as applying ``__no_sanitize_memory`` to every
|
||||||
|
function in the file or directory. Most users won't need KMSAN_SANITIZE, unless
|
||||||
|
their code gets broken by KMSAN (e.g. runs at early boot time).
|
||||||
|
|
||||||
|
Support
|
||||||
|
=======
|
||||||
|
|
||||||
|
In order for KMSAN to work the kernel must be built with Clang, which so far is
|
||||||
|
the only compiler that has KMSAN support. The kernel instrumentation pass is
|
||||||
|
based on the userspace `MemorySanitizer tool`_.
|
||||||
|
|
||||||
|
The runtime library only supports x86_64 at the moment.
|
||||||
|
|
||||||
|
How KMSAN works
|
||||||
|
===============
|
||||||
|
|
||||||
|
KMSAN shadow memory
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
KMSAN associates a metadata byte (also called shadow byte) with every byte of
|
||||||
|
kernel memory. A bit in the shadow byte is set iff the corresponding bit of the
|
||||||
|
kernel memory byte is uninitialized. Marking the memory uninitialized (i.e.
|
||||||
|
setting its shadow bytes to ``0xff``) is called poisoning, marking it
|
||||||
|
initialized (setting the shadow bytes to ``0x00``) is called unpoisoning.
|
||||||
|
|
||||||
|
When a new variable is allocated on the stack, it is poisoned by default by
|
||||||
|
instrumentation code inserted by the compiler (unless it is a stack variable
|
||||||
|
that is immediately initialized). Any new heap allocation done without
|
||||||
|
``__GFP_ZERO`` is also poisoned.
|
||||||
|
|
||||||
|
Compiler instrumentation also tracks the shadow values as they are used along
|
||||||
|
the code. When needed, instrumentation code invokes the runtime library in
|
||||||
|
``mm/kmsan/`` to persist shadow values.
|
||||||
|
|
||||||
|
The shadow value of a basic or compound type is an array of bytes of the same
|
||||||
|
length. When a constant value is written into memory, that memory is unpoisoned.
|
||||||
|
When a value is read from memory, its shadow memory is also obtained and
|
||||||
|
propagated into all the operations which use that value. For every instruction
|
||||||
|
that takes one or more values the compiler generates code that calculates the
|
||||||
|
shadow of the result depending on those values and their shadows.
|
||||||
|
|
||||||
|
Example::
|
||||||
|
|
||||||
|
int a = 0xff; // i.e. 0x000000ff
|
||||||
|
int b;
|
||||||
|
int c = a | b;
|
||||||
|
|
||||||
|
In this case the shadow of ``a`` is ``0``, shadow of ``b`` is ``0xffffffff``,
|
||||||
|
shadow of ``c`` is ``0xffffff00``. This means that the upper three bytes of
|
||||||
|
``c`` are uninitialized, while the lower byte is initialized.
|
||||||
|
|
||||||
|
Origin tracking
|
||||||
|
---------------
|
||||||
|
|
||||||
|
Every four bytes of kernel memory also have a so-called origin mapped to them.
|
||||||
|
This origin describes the point in program execution at which the uninitialized
|
||||||
|
value was created. Every origin is associated with either the full allocation
|
||||||
|
stack (for heap-allocated memory), or the function containing the uninitialized
|
||||||
|
variable (for locals).
|
||||||
|
|
||||||
|
When an uninitialized variable is allocated on stack or heap, a new origin
|
||||||
|
value is created, and that variable's origin is filled with that value. When a
|
||||||
|
value is read from memory, its origin is also read and kept together with the
|
||||||
|
shadow. For every instruction that takes one or more values, the origin of the
|
||||||
|
result is one of the origins corresponding to any of the uninitialized inputs.
|
||||||
|
If a poisoned value is written into memory, its origin is written to the
|
||||||
|
corresponding storage as well.
|
||||||
|
|
||||||
|
Example 1::
|
||||||
|
|
||||||
|
int a = 42;
|
||||||
|
int b;
|
||||||
|
int c = a + b;
|
||||||
|
|
||||||
|
In this case the origin of ``b`` is generated upon function entry, and is
|
||||||
|
stored to the origin of ``c`` right before the addition result is written into
|
||||||
|
memory.
|
||||||
|
|
||||||
|
Several variables may share the same origin address, if they are stored in the
|
||||||
|
same four-byte chunk. In this case every write to either variable updates the
|
||||||
|
origin for all of them. We have to sacrifice precision in this case, because
|
||||||
|
storing origins for individual bits (and even bytes) would be too costly.
|
||||||
|
|
||||||
|
Example 2::
|
||||||
|
|
||||||
|
int combine(short a, short b) {
|
||||||
|
union ret_t {
|
||||||
|
int i;
|
||||||
|
short s[2];
|
||||||
|
} ret;
|
||||||
|
ret.s[0] = a;
|
||||||
|
ret.s[1] = b;
|
||||||
|
return ret.i;
|
||||||
|
}
|
||||||
|
|
||||||
|
If ``a`` is initialized and ``b`` is not, the shadow of the result would be
|
||||||
|
0xffff0000, and the origin of the result would be the origin of ``b``.
|
||||||
|
``ret.s[0]`` would have the same origin, but it will never be used, because
|
||||||
|
that variable is initialized.
|
||||||
|
|
||||||
|
If both function arguments are uninitialized, only the origin of the second
|
||||||
|
argument is preserved.
|
||||||
|
|
||||||
|
Origin chaining
|
||||||
|
~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
To ease debugging, KMSAN creates a new origin for every store of an
|
||||||
|
uninitialized value to memory. The new origin references both its creation stack
|
||||||
|
and the previous origin the value had. This may cause increased memory
|
||||||
|
consumption, so we limit the length of origin chains in the runtime.
|
||||||
|
|
||||||
|
Clang instrumentation API
|
||||||
|
-------------------------
|
||||||
|
|
||||||
|
Clang instrumentation pass inserts calls to functions defined in
|
||||||
|
``mm/kmsan/instrumentation.c`` into the kernel code.
|
||||||
|
|
||||||
|
Shadow manipulation
|
||||||
|
~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
For every memory access the compiler emits a call to a function that returns a
|
||||||
|
pair of pointers to the shadow and origin addresses of the given memory::
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
void *shadow, *origin;
|
||||||
|
} shadow_origin_ptr_t
|
||||||
|
|
||||||
|
shadow_origin_ptr_t __msan_metadata_ptr_for_load_{1,2,4,8}(void *addr)
|
||||||
|
shadow_origin_ptr_t __msan_metadata_ptr_for_store_{1,2,4,8}(void *addr)
|
||||||
|
shadow_origin_ptr_t __msan_metadata_ptr_for_load_n(void *addr, uintptr_t size)
|
||||||
|
shadow_origin_ptr_t __msan_metadata_ptr_for_store_n(void *addr, uintptr_t size)
|
||||||
|
|
||||||
|
The function name depends on the memory access size.
|
||||||
|
|
||||||
|
The compiler makes sure that for every loaded value its shadow and origin
|
||||||
|
values are read from memory. When a value is stored to memory, its shadow and
|
||||||
|
origin are also stored using the metadata pointers.
|
||||||
|
|
||||||
|
Handling locals
|
||||||
|
~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
A special function is used to create a new origin value for a local variable and
|
||||||
|
set the origin of that variable to that value::
|
||||||
|
|
||||||
|
void __msan_poison_alloca(void *addr, uintptr_t size, char *descr)
|
||||||
|
|
||||||
|
Access to per-task data
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
At the beginning of every instrumented function KMSAN inserts a call to
|
||||||
|
``__msan_get_context_state()``::
|
||||||
|
|
||||||
|
kmsan_context_state *__msan_get_context_state(void)
|
||||||
|
|
||||||
|
``kmsan_context_state`` is declared in ``include/linux/kmsan.h``::
|
||||||
|
|
||||||
|
struct kmsan_context_state {
|
||||||
|
char param_tls[KMSAN_PARAM_SIZE];
|
||||||
|
char retval_tls[KMSAN_RETVAL_SIZE];
|
||||||
|
char va_arg_tls[KMSAN_PARAM_SIZE];
|
||||||
|
char va_arg_origin_tls[KMSAN_PARAM_SIZE];
|
||||||
|
u64 va_arg_overflow_size_tls;
|
||||||
|
char param_origin_tls[KMSAN_PARAM_SIZE];
|
||||||
|
depot_stack_handle_t retval_origin_tls;
|
||||||
|
};
|
||||||
|
|
||||||
|
This structure is used by KMSAN to pass parameter shadows and origins between
|
||||||
|
instrumented functions (unless the parameters are checked immediately by
|
||||||
|
``CONFIG_KMSAN_CHECK_PARAM_RETVAL``).
|
||||||
|
|
||||||
|
Passing uninitialized values to functions
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Clang's MemorySanitizer instrumentation has an option,
|
||||||
|
``-fsanitize-memory-param-retval``, which makes the compiler check function
|
||||||
|
parameters passed by value, as well as function return values.
|
||||||
|
|
||||||
|
The option is controlled by ``CONFIG_KMSAN_CHECK_PARAM_RETVAL``, which is
|
||||||
|
enabled by default to let KMSAN report uninitialized values earlier.
|
||||||
|
Please refer to the `LKML discussion`_ for more details.
|
||||||
|
|
||||||
|
Because of the way the checks are implemented in LLVM (they are only applied to
|
||||||
|
parameters marked as ``noundef``), not all parameters are guaranteed to be
|
||||||
|
checked, so we cannot give up the metadata storage in ``kmsan_context_state``.
|
||||||
|
|
||||||
|
String functions
|
||||||
|
~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
The compiler replaces calls to ``memcpy()``/``memmove()``/``memset()`` with the
|
||||||
|
following functions. These functions are also called when data structures are
|
||||||
|
initialized or copied, making sure shadow and origin values are copied alongside
|
||||||
|
the data::
|
||||||
|
|
||||||
|
void *__msan_memcpy(void *dst, void *src, uintptr_t n)
|
||||||
|
void *__msan_memmove(void *dst, void *src, uintptr_t n)
|
||||||
|
void *__msan_memset(void *dst, int c, uintptr_t n)
|
||||||
|
|
||||||
|
Error reporting
|
||||||
|
~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
For each use of a value the compiler emits a shadow check that calls
|
||||||
|
``__msan_warning()`` in the case that value is poisoned::
|
||||||
|
|
||||||
|
void __msan_warning(u32 origin)
|
||||||
|
|
||||||
|
``__msan_warning()`` causes the KMSAN runtime to print an error report.
|
||||||
|
|
||||||
|
Inline assembly instrumentation
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
KMSAN instruments every inline assembly output with a call to::
|
||||||
|
|
||||||
|
void __msan_instrument_asm_store(void *addr, uintptr_t size)
|
||||||
|
|
||||||
|
This call unpoisons the memory region.
|
||||||
|
|
||||||
|
This approach may mask certain errors, but it also helps to avoid a lot of
|
||||||
|
false positives in bitwise operations, atomics etc.
|
||||||
|
|
||||||
|
Sometimes the pointers passed into inline assembly do not point to valid memory.
|
||||||
|
In such cases they are ignored at runtime.
|
||||||
|
|
||||||
|
|
||||||
|
Runtime library
|
||||||
|
---------------
|
||||||
|
|
||||||
|
The code is located in ``mm/kmsan/``.
|
||||||
|
|
||||||
|
Per-task KMSAN state
|
||||||
|
~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Every task_struct has an associated KMSAN task state that holds the KMSAN
|
||||||
|
context (see above) and a per-task flag disallowing KMSAN reports::
|
||||||
|
|
||||||
|
struct kmsan_context {
|
||||||
|
...
|
||||||
|
bool allow_reporting;
|
||||||
|
struct kmsan_context_state cstate;
|
||||||
|
...
|
||||||
|
};
|
||||||
|
|
||||||
|
struct task_struct {
|
||||||
|
...
|
||||||
|
struct kmsan_context kmsan;
|
||||||
|
...
|
||||||
|
};
|
||||||
|
|
||||||
|
KMSAN contexts
|
||||||
|
~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
When running in a kernel task context, KMSAN uses ``current->kmsan.cstate`` to
|
||||||
|
hold the metadata for function parameters and return values.
|
||||||
|
|
||||||
|
However, when the kernel is running in interrupt, softirq or NMI context,
|
||||||
|
where ``current`` is unavailable, KMSAN switches to per-cpu interrupt state::
|
||||||
|
|
||||||
|
DEFINE_PER_CPU(struct kmsan_ctx, kmsan_percpu_ctx);
|
||||||
|
|
||||||
|
Metadata allocation
|
||||||
|
~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
There are several places in the kernel for which the metadata is stored.
|
||||||
|
|
||||||
|
1. Each ``struct page`` instance contains two pointers to its shadow and
|
||||||
|
origin pages::
|
||||||
|
|
||||||
|
struct page {
|
||||||
|
...
|
||||||
|
struct page *shadow, *origin;
|
||||||
|
...
|
||||||
|
};
|
||||||
|
|
||||||
|
At boot-time, the kernel allocates shadow and origin pages for every available
|
||||||
|
kernel page. This is done quite late, when the kernel address space is already
|
||||||
|
fragmented, so normal data pages may arbitrarily interleave with the metadata
|
||||||
|
pages.
|
||||||
|
|
||||||
|
This means that in general for two contiguous memory pages their shadow/origin
|
||||||
|
pages may not be contiguous. Consequently, if a memory access crosses the
|
||||||
|
boundary of a memory block, accesses to shadow/origin memory may potentially
|
||||||
|
corrupt other pages or read incorrect values from them.
|
||||||
|
|
||||||
|
In practice, contiguous memory pages returned by the same ``alloc_pages()``
|
||||||
|
call will have contiguous metadata, whereas if these pages belong to two
|
||||||
|
different allocations their metadata pages can be fragmented.
|
||||||
|
|
||||||
|
For the kernel data (``.data``, ``.bss`` etc.) and percpu memory regions
|
||||||
|
there also are no guarantees on metadata contiguity.
|
||||||
|
|
||||||
|
In the case ``__msan_metadata_ptr_for_XXX_YYY()`` hits the border between two
|
||||||
|
pages with non-contiguous metadata, it returns pointers to fake shadow/origin regions::
|
||||||
|
|
||||||
|
char dummy_load_page[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
|
||||||
|
char dummy_store_page[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
|
||||||
|
|
||||||
|
``dummy_load_page`` is zero-initialized, so reads from it always yield zeroes.
|
||||||
|
All stores to ``dummy_store_page`` are ignored.
|
||||||
|
|
||||||
|
2. For vmalloc memory and modules, there is a direct mapping between the memory
|
||||||
|
range, its shadow and origin. KMSAN reduces the vmalloc area by 3/4, making only
|
||||||
|
the first quarter available to ``vmalloc()``. The second quarter of the vmalloc
|
||||||
|
area contains shadow memory for the first quarter, the third one holds the
|
||||||
|
origins. A small part of the fourth quarter contains shadow and origins for the
|
||||||
|
kernel modules. Please refer to ``arch/x86/include/asm/pgtable_64_types.h`` for
|
||||||
|
more details.
|
||||||
|
|
||||||
|
When an array of pages is mapped into a contiguous virtual memory space, their
|
||||||
|
shadow and origin pages are similarly mapped into contiguous regions.
|
||||||
|
|
||||||
|
References
|
||||||
|
==========
|
||||||
|
|
||||||
|
E. Stepanov, K. Serebryany. `MemorySanitizer: fast detector of uninitialized
|
||||||
|
memory use in C++
|
||||||
|
<https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43308.pdf>`_.
|
||||||
|
In Proceedings of CGO 2015.
|
||||||
|
|
||||||
|
.. _MemorySanitizer tool: https://clang.llvm.org/docs/MemorySanitizer.html
|
||||||
|
.. _LLVM documentation: https://llvm.org/docs/GettingStarted.html
|
||||||
|
.. _LKML discussion: https://lore.kernel.org/all/20220614144853.3693273-1-glider@google.com/
|
@ -51,6 +51,7 @@ above structured documentation, or deleted if it has served its purpose.
|
|||||||
ksm
|
ksm
|
||||||
memory-model
|
memory-model
|
||||||
mmu_notifier
|
mmu_notifier
|
||||||
|
multigen_lru
|
||||||
numa
|
numa
|
||||||
overcommit-accounting
|
overcommit-accounting
|
||||||
page_migration
|
page_migration
|
||||||
|
@ -26,7 +26,7 @@ tree.
|
|||||||
|
|
||||||
If a KSM page is shared between less than ``max_page_sharing`` VMAs,
|
If a KSM page is shared between less than ``max_page_sharing`` VMAs,
|
||||||
the node of the stable tree that represents such KSM page points to a
|
the node of the stable tree that represents such KSM page points to a
|
||||||
list of struct rmap_item and the ``page->mapping`` of the
|
list of struct ksm_rmap_item and the ``page->mapping`` of the
|
||||||
KSM page points to the stable tree node.
|
KSM page points to the stable tree node.
|
||||||
|
|
||||||
When the sharing passes this threshold, KSM adds a second dimension to
|
When the sharing passes this threshold, KSM adds a second dimension to
|
||||||
|
159
Documentation/mm/multigen_lru.rst
Normal file
159
Documentation/mm/multigen_lru.rst
Normal file
@ -0,0 +1,159 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
|
=============
|
||||||
|
Multi-Gen LRU
|
||||||
|
=============
|
||||||
|
The multi-gen LRU is an alternative LRU implementation that optimizes
|
||||||
|
page reclaim and improves performance under memory pressure. Page
|
||||||
|
reclaim decides the kernel's caching policy and ability to overcommit
|
||||||
|
memory. It directly impacts the kswapd CPU usage and RAM efficiency.
|
||||||
|
|
||||||
|
Design overview
|
||||||
|
===============
|
||||||
|
Objectives
|
||||||
|
----------
|
||||||
|
The design objectives are:
|
||||||
|
|
||||||
|
* Good representation of access recency
|
||||||
|
* Try to profit from spatial locality
|
||||||
|
* Fast paths to make obvious choices
|
||||||
|
* Simple self-correcting heuristics
|
||||||
|
|
||||||
|
The representation of access recency is at the core of all LRU
|
||||||
|
implementations. In the multi-gen LRU, each generation represents a
|
||||||
|
group of pages with similar access recency. Generations establish a
|
||||||
|
(time-based) common frame of reference and therefore help make better
|
||||||
|
choices, e.g., between different memcgs on a computer or different
|
||||||
|
computers in a data center (for job scheduling).
|
||||||
|
|
||||||
|
Exploiting spatial locality improves efficiency when gathering the
|
||||||
|
accessed bit. A rmap walk targets a single page and does not try to
|
||||||
|
profit from discovering a young PTE. A page table walk can sweep all
|
||||||
|
the young PTEs in an address space, but the address space can be too
|
||||||
|
sparse to make a profit. The key is to optimize both methods and use
|
||||||
|
them in combination.
|
||||||
|
|
||||||
|
Fast paths reduce code complexity and runtime overhead. Unmapped pages
|
||||||
|
do not require TLB flushes; clean pages do not require writeback.
|
||||||
|
These facts are only helpful when other conditions, e.g., access
|
||||||
|
recency, are similar. With generations as a common frame of reference,
|
||||||
|
additional factors stand out. But obvious choices might not be good
|
||||||
|
choices; thus self-correction is necessary.
|
||||||
|
|
||||||
|
The benefits of simple self-correcting heuristics are self-evident.
|
||||||
|
Again, with generations as a common frame of reference, this becomes
|
||||||
|
attainable. Specifically, pages in the same generation can be
|
||||||
|
categorized based on additional factors, and a feedback loop can
|
||||||
|
statistically compare the refault percentages across those categories
|
||||||
|
and infer which of them are better choices.
|
||||||
|
|
||||||
|
Assumptions
|
||||||
|
-----------
|
||||||
|
The protection of hot pages and the selection of cold pages are based
|
||||||
|
on page access channels and patterns. There are two access channels:
|
||||||
|
|
||||||
|
* Accesses through page tables
|
||||||
|
* Accesses through file descriptors
|
||||||
|
|
||||||
|
The protection of the former channel is by design stronger because:
|
||||||
|
|
||||||
|
1. The uncertainty in determining the access patterns of the former
|
||||||
|
channel is higher due to the approximation of the accessed bit.
|
||||||
|
2. The cost of evicting the former channel is higher due to the TLB
|
||||||
|
flushes required and the likelihood of encountering the dirty bit.
|
||||||
|
3. The penalty of underprotecting the former channel is higher because
|
||||||
|
applications usually do not prepare themselves for major page
|
||||||
|
faults like they do for blocked I/O. E.g., GUI applications
|
||||||
|
commonly use dedicated I/O threads to avoid blocking rendering
|
||||||
|
threads.
|
||||||
|
|
||||||
|
There are also two access patterns:
|
||||||
|
|
||||||
|
* Accesses exhibiting temporal locality
|
||||||
|
* Accesses not exhibiting temporal locality
|
||||||
|
|
||||||
|
For the reasons listed above, the former channel is assumed to follow
|
||||||
|
the former pattern unless ``VM_SEQ_READ`` or ``VM_RAND_READ`` is
|
||||||
|
present, and the latter channel is assumed to follow the latter
|
||||||
|
pattern unless outlying refaults have been observed.
|
||||||
|
|
||||||
|
Workflow overview
|
||||||
|
=================
|
||||||
|
Evictable pages are divided into multiple generations for each
|
||||||
|
``lruvec``. The youngest generation number is stored in
|
||||||
|
``lrugen->max_seq`` for both anon and file types as they are aged on
|
||||||
|
an equal footing. The oldest generation numbers are stored in
|
||||||
|
``lrugen->min_seq[]`` separately for anon and file types as clean file
|
||||||
|
pages can be evicted regardless of swap constraints. These three
|
||||||
|
variables are monotonically increasing.
|
||||||
|
|
||||||
|
Generation numbers are truncated into ``order_base_2(MAX_NR_GENS+1)``
|
||||||
|
bits in order to fit into the gen counter in ``folio->flags``. Each
|
||||||
|
truncated generation number is an index to ``lrugen->lists[]``. The
|
||||||
|
sliding window technique is used to track at least ``MIN_NR_GENS`` and
|
||||||
|
at most ``MAX_NR_GENS`` generations. The gen counter stores a value
|
||||||
|
within ``[1, MAX_NR_GENS]`` while a page is on one of
|
||||||
|
``lrugen->lists[]``; otherwise it stores zero.
|
||||||
|
|
||||||
|
Each generation is divided into multiple tiers. A page accessed ``N``
|
||||||
|
times through file descriptors is in tier ``order_base_2(N)``. Unlike
|
||||||
|
generations, tiers do not have dedicated ``lrugen->lists[]``. In
|
||||||
|
contrast to moving across generations, which requires the LRU lock,
|
||||||
|
moving across tiers only involves atomic operations on
|
||||||
|
``folio->flags`` and therefore has a negligible cost. A feedback loop
|
||||||
|
modeled after the PID controller monitors refaults over all the tiers
|
||||||
|
from anon and file types and decides which tiers from which types to
|
||||||
|
evict or protect.
|
||||||
|
|
||||||
|
There are two conceptually independent procedures: the aging and the
|
||||||
|
eviction. They form a closed-loop system, i.e., the page reclaim.
|
||||||
|
|
||||||
|
Aging
|
||||||
|
-----
|
||||||
|
The aging produces young generations. Given an ``lruvec``, it
|
||||||
|
increments ``max_seq`` when ``max_seq-min_seq+1`` approaches
|
||||||
|
``MIN_NR_GENS``. The aging promotes hot pages to the youngest
|
||||||
|
generation when it finds them accessed through page tables; the
|
||||||
|
demotion of cold pages happens consequently when it increments
|
||||||
|
``max_seq``. The aging uses page table walks and rmap walks to find
|
||||||
|
young PTEs. For the former, it iterates ``lruvec_memcg()->mm_list``
|
||||||
|
and calls ``walk_page_range()`` with each ``mm_struct`` on this list
|
||||||
|
to scan PTEs, and after each iteration, it increments ``max_seq``. For
|
||||||
|
the latter, when the eviction walks the rmap and finds a young PTE,
|
||||||
|
the aging scans the adjacent PTEs. For both, on finding a young PTE,
|
||||||
|
the aging clears the accessed bit and updates the gen counter of the
|
||||||
|
page mapped by this PTE to ``(max_seq%MAX_NR_GENS)+1``.
|
||||||
|
|
||||||
|
Eviction
|
||||||
|
--------
|
||||||
|
The eviction consumes old generations. Given an ``lruvec``, it
|
||||||
|
increments ``min_seq`` when ``lrugen->lists[]`` indexed by
|
||||||
|
``min_seq%MAX_NR_GENS`` becomes empty. To select a type and a tier to
|
||||||
|
evict from, it first compares ``min_seq[]`` to select the older type.
|
||||||
|
If both types are equally old, it selects the one whose first tier has
|
||||||
|
a lower refault percentage. The first tier contains single-use
|
||||||
|
unmapped clean pages, which are the best bet. The eviction sorts a
|
||||||
|
page according to its gen counter if the aging has found this page
|
||||||
|
accessed through page tables and updated its gen counter. It also
|
||||||
|
moves a page to the next generation, i.e., ``min_seq+1``, if this page
|
||||||
|
was accessed multiple times through file descriptors and the feedback
|
||||||
|
loop has detected outlying refaults from the tier this page is in. To
|
||||||
|
this end, the feedback loop uses the first tier as the baseline, for
|
||||||
|
the reason stated earlier.
|
||||||
|
|
||||||
|
Summary
|
||||||
|
-------
|
||||||
|
The multi-gen LRU can be disassembled into the following parts:
|
||||||
|
|
||||||
|
* Generations
|
||||||
|
* Rmap walks
|
||||||
|
* Page table walks
|
||||||
|
* Bloom filters
|
||||||
|
* PID controller
|
||||||
|
|
||||||
|
The aging and the eviction form a producer-consumer model;
|
||||||
|
specifically, the latter drives the former by the sliding window over
|
||||||
|
generations. Within the aging, rmap walks drive page table walks by
|
||||||
|
inserting hot densely populated page tables to the Bloom filters.
|
||||||
|
Within the eviction, the PID controller uses refaults as the feedback
|
||||||
|
to select types to evict and tiers to protect.
|
@ -94,6 +94,11 @@ Usage
|
|||||||
Page allocated via order XXX, ...
|
Page allocated via order XXX, ...
|
||||||
PFN XXX ...
|
PFN XXX ...
|
||||||
// Detailed stack
|
// Detailed stack
|
||||||
|
By default, it will do a full pfn dump. To start from a given pfn,
|
||||||
|
page_owner supports fseek.
|
||||||
|
|
||||||
|
FILE *fp = fopen("/sys/kernel/debug/page_owner", "r");
|
||||||
|
fseek(fp, pfn_start, SEEK_SET);
|
||||||
|
|
||||||
The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows
|
The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows
|
||||||
in buf, uses regexp to extract the page order value, counts the times
|
in buf, uses regexp to extract the page order value, counts the times
|
||||||
|
27
MAINTAINERS
27
MAINTAINERS
@ -11004,7 +11004,6 @@ F: arch/*/include/asm/*kasan.h
|
|||||||
F: arch/*/mm/kasan_init*
|
F: arch/*/mm/kasan_init*
|
||||||
F: include/linux/kasan*.h
|
F: include/linux/kasan*.h
|
||||||
F: lib/Kconfig.kasan
|
F: lib/Kconfig.kasan
|
||||||
F: lib/test_kasan*.c
|
|
||||||
F: mm/kasan/
|
F: mm/kasan/
|
||||||
F: scripts/Makefile.kasan
|
F: scripts/Makefile.kasan
|
||||||
|
|
||||||
@ -11438,6 +11437,20 @@ F: kernel/kmod.c
|
|||||||
F: lib/test_kmod.c
|
F: lib/test_kmod.c
|
||||||
F: tools/testing/selftests/kmod/
|
F: tools/testing/selftests/kmod/
|
||||||
|
|
||||||
|
KMSAN
|
||||||
|
M: Alexander Potapenko <glider@google.com>
|
||||||
|
R: Marco Elver <elver@google.com>
|
||||||
|
R: Dmitry Vyukov <dvyukov@google.com>
|
||||||
|
L: kasan-dev@googlegroups.com
|
||||||
|
S: Maintained
|
||||||
|
F: Documentation/dev-tools/kmsan.rst
|
||||||
|
F: arch/*/include/asm/kmsan.h
|
||||||
|
F: arch/*/mm/kmsan_*
|
||||||
|
F: include/linux/kmsan*.h
|
||||||
|
F: lib/Kconfig.kmsan
|
||||||
|
F: mm/kmsan/
|
||||||
|
F: scripts/Makefile.kmsan
|
||||||
|
|
||||||
KPROBES
|
KPROBES
|
||||||
M: Naveen N. Rao <naveen.n.rao@linux.ibm.com>
|
M: Naveen N. Rao <naveen.n.rao@linux.ibm.com>
|
||||||
M: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
|
M: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
|
||||||
@ -12168,6 +12181,18 @@ L: linux-man@vger.kernel.org
|
|||||||
S: Maintained
|
S: Maintained
|
||||||
W: http://www.kernel.org/doc/man-pages
|
W: http://www.kernel.org/doc/man-pages
|
||||||
|
|
||||||
|
MAPLE TREE
|
||||||
|
M: Liam R. Howlett <Liam.Howlett@oracle.com>
|
||||||
|
L: linux-mm@kvack.org
|
||||||
|
S: Supported
|
||||||
|
F: Documentation/core-api/maple_tree.rst
|
||||||
|
F: include/linux/maple_tree.h
|
||||||
|
F: include/trace/events/maple_tree.h
|
||||||
|
F: lib/maple_tree.c
|
||||||
|
F: lib/test_maple_tree.c
|
||||||
|
F: tools/testing/radix-tree/linux/maple_tree.h
|
||||||
|
F: tools/testing/radix-tree/maple.c
|
||||||
|
|
||||||
MARDUK (CREATOR CI40) DEVICE TREE SUPPORT
|
MARDUK (CREATOR CI40) DEVICE TREE SUPPORT
|
||||||
M: Rahul Bedarkar <rahulbedarkar89@gmail.com>
|
M: Rahul Bedarkar <rahulbedarkar89@gmail.com>
|
||||||
L: linux-mips@vger.kernel.org
|
L: linux-mips@vger.kernel.org
|
||||||
|
1
Makefile
1
Makefile
@ -1081,6 +1081,7 @@ include-y := scripts/Makefile.extrawarn
|
|||||||
include-$(CONFIG_DEBUG_INFO) += scripts/Makefile.debug
|
include-$(CONFIG_DEBUG_INFO) += scripts/Makefile.debug
|
||||||
include-$(CONFIG_KASAN) += scripts/Makefile.kasan
|
include-$(CONFIG_KASAN) += scripts/Makefile.kasan
|
||||||
include-$(CONFIG_KCSAN) += scripts/Makefile.kcsan
|
include-$(CONFIG_KCSAN) += scripts/Makefile.kcsan
|
||||||
|
include-$(CONFIG_KMSAN) += scripts/Makefile.kmsan
|
||||||
include-$(CONFIG_UBSAN) += scripts/Makefile.ubsan
|
include-$(CONFIG_UBSAN) += scripts/Makefile.ubsan
|
||||||
include-$(CONFIG_KCOV) += scripts/Makefile.kcov
|
include-$(CONFIG_KCOV) += scripts/Makefile.kcov
|
||||||
include-$(CONFIG_RANDSTRUCT) += scripts/Makefile.randstruct
|
include-$(CONFIG_RANDSTRUCT) += scripts/Makefile.randstruct
|
||||||
|
@ -1416,6 +1416,14 @@ config DYNAMIC_SIGFRAME
|
|||||||
config HAVE_ARCH_NODE_DEV_GROUP
|
config HAVE_ARCH_NODE_DEV_GROUP
|
||||||
bool
|
bool
|
||||||
|
|
||||||
|
config ARCH_HAS_NONLEAF_PMD_YOUNG
|
||||||
|
bool
|
||||||
|
help
|
||||||
|
Architectures that select this option are capable of setting the
|
||||||
|
accessed bit in non-leaf PMD entries when using them as part of linear
|
||||||
|
address translations. Page table walkers that clear the accessed bit
|
||||||
|
may use this capability to reduce their search space.
|
||||||
|
|
||||||
source "kernel/gcov/Kconfig"
|
source "kernel/gcov/Kconfig"
|
||||||
|
|
||||||
source "scripts/gcc-plugins/Kconfig"
|
source "scripts/gcc-plugins/Kconfig"
|
||||||
|
@ -76,6 +76,8 @@
|
|||||||
|
|
||||||
#define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */
|
#define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */
|
||||||
|
|
||||||
|
#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
|
||||||
|
|
||||||
/* compatibility flags */
|
/* compatibility flags */
|
||||||
#define MAP_FILE 0
|
#define MAP_FILE 0
|
||||||
|
|
||||||
|
@ -554,7 +554,7 @@ config ARC_BUILTIN_DTB_NAME
|
|||||||
|
|
||||||
endmenu # "ARC Architecture Configuration"
|
endmenu # "ARC Architecture Configuration"
|
||||||
|
|
||||||
config FORCE_MAX_ZONEORDER
|
config ARCH_FORCE_MAX_ORDER
|
||||||
int "Maximum zone order"
|
int "Maximum zone order"
|
||||||
default "12" if ARC_HUGEPAGE_16M
|
default "12" if ARC_HUGEPAGE_16M
|
||||||
default "11"
|
default "11"
|
||||||
|
@ -1362,7 +1362,7 @@ config ARM_MODULE_PLTS
|
|||||||
Disabling this is usually safe for small single-platform
|
Disabling this is usually safe for small single-platform
|
||||||
configurations. If unsure, say y.
|
configurations. If unsure, say y.
|
||||||
|
|
||||||
config FORCE_MAX_ZONEORDER
|
config ARCH_FORCE_MAX_ORDER
|
||||||
int "Maximum zone order"
|
int "Maximum zone order"
|
||||||
default "12" if SOC_AM33XX
|
default "12" if SOC_AM33XX
|
||||||
default "9" if SA1111
|
default "9" if SA1111
|
||||||
|
@ -31,7 +31,7 @@ CONFIG_SOC_VF610=y
|
|||||||
CONFIG_SMP=y
|
CONFIG_SMP=y
|
||||||
CONFIG_ARM_PSCI=y
|
CONFIG_ARM_PSCI=y
|
||||||
CONFIG_HIGHMEM=y
|
CONFIG_HIGHMEM=y
|
||||||
CONFIG_FORCE_MAX_ZONEORDER=14
|
CONFIG_ARCH_FORCE_MAX_ORDER=14
|
||||||
CONFIG_CMDLINE="noinitrd console=ttymxc0,115200"
|
CONFIG_CMDLINE="noinitrd console=ttymxc0,115200"
|
||||||
CONFIG_KEXEC=y
|
CONFIG_KEXEC=y
|
||||||
CONFIG_CPU_FREQ=y
|
CONFIG_CPU_FREQ=y
|
||||||
|
@ -26,7 +26,7 @@ CONFIG_THUMB2_KERNEL=y
|
|||||||
# CONFIG_THUMB2_AVOID_R_ARM_THM_JUMP11 is not set
|
# CONFIG_THUMB2_AVOID_R_ARM_THM_JUMP11 is not set
|
||||||
# CONFIG_ARM_PATCH_IDIV is not set
|
# CONFIG_ARM_PATCH_IDIV is not set
|
||||||
CONFIG_HIGHMEM=y
|
CONFIG_HIGHMEM=y
|
||||||
CONFIG_FORCE_MAX_ZONEORDER=12
|
CONFIG_ARCH_FORCE_MAX_ORDER=12
|
||||||
CONFIG_SECCOMP=y
|
CONFIG_SECCOMP=y
|
||||||
CONFIG_KEXEC=y
|
CONFIG_KEXEC=y
|
||||||
CONFIG_EFI=y
|
CONFIG_EFI=y
|
||||||
|
@ -12,7 +12,7 @@ CONFIG_ARCH_OXNAS=y
|
|||||||
CONFIG_MACH_OX820=y
|
CONFIG_MACH_OX820=y
|
||||||
CONFIG_SMP=y
|
CONFIG_SMP=y
|
||||||
CONFIG_NR_CPUS=16
|
CONFIG_NR_CPUS=16
|
||||||
CONFIG_FORCE_MAX_ZONEORDER=12
|
CONFIG_ARCH_FORCE_MAX_ORDER=12
|
||||||
CONFIG_SECCOMP=y
|
CONFIG_SECCOMP=y
|
||||||
CONFIG_ARM_APPENDED_DTB=y
|
CONFIG_ARM_APPENDED_DTB=y
|
||||||
CONFIG_ARM_ATAG_DTB_COMPAT=y
|
CONFIG_ARM_ATAG_DTB_COMPAT=y
|
||||||
|
@ -21,7 +21,7 @@ CONFIG_MACH_AKITA=y
|
|||||||
CONFIG_MACH_BORZOI=y
|
CONFIG_MACH_BORZOI=y
|
||||||
CONFIG_PXA_SYSTEMS_CPLDS=y
|
CONFIG_PXA_SYSTEMS_CPLDS=y
|
||||||
CONFIG_AEABI=y
|
CONFIG_AEABI=y
|
||||||
CONFIG_FORCE_MAX_ZONEORDER=9
|
CONFIG_ARCH_FORCE_MAX_ORDER=9
|
||||||
CONFIG_CMDLINE="root=/dev/ram0 ro"
|
CONFIG_CMDLINE="root=/dev/ram0 ro"
|
||||||
CONFIG_KEXEC=y
|
CONFIG_KEXEC=y
|
||||||
CONFIG_CPU_FREQ=y
|
CONFIG_CPU_FREQ=y
|
||||||
|
@ -19,7 +19,7 @@ CONFIG_ATMEL_CLOCKSOURCE_TCB=y
|
|||||||
# CONFIG_CACHE_L2X0 is not set
|
# CONFIG_CACHE_L2X0 is not set
|
||||||
# CONFIG_ARM_PATCH_IDIV is not set
|
# CONFIG_ARM_PATCH_IDIV is not set
|
||||||
# CONFIG_CPU_SW_DOMAIN_PAN is not set
|
# CONFIG_CPU_SW_DOMAIN_PAN is not set
|
||||||
CONFIG_FORCE_MAX_ZONEORDER=15
|
CONFIG_ARCH_FORCE_MAX_ORDER=15
|
||||||
CONFIG_UACCESS_WITH_MEMCPY=y
|
CONFIG_UACCESS_WITH_MEMCPY=y
|
||||||
# CONFIG_ATAGS is not set
|
# CONFIG_ATAGS is not set
|
||||||
CONFIG_CMDLINE="console=ttyS0,115200 earlyprintk ignore_loglevel"
|
CONFIG_CMDLINE="console=ttyS0,115200 earlyprintk ignore_loglevel"
|
||||||
|
@ -17,7 +17,7 @@ CONFIG_ARCH_SUNPLUS=y
|
|||||||
# CONFIG_VDSO is not set
|
# CONFIG_VDSO is not set
|
||||||
CONFIG_SMP=y
|
CONFIG_SMP=y
|
||||||
CONFIG_THUMB2_KERNEL=y
|
CONFIG_THUMB2_KERNEL=y
|
||||||
CONFIG_FORCE_MAX_ZONEORDER=12
|
CONFIG_ARCH_FORCE_MAX_ORDER=12
|
||||||
CONFIG_VFP=y
|
CONFIG_VFP=y
|
||||||
CONFIG_NEON=y
|
CONFIG_NEON=y
|
||||||
CONFIG_MODULES=y
|
CONFIG_MODULES=y
|
||||||
|
@ -1431,7 +1431,7 @@ config XEN
|
|||||||
help
|
help
|
||||||
Say Y if you want to run Linux in a Virtual Machine on Xen on ARM64.
|
Say Y if you want to run Linux in a Virtual Machine on Xen on ARM64.
|
||||||
|
|
||||||
config FORCE_MAX_ZONEORDER
|
config ARCH_FORCE_MAX_ORDER
|
||||||
int
|
int
|
||||||
default "14" if ARM64_64K_PAGES
|
default "14" if ARM64_64K_PAGES
|
||||||
default "12" if ARM64_16K_PAGES
|
default "12" if ARM64_16K_PAGES
|
||||||
|
@ -1082,24 +1082,13 @@ static inline void update_mmu_cache(struct vm_area_struct *vma,
|
|||||||
* page after fork() + CoW for pfn mappings. We don't always have a
|
* page after fork() + CoW for pfn mappings. We don't always have a
|
||||||
* hardware-managed access flag on arm64.
|
* hardware-managed access flag on arm64.
|
||||||
*/
|
*/
|
||||||
static inline bool arch_faults_on_old_pte(void)
|
#define arch_has_hw_pte_young cpu_has_hw_af
|
||||||
{
|
|
||||||
/* The register read below requires a stable CPU to make any sense */
|
|
||||||
cant_migrate();
|
|
||||||
|
|
||||||
return !cpu_has_hw_af();
|
|
||||||
}
|
|
||||||
#define arch_faults_on_old_pte arch_faults_on_old_pte
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Experimentally, it's cheap to set the access flag in hardware and we
|
* Experimentally, it's cheap to set the access flag in hardware and we
|
||||||
* benefit from prefaulting mappings as 'old' to start with.
|
* benefit from prefaulting mappings as 'old' to start with.
|
||||||
*/
|
*/
|
||||||
static inline bool arch_wants_old_prefaulted_pte(void)
|
#define arch_wants_old_prefaulted_pte cpu_has_hw_af
|
||||||
{
|
|
||||||
return !arch_faults_on_old_pte();
|
|
||||||
}
|
|
||||||
#define arch_wants_old_prefaulted_pte arch_wants_old_prefaulted_pte
|
|
||||||
|
|
||||||
static inline bool pud_sect_supported(void)
|
static inline bool pud_sect_supported(void)
|
||||||
{
|
{
|
||||||
|
@ -8,9 +8,9 @@
|
|||||||
#include <asm/cpufeature.h>
|
#include <asm/cpufeature.h>
|
||||||
#include <asm/mte.h>
|
#include <asm/mte.h>
|
||||||
|
|
||||||
#define for_each_mte_vma(tsk, vma) \
|
#define for_each_mte_vma(vmi, vma) \
|
||||||
if (system_supports_mte()) \
|
if (system_supports_mte()) \
|
||||||
for (vma = tsk->mm->mmap; vma; vma = vma->vm_next) \
|
for_each_vma(vmi, vma) \
|
||||||
if (vma->vm_flags & VM_MTE)
|
if (vma->vm_flags & VM_MTE)
|
||||||
|
|
||||||
static unsigned long mte_vma_tag_dump_size(struct vm_area_struct *vma)
|
static unsigned long mte_vma_tag_dump_size(struct vm_area_struct *vma)
|
||||||
@ -81,8 +81,9 @@ Elf_Half elf_core_extra_phdrs(void)
|
|||||||
{
|
{
|
||||||
struct vm_area_struct *vma;
|
struct vm_area_struct *vma;
|
||||||
int vma_count = 0;
|
int vma_count = 0;
|
||||||
|
VMA_ITERATOR(vmi, current->mm, 0);
|
||||||
|
|
||||||
for_each_mte_vma(current, vma)
|
for_each_mte_vma(vmi, vma)
|
||||||
vma_count++;
|
vma_count++;
|
||||||
|
|
||||||
return vma_count;
|
return vma_count;
|
||||||
@ -91,8 +92,9 @@ Elf_Half elf_core_extra_phdrs(void)
|
|||||||
int elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
|
int elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
|
||||||
{
|
{
|
||||||
struct vm_area_struct *vma;
|
struct vm_area_struct *vma;
|
||||||
|
VMA_ITERATOR(vmi, current->mm, 0);
|
||||||
|
|
||||||
for_each_mte_vma(current, vma) {
|
for_each_mte_vma(vmi, vma) {
|
||||||
struct elf_phdr phdr;
|
struct elf_phdr phdr;
|
||||||
|
|
||||||
phdr.p_type = PT_AARCH64_MEMTAG_MTE;
|
phdr.p_type = PT_AARCH64_MEMTAG_MTE;
|
||||||
@ -116,8 +118,9 @@ size_t elf_core_extra_data_size(void)
|
|||||||
{
|
{
|
||||||
struct vm_area_struct *vma;
|
struct vm_area_struct *vma;
|
||||||
size_t data_size = 0;
|
size_t data_size = 0;
|
||||||
|
VMA_ITERATOR(vmi, current->mm, 0);
|
||||||
|
|
||||||
for_each_mte_vma(current, vma)
|
for_each_mte_vma(vmi, vma)
|
||||||
data_size += mte_vma_tag_dump_size(vma);
|
data_size += mte_vma_tag_dump_size(vma);
|
||||||
|
|
||||||
return data_size;
|
return data_size;
|
||||||
@ -126,8 +129,9 @@ size_t elf_core_extra_data_size(void)
|
|||||||
int elf_core_write_extra_data(struct coredump_params *cprm)
|
int elf_core_write_extra_data(struct coredump_params *cprm)
|
||||||
{
|
{
|
||||||
struct vm_area_struct *vma;
|
struct vm_area_struct *vma;
|
||||||
|
VMA_ITERATOR(vmi, current->mm, 0);
|
||||||
|
|
||||||
for_each_mte_vma(current, vma) {
|
for_each_mte_vma(vmi, vma) {
|
||||||
if (vma->vm_flags & VM_DONTDUMP)
|
if (vma->vm_flags & VM_DONTDUMP)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
|
@ -133,10 +133,11 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
|
|||||||
{
|
{
|
||||||
struct mm_struct *mm = task->mm;
|
struct mm_struct *mm = task->mm;
|
||||||
struct vm_area_struct *vma;
|
struct vm_area_struct *vma;
|
||||||
|
VMA_ITERATOR(vmi, mm, 0);
|
||||||
|
|
||||||
mmap_read_lock(mm);
|
mmap_read_lock(mm);
|
||||||
|
|
||||||
for (vma = mm->mmap; vma; vma = vma->vm_next) {
|
for_each_vma(vmi, vma) {
|
||||||
unsigned long size = vma->vm_end - vma->vm_start;
|
unsigned long size = vma->vm_end - vma->vm_start;
|
||||||
|
|
||||||
if (vma_is_special_mapping(vma, vdso_info[VDSO_ABI_AA64].dm))
|
if (vma_is_special_mapping(vma, vdso_info[VDSO_ABI_AA64].dm))
|
||||||
|
@ -245,7 +245,7 @@ static inline struct folio *hugetlb_swap_entry_to_folio(swp_entry_t entry)
|
|||||||
{
|
{
|
||||||
VM_BUG_ON(!is_migration_entry(entry) && !is_hwpoison_entry(entry));
|
VM_BUG_ON(!is_migration_entry(entry) && !is_hwpoison_entry(entry));
|
||||||
|
|
||||||
return page_folio(pfn_to_page(swp_offset(entry)));
|
return page_folio(pfn_to_page(swp_offset_pfn(entry)));
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
|
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
|
||||||
|
@ -332,7 +332,7 @@ config HIGHMEM
|
|||||||
select KMAP_LOCAL
|
select KMAP_LOCAL
|
||||||
default y
|
default y
|
||||||
|
|
||||||
config FORCE_MAX_ZONEORDER
|
config ARCH_FORCE_MAX_ORDER
|
||||||
int "Maximum zone order"
|
int "Maximum zone order"
|
||||||
default "11"
|
default "11"
|
||||||
|
|
||||||
|
@ -200,7 +200,7 @@ config IA64_CYCLONE
|
|||||||
Say Y here to enable support for IBM EXA Cyclone time source.
|
Say Y here to enable support for IBM EXA Cyclone time source.
|
||||||
If you're unsure, answer N.
|
If you're unsure, answer N.
|
||||||
|
|
||||||
config FORCE_MAX_ZONEORDER
|
config ARCH_FORCE_MAX_ORDER
|
||||||
int "MAX_ORDER (11 - 17)" if !HUGETLB_PAGE
|
int "MAX_ORDER (11 - 17)" if !HUGETLB_PAGE
|
||||||
range 11 17 if !HUGETLB_PAGE
|
range 11 17 if !HUGETLB_PAGE
|
||||||
default "17" if HUGETLB_PAGE
|
default "17" if HUGETLB_PAGE
|
||||||
|
@ -11,10 +11,10 @@
|
|||||||
|
|
||||||
#define SECTION_SIZE_BITS (30)
|
#define SECTION_SIZE_BITS (30)
|
||||||
#define MAX_PHYSMEM_BITS (50)
|
#define MAX_PHYSMEM_BITS (50)
|
||||||
#ifdef CONFIG_FORCE_MAX_ZONEORDER
|
#ifdef CONFIG_ARCH_FORCE_MAX_ORDER
|
||||||
#if ((CONFIG_FORCE_MAX_ZONEORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS)
|
#if ((CONFIG_ARCH_FORCE_MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS)
|
||||||
#undef SECTION_SIZE_BITS
|
#undef SECTION_SIZE_BITS
|
||||||
#define SECTION_SIZE_BITS (CONFIG_FORCE_MAX_ZONEORDER - 1 + PAGE_SHIFT)
|
#define SECTION_SIZE_BITS (CONFIG_ARCH_FORCE_MAX_ORDER - 1 + PAGE_SHIFT)
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -377,7 +377,7 @@ config NODES_SHIFT
|
|||||||
default "6"
|
default "6"
|
||||||
depends on NUMA
|
depends on NUMA
|
||||||
|
|
||||||
config FORCE_MAX_ZONEORDER
|
config ARCH_FORCE_MAX_ORDER
|
||||||
int "Maximum zone order"
|
int "Maximum zone order"
|
||||||
range 14 64 if PAGE_SIZE_64KB
|
range 14 64 if PAGE_SIZE_64KB
|
||||||
default "14" if PAGE_SIZE_64KB
|
default "14" if PAGE_SIZE_64KB
|
||||||
|
@ -397,7 +397,7 @@ config SINGLE_MEMORY_CHUNK
|
|||||||
order" to save memory that could be wasted for unused memory map.
|
order" to save memory that could be wasted for unused memory map.
|
||||||
Say N if not sure.
|
Say N if not sure.
|
||||||
|
|
||||||
config FORCE_MAX_ZONEORDER
|
config ARCH_FORCE_MAX_ORDER
|
||||||
int "Maximum zone order" if ADVANCED
|
int "Maximum zone order" if ADVANCED
|
||||||
depends on !SINGLE_MEMORY_CHUNK
|
depends on !SINGLE_MEMORY_CHUNK
|
||||||
default "11"
|
default "11"
|
||||||
|
@ -2140,7 +2140,7 @@ config PAGE_SIZE_64KB
|
|||||||
|
|
||||||
endchoice
|
endchoice
|
||||||
|
|
||||||
config FORCE_MAX_ZONEORDER
|
config ARCH_FORCE_MAX_ORDER
|
||||||
int "Maximum zone order"
|
int "Maximum zone order"
|
||||||
range 14 64 if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_64KB
|
range 14 64 if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_64KB
|
||||||
default "14" if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_64KB
|
default "14" if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_64KB
|
||||||
|
@ -9,7 +9,6 @@ CONFIG_HIGH_RES_TIMERS=y
|
|||||||
CONFIG_LOG_BUF_SHIFT=16
|
CONFIG_LOG_BUF_SHIFT=16
|
||||||
CONFIG_CGROUPS=y
|
CONFIG_CGROUPS=y
|
||||||
CONFIG_MEMCG=y
|
CONFIG_MEMCG=y
|
||||||
CONFIG_MEMCG_SWAP=y
|
|
||||||
CONFIG_BLK_CGROUP=y
|
CONFIG_BLK_CGROUP=y
|
||||||
CONFIG_CGROUP_SCHED=y
|
CONFIG_CGROUP_SCHED=y
|
||||||
CONFIG_CFS_BANDWIDTH=y
|
CONFIG_CFS_BANDWIDTH=y
|
||||||
|
@ -3,7 +3,6 @@ CONFIG_NO_HZ_IDLE=y
|
|||||||
CONFIG_IKCONFIG=y
|
CONFIG_IKCONFIG=y
|
||||||
CONFIG_IKCONFIG_PROC=y
|
CONFIG_IKCONFIG_PROC=y
|
||||||
CONFIG_MEMCG=y
|
CONFIG_MEMCG=y
|
||||||
CONFIG_MEMCG_SWAP=y
|
|
||||||
CONFIG_BLK_CGROUP=y
|
CONFIG_BLK_CGROUP=y
|
||||||
CONFIG_CFS_BANDWIDTH=y
|
CONFIG_CFS_BANDWIDTH=y
|
||||||
CONFIG_RT_GROUP_SCHED=y
|
CONFIG_RT_GROUP_SCHED=y
|
||||||
|
@ -103,6 +103,8 @@
|
|||||||
|
|
||||||
#define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */
|
#define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */
|
||||||
|
|
||||||
|
#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
|
||||||
|
|
||||||
/* compatibility flags */
|
/* compatibility flags */
|
||||||
#define MAP_FILE 0
|
#define MAP_FILE 0
|
||||||
|
|
||||||
|
@ -44,7 +44,7 @@ menu "Kernel features"
|
|||||||
|
|
||||||
source "kernel/Kconfig.hz"
|
source "kernel/Kconfig.hz"
|
||||||
|
|
||||||
config FORCE_MAX_ZONEORDER
|
config ARCH_FORCE_MAX_ORDER
|
||||||
int "Maximum zone order"
|
int "Maximum zone order"
|
||||||
range 9 20
|
range 9 20
|
||||||
default "11"
|
default "11"
|
||||||
|
@ -70,6 +70,8 @@
|
|||||||
#define MADV_WIPEONFORK 71 /* Zero memory on fork, child only */
|
#define MADV_WIPEONFORK 71 /* Zero memory on fork, child only */
|
||||||
#define MADV_KEEPONFORK 72 /* Undo MADV_WIPEONFORK */
|
#define MADV_KEEPONFORK 72 /* Undo MADV_WIPEONFORK */
|
||||||
|
|
||||||
|
#define MADV_COLLAPSE 73 /* Synchronous hugepage collapse */
|
||||||
|
|
||||||
#define MADV_HWPOISON 100 /* poison a page for testing */
|
#define MADV_HWPOISON 100 /* poison a page for testing */
|
||||||
#define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */
|
#define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */
|
||||||
|
|
||||||
|
@ -657,15 +657,20 @@ static inline unsigned long mm_total_size(struct mm_struct *mm)
|
|||||||
{
|
{
|
||||||
struct vm_area_struct *vma;
|
struct vm_area_struct *vma;
|
||||||
unsigned long usize = 0;
|
unsigned long usize = 0;
|
||||||
|
VMA_ITERATOR(vmi, mm, 0);
|
||||||
|
|
||||||
for (vma = mm->mmap; vma && usize < parisc_cache_flush_threshold; vma = vma->vm_next)
|
for_each_vma(vmi, vma) {
|
||||||
|
if (usize >= parisc_cache_flush_threshold)
|
||||||
|
break;
|
||||||
usize += vma->vm_end - vma->vm_start;
|
usize += vma->vm_end - vma->vm_start;
|
||||||
|
}
|
||||||
return usize;
|
return usize;
|
||||||
}
|
}
|
||||||
|
|
||||||
void flush_cache_mm(struct mm_struct *mm)
|
void flush_cache_mm(struct mm_struct *mm)
|
||||||
{
|
{
|
||||||
struct vm_area_struct *vma;
|
struct vm_area_struct *vma;
|
||||||
|
VMA_ITERATOR(vmi, mm, 0);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Flushing the whole cache on each cpu takes forever on
|
* Flushing the whole cache on each cpu takes forever on
|
||||||
@ -685,7 +690,7 @@ void flush_cache_mm(struct mm_struct *mm)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Flush mm */
|
/* Flush mm */
|
||||||
for (vma = mm->mmap; vma; vma = vma->vm_next)
|
for_each_vma(vmi, vma)
|
||||||
flush_cache_pages(vma, vma->vm_start, vma->vm_end);
|
flush_cache_pages(vma, vma->vm_start, vma->vm_end);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -846,7 +846,7 @@ config DATA_SHIFT
|
|||||||
in that case. If PIN_TLB is selected, it must be aligned to 8M as
|
in that case. If PIN_TLB is selected, it must be aligned to 8M as
|
||||||
8M pages will be pinned.
|
8M pages will be pinned.
|
||||||
|
|
||||||
config FORCE_MAX_ZONEORDER
|
config ARCH_FORCE_MAX_ORDER
|
||||||
int "Maximum zone order"
|
int "Maximum zone order"
|
||||||
range 8 9 if PPC64 && PPC_64K_PAGES
|
range 8 9 if PPC64 && PPC_64K_PAGES
|
||||||
default "9" if PPC64 && PPC_64K_PAGES
|
default "9" if PPC64 && PPC_64K_PAGES
|
||||||
|
@ -30,7 +30,7 @@ CONFIG_PREEMPT=y
|
|||||||
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
|
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
|
||||||
CONFIG_BINFMT_MISC=m
|
CONFIG_BINFMT_MISC=m
|
||||||
CONFIG_MATH_EMULATION=y
|
CONFIG_MATH_EMULATION=y
|
||||||
CONFIG_FORCE_MAX_ZONEORDER=17
|
CONFIG_ARCH_FORCE_MAX_ORDER=17
|
||||||
CONFIG_PCI=y
|
CONFIG_PCI=y
|
||||||
CONFIG_PCIEPORTBUS=y
|
CONFIG_PCIEPORTBUS=y
|
||||||
CONFIG_PCI_MSI=y
|
CONFIG_PCI_MSI=y
|
||||||
|
@ -41,7 +41,7 @@ CONFIG_FIXED_PHY=y
|
|||||||
CONFIG_FONT_8x16=y
|
CONFIG_FONT_8x16=y
|
||||||
CONFIG_FONT_8x8=y
|
CONFIG_FONT_8x8=y
|
||||||
CONFIG_FONTS=y
|
CONFIG_FONTS=y
|
||||||
CONFIG_FORCE_MAX_ZONEORDER=13
|
CONFIG_ARCH_FORCE_MAX_ORDER=13
|
||||||
CONFIG_FRAMEBUFFER_CONSOLE=y
|
CONFIG_FRAMEBUFFER_CONSOLE=y
|
||||||
CONFIG_FRAME_WARN=1024
|
CONFIG_FRAME_WARN=1024
|
||||||
CONFIG_FTL=y
|
CONFIG_FTL=y
|
||||||
|
@ -17,7 +17,6 @@ CONFIG_LOG_CPU_MAX_BUF_SHIFT=13
|
|||||||
CONFIG_NUMA_BALANCING=y
|
CONFIG_NUMA_BALANCING=y
|
||||||
CONFIG_CGROUPS=y
|
CONFIG_CGROUPS=y
|
||||||
CONFIG_MEMCG=y
|
CONFIG_MEMCG=y
|
||||||
CONFIG_MEMCG_SWAP=y
|
|
||||||
CONFIG_CGROUP_SCHED=y
|
CONFIG_CGROUP_SCHED=y
|
||||||
CONFIG_CGROUP_FREEZER=y
|
CONFIG_CGROUP_FREEZER=y
|
||||||
CONFIG_CPUSETS=y
|
CONFIG_CPUSETS=y
|
||||||
|
@ -18,7 +18,6 @@ CONFIG_LOG_CPU_MAX_BUF_SHIFT=13
|
|||||||
CONFIG_NUMA_BALANCING=y
|
CONFIG_NUMA_BALANCING=y
|
||||||
CONFIG_CGROUPS=y
|
CONFIG_CGROUPS=y
|
||||||
CONFIG_MEMCG=y
|
CONFIG_MEMCG=y
|
||||||
CONFIG_MEMCG_SWAP=y
|
|
||||||
CONFIG_CGROUP_SCHED=y
|
CONFIG_CGROUP_SCHED=y
|
||||||
CONFIG_CGROUP_FREEZER=y
|
CONFIG_CGROUP_FREEZER=y
|
||||||
CONFIG_CPUSETS=y
|
CONFIG_CPUSETS=y
|
||||||
|
@ -115,18 +115,18 @@ struct vdso_data *arch_get_vdso_data(void *vvar_page)
|
|||||||
int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
|
int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
|
||||||
{
|
{
|
||||||
struct mm_struct *mm = task->mm;
|
struct mm_struct *mm = task->mm;
|
||||||
|
VMA_ITERATOR(vmi, mm, 0);
|
||||||
struct vm_area_struct *vma;
|
struct vm_area_struct *vma;
|
||||||
|
|
||||||
mmap_read_lock(mm);
|
mmap_read_lock(mm);
|
||||||
|
for_each_vma(vmi, vma) {
|
||||||
for (vma = mm->mmap; vma; vma = vma->vm_next) {
|
|
||||||
unsigned long size = vma->vm_end - vma->vm_start;
|
unsigned long size = vma->vm_end - vma->vm_start;
|
||||||
|
|
||||||
if (vma_is_special_mapping(vma, &vvar_spec))
|
if (vma_is_special_mapping(vma, &vvar_spec))
|
||||||
zap_page_range(vma, vma->vm_start, size);
|
zap_page_range(vma, vma->vm_start, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
mmap_read_unlock(mm);
|
mmap_read_unlock(mm);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -81,14 +81,15 @@ EXPORT_SYMBOL(hash__flush_range);
|
|||||||
void hash__flush_tlb_mm(struct mm_struct *mm)
|
void hash__flush_tlb_mm(struct mm_struct *mm)
|
||||||
{
|
{
|
||||||
struct vm_area_struct *mp;
|
struct vm_area_struct *mp;
|
||||||
|
VMA_ITERATOR(vmi, mm, 0);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* It is safe to go down the mm's list of vmas when called
|
* It is safe to iterate the vmas when called from dup_mmap,
|
||||||
* from dup_mmap, holding mmap_lock. It would also be safe from
|
* holding mmap_lock. It would also be safe from unmap_region
|
||||||
* unmap_region or exit_mmap, but not from vmtruncate on SMP -
|
* or exit_mmap, but not from vmtruncate on SMP - but it seems
|
||||||
* but it seems dup_mmap is the only SMP case which gets here.
|
* dup_mmap is the only SMP case which gets here.
|
||||||
*/
|
*/
|
||||||
for (mp = mm->mmap; mp != NULL; mp = mp->vm_next)
|
for_each_vma(vmi, mp)
|
||||||
hash__flush_range(mp->vm_mm, mp->vm_start, mp->vm_end);
|
hash__flush_range(mp->vm_mm, mp->vm_start, mp->vm_end);
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL(hash__flush_tlb_mm);
|
EXPORT_SYMBOL(hash__flush_tlb_mm);
|
||||||
|
@ -149,24 +149,15 @@ static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
|
|||||||
unsigned long len)
|
unsigned long len)
|
||||||
{
|
{
|
||||||
struct vm_area_struct *vma;
|
struct vm_area_struct *vma;
|
||||||
|
VMA_ITERATOR(vmi, mm, addr);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We don't try too hard, we just mark all the vma in that range
|
* We don't try too hard, we just mark all the vma in that range
|
||||||
* VM_NOHUGEPAGE and split them.
|
* VM_NOHUGEPAGE and split them.
|
||||||
*/
|
*/
|
||||||
vma = find_vma(mm, addr);
|
for_each_vma_range(vmi, vma, addr + len) {
|
||||||
/*
|
|
||||||
* If the range is in unmapped range, just return
|
|
||||||
*/
|
|
||||||
if (vma && ((addr + len) <= vma->vm_start))
|
|
||||||
return;
|
|
||||||
|
|
||||||
while (vma) {
|
|
||||||
if (vma->vm_start >= (addr + len))
|
|
||||||
break;
|
|
||||||
vma->vm_flags |= VM_NOHUGEPAGE;
|
vma->vm_flags |= VM_NOHUGEPAGE;
|
||||||
walk_page_vma(vma, &subpage_walk_ops, NULL);
|
walk_page_vma(vma, &subpage_walk_ops, NULL);
|
||||||
vma = vma->vm_next;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
|
@ -114,11 +114,12 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
|
|||||||
{
|
{
|
||||||
struct mm_struct *mm = task->mm;
|
struct mm_struct *mm = task->mm;
|
||||||
struct vm_area_struct *vma;
|
struct vm_area_struct *vma;
|
||||||
|
VMA_ITERATOR(vmi, mm, 0);
|
||||||
struct __vdso_info *vdso_info = mm->context.vdso_info;
|
struct __vdso_info *vdso_info = mm->context.vdso_info;
|
||||||
|
|
||||||
mmap_read_lock(mm);
|
mmap_read_lock(mm);
|
||||||
|
|
||||||
for (vma = mm->mmap; vma; vma = vma->vm_next) {
|
for_each_vma(vmi, vma) {
|
||||||
unsigned long size = vma->vm_end - vma->vm_start;
|
unsigned long size = vma->vm_end - vma->vm_start;
|
||||||
|
|
||||||
if (vma_is_special_mapping(vma, vdso_info->dm))
|
if (vma_is_special_mapping(vma, vdso_info->dm))
|
||||||
|
@ -69,10 +69,11 @@ static struct page *find_timens_vvar_page(struct vm_area_struct *vma)
|
|||||||
int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
|
int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
|
||||||
{
|
{
|
||||||
struct mm_struct *mm = task->mm;
|
struct mm_struct *mm = task->mm;
|
||||||
|
VMA_ITERATOR(vmi, mm, 0);
|
||||||
struct vm_area_struct *vma;
|
struct vm_area_struct *vma;
|
||||||
|
|
||||||
mmap_read_lock(mm);
|
mmap_read_lock(mm);
|
||||||
for (vma = mm->mmap; vma; vma = vma->vm_next) {
|
for_each_vma(vmi, vma) {
|
||||||
unsigned long size = vma->vm_end - vma->vm_start;
|
unsigned long size = vma->vm_end - vma->vm_start;
|
||||||
|
|
||||||
if (!vma_is_special_mapping(vma, &vvar_mapping))
|
if (!vma_is_special_mapping(vma, &vvar_mapping))
|
||||||
|
@ -81,8 +81,9 @@ unsigned long _copy_from_user_key(void *to, const void __user *from,
|
|||||||
|
|
||||||
might_fault();
|
might_fault();
|
||||||
if (!should_fail_usercopy()) {
|
if (!should_fail_usercopy()) {
|
||||||
instrument_copy_from_user(to, from, n);
|
instrument_copy_from_user_before(to, from, n);
|
||||||
res = raw_copy_from_user_key(to, from, n, key);
|
res = raw_copy_from_user_key(to, from, n, key);
|
||||||
|
instrument_copy_from_user_after(to, from, n, res);
|
||||||
}
|
}
|
||||||
if (unlikely(res))
|
if (unlikely(res))
|
||||||
memset(to + (n - res), 0, res);
|
memset(to + (n - res), 0, res);
|
||||||
|
@ -2515,8 +2515,9 @@ static const struct mm_walk_ops thp_split_walk_ops = {
|
|||||||
static inline void thp_split_mm(struct mm_struct *mm)
|
static inline void thp_split_mm(struct mm_struct *mm)
|
||||||
{
|
{
|
||||||
struct vm_area_struct *vma;
|
struct vm_area_struct *vma;
|
||||||
|
VMA_ITERATOR(vmi, mm, 0);
|
||||||
|
|
||||||
for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
|
for_each_vma(vmi, vma) {
|
||||||
vma->vm_flags &= ~VM_HUGEPAGE;
|
vma->vm_flags &= ~VM_HUGEPAGE;
|
||||||
vma->vm_flags |= VM_NOHUGEPAGE;
|
vma->vm_flags |= VM_NOHUGEPAGE;
|
||||||
walk_page_vma(vma, &thp_split_walk_ops, NULL);
|
walk_page_vma(vma, &thp_split_walk_ops, NULL);
|
||||||
@ -2584,8 +2585,9 @@ int gmap_mark_unmergeable(void)
|
|||||||
struct mm_struct *mm = current->mm;
|
struct mm_struct *mm = current->mm;
|
||||||
struct vm_area_struct *vma;
|
struct vm_area_struct *vma;
|
||||||
int ret;
|
int ret;
|
||||||
|
VMA_ITERATOR(vmi, mm, 0);
|
||||||
|
|
||||||
for (vma = mm->mmap; vma; vma = vma->vm_next) {
|
for_each_vma(vmi, vma) {
|
||||||
ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
|
ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
|
||||||
MADV_UNMERGEABLE, &vma->vm_flags);
|
MADV_UNMERGEABLE, &vma->vm_flags);
|
||||||
if (ret)
|
if (ret)
|
||||||
|
@ -237,16 +237,6 @@ int pud_huge(pud_t pud)
|
|||||||
return pud_large(pud);
|
return pud_large(pud);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct page *
|
|
||||||
follow_huge_pud(struct mm_struct *mm, unsigned long address,
|
|
||||||
pud_t *pud, int flags)
|
|
||||||
{
|
|
||||||
if (flags & FOLL_GET)
|
|
||||||
return NULL;
|
|
||||||
|
|
||||||
return pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool __init arch_hugetlb_valid_size(unsigned long size)
|
bool __init arch_hugetlb_valid_size(unsigned long size)
|
||||||
{
|
{
|
||||||
if (MACHINE_HAS_EDAT1 && size == PMD_SIZE)
|
if (MACHINE_HAS_EDAT1 && size == PMD_SIZE)
|
||||||
|
@ -8,7 +8,7 @@ CONFIG_MODULES=y
|
|||||||
CONFIG_MODULE_UNLOAD=y
|
CONFIG_MODULE_UNLOAD=y
|
||||||
# CONFIG_BLK_DEV_BSG is not set
|
# CONFIG_BLK_DEV_BSG is not set
|
||||||
CONFIG_CPU_SUBTYPE_SH7724=y
|
CONFIG_CPU_SUBTYPE_SH7724=y
|
||||||
CONFIG_FORCE_MAX_ZONEORDER=12
|
CONFIG_ARCH_FORCE_MAX_ORDER=12
|
||||||
CONFIG_MEMORY_SIZE=0x10000000
|
CONFIG_MEMORY_SIZE=0x10000000
|
||||||
CONFIG_FLATMEM_MANUAL=y
|
CONFIG_FLATMEM_MANUAL=y
|
||||||
CONFIG_SH_ECOVEC=y
|
CONFIG_SH_ECOVEC=y
|
||||||
|
@ -16,7 +16,6 @@ CONFIG_CPUSETS=y
|
|||||||
# CONFIG_PROC_PID_CPUSET is not set
|
# CONFIG_PROC_PID_CPUSET is not set
|
||||||
CONFIG_CGROUP_CPUACCT=y
|
CONFIG_CGROUP_CPUACCT=y
|
||||||
CONFIG_CGROUP_MEMCG=y
|
CONFIG_CGROUP_MEMCG=y
|
||||||
CONFIG_CGROUP_MEMCG_SWAP=y
|
|
||||||
CONFIG_CGROUP_SCHED=y
|
CONFIG_CGROUP_SCHED=y
|
||||||
CONFIG_RT_GROUP_SCHED=y
|
CONFIG_RT_GROUP_SCHED=y
|
||||||
CONFIG_BLK_CGROUP=y
|
CONFIG_BLK_CGROUP=y
|
||||||
|
@ -14,7 +14,6 @@ CONFIG_CPUSETS=y
|
|||||||
# CONFIG_PROC_PID_CPUSET is not set
|
# CONFIG_PROC_PID_CPUSET is not set
|
||||||
CONFIG_CGROUP_CPUACCT=y
|
CONFIG_CGROUP_CPUACCT=y
|
||||||
CONFIG_CGROUP_MEMCG=y
|
CONFIG_CGROUP_MEMCG=y
|
||||||
CONFIG_CGROUP_MEMCG_SWAP=y
|
|
||||||
CONFIG_CGROUP_SCHED=y
|
CONFIG_CGROUP_SCHED=y
|
||||||
CONFIG_RT_GROUP_SCHED=y
|
CONFIG_RT_GROUP_SCHED=y
|
||||||
CONFIG_BLK_DEV_INITRD=y
|
CONFIG_BLK_DEV_INITRD=y
|
||||||
|
@ -18,7 +18,7 @@ config PAGE_OFFSET
|
|||||||
default "0x80000000" if MMU
|
default "0x80000000" if MMU
|
||||||
default "0x00000000"
|
default "0x00000000"
|
||||||
|
|
||||||
config FORCE_MAX_ZONEORDER
|
config ARCH_FORCE_MAX_ORDER
|
||||||
int "Maximum zone order"
|
int "Maximum zone order"
|
||||||
range 9 64 if PAGE_SIZE_16KB
|
range 9 64 if PAGE_SIZE_16KB
|
||||||
default "9" if PAGE_SIZE_16KB
|
default "9" if PAGE_SIZE_16KB
|
||||||
|
@ -269,7 +269,7 @@ config ARCH_SPARSEMEM_ENABLE
|
|||||||
config ARCH_SPARSEMEM_DEFAULT
|
config ARCH_SPARSEMEM_DEFAULT
|
||||||
def_bool y if SPARC64
|
def_bool y if SPARC64
|
||||||
|
|
||||||
config FORCE_MAX_ZONEORDER
|
config ARCH_FORCE_MAX_ORDER
|
||||||
int "Maximum zone order"
|
int "Maximum zone order"
|
||||||
default "13"
|
default "13"
|
||||||
help
|
help
|
||||||
|
@ -584,21 +584,19 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
|
|||||||
|
|
||||||
void flush_tlb_mm(struct mm_struct *mm)
|
void flush_tlb_mm(struct mm_struct *mm)
|
||||||
{
|
{
|
||||||
struct vm_area_struct *vma = mm->mmap;
|
struct vm_area_struct *vma;
|
||||||
|
VMA_ITERATOR(vmi, mm, 0);
|
||||||
|
|
||||||
while (vma != NULL) {
|
for_each_vma(vmi, vma)
|
||||||
fix_range(mm, vma->vm_start, vma->vm_end, 0);
|
fix_range(mm, vma->vm_start, vma->vm_end, 0);
|
||||||
vma = vma->vm_next;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void force_flush_all(void)
|
void force_flush_all(void)
|
||||||
{
|
{
|
||||||
struct mm_struct *mm = current->mm;
|
struct mm_struct *mm = current->mm;
|
||||||
struct vm_area_struct *vma = mm->mmap;
|
struct vm_area_struct *vma;
|
||||||
|
VMA_ITERATOR(vmi, mm, 0);
|
||||||
|
|
||||||
while (vma != NULL) {
|
for_each_vma(vmi, vma)
|
||||||
fix_range(mm, vma->vm_start, vma->vm_end, 1);
|
fix_range(mm, vma->vm_start, vma->vm_end, 1);
|
||||||
vma = vma->vm_next;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -85,6 +85,7 @@ config X86
|
|||||||
select ARCH_HAS_PMEM_API if X86_64
|
select ARCH_HAS_PMEM_API if X86_64
|
||||||
select ARCH_HAS_PTE_DEVMAP if X86_64
|
select ARCH_HAS_PTE_DEVMAP if X86_64
|
||||||
select ARCH_HAS_PTE_SPECIAL
|
select ARCH_HAS_PTE_SPECIAL
|
||||||
|
select ARCH_HAS_NONLEAF_PMD_YOUNG if PGTABLE_LEVELS > 2
|
||||||
select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
|
select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
|
||||||
select ARCH_HAS_COPY_MC if X86_64
|
select ARCH_HAS_COPY_MC if X86_64
|
||||||
select ARCH_HAS_SET_MEMORY
|
select ARCH_HAS_SET_MEMORY
|
||||||
@ -130,7 +131,9 @@ config X86
|
|||||||
select CLKEVT_I8253
|
select CLKEVT_I8253
|
||||||
select CLOCKSOURCE_VALIDATE_LAST_CYCLE
|
select CLOCKSOURCE_VALIDATE_LAST_CYCLE
|
||||||
select CLOCKSOURCE_WATCHDOG
|
select CLOCKSOURCE_WATCHDOG
|
||||||
select DCACHE_WORD_ACCESS
|
# Word-size accesses may read uninitialized data past the trailing \0
|
||||||
|
# in strings and cause false KMSAN reports.
|
||||||
|
select DCACHE_WORD_ACCESS if !KMSAN
|
||||||
select DYNAMIC_SIGFRAME
|
select DYNAMIC_SIGFRAME
|
||||||
select EDAC_ATOMIC_SCRUB
|
select EDAC_ATOMIC_SCRUB
|
||||||
select EDAC_SUPPORT
|
select EDAC_SUPPORT
|
||||||
@ -168,6 +171,7 @@ config X86
|
|||||||
select HAVE_ARCH_KASAN if X86_64
|
select HAVE_ARCH_KASAN if X86_64
|
||||||
select HAVE_ARCH_KASAN_VMALLOC if X86_64
|
select HAVE_ARCH_KASAN_VMALLOC if X86_64
|
||||||
select HAVE_ARCH_KFENCE
|
select HAVE_ARCH_KFENCE
|
||||||
|
select HAVE_ARCH_KMSAN if X86_64
|
||||||
select HAVE_ARCH_KGDB
|
select HAVE_ARCH_KGDB
|
||||||
select HAVE_ARCH_MMAP_RND_BITS if MMU
|
select HAVE_ARCH_MMAP_RND_BITS if MMU
|
||||||
select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT
|
select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT
|
||||||
@ -328,6 +332,10 @@ config GENERIC_ISA_DMA
|
|||||||
def_bool y
|
def_bool y
|
||||||
depends on ISA_DMA_API
|
depends on ISA_DMA_API
|
||||||
|
|
||||||
|
config GENERIC_CSUM
|
||||||
|
bool
|
||||||
|
default y if KMSAN || KASAN
|
||||||
|
|
||||||
config GENERIC_BUG
|
config GENERIC_BUG
|
||||||
def_bool y
|
def_bool y
|
||||||
depends on BUG
|
depends on BUG
|
||||||
|
@ -12,6 +12,7 @@
|
|||||||
# Sanitizer runtimes are unavailable and cannot be linked for early boot code.
|
# Sanitizer runtimes are unavailable and cannot be linked for early boot code.
|
||||||
KASAN_SANITIZE := n
|
KASAN_SANITIZE := n
|
||||||
KCSAN_SANITIZE := n
|
KCSAN_SANITIZE := n
|
||||||
|
KMSAN_SANITIZE := n
|
||||||
OBJECT_FILES_NON_STANDARD := y
|
OBJECT_FILES_NON_STANDARD := y
|
||||||
|
|
||||||
# Kernel does not boot with kcov instrumentation here.
|
# Kernel does not boot with kcov instrumentation here.
|
||||||
|
@ -20,6 +20,7 @@
|
|||||||
# Sanitizer runtimes are unavailable and cannot be linked for early boot code.
|
# Sanitizer runtimes are unavailable and cannot be linked for early boot code.
|
||||||
KASAN_SANITIZE := n
|
KASAN_SANITIZE := n
|
||||||
KCSAN_SANITIZE := n
|
KCSAN_SANITIZE := n
|
||||||
|
KMSAN_SANITIZE := n
|
||||||
OBJECT_FILES_NON_STANDARD := y
|
OBJECT_FILES_NON_STANDARD := y
|
||||||
|
|
||||||
# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
|
# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
|
||||||
|
@ -11,6 +11,9 @@ include $(srctree)/lib/vdso/Makefile
|
|||||||
|
|
||||||
# Sanitizer runtimes are unavailable and cannot be linked here.
|
# Sanitizer runtimes are unavailable and cannot be linked here.
|
||||||
KASAN_SANITIZE := n
|
KASAN_SANITIZE := n
|
||||||
|
KMSAN_SANITIZE_vclock_gettime.o := n
|
||||||
|
KMSAN_SANITIZE_vgetcpu.o := n
|
||||||
|
|
||||||
UBSAN_SANITIZE := n
|
UBSAN_SANITIZE := n
|
||||||
KCSAN_SANITIZE := n
|
KCSAN_SANITIZE := n
|
||||||
OBJECT_FILES_NON_STANDARD := y
|
OBJECT_FILES_NON_STANDARD := y
|
||||||
|
@ -127,17 +127,17 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
|
|||||||
{
|
{
|
||||||
struct mm_struct *mm = task->mm;
|
struct mm_struct *mm = task->mm;
|
||||||
struct vm_area_struct *vma;
|
struct vm_area_struct *vma;
|
||||||
|
VMA_ITERATOR(vmi, mm, 0);
|
||||||
|
|
||||||
mmap_read_lock(mm);
|
mmap_read_lock(mm);
|
||||||
|
for_each_vma(vmi, vma) {
|
||||||
for (vma = mm->mmap; vma; vma = vma->vm_next) {
|
|
||||||
unsigned long size = vma->vm_end - vma->vm_start;
|
unsigned long size = vma->vm_end - vma->vm_start;
|
||||||
|
|
||||||
if (vma_is_special_mapping(vma, &vvar_mapping))
|
if (vma_is_special_mapping(vma, &vvar_mapping))
|
||||||
zap_page_range(vma, vma->vm_start, size);
|
zap_page_range(vma, vma->vm_start, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
mmap_read_unlock(mm);
|
mmap_read_unlock(mm);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
@ -354,6 +354,7 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr)
|
|||||||
{
|
{
|
||||||
struct mm_struct *mm = current->mm;
|
struct mm_struct *mm = current->mm;
|
||||||
struct vm_area_struct *vma;
|
struct vm_area_struct *vma;
|
||||||
|
VMA_ITERATOR(vmi, mm, 0);
|
||||||
|
|
||||||
mmap_write_lock(mm);
|
mmap_write_lock(mm);
|
||||||
/*
|
/*
|
||||||
@ -363,7 +364,7 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr)
|
|||||||
* We could search vma near context.vdso, but it's a slowpath,
|
* We could search vma near context.vdso, but it's a slowpath,
|
||||||
* so let's explicitly check all VMAs to be completely sure.
|
* so let's explicitly check all VMAs to be completely sure.
|
||||||
*/
|
*/
|
||||||
for (vma = mm->mmap; vma; vma = vma->vm_next) {
|
for_each_vma(vmi, vma) {
|
||||||
if (vma_is_special_mapping(vma, &vdso_mapping) ||
|
if (vma_is_special_mapping(vma, &vdso_mapping) ||
|
||||||
vma_is_special_mapping(vma, &vvar_mapping)) {
|
vma_is_special_mapping(vma, &vvar_mapping)) {
|
||||||
mmap_write_unlock(mm);
|
mmap_write_unlock(mm);
|
||||||
|
@ -1,9 +1,13 @@
|
|||||||
/* SPDX-License-Identifier: GPL-2.0 */
|
/* SPDX-License-Identifier: GPL-2.0 */
|
||||||
#define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1
|
#ifdef CONFIG_GENERIC_CSUM
|
||||||
#define HAVE_CSUM_COPY_USER
|
# include <asm-generic/checksum.h>
|
||||||
#define _HAVE_ARCH_CSUM_AND_COPY
|
|
||||||
#ifdef CONFIG_X86_32
|
|
||||||
# include <asm/checksum_32.h>
|
|
||||||
#else
|
#else
|
||||||
# include <asm/checksum_64.h>
|
# define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1
|
||||||
|
# define HAVE_CSUM_COPY_USER
|
||||||
|
# define _HAVE_ARCH_CSUM_AND_COPY
|
||||||
|
# ifdef CONFIG_X86_32
|
||||||
|
# include <asm/checksum_32.h>
|
||||||
|
# else
|
||||||
|
# include <asm/checksum_64.h>
|
||||||
|
# endif
|
||||||
#endif
|
#endif
|
||||||
|
87
arch/x86/include/asm/kmsan.h
Normal file
87
arch/x86/include/asm/kmsan.h
Normal file
@ -0,0 +1,87 @@
|
|||||||
|
/* SPDX-License-Identifier: GPL-2.0 */
|
||||||
|
/*
|
||||||
|
* x86 KMSAN support.
|
||||||
|
*
|
||||||
|
* Copyright (C) 2022, Google LLC
|
||||||
|
* Author: Alexander Potapenko <glider@google.com>
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef _ASM_X86_KMSAN_H
|
||||||
|
#define _ASM_X86_KMSAN_H
|
||||||
|
|
||||||
|
#ifndef MODULE
|
||||||
|
|
||||||
|
#include <asm/cpu_entry_area.h>
|
||||||
|
#include <asm/processor.h>
|
||||||
|
#include <linux/mmzone.h>
|
||||||
|
|
||||||
|
DECLARE_PER_CPU(char[CPU_ENTRY_AREA_SIZE], cpu_entry_area_shadow);
|
||||||
|
DECLARE_PER_CPU(char[CPU_ENTRY_AREA_SIZE], cpu_entry_area_origin);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Functions below are declared in the header to make sure they are inlined.
|
||||||
|
* They all are called from kmsan_get_metadata() for every memory access in
|
||||||
|
* the kernel, so speed is important here.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Compute metadata addresses for the CPU entry area on x86.
|
||||||
|
*/
|
||||||
|
static inline void *arch_kmsan_get_meta_or_null(void *addr, bool is_origin)
|
||||||
|
{
|
||||||
|
unsigned long addr64 = (unsigned long)addr;
|
||||||
|
char *metadata_array;
|
||||||
|
unsigned long off;
|
||||||
|
int cpu;
|
||||||
|
|
||||||
|
if ((addr64 < CPU_ENTRY_AREA_BASE) ||
|
||||||
|
(addr64 >= (CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE)))
|
||||||
|
return NULL;
|
||||||
|
cpu = (addr64 - CPU_ENTRY_AREA_BASE) / CPU_ENTRY_AREA_SIZE;
|
||||||
|
off = addr64 - (unsigned long)get_cpu_entry_area(cpu);
|
||||||
|
if ((off < 0) || (off >= CPU_ENTRY_AREA_SIZE))
|
||||||
|
return NULL;
|
||||||
|
metadata_array = is_origin ? cpu_entry_area_origin :
|
||||||
|
cpu_entry_area_shadow;
|
||||||
|
return &per_cpu(metadata_array[off], cpu);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Taken from arch/x86/mm/physaddr.h to avoid using an instrumented version.
|
||||||
|
*/
|
||||||
|
static inline bool kmsan_phys_addr_valid(unsigned long addr)
|
||||||
|
{
|
||||||
|
if (IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
|
||||||
|
return !(addr >> boot_cpu_data.x86_phys_bits);
|
||||||
|
else
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Taken from arch/x86/mm/physaddr.c to avoid using an instrumented version.
|
||||||
|
*/
|
||||||
|
static inline bool kmsan_virt_addr_valid(void *addr)
|
||||||
|
{
|
||||||
|
unsigned long x = (unsigned long)addr;
|
||||||
|
unsigned long y = x - __START_KERNEL_map;
|
||||||
|
|
||||||
|
/* use the carry flag to determine if x was < __START_KERNEL_map */
|
||||||
|
if (unlikely(x > y)) {
|
||||||
|
x = y + phys_base;
|
||||||
|
|
||||||
|
if (y >= KERNEL_IMAGE_SIZE)
|
||||||
|
return false;
|
||||||
|
} else {
|
||||||
|
x = y + (__START_KERNEL_map - PAGE_OFFSET);
|
||||||
|
|
||||||
|
/* carry flag will be set if starting x was >= PAGE_OFFSET */
|
||||||
|
if ((x > y) || !kmsan_phys_addr_valid(x))
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return pfn_valid(x >> PAGE_SHIFT);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif /* !MODULE */
|
||||||
|
|
||||||
|
#endif /* _ASM_X86_KMSAN_H */
|
@ -8,6 +8,8 @@
|
|||||||
#include <asm/cpufeatures.h>
|
#include <asm/cpufeatures.h>
|
||||||
#include <asm/alternative.h>
|
#include <asm/alternative.h>
|
||||||
|
|
||||||
|
#include <linux/kmsan-checks.h>
|
||||||
|
|
||||||
/* duplicated to the one in bootmem.h */
|
/* duplicated to the one in bootmem.h */
|
||||||
extern unsigned long max_pfn;
|
extern unsigned long max_pfn;
|
||||||
extern unsigned long phys_base;
|
extern unsigned long phys_base;
|
||||||
@ -47,6 +49,11 @@ void clear_page_erms(void *page);
|
|||||||
|
|
||||||
static inline void clear_page(void *page)
|
static inline void clear_page(void *page)
|
||||||
{
|
{
|
||||||
|
/*
|
||||||
|
* Clean up KMSAN metadata for the page being cleared. The assembly call
|
||||||
|
* below clobbers @page, so we perform unpoisoning before it.
|
||||||
|
*/
|
||||||
|
kmsan_unpoison_memory(page, PAGE_SIZE);
|
||||||
alternative_call_2(clear_page_orig,
|
alternative_call_2(clear_page_orig,
|
||||||
clear_page_rep, X86_FEATURE_REP_GOOD,
|
clear_page_rep, X86_FEATURE_REP_GOOD,
|
||||||
clear_page_erms, X86_FEATURE_ERMS,
|
clear_page_erms, X86_FEATURE_ERMS,
|
||||||
|
@ -256,10 +256,10 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
|
|||||||
/* We always extract/encode the offset by shifting it all the way up, and then down again */
|
/* We always extract/encode the offset by shifting it all the way up, and then down again */
|
||||||
#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT + SWP_TYPE_BITS)
|
#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT + SWP_TYPE_BITS)
|
||||||
|
|
||||||
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
|
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
|
||||||
#define __swp_type(x) (((x).val) & 0x1f)
|
#define __swp_type(x) (((x).val) & ((1UL << SWP_TYPE_BITS) - 1))
|
||||||
#define __swp_offset(x) ((x).val >> 5)
|
#define __swp_offset(x) ((x).val >> SWP_TYPE_BITS)
|
||||||
#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
|
#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << SWP_TYPE_BITS})
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Normally, __swp_entry() converts from arch-independent swp_entry_t to
|
* Normally, __swp_entry() converts from arch-independent swp_entry_t to
|
||||||
|
@ -815,7 +815,8 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
|
|||||||
|
|
||||||
static inline int pmd_bad(pmd_t pmd)
|
static inline int pmd_bad(pmd_t pmd)
|
||||||
{
|
{
|
||||||
return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
|
return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) !=
|
||||||
|
(_KERNPG_TABLE & ~_PAGE_ACCESSED);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline unsigned long pages_to_mb(unsigned long npg)
|
static inline unsigned long pages_to_mb(unsigned long npg)
|
||||||
@ -1431,10 +1432,10 @@ static inline bool arch_has_pfn_modify_check(void)
|
|||||||
return boot_cpu_has_bug(X86_BUG_L1TF);
|
return boot_cpu_has_bug(X86_BUG_L1TF);
|
||||||
}
|
}
|
||||||
|
|
||||||
#define arch_faults_on_old_pte arch_faults_on_old_pte
|
#define arch_has_hw_pte_young arch_has_hw_pte_young
|
||||||
static inline bool arch_faults_on_old_pte(void)
|
static inline bool arch_has_hw_pte_young(void)
|
||||||
{
|
{
|
||||||
return false;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CONFIG_PAGE_TABLE_CHECK
|
#ifdef CONFIG_PAGE_TABLE_CHECK
|
||||||
|
@ -139,7 +139,52 @@ extern unsigned int ptrs_per_p4d;
|
|||||||
# define VMEMMAP_START __VMEMMAP_BASE_L4
|
# define VMEMMAP_START __VMEMMAP_BASE_L4
|
||||||
#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
|
#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
|
||||||
|
|
||||||
#define VMALLOC_END (VMALLOC_START + (VMALLOC_SIZE_TB << 40) - 1)
|
/*
|
||||||
|
* End of the region for which vmalloc page tables are pre-allocated.
|
||||||
|
* For non-KMSAN builds, this is the same as VMALLOC_END.
|
||||||
|
* For KMSAN builds, VMALLOC_START..VMEMORY_END is 4 times bigger than
|
||||||
|
* VMALLOC_START..VMALLOC_END (see below).
|
||||||
|
*/
|
||||||
|
#define VMEMORY_END (VMALLOC_START + (VMALLOC_SIZE_TB << 40) - 1)
|
||||||
|
|
||||||
|
#ifndef CONFIG_KMSAN
|
||||||
|
#define VMALLOC_END VMEMORY_END
|
||||||
|
#else
|
||||||
|
/*
|
||||||
|
* In KMSAN builds vmalloc area is four times smaller, and the remaining 3/4
|
||||||
|
* are used to keep the metadata for virtual pages. The memory formerly
|
||||||
|
* belonging to vmalloc area is now laid out as follows:
|
||||||
|
*
|
||||||
|
* 1st quarter: VMALLOC_START to VMALLOC_END - new vmalloc area
|
||||||
|
* 2nd quarter: KMSAN_VMALLOC_SHADOW_START to
|
||||||
|
* VMALLOC_END+KMSAN_VMALLOC_SHADOW_OFFSET - vmalloc area shadow
|
||||||
|
* 3rd quarter: KMSAN_VMALLOC_ORIGIN_START to
|
||||||
|
* VMALLOC_END+KMSAN_VMALLOC_ORIGIN_OFFSET - vmalloc area origins
|
||||||
|
* 4th quarter: KMSAN_MODULES_SHADOW_START to KMSAN_MODULES_ORIGIN_START
|
||||||
|
* - shadow for modules,
|
||||||
|
* KMSAN_MODULES_ORIGIN_START to
|
||||||
|
* KMSAN_MODULES_ORIGIN_START + MODULES_LEN - origins for modules.
|
||||||
|
*/
|
||||||
|
#define VMALLOC_QUARTER_SIZE ((VMALLOC_SIZE_TB << 40) >> 2)
|
||||||
|
#define VMALLOC_END (VMALLOC_START + VMALLOC_QUARTER_SIZE - 1)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* vmalloc metadata addresses are calculated by adding shadow/origin offsets
|
||||||
|
* to vmalloc address.
|
||||||
|
*/
|
||||||
|
#define KMSAN_VMALLOC_SHADOW_OFFSET VMALLOC_QUARTER_SIZE
|
||||||
|
#define KMSAN_VMALLOC_ORIGIN_OFFSET (VMALLOC_QUARTER_SIZE << 1)
|
||||||
|
|
||||||
|
#define KMSAN_VMALLOC_SHADOW_START (VMALLOC_START + KMSAN_VMALLOC_SHADOW_OFFSET)
|
||||||
|
#define KMSAN_VMALLOC_ORIGIN_START (VMALLOC_START + KMSAN_VMALLOC_ORIGIN_OFFSET)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The shadow/origin for modules are placed one by one in the last 1/4 of
|
||||||
|
* vmalloc space.
|
||||||
|
*/
|
||||||
|
#define KMSAN_MODULES_SHADOW_START (VMALLOC_END + KMSAN_VMALLOC_ORIGIN_OFFSET + 1)
|
||||||
|
#define KMSAN_MODULES_ORIGIN_START (KMSAN_MODULES_SHADOW_START + MODULES_LEN)
|
||||||
|
#endif /* CONFIG_KMSAN */
|
||||||
|
|
||||||
#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
|
#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
|
||||||
/* The module sections ends with the start of the fixmap */
|
/* The module sections ends with the start of the fixmap */
|
||||||
|
@ -2,6 +2,8 @@
|
|||||||
#ifndef _ASM_X86_SPARSEMEM_H
|
#ifndef _ASM_X86_SPARSEMEM_H
|
||||||
#define _ASM_X86_SPARSEMEM_H
|
#define _ASM_X86_SPARSEMEM_H
|
||||||
|
|
||||||
|
#include <linux/types.h>
|
||||||
|
|
||||||
#ifdef CONFIG_SPARSEMEM
|
#ifdef CONFIG_SPARSEMEM
|
||||||
/*
|
/*
|
||||||
* generic non-linear memory support:
|
* generic non-linear memory support:
|
||||||
|
@ -11,11 +11,23 @@
|
|||||||
function. */
|
function. */
|
||||||
|
|
||||||
#define __HAVE_ARCH_MEMCPY 1
|
#define __HAVE_ARCH_MEMCPY 1
|
||||||
|
#if defined(__SANITIZE_MEMORY__)
|
||||||
|
#undef memcpy
|
||||||
|
void *__msan_memcpy(void *dst, const void *src, size_t size);
|
||||||
|
#define memcpy __msan_memcpy
|
||||||
|
#else
|
||||||
extern void *memcpy(void *to, const void *from, size_t len);
|
extern void *memcpy(void *to, const void *from, size_t len);
|
||||||
|
#endif
|
||||||
extern void *__memcpy(void *to, const void *from, size_t len);
|
extern void *__memcpy(void *to, const void *from, size_t len);
|
||||||
|
|
||||||
#define __HAVE_ARCH_MEMSET
|
#define __HAVE_ARCH_MEMSET
|
||||||
|
#if defined(__SANITIZE_MEMORY__)
|
||||||
|
extern void *__msan_memset(void *s, int c, size_t n);
|
||||||
|
#undef memset
|
||||||
|
#define memset __msan_memset
|
||||||
|
#else
|
||||||
void *memset(void *s, int c, size_t n);
|
void *memset(void *s, int c, size_t n);
|
||||||
|
#endif
|
||||||
void *__memset(void *s, int c, size_t n);
|
void *__memset(void *s, int c, size_t n);
|
||||||
|
|
||||||
#define __HAVE_ARCH_MEMSET16
|
#define __HAVE_ARCH_MEMSET16
|
||||||
@ -55,7 +67,13 @@ static inline void *memset64(uint64_t *s, uint64_t v, size_t n)
|
|||||||
}
|
}
|
||||||
|
|
||||||
#define __HAVE_ARCH_MEMMOVE
|
#define __HAVE_ARCH_MEMMOVE
|
||||||
|
#if defined(__SANITIZE_MEMORY__)
|
||||||
|
#undef memmove
|
||||||
|
void *__msan_memmove(void *dest, const void *src, size_t len);
|
||||||
|
#define memmove __msan_memmove
|
||||||
|
#else
|
||||||
void *memmove(void *dest, const void *src, size_t count);
|
void *memmove(void *dest, const void *src, size_t count);
|
||||||
|
#endif
|
||||||
void *__memmove(void *dest, const void *src, size_t count);
|
void *__memmove(void *dest, const void *src, size_t count);
|
||||||
|
|
||||||
int memcmp(const void *cs, const void *ct, size_t count);
|
int memcmp(const void *cs, const void *ct, size_t count);
|
||||||
@ -64,8 +82,7 @@ char *strcpy(char *dest, const char *src);
|
|||||||
char *strcat(char *dest, const char *src);
|
char *strcat(char *dest, const char *src);
|
||||||
int strcmp(const char *cs, const char *ct);
|
int strcmp(const char *cs, const char *ct);
|
||||||
|
|
||||||
#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
|
#if (defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__))
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* For files that not instrumented (e.g. mm/slub.c) we
|
* For files that not instrumented (e.g. mm/slub.c) we
|
||||||
* should use not instrumented version of mem* functions.
|
* should use not instrumented version of mem* functions.
|
||||||
@ -73,7 +90,9 @@ int strcmp(const char *cs, const char *ct);
|
|||||||
|
|
||||||
#undef memcpy
|
#undef memcpy
|
||||||
#define memcpy(dst, src, len) __memcpy(dst, src, len)
|
#define memcpy(dst, src, len) __memcpy(dst, src, len)
|
||||||
|
#undef memmove
|
||||||
#define memmove(dst, src, len) __memmove(dst, src, len)
|
#define memmove(dst, src, len) __memmove(dst, src, len)
|
||||||
|
#undef memset
|
||||||
#define memset(s, c, n) __memset(s, c, n)
|
#define memset(s, c, n) __memset(s, c, n)
|
||||||
|
|
||||||
#ifndef __NO_FORTIFY
|
#ifndef __NO_FORTIFY
|
||||||
|
@ -5,6 +5,7 @@
|
|||||||
* User space memory access functions
|
* User space memory access functions
|
||||||
*/
|
*/
|
||||||
#include <linux/compiler.h>
|
#include <linux/compiler.h>
|
||||||
|
#include <linux/instrumented.h>
|
||||||
#include <linux/kasan-checks.h>
|
#include <linux/kasan-checks.h>
|
||||||
#include <linux/string.h>
|
#include <linux/string.h>
|
||||||
#include <asm/asm.h>
|
#include <asm/asm.h>
|
||||||
@ -103,6 +104,7 @@ extern int __get_user_bad(void);
|
|||||||
: "=a" (__ret_gu), "=r" (__val_gu), \
|
: "=a" (__ret_gu), "=r" (__val_gu), \
|
||||||
ASM_CALL_CONSTRAINT \
|
ASM_CALL_CONSTRAINT \
|
||||||
: "0" (ptr), "i" (sizeof(*(ptr)))); \
|
: "0" (ptr), "i" (sizeof(*(ptr)))); \
|
||||||
|
instrument_get_user(__val_gu); \
|
||||||
(x) = (__force __typeof__(*(ptr))) __val_gu; \
|
(x) = (__force __typeof__(*(ptr))) __val_gu; \
|
||||||
__builtin_expect(__ret_gu, 0); \
|
__builtin_expect(__ret_gu, 0); \
|
||||||
})
|
})
|
||||||
@ -192,9 +194,11 @@ extern void __put_user_nocheck_8(void);
|
|||||||
int __ret_pu; \
|
int __ret_pu; \
|
||||||
void __user *__ptr_pu; \
|
void __user *__ptr_pu; \
|
||||||
register __typeof__(*(ptr)) __val_pu asm("%"_ASM_AX); \
|
register __typeof__(*(ptr)) __val_pu asm("%"_ASM_AX); \
|
||||||
__chk_user_ptr(ptr); \
|
__typeof__(*(ptr)) __x = (x); /* eval x once */ \
|
||||||
__ptr_pu = (ptr); \
|
__typeof__(ptr) __ptr = (ptr); /* eval ptr once */ \
|
||||||
__val_pu = (x); \
|
__chk_user_ptr(__ptr); \
|
||||||
|
__ptr_pu = __ptr; \
|
||||||
|
__val_pu = __x; \
|
||||||
asm volatile("call __" #fn "_%P[size]" \
|
asm volatile("call __" #fn "_%P[size]" \
|
||||||
: "=c" (__ret_pu), \
|
: "=c" (__ret_pu), \
|
||||||
ASM_CALL_CONSTRAINT \
|
ASM_CALL_CONSTRAINT \
|
||||||
@ -202,6 +206,7 @@ extern void __put_user_nocheck_8(void);
|
|||||||
"r" (__val_pu), \
|
"r" (__val_pu), \
|
||||||
[size] "i" (sizeof(*(ptr))) \
|
[size] "i" (sizeof(*(ptr))) \
|
||||||
:"ebx"); \
|
:"ebx"); \
|
||||||
|
instrument_put_user(__x, __ptr, sizeof(*(ptr))); \
|
||||||
__builtin_expect(__ret_pu, 0); \
|
__builtin_expect(__ret_pu, 0); \
|
||||||
})
|
})
|
||||||
|
|
||||||
@ -248,23 +253,25 @@ extern void __put_user_nocheck_8(void);
|
|||||||
|
|
||||||
#define __put_user_size(x, ptr, size, label) \
|
#define __put_user_size(x, ptr, size, label) \
|
||||||
do { \
|
do { \
|
||||||
|
__typeof__(*(ptr)) __x = (x); /* eval x once */ \
|
||||||
__chk_user_ptr(ptr); \
|
__chk_user_ptr(ptr); \
|
||||||
switch (size) { \
|
switch (size) { \
|
||||||
case 1: \
|
case 1: \
|
||||||
__put_user_goto(x, ptr, "b", "iq", label); \
|
__put_user_goto(__x, ptr, "b", "iq", label); \
|
||||||
break; \
|
break; \
|
||||||
case 2: \
|
case 2: \
|
||||||
__put_user_goto(x, ptr, "w", "ir", label); \
|
__put_user_goto(__x, ptr, "w", "ir", label); \
|
||||||
break; \
|
break; \
|
||||||
case 4: \
|
case 4: \
|
||||||
__put_user_goto(x, ptr, "l", "ir", label); \
|
__put_user_goto(__x, ptr, "l", "ir", label); \
|
||||||
break; \
|
break; \
|
||||||
case 8: \
|
case 8: \
|
||||||
__put_user_goto_u64(x, ptr, label); \
|
__put_user_goto_u64(__x, ptr, label); \
|
||||||
break; \
|
break; \
|
||||||
default: \
|
default: \
|
||||||
__put_user_bad(); \
|
__put_user_bad(); \
|
||||||
} \
|
} \
|
||||||
|
instrument_put_user(__x, ptr, size); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
|
#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
|
||||||
@ -305,6 +312,7 @@ do { \
|
|||||||
default: \
|
default: \
|
||||||
(x) = __get_user_bad(); \
|
(x) = __get_user_bad(); \
|
||||||
} \
|
} \
|
||||||
|
instrument_get_user(x); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#define __get_user_asm(x, addr, itype, ltype, label) \
|
#define __get_user_asm(x, addr, itype, ltype, label) \
|
||||||
|
@ -29,6 +29,8 @@ KASAN_SANITIZE_sev.o := n
|
|||||||
# With some compiler versions the generated code results in boot hangs, caused
|
# With some compiler versions the generated code results in boot hangs, caused
|
||||||
# by several compilation units. To be safe, disable all instrumentation.
|
# by several compilation units. To be safe, disable all instrumentation.
|
||||||
KCSAN_SANITIZE := n
|
KCSAN_SANITIZE := n
|
||||||
|
KMSAN_SANITIZE_head$(BITS).o := n
|
||||||
|
KMSAN_SANITIZE_nmi.o := n
|
||||||
|
|
||||||
# If instrumentation of this dir is enabled, boot hangs during first second.
|
# If instrumentation of this dir is enabled, boot hangs during first second.
|
||||||
# Probably could be more selective here, but note that files related to irqs,
|
# Probably could be more selective here, but note that files related to irqs,
|
||||||
|
@ -12,6 +12,7 @@ endif
|
|||||||
# If these files are instrumented, boot hangs during the first second.
|
# If these files are instrumented, boot hangs during the first second.
|
||||||
KCOV_INSTRUMENT_common.o := n
|
KCOV_INSTRUMENT_common.o := n
|
||||||
KCOV_INSTRUMENT_perf_event.o := n
|
KCOV_INSTRUMENT_perf_event.o := n
|
||||||
|
KMSAN_SANITIZE_common.o := n
|
||||||
|
|
||||||
# As above, instrumenting secondary CPU boot code causes boot hangs.
|
# As above, instrumenting secondary CPU boot code causes boot hangs.
|
||||||
KCSAN_SANITIZE_common.o := n
|
KCSAN_SANITIZE_common.o := n
|
||||||
|
@ -177,6 +177,12 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This function reads pointers from the stack and dereferences them. The
|
||||||
|
* pointers may not have their KMSAN shadow set up properly, which may result
|
||||||
|
* in false positive reports. Disable instrumentation to avoid those.
|
||||||
|
*/
|
||||||
|
__no_kmsan_checks
|
||||||
static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
|
static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
|
||||||
unsigned long *stack, const char *log_lvl)
|
unsigned long *stack, const char *log_lvl)
|
||||||
{
|
{
|
||||||
|
@ -553,6 +553,7 @@ void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32)
|
|||||||
* Kprobes not supported here. Set the probe on schedule instead.
|
* Kprobes not supported here. Set the probe on schedule instead.
|
||||||
* Function graph tracer not supported too.
|
* Function graph tracer not supported too.
|
||||||
*/
|
*/
|
||||||
|
__no_kmsan_checks
|
||||||
__visible __notrace_funcgraph struct task_struct *
|
__visible __notrace_funcgraph struct task_struct *
|
||||||
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
|
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
|
||||||
{
|
{
|
||||||
|
@ -95,7 +95,7 @@ void __init tboot_probe(void)
|
|||||||
|
|
||||||
static pgd_t *tboot_pg_dir;
|
static pgd_t *tboot_pg_dir;
|
||||||
static struct mm_struct tboot_mm = {
|
static struct mm_struct tboot_mm = {
|
||||||
.mm_rb = RB_ROOT,
|
.mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, tboot_mm.mmap_lock),
|
||||||
.pgd = swapper_pg_dir,
|
.pgd = swapper_pg_dir,
|
||||||
.mm_users = ATOMIC_INIT(2),
|
.mm_users = ATOMIC_INIT(2),
|
||||||
.mm_count = ATOMIC_INIT(1),
|
.mm_count = ATOMIC_INIT(1),
|
||||||
|
@ -183,6 +183,16 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
* While walking the stack, KMSAN may stomp on stale locals from other
|
||||||
|
* functions that were marked as uninitialized upon function exit, and
|
||||||
|
* now hold the call frame information for the current function (e.g. the frame
|
||||||
|
* pointer). Because KMSAN does not specifically mark call frames as
|
||||||
|
* initialized, false positive reports are possible. To prevent such reports,
|
||||||
|
* we mark the functions scanning the stack (here and below) with
|
||||||
|
* __no_kmsan_checks.
|
||||||
|
*/
|
||||||
|
__no_kmsan_checks
|
||||||
static bool update_stack_state(struct unwind_state *state,
|
static bool update_stack_state(struct unwind_state *state,
|
||||||
unsigned long *next_bp)
|
unsigned long *next_bp)
|
||||||
{
|
{
|
||||||
@ -250,6 +260,7 @@ static bool update_stack_state(struct unwind_state *state,
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__no_kmsan_checks
|
||||||
bool unwind_next_frame(struct unwind_state *state)
|
bool unwind_next_frame(struct unwind_state *state)
|
||||||
{
|
{
|
||||||
struct pt_regs *regs;
|
struct pt_regs *regs;
|
||||||
|
@ -65,7 +65,9 @@ ifneq ($(CONFIG_X86_CMPXCHG64),y)
|
|||||||
endif
|
endif
|
||||||
else
|
else
|
||||||
obj-y += iomap_copy_64.o
|
obj-y += iomap_copy_64.o
|
||||||
|
ifneq ($(CONFIG_GENERIC_CSUM),y)
|
||||||
lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
|
lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
|
||||||
|
endif
|
||||||
lib-y += clear_page_64.o copy_page_64.o
|
lib-y += clear_page_64.o copy_page_64.o
|
||||||
lib-y += memmove_64.o memset_64.o
|
lib-y += memmove_64.o memset_64.o
|
||||||
lib-y += copy_user_64.o
|
lib-y += copy_user_64.o
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
#include <linux/string.h>
|
#include <linux/string.h>
|
||||||
#include <linux/module.h>
|
#include <linux/module.h>
|
||||||
#include <linux/io.h>
|
#include <linux/io.h>
|
||||||
|
#include <linux/kmsan-checks.h>
|
||||||
|
|
||||||
#define movs(type,to,from) \
|
#define movs(type,to,from) \
|
||||||
asm volatile("movs" type:"=&D" (to), "=&S" (from):"0" (to), "1" (from):"memory")
|
asm volatile("movs" type:"=&D" (to), "=&S" (from):"0" (to), "1" (from):"memory")
|
||||||
@ -37,6 +38,8 @@ static void string_memcpy_fromio(void *to, const volatile void __iomem *from, si
|
|||||||
n-=2;
|
n-=2;
|
||||||
}
|
}
|
||||||
rep_movs(to, (const void *)from, n);
|
rep_movs(to, (const void *)from, n);
|
||||||
|
/* KMSAN must treat values read from devices as initialized. */
|
||||||
|
kmsan_unpoison_memory(to, n);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void string_memcpy_toio(volatile void __iomem *to, const void *from, size_t n)
|
static void string_memcpy_toio(volatile void __iomem *to, const void *from, size_t n)
|
||||||
@ -44,6 +47,8 @@ static void string_memcpy_toio(volatile void __iomem *to, const void *from, size
|
|||||||
if (unlikely(!n))
|
if (unlikely(!n))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
/* Make sure uninitialized memory isn't copied to devices. */
|
||||||
|
kmsan_check_memory(from, n);
|
||||||
/* Align any unaligned destination IO */
|
/* Align any unaligned destination IO */
|
||||||
if (unlikely(1 & (unsigned long)to)) {
|
if (unlikely(1 & (unsigned long)to)) {
|
||||||
movs("b", to, from);
|
movs("b", to, from);
|
||||||
|
@ -14,6 +14,8 @@ KASAN_SANITIZE_pgprot.o := n
|
|||||||
# Disable KCSAN entirely, because otherwise we get warnings that some functions
|
# Disable KCSAN entirely, because otherwise we get warnings that some functions
|
||||||
# reference __initdata sections.
|
# reference __initdata sections.
|
||||||
KCSAN_SANITIZE := n
|
KCSAN_SANITIZE := n
|
||||||
|
# Avoid recursion by not calling KMSAN hooks for CEA code.
|
||||||
|
KMSAN_SANITIZE_cpu_entry_area.o := n
|
||||||
|
|
||||||
ifdef CONFIG_FUNCTION_TRACER
|
ifdef CONFIG_FUNCTION_TRACER
|
||||||
CFLAGS_REMOVE_mem_encrypt.o = -pg
|
CFLAGS_REMOVE_mem_encrypt.o = -pg
|
||||||
@ -44,6 +46,9 @@ obj-$(CONFIG_HIGHMEM) += highmem_32.o
|
|||||||
KASAN_SANITIZE_kasan_init_$(BITS).o := n
|
KASAN_SANITIZE_kasan_init_$(BITS).o := n
|
||||||
obj-$(CONFIG_KASAN) += kasan_init_$(BITS).o
|
obj-$(CONFIG_KASAN) += kasan_init_$(BITS).o
|
||||||
|
|
||||||
|
KMSAN_SANITIZE_kmsan_shadow.o := n
|
||||||
|
obj-$(CONFIG_KMSAN) += kmsan_shadow.o
|
||||||
|
|
||||||
obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
|
obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
|
||||||
mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
|
mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
|
||||||
obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
|
obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
|
||||||
|
@ -260,7 +260,7 @@ static noinline int vmalloc_fault(unsigned long address)
|
|||||||
}
|
}
|
||||||
NOKPROBE_SYMBOL(vmalloc_fault);
|
NOKPROBE_SYMBOL(vmalloc_fault);
|
||||||
|
|
||||||
void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
|
static void __arch_sync_kernel_mappings(unsigned long start, unsigned long end)
|
||||||
{
|
{
|
||||||
unsigned long addr;
|
unsigned long addr;
|
||||||
|
|
||||||
@ -284,6 +284,27 @@ void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
|
||||||
|
{
|
||||||
|
__arch_sync_kernel_mappings(start, end);
|
||||||
|
#ifdef CONFIG_KMSAN
|
||||||
|
/*
|
||||||
|
* KMSAN maintains two additional metadata page mappings for the
|
||||||
|
* [VMALLOC_START, VMALLOC_END) range. These mappings start at
|
||||||
|
* KMSAN_VMALLOC_SHADOW_START and KMSAN_VMALLOC_ORIGIN_START and
|
||||||
|
* have to be synced together with the vmalloc memory mapping.
|
||||||
|
*/
|
||||||
|
if (start >= VMALLOC_START && end < VMALLOC_END) {
|
||||||
|
__arch_sync_kernel_mappings(
|
||||||
|
start - VMALLOC_START + KMSAN_VMALLOC_SHADOW_START,
|
||||||
|
end - VMALLOC_START + KMSAN_VMALLOC_SHADOW_START);
|
||||||
|
__arch_sync_kernel_mappings(
|
||||||
|
start - VMALLOC_START + KMSAN_VMALLOC_ORIGIN_START,
|
||||||
|
end - VMALLOC_START + KMSAN_VMALLOC_ORIGIN_START);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
static bool low_pfn(unsigned long pfn)
|
static bool low_pfn(unsigned long pfn)
|
||||||
{
|
{
|
||||||
return pfn < max_low_pfn;
|
return pfn < max_low_pfn;
|
||||||
|
@ -1054,7 +1054,7 @@ void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
|
|||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CONFIG_SWAP
|
#ifdef CONFIG_SWAP
|
||||||
unsigned long max_swapfile_size(void)
|
unsigned long arch_max_swapfile_size(void)
|
||||||
{
|
{
|
||||||
unsigned long pages;
|
unsigned long pages;
|
||||||
|
|
||||||
|
@ -1288,7 +1288,7 @@ static void __init preallocate_vmalloc_pages(void)
|
|||||||
unsigned long addr;
|
unsigned long addr;
|
||||||
const char *lvl;
|
const char *lvl;
|
||||||
|
|
||||||
for (addr = VMALLOC_START; addr <= VMALLOC_END; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
|
for (addr = VMALLOC_START; addr <= VMEMORY_END; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
|
||||||
pgd_t *pgd = pgd_offset_k(addr);
|
pgd_t *pgd = pgd_offset_k(addr);
|
||||||
p4d_t *p4d;
|
p4d_t *p4d;
|
||||||
pud_t *pud;
|
pud_t *pud;
|
||||||
|
@ -17,6 +17,7 @@
|
|||||||
#include <linux/cc_platform.h>
|
#include <linux/cc_platform.h>
|
||||||
#include <linux/efi.h>
|
#include <linux/efi.h>
|
||||||
#include <linux/pgtable.h>
|
#include <linux/pgtable.h>
|
||||||
|
#include <linux/kmsan.h>
|
||||||
|
|
||||||
#include <asm/set_memory.h>
|
#include <asm/set_memory.h>
|
||||||
#include <asm/e820/api.h>
|
#include <asm/e820/api.h>
|
||||||
@ -479,6 +480,8 @@ void iounmap(volatile void __iomem *addr)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
kmsan_iounmap_page_range((unsigned long)addr,
|
||||||
|
(unsigned long)addr + get_vm_area_size(p));
|
||||||
memtype_free(p->phys_addr, p->phys_addr + get_vm_area_size(p));
|
memtype_free(p->phys_addr, p->phys_addr + get_vm_area_size(p));
|
||||||
|
|
||||||
/* Finally remove it */
|
/* Finally remove it */
|
||||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user