diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram index 628a00fb20a9..1ef69e0271f9 100644 --- a/Documentation/ABI/testing/sysfs-block-zram +++ b/Documentation/ABI/testing/sysfs-block-zram @@ -151,3 +151,10 @@ Contact: Sergey Senozhatsky Description: The recompress file is write-only and triggers re-compression with secondary compression algorithms. + +What: /sys/block/zram/algorithm_params +Date: August 2024 +Contact: Sergey Senozhatsky +Description: + The algorithm_params file is write-only and is used to setup + compression algorithm parameters. diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst index 091e8bb38887..678d70d6e1c3 100644 --- a/Documentation/admin-guide/blockdev/zram.rst +++ b/Documentation/admin-guide/blockdev/zram.rst @@ -102,17 +102,41 @@ Examples:: #select lzo compression algorithm echo lzo > /sys/block/zram0/comp_algorithm -For the time being, the `comp_algorithm` content does not necessarily -show every compression algorithm supported by the kernel. We keep this -list primarily to simplify device configuration and one can configure -a new device with a compression algorithm that is not listed in -`comp_algorithm`. The thing is that, internally, ZRAM uses Crypto API -and, if some of the algorithms were built as modules, it's impossible -to list all of them using, for instance, /proc/crypto or any other -method. This, however, has an advantage of permitting the usage of -custom crypto compression modules (implementing S/W or H/W compression). +For the time being, the `comp_algorithm` content shows only compression +algorithms that are supported by zram. -4) Set Disksize +4) Set compression algorithm parameters: Optional +================================================= + +Compression algorithms may support specific parameters which can be +tweaked for particular dataset. ZRAM has an `algorithm_params` device +attribute which provides a per-algorithm params configuration. + +For example, several compression algorithms support `level` parameter. +In addition, certain compression algorithms support pre-trained dictionaries, +which significantly change algorithms' characteristics. In order to configure +compression algorithm to use external pre-trained dictionary, pass full +path to the `dict` along with other parameters:: + + #pass path to pre-trained zstd dictionary + echo "algo=zstd dict=/etc/dictioary" > /sys/block/zram0/algorithm_params + + #same, but using algorithm priority + echo "priority=1 dict=/etc/dictioary" > \ + /sys/block/zram0/algorithm_params + + #pass path to pre-trained zstd dictionary and compression level + echo "algo=zstd level=8 dict=/etc/dictioary" > \ + /sys/block/zram0/algorithm_params + +Parameters are algorithm specific: not all algorithms support pre-trained +dictionaries, not all algorithms support `level`. Furthermore, for certain +algorithms `level` controls the compression level (the higher the value the +better the compression ratio, it even can take negatives values for some +algorithms), for other algorithms `level` is acceleration level (the higher +the value the lower the compression ratio). + +5) Set Disksize =============== Set disk size by writing the value to sysfs node 'disksize'. @@ -132,7 +156,7 @@ There is little point creating a zram of greater than twice the size of memory since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the size of the disk when not in use so a huge zram is wasteful. -5) Set memory limit: Optional +6) Set memory limit: Optional ============================= Set memory limit by writing the value to sysfs node 'mem_limit'. @@ -151,7 +175,7 @@ Examples:: # To disable memory limit echo 0 > /sys/block/zram0/mem_limit -6) Activate +7) Activate =========== :: @@ -162,7 +186,7 @@ Examples:: mkfs.ext4 /dev/zram1 mount /dev/zram1 /tmp -7) Add/remove zram devices +8) Add/remove zram devices ========================== zram provides a control interface, which enables dynamic (on-demand) device @@ -182,7 +206,7 @@ execute:: echo X > /sys/class/zram-control/hot_remove -8) Stats +9) Stats ======== Per-device statistics are exported as various nodes under /sys/block/zram/ @@ -205,6 +229,7 @@ writeback_limit_enable RW show and set writeback_limit feature max_comp_streams RW the number of possible concurrent compress operations comp_algorithm RW show and change the compression algorithm +algorithm_params WO setup compression algorithm parameters compact WO trigger memory compaction debug_stat RO this file is used for zram debugging purposes backing_dev RW set up backend storage for zram to write out @@ -283,15 +308,15 @@ a single line of text and contains the following stats separated by whitespace: Unit: 4K bytes ============== ============================================================= -9) Deactivate -============= +10) Deactivate +============== :: swapoff /dev/zram0 umount /dev/zram1 -10) Reset +11) Reset ========= Write any positive value to 'reset' sysfs node:: @@ -487,11 +512,14 @@ registered compression algorithms, increases our chances of finding the algorithm that successfully compresses a particular page. Sometimes, however, it is convenient (and sometimes even necessary) to limit recompression to only one particular algorithm so that it will not try any other algorithms. -This can be achieved by providing a algo=NAME parameter::: +This can be achieved by providing a `algo` or `priority` parameter::: #use zstd algorithm only (if registered) echo "type=huge algo=zstd" > /sys/block/zramX/recompress + #use zstd algorithm only (if zstd was registered under priority 1) + echo "type=huge priority=1" > /sys/block/zramX/recompress + memory tracking =============== diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 9cde26d33843..270501db9f4e 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -78,18 +78,24 @@ Brief summary of control files. memory.memsw.max_usage_in_bytes show max memory+Swap usage recorded memory.soft_limit_in_bytes set/show soft limit of memory usage This knob is not available on CONFIG_PREEMPT_RT systems. + This knob is deprecated and shouldn't be + used. memory.stat show various statistics memory.use_hierarchy set/show hierarchical account enabled This knob is deprecated and shouldn't be used. memory.force_empty trigger forced page reclaim memory.pressure_level set memory pressure notifications + This knob is deprecated and shouldn't be + used. memory.swappiness set/show swappiness parameter of vmscan (See sysctl's vm.swappiness) memory.move_charge_at_immigrate set/show controls of moving charges This knob is deprecated and shouldn't be used. memory.oom_control set/show oom controls. + This knob is deprecated and shouldn't be + used. memory.numa_stat show the number of memory usage per numa node memory.kmem.limit_in_bytes Deprecated knob to set and read the kernel @@ -105,10 +111,18 @@ Brief summary of control files. memory.kmem.max_usage_in_bytes show max kernel memory usage recorded memory.kmem.tcp.limit_in_bytes set/show hard limit for tcp buf memory + This knob is deprecated and shouldn't be + used. memory.kmem.tcp.usage_in_bytes show current tcp buf memory allocation + This knob is deprecated and shouldn't be + used. memory.kmem.tcp.failcnt show the number of tcp buf memory usage hits limits + This knob is deprecated and shouldn't be + used. memory.kmem.tcp.max_usage_in_bytes show max tcp buf memory usage recorded + This knob is deprecated and shouldn't be + used. ==================================== ========================================== 1. History @@ -693,8 +707,10 @@ For compatibility reasons writing 1 to memory.use_hierarchy will always pass:: # echo 1 > memory.use_hierarchy -7. Soft limits -============== +7. Soft limits (DEPRECATED) +=========================== + +THIS IS DEPRECATED! Soft limits allow for greater sharing of memory. The idea behind soft limits is to allow control groups to use as much of the memory as needed, provided @@ -834,8 +850,10 @@ It's applicable for root and non-root cgroup. .. _cgroup-v1-memory-oom-control: -10. OOM Control -=============== +10. OOM Control (DEPRECATED) +============================ + +THIS IS DEPRECATED! memory.oom_control file is for OOM notification and other controls. @@ -882,8 +900,10 @@ At reading, current status of OOM is shown. The number of processes belonging to this cgroup killed by any kind of OOM killer. -11. Memory Pressure -=================== +11. Memory Pressure (DEPRECATED) +================================ + +THIS IS DEPRECATED! The pressure level notifications can be used to monitor the memory allocation cost; based on the pressure, applications can implement diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index c65756afd372..69af2173555f 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1343,11 +1343,14 @@ The following nested keys are defined. all the existing limitations and potential future extensions. memory.peak - A read-only single value file which exists on non-root - cgroups. + A read-write single value file which exists on non-root cgroups. - The max memory usage recorded for the cgroup and its - descendants since the creation of the cgroup. + The max memory usage recorded for the cgroup and its descendants since + either the creation of the cgroup or the most recent reset for that FD. + + A write of any non-empty string to this file resets it to the + current memory usage for subsequent reads through the same + file descriptor. memory.oom.group A read-write single value file which exists on non-root @@ -1624,6 +1627,25 @@ The following nested keys are defined. Usually because failed to allocate some continuous swap space for the huge page. + numa_pages_migrated (npn) + Number of pages migrated by NUMA balancing. + + numa_pte_updates (npn) + Number of pages whose page table entries are modified by + NUMA balancing to produce NUMA hinting faults on access. + + numa_hint_faults (npn) + Number of NUMA hinting faults. + + pgdemote_kswapd + Number of pages demoted by kswapd. + + pgdemote_direct + Number of pages demoted directly. + + pgdemote_khugepaged + Number of pages demoted by khugepaged. + memory.numa_stat A read-only nested-keyed file which exists on non-root cgroups. @@ -1673,11 +1695,14 @@ The following nested keys are defined. Healthy workloads are not expected to reach this limit. memory.swap.peak - A read-only single value file which exists on non-root - cgroups. + A read-write single value file which exists on non-root cgroups. - The max swap usage recorded for the cgroup and its - descendants since the creation of the cgroup. + The max swap usage recorded for the cgroup and its descendants since + the creation of the cgroup or the most recent reset for that FD. + + A write of any non-empty string to this file resets it to the + current memory usage for subsequent reads through the same + file descriptor. memory.swap.max A read-write single value file which exists on non-root @@ -1741,6 +1766,8 @@ The following nested keys are defined. Note that this is subtly different from setting memory.swap.max to 0, as it still allows for pages to be written to the zswap pool. + This setting has no effect if zswap is disabled, and swapping + is allowed unless memory.swap.max is set to 0. memory.pressure A read-only nested-keyed file. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 8337d0fed311..19b71ff1168e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4152,6 +4152,21 @@ Disable NUMA, Only set up a single NUMA node spanning all memory. + numa=fake=[MG] + [KNL, ARM64, RISCV, X86, EARLY] + If given as a memory unit, fills all system RAM with + nodes of size interleaved over physical nodes. + + numa=fake= + [KNL, ARM64, RISCV, X86, EARLY] + If given as an integer, fills all system RAM with N + fake nodes interleaved over physical nodes. + + numa=fake=U + [KNL, ARM64, RISCV, X86, EARLY] + If given as an integer followed by 'U', it will + divide each physical node into N emulated nodes. + numa_balancing= [KNL,ARM64,PPC,RISCV,S390,X86] Enable or disable automatic NUMA balancing. Allowed values are enable and disable @@ -6655,6 +6670,15 @@ : poll all this frequency 0: no polling (default) + thp_anon= [KNL] + Format: ,[KMG]:;-[KMG]: + state is one of "always", "madvise", "never" or "inherit". + Control the default behavior of the system with respect + to anonymous transparent hugepages. + Can be used multiple times for multiple anon THP sizes. + See Documentation/admin-guide/mm/transhuge.rst for more + details. + threadirqs [KNL,EARLY] Force threading of all interrupt handlers except those marked explicitly IRQF_NO_THREAD. diff --git a/Documentation/admin-guide/mm/damon/start.rst b/Documentation/admin-guide/mm/damon/start.rst index 054010a7f3d8..c4dddf6733cd 100644 --- a/Documentation/admin-guide/mm/damon/start.rst +++ b/Documentation/admin-guide/mm/damon/start.rst @@ -7,7 +7,7 @@ Getting Started This document briefly describes how you can use DAMON by demonstrating its default user space tool. Please note that this document describes only a part of its features for brevity. Please refer to the usage `doc -`_ of the tool for more +`_ of the tool for more details. @@ -26,7 +26,7 @@ User Space Tool For the demonstration, we will use the default user space tool for DAMON, called DAMON Operator (DAMO). It is available at -https://github.com/awslabs/damo. The examples below assume that ``damo`` is on +https://github.com/damonitor/damo. The examples below assume that ``damo`` is on your ``$PATH``. It's not mandatory, though. Because DAMO is using the sysfs interface (refer to :doc:`usage` for the diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 26df6cfa4441..d9be9f7caa7d 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -7,19 +7,19 @@ Detailed Usages DAMON provides below interfaces for different users. - *DAMON user space tool.* - `This `_ is for privileged people such as + `This `_ is for privileged people such as system administrators who want a just-working human-friendly interface. Using this, users can use the DAMON’s major features in a human-friendly way. It may not be highly tuned for special cases, though. For more detail, please refer to its `usage document - `_. + `_. - *sysfs interface.* :ref:`This ` is for privileged user space programmers who want more optimized use of DAMON. Using this, users can use DAMON’s major features by reading from and writing to special sysfs files. Therefore, you can write and use your personalized DAMON sysfs wrapper programs that reads/writes the sysfs files instead of you. The `DAMON user space tool - `_ is one example of such programs. + `_ is one example of such programs. - *Kernel Space Programming Interface.* :doc:`This ` is for kernel space programmers. Using this, users can utilize every feature of DAMON most flexibly and efficiently by @@ -543,7 +543,7 @@ memory rate becomes larger than 60%, or lower than 30%". :: # echo 300 > watermarks/low Please note that it's highly recommended to use user space tools like `damo -`_ rather than manually reading and writing +`_ rather than manually reading and writing the files as above. Above is only for an example. .. _tracepoint: diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 058485daf186..cfdd16a52e39 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -202,6 +202,16 @@ PMD-mappable transparent hugepage:: cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size +All THPs at fault and collapse time will be added to _deferred_list, +and will therefore be split under memory presure if they are considered +"underused". A THP is underused if the number of zero-filled pages in +the THP is above max_ptes_none (see below). It is possible to disable +this behaviour by writing 0 to shrink_underused, and enable it by writing +1 to it:: + + echo 0 > /sys/kernel/mm/transparent_hugepage/shrink_underused + echo 1 > /sys/kernel/mm/transparent_hugepage/shrink_underused + khugepaged will be automatically started when PMD-sized THP is enabled (either of the per-size anon control or the top-level control are set to "always" or "madvise"), and it'll be automatically shutdown when @@ -284,13 +294,37 @@ that THP is shared. Exceeding the number would block the collapse:: A higher value may increase memory footprint for some workloads. -Boot parameter -============== +Boot parameters +=============== -You can change the sysfs boot time defaults of Transparent Hugepage -Support by passing the parameter ``transparent_hugepage=always`` or -``transparent_hugepage=madvise`` or ``transparent_hugepage=never`` -to the kernel command line. +You can change the sysfs boot time default for the top-level "enabled" +control by passing the parameter ``transparent_hugepage=always`` or +``transparent_hugepage=madvise`` or ``transparent_hugepage=never`` to the +kernel command line. + +Alternatively, each supported anonymous THP size can be controlled by +passing ``thp_anon=,[KMG]:;-[KMG]:``, +where ```` is the THP size (must be a power of 2 of PAGE_SIZE and +supported anonymous THP) and ```` is one of ``always``, ``madvise``, +``never`` or ``inherit``. + +For example, the following will set 16K, 32K, 64K THP to ``always``, +set 128K, 512K to ``inherit``, set 256K to ``madvise`` and 1M, 2M +to ``never``:: + + thp_anon=16K-64K:always;128K,512K:inherit;256K:madvise;1M-2M:never + +``thp_anon=`` may be specified multiple times to configure all THP sizes as +required. If ``thp_anon=`` is specified at least once, any anon THP sizes +not explicitly configured on the command line are implicitly set to +``never``. + +``transparent_hugepage`` setting only affects the global toggle. If +``thp_anon`` is not specified, PMD_ORDER THP will default to ``inherit``. +However, if a valid ``thp_anon`` setting is provided by the user, the +PMD_ORDER THP policy will be overridden. If the policy for PMD_ORDER +is not defined within a valid ``thp_anon``, its policy will default to +``never``. Hugepages in tmpfs/shmem ======================== @@ -447,6 +481,12 @@ thp_deferred_split_page splitting it would free up some memory. Pages on split queue are going to be split under memory pressure. +thp_underused_split_page + is incremented when a huge page on the split queue was split + because it was underused. A THP is underused if the number of + zero pages in the THP is above a certain threshold + (/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none). + thp_split_pmd is incremented every time a PMD split into table of PTEs. This can happen, for instance, when application calls mprotect() or @@ -527,6 +567,18 @@ split_deferred it would free up some memory. Pages on split queue are going to be split under memory pressure, if splitting is possible. +nr_anon + the number of anonymous THP we have in the whole system. These THPs + might be currently entirely mapped or have partially unmapped/unused + subpages. + +nr_anon_partially_mapped + the number of anonymous THP which are likely partially mapped, possibly + wasting memory, and have been queued for deferred memory reclamation. + Note that in corner some cases (e.g., failed migration), we might detect + an anonymous THP as "partially mapped" and count it here, even though it + is not actually partially mapped anymore. + As the system ages, allocating huge pages may be expensive as the system uses memory compaction to copy data around memory to free a huge page for use. There are some counters in ``/proc/vmstat`` to help diff --git a/Documentation/arch/x86/x86_64/boot-options.rst b/Documentation/arch/x86/x86_64/boot-options.rst index 137432d34109..98d4805f0823 100644 --- a/Documentation/arch/x86/x86_64/boot-options.rst +++ b/Documentation/arch/x86/x86_64/boot-options.rst @@ -170,18 +170,6 @@ NUMA Don't parse the HMAT table for NUMA setup, or soft-reserved memory partitioning. - numa=fake=[MG] - If given as a memory unit, fills all system RAM with nodes of - size interleaved over physical nodes. - - numa=fake= - If given as an integer, fills all system RAM with N fake nodes - interleaved over physical nodes. - - numa=fake=U - If given as an integer followed by 'U', it will divide each - physical node into N emulated nodes. - ACPI ==== diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst index 4451ef501936..14e093da3ccd 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -576,13 +576,12 @@ The field width is passed by value, the bitmap is passed by reference. Helper macros cpumask_pr_args() and nodemask_pr_args() are available to ease printing cpumask and nodemask. -Flags bitfields such as page flags, page_type, gfp_flags +Flags bitfields such as page flags and gfp_flags -------------------------------------------------------- :: %pGp 0x17ffffc0002036(referenced|uptodate|lru|active|private|node=0|zone=2|lastcpupid=0x1fffff) - %pGt 0xffffff7f(buddy) %pGg GFP_USER|GFP_DMA32|GFP_NOWARN %pGv read|exec|mayread|maywrite|mayexec|denywrite @@ -591,7 +590,6 @@ would construct the value. The type of flags is given by the third character. Currently supported are: - p - [p]age flags, expects value of type (``unsigned long *``) - - t - page [t]ype, expects value of type (``unsigned int *``) - v - [v]ma_flags, expects value of type (``unsigned long *``) - g - [g]fp_flags, expects value of type (``gfp_t *``) diff --git a/Documentation/dev-tools/kfence.rst b/Documentation/dev-tools/kfence.rst index 936f6aaa75c8..541899353865 100644 --- a/Documentation/dev-tools/kfence.rst +++ b/Documentation/dev-tools/kfence.rst @@ -53,6 +53,13 @@ configurable via the Kconfig option ``CONFIG_KFENCE_DEFERRABLE``. The KUnit test suite is very likely to fail when using a deferrable timer since it currently causes very unpredictable sample intervals. +By default KFENCE will only sample 1 heap allocation within each sample +interval. *Burst mode* allows to sample successive heap allocations, where the +kernel boot parameter ``kfence.burst`` can be set to a non-zero value which +denotes the *additional* successive allocations within a sample interval; +setting ``kfence.burst=N`` means that ``1 + N`` successive allocations are +attempted through KFENCE for each sample interval. + The KFENCE memory pool is of fixed size, and if the pool is exhausted, no further KFENCE allocations occur. With ``CONFIG_KFENCE_NUM_OBJECTS`` (default 255), the number of available guarded objects can be controlled. Each object diff --git a/Documentation/features/vm/PG_uncached/arch-support.txt b/Documentation/features/vm/PG_uncached/arch-support.txt deleted file mode 100644 index 5a7508b8c967..000000000000 --- a/Documentation/features/vm/PG_uncached/arch-support.txt +++ /dev/null @@ -1,30 +0,0 @@ -# -# Feature name: PG_uncached -# Kconfig: ARCH_USES_PG_UNCACHED -# description: arch supports the PG_uncached page flag -# - ----------------------- - | arch |status| - ----------------------- - | alpha: | TODO | - | arc: | TODO | - | arm: | TODO | - | arm64: | TODO | - | csky: | TODO | - | hexagon: | TODO | - | loongarch: | TODO | - | m68k: | TODO | - | microblaze: | TODO | - | mips: | TODO | - | nios2: | TODO | - | openrisc: | TODO | - | parisc: | TODO | - | powerpc: | TODO | - | riscv: | TODO | - | s390: | TODO | - | sh: | TODO | - | sparc: | TODO | - | um: | TODO | - | x86: | ok | - | xtensa: | TODO | - ----------------------- diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 4f67b5ea0568..0b18af3f954e 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -913,8 +913,7 @@ cache in your filesystem. The following members are defined: stop attempting I/O, it can simply return. The caller will remove the remaining pages from the address space, unlock them and decrement the page refcount. Set PageUptodate if the I/O - completes successfully. Setting PageError on any page will be - ignored; simply unlock the page if an I/O error occurs. + completes successfully. ``write_begin`` Called by the generic buffered write code to ask the filesystem diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 8730c246ceaa..f9c50525bdbf 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -586,7 +586,7 @@ API, and return the results to the user-space. The ABIs are designed to be used for user space applications development, rather than human beings' fingers. Human users are recommended to use such user space tools. One such Python-written user space tool is available at -Github (https://github.com/awslabs/damo), Pypi +Github (https://github.com/damonitor/damo), Pypi (https://pypistats.org/packages/damo), and Fedora (https://packages.fedoraproject.org/pkgs/python-damo/damo/). diff --git a/Documentation/mm/damon/maintainer-profile.rst b/Documentation/mm/damon/maintainer-profile.rst index feccf6a0f6c3..2365c9a3c1f0 100644 --- a/Documentation/mm/damon/maintainer-profile.rst +++ b/Documentation/mm/damon/maintainer-profile.rst @@ -7,23 +7,27 @@ The DAMON subsystem covers the files that are listed in 'DATA ACCESS MONITOR' section of 'MAINTAINERS' file. The mailing lists for the subsystem are damon@lists.linux.dev and -linux-mm@kvack.org. Patches should be made against the mm-unstable tree [1]_ -whenever possible and posted to the mailing lists. +linux-mm@kvack.org. Patches should be made against the mm-unstable `tree +` whenever possible and posted to +the mailing lists. SCM Trees --------- There are multiple Linux trees for DAMON development. Patches under -development or testing are queued in damon/next [2]_ by the DAMON maintainer. -Sufficiently reviewed patches will be queued in mm-unstable [1]_ by the memory -management subsystem maintainer. After more sufficient tests, the patches will -be queued in mm-stable [3]_ , and finally pull-requested to the mainline by the -memory management subsystem maintainer. +development or testing are queued in `damon/next +` by the DAMON maintainer. +Sufficiently reviewed patches will be queued in `mm-unstable +` by the memory management +subsystem maintainer. After more sufficient tests, the patches will be queued +in `mm-stable ` , and finally +pull-requested to the mainline by the memory management subsystem maintainer. -Note again the patches for mm-unstable tree [1]_ are queued by the memory +Note again the patches for mm-unstable `tree +` are queued by the memory management subsystem maintainer. If the patches requires some patches in -damon/next tree [2]_ which not yet merged in mm-unstable, please make sure the -requirement is clearly specified. +damon/next `tree ` which not yet merged +in mm-unstable, please make sure the requirement is clearly specified. Submit checklist addendum ------------------------- @@ -32,18 +36,27 @@ When making DAMON changes, you should do below. - Build changes related outputs including kernel and documents. - Ensure the builds introduce no new errors or warnings. -- Run and ensure no new failures for DAMON selftests [4]_ and kunittests [5]_ . +- Run and ensure no new failures for DAMON `selftests + ` and + `kunittests + `. Further doing below and putting the results will be helpful. -- Run damon-tests/corr [6]_ for normal changes. -- Run damon-tests/perf [7]_ for performance changes. +- Run `damon-tests/corr + ` for normal + changes. +- Run `damon-tests/perf + ` for performance + changes. Key cycle dates --------------- -Patches can be sent anytime. Key cycle dates of the mm-unstable [1]_ and -mm-stable [3]_ trees depend on the memory management subsystem maintainer. +Patches can be sent anytime. Key cycle dates of the `mm-unstable +` and `mm-stable +` trees depend on the memory +management subsystem maintainer. Review cadence -------------- @@ -58,16 +71,17 @@ Mailing tool Like many other Linux kernel subsystems, DAMON uses the mailing lists (damon@lists.linux.dev and linux-mm@kvack.org) as the major communication -channel. There is a simple tool called HacKerMaiL (``hkml``) [8]_ , which is -for people who are not very familiar with the mailing lists based -communication. The tool could be particularly helpful for DAMON community -members since it is developed and maintained by DAMON maintainer. The tool is -also officially announced to support DAMON and general Linux kernel development -workflow. +channel. There is a simple tool called `HacKerMaiL +` (``hkml``), which is for people who +are not very familiar with the mailing lists based communication. The tool +could be particularly helpful for DAMON community members since it is developed +and maintained by DAMON maintainer. The tool is also officially announced to +support DAMON and general Linux kernel development workflow. -In other words, ``hkml`` [8]_ is a mailing tool for DAMON community, which -DAMON maintainer is committed to support. Please feel free to try and report -issues or feature requests for the tool to the maintainer. +In other words, `hkml ` is a mailing +tool for DAMON community, which DAMON maintainer is committed to support. +Please feel free to try and report issues or feature requests for the tool to +the maintainer. Community meetup ---------------- @@ -83,17 +97,9 @@ members including the maintainer. The maintainer shares the available time slots, and attendees should reserve one of those at least 24 hours before the time slot, by reaching out to the maintainer. -Schedules and available reservation time slots are available at the Google doc -[9]_ . DAMON maintainer will also provide periodic reminder to the mailing -list (damon@lists.linux.dev). - - -.. [1] https://git.kernel.org/akpm/mm/h/mm-unstable -.. [2] https://git.kernel.org/sj/h/damon/next -.. [3] https://git.kernel.org/akpm/mm/h/mm-stable -.. [4] https://github.com/awslabs/damon-tests/blob/master/corr/run.sh#L49 -.. [5] https://github.com/awslabs/damon-tests/blob/master/corr/tests/kunit.sh -.. [6] https://github.com/awslabs/damon-tests/tree/master/corr -.. [7] https://github.com/awslabs/damon-tests/tree/master/perf -.. [8] https://github.com/damonitor/hackermail -.. [9] https://docs.google.com/document/d/1v43Kcj3ly4CYqmAkMaZzLiM2GEnWfgdGbZAH3mi2vpM/edit?usp=sharing +Schedules and available reservation time slots are available at the Google `doc +`. +There is also a public Google `calendar +` +that has the events. Anyone can subscribe it. DAMON maintainer will also +provide periodic reminder to the mailing list (damon@lists.linux.dev). diff --git a/Documentation/mm/page_migration.rst b/Documentation/mm/page_migration.rst index f1ce67a26615..519b35a4caf5 100644 --- a/Documentation/mm/page_migration.rst +++ b/Documentation/mm/page_migration.rst @@ -63,15 +63,15 @@ and then a low level description of how the low level details work. In kernel use of migrate_pages() ================================ -1. Remove pages from the LRU. +1. Remove folios from the LRU. - Lists of pages to be migrated are generated by scanning over - pages and moving them into lists. This is done by - calling isolate_lru_page(). - Calling isolate_lru_page() increases the references to the page - so that it cannot vanish while the page migration occurs. + Lists of folios to be migrated are generated by scanning over + folios and moving them into lists. This is done by + calling folio_isolate_lru(). + Calling folio_isolate_lru() increases the references to the folio + so that it cannot vanish while the folio migration occurs. It also prevents the swapper or other scans from encountering - the page. + the folio. 2. We need to have a function of type new_folio_t that can be passed to migrate_pages(). This function should figure out @@ -84,10 +84,10 @@ In kernel use of migrate_pages() How migrate_pages() works ========================= -migrate_pages() does several passes over its list of pages. A page is moved -if all references to a page are removable at the time. The page has -already been removed from the LRU via isolate_lru_page() and the refcount -is increased so that the page cannot be freed while page migration occurs. +migrate_pages() does several passes over its list of folios. A folio is moved +if all references to a folio are removable at the time. The folio has +already been removed from the LRU via folio_isolate_lru() and the refcount +is increased so that the folio cannot be freed while folio migration occurs. Steps: diff --git a/Documentation/mm/transhuge.rst b/Documentation/mm/transhuge.rst index 1ba0ad63246c..a2cd8800d527 100644 --- a/Documentation/mm/transhuge.rst +++ b/Documentation/mm/transhuge.rst @@ -31,10 +31,10 @@ Design principles feature that applies to all dynamic high order allocations in the kernel) -get_user_pages and follow_page -============================== +get_user_pages and pin_user_pages +================================= -get_user_pages and follow_page if run on a hugepage, will return the +get_user_pages and pin_user_pages if run on a hugepage, will return the head or tail pages as usual (exactly as they would do on hugetlbfs). Most GUP users will only care about the actual physical address of the page and its temporary pinning to release after the I/O diff --git a/Documentation/mm/unevictable-lru.rst b/Documentation/mm/unevictable-lru.rst index 2feb2ed51ae2..8d11fe6a0854 100644 --- a/Documentation/mm/unevictable-lru.rst +++ b/Documentation/mm/unevictable-lru.rst @@ -80,7 +80,7 @@ on an additional LRU list for a few reasons: (2) We want to be able to migrate unevictable folios between nodes for memory defragmentation, workload management and memory hotplug. The Linux kernel can only migrate folios that it can successfully isolate from the LRU - lists (or "Movable" pages: outside of consideration here). If we were to + lists (or "Movable" folios: outside of consideration here). If we were to maintain folios elsewhere than on an LRU-like list, where they can be detected by folio_isolate_lru(), we would prevent their migration. @@ -230,7 +230,7 @@ In Nick's patch, he used one of the struct page LRU list link fields as a count of VM_LOCKED VMAs that map the page (Rik van Riel had the same idea three years earlier). But this use of the link field for a count prevented the management of the pages on an LRU list, and thus mlocked pages were not migratable as -isolate_lru_page() could not detect them, and the LRU list link field was not +folio_isolate_lru() could not detect them, and the LRU list link field was not available to the migration subsystem. Nick resolved this by putting mlocked pages back on the LRU list before @@ -253,8 +253,8 @@ Basic Management mlocked pages - pages mapped into a VM_LOCKED VMA - are a class of unevictable pages. When such a page has been "noticed" by the memory management subsystem, -the page is marked with the PG_mlocked flag. This can be manipulated using the -PageMlocked() functions. +the folio is marked with the PG_mlocked flag. This can be manipulated using +folio_set_mlocked() and folio_clear_mlocked() functions. A PG_mlocked page will be placed on the unevictable list when it is added to the LRU. Such pages can be "noticed" by memory management in several places: diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/start.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/start.rst index bf21ff84f396..cff7b6f98c59 100644 --- a/Documentation/translations/zh_CN/admin-guide/mm/damon/start.rst +++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/start.rst @@ -15,7 +15,7 @@ 本文通过演示DAMON的默认用户空间工具,简要地介绍了如何使用DAMON。请注意,为了简洁 起见,本文档只描述了它的部分功能。更多细节请参考该工具的使用文档。 -`doc `_ . +`doc `_ . 前提条件 @@ -31,7 +31,7 @@ ------------ 在演示中,我们将使用DAMON的默认用户空间工具,称为DAMON Operator(DAMO)。它可以在 -https://github.com/awslabs/damo找到。下面的例子假设DAMO在你的$PATH上。当然,但 +https://github.com/damonitor/damo找到。下面的例子假设DAMO在你的$PATH上。当然,但 这并不是强制性的。 因为DAMO使用了DAMON的sysfs接口(详情请参考:doc:`usage`),你应该确保 diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst index da2745464ece..50f6f0b6bf11 100644 --- a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst +++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst @@ -16,16 +16,16 @@ DAMON 为不同的用户提供了下面这些接口。 - *DAMON用户空间工具。* - `这 `_ 为有这特权的人, 如系统管理员,希望有一个刚好 + `这 `_ 为有这特权的人, 如系统管理员,希望有一个刚好 可以工作的人性化界面。 使用它,用户可以以人性化的方式使用DAMON的主要功能。不过,它可能不会为特殊情况进行高度调整。 它同时支持虚拟和物理地址空间的监测。更多细节,请参考它的 `使用文档 - `_。 + `_。 - *sysfs接口。* :ref:`这 ` 是为那些希望更高级的使用DAMON的特权用户空间程序员准备的。 使用它,用户可以通过读取和写入特殊的sysfs文件来使用DAMON的主要功能。因此,你可以编写和使 用你个性化的DAMON sysfs包装程序,代替你读/写sysfs文件。 `DAMON用户空间工具 - `_ 就是这种程序的一个例子 它同时支持虚拟和物理地址 + `_ 就是这种程序的一个例子 它同时支持虚拟和物理地址 空间的监测。注意,这个界面只提供简单的监测结果 :ref:`统计 `。对于详细的监测 结果,DAMON提供了一个:ref:`跟踪点 `。 - *debugfs interface.* @@ -332,7 +332,7 @@ tried_regions// # echo 500 > watermarks/mid # echo 300 > watermarks/low -请注意,我们强烈建议使用用户空间的工具,如 `damo `_ , +请注意,我们强烈建议使用用户空间的工具,如 `damo `_ , 而不是像上面那样手动读写文件。以上只是一个例子。 debugfs接口 diff --git a/Documentation/translations/zh_CN/mm/page_migration.rst b/Documentation/translations/zh_CN/mm/page_migration.rst index f95063826a15..8c8461c6cb9f 100644 --- a/Documentation/translations/zh_CN/mm/page_migration.rst +++ b/Documentation/translations/zh_CN/mm/page_migration.rst @@ -50,8 +50,8 @@ mbind()设置一个新的内存策略。一个进程的页面也可以通过sys_ 1. 从LRU中移除页面。 - 要迁移的页面列表是通过扫描页面并把它们移到列表中来生成的。这是通过调用 isolate_lru_page() - 来完成的。调用isolate_lru_page()增加了对该页的引用,这样在页面迁移发生时它就不会 + 要迁移的页面列表是通过扫描页面并把它们移到列表中来生成的。这是通过调用 folio_isolate_lru() + 来完成的。调用folio_isolate_lru()增加了对该页的引用,这样在页面迁移发生时它就不会 消失。它还可以防止交换器或其他扫描器遇到该页。 @@ -65,7 +65,7 @@ migrate_pages()如何工作 ======================= migrate_pages()对它的页面列表进行了多次处理。如果当时对一个页面的所有引用都可以被移除, -那么这个页面就会被移动。该页已经通过isolate_lru_page()从LRU中移除,并且refcount被 +那么这个页面就会被移动。该页已经通过folio_isolate_lru()从LRU中移除,并且refcount被 增加,以便在页面迁移发生时不释放该页。 步骤: diff --git a/Documentation/translations/zh_TW/admin-guide/mm/damon/start.rst b/Documentation/translations/zh_TW/admin-guide/mm/damon/start.rst index 1822956be0e0..57d36bfbb1b3 100644 --- a/Documentation/translations/zh_TW/admin-guide/mm/damon/start.rst +++ b/Documentation/translations/zh_TW/admin-guide/mm/damon/start.rst @@ -15,7 +15,7 @@ 本文通過演示DAMON的默認用戶空間工具,簡要地介紹瞭如何使用DAMON。請注意,爲了簡潔 起見,本文檔只描述了它的部分功能。更多細節請參考該工具的使用文檔。 -`doc `_ . +`doc `_ . 前提條件 @@ -31,7 +31,7 @@ ------------ 在演示中,我們將使用DAMON的默認用戶空間工具,稱爲DAMON Operator(DAMO)。它可以在 -https://github.com/awslabs/damo找到。下面的例子假設DAMO在你的$PATH上。當然,但 +https://github.com/damonitor/damo找到。下面的例子假設DAMO在你的$PATH上。當然,但 這並不是強制性的。 因爲DAMO使用了DAMON的sysfs接口(詳情請參考:doc:`usage`),你應該確保 diff --git a/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst b/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst index 7464279f9b7d..fbbbbad59ee4 100644 --- a/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst +++ b/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst @@ -16,16 +16,16 @@ DAMON 爲不同的用戶提供了下面這些接口。 - *DAMON用戶空間工具。* - `這 `_ 爲有這特權的人, 如系統管理員,希望有一個剛好 + `這 `_ 爲有這特權的人, 如系統管理員,希望有一個剛好 可以工作的人性化界面。 使用它,用戶可以以人性化的方式使用DAMON的主要功能。不過,它可能不會爲特殊情況進行高度調整。 它同時支持虛擬和物理地址空間的監測。更多細節,請參考它的 `使用文檔 - `_。 + `_。 - *sysfs接口。* :ref:`這 ` 是爲那些希望更高級的使用DAMON的特權用戶空間程序員準備的。 使用它,用戶可以通過讀取和寫入特殊的sysfs文件來使用DAMON的主要功能。因此,你可以編寫和使 用你個性化的DAMON sysfs包裝程序,代替你讀/寫sysfs文件。 `DAMON用戶空間工具 - `_ 就是這種程序的一個例子 它同時支持虛擬和物理地址 + `_ 就是這種程序的一個例子 它同時支持虛擬和物理地址 空間的監測。注意,這個界面只提供簡單的監測結果 :ref:`統計 `。對於詳細的監測 結果,DAMON提供了一個:ref:`跟蹤點 `。 - *debugfs interface.* @@ -332,7 +332,7 @@ tried_regions// # echo 500 > watermarks/mid # echo 300 > watermarks/low -請注意,我們強烈建議使用用戶空間的工具,如 `damo `_ , +請注意,我們強烈建議使用用戶空間的工具,如 `damo `_ , 而不是像上面那樣手動讀寫文件。以上只是一個例子。 debugfs接口 diff --git a/MAINTAINERS b/MAINTAINERS index 9278c30ef1d5..94941c8f6c03 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24603,6 +24603,20 @@ F: include/uapi/linux/vsockmon.h F: net/vmw_vsock/ F: tools/testing/vsock/ +VMA +M: Andrew Morton +R: Liam R. Howlett +R: Vlastimil Babka +R: Lorenzo Stoakes +L: linux-mm@kvack.org +S: Maintained +W: https://www.linux-mm.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +F: mm/vma.c +F: mm/vma.h +F: mm/vma_internal.h +F: tools/testing/vma/ + VMALLOC M: Andrew Morton R: Uladzislau Rezki diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index e5f881bc8288..8886ab539273 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -1229,7 +1229,7 @@ arch_get_unmapped_area_1(unsigned long addr, unsigned long len, unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) + unsigned long flags, vm_flags_t vm_flags) { unsigned long limit; diff --git a/arch/arc/mm/mmap.c b/arch/arc/mm/mmap.c index 69a915297155..2185afe8d59f 100644 --- a/arch/arc/mm/mmap.c +++ b/arch/arc/mm/mmap.c @@ -23,7 +23,8 @@ */ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c index 2286c2ea60ec..831793cd6ff9 100644 --- a/arch/arm/mm/fault-armv.c +++ b/arch/arm/mm/fault-armv.c @@ -61,7 +61,7 @@ static int do_adjust_pte(struct vm_area_struct *vma, unsigned long address, return ret; } -#if USE_SPLIT_PTE_PTLOCKS +#if defined(CONFIG_SPLIT_PTE_PTLOCKS) /* * If we are using split PTE locks, then we need to take the page * lock here. Otherwise we are using shared mm->page_table_lock @@ -80,10 +80,10 @@ static inline void do_pte_unlock(spinlock_t *ptl) { spin_unlock(ptl); } -#else /* !USE_SPLIT_PTE_PTLOCKS */ +#else /* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */ static inline void do_pte_lock(spinlock_t *ptl) {} static inline void do_pte_unlock(spinlock_t *ptl) {} -#endif /* USE_SPLIT_PTE_PTLOCKS */ +#endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ static int adjust_pte(struct vm_area_struct *vma, unsigned long address, unsigned long pfn) diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c index d65d0e6ed10a..3dbb383c26d5 100644 --- a/arch/arm/mm/mmap.c +++ b/arch/arm/mm/mmap.c @@ -28,7 +28,8 @@ */ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -78,8 +79,8 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, - const unsigned long len, const unsigned long pgoff, - const unsigned long flags) + const unsigned long len, const unsigned long pgoff, + const unsigned long flags, vm_flags_t vm_flags) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e430ff33c80d..49f054dcd4de 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -101,6 +101,7 @@ config ARM64 select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_SUPPORTS_PAGE_TABLE_CHECK select ARCH_SUPPORTS_PER_VMA_LOCK + select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE select ARCH_SUPPORTS_RT select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT @@ -2104,7 +2105,8 @@ config ARM64_MTE depends on ARM64_PAN select ARCH_HAS_SUBPAGE_FAULTS select ARCH_USES_HIGH_VMA_FLAGS - select ARCH_USES_PG_ARCH_X + select ARCH_USES_PG_ARCH_2 + select ARCH_USES_PG_ARCH_3 help Memory Tagging (part of the ARMv8.5 Extensions) provides architectural support for run-time, always-on detection of diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 7d7d97ad3cd5..4e350df9a02d 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -9,6 +9,7 @@ syscall-y += unistd_compat_32.h generic-y += early_ioremap.h generic-y += mcs_spinlock.h +generic-y += mmzone.h generic-y += qrwlock.h generic-y += qspinlock.h generic-y += parport.h diff --git a/arch/arm64/include/asm/mmzone.h b/arch/arm64/include/asm/mmzone.h deleted file mode 100644 index fa17e01d9ab2..000000000000 --- a/arch/arm64/include/asm/mmzone.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ASM_MMZONE_H -#define __ASM_MMZONE_H - -#ifdef CONFIG_NUMA - -#include - -extern struct pglist_data *node_data[]; -#define NODE_DATA(nid) (node_data[(nid)]) - -#endif /* CONFIG_NUMA */ -#endif /* __ASM_MMZONE_H */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 96c2b0b07c4c..c329ea061dc9 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -407,6 +407,7 @@ static inline void __sync_cache_and_tags(pte_t pte, unsigned int nr_pages) /* * Select all bits except the pfn */ +#define pte_pgprot pte_pgprot static inline pgprot_t pte_pgprot(pte_t pte) { unsigned long pfn = pte_pfn(pte); @@ -600,6 +601,14 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) return pte_pmd(set_pte_bit(pmd_pte(pmd), __pgprot(PTE_DEVMAP))); } +#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP +#define pmd_special(pte) (!!((pmd_val(pte) & PTE_SPECIAL))) +static inline pmd_t pmd_mkspecial(pmd_t pmd) +{ + return set_pmd_bit(pmd, __pgprot(PTE_SPECIAL)); +} +#endif + #define __pmd_to_phys(pmd) __pte_to_phys(pmd_pte(pmd)) #define __phys_to_pmd_val(phys) __phys_to_pte_val(phys) #define pmd_pfn(pmd) ((__pmd_to_phys(pmd) & PMD_MASK) >> PAGE_SHIFT) @@ -617,6 +626,27 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) #define pud_pfn(pud) ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT) #define pfn_pud(pfn,prot) __pud(__phys_to_pud_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) +#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP +#define pud_special(pte) pte_special(pud_pte(pud)) +#define pud_mkspecial(pte) pte_pud(pte_mkspecial(pud_pte(pud))) +#endif + +#define pmd_pgprot pmd_pgprot +static inline pgprot_t pmd_pgprot(pmd_t pmd) +{ + unsigned long pfn = pmd_pfn(pmd); + + return __pgprot(pmd_val(pfn_pmd(pfn, __pgprot(0))) ^ pmd_val(pmd)); +} + +#define pud_pgprot pud_pgprot +static inline pgprot_t pud_pgprot(pud_t pud) +{ + unsigned long pfn = pud_pfn(pud); + + return __pgprot(pud_val(pfn_pud(pfn, __pgprot(0))) ^ pud_val(pud)); +} + static inline void __set_pte_at(struct mm_struct *mm, unsigned long __always_unused addr, pte_t *ptep, pte_t pte, unsigned int nr) diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 0f6ef432fb84..5fc3af9f8f29 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -5,6 +5,7 @@ #include #ifdef CONFIG_NUMA +#include struct pci_bus; int pcibus_to_node(struct pci_bus *bus); diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 638b6205526f..f9e30dd34c7a 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -62,7 +62,6 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu) */ num_mmus = atomic_read(&kvm->online_vcpus) * S2_MMU_PER_VCPU; tmp = kvrealloc(kvm->arch.nested_mmus, - size_mul(sizeof(*kvm->arch.nested_mmus), kvm->arch.nested_mmus_size), size_mul(sizeof(*kvm->arch.nested_mmus), num_mmus), GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!tmp) diff --git a/arch/csky/abiv1/mmap.c b/arch/csky/abiv1/mmap.c index 7f826331d409..1047865e82a9 100644 --- a/arch/csky/abiv1/mmap.c +++ b/arch/csky/abiv1/mmap.c @@ -23,7 +23,8 @@ */ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; diff --git a/arch/csky/kernel/vdso.c b/arch/csky/kernel/vdso.c index 2ca886e4a458..5c9ef63c29f1 100644 --- a/arch/csky/kernel/vdso.c +++ b/arch/csky/kernel/vdso.c @@ -45,9 +45,16 @@ arch_initcall(vdso_init); int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { + struct vm_area_struct *vma; struct mm_struct *mm = current->mm; unsigned long vdso_base, vdso_len; int ret; + static struct vm_special_mapping vdso_mapping = { + .name = "[vdso]", + }; + static struct vm_special_mapping vvar_mapping = { + .name = "[vvar]", + }; vdso_len = (vdso_pages + 1) << PAGE_SHIFT; @@ -65,22 +72,29 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, */ mm->context.vdso = (void *)vdso_base; - ret = - install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT, + vdso_mapping.pages = vdso_pagelist; + vma = + _install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT, (VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC), - vdso_pagelist); + &vdso_mapping); - if (unlikely(ret)) { + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); mm->context.vdso = NULL; goto end; } vdso_base += (vdso_pages << PAGE_SHIFT); - ret = install_special_mapping(mm, vdso_base, PAGE_SIZE, - (VM_READ | VM_MAYREAD), &vdso_pagelist[vdso_pages]); + vvar_mapping.pages = &vdso_pagelist[vdso_pages]; + vma = _install_special_mapping(mm, vdso_base, PAGE_SIZE, + (VM_READ | VM_MAYREAD), &vvar_mapping); - if (unlikely(ret)) + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); mm->context.vdso = NULL; + goto end; + } + ret = 0; end: mmap_write_unlock(mm); return ret; diff --git a/arch/hexagon/kernel/vdso.c b/arch/hexagon/kernel/vdso.c index 2e4872d62124..6fd27ff1df73 100644 --- a/arch/hexagon/kernel/vdso.c +++ b/arch/hexagon/kernel/vdso.c @@ -51,7 +51,11 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { int ret; unsigned long vdso_base; + struct vm_area_struct *vma; struct mm_struct *mm = current->mm; + static struct vm_special_mapping vdso_mapping = { + name = "[vdso]", + }; if (mmap_write_lock_killable(mm)) return -EINTR; @@ -66,16 +70,18 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) } /* MAYWRITE to allow gdb to COW and set breakpoints. */ - ret = install_special_mapping(mm, vdso_base, PAGE_SIZE, + vdso_mapping.pages = &vdso_page; + vma = _install_special_mapping(mm, vdso_base, PAGE_SIZE, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - &vdso_page); + &vdso_mapping); - if (ret) + ret = PTR_ERR(vma); + if (IS_ERR(vma)) goto up_fail; mm->context.vdso = (void *)vdso_base; - + ret = 0; up_fail: mmap_write_unlock(mm); return ret; diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig index b4252c357c8e..75b366407a60 100644 --- a/arch/loongarch/configs/loongson3_defconfig +++ b/arch/loongarch/configs/loongson3_defconfig @@ -96,7 +96,6 @@ CONFIG_ZPOOL=y CONFIG_ZSWAP=y CONFIG_ZSWAP_COMPRESSOR_DEFAULT_ZSTD=y CONFIG_ZBUD=y -CONFIG_Z3FOLD=y CONFIG_ZSMALLOC=m # CONFIG_COMPAT_BRK is not set CONFIG_MEMORY_HOTPLUG=y diff --git a/arch/loongarch/include/asm/Kbuild b/arch/loongarch/include/asm/Kbuild index 4635b755b2b4..5b5a6c90e6e2 100644 --- a/arch/loongarch/include/asm/Kbuild +++ b/arch/loongarch/include/asm/Kbuild @@ -8,5 +8,6 @@ generic-y += early_ioremap.h generic-y += qrwlock.h generic-y += user.h generic-y += ioctl.h +generic-y += mmzone.h generic-y += statfs.h generic-y += param.h diff --git a/arch/loongarch/include/asm/mmzone.h b/arch/loongarch/include/asm/mmzone.h deleted file mode 100644 index 2b9a90727e19..000000000000 --- a/arch/loongarch/include/asm/mmzone.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Author: Huacai Chen (chenhuacai@loongson.cn) - * Copyright (C) 2020-2022 Loongson Technology Corporation Limited - */ -#ifndef _ASM_MMZONE_H_ -#define _ASM_MMZONE_H_ - -#include -#include - -extern struct pglist_data *node_data[]; - -#define NODE_DATA(nid) (node_data[(nid)]) - -#endif /* _ASM_MMZONE_H_ */ diff --git a/arch/loongarch/include/asm/topology.h b/arch/loongarch/include/asm/topology.h index 66128dec0bf6..50273c9187d0 100644 --- a/arch/loongarch/include/asm/topology.h +++ b/arch/loongarch/include/asm/topology.h @@ -8,6 +8,7 @@ #include #ifdef CONFIG_NUMA +#include extern cpumask_t cpus_on_node[]; diff --git a/arch/loongarch/kernel/numa.c b/arch/loongarch/kernel/numa.c index 8fe21f868f72..84fe7f854820 100644 --- a/arch/loongarch/kernel/numa.c +++ b/arch/loongarch/kernel/numa.c @@ -27,10 +27,7 @@ #include int numa_off; -struct pglist_data *node_data[MAX_NUMNODES]; unsigned char node_distances[MAX_NUMNODES][MAX_NUMNODES]; - -EXPORT_SYMBOL(node_data); EXPORT_SYMBOL(node_distances); static struct numa_meminfo numa_meminfo; @@ -190,24 +187,6 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) return numa_add_memblk_to(nid, start, end, &numa_meminfo); } -static void __init alloc_node_data(int nid) -{ - void *nd; - unsigned long nd_pa; - size_t nd_sz = roundup(sizeof(pg_data_t), PAGE_SIZE); - - nd_pa = memblock_phys_alloc_try_nid(nd_sz, SMP_CACHE_BYTES, nid); - if (!nd_pa) { - pr_err("Cannot find %zu Byte for node_data (initial node: %d)\n", nd_sz, nid); - return; - } - - nd = __va(nd_pa); - - node_data[nid] = nd; - memset(nd, 0, sizeof(pg_data_t)); -} - static void __init node_mem_init(unsigned int node) { unsigned long start_pfn, end_pfn; diff --git a/arch/loongarch/mm/mmap.c b/arch/loongarch/mm/mmap.c index 889030985135..914e82ff3f65 100644 --- a/arch/loongarch/mm/mmap.c +++ b/arch/loongarch/mm/mmap.c @@ -89,7 +89,8 @@ static unsigned long arch_get_unmapped_area_common(struct file *filp, } unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr0, - unsigned long len, unsigned long pgoff, unsigned long flags) + unsigned long len, unsigned long pgoff, unsigned long flags, + vm_flags_t vm_flags) { return arch_get_unmapped_area_common(filp, addr0, len, pgoff, flags, UP); @@ -101,7 +102,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr0, */ unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr0, unsigned long len, unsigned long pgoff, - unsigned long flags) + unsigned long flags, vm_flags_t vm_flags) { return arch_get_unmapped_area_common(filp, addr0, len, pgoff, flags, DOWN); diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 023ad33a7e94..397edf05dd72 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -502,7 +502,6 @@ config MACH_LOONGSON64 select USE_OF select BUILTIN_DTB select PCI_HOST_GENERIC - select HAVE_ARCH_NODEDATA_EXTENSION if NUMA help This enables the support of Loongson-2/3 family of machines. @@ -735,7 +734,6 @@ config SGI_IP27 select WAR_R10000_LLSC select MIPS_L1_CACHE_SHIFT_7 select NUMA - select HAVE_ARCH_NODEDATA_EXTENSION help This are the SGI Origin 200, Origin 2000 and Onyx 2 Graphics workstations. To compile a Linux kernel that runs on these, say Y @@ -2613,9 +2611,6 @@ config NUMA config SYS_SUPPORTS_NUMA bool -config HAVE_ARCH_NODEDATA_EXTENSION - bool - config RELOCATABLE bool "Relocatable kernel" depends on SYS_SUPPORTS_RELOCATABLE diff --git a/arch/mips/include/asm/mach-ip27/mmzone.h b/arch/mips/include/asm/mach-ip27/mmzone.h index 08c36e50a860..56959eb9cb26 100644 --- a/arch/mips/include/asm/mach-ip27/mmzone.h +++ b/arch/mips/include/asm/mach-ip27/mmzone.h @@ -22,7 +22,6 @@ struct node_data { extern struct node_data *__node_data[]; -#define NODE_DATA(n) (&__node_data[(n)]->pglist) #define hub_data(n) (&__node_data[(n)]->hub) #endif /* _ASM_MACH_MMZONE_H */ diff --git a/arch/mips/include/asm/mach-loongson64/mmzone.h b/arch/mips/include/asm/mach-loongson64/mmzone.h index a3d65d37b8b5..8fb70fd3c9c4 100644 --- a/arch/mips/include/asm/mach-loongson64/mmzone.h +++ b/arch/mips/include/asm/mach-loongson64/mmzone.h @@ -14,10 +14,6 @@ #define pa_to_nid(addr) (((addr) & 0xf00000000000) >> NODE_ADDRSPACE_SHIFT) #define nid_to_addrbase(nid) ((unsigned long)(nid) << NODE_ADDRSPACE_SHIFT) -extern struct pglist_data *__node_data[]; - -#define NODE_DATA(n) (__node_data[n]) - extern void __init prom_init_numa_memory(void); #endif /* _ASM_MACH_MMZONE_H */ diff --git a/arch/mips/loongson64/numa.c b/arch/mips/loongson64/numa.c index 68dafd6d3e25..8388400d052f 100644 --- a/arch/mips/loongson64/numa.c +++ b/arch/mips/loongson64/numa.c @@ -29,8 +29,6 @@ unsigned char __node_distances[MAX_NUMNODES][MAX_NUMNODES]; EXPORT_SYMBOL(__node_distances); -struct pglist_data *__node_data[MAX_NUMNODES]; -EXPORT_SYMBOL(__node_data); cpumask_t __node_cpumask[MAX_NUMNODES]; EXPORT_SYMBOL(__node_cpumask); @@ -83,12 +81,8 @@ static void __init init_topology_matrix(void) static void __init node_mem_init(unsigned int node) { - struct pglist_data *nd; unsigned long node_addrspace_offset; unsigned long start_pfn, end_pfn; - unsigned long nd_pa; - int tnid; - const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES); node_addrspace_offset = nid_to_addrbase(node); pr_info("Node%d's addrspace_offset is 0x%lx\n", @@ -98,16 +92,8 @@ static void __init node_mem_init(unsigned int node) pr_info("Node%d: start_pfn=0x%lx, end_pfn=0x%lx\n", node, start_pfn, end_pfn); - nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, node); - if (!nd_pa) - panic("Cannot allocate %zu bytes for node %d data\n", - nd_size, node); - nd = __va(nd_pa); - memset(nd, 0, sizeof(struct pglist_data)); - tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); - if (tnid != node) - pr_info("NODE_DATA(%d) on node %d\n", node, tnid); - __node_data[node] = nd; + alloc_node_data(node); + NODE_DATA(node)->node_start_pfn = start_pfn; NODE_DATA(node)->node_spanned_pages = end_pfn - start_pfn; @@ -198,13 +184,3 @@ void __init prom_init_numa_memory(void) pr_info("CP0_PageGrain: CP0 5.1 (0x%x)\n", read_c0_pagegrain()); prom_meminit(); } - -pg_data_t * __init arch_alloc_nodedata(int nid) -{ - return memblock_alloc(sizeof(pg_data_t), SMP_CACHE_BYTES); -} - -void arch_refresh_nodedata(int nid, pg_data_t *pgdat) -{ - __node_data[nid] = pgdat; -} diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c index 7e11d7b58761..5d2a1225785b 100644 --- a/arch/mips/mm/mmap.c +++ b/arch/mips/mm/mmap.c @@ -98,7 +98,8 @@ static unsigned long arch_get_unmapped_area_common(struct file *filp, } unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr0, - unsigned long len, unsigned long pgoff, unsigned long flags) + unsigned long len, unsigned long pgoff, unsigned long flags, + vm_flags_t vm_flags) { return arch_get_unmapped_area_common(filp, addr0, len, pgoff, flags, UP); @@ -110,7 +111,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr0, */ unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr0, unsigned long len, unsigned long pgoff, - unsigned long flags) + unsigned long flags, vm_flags_t vm_flags) { return arch_get_unmapped_area_common(filp, addr0, len, pgoff, flags, DOWN); diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index b8ca94cfb4fe..1963313f55d8 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c @@ -35,7 +35,6 @@ #define PFN_NASIDSHFT (NASID_SHFT - PAGE_SHIFT) struct node_data *__node_data[MAX_NUMNODES]; - EXPORT_SYMBOL(__node_data); static u64 gen_region_mask(void) @@ -361,6 +360,7 @@ static void __init node_mem_init(nasid_t node) */ __node_data[node] = __va(slot_freepfn << PAGE_SHIFT); memset(__node_data[node], 0, PAGE_SIZE); + node_data[node] = &__node_data[node]->pglist; NODE_DATA(node)->node_start_pfn = start_pfn; NODE_DATA(node)->node_spanned_pages = end_pfn - start_pfn; @@ -423,13 +423,3 @@ void __init mem_init(void) memblock_free_all(); setup_zero_pages(); /* This comes from node 0 */ } - -pg_data_t * __init arch_alloc_nodedata(int nid) -{ - return memblock_alloc(sizeof(pg_data_t), SMP_CACHE_BYTES); -} - -void arch_refresh_nodedata(int nid, pg_data_t *pgdat) -{ - __node_data[nid] = (struct node_data *)pgdat; -} diff --git a/arch/mips/sgi-ip27/ip27-smp.c b/arch/mips/sgi-ip27/ip27-smp.c index 5d2652a1d35a..62733e049570 100644 --- a/arch/mips/sgi-ip27/ip27-smp.c +++ b/arch/mips/sgi-ip27/ip27-smp.c @@ -70,11 +70,13 @@ void cpu_node_probe(void) gda_t *gdap = GDA; nodes_clear(node_online_map); + nodes_clear(node_possible_map); for (i = 0; i < MAX_NUMNODES; i++) { nasid_t nasid = gdap->g_nasidtable[i]; if (nasid == INVALID_NASID) break; node_set_online(nasid); + node_set(nasid, node_possible_map); highest = node_scan_cpus(nasid, highest); } diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c index 3459df28afee..a2278485de19 100644 --- a/arch/nios2/mm/init.c +++ b/arch/nios2/mm/init.c @@ -82,6 +82,10 @@ void __init mmu_init(void) pgd_t swapper_pg_dir[PTRS_PER_PGD] __aligned(PAGE_SIZE); pte_t invalid_pte_table[PTRS_PER_PTE] __aligned(PAGE_SIZE); static struct page *kuser_page[1]; +static struct vm_special_mapping vdso_mapping = { + .name = "[vdso]", + .pages = kuser_page, +}; static int alloc_kuser_page(void) { @@ -106,18 +110,18 @@ arch_initcall(alloc_kuser_page); int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { struct mm_struct *mm = current->mm; - int ret; + struct vm_area_struct *vma; mmap_write_lock(mm); /* Map kuser helpers to user space address */ - ret = install_special_mapping(mm, KUSER_BASE, KUSER_SIZE, + vma = _install_special_mapping(mm, KUSER_BASE, KUSER_SIZE, VM_READ | VM_EXEC | VM_MAYREAD | - VM_MAYEXEC, kuser_page); + VM_MAYEXEC, &vdso_mapping); mmap_write_unlock(mm); - return ret; + return IS_ERR(vma) ? PTR_ERR(vma) : 0; } const char *arch_vma_name(struct vm_area_struct *vma) diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c index f7722451276e..f852fe274abe 100644 --- a/arch/parisc/kernel/sys_parisc.c +++ b/arch/parisc/kernel/sys_parisc.c @@ -167,7 +167,8 @@ static unsigned long arch_get_unmapped_area_common(struct file *filp, } unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) + unsigned long len, unsigned long pgoff, unsigned long flags, + vm_flags_t vm_flags) { return arch_get_unmapped_area_common(filp, addr, len, pgoff, flags, UP); @@ -175,7 +176,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) + unsigned long flags, vm_flags_t vm_flags) { return arch_get_unmapped_area_common(filp, addr, len, pgoff, flags, DOWN); diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c index 0356199bd9e7..aa664f7ddb63 100644 --- a/arch/parisc/mm/hugetlbpage.c +++ b/arch/parisc/mm/hugetlbpage.c @@ -40,7 +40,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, addr = ALIGN(addr, huge_page_size(h)); /* we need to make sure the colouring is OK */ - return arch_get_unmapped_area(file, addr, len, pgoff, flags); + return arch_get_unmapped_area(file, addr, len, pgoff, flags, 0); } diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index 6001d580c0dd..a5e3e7f97f4d 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -81,7 +81,6 @@ CONFIG_MODULE_SIG_SHA512=y CONFIG_PARTITION_ADVANCED=y CONFIG_BINFMT_MISC=m CONFIG_ZSWAP=y -CONFIG_Z3FOLD=y CONFIG_ZSMALLOC=y # CONFIG_SLAB_MERGE_DEFAULT is not set CONFIG_SLAB_FREELIST_RANDOM=y diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index f8ba72573cae..6d98e6f08d4d 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1098,6 +1098,7 @@ extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot); extern pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot); extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot); extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot); +extern pud_t pud_modify(pud_t pud, pgprot_t newprot); extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd); extern void set_pud_at(struct mm_struct *mm, unsigned long addr, @@ -1358,6 +1359,8 @@ static inline pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, #define __HAVE_ARCH_PMDP_INVALIDATE extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); +extern pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp); #define pmd_move_must_withdraw pmd_move_must_withdraw struct spinlock; diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 99707456c2cd..a157ab513347 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -257,15 +257,6 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, extern void arch_exit_mmap(struct mm_struct *mm); -static inline void arch_unmap(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - unsigned long vdso_base = (unsigned long)mm->context.vdso; - - if (start <= vdso_base && vdso_base < end) - mm->context.vdso = NULL; -} - #ifdef CONFIG_PPC_MEM_KEYS bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write, bool execute, bool foreign); diff --git a/arch/powerpc/include/asm/mmzone.h b/arch/powerpc/include/asm/mmzone.h index da827d2d0866..d99863cd6cde 100644 --- a/arch/powerpc/include/asm/mmzone.h +++ b/arch/powerpc/include/asm/mmzone.h @@ -20,12 +20,6 @@ #ifdef CONFIG_NUMA -extern struct pglist_data *node_data[]; -/* - * Return a pointer to the node data for node n. - */ -#define NODE_DATA(nid) (node_data[nid]) - /* * Following are specific to this numa platform. */ diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 264a6c09517a..2f72ad885332 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -65,6 +65,7 @@ static inline unsigned long pte_pfn(pte_t pte) /* * Select all bits except the pfn */ +#define pte_pgprot pte_pgprot static inline pgprot_t pte_pgprot(pte_t pte) { unsigned long pte_flags; diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 7a2ff9010f17..ee4b9d676cff 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -81,6 +81,21 @@ static int vdso64_mremap(const struct vm_special_mapping *sm, struct vm_area_str return vdso_mremap(sm, new_vma, &vdso64_end - &vdso64_start); } +static void vdso_close(const struct vm_special_mapping *sm, struct vm_area_struct *vma) +{ + struct mm_struct *mm = vma->vm_mm; + + /* + * close() is called for munmap() but also for mremap(). In the mremap() + * case the vdso pointer has already been updated by the mremap() hook + * above, so it must not be set to NULL here. + */ + if (vma->vm_start != (unsigned long)mm->context.vdso) + return; + + mm->context.vdso = NULL; +} + static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf); @@ -92,11 +107,13 @@ static struct vm_special_mapping vvar_spec __ro_after_init = { static struct vm_special_mapping vdso32_spec __ro_after_init = { .name = "[vdso]", .mremap = vdso32_mremap, + .close = vdso_close, }; static struct vm_special_mapping vdso64_spec __ro_after_init = { .name = "[vdso]", .mremap = vdso64_mremap, + .close = vdso_close, }; #ifdef CONFIG_TIME_NS @@ -197,13 +214,6 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int /* Add required alignment. */ vdso_base = ALIGN(vdso_base, VDSO_ALIGNMENT); - /* - * Put vDSO base into mm struct. We need to do this before calling - * install_special_mapping or the perf counter mmap tracking code - * will fail to recognise it as a vDSO. - */ - mm->context.vdso = (void __user *)vdso_base + vvar_size; - vma = _install_special_mapping(mm, vdso_base, vvar_size, VM_READ | VM_MAYREAD | VM_IO | VM_DONTDUMP | VM_PFNMAP, &vvar_spec); @@ -223,10 +233,15 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int vma = _install_special_mapping(mm, vdso_base + vvar_size, vdso_size, VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC, vdso_spec); - if (IS_ERR(vma)) + if (IS_ERR(vma)) { do_munmap(mm, vdso_base, vvar_size, NULL); + return PTR_ERR(vma); + } - return PTR_ERR_OR_ZERO(vma); + // Now that the mappings are in place, set the mm VDSO pointer + mm->context.vdso = (void __user *)vdso_base + vvar_size; + + return 0; } int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) @@ -240,8 +255,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) return -EINTR; rc = __arch_setup_additional_pages(bprm, uses_interp); - if (rc) - mm->context.vdso = NULL; mmap_write_unlock(mm); return rc; diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index f4d8d3c40e5c..5a4a75369043 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -176,6 +176,17 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, return __pmd(old_pmd); } +pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp) +{ + unsigned long old_pud; + + VM_WARN_ON_ONCE(!pud_present(*pudp)); + old_pud = pud_hugepage_update(vma->vm_mm, address, pudp, _PAGE_PRESENT, _PAGE_INVALID); + flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); + return __pud(old_pud); +} + pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, int full) { @@ -259,6 +270,15 @@ pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) pmdv &= _HPAGE_CHG_MASK; return pmd_set_protbits(__pmd(pmdv), newprot); } + +pud_t pud_modify(pud_t pud, pgprot_t newprot) +{ + unsigned long pudv; + + pudv = pud_val(pud); + pudv &= _HPAGE_CHG_MASK; + return pud_set_protbits(__pud(pudv), newprot); +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* For use by kexec, called with MMU off */ diff --git a/arch/powerpc/mm/book3s64/slice.c b/arch/powerpc/mm/book3s64/slice.c index ef3ce37f1bb3..87307d0fc3b8 100644 --- a/arch/powerpc/mm/book3s64/slice.c +++ b/arch/powerpc/mm/book3s64/slice.c @@ -637,10 +637,11 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) + unsigned long flags, + vm_flags_t vm_flags) { if (radix_enabled()) - return generic_get_unmapped_area(filp, addr, len, pgoff, flags); + return generic_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); return slice_get_unmapped_area(addr, len, flags, mm_ctx_user_psize(¤t->mm->context), 0); @@ -650,10 +651,11 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, const unsigned long len, const unsigned long pgoff, - const unsigned long flags) + const unsigned long flags, + vm_flags_t vm_flags) { if (radix_enabled()) - return generic_get_unmapped_area_topdown(filp, addr0, len, pgoff, flags); + return generic_get_unmapped_area_topdown(filp, addr0, len, pgoff, flags, vm_flags); return slice_get_unmapped_area(addr0, len, flags, mm_ctx_user_psize(¤t->mm->context), 1); diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index aa89899f0c1a..3c1da08304d0 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -43,11 +43,9 @@ static char *cmdline __initdata; int numa_cpu_lookup_table[NR_CPUS]; cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; -struct pglist_data *node_data[MAX_NUMNODES]; EXPORT_SYMBOL(numa_cpu_lookup_table); EXPORT_SYMBOL(node_to_cpumask_map); -EXPORT_SYMBOL(node_data); static int primary_domain_index; static int n_mem_addr_cells, n_mem_size_cells; @@ -1095,27 +1093,9 @@ void __init dump_numa_cpu_topology(void) static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) { u64 spanned_pages = end_pfn - start_pfn; - const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES); - u64 nd_pa; - void *nd; - int tnid; - nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid); - if (!nd_pa) - panic("Cannot allocate %zu bytes for node %d data\n", - nd_size, nid); + alloc_node_data(nid); - nd = __va(nd_pa); - - /* report and initialize */ - pr_info(" NODE_DATA [mem %#010Lx-%#010Lx]\n", - nd_pa, nd_pa + nd_size - 1); - tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); - if (tnid != nid) - pr_info(" NODE_DATA(%d) on node %d\n", nid, tnid); - - node_data[nid] = nd; - memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); NODE_DATA(nid)->node_id = nid; NODE_DATA(nid)->node_start_pfn = start_pfn; NODE_DATA(nid)->node_spanned_pages = spanned_pages; diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c index 8c31802f97e8..e89f64a0f24a 100644 --- a/arch/powerpc/mm/pgtable-frag.c +++ b/arch/powerpc/mm/pgtable-frag.c @@ -136,10 +136,10 @@ void pte_fragment_free(unsigned long *table, int kernel) #ifdef CONFIG_TRANSPARENT_HUGEPAGE void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) { - struct page *page; + struct folio *folio; - page = virt_to_page(pgtable); - SetPageActive(page); + folio = virt_to_folio(pgtable); + folio_set_active(folio); pte_fragment_free((unsigned long *)pgtable, 0); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index ab0656115424..7316396e452d 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -297,6 +297,12 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, } #if defined(CONFIG_PPC_8xx) + +#if defined(CONFIG_SPLIT_PTE_PTLOCKS) || defined(CONFIG_SPLIT_PMD_PTLOCKS) +/* We need the same lock to protect the PMD table and the two PTE tables. */ +#error "8M hugetlb folios are incompatible with split page table locks" +#endif + static void __set_huge_pte_at(pmd_t *pmd, pte_t *ptep, pte_basic_t val) { pte_basic_t *entry = (pte_basic_t *)ptep; diff --git a/arch/powerpc/platforms/pseries/papr-vpd.c b/arch/powerpc/platforms/pseries/papr-vpd.c index c29e85db5f35..1574176e3ffc 100644 --- a/arch/powerpc/platforms/pseries/papr-vpd.c +++ b/arch/powerpc/platforms/pseries/papr-vpd.c @@ -156,10 +156,7 @@ static int vpd_blob_extend(struct vpd_blob *blob, const char *data, size_t len) const char *old_ptr = blob->data; char *new_ptr; - new_ptr = old_ptr ? - kvrealloc(old_ptr, old_len, new_len, GFP_KERNEL_ACCOUNT) : - kvmalloc(len, GFP_KERNEL_ACCOUNT); - + new_ptr = kvrealloc(old_ptr, new_len, GFP_KERNEL_ACCOUNT); if (!new_ptr) return -ENOMEM; diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild index 5c589770f2a8..1461af12da6e 100644 --- a/arch/riscv/include/asm/Kbuild +++ b/arch/riscv/include/asm/Kbuild @@ -5,6 +5,7 @@ syscall-y += syscall_table_64.h generic-y += early_ioremap.h generic-y += flat.h generic-y += kvm_para.h +generic-y += mmzone.h generic-y += parport.h generic-y += spinlock.h generic-y += spinlock_types.h diff --git a/arch/riscv/include/asm/mmzone.h b/arch/riscv/include/asm/mmzone.h deleted file mode 100644 index fa17e01d9ab2..000000000000 --- a/arch/riscv/include/asm/mmzone.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ASM_MMZONE_H -#define __ASM_MMZONE_H - -#ifdef CONFIG_NUMA - -#include - -extern struct pglist_data *node_data[]; -#define NODE_DATA(nid) (node_data[(nid)]) - -#endif /* CONFIG_NUMA */ -#endif /* __ASM_MMZONE_H */ diff --git a/arch/riscv/include/asm/topology.h b/arch/riscv/include/asm/topology.h index 61183688bdd5..fe1a8bf6902d 100644 --- a/arch/riscv/include/asm/topology.h +++ b/arch/riscv/include/asm/topology.h @@ -4,6 +4,10 @@ #include +#ifdef CONFIG_NUMA +#include +#endif + /* Replace task scheduler's default frequency-invariant accounting */ #define arch_scale_freq_tick topology_scale_freq_tick #define arch_set_freq_scale topology_set_freq_scale diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 4b904110d27c..297bf7157968 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -7,3 +7,4 @@ generated-y += unistd_nr.h generic-y += asm-offsets.h generic-y += kvm_types.h generic-y += mcs_spinlock.h +generic-y += mmzone.h diff --git a/arch/s390/include/asm/mmzone.h b/arch/s390/include/asm/mmzone.h deleted file mode 100644 index 73e3e7c6976c..000000000000 --- a/arch/s390/include/asm/mmzone.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * NUMA support for s390 - * - * Copyright IBM Corp. 2015 - */ - -#ifndef _ASM_S390_MMZONE_H -#define _ASM_S390_MMZONE_H - -#ifdef CONFIG_NUMA - -extern struct pglist_data *node_data[]; -#define NODE_DATA(nid) (node_data[nid]) - -#endif /* CONFIG_NUMA */ -#endif /* _ASM_S390_MMZONE_H */ diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 16e4caa931f1..73e1e03317b4 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -176,8 +176,6 @@ static inline int devmem_is_allowed(unsigned long pfn) int arch_make_folio_accessible(struct folio *folio); #define HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE -int arch_make_page_accessible(struct page *page); -#define HAVE_ARCH_MAKE_PAGE_ACCESSIBLE struct vm_layout { unsigned long kaslr_offset; diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 3fa280d0672a..0ffbaf741955 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -955,6 +955,7 @@ static inline int pte_unused(pte_t pte) * young/old accounting is not supported, i.e _PAGE_PROTECT and _PAGE_INVALID * must not be set. */ +#define pte_pgprot pte_pgprot static inline pgprot_t pte_pgprot(pte_t pte) { unsigned long pte_flags = pte_val(pte) & _PAGE_CHG_MASK; diff --git a/arch/s390/kernel/numa.c b/arch/s390/kernel/numa.c index 23ab9f02f278..ddc1448ea2e1 100644 --- a/arch/s390/kernel/numa.c +++ b/arch/s390/kernel/numa.c @@ -14,9 +14,6 @@ #include #include -struct pglist_data *node_data[MAX_NUMNODES]; -EXPORT_SYMBOL(node_data); - void __init numa_setup(void) { int nid; diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index 36db065c7cf7..9646f773208a 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -462,9 +463,9 @@ EXPORT_SYMBOL_GPL(gmap_convert_to_secure); int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr) { struct vm_area_struct *vma; + struct folio_walk fw; unsigned long uaddr; struct folio *folio; - struct page *page; int rc; rc = -EFAULT; @@ -483,11 +484,15 @@ int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr) goto out; rc = 0; - /* we take an extra reference here */ - page = follow_page(vma, uaddr, FOLL_WRITE | FOLL_GET); - if (IS_ERR_OR_NULL(page)) + folio = folio_walk_start(&fw, vma, uaddr, 0); + if (!folio) goto out; - folio = page_folio(page); + /* + * See gmap_make_secure(): large folios cannot be secure. Small + * folio implies FW_LEVEL_PTE. + */ + if (folio_test_large(folio) || !pte_write(fw.pte)) + goto out_walk_end; rc = uv_destroy_folio(folio); /* * Fault handlers can race; it is possible that two CPUs will fault @@ -500,7 +505,8 @@ int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr) */ if (rc) rc = uv_convert_from_secure_folio(folio); - folio_put(folio); +out_walk_end: + folio_walk_end(&fw, vma); out: mmap_read_unlock(gmap->mm); return rc; @@ -548,11 +554,6 @@ int arch_make_folio_accessible(struct folio *folio) } EXPORT_SYMBOL_GPL(arch_make_folio_accessible); -int arch_make_page_accessible(struct page *page) -{ - return arch_make_folio_accessible(page_folio(page)); -} -EXPORT_SYMBOL_GPL(arch_make_page_accessible); static ssize_t uv_query_facilities(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 8e149ef5e89b..ad8b0d6b77ea 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -492,9 +493,9 @@ void do_secure_storage_access(struct pt_regs *regs) union teid teid = { .val = regs->int_parm_long }; unsigned long addr = get_fault_address(regs); struct vm_area_struct *vma; + struct folio_walk fw; struct mm_struct *mm; struct folio *folio; - struct page *page; struct gmap *gmap; int rc; @@ -536,15 +537,18 @@ void do_secure_storage_access(struct pt_regs *regs) vma = find_vma(mm, addr); if (!vma) return handle_fault_error(regs, SEGV_MAPERR); - page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET); - if (IS_ERR_OR_NULL(page)) { + folio = folio_walk_start(&fw, vma, addr, 0); + if (!folio) { mmap_read_unlock(mm); break; } - folio = page_folio(page); - if (arch_make_folio_accessible(folio)) - send_sig(SIGSEGV, current, 0); + /* arch_make_folio_accessible() needs a raised refcount. */ + folio_get(folio); + rc = arch_make_folio_accessible(folio); folio_put(folio); + folio_walk_end(&fw, vma); + if (rc) + send_sig(SIGSEGV, current, 0); mmap_read_unlock(mm); break; case KERNEL_FAULT: diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 206756946589..96efa061ce01 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -82,7 +82,7 @@ static int get_align_mask(struct file *filp, unsigned long flags) unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) + unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -117,7 +117,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) + unsigned long flags, vm_flags_t vm_flags) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c index 5398729bfe1b..de5c0b389a3e 100644 --- a/arch/s390/pci/pci_mmio.c +++ b/arch/s390/pci/pci_mmio.c @@ -118,12 +118,11 @@ static inline int __memcpy_toio_inuser(void __iomem *dst, SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, const void __user *, user_buffer, size_t, length) { + struct follow_pfnmap_args args = { }; u8 local_buf[64]; void __iomem *io_addr; void *buf; struct vm_area_struct *vma; - pte_t *ptep; - spinlock_t *ptl; long ret; if (!zpci_is_enabled()) @@ -169,11 +168,13 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, if (!(vma->vm_flags & VM_WRITE)) goto out_unlock_mmap; - ret = follow_pte(vma, mmio_addr, &ptep, &ptl); + args.address = mmio_addr; + args.vma = vma; + ret = follow_pfnmap_start(&args); if (ret) goto out_unlock_mmap; - io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) | + io_addr = (void __iomem *)((args.pfn << PAGE_SHIFT) | (mmio_addr & ~PAGE_MASK)); if ((unsigned long) io_addr < ZPCI_IOMAP_ADDR_BASE) @@ -181,7 +182,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, ret = zpci_memcpy_toio(io_addr, buf, length); out_unlock_pt: - pte_unmap_unlock(ptep, ptl); + follow_pfnmap_end(&args); out_unlock_mmap: mmap_read_unlock(current->mm); out_free: @@ -260,12 +261,11 @@ static inline int __memcpy_fromio_inuser(void __user *dst, SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr, void __user *, user_buffer, size_t, length) { + struct follow_pfnmap_args args = { }; u8 local_buf[64]; void __iomem *io_addr; void *buf; struct vm_area_struct *vma; - pte_t *ptep; - spinlock_t *ptl; long ret; if (!zpci_is_enabled()) @@ -308,11 +308,13 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr, if (!(vma->vm_flags & VM_WRITE)) goto out_unlock_mmap; - ret = follow_pte(vma, mmio_addr, &ptep, &ptl); + args.vma = vma; + args.address = mmio_addr; + ret = follow_pfnmap_start(&args); if (ret) goto out_unlock_mmap; - io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) | + io_addr = (void __iomem *)((args.pfn << PAGE_SHIFT) | (mmio_addr & ~PAGE_MASK)); if ((unsigned long) io_addr < ZPCI_IOMAP_ADDR_BASE) { @@ -322,7 +324,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr, ret = zpci_memcpy_fromio(buf, io_addr, length); out_unlock_pt: - pte_unmap_unlock(ptep, ptl); + follow_pfnmap_end(&args); out_unlock_mmap: mmap_read_unlock(current->mm); diff --git a/arch/sh/include/asm/mmzone.h b/arch/sh/include/asm/mmzone.h index 7b8dead2723d..63f88b465e39 100644 --- a/arch/sh/include/asm/mmzone.h +++ b/arch/sh/include/asm/mmzone.h @@ -5,9 +5,6 @@ #ifdef CONFIG_NUMA #include -extern struct pglist_data *node_data[]; -#define NODE_DATA(nid) (node_data[nid]) - static inline int pfn_to_nid(unsigned long pfn) { int nid; diff --git a/arch/sh/kernel/vsyscall/vsyscall.c b/arch/sh/kernel/vsyscall/vsyscall.c index 1bd85a6949c4..add35c51e017 100644 --- a/arch/sh/kernel/vsyscall/vsyscall.c +++ b/arch/sh/kernel/vsyscall/vsyscall.c @@ -36,6 +36,10 @@ __setup("vdso=", vdso_setup); */ extern const char vsyscall_trapa_start, vsyscall_trapa_end; static struct page *syscall_pages[1]; +static struct vm_special_mapping vdso_mapping = { + .name = "[vdso]", + .pages = syscall_pages, +}; int __init vsyscall_init(void) { @@ -58,6 +62,7 @@ int __init vsyscall_init(void) int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; unsigned long addr; int ret; @@ -70,14 +75,17 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) goto up_fail; } - ret = install_special_mapping(mm, addr, PAGE_SIZE, + vdso_mapping.pages = syscall_pages; + vma = _install_special_mapping(mm, addr, PAGE_SIZE, VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC, - syscall_pages); - if (unlikely(ret)) + &vdso_mapping); + ret = PTR_ERR(vma); + if (IS_ERR(vma)) goto up_fail; current->mm->context.vdso = (void *)addr; + ret = 0; up_fail: mmap_write_unlock(mm); diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index d1fe90b2f5ff..2a88b0c9e70f 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -212,12 +212,7 @@ void __init allocate_pgdat(unsigned int nid) get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); #ifdef CONFIG_NUMA - NODE_DATA(nid) = memblock_alloc_try_nid( - sizeof(struct pglist_data), - SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT, - MEMBLOCK_ALLOC_ACCESSIBLE, nid); - if (!NODE_DATA(nid)) - panic("Can't allocate pgdat for node %d\n", nid); + alloc_node_data(nid); #endif NODE_DATA(nid)->node_start_pfn = start_pfn; diff --git a/arch/sh/mm/mmap.c b/arch/sh/mm/mmap.c index bee329d4149a..c442734d9b0c 100644 --- a/arch/sh/mm/mmap.c +++ b/arch/sh/mm/mmap.c @@ -52,7 +52,8 @@ static inline unsigned long COLOUR_ALIGN(unsigned long addr, } unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) + unsigned long len, unsigned long pgoff, unsigned long flags, + vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -99,7 +100,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, const unsigned long len, const unsigned long pgoff, - const unsigned long flags) + const unsigned long flags, vm_flags_t vm_flags) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; diff --git a/arch/sh/mm/numa.c b/arch/sh/mm/numa.c index 50f0dc1744d0..9bc212b5e762 100644 --- a/arch/sh/mm/numa.c +++ b/arch/sh/mm/numa.c @@ -14,9 +14,6 @@ #include #include -struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; -EXPORT_SYMBOL_GPL(node_data); - /* * On SH machines the conventional approach is to stash system RAM * in node 0, and other memory blocks in to node 1 and up, ordered by diff --git a/arch/sparc/include/asm/mmzone.h b/arch/sparc/include/asm/mmzone.h index a236d8aa893a..74eb2c71d077 100644 --- a/arch/sparc/include/asm/mmzone.h +++ b/arch/sparc/include/asm/mmzone.h @@ -6,10 +6,6 @@ #include -extern struct pglist_data *node_data[]; - -#define NODE_DATA(nid) (node_data[nid]) - extern int numa_cpu_lookup_table[]; extern cpumask_t numa_cpumask_lookup_table[]; diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 3fe429d73a65..2b7f358762c1 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -783,6 +783,7 @@ static inline pmd_t pmd_mkwrite_novma(pmd_t pmd) return __pmd(pte_val(pte)); } +#define pmd_pgprot pmd_pgprot static inline pgprot_t pmd_pgprot(pmd_t entry) { unsigned long val = pmd_val(entry); diff --git a/arch/sparc/kernel/sys_sparc_32.c b/arch/sparc/kernel/sys_sparc_32.c index 08a19727795c..80822f922e76 100644 --- a/arch/sparc/kernel/sys_sparc_32.c +++ b/arch/sparc/kernel/sys_sparc_32.c @@ -39,7 +39,7 @@ SYSCALL_DEFINE0(getpagesize) return PAGE_SIZE; /* Possibly older binaries want 8192 on sun4's? */ } -unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) +unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { struct vm_unmapped_area_info info = {}; diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index d9c3b34ca744..acade309dc2f 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -87,7 +87,7 @@ static inline unsigned long COLOR_ALIGN(unsigned long addr, return base + off; } -unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) +unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct * vma; @@ -146,7 +146,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi unsigned long arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, const unsigned long len, const unsigned long pgoff, - const unsigned long flags) + const unsigned long flags, vm_flags_t vm_flags) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 53d7cb5bbffe..21f8cbbd0581 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -1075,14 +1075,9 @@ static void __init allocate_node_data(int nid) { struct pglist_data *p; unsigned long start_pfn, end_pfn; -#ifdef CONFIG_NUMA - NODE_DATA(nid) = memblock_alloc_node(sizeof(struct pglist_data), - SMP_CACHE_BYTES, nid); - if (!NODE_DATA(nid)) { - prom_printf("Cannot allocate pglist_data for nid[%d]\n", nid); - prom_halt(); - } +#ifdef CONFIG_NUMA + alloc_node_data(nid); NODE_DATA(nid)->node_id = nid; #endif @@ -1115,11 +1110,9 @@ static void init_node_masks_nonnuma(void) } #ifdef CONFIG_NUMA -struct pglist_data *node_data[MAX_NUMNODES]; EXPORT_SYMBOL(numa_cpu_lookup_table); EXPORT_SYMBOL(numa_cpumask_lookup_table); -EXPORT_SYMBOL(node_data); static int scan_pio_for_cfg_handle(struct mdesc_handle *md, u64 pio, u32 cfg_handle) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 273b17a5cb81..2852fcd82cbd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -28,6 +28,7 @@ config X86_64 select ARCH_HAS_GIGANTIC_PAGE select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_SUPPORTS_PER_VMA_LOCK + select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE select HAVE_ARCH_SOFT_DIRTY select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE @@ -299,6 +300,7 @@ config X86 select NEED_PER_CPU_EMBED_FIRST_CHUNK select NEED_PER_CPU_PAGE_FIRST_CHUNK select NEED_SG_DMA_LENGTH + select NUMA_MEMBLKS if NUMA select PCI_DOMAINS if PCI select PCI_LOCKLESS_CONFIG if PCI select PERF_EVENTS @@ -1601,14 +1603,6 @@ config X86_64_ACPI_NUMA help Enable ACPI SRAT based node topology detection. -config NUMA_EMU - bool "NUMA emulation" - depends on NUMA - help - Enable NUMA emulation. A flat machine will be split - into virtual nodes when booted with "numa=fake=N", where N is the - number of nodes. This is only useful for debugging. - config NODES_SHIFT int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP range 1 10 @@ -1808,6 +1802,7 @@ config X86_PAT def_bool y prompt "x86 PAT support" if EXPERT depends on MTRR + select ARCH_USES_PG_ARCH_2 help Use PAT attributes to setup page level cache control. @@ -1819,10 +1814,6 @@ config X86_PAT If unsure, say Y. -config ARCH_USES_PG_UNCACHED - def_bool y - depends on X86_PAT - config X86_UMIP def_bool y prompt "User Mode Instruction Prevention" if EXPERT diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 944454306ef4..04a35b2c26e9 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -511,7 +511,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output) if (init_unaccepted_memory()) { debug_putstr("Accepting memory... "); - accept_memory(__pa(output), __pa(output) + needed_size); + accept_memory(__pa(output), needed_size); } entry_offset = decompress_kernel(output, virt_addr, error); diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index b353a7be380c..dd8d1a85f671 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -256,6 +256,6 @@ static inline bool init_unaccepted_memory(void) { return false; } /* Defined in EFI stub */ extern struct efi_unaccepted_memory *unaccepted_table; -void accept_memory(phys_addr_t start, phys_addr_t end); +void accept_memory(phys_addr_t start, unsigned long size); #endif /* BOOT_COMPRESSED_MISC_H */ diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index a192bdea69e2..6c23d1661b17 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -11,3 +11,4 @@ generated-y += xen-hypercalls.h generic-y += early_ioremap.h generic-y += mcs_spinlock.h +generic-y += mmzone.h diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 19091ebb8633..2886cb668d7f 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -238,11 +238,6 @@ static inline bool is_64bit_mm(struct mm_struct *mm) } #endif -static inline void arch_unmap(struct mm_struct *mm, unsigned long start, - unsigned long end) -{ -} - /* * We only want to enforce protection keys on the current process * because we effectively have no access to PKRU for other diff --git a/arch/x86/include/asm/mmzone.h b/arch/x86/include/asm/mmzone.h deleted file mode 100644 index c41b41edd691..000000000000 --- a/arch/x86/include/asm/mmzone.h +++ /dev/null @@ -1,6 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifdef CONFIG_X86_32 -# include -#else -# include -#endif diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h deleted file mode 100644 index 2d4515e8b7df..000000000000 --- a/arch/x86/include/asm/mmzone_32.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Written by Pat Gaughen (gone@us.ibm.com) Mar 2002 - * - */ - -#ifndef _ASM_X86_MMZONE_32_H -#define _ASM_X86_MMZONE_32_H - -#include - -#ifdef CONFIG_NUMA -extern struct pglist_data *node_data[]; -#define NODE_DATA(nid) (node_data[nid]) -#endif /* CONFIG_NUMA */ - -#endif /* _ASM_X86_MMZONE_32_H */ diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h deleted file mode 100644 index 0c585046f744..000000000000 --- a/arch/x86/include/asm/mmzone_64.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* K8 NUMA support */ -/* Copyright 2002,2003 by Andi Kleen, SuSE Labs */ -/* 2.5 Version loosely based on the NUMAQ Code by Pat Gaughen. */ -#ifndef _ASM_X86_MMZONE_64_H -#define _ASM_X86_MMZONE_64_H - -#ifdef CONFIG_NUMA - -#include -#include - -extern struct pglist_data *node_data[]; - -#define NODE_DATA(nid) (node_data[nid]) - -#endif -#endif /* _ASM_X86_MMZONE_64_H */ diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index ef2844d69173..5469d7a7c40f 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -10,8 +10,6 @@ #ifdef CONFIG_NUMA -#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) - extern int numa_off; /* @@ -25,9 +23,6 @@ extern int numa_off; extern s16 __apicid_to_node[MAX_LOCAL_APIC]; extern nodemask_t numa_nodes_parsed __initdata; -extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); -extern void __init numa_set_distance(int from, int to, int distance); - static inline void set_apicid_to_node(int apicid, s16 node) { __apicid_to_node[apicid] = node; @@ -54,31 +49,20 @@ static inline int numa_cpu_node(int cpu) extern void numa_set_node(int cpu, int node); extern void numa_clear_node(int cpu); extern void __init init_cpu_to_node(void); -extern void numa_add_cpu(int cpu); -extern void numa_remove_cpu(int cpu); +extern void numa_add_cpu(unsigned int cpu); +extern void numa_remove_cpu(unsigned int cpu); extern void init_gi_nodes(void); #else /* CONFIG_NUMA */ static inline void numa_set_node(int cpu, int node) { } static inline void numa_clear_node(int cpu) { } static inline void init_cpu_to_node(void) { } -static inline void numa_add_cpu(int cpu) { } -static inline void numa_remove_cpu(int cpu) { } +static inline void numa_add_cpu(unsigned int cpu) { } +static inline void numa_remove_cpu(unsigned int cpu) { } static inline void init_gi_nodes(void) { } #endif /* CONFIG_NUMA */ #ifdef CONFIG_DEBUG_PER_CPU_MAPS -void debug_cpumask_set_cpu(int cpu, int node, bool enable); +void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable); #endif -#ifdef CONFIG_NUMA_EMU -#define FAKE_NODE_MIN_SIZE ((u64)32 << 20) -#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) -int numa_emu_cmdline(char *str); -#else /* CONFIG_NUMA_EMU */ -static inline int numa_emu_cmdline(char *str) -{ - return -EINVAL; -} -#endif /* CONFIG_NUMA_EMU */ - #endif /* _ASM_X86_NUMA_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index e39311a89bf4..4c2d080d26b4 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -120,6 +120,34 @@ extern pmdval_t early_pmd_flags; #define arch_end_context_switch(prev) do {} while(0) #endif /* CONFIG_PARAVIRT_XXL */ +static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) +{ + pmdval_t v = native_pmd_val(pmd); + + return native_make_pmd(v | set); +} + +static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) +{ + pmdval_t v = native_pmd_val(pmd); + + return native_make_pmd(v & ~clear); +} + +static inline pud_t pud_set_flags(pud_t pud, pudval_t set) +{ + pudval_t v = native_pud_val(pud); + + return native_make_pud(v | set); +} + +static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear) +{ + pudval_t v = native_pud_val(pud); + + return native_make_pud(v & ~clear); +} + /* * The following only work if pte_present() is true. * Undefined behaviour if not.. @@ -174,6 +202,13 @@ static inline int pud_young(pud_t pud) return pud_flags(pud) & _PAGE_ACCESSED; } +static inline bool pud_shstk(pud_t pud) +{ + return cpu_feature_enabled(X86_FEATURE_SHSTK) && + (pud_flags(pud) & (_PAGE_RW | _PAGE_DIRTY | _PAGE_PSE)) == + (_PAGE_DIRTY | _PAGE_PSE); +} + static inline int pte_write(pte_t pte) { /* @@ -310,6 +345,30 @@ static inline int pud_devmap(pud_t pud) } #endif +#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP +static inline bool pmd_special(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_SPECIAL; +} + +static inline pmd_t pmd_mkspecial(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_SPECIAL); +} +#endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */ + +#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP +static inline bool pud_special(pud_t pud) +{ + return pud_flags(pud) & _PAGE_SPECIAL; +} + +static inline pud_t pud_mkspecial(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_SPECIAL); +} +#endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */ + static inline int pgd_devmap(pgd_t pgd) { return 0; @@ -480,20 +539,6 @@ static inline pte_t pte_mkdevmap(pte_t pte) return pte_set_flags(pte, _PAGE_SPECIAL|_PAGE_DEVMAP); } -static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) -{ - pmdval_t v = native_pmd_val(pmd); - - return native_make_pmd(v | set); -} - -static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) -{ - pmdval_t v = native_pmd_val(pmd); - - return native_make_pmd(v & ~clear); -} - /* See comments above mksaveddirty_shift() */ static inline pmd_t pmd_mksaveddirty(pmd_t pmd) { @@ -588,20 +633,6 @@ static inline pmd_t pmd_mkwrite_novma(pmd_t pmd) pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); #define pmd_mkwrite pmd_mkwrite -static inline pud_t pud_set_flags(pud_t pud, pudval_t set) -{ - pudval_t v = native_pud_val(pud); - - return native_make_pud(v | set); -} - -static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear) -{ - pudval_t v = native_pud_val(pud); - - return native_make_pud(v & ~clear); -} - /* See comments above mksaveddirty_shift() */ static inline pud_t pud_mksaveddirty(pud_t pud) { @@ -780,6 +811,12 @@ static inline pmd_t pmd_mkinvalid(pmd_t pmd) __pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); } +static inline pud_t pud_mkinvalid(pud_t pud) +{ + return pfn_pud(pud_pfn(pud), + __pgprot(pud_flags(pud) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); +} + static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask); static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) @@ -827,14 +864,8 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) pmd_result = __pmd(val); /* - * To avoid creating Write=0,Dirty=1 PMDs, pte_modify() needs to avoid: - * 1. Marking Write=0 PMDs Dirty=1 - * 2. Marking Dirty=1 PMDs Write=0 - * - * The first case cannot happen because the _PAGE_CHG_MASK will filter - * out any Dirty bit passed in newprot. Handle the second case by - * going through the mksaveddirty exercise. Only do this if the old - * value was Write=1 to avoid doing this on Shadow Stack PTEs. + * Avoid creating shadow stack PMD by accident. See comment in + * pte_modify(). */ if (oldval & _PAGE_RW) pmd_result = pmd_mksaveddirty(pmd_result); @@ -844,6 +875,29 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) return pmd_result; } +static inline pud_t pud_modify(pud_t pud, pgprot_t newprot) +{ + pudval_t val = pud_val(pud), oldval = val; + pud_t pud_result; + + val &= _HPAGE_CHG_MASK; + val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK; + val = flip_protnone_guard(oldval, val, PHYSICAL_PUD_PAGE_MASK); + + pud_result = __pud(val); + + /* + * Avoid creating shadow stack PUD by accident. See comment in + * pte_modify(). + */ + if (oldval & _PAGE_RW) + pud_result = pud_mksaveddirty(pud_result); + else + pud_result = pud_clear_saveddirty(pud_result); + + return pud_result; +} + /* * mprotect needs to preserve PAT and encryption bits when updating * vm_page_prot @@ -1078,8 +1132,7 @@ static inline pmd_t *pud_pgtable(pud_t pud) #define pud_leaf pud_leaf static inline bool pud_leaf(pud_t pud) { - return (pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) == - (_PAGE_PSE | _PAGE_PRESENT); + return pud_val(pud) & _PAGE_PSE; } static inline int pud_bad(pud_t pud) @@ -1383,10 +1436,28 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, } #endif +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static inline pud_t pudp_establish(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, pud_t pud) +{ + page_table_check_pud_set(vma->vm_mm, pudp, pud); + if (IS_ENABLED(CONFIG_SMP)) { + return xchg(pudp, pud); + } else { + pud_t old = *pudp; + WRITE_ONCE(*pudp, pud); + return old; + } +} +#endif + #define __HAVE_ARCH_PMDP_INVALIDATE_AD extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); +pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp); + /* * Page table pages are page-aligned. The lower half of the top * level is used for userspace and the top half for the kernel. @@ -1668,6 +1739,9 @@ void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte); #define arch_check_zapped_pmd arch_check_zapped_pmd void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd); +#define arch_check_zapped_pud arch_check_zapped_pud +void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud); + #ifdef CONFIG_XEN_PV #define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young static inline bool arch_has_hw_nonleaf_pmd_young(void) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 3c4407271d08..7e9db77231ac 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -245,7 +245,6 @@ extern void cleanup_highmap(void); #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN -#define HAVE_ARCH_UNMAPPED_AREA_VMFLAGS #define PAGE_AGP PAGE_KERNEL_NOCACHE #define HAVE_PAGE_AGP 1 diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index 64df897c0ee3..3918c7a434f5 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h @@ -31,13 +31,4 @@ #endif /* CONFIG_SPARSEMEM */ -#ifndef __ASSEMBLY__ -#ifdef CONFIG_NUMA_KEEP_MEMINFO -extern int phys_to_target_node(phys_addr_t start); -#define phys_to_target_node phys_to_target_node -extern int memory_add_physaddr_to_nid(u64 start); -#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid -#endif -#endif /* __ASSEMBLY__ */ - #endif /* _ASM_X86_SPARSEMEM_H */ diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 01d7cd85ef97..87f8c9a71c49 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -121,7 +121,7 @@ static inline unsigned long stack_guard_placement(vm_flags_t vm_flags) } unsigned long -arch_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len, +arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; @@ -158,7 +158,7 @@ arch_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned l } unsigned long -arch_get_unmapped_area_topdown_vmflags(struct file *filp, unsigned long addr0, +arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr0, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { @@ -228,20 +228,5 @@ arch_get_unmapped_area_topdown_vmflags(struct file *filp, unsigned long addr0, * can happen with large stack limits and large mmap() * allocations. */ - return arch_get_unmapped_area(filp, addr0, len, pgoff, flags); -} - -unsigned long -arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - return arch_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0); -} - -unsigned long -arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr, - const unsigned long len, const unsigned long pgoff, - const unsigned long flags) -{ - return arch_get_unmapped_area_topdown_vmflags(filp, addr, len, pgoff, flags, 0); + return arch_get_unmapped_area(filp, addr0, len, pgoff, flags, 0); } diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 8d3a00e5c528..690fbf48e853 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -57,7 +57,6 @@ obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o obj-$(CONFIG_AMD_NUMA) += amdtopology.o obj-$(CONFIG_ACPI_NUMA) += srat.o -obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c index 9332b36a1091..628833afee37 100644 --- a/arch/x86/mm/amdtopology.c +++ b/arch/x86/mm/amdtopology.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 6ce10e3c6228..64e5cdb2460a 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -22,16 +23,6 @@ #include "numa_internal.h" int numa_off; -nodemask_t numa_nodes_parsed __initdata; - -struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; -EXPORT_SYMBOL(node_data); - -static struct numa_meminfo numa_meminfo __initdata_or_meminfo; -static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo; - -static int numa_distance_cnt; -static u8 *numa_distance; static __init int numa_setup(char *opt) { @@ -124,456 +115,27 @@ void __init setup_node_to_cpumask_map(void) pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids); } -static int __init numa_add_memblk_to(int nid, u64 start, u64 end, - struct numa_meminfo *mi) +static int __init numa_register_nodes(void) { - /* ignore zero length blks */ - if (start == end) - return 0; - - /* whine about and ignore invalid blks */ - if (start > end || nid < 0 || nid >= MAX_NUMNODES) { - pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n", - nid, start, end - 1); - return 0; - } - - if (mi->nr_blks >= NR_NODE_MEMBLKS) { - pr_err("too many memblk ranges\n"); - return -EINVAL; - } - - mi->blk[mi->nr_blks].start = start; - mi->blk[mi->nr_blks].end = end; - mi->blk[mi->nr_blks].nid = nid; - mi->nr_blks++; - return 0; -} - -/** - * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo - * @idx: Index of memblk to remove - * @mi: numa_meminfo to remove memblk from - * - * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and - * decrementing @mi->nr_blks. - */ -void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) -{ - mi->nr_blks--; - memmove(&mi->blk[idx], &mi->blk[idx + 1], - (mi->nr_blks - idx) * sizeof(mi->blk[0])); -} - -/** - * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another - * @dst: numa_meminfo to append block to - * @idx: Index of memblk to remove - * @src: numa_meminfo to remove memblk from - */ -static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx, - struct numa_meminfo *src) -{ - dst->blk[dst->nr_blks++] = src->blk[idx]; - numa_remove_memblk_from(idx, src); -} - -/** - * numa_add_memblk - Add one numa_memblk to numa_meminfo - * @nid: NUMA node ID of the new memblk - * @start: Start address of the new memblk - * @end: End address of the new memblk - * - * Add a new memblk to the default numa_meminfo. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int __init numa_add_memblk(int nid, u64 start, u64 end) -{ - return numa_add_memblk_to(nid, start, end, &numa_meminfo); -} - -/* Allocate NODE_DATA for a node on the local memory */ -static void __init alloc_node_data(int nid) -{ - const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); - u64 nd_pa; - void *nd; - int tnid; - - /* - * Allocate node data. Try node-local memory and then any node. - * Never allocate in DMA zone. - */ - nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid); - if (!nd_pa) { - pr_err("Cannot find %zu bytes in any node (initial node: %d)\n", - nd_size, nid); - return; - } - nd = __va(nd_pa); - - /* report and initialize */ - printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid, - nd_pa, nd_pa + nd_size - 1); - tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); - if (tnid != nid) - printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); - - node_data[nid] = nd; - memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); - - node_set_online(nid); -} - -/** - * numa_cleanup_meminfo - Cleanup a numa_meminfo - * @mi: numa_meminfo to clean up - * - * Sanitize @mi by merging and removing unnecessary memblks. Also check for - * conflicts and clear unused memblks. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int __init numa_cleanup_meminfo(struct numa_meminfo *mi) -{ - const u64 low = 0; - const u64 high = PFN_PHYS(max_pfn); - int i, j, k; - - /* first, trim all entries */ - for (i = 0; i < mi->nr_blks; i++) { - struct numa_memblk *bi = &mi->blk[i]; - - /* move / save reserved memory ranges */ - if (!memblock_overlaps_region(&memblock.memory, - bi->start, bi->end - bi->start)) { - numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi); - continue; - } - - /* make sure all non-reserved blocks are inside the limits */ - bi->start = max(bi->start, low); - - /* preserve info for non-RAM areas above 'max_pfn': */ - if (bi->end > high) { - numa_add_memblk_to(bi->nid, high, bi->end, - &numa_reserved_meminfo); - bi->end = high; - } - - /* and there's no empty block */ - if (bi->start >= bi->end) - numa_remove_memblk_from(i--, mi); - } - - /* merge neighboring / overlapping entries */ - for (i = 0; i < mi->nr_blks; i++) { - struct numa_memblk *bi = &mi->blk[i]; - - for (j = i + 1; j < mi->nr_blks; j++) { - struct numa_memblk *bj = &mi->blk[j]; - u64 start, end; - - /* - * See whether there are overlapping blocks. Whine - * about but allow overlaps of the same nid. They - * will be merged below. - */ - if (bi->end > bj->start && bi->start < bj->end) { - if (bi->nid != bj->nid) { - pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n", - bi->nid, bi->start, bi->end - 1, - bj->nid, bj->start, bj->end - 1); - return -EINVAL; - } - pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n", - bi->nid, bi->start, bi->end - 1, - bj->start, bj->end - 1); - } - - /* - * Join together blocks on the same node, holes - * between which don't overlap with memory on other - * nodes. - */ - if (bi->nid != bj->nid) - continue; - start = min(bi->start, bj->start); - end = max(bi->end, bj->end); - for (k = 0; k < mi->nr_blks; k++) { - struct numa_memblk *bk = &mi->blk[k]; - - if (bi->nid == bk->nid) - continue; - if (start < bk->end && end > bk->start) - break; - } - if (k < mi->nr_blks) - continue; - printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n", - bi->nid, bi->start, bi->end - 1, bj->start, - bj->end - 1, start, end - 1); - bi->start = start; - bi->end = end; - numa_remove_memblk_from(j--, mi); - } - } - - /* clear unused ones */ - for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { - mi->blk[i].start = mi->blk[i].end = 0; - mi->blk[i].nid = NUMA_NO_NODE; - } - - return 0; -} - -/* - * Set nodes, which have memory in @mi, in *@nodemask. - */ -static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, - const struct numa_meminfo *mi) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(mi->blk); i++) - if (mi->blk[i].start != mi->blk[i].end && - mi->blk[i].nid != NUMA_NO_NODE) - node_set(mi->blk[i].nid, *nodemask); -} - -/** - * numa_reset_distance - Reset NUMA distance table - * - * The current table is freed. The next numa_set_distance() call will - * create a new one. - */ -void __init numa_reset_distance(void) -{ - size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); - - /* numa_distance could be 1LU marking allocation failure, test cnt */ - if (numa_distance_cnt) - memblock_free(numa_distance, size); - numa_distance_cnt = 0; - numa_distance = NULL; /* enable table creation */ -} - -static int __init numa_alloc_distance(void) -{ - nodemask_t nodes_parsed; - size_t size; - int i, j, cnt = 0; - u64 phys; - - /* size the new table and allocate it */ - nodes_parsed = numa_nodes_parsed; - numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); - - for_each_node_mask(i, nodes_parsed) - cnt = i; - cnt++; - size = cnt * cnt * sizeof(numa_distance[0]); - - phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0, - PFN_PHYS(max_pfn_mapped)); - if (!phys) { - pr_warn("Warning: can't allocate distance table!\n"); - /* don't retry until explicitly reset */ - numa_distance = (void *)1LU; - return -ENOMEM; - } - - numa_distance = __va(phys); - numa_distance_cnt = cnt; - - /* fill with the default distances */ - for (i = 0; i < cnt; i++) - for (j = 0; j < cnt; j++) - numa_distance[i * cnt + j] = i == j ? - LOCAL_DISTANCE : REMOTE_DISTANCE; - printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); - - return 0; -} - -/** - * numa_set_distance - Set NUMA distance from one NUMA to another - * @from: the 'from' node to set distance - * @to: the 'to' node to set distance - * @distance: NUMA distance - * - * Set the distance from node @from to @to to @distance. If distance table - * doesn't exist, one which is large enough to accommodate all the currently - * known nodes will be created. - * - * If such table cannot be allocated, a warning is printed and further - * calls are ignored until the distance table is reset with - * numa_reset_distance(). - * - * If @from or @to is higher than the highest known node or lower than zero - * at the time of table creation or @distance doesn't make sense, the call - * is ignored. - * This is to allow simplification of specific NUMA config implementations. - */ -void __init numa_set_distance(int from, int to, int distance) -{ - if (!numa_distance && numa_alloc_distance() < 0) - return; - - if (from >= numa_distance_cnt || to >= numa_distance_cnt || - from < 0 || to < 0) { - pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n", - from, to, distance); - return; - } - - if ((u8)distance != distance || - (from == to && distance != LOCAL_DISTANCE)) { - pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n", - from, to, distance); - return; - } - - numa_distance[from * numa_distance_cnt + to] = distance; -} - -int __node_distance(int from, int to) -{ - if (from >= numa_distance_cnt || to >= numa_distance_cnt) - return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; - return numa_distance[from * numa_distance_cnt + to]; -} -EXPORT_SYMBOL(__node_distance); - -/* - * Mark all currently memblock-reserved physical memory (which covers the - * kernel's own memory ranges) as hot-unswappable. - */ -static void __init numa_clear_kernel_node_hotplug(void) -{ - nodemask_t reserved_nodemask = NODE_MASK_NONE; - struct memblock_region *mb_region; - int i; - - /* - * We have to do some preprocessing of memblock regions, to - * make them suitable for reservation. - * - * At this time, all memory regions reserved by memblock are - * used by the kernel, but those regions are not split up - * along node boundaries yet, and don't necessarily have their - * node ID set yet either. - * - * So iterate over all memory known to the x86 architecture, - * and use those ranges to set the nid in memblock.reserved. - * This will split up the memblock regions along node - * boundaries and will set the node IDs as well. - */ - for (i = 0; i < numa_meminfo.nr_blks; i++) { - struct numa_memblk *mb = numa_meminfo.blk + i; - int ret; - - ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid); - WARN_ON_ONCE(ret); - } - - /* - * Now go over all reserved memblock regions, to construct a - * node mask of all kernel reserved memory areas. - * - * [ Note, when booting with mem=nn[kMG] or in a kdump kernel, - * numa_meminfo might not include all memblock.reserved - * memory ranges, because quirks such as trim_snb_memory() - * reserve specific pages for Sandy Bridge graphics. ] - */ - for_each_reserved_mem_region(mb_region) { - int nid = memblock_get_region_node(mb_region); - - if (nid != NUMA_NO_NODE) - node_set(nid, reserved_nodemask); - } - - /* - * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory - * belonging to the reserved node mask. - * - * Note that this will include memory regions that reside - * on nodes that contain kernel memory - entire nodes - * become hot-unpluggable: - */ - for (i = 0; i < numa_meminfo.nr_blks; i++) { - struct numa_memblk *mb = numa_meminfo.blk + i; - - if (!node_isset(mb->nid, reserved_nodemask)) - continue; - - memblock_clear_hotplug(mb->start, mb->end - mb->start); - } -} - -static int __init numa_register_memblks(struct numa_meminfo *mi) -{ - int i, nid; - - /* Account for nodes with cpus and no memory */ - node_possible_map = numa_nodes_parsed; - numa_nodemask_from_meminfo(&node_possible_map, mi); - if (WARN_ON(nodes_empty(node_possible_map))) - return -EINVAL; - - for (i = 0; i < mi->nr_blks; i++) { - struct numa_memblk *mb = &mi->blk[i]; - memblock_set_node(mb->start, mb->end - mb->start, - &memblock.memory, mb->nid); - } - - /* - * At very early time, the kernel have to use some memory such as - * loading the kernel image. We cannot prevent this anyway. So any - * node the kernel resides in should be un-hotpluggable. - * - * And when we come here, alloc node data won't fail. - */ - numa_clear_kernel_node_hotplug(); - - /* - * If sections array is gonna be used for pfn -> nid mapping, check - * whether its granularity is fine enough. - */ - if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) { - unsigned long pfn_align = node_map_pfn_alignment(); - - if (pfn_align && pfn_align < PAGES_PER_SECTION) { - pr_warn("Node alignment %LuMB < min %LuMB, rejecting NUMA config\n", - PFN_PHYS(pfn_align) >> 20, - PFN_PHYS(PAGES_PER_SECTION) >> 20); - return -EINVAL; - } - } + int nid; if (!memblock_validate_numa_coverage(SZ_1M)) return -EINVAL; /* Finally register nodes. */ for_each_node_mask(nid, node_possible_map) { - u64 start = PFN_PHYS(max_pfn); - u64 end = 0; + unsigned long start_pfn, end_pfn; - for (i = 0; i < mi->nr_blks; i++) { - if (nid != mi->blk[i].nid) - continue; - start = min(mi->blk[i].start, start); - end = max(mi->blk[i].end, end); - } - - if (start >= end) + /* + * Note, get_pfn_range_for_nid() depends on + * memblock_set_node() having already happened + */ + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); + if (start_pfn >= end_pfn) continue; alloc_node_data(nid); + node_set_online(nid); } /* Dump memblock with node info and return. */ @@ -609,39 +171,11 @@ static int __init numa_init(int (*init_func)(void)) for (i = 0; i < MAX_LOCAL_APIC; i++) set_apicid_to_node(i, NUMA_NO_NODE); - nodes_clear(numa_nodes_parsed); - nodes_clear(node_possible_map); - nodes_clear(node_online_map); - memset(&numa_meminfo, 0, sizeof(numa_meminfo)); - WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory, - NUMA_NO_NODE)); - WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved, - NUMA_NO_NODE)); - /* In case that parsing SRAT failed. */ - WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX)); - numa_reset_distance(); - - ret = init_func(); + ret = numa_memblks_init(init_func, /* memblock_force_top_down */ true); if (ret < 0) return ret; - /* - * We reset memblock back to the top-down direction - * here because if we configured ACPI_NUMA, we have - * parsed SRAT in init_func(). It is ok to have the - * reset here even if we did't configure ACPI_NUMA - * or acpi numa init fails and fallbacks to dummy - * numa init. - */ - memblock_set_bottom_up(false); - - ret = numa_cleanup_meminfo(&numa_meminfo); - if (ret < 0) - return ret; - - numa_emulation(&numa_meminfo, numa_distance_cnt); - - ret = numa_register_memblks(&numa_meminfo); + ret = numa_register_nodes(); if (ret < 0) return ret; @@ -782,12 +316,12 @@ void __init init_cpu_to_node(void) #ifndef CONFIG_DEBUG_PER_CPU_MAPS # ifndef CONFIG_NUMA_EMU -void numa_add_cpu(int cpu) +void numa_add_cpu(unsigned int cpu) { cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); } -void numa_remove_cpu(int cpu) +void numa_remove_cpu(unsigned int cpu) { cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); } @@ -825,7 +359,7 @@ int early_cpu_to_node(int cpu) return per_cpu(x86_cpu_to_node_map, cpu); } -void debug_cpumask_set_cpu(int cpu, int node, bool enable) +void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable) { struct cpumask *mask; @@ -857,12 +391,12 @@ static void numa_set_cpumask(int cpu, bool enable) debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable); } -void numa_add_cpu(int cpu) +void numa_add_cpu(unsigned int cpu) { numa_set_cpumask(cpu, true); } -void numa_remove_cpu(int cpu) +void numa_remove_cpu(unsigned int cpu) { numa_set_cpumask(cpu, false); } @@ -893,113 +427,29 @@ EXPORT_SYMBOL(cpumask_of_node); #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ -#ifdef CONFIG_NUMA_KEEP_MEMINFO -static int meminfo_to_nid(struct numa_meminfo *mi, u64 start) +#ifdef CONFIG_NUMA_EMU +void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys, + unsigned int nr_emu_nids) { - int i; - - for (i = 0; i < mi->nr_blks; i++) - if (mi->blk[i].start <= start && mi->blk[i].end > start) - return mi->blk[i].nid; - return NUMA_NO_NODE; -} - -int phys_to_target_node(phys_addr_t start) -{ - int nid = meminfo_to_nid(&numa_meminfo, start); + int i, j; /* - * Prefer online nodes, but if reserved memory might be - * hot-added continue the search with reserved ranges. + * Transform __apicid_to_node table to use emulated nids by + * reverse-mapping phys_nid. The maps should always exist but fall + * back to zero just in case. */ - if (nid != NUMA_NO_NODE) - return nid; - - return meminfo_to_nid(&numa_reserved_meminfo, start); -} -EXPORT_SYMBOL_GPL(phys_to_target_node); - -int memory_add_physaddr_to_nid(u64 start) -{ - int nid = meminfo_to_nid(&numa_meminfo, start); - - if (nid == NUMA_NO_NODE) - nid = numa_meminfo.blk[0].nid; - return nid; -} -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); - -#endif - -static int __init cmp_memblk(const void *a, const void *b) -{ - const struct numa_memblk *ma = *(const struct numa_memblk **)a; - const struct numa_memblk *mb = *(const struct numa_memblk **)b; - - return (ma->start > mb->start) - (ma->start < mb->start); -} - -static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata; - -/** - * numa_fill_memblks - Fill gaps in numa_meminfo memblks - * @start: address to begin fill - * @end: address to end fill - * - * Find and extend numa_meminfo memblks to cover the physical - * address range @start-@end - * - * RETURNS: - * 0 : Success - * NUMA_NO_MEMBLK : No memblks exist in address range @start-@end - */ - -int __init numa_fill_memblks(u64 start, u64 end) -{ - struct numa_memblk **blk = &numa_memblk_list[0]; - struct numa_meminfo *mi = &numa_meminfo; - int count = 0; - u64 prev_end; - - /* - * Create a list of pointers to numa_meminfo memblks that - * overlap start, end. The list is used to make in-place - * changes that fill out the numa_meminfo memblks. - */ - for (int i = 0; i < mi->nr_blks; i++) { - struct numa_memblk *bi = &mi->blk[i]; - - if (memblock_addrs_overlap(start, end - start, bi->start, - bi->end - bi->start)) { - blk[count] = &mi->blk[i]; - count++; - } + for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) { + if (__apicid_to_node[i] == NUMA_NO_NODE) + continue; + for (j = 0; j < nr_emu_nids; j++) + if (__apicid_to_node[i] == emu_nid_to_phys[j]) + break; + __apicid_to_node[i] = j < nr_emu_nids ? j : 0; } - if (!count) - return NUMA_NO_MEMBLK; - - /* Sort the list of pointers in memblk->start order */ - sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL); - - /* Make sure the first/last memblks include start/end */ - blk[0]->start = min(blk[0]->start, start); - blk[count - 1]->end = max(blk[count - 1]->end, end); - - /* - * Fill any gaps by tracking the previous memblks - * end address and backfilling to it if needed. - */ - prev_end = blk[0]->end; - for (int i = 1; i < count; i++) { - struct numa_memblk *curr = blk[i]; - - if (prev_end >= curr->start) { - if (prev_end < curr->end) - prev_end = curr->end; - } else { - curr->start = prev_end; - prev_end = curr->end; - } - } - return 0; } + +u64 __init numa_emu_dma_end(void) +{ + return PFN_PHYS(MAX_DMA32_PFN); +} +#endif /* CONFIG_NUMA_EMU */ diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h index 86860f279662..11e1ff370c10 100644 --- a/arch/x86/mm/numa_internal.h +++ b/arch/x86/mm/numa_internal.h @@ -5,30 +5,6 @@ #include #include -struct numa_memblk { - u64 start; - u64 end; - int nid; -}; - -struct numa_meminfo { - int nr_blks; - struct numa_memblk blk[NR_NODE_MEMBLKS]; -}; - -void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi); -int __init numa_cleanup_meminfo(struct numa_meminfo *mi); -void __init numa_reset_distance(void); - void __init x86_numa_init(void); -#ifdef CONFIG_NUMA_EMU -void __init numa_emulation(struct numa_meminfo *numa_meminfo, - int numa_dist_cnt); -#else -static inline void numa_emulation(struct numa_meminfo *numa_meminfo, - int numa_dist_cnt) -{ } -#endif - #endif /* __X86_MM_NUMA_INTERNAL_H */ diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index bdc2a240c2aa..f73b5ce270b3 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -104,7 +104,7 @@ __setup("debugpat", pat_debug_setup); #ifdef CONFIG_X86_PAT /* - * X86 PAT uses page flags arch_1 and uncached together to keep track of + * X86 PAT uses page flags arch_1 and arch_2 together to keep track of * memory type of pages that have backing page struct. * * X86 PAT supports 4 different memory types: @@ -118,9 +118,9 @@ __setup("debugpat", pat_debug_setup); #define _PGMT_WB 0 #define _PGMT_WC (1UL << PG_arch_1) -#define _PGMT_UC_MINUS (1UL << PG_uncached) -#define _PGMT_WT (1UL << PG_uncached | 1UL << PG_arch_1) -#define _PGMT_MASK (1UL << PG_uncached | 1UL << PG_arch_1) +#define _PGMT_UC_MINUS (1UL << PG_arch_2) +#define _PGMT_WT (1UL << PG_arch_2 | 1UL << PG_arch_1) +#define _PGMT_MASK (1UL << PG_arch_2 | 1UL << PG_arch_1) #define _PGMT_CLEAR_MASK (~_PGMT_MASK) static inline enum page_cache_mode get_page_memtype(struct page *pg) @@ -951,23 +951,20 @@ static void free_pfn_range(u64 paddr, unsigned long size) static int follow_phys(struct vm_area_struct *vma, unsigned long *prot, resource_size_t *phys) { - pte_t *ptep, pte; - spinlock_t *ptl; + struct follow_pfnmap_args args = { .vma = vma, .address = vma->vm_start }; - if (follow_pte(vma, vma->vm_start, &ptep, &ptl)) + if (follow_pfnmap_start(&args)) return -EINVAL; - pte = ptep_get(ptep); - /* Never return PFNs of anon folios in COW mappings. */ - if (vm_normal_folio(vma, vma->vm_start, pte)) { - pte_unmap_unlock(ptep, ptl); + if (!args.special) { + follow_pfnmap_end(&args); return -EINVAL; } - *prot = pgprot_val(pte_pgprot(pte)); - *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; - pte_unmap_unlock(ptep, ptl); + *prot = pgprot_val(args.pgprot); + *phys = (resource_size_t)args.pfn << PAGE_SHIFT; + follow_pfnmap_end(&args); return 0; } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index f5931499c2d6..5745a354a241 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -641,6 +641,18 @@ pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, } #endif +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ + defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) +pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp) +{ + VM_WARN_ON_ONCE(!pud_present(*pudp)); + pud_t old = pudp_establish(vma, address, pudp, pud_mkinvalid(*pudp)); + flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); + return old; +} +#endif + /** * reserve_top_address - reserves a hole in the top of kernel address space * @reserve - size of hole to reserve @@ -926,3 +938,9 @@ void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd) VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pmd_shstk(pmd)); } + +void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud) +{ + /* See note in arch_check_zapped_pte() */ + VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pud_shstk(pud)); +} diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c index 76d9f6ce7a3d..f238f7b33cdd 100644 --- a/arch/x86/um/vdso/vma.c +++ b/arch/x86/um/vdso/vma.c @@ -52,8 +52,11 @@ subsys_initcall(init_vdso); int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { - int err; + struct vm_area_struct *vma; struct mm_struct *mm = current->mm; + static struct vm_special_mapping vdso_mapping = { + .name = "[vdso]", + }; if (!vdso_enabled) return 0; @@ -61,12 +64,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) if (mmap_write_lock_killable(mm)) return -EINTR; - err = install_special_mapping(mm, um_vdso_addr, PAGE_SIZE, + vdso_mapping.pages = vdsop; + vma = _install_special_mapping(mm, um_vdso_addr, PAGE_SIZE, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - vdsop); + &vdso_mapping); mmap_write_unlock(mm); - return err; + return IS_ERR(vma) ? PTR_ERR(vma) : 0; } diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 839e6613753d..55a4996d0c04 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -665,7 +665,7 @@ static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) { spinlock_t *ptl = NULL; -#if USE_SPLIT_PTE_PTLOCKS +#if defined(CONFIG_SPLIT_PTE_PTLOCKS) ptl = ptlock_ptr(page_ptdesc(page)); spin_lock_nest_lock(ptl, &mm->page_table_lock); #endif @@ -1553,7 +1553,8 @@ static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, __set_pfn_prot(pfn, PAGE_KERNEL_RO); - if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS && !pinned) + if (level == PT_PTE && IS_ENABLED(CONFIG_SPLIT_PTE_PTLOCKS) && + !pinned) __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); xen_mc_issue(XEN_LAZY_MMU); @@ -1581,7 +1582,7 @@ static inline void xen_release_ptpage(unsigned long pfn, unsigned level) if (pinned) { xen_mc_batch(); - if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) + if (level == PT_PTE && IS_ENABLED(CONFIG_SPLIT_PTE_PTLOCKS)) __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); __set_pfn_prot(pfn, PAGE_KERNEL); diff --git a/arch/xtensa/kernel/syscall.c b/arch/xtensa/kernel/syscall.c index b3c2450d6f23..dc54f854c2f5 100644 --- a/arch/xtensa/kernel/syscall.c +++ b/arch/xtensa/kernel/syscall.c @@ -55,7 +55,8 @@ asmlinkage long xtensa_fadvise64_64(int fd, int advice, #ifdef CONFIG_MMU unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) + unsigned long len, unsigned long pgoff, unsigned long flags, + vm_flags_t vm_flags) { struct vm_area_struct *vmm; struct vma_iterator vmi; diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index 44f91f2c6c5d..bec0dcd1f9c3 100644 --- a/drivers/acpi/numa/srat.c +++ b/drivers/acpi/numa/srat.c @@ -17,6 +17,7 @@ #include #include #include +#include static nodemask_t nodes_found_map = NODE_MASK_NONE; diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 2b8fd6bb7da0..064eb52ff7e2 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -226,6 +226,7 @@ config GENERIC_ARCH_TOPOLOGY config GENERIC_ARCH_NUMA bool + select NUMA_MEMBLKS help Enable support for generic NUMA implementation. Currently, RISC-V and ARM64 use it. diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index 555aee3ee8e7..e18701676426 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -12,16 +12,12 @@ #include #include #include +#include #include -struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; -EXPORT_SYMBOL(node_data); -nodemask_t numa_nodes_parsed __initdata; static int cpu_to_node_map[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE }; -static int numa_distance_cnt; -static u8 *numa_distance; bool numa_off; static __init int numa_parse_early_param(char *opt) @@ -30,6 +26,8 @@ static __init int numa_parse_early_param(char *opt) return -EINVAL; if (str_has_prefix(opt, "off")) numa_off = true; + if (!strncmp(opt, "fake=", 5)) + return numa_emu_cmdline(opt + 5); return 0; } @@ -61,6 +59,7 @@ EXPORT_SYMBOL(cpumask_of_node); #endif +#ifndef CONFIG_NUMA_EMU static void numa_update_cpu(unsigned int cpu, bool remove) { int nid = cpu_to_node(cpu); @@ -83,6 +82,7 @@ void numa_remove_cpu(unsigned int cpu) { numa_update_cpu(cpu, true); } +#endif void numa_clear_node(unsigned int cpu) { @@ -144,7 +144,7 @@ void __init early_map_cpu_to_node(unsigned int cpu, int nid) unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); -int __init early_cpu_to_node(int cpu) +int early_cpu_to_node(int cpu) { return cpu_to_node_map[cpu]; } @@ -189,174 +189,24 @@ void __init setup_per_cpu_areas(void) } #endif -/** - * numa_add_memblk() - Set node id to memblk - * @nid: NUMA node ID of the new memblk - * @start: Start address of the new memblk - * @end: End address of the new memblk - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int __init numa_add_memblk(int nid, u64 start, u64 end) -{ - int ret; - - ret = memblock_set_node(start, (end - start), &memblock.memory, nid); - if (ret < 0) { - pr_err("memblock [0x%llx - 0x%llx] failed to add on node %d\n", - start, (end - 1), nid); - return ret; - } - - node_set(nid, numa_nodes_parsed); - return ret; -} - /* * Initialize NODE_DATA for a node on the local memory */ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) { - const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES); - u64 nd_pa; - void *nd; - int tnid; - if (start_pfn >= end_pfn) pr_info("Initmem setup node %d []\n", nid); - nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid); - if (!nd_pa) - panic("Cannot allocate %zu bytes for node %d data\n", - nd_size, nid); + alloc_node_data(nid); - nd = __va(nd_pa); - - /* report and initialize */ - pr_info("NODE_DATA [mem %#010Lx-%#010Lx]\n", - nd_pa, nd_pa + nd_size - 1); - tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); - if (tnid != nid) - pr_info("NODE_DATA(%d) on node %d\n", nid, tnid); - - node_data[nid] = nd; - memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); NODE_DATA(nid)->node_id = nid; NODE_DATA(nid)->node_start_pfn = start_pfn; NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn; } -/* - * numa_free_distance - * - * The current table is freed. - */ -void __init numa_free_distance(void) -{ - size_t size; - - if (!numa_distance) - return; - - size = numa_distance_cnt * numa_distance_cnt * - sizeof(numa_distance[0]); - - memblock_free(numa_distance, size); - numa_distance_cnt = 0; - numa_distance = NULL; -} - -/* - * Create a new NUMA distance table. - */ -static int __init numa_alloc_distance(void) -{ - size_t size; - int i, j; - - size = nr_node_ids * nr_node_ids * sizeof(numa_distance[0]); - numa_distance = memblock_alloc(size, PAGE_SIZE); - if (WARN_ON(!numa_distance)) - return -ENOMEM; - - numa_distance_cnt = nr_node_ids; - - /* fill with the default distances */ - for (i = 0; i < numa_distance_cnt; i++) - for (j = 0; j < numa_distance_cnt; j++) - numa_distance[i * numa_distance_cnt + j] = i == j ? - LOCAL_DISTANCE : REMOTE_DISTANCE; - - pr_debug("Initialized distance table, cnt=%d\n", numa_distance_cnt); - - return 0; -} - -/** - * numa_set_distance() - Set inter node NUMA distance from node to node. - * @from: the 'from' node to set distance - * @to: the 'to' node to set distance - * @distance: NUMA distance - * - * Set the distance from node @from to @to to @distance. - * If distance table doesn't exist, a warning is printed. - * - * If @from or @to is higher than the highest known node or lower than zero - * or @distance doesn't make sense, the call is ignored. - */ -void __init numa_set_distance(int from, int to, int distance) -{ - if (!numa_distance) { - pr_warn_once("Warning: distance table not allocated yet\n"); - return; - } - - if (from >= numa_distance_cnt || to >= numa_distance_cnt || - from < 0 || to < 0) { - pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n", - from, to, distance); - return; - } - - if ((u8)distance != distance || - (from == to && distance != LOCAL_DISTANCE)) { - pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n", - from, to, distance); - return; - } - - numa_distance[from * numa_distance_cnt + to] = distance; -} - -/* - * Return NUMA distance @from to @to - */ -int __node_distance(int from, int to) -{ - if (from >= numa_distance_cnt || to >= numa_distance_cnt) - return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; - return numa_distance[from * numa_distance_cnt + to]; -} -EXPORT_SYMBOL(__node_distance); - static int __init numa_register_nodes(void) { int nid; - struct memblock_region *mblk; - - /* Check that valid nid is set to memblks */ - for_each_mem_region(mblk) { - int mblk_nid = memblock_get_region_node(mblk); - phys_addr_t start = mblk->base; - phys_addr_t end = mblk->base + mblk->size - 1; - - if (mblk_nid == NUMA_NO_NODE || mblk_nid >= MAX_NUMNODES) { - pr_warn("Warning: invalid memblk node %d [mem %pap-%pap]\n", - mblk_nid, &start, &end); - return -EINVAL; - } - } /* Finally register nodes. */ for_each_node_mask(nid, numa_nodes_parsed) { @@ -381,11 +231,7 @@ static int __init numa_init(int (*init_func)(void)) nodes_clear(node_possible_map); nodes_clear(node_online_map); - ret = numa_alloc_distance(); - if (ret < 0) - return ret; - - ret = init_func(); + ret = numa_memblks_init(init_func, /* memblock_force_top_down */ false); if (ret < 0) goto out_free_distance; @@ -403,7 +249,7 @@ static int __init numa_init(int (*init_func)(void)) return 0; out_free_distance: - numa_free_distance(); + numa_reset_distance(); return ret; } @@ -433,6 +279,7 @@ static int __init dummy_numa_init(void) pr_err("NUMA init failed\n"); return ret; } + node_set(0, numa_nodes_parsed); numa_off = true; return 0; @@ -475,3 +322,54 @@ void __init arch_numa_init(void) numa_init(dummy_numa_init); } + +#ifdef CONFIG_NUMA_EMU +void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys, + unsigned int nr_emu_nids) +{ + int i, j; + + /* + * Transform cpu_to_node_map table to use emulated nids by + * reverse-mapping phys_nid. The maps should always exist but fall + * back to zero just in case. + */ + for (i = 0; i < ARRAY_SIZE(cpu_to_node_map); i++) { + if (cpu_to_node_map[i] == NUMA_NO_NODE) + continue; + for (j = 0; j < nr_emu_nids; j++) + if (cpu_to_node_map[i] == emu_nid_to_phys[j]) + break; + cpu_to_node_map[i] = j < nr_emu_nids ? j : 0; + } +} + +u64 __init numa_emu_dma_end(void) +{ + return memblock_start_of_DRAM() + SZ_4G; +} + +void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable) +{ + struct cpumask *mask; + + if (node == NUMA_NO_NODE) + return; + + mask = node_to_cpumask_map[node]; + if (!cpumask_available(mask)) { + pr_err("node_to_cpumask_map[%i] NULL\n", node); + dump_stack(); + return; + } + + if (enable) + cpumask_set_cpu(cpu, mask); + else + cpumask_clear_cpu(cpu, mask); + + pr_debug("%s cpu %d node %d: mask now %*pbl\n", + enable ? "numa_add_cpu" : "numa_remove_cpu", + cpu, node, cpumask_pr_args(mask)); +} +#endif /* CONFIG_NUMA_EMU */ diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index eacf1cba7bf4..6aea609b795c 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -2,8 +2,6 @@ config ZRAM tristate "Compressed RAM block device support" depends on BLOCK && SYSFS && MMU - depends on HAVE_ZSMALLOC - depends on CRYPTO_LZO || CRYPTO_ZSTD || CRYPTO_LZ4 || CRYPTO_LZ4HC || CRYPTO_842 select ZSMALLOC help Creates virtual block devices called /dev/zramX (X = 0, 1, ...). @@ -16,6 +14,49 @@ config ZRAM See Documentation/admin-guide/blockdev/zram.rst for more information. +config ZRAM_BACKEND_LZ4 + bool "lz4 compression support" + depends on ZRAM + select LZ4_COMPRESS + select LZ4_DECOMPRESS + +config ZRAM_BACKEND_LZ4HC + bool "lz4hc compression support" + depends on ZRAM + select LZ4HC_COMPRESS + select LZ4_DECOMPRESS + +config ZRAM_BACKEND_ZSTD + bool "zstd compression support" + depends on ZRAM + select ZSTD_COMPRESS + select ZSTD_DECOMPRESS + +config ZRAM_BACKEND_DEFLATE + bool "deflate compression support" + depends on ZRAM + select ZLIB_DEFLATE + select ZLIB_INFLATE + +config ZRAM_BACKEND_842 + bool "842 compression support" + depends on ZRAM + select 842_COMPRESS + select 842_DECOMPRESS + +config ZRAM_BACKEND_FORCE_LZO + depends on ZRAM + def_bool !ZRAM_BACKEND_LZ4 && !ZRAM_BACKEND_LZ4HC && \ + !ZRAM_BACKEND_ZSTD && !ZRAM_BACKEND_DEFLATE && \ + !ZRAM_BACKEND_842 + +config ZRAM_BACKEND_LZO + bool "lzo and lzo-rle compression support" if !ZRAM_BACKEND_FORCE_LZO + depends on ZRAM + default ZRAM_BACKEND_FORCE_LZO + select LZO_COMPRESS + select LZO_DECOMPRESS + choice prompt "Default zram compressor" default ZRAM_DEF_COMP_LZORLE @@ -23,38 +64,44 @@ choice config ZRAM_DEF_COMP_LZORLE bool "lzo-rle" - depends on CRYPTO_LZO - -config ZRAM_DEF_COMP_ZSTD - bool "zstd" - depends on CRYPTO_ZSTD - -config ZRAM_DEF_COMP_LZ4 - bool "lz4" - depends on CRYPTO_LZ4 + depends on ZRAM_BACKEND_LZO config ZRAM_DEF_COMP_LZO bool "lzo" - depends on CRYPTO_LZO + depends on ZRAM_BACKEND_LZO + +config ZRAM_DEF_COMP_LZ4 + bool "lz4" + depends on ZRAM_BACKEND_LZ4 config ZRAM_DEF_COMP_LZ4HC bool "lz4hc" - depends on CRYPTO_LZ4HC + depends on ZRAM_BACKEND_LZ4HC + +config ZRAM_DEF_COMP_ZSTD + bool "zstd" + depends on ZRAM_BACKEND_ZSTD + +config ZRAM_DEF_COMP_DEFLATE + bool "deflate" + depends on ZRAM_BACKEND_DEFLATE config ZRAM_DEF_COMP_842 bool "842" - depends on CRYPTO_842 + depends on ZRAM_BACKEND_842 endchoice config ZRAM_DEF_COMP string default "lzo-rle" if ZRAM_DEF_COMP_LZORLE - default "zstd" if ZRAM_DEF_COMP_ZSTD - default "lz4" if ZRAM_DEF_COMP_LZ4 default "lzo" if ZRAM_DEF_COMP_LZO + default "lz4" if ZRAM_DEF_COMP_LZ4 default "lz4hc" if ZRAM_DEF_COMP_LZ4HC + default "zstd" if ZRAM_DEF_COMP_ZSTD + default "deflate" if ZRAM_DEF_COMP_DEFLATE default "842" if ZRAM_DEF_COMP_842 + default "unset-value" config ZRAM_WRITEBACK bool "Write back incompressible or idle page to backing device" diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile index de9e457907b1..0fdefd576691 100644 --- a/drivers/block/zram/Makefile +++ b/drivers/block/zram/Makefile @@ -1,4 +1,12 @@ # SPDX-License-Identifier: GPL-2.0-only + zram-y := zcomp.o zram_drv.o +zram-$(CONFIG_ZRAM_BACKEND_LZO) += backend_lzorle.o backend_lzo.o +zram-$(CONFIG_ZRAM_BACKEND_LZ4) += backend_lz4.o +zram-$(CONFIG_ZRAM_BACKEND_LZ4HC) += backend_lz4hc.o +zram-$(CONFIG_ZRAM_BACKEND_ZSTD) += backend_zstd.o +zram-$(CONFIG_ZRAM_BACKEND_DEFLATE) += backend_deflate.o +zram-$(CONFIG_ZRAM_BACKEND_842) += backend_842.o + obj-$(CONFIG_ZRAM) += zram.o diff --git a/drivers/block/zram/backend_842.c b/drivers/block/zram/backend_842.c new file mode 100644 index 000000000000..10d9d5c60f53 --- /dev/null +++ b/drivers/block/zram/backend_842.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include + +#include "backend_842.h" + +static void release_params_842(struct zcomp_params *params) +{ +} + +static int setup_params_842(struct zcomp_params *params) +{ + return 0; +} + +static void destroy_842(struct zcomp_ctx *ctx) +{ + kfree(ctx->context); +} + +static int create_842(struct zcomp_params *params, struct zcomp_ctx *ctx) +{ + ctx->context = kmalloc(SW842_MEM_COMPRESS, GFP_KERNEL); + if (!ctx->context) + return -ENOMEM; + return 0; +} + +static int compress_842(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req) +{ + unsigned int dlen = req->dst_len; + int ret; + + ret = sw842_compress(req->src, req->src_len, req->dst, &dlen, + ctx->context); + if (ret == 0) + req->dst_len = dlen; + return ret; +} + +static int decompress_842(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req) +{ + unsigned int dlen = req->dst_len; + + return sw842_decompress(req->src, req->src_len, req->dst, &dlen); +} + +const struct zcomp_ops backend_842 = { + .compress = compress_842, + .decompress = decompress_842, + .create_ctx = create_842, + .destroy_ctx = destroy_842, + .setup_params = setup_params_842, + .release_params = release_params_842, + .name = "842", +}; diff --git a/drivers/block/zram/backend_842.h b/drivers/block/zram/backend_842.h new file mode 100644 index 000000000000..4dc85c188799 --- /dev/null +++ b/drivers/block/zram/backend_842.h @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#ifndef __BACKEND_842_H__ +#define __BACKEND_842_H__ + +#include "zcomp.h" + +extern const struct zcomp_ops backend_842; + +#endif /* __BACKEND_842_H__ */ diff --git a/drivers/block/zram/backend_deflate.c b/drivers/block/zram/backend_deflate.c new file mode 100644 index 000000000000..0f7f252c12f4 --- /dev/null +++ b/drivers/block/zram/backend_deflate.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include + +#include "backend_deflate.h" + +/* Use the same value as crypto API */ +#define DEFLATE_DEF_WINBITS 11 +#define DEFLATE_DEF_MEMLEVEL MAX_MEM_LEVEL + +struct deflate_ctx { + struct z_stream_s cctx; + struct z_stream_s dctx; +}; + +static void deflate_release_params(struct zcomp_params *params) +{ +} + +static int deflate_setup_params(struct zcomp_params *params) +{ + if (params->level == ZCOMP_PARAM_NO_LEVEL) + params->level = Z_DEFAULT_COMPRESSION; + + return 0; +} + +static void deflate_destroy(struct zcomp_ctx *ctx) +{ + struct deflate_ctx *zctx = ctx->context; + + if (!zctx) + return; + + if (zctx->cctx.workspace) { + zlib_deflateEnd(&zctx->cctx); + vfree(zctx->cctx.workspace); + } + if (zctx->dctx.workspace) { + zlib_inflateEnd(&zctx->dctx); + vfree(zctx->dctx.workspace); + } + kfree(zctx); +} + +static int deflate_create(struct zcomp_params *params, struct zcomp_ctx *ctx) +{ + struct deflate_ctx *zctx; + size_t sz; + int ret; + + zctx = kzalloc(sizeof(*zctx), GFP_KERNEL); + if (!zctx) + return -ENOMEM; + + ctx->context = zctx; + sz = zlib_deflate_workspacesize(-DEFLATE_DEF_WINBITS, MAX_MEM_LEVEL); + zctx->cctx.workspace = vzalloc(sz); + if (!zctx->cctx.workspace) + goto error; + + ret = zlib_deflateInit2(&zctx->cctx, params->level, Z_DEFLATED, + -DEFLATE_DEF_WINBITS, DEFLATE_DEF_MEMLEVEL, + Z_DEFAULT_STRATEGY); + if (ret != Z_OK) + goto error; + + sz = zlib_inflate_workspacesize(); + zctx->dctx.workspace = vzalloc(sz); + if (!zctx->dctx.workspace) + goto error; + + ret = zlib_inflateInit2(&zctx->dctx, -DEFLATE_DEF_WINBITS); + if (ret != Z_OK) + goto error; + + return 0; + +error: + deflate_destroy(ctx); + return -EINVAL; +} + +static int deflate_compress(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req) +{ + struct deflate_ctx *zctx = ctx->context; + struct z_stream_s *deflate; + int ret; + + deflate = &zctx->cctx; + ret = zlib_deflateReset(deflate); + if (ret != Z_OK) + return -EINVAL; + + deflate->next_in = (u8 *)req->src; + deflate->avail_in = req->src_len; + deflate->next_out = (u8 *)req->dst; + deflate->avail_out = req->dst_len; + + ret = zlib_deflate(deflate, Z_FINISH); + if (ret != Z_STREAM_END) + return -EINVAL; + + req->dst_len = deflate->total_out; + return 0; +} + +static int deflate_decompress(struct zcomp_params *params, + struct zcomp_ctx *ctx, + struct zcomp_req *req) +{ + struct deflate_ctx *zctx = ctx->context; + struct z_stream_s *inflate; + int ret; + + inflate = &zctx->dctx; + + ret = zlib_inflateReset(inflate); + if (ret != Z_OK) + return -EINVAL; + + inflate->next_in = (u8 *)req->src; + inflate->avail_in = req->src_len; + inflate->next_out = (u8 *)req->dst; + inflate->avail_out = req->dst_len; + + ret = zlib_inflate(inflate, Z_SYNC_FLUSH); + if (ret != Z_STREAM_END) + return -EINVAL; + + return 0; +} + +const struct zcomp_ops backend_deflate = { + .compress = deflate_compress, + .decompress = deflate_decompress, + .create_ctx = deflate_create, + .destroy_ctx = deflate_destroy, + .setup_params = deflate_setup_params, + .release_params = deflate_release_params, + .name = "deflate", +}; diff --git a/drivers/block/zram/backend_deflate.h b/drivers/block/zram/backend_deflate.h new file mode 100644 index 000000000000..a39ac12b114c --- /dev/null +++ b/drivers/block/zram/backend_deflate.h @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#ifndef __BACKEND_DEFLATE_H__ +#define __BACKEND_DEFLATE_H__ + +#include "zcomp.h" + +extern const struct zcomp_ops backend_deflate; + +#endif /* __BACKEND_DEFLATE_H__ */ diff --git a/drivers/block/zram/backend_lz4.c b/drivers/block/zram/backend_lz4.c new file mode 100644 index 000000000000..847f3334eb38 --- /dev/null +++ b/drivers/block/zram/backend_lz4.c @@ -0,0 +1,127 @@ +#include +#include +#include +#include + +#include "backend_lz4.h" + +struct lz4_ctx { + void *mem; + + LZ4_streamDecode_t *dstrm; + LZ4_stream_t *cstrm; +}; + +static void lz4_release_params(struct zcomp_params *params) +{ +} + +static int lz4_setup_params(struct zcomp_params *params) +{ + if (params->level == ZCOMP_PARAM_NO_LEVEL) + params->level = LZ4_ACCELERATION_DEFAULT; + + return 0; +} + +static void lz4_destroy(struct zcomp_ctx *ctx) +{ + struct lz4_ctx *zctx = ctx->context; + + if (!zctx) + return; + + vfree(zctx->mem); + kfree(zctx->dstrm); + kfree(zctx->cstrm); + kfree(zctx); +} + +static int lz4_create(struct zcomp_params *params, struct zcomp_ctx *ctx) +{ + struct lz4_ctx *zctx; + + zctx = kzalloc(sizeof(*zctx), GFP_KERNEL); + if (!zctx) + return -ENOMEM; + + ctx->context = zctx; + if (params->dict_sz == 0) { + zctx->mem = vmalloc(LZ4_MEM_COMPRESS); + if (!zctx->mem) + goto error; + } else { + zctx->dstrm = kzalloc(sizeof(*zctx->dstrm), GFP_KERNEL); + if (!zctx->dstrm) + goto error; + + zctx->cstrm = kzalloc(sizeof(*zctx->cstrm), GFP_KERNEL); + if (!zctx->cstrm) + goto error; + } + + return 0; + +error: + lz4_destroy(ctx); + return -ENOMEM; +} + +static int lz4_compress(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req) +{ + struct lz4_ctx *zctx = ctx->context; + int ret; + + if (!zctx->cstrm) { + ret = LZ4_compress_fast(req->src, req->dst, req->src_len, + req->dst_len, params->level, + zctx->mem); + } else { + /* Cstrm needs to be reset */ + ret = LZ4_loadDict(zctx->cstrm, params->dict, params->dict_sz); + if (ret != params->dict_sz) + return -EINVAL; + ret = LZ4_compress_fast_continue(zctx->cstrm, req->src, + req->dst, req->src_len, + req->dst_len, params->level); + } + if (!ret) + return -EINVAL; + req->dst_len = ret; + return 0; +} + +static int lz4_decompress(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req) +{ + struct lz4_ctx *zctx = ctx->context; + int ret; + + if (!zctx->dstrm) { + ret = LZ4_decompress_safe(req->src, req->dst, req->src_len, + req->dst_len); + } else { + /* Dstrm needs to be reset */ + ret = LZ4_setStreamDecode(zctx->dstrm, params->dict, + params->dict_sz); + if (!ret) + return -EINVAL; + ret = LZ4_decompress_safe_continue(zctx->dstrm, req->src, + req->dst, req->src_len, + req->dst_len); + } + if (ret < 0) + return -EINVAL; + return 0; +} + +const struct zcomp_ops backend_lz4 = { + .compress = lz4_compress, + .decompress = lz4_decompress, + .create_ctx = lz4_create, + .destroy_ctx = lz4_destroy, + .setup_params = lz4_setup_params, + .release_params = lz4_release_params, + .name = "lz4", +}; diff --git a/drivers/block/zram/backend_lz4.h b/drivers/block/zram/backend_lz4.h new file mode 100644 index 000000000000..c11fa602a703 --- /dev/null +++ b/drivers/block/zram/backend_lz4.h @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#ifndef __BACKEND_LZ4_H__ +#define __BACKEND_LZ4_H__ + +#include "zcomp.h" + +extern const struct zcomp_ops backend_lz4; + +#endif /* __BACKEND_LZ4_H__ */ diff --git a/drivers/block/zram/backend_lz4hc.c b/drivers/block/zram/backend_lz4hc.c new file mode 100644 index 000000000000..5f37d5abcaeb --- /dev/null +++ b/drivers/block/zram/backend_lz4hc.c @@ -0,0 +1,128 @@ +#include +#include +#include +#include + +#include "backend_lz4hc.h" + +struct lz4hc_ctx { + void *mem; + + LZ4_streamDecode_t *dstrm; + LZ4_streamHC_t *cstrm; +}; + +static void lz4hc_release_params(struct zcomp_params *params) +{ +} + +static int lz4hc_setup_params(struct zcomp_params *params) +{ + if (params->level == ZCOMP_PARAM_NO_LEVEL) + params->level = LZ4HC_DEFAULT_CLEVEL; + + return 0; +} + +static void lz4hc_destroy(struct zcomp_ctx *ctx) +{ + struct lz4hc_ctx *zctx = ctx->context; + + if (!zctx) + return; + + kfree(zctx->dstrm); + kfree(zctx->cstrm); + vfree(zctx->mem); + kfree(zctx); +} + +static int lz4hc_create(struct zcomp_params *params, struct zcomp_ctx *ctx) +{ + struct lz4hc_ctx *zctx; + + zctx = kzalloc(sizeof(*zctx), GFP_KERNEL); + if (!zctx) + return -ENOMEM; + + ctx->context = zctx; + if (params->dict_sz == 0) { + zctx->mem = vmalloc(LZ4HC_MEM_COMPRESS); + if (!zctx->mem) + goto error; + } else { + zctx->dstrm = kzalloc(sizeof(*zctx->dstrm), GFP_KERNEL); + if (!zctx->dstrm) + goto error; + + zctx->cstrm = kzalloc(sizeof(*zctx->cstrm), GFP_KERNEL); + if (!zctx->cstrm) + goto error; + } + + return 0; + +error: + lz4hc_destroy(ctx); + return -EINVAL; +} + +static int lz4hc_compress(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req) +{ + struct lz4hc_ctx *zctx = ctx->context; + int ret; + + if (!zctx->cstrm) { + ret = LZ4_compress_HC(req->src, req->dst, req->src_len, + req->dst_len, params->level, + zctx->mem); + } else { + /* Cstrm needs to be reset */ + LZ4_resetStreamHC(zctx->cstrm, params->level); + ret = LZ4_loadDictHC(zctx->cstrm, params->dict, + params->dict_sz); + if (ret != params->dict_sz) + return -EINVAL; + ret = LZ4_compress_HC_continue(zctx->cstrm, req->src, req->dst, + req->src_len, req->dst_len); + } + if (!ret) + return -EINVAL; + req->dst_len = ret; + return 0; +} + +static int lz4hc_decompress(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req) +{ + struct lz4hc_ctx *zctx = ctx->context; + int ret; + + if (!zctx->dstrm) { + ret = LZ4_decompress_safe(req->src, req->dst, req->src_len, + req->dst_len); + } else { + /* Dstrm needs to be reset */ + ret = LZ4_setStreamDecode(zctx->dstrm, params->dict, + params->dict_sz); + if (!ret) + return -EINVAL; + ret = LZ4_decompress_safe_continue(zctx->dstrm, req->src, + req->dst, req->src_len, + req->dst_len); + } + if (ret < 0) + return -EINVAL; + return 0; +} + +const struct zcomp_ops backend_lz4hc = { + .compress = lz4hc_compress, + .decompress = lz4hc_decompress, + .create_ctx = lz4hc_create, + .destroy_ctx = lz4hc_destroy, + .setup_params = lz4hc_setup_params, + .release_params = lz4hc_release_params, + .name = "lz4hc", +}; diff --git a/drivers/block/zram/backend_lz4hc.h b/drivers/block/zram/backend_lz4hc.h new file mode 100644 index 000000000000..6de03551ed4d --- /dev/null +++ b/drivers/block/zram/backend_lz4hc.h @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#ifndef __BACKEND_LZ4HC_H__ +#define __BACKEND_LZ4HC_H__ + +#include "zcomp.h" + +extern const struct zcomp_ops backend_lz4hc; + +#endif /* __BACKEND_LZ4HC_H__ */ diff --git a/drivers/block/zram/backend_lzo.c b/drivers/block/zram/backend_lzo.c new file mode 100644 index 000000000000..4c906beaae6b --- /dev/null +++ b/drivers/block/zram/backend_lzo.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include + +#include "backend_lzo.h" + +static void lzo_release_params(struct zcomp_params *params) +{ +} + +static int lzo_setup_params(struct zcomp_params *params) +{ + return 0; +} + +static int lzo_create(struct zcomp_params *params, struct zcomp_ctx *ctx) +{ + ctx->context = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); + if (!ctx->context) + return -ENOMEM; + return 0; +} + +static void lzo_destroy(struct zcomp_ctx *ctx) +{ + kfree(ctx->context); +} + +static int lzo_compress(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req) +{ + int ret; + + ret = lzo1x_1_compress(req->src, req->src_len, req->dst, + &req->dst_len, ctx->context); + return ret == LZO_E_OK ? 0 : ret; +} + +static int lzo_decompress(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req) +{ + int ret; + + ret = lzo1x_decompress_safe(req->src, req->src_len, + req->dst, &req->dst_len); + return ret == LZO_E_OK ? 0 : ret; +} + +const struct zcomp_ops backend_lzo = { + .compress = lzo_compress, + .decompress = lzo_decompress, + .create_ctx = lzo_create, + .destroy_ctx = lzo_destroy, + .setup_params = lzo_setup_params, + .release_params = lzo_release_params, + .name = "lzo", +}; diff --git a/drivers/block/zram/backend_lzo.h b/drivers/block/zram/backend_lzo.h new file mode 100644 index 000000000000..93d54749e63c --- /dev/null +++ b/drivers/block/zram/backend_lzo.h @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#ifndef __BACKEND_LZO_H__ +#define __BACKEND_LZO_H__ + +#include "zcomp.h" + +extern const struct zcomp_ops backend_lzo; + +#endif /* __BACKEND_LZO_H__ */ diff --git a/drivers/block/zram/backend_lzorle.c b/drivers/block/zram/backend_lzorle.c new file mode 100644 index 000000000000..10640c96cbfc --- /dev/null +++ b/drivers/block/zram/backend_lzorle.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include + +#include "backend_lzorle.h" + +static void lzorle_release_params(struct zcomp_params *params) +{ +} + +static int lzorle_setup_params(struct zcomp_params *params) +{ + return 0; +} + +static int lzorle_create(struct zcomp_params *params, struct zcomp_ctx *ctx) +{ + ctx->context = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); + if (!ctx->context) + return -ENOMEM; + return 0; +} + +static void lzorle_destroy(struct zcomp_ctx *ctx) +{ + kfree(ctx->context); +} + +static int lzorle_compress(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req) +{ + int ret; + + ret = lzorle1x_1_compress(req->src, req->src_len, req->dst, + &req->dst_len, ctx->context); + return ret == LZO_E_OK ? 0 : ret; +} + +static int lzorle_decompress(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req) +{ + int ret; + + ret = lzo1x_decompress_safe(req->src, req->src_len, + req->dst, &req->dst_len); + return ret == LZO_E_OK ? 0 : ret; +} + +const struct zcomp_ops backend_lzorle = { + .compress = lzorle_compress, + .decompress = lzorle_decompress, + .create_ctx = lzorle_create, + .destroy_ctx = lzorle_destroy, + .setup_params = lzorle_setup_params, + .release_params = lzorle_release_params, + .name = "lzo-rle", +}; diff --git a/drivers/block/zram/backend_lzorle.h b/drivers/block/zram/backend_lzorle.h new file mode 100644 index 000000000000..6ecb163b09f1 --- /dev/null +++ b/drivers/block/zram/backend_lzorle.h @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#ifndef __BACKEND_LZORLE_H__ +#define __BACKEND_LZORLE_H__ + +#include "zcomp.h" + +extern const struct zcomp_ops backend_lzorle; + +#endif /* __BACKEND_LZORLE_H__ */ diff --git a/drivers/block/zram/backend_zstd.c b/drivers/block/zram/backend_zstd.c new file mode 100644 index 000000000000..1184c0036f44 --- /dev/null +++ b/drivers/block/zram/backend_zstd.c @@ -0,0 +1,226 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include + +#include "backend_zstd.h" + +struct zstd_ctx { + zstd_cctx *cctx; + zstd_dctx *dctx; + void *cctx_mem; + void *dctx_mem; +}; + +struct zstd_params { + zstd_custom_mem custom_mem; + zstd_cdict *cdict; + zstd_ddict *ddict; + zstd_parameters cprm; +}; + +/* + * For C/D dictionaries we need to provide zstd with zstd_custom_mem, + * which zstd uses internally to allocate/free memory when needed. + * + * This means that allocator.customAlloc() can be called from zcomp_compress() + * under local-lock (per-CPU compression stream), in which case we must use + * GFP_ATOMIC. + * + * Another complication here is that we can be configured as a swap device. + */ +static void *zstd_custom_alloc(void *opaque, size_t size) +{ + if (!preemptible()) + return kvzalloc(size, GFP_ATOMIC); + + return kvzalloc(size, __GFP_KSWAPD_RECLAIM | __GFP_NOWARN); +} + +static void zstd_custom_free(void *opaque, void *address) +{ + kvfree(address); +} + +static void zstd_release_params(struct zcomp_params *params) +{ + struct zstd_params *zp = params->drv_data; + + params->drv_data = NULL; + if (!zp) + return; + + zstd_free_cdict(zp->cdict); + zstd_free_ddict(zp->ddict); + kfree(zp); +} + +static int zstd_setup_params(struct zcomp_params *params) +{ + zstd_compression_parameters prm; + struct zstd_params *zp; + + zp = kzalloc(sizeof(*zp), GFP_KERNEL); + if (!zp) + return -ENOMEM; + + params->drv_data = zp; + if (params->level == ZCOMP_PARAM_NO_LEVEL) + params->level = zstd_default_clevel(); + + zp->cprm = zstd_get_params(params->level, PAGE_SIZE); + + zp->custom_mem.customAlloc = zstd_custom_alloc; + zp->custom_mem.customFree = zstd_custom_free; + + prm = zstd_get_cparams(params->level, PAGE_SIZE, + params->dict_sz); + + zp->cdict = zstd_create_cdict_byreference(params->dict, + params->dict_sz, + prm, + zp->custom_mem); + if (!zp->cdict) + goto error; + + zp->ddict = zstd_create_ddict_byreference(params->dict, + params->dict_sz, + zp->custom_mem); + if (!zp->ddict) + goto error; + + return 0; + +error: + zstd_release_params(params); + return -EINVAL; +} + +static void zstd_destroy(struct zcomp_ctx *ctx) +{ + struct zstd_ctx *zctx = ctx->context; + + if (!zctx) + return; + + /* + * If ->cctx_mem and ->dctx_mem were allocated then we didn't use + * C/D dictionary and ->cctx / ->dctx were "embedded" into these + * buffers. + * + * If otherwise then we need to explicitly release ->cctx / ->dctx. + */ + if (zctx->cctx_mem) + vfree(zctx->cctx_mem); + else + zstd_free_cctx(zctx->cctx); + + if (zctx->dctx_mem) + vfree(zctx->dctx_mem); + else + zstd_free_dctx(zctx->dctx); + + kfree(zctx); +} + +static int zstd_create(struct zcomp_params *params, struct zcomp_ctx *ctx) +{ + struct zstd_ctx *zctx; + zstd_parameters prm; + size_t sz; + + zctx = kzalloc(sizeof(*zctx), GFP_KERNEL); + if (!zctx) + return -ENOMEM; + + ctx->context = zctx; + if (params->dict_sz == 0) { + prm = zstd_get_params(params->level, PAGE_SIZE); + sz = zstd_cctx_workspace_bound(&prm.cParams); + zctx->cctx_mem = vzalloc(sz); + if (!zctx->cctx_mem) + goto error; + + zctx->cctx = zstd_init_cctx(zctx->cctx_mem, sz); + if (!zctx->cctx) + goto error; + + sz = zstd_dctx_workspace_bound(); + zctx->dctx_mem = vzalloc(sz); + if (!zctx->dctx_mem) + goto error; + + zctx->dctx = zstd_init_dctx(zctx->dctx_mem, sz); + if (!zctx->dctx) + goto error; + } else { + struct zstd_params *zp = params->drv_data; + + zctx->cctx = zstd_create_cctx_advanced(zp->custom_mem); + if (!zctx->cctx) + goto error; + + zctx->dctx = zstd_create_dctx_advanced(zp->custom_mem); + if (!zctx->dctx) + goto error; + } + + return 0; + +error: + zstd_release_params(params); + zstd_destroy(ctx); + return -EINVAL; +} + +static int zstd_compress(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req) +{ + struct zstd_params *zp = params->drv_data; + struct zstd_ctx *zctx = ctx->context; + size_t ret; + + if (params->dict_sz == 0) + ret = zstd_compress_cctx(zctx->cctx, req->dst, req->dst_len, + req->src, req->src_len, &zp->cprm); + else + ret = zstd_compress_using_cdict(zctx->cctx, req->dst, + req->dst_len, req->src, + req->src_len, + zp->cdict); + if (zstd_is_error(ret)) + return -EINVAL; + req->dst_len = ret; + return 0; +} + +static int zstd_decompress(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req) +{ + struct zstd_params *zp = params->drv_data; + struct zstd_ctx *zctx = ctx->context; + size_t ret; + + if (params->dict_sz == 0) + ret = zstd_decompress_dctx(zctx->dctx, req->dst, req->dst_len, + req->src, req->src_len); + else + ret = zstd_decompress_using_ddict(zctx->dctx, req->dst, + req->dst_len, req->src, + req->src_len, zp->ddict); + if (zstd_is_error(ret)) + return -EINVAL; + return 0; +} + +const struct zcomp_ops backend_zstd = { + .compress = zstd_compress, + .decompress = zstd_decompress, + .create_ctx = zstd_create, + .destroy_ctx = zstd_destroy, + .setup_params = zstd_setup_params, + .release_params = zstd_release_params, + .name = "zstd", +}; diff --git a/drivers/block/zram/backend_zstd.h b/drivers/block/zram/backend_zstd.h new file mode 100644 index 000000000000..10fdfff1ec1c --- /dev/null +++ b/drivers/block/zram/backend_zstd.h @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#ifndef __BACKEND_ZSTD_H__ +#define __BACKEND_ZSTD_H__ + +#include "zcomp.h" + +extern const struct zcomp_ops backend_zstd; + +#endif /* __BACKEND_ZSTD_H__ */ diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 8237b08c49d8..bb514403e305 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -1,7 +1,4 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (C) 2014 Sergey Senozhatsky. - */ #include #include @@ -15,91 +12,97 @@ #include "zcomp.h" -static const char * const backends[] = { -#if IS_ENABLED(CONFIG_CRYPTO_LZO) - "lzo", - "lzo-rle", +#include "backend_lzo.h" +#include "backend_lzorle.h" +#include "backend_lz4.h" +#include "backend_lz4hc.h" +#include "backend_zstd.h" +#include "backend_deflate.h" +#include "backend_842.h" + +static const struct zcomp_ops *backends[] = { +#if IS_ENABLED(CONFIG_ZRAM_BACKEND_LZO) + &backend_lzorle, + &backend_lzo, #endif -#if IS_ENABLED(CONFIG_CRYPTO_LZ4) - "lz4", +#if IS_ENABLED(CONFIG_ZRAM_BACKEND_LZ4) + &backend_lz4, #endif -#if IS_ENABLED(CONFIG_CRYPTO_LZ4HC) - "lz4hc", +#if IS_ENABLED(CONFIG_ZRAM_BACKEND_LZ4HC) + &backend_lz4hc, #endif -#if IS_ENABLED(CONFIG_CRYPTO_842) - "842", +#if IS_ENABLED(CONFIG_ZRAM_BACKEND_ZSTD) + &backend_zstd, #endif -#if IS_ENABLED(CONFIG_CRYPTO_ZSTD) - "zstd", +#if IS_ENABLED(CONFIG_ZRAM_BACKEND_DEFLATE) + &backend_deflate, #endif +#if IS_ENABLED(CONFIG_ZRAM_BACKEND_842) + &backend_842, +#endif + NULL }; -static void zcomp_strm_free(struct zcomp_strm *zstrm) +static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm) { - if (!IS_ERR_OR_NULL(zstrm->tfm)) - crypto_free_comp(zstrm->tfm); + comp->ops->destroy_ctx(&zstrm->ctx); vfree(zstrm->buffer); - zstrm->tfm = NULL; zstrm->buffer = NULL; } -/* - * Initialize zcomp_strm structure with ->tfm initialized by backend, and - * ->buffer. Return a negative value on error. - */ -static int zcomp_strm_init(struct zcomp_strm *zstrm, struct zcomp *comp) +static int zcomp_strm_init(struct zcomp *comp, struct zcomp_strm *zstrm) { - zstrm->tfm = crypto_alloc_comp(comp->name, 0, 0); + int ret; + + ret = comp->ops->create_ctx(comp->params, &zstrm->ctx); + if (ret) + return ret; + /* * allocate 2 pages. 1 for compressed data, plus 1 extra for the * case when compressed size is larger than the original one */ zstrm->buffer = vzalloc(2 * PAGE_SIZE); - if (IS_ERR_OR_NULL(zstrm->tfm) || !zstrm->buffer) { - zcomp_strm_free(zstrm); + if (!zstrm->buffer) { + zcomp_strm_free(comp, zstrm); return -ENOMEM; } return 0; } +static const struct zcomp_ops *lookup_backend_ops(const char *comp) +{ + int i = 0; + + while (backends[i]) { + if (sysfs_streq(comp, backends[i]->name)) + break; + i++; + } + return backends[i]; +} + bool zcomp_available_algorithm(const char *comp) { - /* - * Crypto does not ignore a trailing new line symbol, - * so make sure you don't supply a string containing - * one. - * This also means that we permit zcomp initialisation - * with any compressing algorithm known to crypto api. - */ - return crypto_has_comp(comp, 0, 0) == 1; + return lookup_backend_ops(comp) != NULL; } /* show available compressors */ ssize_t zcomp_available_show(const char *comp, char *buf) { - bool known_algorithm = false; ssize_t sz = 0; int i; - for (i = 0; i < ARRAY_SIZE(backends); i++) { - if (!strcmp(comp, backends[i])) { - known_algorithm = true; + for (i = 0; i < ARRAY_SIZE(backends) - 1; i++) { + if (!strcmp(comp, backends[i]->name)) { sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, - "[%s] ", backends[i]); + "[%s] ", backends[i]->name); } else { sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, - "%s ", backends[i]); + "%s ", backends[i]->name); } } - /* - * Out-of-tree module known to crypto api or a missing - * entry in `backends'. - */ - if (!known_algorithm && crypto_has_comp(comp, 0, 0) == 1) - sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, - "[%s] ", comp); - sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n"); return sz; } @@ -115,38 +118,34 @@ void zcomp_stream_put(struct zcomp *comp) local_unlock(&comp->stream->lock); } -int zcomp_compress(struct zcomp_strm *zstrm, - const void *src, unsigned int *dst_len) +int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, + const void *src, unsigned int *dst_len) { - /* - * Our dst memory (zstrm->buffer) is always `2 * PAGE_SIZE' sized - * because sometimes we can endup having a bigger compressed data - * due to various reasons: for example compression algorithms tend - * to add some padding to the compressed buffer. Speaking of padding, - * comp algorithm `842' pads the compressed length to multiple of 8 - * and returns -ENOSP when the dst memory is not big enough, which - * is not something that ZRAM wants to see. We can handle the - * `compressed_size > PAGE_SIZE' case easily in ZRAM, but when we - * receive -ERRNO from the compressing backend we can't help it - * anymore. To make `842' happy we need to tell the exact size of - * the dst buffer, zram_drv will take care of the fact that - * compressed buffer is too big. - */ - *dst_len = PAGE_SIZE * 2; + struct zcomp_req req = { + .src = src, + .dst = zstrm->buffer, + .src_len = PAGE_SIZE, + .dst_len = 2 * PAGE_SIZE, + }; + int ret; - return crypto_comp_compress(zstrm->tfm, - src, PAGE_SIZE, - zstrm->buffer, dst_len); + ret = comp->ops->compress(comp->params, &zstrm->ctx, &req); + if (!ret) + *dst_len = req.dst_len; + return ret; } -int zcomp_decompress(struct zcomp_strm *zstrm, - const void *src, unsigned int src_len, void *dst) +int zcomp_decompress(struct zcomp *comp, struct zcomp_strm *zstrm, + const void *src, unsigned int src_len, void *dst) { - unsigned int dst_len = PAGE_SIZE; + struct zcomp_req req = { + .src = src, + .dst = dst, + .src_len = src_len, + .dst_len = PAGE_SIZE, + }; - return crypto_comp_decompress(zstrm->tfm, - src, src_len, - dst, &dst_len); + return comp->ops->decompress(comp->params, &zstrm->ctx, &req); } int zcomp_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) @@ -158,7 +157,7 @@ int zcomp_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) zstrm = per_cpu_ptr(comp->stream, cpu); local_lock_init(&zstrm->lock); - ret = zcomp_strm_init(zstrm, comp); + ret = zcomp_strm_init(comp, zstrm); if (ret) pr_err("Can't allocate a compression stream\n"); return ret; @@ -170,11 +169,11 @@ int zcomp_cpu_dead(unsigned int cpu, struct hlist_node *node) struct zcomp_strm *zstrm; zstrm = per_cpu_ptr(comp->stream, cpu); - zcomp_strm_free(zstrm); + zcomp_strm_free(comp, zstrm); return 0; } -static int zcomp_init(struct zcomp *comp) +static int zcomp_init(struct zcomp *comp, struct zcomp_params *params) { int ret; @@ -182,12 +181,19 @@ static int zcomp_init(struct zcomp *comp) if (!comp->stream) return -ENOMEM; + comp->params = params; + ret = comp->ops->setup_params(comp->params); + if (ret) + goto cleanup; + ret = cpuhp_state_add_instance(CPUHP_ZCOMP_PREPARE, &comp->node); if (ret < 0) goto cleanup; + return 0; cleanup: + comp->ops->release_params(comp->params); free_percpu(comp->stream); return ret; } @@ -195,37 +201,35 @@ static int zcomp_init(struct zcomp *comp) void zcomp_destroy(struct zcomp *comp) { cpuhp_state_remove_instance(CPUHP_ZCOMP_PREPARE, &comp->node); + comp->ops->release_params(comp->params); free_percpu(comp->stream); kfree(comp); } -/* - * search available compressors for requested algorithm. - * allocate new zcomp and initialize it. return compressing - * backend pointer or ERR_PTR if things went bad. ERR_PTR(-EINVAL) - * if requested algorithm is not supported, ERR_PTR(-ENOMEM) in - * case of allocation error, or any other error potentially - * returned by zcomp_init(). - */ -struct zcomp *zcomp_create(const char *alg) +struct zcomp *zcomp_create(const char *alg, struct zcomp_params *params) { struct zcomp *comp; int error; /* - * Crypto API will execute /sbin/modprobe if the compression module - * is not loaded yet. We must do it here, otherwise we are about to - * call /sbin/modprobe under CPU hot-plug lock. + * The backends array has a sentinel NULL value, so the minimum + * size is 1. In order to be valid the array, apart from the + * sentinel NULL element, should have at least one compression + * backend selected. */ - if (!zcomp_available_algorithm(alg)) - return ERR_PTR(-EINVAL); + BUILD_BUG_ON(ARRAY_SIZE(backends) <= 1); comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL); if (!comp) return ERR_PTR(-ENOMEM); - comp->name = alg; - error = zcomp_init(comp); + comp->ops = lookup_backend_ops(alg); + if (!comp->ops) { + kfree(comp); + return ERR_PTR(-EINVAL); + } + + error = zcomp_init(comp, params); if (error) { kfree(comp); return ERR_PTR(error); diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index e9fe63da0e9b..ad5762813842 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -1,24 +1,70 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2014 Sergey Senozhatsky. - */ #ifndef _ZCOMP_H_ #define _ZCOMP_H_ + #include +#define ZCOMP_PARAM_NO_LEVEL INT_MIN + +/* + * Immutable driver (backend) parameters. The driver may attach private + * data to it (e.g. driver representation of the dictionary, etc.). + * + * This data is kept per-comp and is shared among execution contexts. + */ +struct zcomp_params { + void *dict; + size_t dict_sz; + s32 level; + + void *drv_data; +}; + +/* + * Run-time driver context - scratch buffers, etc. It is modified during + * request execution (compression/decompression), cannot be shared, so + * it's in per-CPU area. + */ +struct zcomp_ctx { + void *context; +}; + struct zcomp_strm { - /* The members ->buffer and ->tfm are protected by ->lock. */ local_lock_t lock; - /* compression/decompression buffer */ + /* compression buffer */ void *buffer; - struct crypto_comp *tfm; + struct zcomp_ctx ctx; +}; + +struct zcomp_req { + const unsigned char *src; + const size_t src_len; + + unsigned char *dst; + size_t dst_len; +}; + +struct zcomp_ops { + int (*compress)(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req); + int (*decompress)(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req); + + int (*create_ctx)(struct zcomp_params *params, struct zcomp_ctx *ctx); + void (*destroy_ctx)(struct zcomp_ctx *ctx); + + int (*setup_params)(struct zcomp_params *params); + void (*release_params)(struct zcomp_params *params); + + const char *name; }; /* dynamic per-device compression frontend */ struct zcomp { struct zcomp_strm __percpu *stream; - const char *name; + const struct zcomp_ops *ops; + struct zcomp_params *params; struct hlist_node node; }; @@ -27,16 +73,15 @@ int zcomp_cpu_dead(unsigned int cpu, struct hlist_node *node); ssize_t zcomp_available_show(const char *comp, char *buf); bool zcomp_available_algorithm(const char *comp); -struct zcomp *zcomp_create(const char *alg); +struct zcomp *zcomp_create(const char *alg, struct zcomp_params *params); void zcomp_destroy(struct zcomp *comp); struct zcomp_strm *zcomp_stream_get(struct zcomp *comp); void zcomp_stream_put(struct zcomp *comp); -int zcomp_compress(struct zcomp_strm *zstrm, - const void *src, unsigned int *dst_len); - -int zcomp_decompress(struct zcomp_strm *zstrm, - const void *src, unsigned int src_len, void *dst); +int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, + const void *src, unsigned int *dst_len); +int zcomp_decompress(struct zcomp *comp, struct zcomp_strm *zstrm, + const void *src, unsigned int src_len, void *dst); #endif /* _ZCOMP_H_ */ diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 84cd62efac0f..c3d245617083 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "zram_drv.h" @@ -998,6 +999,103 @@ static int __comp_algorithm_store(struct zram *zram, u32 prio, const char *buf) return 0; } +static void comp_params_reset(struct zram *zram, u32 prio) +{ + struct zcomp_params *params = &zram->params[prio]; + + vfree(params->dict); + params->level = ZCOMP_PARAM_NO_LEVEL; + params->dict_sz = 0; + params->dict = NULL; +} + +static int comp_params_store(struct zram *zram, u32 prio, s32 level, + const char *dict_path) +{ + ssize_t sz = 0; + + comp_params_reset(zram, prio); + + if (dict_path) { + sz = kernel_read_file_from_path(dict_path, 0, + &zram->params[prio].dict, + INT_MAX, + NULL, + READING_POLICY); + if (sz < 0) + return -EINVAL; + } + + zram->params[prio].dict_sz = sz; + zram->params[prio].level = level; + return 0; +} + +static ssize_t algorithm_params_store(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t len) +{ + s32 prio = ZRAM_PRIMARY_COMP, level = ZCOMP_PARAM_NO_LEVEL; + char *args, *param, *val, *algo = NULL, *dict_path = NULL; + struct zram *zram = dev_to_zram(dev); + int ret; + + args = skip_spaces(buf); + while (*args) { + args = next_arg(args, ¶m, &val); + + if (!val || !*val) + return -EINVAL; + + if (!strcmp(param, "priority")) { + ret = kstrtoint(val, 10, &prio); + if (ret) + return ret; + continue; + } + + if (!strcmp(param, "level")) { + ret = kstrtoint(val, 10, &level); + if (ret) + return ret; + continue; + } + + if (!strcmp(param, "algo")) { + algo = val; + continue; + } + + if (!strcmp(param, "dict")) { + dict_path = val; + continue; + } + } + + /* Lookup priority by algorithm name */ + if (algo) { + s32 p; + + prio = -EINVAL; + for (p = ZRAM_PRIMARY_COMP; p < ZRAM_MAX_COMPS; p++) { + if (!zram->comp_algs[p]) + continue; + + if (!strcmp(zram->comp_algs[p], algo)) { + prio = p; + break; + } + } + } + + if (prio < ZRAM_PRIMARY_COMP || prio >= ZRAM_MAX_COMPS) + return -EINVAL; + + ret = comp_params_store(zram, prio, level, dict_path); + return ret ? ret : len; +} + static ssize_t comp_algorithm_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -1330,7 +1428,8 @@ static int zram_read_from_zspool(struct zram *zram, struct page *page, ret = 0; } else { dst = kmap_local_page(page); - ret = zcomp_decompress(zstrm, src, size, dst); + ret = zcomp_decompress(zram->comps[prio], zstrm, + src, size, dst); kunmap_local(dst); zcomp_stream_put(zram->comps[prio]); } @@ -1417,7 +1516,8 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index) compress_again: zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP]); src = kmap_local_page(page); - ret = zcomp_compress(zstrm, src, &comp_len); + ret = zcomp_compress(zram->comps[ZRAM_PRIMARY_COMP], zstrm, + src, &comp_len); kunmap_local(src); if (unlikely(ret)) { @@ -1604,7 +1704,8 @@ static int zram_recompress(struct zram *zram, u32 index, struct page *page, num_recomps++; zstrm = zcomp_stream_get(zram->comps[prio]); src = kmap_local_page(page); - ret = zcomp_compress(zstrm, src, &comp_len_new); + ret = zcomp_compress(zram->comps[prio], zstrm, + src, &comp_len_new); kunmap_local(src); if (ret) { @@ -1757,6 +1858,18 @@ static ssize_t recompress_store(struct device *dev, algo = val; continue; } + + if (!strcmp(param, "priority")) { + ret = kstrtouint(val, 10, &prio); + if (ret) + return ret; + + if (prio == ZRAM_PRIMARY_COMP) + prio = ZRAM_SECONDARY_COMP; + + prio_max = min(prio + 1, ZRAM_MAX_COMPS); + continue; + } } if (threshold >= huge_class_size) @@ -1979,6 +2092,15 @@ static void zram_slot_free_notify(struct block_device *bdev, zram_slot_unlock(zram, index); } +static void zram_comp_params_reset(struct zram *zram) +{ + u32 prio; + + for (prio = ZRAM_PRIMARY_COMP; prio < ZRAM_MAX_COMPS; prio++) { + comp_params_reset(zram, prio); + } +} + static void zram_destroy_comps(struct zram *zram) { u32 prio; @@ -1992,6 +2114,13 @@ static void zram_destroy_comps(struct zram *zram) zcomp_destroy(comp); zram->num_active_comps--; } + + for (prio = ZRAM_SECONDARY_COMP; prio < ZRAM_MAX_COMPS; prio++) { + kfree(zram->comp_algs[prio]); + zram->comp_algs[prio] = NULL; + } + + zram_comp_params_reset(zram); } static void zram_reset_device(struct zram *zram) @@ -2049,7 +2178,8 @@ static ssize_t disksize_store(struct device *dev, if (!zram->comp_algs[prio]) continue; - comp = zcomp_create(zram->comp_algs[prio]); + comp = zcomp_create(zram->comp_algs[prio], + &zram->params[prio]); if (IS_ERR(comp)) { pr_err("Cannot initialise %s compressing backend\n", zram->comp_algs[prio]); @@ -2152,6 +2282,7 @@ static DEVICE_ATTR_RW(writeback_limit_enable); static DEVICE_ATTR_RW(recomp_algorithm); static DEVICE_ATTR_WO(recompress); #endif +static DEVICE_ATTR_WO(algorithm_params); static struct attribute *zram_disk_attrs[] = { &dev_attr_disksize.attr, @@ -2179,6 +2310,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_recomp_algorithm.attr, &dev_attr_recompress.attr, #endif + &dev_attr_algorithm_params.attr, NULL, }; @@ -2254,6 +2386,7 @@ static int zram_add(void) if (ret) goto out_cleanup_disk; + zram_comp_params_reset(zram); comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor); zram_debugfs_register(zram); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 531cefc66668..cfc8c059db63 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -106,6 +106,7 @@ struct zram { struct zram_table_entry *table; struct zs_pool *mem_pool; struct zcomp *comps[ZRAM_MAX_COMPS]; + struct zcomp_params params[ZRAM_MAX_COMPS]; struct gendisk *disk; /* Prevent concurrent execution of device init */ struct rw_semaphore init_lock; diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig index 99b5c25be079..29c192f20082 100644 --- a/drivers/cxl/Kconfig +++ b/drivers/cxl/Kconfig @@ -6,7 +6,7 @@ menuconfig CXL_BUS select FW_UPLOAD select PCI_DOE select FIRMWARE_TABLE - select NUMA_KEEP_MEMINFO if (NUMA && X86) + select NUMA_KEEP_MEMINFO if NUMA_MEMBLKS help CXL is a bus that is electrically compatible with PCI Express, but layers three protocols on that signalling (CXL.io, CXL.cache, and diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index a88744244149..d656e4c0eb84 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -30,7 +30,7 @@ config DEV_DAX_PMEM config DEV_DAX_HMEM tristate "HMEM DAX: direct access to 'specific purpose' memory" depends on EFI_SOFT_RESERVE - select NUMA_KEEP_MEMINFO if (NUMA && X86) + select NUMA_KEEP_MEMINFO if NUMA_MEMBLKS default DEV_DAX help EFI 2.8 platforms, and others, may advertise 'specific purpose' diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 2051e4f73c8a..9c1a729cd77e 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -235,9 +235,9 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf, unsigned int order) int id; struct dev_dax *dev_dax = filp->private_data; - dev_dbg(&dev_dax->dev, "%s: %s (%#lx - %#lx) order:%d\n", current->comm, - (vmf->flags & FAULT_FLAG_WRITE) ? "write" : "read", - vmf->vma->vm_start, vmf->vma->vm_end, order); + dev_dbg(&dev_dax->dev, "%s: op=%s addr=%#lx order=%d\n", current->comm, + (vmf->flags & FAULT_FLAG_WRITE) ? "write" : "read", + vmf->address & ~((1UL << (order + PAGE_SHIFT)) - 1), order); id = dax_read_lock(); if (order == 0) diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index d33ccbc4a2c6..685098f9626f 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -1229,7 +1229,7 @@ efi_zboot_entry(efi_handle_t handle, efi_system_table_t *systab); efi_status_t allocate_unaccepted_bitmap(__u32 nr_desc, struct efi_boot_memmap *map); void process_unaccepted_memory(u64 start, u64 end); -void accept_memory(phys_addr_t start, phys_addr_t end); +void accept_memory(phys_addr_t start, unsigned long size); void arch_accept_memory(phys_addr_t start, phys_addr_t end); #endif diff --git a/drivers/firmware/efi/libstub/unaccepted_memory.c b/drivers/firmware/efi/libstub/unaccepted_memory.c index c295ea3a6efc..757dbe734a47 100644 --- a/drivers/firmware/efi/libstub/unaccepted_memory.c +++ b/drivers/firmware/efi/libstub/unaccepted_memory.c @@ -177,9 +177,10 @@ void process_unaccepted_memory(u64 start, u64 end) start / unit_size, (end - start) / unit_size); } -void accept_memory(phys_addr_t start, phys_addr_t end) +void accept_memory(phys_addr_t start, unsigned long size) { unsigned long range_start, range_end; + phys_addr_t end = start + size; unsigned long bitmap_size; u64 unit_size; diff --git a/drivers/firmware/efi/unaccepted_memory.c b/drivers/firmware/efi/unaccepted_memory.c index 50f6503fe49f..c2c067eff634 100644 --- a/drivers/firmware/efi/unaccepted_memory.c +++ b/drivers/firmware/efi/unaccepted_memory.c @@ -30,11 +30,12 @@ static LIST_HEAD(accepting_list); * - memory that is below phys_base; * - memory that is above the memory that addressable by the bitmap; */ -void accept_memory(phys_addr_t start, phys_addr_t end) +void accept_memory(phys_addr_t start, unsigned long size) { struct efi_unaccepted_memory *unaccepted; unsigned long range_start, range_end; struct accept_range range, *entry; + phys_addr_t end = start + size; unsigned long flags; u64 unit_size; @@ -74,13 +75,13 @@ void accept_memory(phys_addr_t start, phys_addr_t end) * "guard" page is accepted in addition to the memory that needs to be * used: * - * 1. Implicitly extend the range_contains_unaccepted_memory(start, end) - * checks up to end+unit_size if 'end' is aligned on a unit_size - * boundary. + * 1. Implicitly extend the range_contains_unaccepted_memory(start, size) + * checks up to the next unit_size if 'start+size' is aligned on a + * unit_size boundary. * - * 2. Implicitly extend accept_memory(start, end) to end+unit_size if - * 'end' is aligned on a unit_size boundary. (immediately following - * this comment) + * 2. Implicitly extend accept_memory(start, size) to the next unit_size + * if 'size+end' is aligned on a unit_size boundary. (immediately + * following this comment) */ if (!(end % unit_size)) end += unit_size; @@ -156,9 +157,10 @@ void accept_memory(phys_addr_t start, phys_addr_t end) spin_unlock_irqrestore(&unaccepted_memory_lock, flags); } -bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end) +bool range_contains_unaccepted_memory(phys_addr_t start, unsigned long size) { struct efi_unaccepted_memory *unaccepted; + phys_addr_t end = start + size; unsigned long flags; bool ret = false; u64 unit_size; diff --git a/drivers/gpu/drm/drm_exec.c b/drivers/gpu/drm/drm_exec.c index 2da094bdf8a4..18e366cc4993 100644 --- a/drivers/gpu/drm/drm_exec.c +++ b/drivers/gpu/drm/drm_exec.c @@ -145,8 +145,7 @@ static int drm_exec_obj_locked(struct drm_exec *exec, size_t size = exec->max_objects * sizeof(void *); void *tmp; - tmp = kvrealloc(exec->objects, size, size + PAGE_SIZE, - GFP_KERNEL); + tmp = kvrealloc(exec->objects, size + PAGE_SIZE, GFP_KERNEL); if (!tmp) return -ENOMEM; diff --git a/drivers/of/of_numa.c b/drivers/of/of_numa.c index 5949829a1b00..2ec20886d176 100644 --- a/drivers/of/of_numa.c +++ b/drivers/of/of_numa.c @@ -10,6 +10,7 @@ #include #include #include +#include #include @@ -44,7 +45,7 @@ static int __init of_numa_parse_memory_nodes(void) struct device_node *np = NULL; struct resource rsrc; u32 nid; - int i, r; + int i, r = -EINVAL; for_each_node_by_type(np, "memory") { r = of_property_read_u32(np, "numa-node-id", &nid); @@ -71,7 +72,7 @@ static int __init of_numa_parse_memory_nodes(void) } } - return 0; + return r; } static int __init of_numa_parse_distance_map_v1(struct device_node *map) diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c index 791d38d6284c..58116f89d8da 100644 --- a/drivers/vdpa/vdpa_user/iova_domain.c +++ b/drivers/vdpa/vdpa_user/iova_domain.c @@ -162,6 +162,7 @@ static void vduse_domain_bounce(struct vduse_iova_domain *domain, enum dma_data_direction dir) { struct vduse_bounce_map *map; + struct page *page; unsigned int offset; void *addr; size_t sz; @@ -178,7 +179,10 @@ static void vduse_domain_bounce(struct vduse_iova_domain *domain, map->orig_phys == INVALID_PHYS_ADDR)) return; - addr = kmap_local_page(map->bounce_page); + page = domain->user_bounce_pages ? + map->user_bounce_page : map->bounce_page; + + addr = kmap_local_page(page); do_bounce(map->orig_phys + offset, addr + offset, sz, dir); kunmap_local(addr); size -= sz; @@ -270,9 +274,8 @@ int vduse_domain_add_user_bounce_pages(struct vduse_iova_domain *domain, memcpy_to_page(pages[i], 0, page_address(map->bounce_page), PAGE_SIZE); - __free_page(map->bounce_page); } - map->bounce_page = pages[i]; + map->user_bounce_page = pages[i]; get_page(pages[i]); } domain->user_bounce_pages = true; @@ -297,17 +300,17 @@ void vduse_domain_remove_user_bounce_pages(struct vduse_iova_domain *domain) struct page *page = NULL; map = &domain->bounce_maps[i]; - if (WARN_ON(!map->bounce_page)) + if (WARN_ON(!map->user_bounce_page)) continue; /* Copy user page to kernel page if it's in use */ if (map->orig_phys != INVALID_PHYS_ADDR) { - page = alloc_page(GFP_ATOMIC | __GFP_NOFAIL); + page = map->bounce_page; memcpy_from_page(page_address(page), - map->bounce_page, 0, PAGE_SIZE); + map->user_bounce_page, 0, PAGE_SIZE); } - put_page(map->bounce_page); - map->bounce_page = page; + put_page(map->user_bounce_page); + map->user_bounce_page = NULL; } domain->user_bounce_pages = false; out: diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h index f92f22a7267d..7f3f0928ec78 100644 --- a/drivers/vdpa/vdpa_user/iova_domain.h +++ b/drivers/vdpa/vdpa_user/iova_domain.h @@ -21,6 +21,7 @@ struct vduse_bounce_map { struct page *bounce_page; + struct page *user_bounce_page; u64 orig_phys; }; diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index ba0ce0075b2f..2d7478e9a62d 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -1657,14 +1658,20 @@ static unsigned long vma_to_pfn(struct vm_area_struct *vma) return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff; } -static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) +static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, + unsigned int order) { struct vm_area_struct *vma = vmf->vma; struct vfio_pci_core_device *vdev = vma->vm_private_data; unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff; - unsigned long addr = vma->vm_start; vm_fault_t ret = VM_FAULT_SIGBUS; + if (order && (vmf->address & ((PAGE_SIZE << order) - 1) || + vmf->address + (PAGE_SIZE << order) > vma->vm_end)) { + ret = VM_FAULT_FALLBACK; + goto out; + } + pfn = vma_to_pfn(vma); down_read(&vdev->memory_lock); @@ -1672,30 +1679,49 @@ static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) goto out_unlock; - ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff); - if (ret & VM_FAULT_ERROR) - goto out_unlock; - - /* - * Pre-fault the remainder of the vma, abort further insertions and - * supress error if fault is encountered during pre-fault. - */ - for (; addr < vma->vm_end; addr += PAGE_SIZE, pfn++) { - if (addr == vmf->address) - continue; - - if (vmf_insert_pfn(vma, addr, pfn) & VM_FAULT_ERROR) - break; + switch (order) { + case 0: + ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff); + break; +#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP + case PMD_ORDER: + ret = vmf_insert_pfn_pmd(vmf, __pfn_to_pfn_t(pfn + pgoff, + PFN_DEV), false); + break; +#endif +#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP + case PUD_ORDER: + ret = vmf_insert_pfn_pud(vmf, __pfn_to_pfn_t(pfn + pgoff, + PFN_DEV), false); + break; +#endif + default: + ret = VM_FAULT_FALLBACK; } out_unlock: up_read(&vdev->memory_lock); +out: + dev_dbg_ratelimited(&vdev->pdev->dev, + "%s(,order = %d) BAR %ld page offset 0x%lx: 0x%x\n", + __func__, order, + vma->vm_pgoff >> + (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT), + pgoff, (unsigned int)ret); return ret; } +static vm_fault_t vfio_pci_mmap_page_fault(struct vm_fault *vmf) +{ + return vfio_pci_mmap_huge_fault(vmf, 0); +} + static const struct vm_operations_struct vfio_pci_mmap_ops = { - .fault = vfio_pci_mmap_fault, + .fault = vfio_pci_mmap_page_fault, +#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP + .huge_fault = vfio_pci_mmap_huge_fault, +#endif }; int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 0960699e7554..bf391b40e576 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -513,12 +513,10 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, unsigned long vaddr, unsigned long *pfn, bool write_fault) { - pte_t *ptep; - pte_t pte; - spinlock_t *ptl; + struct follow_pfnmap_args args = { .vma = vma, .address = vaddr }; int ret; - ret = follow_pte(vma, vaddr, &ptep, &ptl); + ret = follow_pfnmap_start(&args); if (ret) { bool unlocked = false; @@ -532,19 +530,17 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, if (ret) return ret; - ret = follow_pte(vma, vaddr, &ptep, &ptl); + ret = follow_pfnmap_start(&args); if (ret) return ret; } - pte = ptep_get(ptep); - - if (write_fault && !pte_write(pte)) + if (write_fault && !args.writable) ret = -EFAULT; else - *pfn = pte_pfn(pte); + *pfn = args.pfn; - pte_unmap_unlock(ptep, ptl); + follow_pfnmap_end(&args); return ret; } diff --git a/drivers/virt/acrn/mm.c b/drivers/virt/acrn/mm.c index db8ff1d0ac23..4c2f28715b70 100644 --- a/drivers/virt/acrn/mm.c +++ b/drivers/virt/acrn/mm.c @@ -177,9 +177,7 @@ int acrn_vm_ram_map(struct acrn_vm *vm, struct acrn_vm_memmap *memmap) vma = vma_lookup(current->mm, memmap->vma_base); if (vma && ((vma->vm_flags & VM_PFNMAP) != 0)) { unsigned long start_pfn, cur_pfn; - spinlock_t *ptl; bool writable; - pte_t *ptep; if ((memmap->vma_base + memmap->len) > vma->vm_end) { mmap_read_unlock(current->mm); @@ -187,16 +185,20 @@ int acrn_vm_ram_map(struct acrn_vm *vm, struct acrn_vm_memmap *memmap) } for (i = 0; i < nr_pages; i++) { - ret = follow_pte(vma, memmap->vma_base + i * PAGE_SIZE, - &ptep, &ptl); + struct follow_pfnmap_args args = { + .vma = vma, + .address = memmap->vma_base + i * PAGE_SIZE, + }; + + ret = follow_pfnmap_start(&args); if (ret) break; - cur_pfn = pte_pfn(ptep_get(ptep)); + cur_pfn = args.pfn; if (i == 0) start_pfn = cur_pfn; - writable = !!pte_write(ptep_get(ptep)); - pte_unmap_unlock(ptep, ptl); + writable = args.writable; + follow_pfnmap_end(&args); /* Disallow write access if the PTE is not writable. */ if (!writable && diff --git a/fs/Kconfig b/fs/Kconfig index a46b0cbc4d8f..0e4efec1d92e 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -288,6 +288,10 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP depends on ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP depends on SPARSEMEM_VMEMMAP +config HUGETLB_PMD_PAGE_TABLE_SHARING + def_bool HUGETLB_PAGE + depends on ARCH_WANT_HUGE_PMD_SHARE && SPLIT_PMD_PTLOCKS + config ARCH_HAS_GIGANTIC_PAGE bool diff --git a/fs/exec.c b/fs/exec.c index dad402d55681..6c53920795c2 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -709,80 +709,6 @@ static int copy_strings_kernel(int argc, const char *const *argv, #ifdef CONFIG_MMU -/* - * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once - * the binfmt code determines where the new stack should reside, we shift it to - * its final location. The process proceeds as follows: - * - * 1) Use shift to calculate the new vma endpoints. - * 2) Extend vma to cover both the old and new ranges. This ensures the - * arguments passed to subsequent functions are consistent. - * 3) Move vma's page tables to the new range. - * 4) Free up any cleared pgd range. - * 5) Shrink the vma to cover only the new range. - */ -static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) -{ - struct mm_struct *mm = vma->vm_mm; - unsigned long old_start = vma->vm_start; - unsigned long old_end = vma->vm_end; - unsigned long length = old_end - old_start; - unsigned long new_start = old_start - shift; - unsigned long new_end = old_end - shift; - VMA_ITERATOR(vmi, mm, new_start); - struct vm_area_struct *next; - struct mmu_gather tlb; - - BUG_ON(new_start > new_end); - - /* - * ensure there are no vmas between where we want to go - * and where we are - */ - if (vma != vma_next(&vmi)) - return -EFAULT; - - vma_iter_prev_range(&vmi); - /* - * cover the whole range: [new_start, old_end) - */ - if (vma_expand(&vmi, vma, new_start, old_end, vma->vm_pgoff, NULL)) - return -ENOMEM; - - /* - * move the page tables downwards, on failure we rely on - * process cleanup to remove whatever mess we made. - */ - if (length != move_page_tables(vma, old_start, - vma, new_start, length, false, true)) - return -ENOMEM; - - lru_add_drain(); - tlb_gather_mmu(&tlb, mm); - next = vma_next(&vmi); - if (new_end > old_start) { - /* - * when the old and new regions overlap clear from new_end. - */ - free_pgd_range(&tlb, new_end, old_end, new_end, - next ? next->vm_start : USER_PGTABLES_CEILING); - } else { - /* - * otherwise, clean from old_start; this is done to not touch - * the address space in [new_end, old_start) some architectures - * have constraints on va-space that make this illegal (IA64) - - * for the others its just a little faster. - */ - free_pgd_range(&tlb, old_start, old_end, new_end, - next ? next->vm_start : USER_PGTABLES_CEILING); - } - tlb_finish_mmu(&tlb); - - vma_prev(&vmi); - /* Shrink the vma to just the new range */ - return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff); -} - /* * Finalizes the stack vm_area_struct. The flags and permissions are updated, * the stack is optionally relocated, and some extra space is added. @@ -876,7 +802,12 @@ int setup_arg_pages(struct linux_binprm *bprm, /* Move stack pages down in memory. */ if (stack_shift) { - ret = shift_arg_pages(vma, stack_shift); + /* + * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once + * the binfmt code determines where the new stack should reside, we shift it to + * its final location. + */ + ret = relocate_vma_down(vma, stack_shift); if (ret) goto out_unlock; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 9e3f25e4c188..87e4d6282025 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -166,8 +166,7 @@ static inline int folio_precise_page_mapcount(struct folio *folio, { int mapcount = atomic_read(&page->_mapcount) + 1; - /* Handle page_has_type() pages */ - if (mapcount < PAGE_MAPCOUNT_RESERVE + 1) + if (page_mapcount_is_type(mapcount)) mapcount = 0; if (folio_test_large(folio)) mapcount += folio_entire_mapcount(folio); diff --git a/fs/proc/page.c b/fs/proc/page.c index b7a5c84b5819..a55f5acefa97 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -182,7 +182,6 @@ u64 stable_page_flags(const struct page *page) #endif u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); - u |= kpf_copy_bit(k, KPF_ERROR, PG_error); u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); u |= kpf_copy_bit(k, KPF_UPTODATE, PG_uptodate); u |= kpf_copy_bit(k, KPF_WRITEBACK, PG_writeback); @@ -207,18 +206,16 @@ u64 stable_page_flags(const struct page *page) u |= kpf_copy_bit(page->flags, KPF_HWPOISON, PG_hwpoison); #endif -#ifdef CONFIG_ARCH_USES_PG_UNCACHED - u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); -#endif - u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved); - u |= kpf_copy_bit(k, KPF_MAPPEDTODISK, PG_mappedtodisk); + u |= kpf_copy_bit(k, KPF_OWNER_2, PG_owner_2); u |= kpf_copy_bit(k, KPF_PRIVATE, PG_private); u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2); u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1); u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1); -#ifdef CONFIG_ARCH_USES_PG_ARCH_X +#ifdef CONFIG_ARCH_USES_PG_ARCH_2 u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2); +#endif +#ifdef CONFIG_ARCH_USES_PG_ARCH_3 u |= kpf_copy_bit(k, KPF_ARCH_3, PG_arch_3); #endif diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 27a3e9285fbf..68cdd89c97a3 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -104,21 +104,6 @@ bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) return ctx->features & UFFD_FEATURE_WP_UNPOPULATED; } -static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, - vm_flags_t flags) -{ - const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP; - - vm_flags_reset(vma, flags); - /* - * For shared mappings, we want to enable writenotify while - * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply - * recalculate vma->vm_page_prot whenever userfaultfd-wp changes. - */ - if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed) - vma_set_page_prot(vma); -} - static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, int wake_flags, void *key) { @@ -386,15 +371,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) unsigned int blocking_state; /* - * We don't do userfault handling for the final child pid update. - * - * We also don't do userfault handling during - * coredumping. hugetlbfs has the special - * hugetlb_follow_page_mask() to skip missing pages in the - * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with - * the no_page_table() helper in follow_page_mask(), but the - * shmem_vm_ops->fault method is invoked even during - * coredumping and it ends up here. + * We don't do userfault handling for the final child pid update + * and when coredumping (faults triggered by get_dump_page()). */ if (current->flags & (PF_EXITING|PF_DUMPCORE)) goto out; @@ -615,22 +593,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, spin_unlock_irq(&ctx->event_wqh.lock); if (release_new_ctx) { - struct vm_area_struct *vma; - struct mm_struct *mm = release_new_ctx->mm; - VMA_ITERATOR(vmi, mm, 0); - - /* the various vma->vm_userfaultfd_ctx still points to it */ - mmap_write_lock(mm); - for_each_vma(vmi, vma) { - if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { - vma_start_write(vma); - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - userfaultfd_set_vm_flags(vma, - vma->vm_flags & ~__VM_UFFD_FLAGS); - } - } - mmap_write_unlock(mm); - + userfaultfd_release_new(release_new_ctx); userfaultfd_ctx_put(release_new_ctx); } @@ -662,9 +625,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) return 0; if (!(octx->features & UFFD_FEATURE_EVENT_FORK)) { - vma_start_write(vma); - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); + userfaultfd_reset_ctx(vma); return 0; } @@ -749,9 +710,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, up_write(&ctx->map_changing_lock); } else { /* Drop uffd context if remap feature not enabled */ - vma_start_write(vma); - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); + userfaultfd_reset_ctx(vma); } } @@ -870,53 +829,13 @@ static int userfaultfd_release(struct inode *inode, struct file *file) { struct userfaultfd_ctx *ctx = file->private_data; struct mm_struct *mm = ctx->mm; - struct vm_area_struct *vma, *prev; /* len == 0 means wake all */ struct userfaultfd_wake_range range = { .len = 0, }; - unsigned long new_flags; - VMA_ITERATOR(vmi, mm, 0); WRITE_ONCE(ctx->released, true); - if (!mmget_not_zero(mm)) - goto wakeup; + userfaultfd_release_all(mm, ctx); - /* - * Flush page faults out of all CPUs. NOTE: all page faults - * must be retried without returning VM_FAULT_SIGBUS if - * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx - * changes while handle_userfault released the mmap_lock. So - * it's critical that released is set to true (above), before - * taking the mmap_lock for writing. - */ - mmap_write_lock(mm); - prev = NULL; - for_each_vma(vmi, vma) { - cond_resched(); - BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ - !!(vma->vm_flags & __VM_UFFD_FLAGS)); - if (vma->vm_userfaultfd_ctx.ctx != ctx) { - prev = vma; - continue; - } - /* Reset ptes for the whole vma range if wr-protected */ - if (userfaultfd_wp(vma)) - uffd_wp_range(vma, vma->vm_start, - vma->vm_end - vma->vm_start, false); - new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start, - vma->vm_end, new_flags, - NULL_VM_UFFD_CTX); - - vma_start_write(vma); - userfaultfd_set_vm_flags(vma, new_flags); - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - - prev = vma; - } - mmap_write_unlock(mm); - mmput(mm); -wakeup: /* * After no new page faults can wait on this fault_*wqh, flush * the last page faults that may have been already waiting on @@ -1293,14 +1212,14 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, unsigned long arg) { struct mm_struct *mm = ctx->mm; - struct vm_area_struct *vma, *prev, *cur; + struct vm_area_struct *vma, *cur; int ret; struct uffdio_register uffdio_register; struct uffdio_register __user *user_uffdio_register; - unsigned long vm_flags, new_flags; + unsigned long vm_flags; bool found; bool basic_ioctls; - unsigned long start, end, vma_end; + unsigned long start, end; struct vma_iterator vmi; bool wp_async = userfaultfd_wp_async_ctx(ctx); @@ -1428,57 +1347,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, } for_each_vma_range(vmi, cur, end); BUG_ON(!found); - vma_iter_set(&vmi, start); - prev = vma_prev(&vmi); - if (vma->vm_start < start) - prev = vma; - - ret = 0; - for_each_vma_range(vmi, vma, end) { - cond_resched(); - - BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async)); - BUG_ON(vma->vm_userfaultfd_ctx.ctx && - vma->vm_userfaultfd_ctx.ctx != ctx); - WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); - - /* - * Nothing to do: this vma is already registered into this - * userfaultfd and with the right tracking mode too. - */ - if (vma->vm_userfaultfd_ctx.ctx == ctx && - (vma->vm_flags & vm_flags) == vm_flags) - goto skip; - - if (vma->vm_start > start) - start = vma->vm_start; - vma_end = min(end, vma->vm_end); - - new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; - vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, - new_flags, - (struct vm_userfaultfd_ctx){ctx}); - if (IS_ERR(vma)) { - ret = PTR_ERR(vma); - break; - } - - /* - * In the vma_merge() successful mprotect-like case 8: - * the next vma was merged into the current one and - * the current one has not been updated yet. - */ - vma_start_write(vma); - userfaultfd_set_vm_flags(vma, new_flags); - vma->vm_userfaultfd_ctx.ctx = ctx; - - if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma)) - hugetlb_unshare_all_pmds(vma); - - skip: - prev = vma; - start = vma->vm_end; - } + ret = userfaultfd_register_range(ctx, vma, vm_flags, start, end, + wp_async); out_unlock: mmap_write_unlock(mm); @@ -1519,7 +1389,6 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, struct vm_area_struct *vma, *prev, *cur; int ret; struct uffdio_range uffdio_unregister; - unsigned long new_flags; bool found; unsigned long start, end, vma_end; const void __user *buf = (void __user *)arg; @@ -1622,27 +1491,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range); } - /* Reset ptes for the whole vma range if wr-protected */ - if (userfaultfd_wp(vma)) - uffd_wp_range(vma, start, vma_end - start, false); - - new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, - new_flags, NULL_VM_UFFD_CTX); + vma = userfaultfd_clear_vma(&vmi, prev, vma, + start, vma_end); if (IS_ERR(vma)) { ret = PTR_ERR(vma); break; } - /* - * In the vma_merge() successful mprotect-like case 8: - * the next vma was merged into the current one and - * the current one has not been updated yet. - */ - vma_start_write(vma); - userfaultfd_set_vm_flags(vma, new_flags); - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - skip: prev = vma; start = vma->vm_end; diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index 9b5d98fe1f8a..c753c79df203 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -126,7 +126,7 @@ xfile_load( unsigned int len; unsigned int offset; - if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, + if (shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio, SGP_READ) < 0) break; if (!folio) { @@ -196,7 +196,7 @@ xfile_store( unsigned int len; unsigned int offset; - if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, + if (shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio, SGP_CACHE) < 0) break; if (filemap_check_wb_err(inode->i_mapping, 0)) { @@ -267,7 +267,7 @@ xfile_get_folio( i_size_write(inode, pos + len); pflags = memalloc_nofs_save(); - error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, + error = shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio, (flags & XFILE_ALLOC) ? SGP_CACHE : SGP_READ); memalloc_nofs_restore(pflags); if (error) diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c index 9bb2d24de709..07bebbfb16ee 100644 --- a/fs/xfs/xfs_buf_mem.c +++ b/fs/xfs/xfs_buf_mem.c @@ -149,7 +149,7 @@ xmbuf_map_page( return -ENOMEM; } - error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, SGP_CACHE); + error = shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio, SGP_CACHE); if (error) return error; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 1a74fe22672e..ec766b4bc853 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2128,7 +2128,7 @@ xlog_recover_add_to_cont_trans( old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; old_len = item->ri_buf[item->ri_cnt-1].i_len; - ptr = kvrealloc(old_ptr, old_len, len + old_len, GFP_KERNEL); + ptr = kvrealloc(old_ptr, len + old_len, GFP_KERNEL); if (!ptr) return -ENOMEM; memcpy(&ptr[old_len], dp, len); diff --git a/include/asm-generic/mm_hooks.h b/include/asm-generic/mm_hooks.h index 4dbb177d1150..6eea3b3c1e65 100644 --- a/include/asm-generic/mm_hooks.h +++ b/include/asm-generic/mm_hooks.h @@ -1,8 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Define generic no-op hooks for arch_dup_mmap, arch_exit_mmap - * and arch_unmap to be included in asm-FOO/mmu_context.h for any - * arch FOO which doesn't need to hook these. + * Define generic no-op hooks for arch_dup_mmap and arch_exit_mmap + * to be included in asm-FOO/mmu_context.h for any arch FOO which + * doesn't need to hook these. */ #ifndef _ASM_GENERIC_MM_HOOKS_H #define _ASM_GENERIC_MM_HOOKS_H @@ -17,11 +17,6 @@ static inline void arch_exit_mmap(struct mm_struct *mm) { } -static inline void arch_unmap(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ -} - static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write, bool execute, bool foreign) { diff --git a/include/asm-generic/mmzone.h b/include/asm-generic/mmzone.h new file mode 100644 index 000000000000..2ab5193e8394 --- /dev/null +++ b/include/asm-generic/mmzone.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_GENERIC_MMZONE_H +#define _ASM_GENERIC_MMZONE_H + +#endif diff --git a/include/asm-generic/numa.h b/include/asm-generic/numa.h index c32e0cf23c90..e063d6487f66 100644 --- a/include/asm-generic/numa.h +++ b/include/asm-generic/numa.h @@ -32,10 +32,8 @@ static inline const struct cpumask *cpumask_of_node(int node) void __init arch_numa_init(void); int __init numa_add_memblk(int nodeid, u64 start, u64 end); -void __init numa_set_distance(int from, int to, int distance); -void __init numa_free_distance(void); void __init early_map_cpu_to_node(unsigned int cpu, int nid); -int __init early_cpu_to_node(int cpu); +int early_cpu_to_node(int cpu); void numa_store_cpu_info(unsigned int cpu); void numa_add_cpu(unsigned int cpu); void numa_remove_cpu(unsigned int cpu); @@ -51,4 +49,8 @@ static inline int early_cpu_to_node(int cpu) { return 0; } #endif /* CONFIG_NUMA */ +#ifdef CONFIG_NUMA_EMU +void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable); +#endif + #endif /* __ASM_GENERIC_NUMA_H */ diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 8c61ccd161ba..1f0a9ff23a2c 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -70,7 +70,7 @@ static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct) /* * When percpu variables are required to be defined as weak, static percpu * variables can't be used inside a function (see comments for DECLARE_PER_CPU_SECTION). - * Instead we will accound all module allocations to a single counter. + * Instead we will account all module allocations to a single counter. */ DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag); @@ -137,7 +137,16 @@ static inline void alloc_tag_sub_check(union codetag_ref *ref) {} /* Caller should verify both ref and tag to be valid */ static inline void __alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag) { + alloc_tag_add_check(ref, tag); + if (!ref || !tag) + return; + ref->ct = &tag->ct; +} + +static inline void alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag) +{ + __alloc_tag_ref_set(ref, tag); /* * We need in increment the call counter every time we have a new * allocation or when we split a large allocation into smaller ones. @@ -147,22 +156,9 @@ static inline void __alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag this_cpu_inc(tag->counters->calls); } -static inline void alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag) -{ - alloc_tag_add_check(ref, tag); - if (!ref || !tag) - return; - - __alloc_tag_ref_set(ref, tag); -} - static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes) { - alloc_tag_add_check(ref, tag); - if (!ref || !tag) - return; - - __alloc_tag_ref_set(ref, tag); + alloc_tag_ref_set(ref, tag); this_cpu_add(tag->counters->bytes, bytes); } diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index eb0f6f349496..47ae4c4d924c 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -172,7 +172,11 @@ struct cgroup_subsys_state { /* reference count - access via css_[try]get() and css_put() */ struct percpu_ref refcnt; - /* siblings list anchored at the parent's ->children */ + /* + * siblings list anchored at the parent's ->children + * + * linkage is protected by cgroup_mutex or RCU + */ struct list_head sibling; struct list_head children; @@ -789,6 +793,11 @@ struct cgroup_subsys { extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; +struct cgroup_of_peak { + unsigned long value; + struct list_head list; +}; + /** * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups * @tsk: target task diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c60ba0ab1462..3e0563753cc3 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -854,4 +855,6 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {} struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id); +struct cgroup_of_peak *of_peak(struct kernfs_open_file *of); + #endif /* _LINUX_CGROUP_H */ diff --git a/include/linux/cma.h b/include/linux/cma.h index 9db877506ea8..d15b64f51336 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -52,4 +52,20 @@ extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); extern void cma_reserve_pages_on_error(struct cma *cma); + +#ifdef CONFIG_CMA +struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp); +bool cma_free_folio(struct cma *cma, const struct folio *folio); +#else +static inline struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp) +{ + return NULL; +} + +static inline bool cma_free_folio(struct cma *cma, const struct folio *folio) +{ + return false; +} +#endif + #endif diff --git a/include/linux/damon.h b/include/linux/damon.h index 27c546bfc6d4..a67f2c4940e9 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -233,7 +233,6 @@ struct damos_quota { unsigned long charge_addr_from; /* For prioritization */ - unsigned long histogram[DAMOS_MAX_SCORE + 1]; unsigned int min_score; /* For feedback loop */ @@ -630,6 +629,8 @@ struct damon_ctx { unsigned long next_ops_update_sis; /* for waiting until the execution of the kdamond_fn is started */ struct completion kdamond_started; + /* for scheme quotas prioritization */ + unsigned long *regions_score_histogram; /* public: */ struct task_struct *kdamond; diff --git a/include/linux/err.h b/include/linux/err.h index b5d9bb2a2349..a4dacd745fcf 100644 --- a/include/linux/err.h +++ b/include/linux/err.h @@ -41,6 +41,9 @@ static inline void * __must_check ERR_PTR(long error) return (void *) error; } +/* Return the pointer in the percpu address space. */ +#define ERR_PTR_PCPU(error) ((void __percpu *)(unsigned long)ERR_PTR(error)) + /** * PTR_ERR - Extract the error code from an error pointer. * @ptr: An error pointer. @@ -51,6 +54,9 @@ static inline long __must_check PTR_ERR(__force const void *ptr) return (long) ptr; } +/* Read an error pointer from the percpu address space. */ +#define PTR_ERR_PCPU(ptr) (PTR_ERR((const void *)(__force const unsigned long)(ptr))) + /** * IS_ERR - Detect an error pointer. * @ptr: The pointer to check. @@ -61,6 +67,9 @@ static inline bool __must_check IS_ERR(__force const void *ptr) return IS_ERR_VALUE((unsigned long)ptr); } +/* Read an error pointer from the percpu address space. */ +#define IS_ERR_PCPU(ptr) (IS_ERR((const void *)(__force const unsigned long)(ptr))) + /** * IS_ERR_OR_NULL - Detect an error pointer or a null pointer. * @ptr: The pointer to check. diff --git a/include/linux/fs.h b/include/linux/fs.h index 0df3e5f0dd2b..6b8df574729c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3513,7 +3513,6 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, if (flags & RWF_NOWAIT) { if (!(ki->ki_filp->f_mode & FMODE_NOWAIT)) return -EOPNOTSUPP; - kiocb_flags |= IOCB_NOIO; } if (flags & RWF_ATOMIC) { if (rw_type != WRITE) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f53f76e0b17e..a951de920e20 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -319,7 +319,7 @@ static inline struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order } static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) { - return __folio_alloc_node(gfp, order, numa_node_id()); + return __folio_alloc_node_noprof(gfp, order, numa_node_id()); } static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *mpol, pgoff_t ilx, int nid) @@ -446,4 +446,27 @@ extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_ #endif void free_contig_range(unsigned long pfn, unsigned long nr_pages); +#ifdef CONFIG_CONTIG_ALLOC +static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp, + int nid, nodemask_t *node) +{ + struct page *page; + + if (WARN_ON(!order || !(gfp & __GFP_COMP))) + return NULL; + + page = alloc_contig_pages_noprof(1 << order, gfp, nid, node); + + return page ? page_folio(page) : NULL; +} +#else +static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp, + int nid, nodemask_t *node) +{ + return NULL; +} +#endif +/* This should be paired with folio_put() rather than free_contig_range(). */ +#define folio_alloc_gigantic(...) alloc_hooks(folio_alloc_gigantic_noprof(__VA_ARGS__)) + #endif /* __LINUX_GFP_H */ diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 313be4ad79fd..65db9349f905 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -215,7 +215,8 @@ enum { * the caller still has to check for failures) while costly requests try to be * not disruptive and back off even without invoking the OOM killer. * The following three modifiers might be used to override some of these - * implicit rules. + * implicit rules. Please note that all of them must be used along with + * %__GFP_DIRECT_RECLAIM flag. * * %__GFP_NORETRY: The VM implementation will try only very lightweight * memory direct reclaim to get some memory under memory pressure (thus @@ -246,11 +247,14 @@ enum { * cannot handle allocation failures. The allocation could block * indefinitely but will never return with failure. Testing for * failure is pointless. + * It _must_ be blockable and used together with __GFP_DIRECT_RECLAIM. + * It should _never_ be used in non-sleepable contexts. * New users should be evaluated carefully (and the flag should be * used only when there is no reasonable failure policy) but it is * definitely preferable to use the flag rather than opencode endless * loop around allocator. - * Using this flag for costly allocations is _highly_ discouraged. + * Allocating pages from the buddy with __GFP_NOFAIL and order > 1 is + * not supported. Please consider using kvmalloc() instead. */ #define __GFP_IO ((__force gfp_t)___GFP_IO) #define __GFP_FS ((__force gfp_t)___GFP_FS) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 7a570e0437c9..67d0ab3c3bba 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -76,9 +76,9 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; /* * Mask of all large folio orders supported for file THP. Folios in a DAX * file is never split and the MAX_PAGECACHE_ORDER limit does not apply to - * it. + * it. Same to PFNMAPs where there's neither page* nor pagecache. */ -#define THP_ORDERS_ALL_FILE_DAX \ +#define THP_ORDERS_ALL_SPECIAL \ (BIT(PMD_ORDER) | BIT(PUD_ORDER)) #define THP_ORDERS_ALL_FILE_DEFAULT \ ((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0)) @@ -87,7 +87,7 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; * Mask of all large folio orders supported for THP. */ #define THP_ORDERS_ALL \ - (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DAX | THP_ORDERS_ALL_FILE_DEFAULT) + (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_SPECIAL | THP_ORDERS_ALL_FILE_DEFAULT) #define TVA_SMAPS (1 << 0) /* Will be used for procfs */ #define TVA_IN_PF (1 << 1) /* Page fault handler */ @@ -116,6 +116,53 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; #define HPAGE_PUD_MASK (~(HPAGE_PUD_SIZE - 1)) #define HPAGE_PUD_SIZE ((1UL) << HPAGE_PUD_SHIFT) +enum mthp_stat_item { + MTHP_STAT_ANON_FAULT_ALLOC, + MTHP_STAT_ANON_FAULT_FALLBACK, + MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, + MTHP_STAT_SWPOUT, + MTHP_STAT_SWPOUT_FALLBACK, + MTHP_STAT_SHMEM_ALLOC, + MTHP_STAT_SHMEM_FALLBACK, + MTHP_STAT_SHMEM_FALLBACK_CHARGE, + MTHP_STAT_SPLIT, + MTHP_STAT_SPLIT_FAILED, + MTHP_STAT_SPLIT_DEFERRED, + MTHP_STAT_NR_ANON, + MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, + __MTHP_STAT_COUNT +}; + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS) +struct mthp_stat { + unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT]; +}; + +DECLARE_PER_CPU(struct mthp_stat, mthp_stats); + +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta) +{ + if (order <= 0 || order > PMD_ORDER) + return; + + this_cpu_add(mthp_stats.stats[order][item], delta); +} + +static inline void count_mthp_stat(int order, enum mthp_stat_item item) +{ + mod_mthp_stat(order, item, 1); +} + +#else +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta) +{ +} + +static inline void count_mthp_stat(int order, enum mthp_stat_item item) +{ +} +#endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern unsigned long transparent_hugepage_flags; @@ -271,41 +318,6 @@ struct thpsize { #define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj) -enum mthp_stat_item { - MTHP_STAT_ANON_FAULT_ALLOC, - MTHP_STAT_ANON_FAULT_FALLBACK, - MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, - MTHP_STAT_SWPOUT, - MTHP_STAT_SWPOUT_FALLBACK, - MTHP_STAT_SHMEM_ALLOC, - MTHP_STAT_SHMEM_FALLBACK, - MTHP_STAT_SHMEM_FALLBACK_CHARGE, - MTHP_STAT_SPLIT, - MTHP_STAT_SPLIT_FAILED, - MTHP_STAT_SPLIT_DEFERRED, - __MTHP_STAT_COUNT -}; - -struct mthp_stat { - unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT]; -}; - -#ifdef CONFIG_SYSFS -DECLARE_PER_CPU(struct mthp_stat, mthp_stats); - -static inline void count_mthp_stat(int order, enum mthp_stat_item item) -{ - if (order <= 0 || order > PMD_ORDER) - return; - - this_cpu_inc(mthp_stats.stats[order][item]); -} -#else -static inline void count_mthp_stat(int order, enum mthp_stat_item item) -{ -} -#endif - #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ (1< 1; @@ -1287,8 +1285,7 @@ bool __vma_private_lock(struct vm_area_struct *vma); static inline pte_t * hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { -#if defined(CONFIG_HUGETLB_PAGE) && \ - defined(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && defined(CONFIG_LOCKDEP) +#if defined(CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING) && defined(CONFIG_LOCKDEP) struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; /* diff --git a/include/linux/kernel-page-flags.h b/include/linux/kernel-page-flags.h index 859f4b0c1b2b..196778a087c4 100644 --- a/include/linux/kernel-page-flags.h +++ b/include/linux/kernel-page-flags.h @@ -10,12 +10,11 @@ */ #define KPF_RESERVED 32 #define KPF_MLOCKED 33 -#define KPF_MAPPEDTODISK 34 +#define KPF_OWNER_2 34 #define KPF_PRIVATE 35 #define KPF_PRIVATE_2 36 #define KPF_OWNER_PRIVATE 37 #define KPF_ARCH 38 -#define KPF_UNCACHED 39 #define KPF_SOFTDIRTY 40 #define KPF_ARCH_2 41 #define KPF_ARCH_3 42 diff --git a/include/linux/kfence.h b/include/linux/kfence.h index 88100cc9caba..0ad1ddbb8b99 100644 --- a/include/linux/kfence.h +++ b/include/linux/kfence.h @@ -124,7 +124,7 @@ static __always_inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp if (!static_branch_likely(&kfence_allocation_key)) return NULL; #endif - if (likely(atomic_read(&kfence_allocation_gate))) + if (likely(atomic_read(&kfence_allocation_gate) > 0)) return NULL; return __kfence_alloc(s, size, flags); } diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index f68865e19b0b..30baae91b225 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -4,6 +4,7 @@ #include /* MMF_VM_HUGEPAGE */ +extern unsigned int khugepaged_max_ptes_none __read_mostly; #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern struct attribute_group khugepaged_attr_group; diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index a53ad4dabd7e..c2c11004085e 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -52,9 +52,9 @@ * bit in the node type. This is possible by using bit 1 to indicate if bit 2 * is part of the type or the slot. * - * Once the type is decided, the decision of an allocation range type or a range - * type is done by examining the immutable tree flag for the MAPLE_ALLOC_RANGE - * flag. + * Once the type is decided, the decision of an allocation range type or a + * range type is done by examining the immutable tree flag for the + * MT_FLAGS_ALLOC_RANGE flag. * * Node types: * 0x??1 = Root @@ -148,6 +148,18 @@ enum maple_type { maple_arange_64, }; +enum store_type { + wr_invalid, + wr_new_root, + wr_store_root, + wr_exact_fit, + wr_spanning_store, + wr_split_store, + wr_rebalance, + wr_append, + wr_node_store, + wr_slot_store, +}; /** * DOC: Maple tree flags @@ -436,6 +448,7 @@ struct ma_state { unsigned char offset; unsigned char mas_flags; unsigned char end; /* The end of the node */ + enum store_type store_type; /* The type of store needed for this operation */ }; struct ma_wr_state { @@ -477,6 +490,7 @@ struct ma_wr_state { .max = ULONG_MAX, \ .alloc = NULL, \ .mas_flags = 0, \ + .store_type = wr_invalid, \ } #define MA_WR_STATE(name, ma_state, wr_entry) \ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0e5bf25d324f..34d2da05f2f1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -57,7 +57,7 @@ enum memcg_memory_event { struct mem_cgroup_reclaim_cookie { pg_data_t *pgdat; - unsigned int generation; + int generation; }; #ifdef CONFIG_MEMCG @@ -70,6 +70,7 @@ struct mem_cgroup_id { }; struct memcg_vmstats_percpu; +struct memcg1_events_percpu; struct memcg_vmstats; struct lruvec_stats_percpu; struct lruvec_stats; @@ -77,7 +78,7 @@ struct lruvec_stats; struct mem_cgroup_reclaim_iter { struct mem_cgroup *position; /* scan generation, increased every round-trip */ - unsigned int generation; + atomic_t generation; }; /* @@ -193,6 +194,11 @@ struct mem_cgroup { struct page_counter memsw; /* v1 only */ }; + /* registered local peak watchers */ + struct list_head memory_peaks; + struct list_head swap_peaks; + spinlock_t peaks_lock; + /* Range enforcement for interrupt charges */ struct work_struct high_work; @@ -270,6 +276,8 @@ struct mem_cgroup { struct page_counter kmem; /* v1 only */ struct page_counter tcpmem; /* v1 only */ + struct memcg1_events_percpu __percpu *events_percpu; + unsigned long soft_limit; /* protected by memcg_oom_lock */ @@ -361,11 +369,11 @@ static inline bool folio_memcg_kmem(struct folio *folio); * After the initialization objcg->memcg is always pointing at * a valid memcg, but can be atomically swapped to the parent memcg. * - * The caller must ensure that the returned memcg won't be released: - * e.g. acquire the rcu_read_lock or css_set_lock. + * The caller must ensure that the returned memcg won't be released. */ static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg) { + lockdep_assert_once(rcu_read_lock_held() || lockdep_is_held(&cgroup_mutex)); return READ_ONCE(objcg->memcg); } @@ -439,6 +447,19 @@ static inline struct mem_cgroup *folio_memcg(struct folio *folio) return __folio_memcg(folio); } +/* + * folio_memcg_charged - If a folio is charged to a memory cgroup. + * @folio: Pointer to the folio. + * + * Returns true if folio is charged to a memory cgroup, otherwise returns false. + */ +static inline bool folio_memcg_charged(struct folio *folio) +{ + if (folio_memcg_kmem(folio)) + return __folio_objcg(folio) != NULL; + return __folio_memcg(folio) != NULL; +} + /** * folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio. * @folio: Pointer to the folio. @@ -455,7 +476,6 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) unsigned long memcg_data = READ_ONCE(folio->memcg_data); VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); - WARN_ON_ONCE(!rcu_read_lock_held()); if (memcg_data & MEMCG_DATA_KMEM) { struct obj_cgroup *objcg; @@ -464,6 +484,8 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) return obj_cgroup_memcg(objcg); } + WARN_ON_ONCE(!rcu_read_lock_held()); + return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); } @@ -677,7 +699,8 @@ int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); -void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry); + +void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); void __mem_cgroup_uncharge(struct folio *folio); @@ -762,6 +785,8 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); struct mem_cgroup *get_mem_cgroup_from_current(void); +struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio); + struct lruvec *folio_lruvec_lock(struct folio *folio); struct lruvec *folio_lruvec_lock_irq(struct folio *folio); struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, @@ -1006,8 +1031,8 @@ static inline void count_memcg_folio_events(struct folio *folio, count_memcg_events(memcg, idx, nr); } -static inline void count_memcg_event_mm(struct mm_struct *mm, - enum vm_event_item idx) +static inline void count_memcg_events_mm(struct mm_struct *mm, + enum vm_event_item idx, unsigned long count) { struct mem_cgroup *memcg; @@ -1017,10 +1042,16 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (likely(memcg)) - count_memcg_events(memcg, idx, 1); + count_memcg_events(memcg, idx, count); rcu_read_unlock(); } +static inline void count_memcg_event_mm(struct mm_struct *mm, + enum vm_event_item idx) +{ + count_memcg_events_mm(mm, idx, 1); +} + static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) { @@ -1176,7 +1207,7 @@ static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, return 0; } -static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) +static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr) { } @@ -1240,6 +1271,11 @@ static inline struct mem_cgroup *get_mem_cgroup_from_current(void) return NULL; } +static inline struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio) +{ + return NULL; +} + static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css) { @@ -1462,6 +1498,11 @@ static inline void count_memcg_folio_events(struct folio *folio, { } +static inline void count_memcg_events_mm(struct mm_struct *mm, + enum vm_event_item idx, unsigned long count) +{ +} + static inline void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { @@ -1717,7 +1758,6 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg) return memcg ? memcg->kmemcg_id : -1; } -struct mem_cgroup *mem_cgroup_from_obj(void *p); struct mem_cgroup *mem_cgroup_from_slab_obj(void *p); static inline void count_objcg_event(struct obj_cgroup *objcg, @@ -1780,11 +1820,6 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg) return -1; } -static inline struct mem_cgroup *mem_cgroup_from_obj(void *p) -{ - return NULL; -} - static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) { return NULL; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index ebe876930e78..b27ddce5d324 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -16,54 +16,6 @@ struct resource; struct vmem_altmap; struct dev_pagemap; -#ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION -/* - * For supporting node-hotadd, we have to allocate a new pgdat. - * - * If an arch has generic style NODE_DATA(), - * node_data[nid] = kzalloc() works well. But it depends on the architecture. - * - * In general, generic_alloc_nodedata() is used. - * - */ -extern pg_data_t *arch_alloc_nodedata(int nid); -extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat); - -#else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ - -#define arch_alloc_nodedata(nid) generic_alloc_nodedata(nid) - -#ifdef CONFIG_NUMA -/* - * XXX: node aware allocation can't work well to get new node's memory at this time. - * Because, pgdat for the new node is not allocated/initialized yet itself. - * To use new node's memory, more consideration will be necessary. - */ -#define generic_alloc_nodedata(nid) \ -({ \ - memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); \ -}) - -extern pg_data_t *node_data[]; -static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) -{ - node_data[nid] = pgdat; -} - -#else /* !CONFIG_NUMA */ - -/* never called */ -static inline pg_data_t *generic_alloc_nodedata(int nid) -{ - BUG(); - return NULL; -} -static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) -{ -} -#endif /* CONFIG_NUMA */ -#endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ - #ifdef CONFIG_MEMORY_HOTPLUG struct page *pfn_to_online_page(unsigned long pfn); diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 644be30b69c8..002e49b2ebd9 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -70,6 +70,7 @@ int migrate_pages(struct list_head *l, new_folio_t new, free_folio_t free, unsigned int *ret_succeeded); struct folio *alloc_migration_target(struct folio *src, unsigned long private); bool isolate_movable_page(struct page *page, isolate_mode_t mode); +bool isolate_folio_to_list(struct folio *folio, struct list_head *list); int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src); @@ -91,6 +92,8 @@ static inline struct folio *alloc_migration_target(struct folio *src, { return NULL; } static inline bool isolate_movable_page(struct page *page, isolate_mode_t mode) { return false; } +static inline bool isolate_folio_to_list(struct folio *folio, struct list_head *list) + { return false; } static inline int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src) diff --git a/include/linux/mm.h b/include/linux/mm.h index 13bff7cf03b7..b62437447077 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -98,7 +98,11 @@ extern int mmap_rnd_compat_bits __read_mostly; #endif #ifndef PHYSMEM_END +# ifdef MAX_PHYSMEM_BITS # define PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1) +# else +# define PHYSMEM_END (-1ULL) +# endif #endif #include @@ -1015,27 +1019,6 @@ static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) return mas_prev(&vmi->mas, 0); } -static inline -struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi) -{ - return mas_prev_range(&vmi->mas, 0); -} - -static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) -{ - return vmi->mas.index; -} - -static inline unsigned long vma_iter_end(struct vma_iterator *vmi) -{ - return vmi->mas.last + 1; -} -static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, - unsigned long count) -{ - return mas_expected_entries(&vmi->mas, count); -} - static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, unsigned long start, unsigned long end, gfp_t gfp) { @@ -1259,8 +1242,7 @@ static inline int folio_mapcount(const struct folio *folio) if (likely(!folio_test_large(folio))) { mapcount = atomic_read(&folio->_mapcount) + 1; - /* Handle page_has_type() pages */ - if (mapcount < PAGE_MAPCOUNT_RESERVE + 1) + if (page_mapcount_is_type(mapcount)) mapcount = 0; return mapcount; } @@ -1756,6 +1738,8 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) __set_bit(pid_bit, &vma->numab_state->pids_active[1]); } } + +bool folio_use_access_time(struct folio *folio); #else /* !CONFIG_NUMA_BALANCING */ static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid) { @@ -1809,6 +1793,10 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid) static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) { } +static inline bool folio_use_access_time(struct folio *folio) +{ + return false; +} #endif /* CONFIG_NUMA_BALANCING */ #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) @@ -2158,14 +2146,19 @@ static inline size_t folio_size(const struct folio *folio) * MM ("mapped shared"), or if the folio is only mapped into a single MM * ("mapped exclusively"). * + * For KSM folios, this function also returns "mapped shared" when a folio is + * mapped multiple times into the same MM, because the individual page mappings + * are independent. + * * As precise information is not easily available for all folios, this function * estimates the number of MMs ("sharers") that are currently mapping a folio * using the number of times the first page of the folio is currently mapped * into page tables. * - * For small anonymous folios (except KSM folios) and anonymous hugetlb folios, - * the return value will be exactly correct, because they can only be mapped - * at most once into an MM, and they cannot be partially mapped. + * For small anonymous folios and anonymous hugetlb folios, the return + * value will be exactly correct: non-KSM folios can only be mapped at most once + * into an MM, and they cannot be partially mapped. KSM folios are + * considered shared even if mapped multiple times into the same MM. * * For other folios, the result can be fuzzy: * #. For partially-mappable large folios (THP), the return value can wrongly @@ -2174,9 +2167,6 @@ static inline size_t folio_size(const struct folio *folio) * #. For pagecache folios (including hugetlb), the return value can wrongly * indicate "mapped shared" (false positive) when two VMAs in the same MM * cover the same file range. - * #. For (small) KSM folios, the return value can wrongly indicate "mapped - * shared" (false positive), when the folio is mapped multiple times into - * the same MM. * * Further, this function only considers current page table mappings that * are tracked using the folio mapcount(s). @@ -2210,26 +2200,10 @@ static inline bool folio_likely_mapped_shared(struct folio *folio) return atomic_read(&folio->_mapcount) > 0; } -#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE -static inline int arch_make_page_accessible(struct page *page) -{ - return 0; -} -#endif - #ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE static inline int arch_make_folio_accessible(struct folio *folio) { - int ret; - long i, nr = folio_nr_pages(folio); - - for (i = 0; i < nr; i++) { - ret = arch_make_page_accessible(folio_page(folio, i)); - if (ret) - break; - } - - return ret; + return 0; } #endif @@ -2409,11 +2383,40 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); -int follow_pte(struct vm_area_struct *vma, unsigned long address, - pte_t **ptepp, spinlock_t **ptlp); int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); +struct follow_pfnmap_args { + /** + * Inputs: + * @vma: Pointer to @vm_area_struct struct + * @address: the virtual address to walk + */ + struct vm_area_struct *vma; + unsigned long address; + /** + * Internals: + * + * The caller shouldn't touch any of these. + */ + spinlock_t *lock; + pte_t *ptep; + /** + * Outputs: + * + * @pfn: the PFN of the address + * @pgprot: the pgprot_t of the mapping + * @writable: whether the mapping is writable + * @special: whether the mapping is a special mapping (real PFN maps) + */ + unsigned long pfn; + pgprot_t pgprot; + bool writable; + bool special; +}; +int follow_pfnmap_start(struct follow_pfnmap_args *args); +void follow_pfnmap_end(struct follow_pfnmap_args *args); + extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); @@ -2541,11 +2544,6 @@ int set_page_dirty_lock(struct page *page); int get_cmdline(struct task_struct *task, char *buffer, int buflen); -extern unsigned long move_page_tables(struct vm_area_struct *vma, - unsigned long old_addr, struct vm_area_struct *new_vma, - unsigned long new_addr, unsigned long len, - bool need_rmap_locks, bool for_stack); - /* * Flags used by change_protection(). For now we make it a bitmap so * that we can pass in multiple flags just like parameters. However @@ -2566,21 +2564,6 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma, #define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ MM_CP_UFFD_WP_RESOLVE) -bool vma_needs_dirty_tracking(struct vm_area_struct *vma); -bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); -static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) -{ - /* - * We want to check manually if we can change individual PTEs writable - * if we can't do that automatically for all PTEs in a mapping. For - * private mappings, that's always the case when we have write - * permissions as we properly have to handle COW. - */ - if (vma->vm_flags & VM_SHARED) - return vma_wants_writenotify(vma, vma->vm_page_prot); - return !!(vma->vm_flags & VM_WRITE); - -} bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, pte_t pte); extern long change_protection(struct mmu_gather *tlb, @@ -2704,6 +2687,30 @@ static inline pte_t pte_mkspecial(pte_t pte) } #endif +#ifndef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP +static inline bool pmd_special(pmd_t pmd) +{ + return false; +} + +static inline pmd_t pmd_mkspecial(pmd_t pmd) +{ + return pmd; +} +#endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */ + +#ifndef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP +static inline bool pud_special(pud_t pud) +{ + return false; +} + +static inline pud_t pud_mkspecial(pud_t pud) +{ + return pud; +} +#endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */ + #ifndef CONFIG_ARCH_HAS_PTE_DEVMAP static inline int pte_devmap(pte_t pte) { @@ -2896,7 +2903,7 @@ static inline void pagetable_free(struct ptdesc *pt) __free_pages(page, compound_order(page)); } -#if USE_SPLIT_PTE_PTLOCKS +#if defined(CONFIG_SPLIT_PTE_PTLOCKS) #if ALLOC_SPLIT_PTLOCKS void __init ptlock_cache_init(void); bool ptlock_alloc(struct ptdesc *ptdesc); @@ -2954,7 +2961,7 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) return true; } -#else /* !USE_SPLIT_PTE_PTLOCKS */ +#else /* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */ /* * We use mm->page_table_lock to guard all pagetable pages of the mm. */ @@ -2969,7 +2976,7 @@ static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte) static inline void ptlock_cache_init(void) {} static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} -#endif /* USE_SPLIT_PTE_PTLOCKS */ +#endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) { @@ -3029,7 +3036,7 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd, ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \ NULL: pte_offset_kernel(pmd, address)) -#if USE_SPLIT_PMD_PTLOCKS +#if defined(CONFIG_SPLIT_PMD_PTLOCKS) static inline struct page *pmd_pgtable_page(pmd_t *pmd) { @@ -3288,78 +3295,9 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node); /* mmap.c */ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); -extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff, - struct vm_area_struct *next); -extern int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff); -extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); -extern void unlink_file_vma(struct vm_area_struct *); -extern struct vm_area_struct *copy_vma(struct vm_area_struct **, - unsigned long addr, unsigned long len, pgoff_t pgoff, - bool *need_rmap_locks); extern void exit_mmap(struct mm_struct *); -struct vm_area_struct *vma_modify(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, unsigned long end, - unsigned long vm_flags, - struct mempolicy *policy, - struct vm_userfaultfd_ctx uffd_ctx, - struct anon_vma_name *anon_name); - -/* We are about to modify the VMA's flags. */ -static inline struct vm_area_struct -*vma_modify_flags(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, unsigned long end, - unsigned long new_flags) -{ - return vma_modify(vmi, prev, vma, start, end, new_flags, - vma_policy(vma), vma->vm_userfaultfd_ctx, - anon_vma_name(vma)); -} - -/* We are about to modify the VMA's flags and/or anon_name. */ -static inline struct vm_area_struct -*vma_modify_flags_name(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, - unsigned long end, - unsigned long new_flags, - struct anon_vma_name *new_name) -{ - return vma_modify(vmi, prev, vma, start, end, new_flags, - vma_policy(vma), vma->vm_userfaultfd_ctx, new_name); -} - -/* We are about to modify the VMA's memory policy. */ -static inline struct vm_area_struct -*vma_modify_policy(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct mempolicy *new_pol) -{ - return vma_modify(vmi, prev, vma, start, end, vma->vm_flags, - new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma)); -} - -/* We are about to modify the VMA's flags and/or uffd context. */ -static inline struct vm_area_struct -*vma_modify_flags_uffd(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, unsigned long end, - unsigned long new_flags, - struct vm_userfaultfd_ctx new_ctx) -{ - return vma_modify(vmi, prev, vma, start, end, new_flags, - vma_policy(vma), new_ctx, anon_vma_name(vma)); -} +int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); static inline int check_data_rlimit(unsigned long rlim, unsigned long new, @@ -3392,10 +3330,6 @@ extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long flags, const struct vm_special_mapping *spec); -/* This is an obsolete alternative to _install_special_mapping. */ -extern int install_special_mapping(struct mm_struct *mm, - unsigned long addr, unsigned long len, - unsigned long flags, struct page **pages); unsigned long randomize_stack_top(unsigned long stack_top); unsigned long randomize_page(unsigned long start, unsigned long range); @@ -3421,14 +3355,14 @@ extern unsigned long do_mmap(struct file *file, unsigned long addr, extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock); +int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, + struct mm_struct *mm, unsigned long start, + unsigned long end, struct list_head *uf, bool unlock); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); #ifdef CONFIG_MMU -extern int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct list_head *uf, bool unlock); extern int __mm_populate(unsigned long addr, unsigned long len, int ignore_errors); static inline void mm_populate(unsigned long addr, unsigned long len) @@ -3656,9 +3590,6 @@ static inline vm_fault_t vmf_fs_error(int err) return VM_FAULT_SIGBUS; } -struct page *follow_page(struct vm_area_struct *vma, unsigned long address, - unsigned int foll_flags); - static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) { if (vm_fault & VM_FAULT_OOM) @@ -4194,18 +4125,18 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start, #ifdef CONFIG_UNACCEPTED_MEMORY -bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end); -void accept_memory(phys_addr_t start, phys_addr_t end); +bool range_contains_unaccepted_memory(phys_addr_t start, unsigned long size); +void accept_memory(phys_addr_t start, unsigned long size); #else static inline bool range_contains_unaccepted_memory(phys_addr_t start, - phys_addr_t end) + unsigned long size) { return false; } -static inline void accept_memory(phys_addr_t start, phys_addr_t end) +static inline void accept_memory(phys_addr_t start, unsigned long size) { } @@ -4213,9 +4144,7 @@ static inline void accept_memory(phys_addr_t start, phys_addr_t end) static inline bool pfn_is_unaccepted_memory(unsigned long pfn) { - phys_addr_t paddr = pfn << PAGE_SHIFT; - - return range_contains_unaccepted_memory(paddr, paddr + PAGE_SIZE); + return range_contains_unaccepted_memory(pfn << PAGE_SHIFT, PAGE_SIZE); } void vma_pgtable_walk_begin(struct vm_area_struct *vma); @@ -4233,4 +4162,61 @@ static inline int do_mseal(unsigned long start, size_t len_in, unsigned long fla } #endif +#ifdef CONFIG_MEM_ALLOC_PROFILING +static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) +{ + int i; + struct alloc_tag *tag; + unsigned int nr_pages = 1 << new_order; + + if (!mem_alloc_profiling_enabled()) + return; + + tag = pgalloc_tag_get(&folio->page); + if (!tag) + return; + + for (i = nr_pages; i < (1 << old_order); i += nr_pages) { + union codetag_ref *ref = get_page_tag_ref(folio_page(folio, i)); + + if (ref) { + /* Set new reference to point to the original tag */ + alloc_tag_ref_set(ref, tag); + put_page_tag_ref(ref); + } + } +} + +static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) +{ + struct alloc_tag *tag; + union codetag_ref *ref; + + tag = pgalloc_tag_get(&old->page); + if (!tag) + return; + + ref = get_page_tag_ref(&new->page); + if (!ref) + return; + + /* Clear the old ref to the original allocation tag. */ + clear_page_tag_ref(&old->page); + /* Decrement the counters of the tag on get_new_folio. */ + alloc_tag_sub(ref, folio_nr_pages(new)); + + __alloc_tag_ref_set(ref, tag); + + put_page_tag_ref(ref); +} +#else /* !CONFIG_MEM_ALLOC_PROFILING */ +static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) +{ +} + +static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) +{ +} +#endif /* CONFIG_MEM_ALLOC_PROFILING */ + #endif /* _LINUX_MM_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 485424979254..6e3bdf8e38bc 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -109,7 +109,7 @@ struct page { /** * @private: Mapping-private opaque data. * Usually used for buffer_heads if PagePrivate. - * Used for swp_entry_t if PageSwapCache. + * Used for swp_entry_t if swapcache flag set. * Indicates order in the buddy system if PageBuddy. */ unsigned long private; @@ -660,6 +660,9 @@ struct vma_numab_state { * per VM-area/task. A VM area is any part of the process virtual memory * space that has a special rule for the page-fault handlers (ie a shared * library, the executable area etc). + * + * Only explicitly marked struct members may be accessed by RCU readers before + * getting a stable reference. */ struct vm_area_struct { /* The first cache line has the info for VMA tree walking. */ @@ -675,7 +678,11 @@ struct vm_area_struct { #endif }; - struct mm_struct *vm_mm; /* The address space we belong to. */ + /* + * The address space we belong to. + * Unstable RCU readers are allowed to read this. + */ + struct mm_struct *vm_mm; pgprot_t vm_page_prot; /* Access permissions of this VMA. */ /* @@ -688,7 +695,10 @@ struct vm_area_struct { }; #ifdef CONFIG_PER_VMA_LOCK - /* Flag to indicate areas detached from the mm->mm_mt tree */ + /* + * Flag to indicate areas detached from the mm->mm_mt tree. + * Unstable RCU readers are allowed to read this. + */ bool detached; /* @@ -706,6 +716,7 @@ struct vm_area_struct { * slowpath. */ int vm_lock_seq; + /* Unstable RCU readers are allowed to read this. */ struct vma_lock *vm_lock; #endif @@ -947,7 +958,7 @@ struct mm_struct { #ifdef CONFIG_MMU_NOTIFIER struct mmu_notifier_subscriptions *notifier_subscriptions; #endif -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS) pgtable_t pmd_huge_pte; /* protected by page_table_lock */ #endif #ifdef CONFIG_NUMA_BALANCING @@ -1313,6 +1324,9 @@ struct vm_special_mapping { int (*mremap)(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma); + + void (*close)(const struct vm_special_mapping *sm, + struct vm_area_struct *vma); }; enum tlb_flush_reason { diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index a2f6179b672b..bff5706b76e1 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -16,9 +16,6 @@ #include #endif -#define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) -#define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \ - IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK)) #define ALLOC_SPLIT_PTLOCKS (SPINLOCK_SIZE > BITS_PER_LONG/8) /* diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1dc6248feb83..17506e4a2835 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -666,11 +666,6 @@ enum zone_watermarks { #define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1)) #define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP) -#define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost) -#define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost) -#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost) -#define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost) - /* * Flags used in pcp->flags field. * @@ -1016,6 +1011,32 @@ enum zone_flags { ZONE_BELOW_HIGH, /* zone is below high watermark. */ }; +static inline unsigned long wmark_pages(const struct zone *z, + enum zone_watermarks w) +{ + return z->_watermark[w] + z->watermark_boost; +} + +static inline unsigned long min_wmark_pages(const struct zone *z) +{ + return wmark_pages(z, WMARK_MIN); +} + +static inline unsigned long low_wmark_pages(const struct zone *z) +{ + return wmark_pages(z, WMARK_LOW); +} + +static inline unsigned long high_wmark_pages(const struct zone *z) +{ + return wmark_pages(z, WMARK_HIGH); +} + +static inline unsigned long promo_wmark_pages(const struct zone *z) +{ + return wmark_pages(z, WMARK_PROMO); +} + static inline unsigned long zone_managed_pages(struct zone *zone) { return (unsigned long)atomic_long_read(&zone->managed_pages); @@ -1688,7 +1709,7 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, zone = zonelist_zone(z)) #define for_next_zone_zonelist_nodemask(zone, z, highidx, nodemask) \ - for (zone = z->zone; \ + for (zone = zonelist_zone(z); \ zone; \ z = next_zones_zonelist(++z, highidx, nodemask), \ zone = zonelist_zone(z)) @@ -1724,7 +1745,7 @@ static inline bool movable_only_nodes(nodemask_t *nodes) nid = first_node(*nodes); zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, ZONE_NORMAL, nodes); - return (!z->zone) ? true : false; + return (!zonelist_zone(z)) ? true : false; } diff --git a/include/linux/numa.h b/include/linux/numa.h index eb19503604fe..3567e40329eb 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -30,6 +30,12 @@ static inline bool numa_valid_node(int nid) #ifdef CONFIG_NUMA #include +extern struct pglist_data *node_data[]; +#define NODE_DATA(nid) (node_data[nid]) + +void __init alloc_node_data(int nid); +void __init alloc_offline_node_data(int nid); + /* Generic implementation available */ int numa_nearest_node(int node, unsigned int state); @@ -57,6 +63,8 @@ static inline int phys_to_target_node(u64 start) { return 0; } + +static inline void alloc_offline_node_data(int nid) {} #endif #define numa_map_to_online_node(node) numa_nearest_node(node, N_ONLINE) diff --git a/include/linux/numa_memblks.h b/include/linux/numa_memblks.h new file mode 100644 index 000000000000..cfad6ce7e1bd --- /dev/null +++ b/include/linux/numa_memblks.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NUMA_MEMBLKS_H +#define __NUMA_MEMBLKS_H + +#ifdef CONFIG_NUMA_MEMBLKS +#include + +#define NR_NODE_MEMBLKS (MAX_NUMNODES * 2) + +void __init numa_set_distance(int from, int to, int distance); +void __init numa_reset_distance(void); + +struct numa_memblk { + u64 start; + u64 end; + int nid; +}; + +struct numa_meminfo { + int nr_blks; + struct numa_memblk blk[NR_NODE_MEMBLKS]; +}; + +int __init numa_add_memblk(int nodeid, u64 start, u64 end); +void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi); + +int __init numa_cleanup_meminfo(struct numa_meminfo *mi); + +int __init numa_memblks_init(int (*init_func)(void), + bool memblock_force_top_down); + +#ifdef CONFIG_NUMA_EMU +int numa_emu_cmdline(char *str); +void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys, + unsigned int nr_emu_nids); +u64 __init numa_emu_dma_end(void); +void __init numa_emulation(struct numa_meminfo *numa_meminfo, + int numa_dist_cnt); +#else +static inline void numa_emulation(struct numa_meminfo *numa_meminfo, + int numa_dist_cnt) +{ } +static inline int numa_emu_cmdline(char *str) +{ + return -EINVAL; +} +#endif /* CONFIG_NUMA_EMU */ + +#ifdef CONFIG_NUMA_KEEP_MEMINFO +extern int phys_to_target_node(u64 start); +#define phys_to_target_node phys_to_target_node +extern int memory_add_physaddr_to_nid(u64 start); +#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid +#endif /* CONFIG_NUMA_KEEP_MEMINFO */ + +#endif /* CONFIG_NUMA_MEMBLKS */ + +#endif /* __NUMA_MEMBLKS_H */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5769fe6e4950..1b3a76710487 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -66,8 +66,6 @@ * PG_referenced, PG_reclaim are used for page reclaim for anonymous and * file-backed pagecache (see mm/vmscan.c). * - * PG_error is set to indicate that an I/O error occurred on this page. - * * PG_arch_1 is an architecture specific page state bit. The generic code * guarantees that this bit is cleared for a page when it first is entered into * the page cache. @@ -103,22 +101,18 @@ enum pageflags { PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */ PG_active, PG_workingset, - PG_error, - PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/ + PG_owner_priv_1, /* Owner use. If pagecache, fs may use */ + PG_owner_2, /* Owner use. If pagecache, fs may use */ PG_arch_1, PG_reserved, PG_private, /* If pagecache, has fs-private data */ PG_private_2, /* If pagecache, has fs aux data */ - PG_mappedtodisk, /* Has blocks allocated on-disk */ PG_reclaim, /* To be reclaimed asap */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ #ifdef CONFIG_MMU PG_mlocked, /* Page is vma mlocked */ #endif -#ifdef CONFIG_ARCH_USES_PG_UNCACHED - PG_uncached, /* Page has been mapped as uncached */ -#endif #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. Don't touch */ #endif @@ -126,14 +120,21 @@ enum pageflags { PG_young, PG_idle, #endif -#ifdef CONFIG_ARCH_USES_PG_ARCH_X +#ifdef CONFIG_ARCH_USES_PG_ARCH_2 PG_arch_2, +#endif +#ifdef CONFIG_ARCH_USES_PG_ARCH_3 PG_arch_3, #endif __NR_PAGEFLAGS, PG_readahead = PG_reclaim, + /* Anonymous memory (and shmem) */ + PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */ + /* Some filesystems */ + PG_checked = PG_owner_priv_1, + /* * Depending on the way an anonymous folio can be mapped into a page * table (e.g., single PMD/PUD/CONT of the head page vs. PTE-mapped @@ -141,13 +142,13 @@ enum pageflags { * tail pages of an anonymous folio. For now, we only expect it to be * set on tail pages for PTE-mapped THP. */ - PG_anon_exclusive = PG_mappedtodisk, + PG_anon_exclusive = PG_owner_2, - /* Filesystems */ - PG_checked = PG_owner_priv_1, - - /* SwapBacked */ - PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */ + /* + * Set if all buffer heads in the folio are mapped. + * Filesystems which do not use BHs can use it for their own purpose. + */ + PG_mappedtodisk = PG_owner_2, /* Two page bits are conscripted by FS-Cache to maintain local caching * state. These bits are set on pages belonging to the netfs's inodes @@ -183,8 +184,9 @@ enum pageflags { */ /* At least one page in this folio has the hwpoison flag set */ - PG_has_hwpoisoned = PG_error, + PG_has_hwpoisoned = PG_active, PG_large_rmappable = PG_workingset, /* anon or file-backed */ + PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */ }; #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1) @@ -235,7 +237,7 @@ static __always_inline int page_is_fake_head(const struct page *page) return page_fixed_fake_head(page) != page; } -static inline unsigned long _compound_head(const struct page *page) +static __always_inline unsigned long _compound_head(const struct page *page) { unsigned long head = READ_ONCE(page->compound_head); @@ -506,7 +508,6 @@ static inline int TestClearPage##uname(struct page *page) { return 0; } __PAGEFLAG(Locked, locked, PF_NO_TAIL) FOLIO_FLAG(waiters, FOLIO_HEAD_PAGE) -PAGEFLAG(Error, error, PF_NO_TAIL) TESTCLEARFLAG(Error, error, PF_NO_TAIL) FOLIO_FLAG(referenced, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(referenced, FOLIO_HEAD_PAGE) __FOLIO_SET_FLAG(referenced, FOLIO_HEAD_PAGE) @@ -514,8 +515,9 @@ PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD) __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD) PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD) TESTCLEARFLAG(LRU, lru, PF_HEAD) -PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD) - TESTCLEARFLAG(Active, active, PF_HEAD) +FOLIO_FLAG(active, FOLIO_HEAD_PAGE) + __FOLIO_CLEAR_FLAG(active, FOLIO_HEAD_PAGE) + FOLIO_TEST_CLEAR_FLAG(active, FOLIO_HEAD_PAGE) PAGEFLAG(Workingset, workingset, PF_HEAD) TESTCLEARFLAG(Workingset, workingset, PF_HEAD) PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ @@ -531,9 +533,9 @@ PAGEFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND) PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) __SETPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) -PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) - __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) - __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) +FOLIO_FLAG(swapbacked, FOLIO_HEAD_PAGE) + __FOLIO_CLEAR_FLAG(swapbacked, FOLIO_HEAD_PAGE) + __FOLIO_SET_FLAG(swapbacked, FOLIO_HEAD_PAGE) /* * Private page markings that may be used by the filesystem that owns the page @@ -542,8 +544,9 @@ PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) */ PAGEFLAG(Private, private, PF_ANY) PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY) -PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY) - TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY) + +/* owner_2 can be set on tail pages for anon memory */ +FOLIO_FLAG(owner_2, FOLIO_HEAD_PAGE) /* * Only test-and-set exist for PG_writeback. The unconditional operators are @@ -556,8 +559,8 @@ PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL) /* PG_readahead is only used for reads; PG_reclaim is only for writes */ PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL) TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL) -PAGEFLAG(Readahead, readahead, PF_NO_COMPOUND) - TESTCLEARFLAG(Readahead, readahead, PF_NO_COMPOUND) +FOLIO_FLAG(readahead, FOLIO_HEAD_PAGE) + FOLIO_TEST_CLEAR_FLAG(readahead, FOLIO_HEAD_PAGE) #ifdef CONFIG_HIGHMEM /* @@ -577,34 +580,26 @@ static __always_inline bool folio_test_swapcache(const struct folio *folio) test_bit(PG_swapcache, const_folio_flags(folio, 0)); } -static __always_inline bool PageSwapCache(const struct page *page) -{ - return folio_test_swapcache(page_folio(page)); -} - -SETPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL) -CLEARPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL) +FOLIO_SET_FLAG(swapcache, FOLIO_HEAD_PAGE) +FOLIO_CLEAR_FLAG(swapcache, FOLIO_HEAD_PAGE) #else -PAGEFLAG_FALSE(SwapCache, swapcache) +FOLIO_FLAG_FALSE(swapcache) #endif -PAGEFLAG(Unevictable, unevictable, PF_HEAD) - __CLEARPAGEFLAG(Unevictable, unevictable, PF_HEAD) - TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD) +FOLIO_FLAG(unevictable, FOLIO_HEAD_PAGE) + __FOLIO_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE) + FOLIO_TEST_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE) #ifdef CONFIG_MMU -PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) - __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) - TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL) +FOLIO_FLAG(mlocked, FOLIO_HEAD_PAGE) + __FOLIO_CLEAR_FLAG(mlocked, FOLIO_HEAD_PAGE) + FOLIO_TEST_CLEAR_FLAG(mlocked, FOLIO_HEAD_PAGE) + FOLIO_TEST_SET_FLAG(mlocked, FOLIO_HEAD_PAGE) #else -PAGEFLAG_FALSE(Mlocked, mlocked) __CLEARPAGEFLAG_NOOP(Mlocked, mlocked) - TESTSCFLAG_FALSE(Mlocked, mlocked) -#endif - -#ifdef CONFIG_ARCH_USES_PG_UNCACHED -PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND) -#else -PAGEFLAG_FALSE(Uncached, uncached) +FOLIO_FLAG_FALSE(mlocked) + __FOLIO_CLEAR_FLAG_NOOP(mlocked) + FOLIO_TEST_CLEAR_FLAG_FALSE(mlocked) + FOLIO_TEST_SET_FLAG_FALSE(mlocked) #endif #ifdef CONFIG_MEMORY_FAILURE @@ -865,8 +860,18 @@ static inline void ClearPageCompound(struct page *page) ClearPageHead(page); } FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE) +FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE) +/* + * PG_partially_mapped is protected by deferred_split split_queue_lock, + * so its safe to use non-atomic set/clear. + */ +__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE) +__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE) #else FOLIO_FLAG_FALSE(large_rmappable) +FOLIO_TEST_FLAG_FALSE(partially_mapped) +__FOLIO_SET_FLAG_NOOP(partially_mapped) +__FOLIO_CLEAR_FLAG_NOOP(partially_mapped) #endif #define PG_head_mask ((1UL << PG_head)) @@ -927,79 +932,74 @@ PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) #endif /* - * For pages that are never mapped to userspace, - * page_type may be used. Because it is initialised to -1, we invert the - * sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and - * __ClearPageFoo *sets* the bit used for PageFoo. We reserve a few high and - * low bits so that an underflow or overflow of _mapcount won't be - * mistaken for a page type value. + * For pages that do not use mapcount, page_type may be used. + * The low 24 bits of pagetype may be used for your own purposes, as long + * as you are careful to not affect the top 8 bits. The low bits of + * pagetype will be overwritten when you clear the page_type from the page. */ - enum pagetype { - PG_buddy = 0x40000000, - PG_offline = 0x20000000, - PG_table = 0x10000000, - PG_guard = 0x08000000, - PG_hugetlb = 0x04000000, - PG_slab = 0x02000000, - PG_zsmalloc = 0x01000000, + /* 0x00-0x7f are positive numbers, ie mapcount */ + /* Reserve 0x80-0xef for mapcount overflow. */ + PGTY_buddy = 0xf0, + PGTY_offline = 0xf1, + PGTY_table = 0xf2, + PGTY_guard = 0xf3, + PGTY_hugetlb = 0xf4, + PGTY_slab = 0xf5, + PGTY_zsmalloc = 0xf6, + PGTY_unaccepted = 0xf7, - PAGE_TYPE_BASE = 0x80000000, - - /* - * Reserve 0xffff0000 - 0xfffffffe to catch _mapcount underflows and - * allow owners that set a type to reuse the lower 16 bit for their own - * purposes. - */ - PAGE_MAPCOUNT_RESERVE = ~0x0000ffff, + PGTY_mapcount_underflow = 0xff }; -#define PageType(page, flag) \ - ((READ_ONCE(page->page_type) & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) -#define folio_test_type(folio, flag) \ - ((READ_ONCE(folio->page.page_type) & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) - -static inline int page_type_has_type(unsigned int page_type) +static inline bool page_type_has_type(int page_type) { - return (int)page_type < PAGE_MAPCOUNT_RESERVE; + return page_type < (PGTY_mapcount_underflow << 24); } -static inline int page_has_type(const struct page *page) +/* This takes a mapcount which is one more than page->_mapcount */ +static inline bool page_mapcount_is_type(unsigned int mapcount) { - return page_type_has_type(READ_ONCE(page->page_type)); + return page_type_has_type(mapcount - 1); +} + +static inline bool page_has_type(const struct page *page) +{ + return page_mapcount_is_type(data_race(page->page_type)); } #define FOLIO_TYPE_OPS(lname, fname) \ -static __always_inline bool folio_test_##fname(const struct folio *folio)\ +static __always_inline bool folio_test_##fname(const struct folio *folio) \ { \ - return folio_test_type(folio, PG_##lname); \ + return data_race(folio->page.page_type >> 24) == PGTY_##lname; \ } \ static __always_inline void __folio_set_##fname(struct folio *folio) \ { \ - VM_BUG_ON_FOLIO(!folio_test_type(folio, 0), folio); \ - folio->page.page_type &= ~PG_##lname; \ + VM_BUG_ON_FOLIO(data_race(folio->page.page_type) != UINT_MAX, \ + folio); \ + folio->page.page_type = (unsigned int)PGTY_##lname << 24; \ } \ static __always_inline void __folio_clear_##fname(struct folio *folio) \ { \ VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \ - folio->page.page_type |= PG_##lname; \ + folio->page.page_type = UINT_MAX; \ } #define PAGE_TYPE_OPS(uname, lname, fname) \ FOLIO_TYPE_OPS(lname, fname) \ static __always_inline int Page##uname(const struct page *page) \ { \ - return PageType(page, PG_##lname); \ + return data_race(page->page_type >> 24) == PGTY_##lname; \ } \ static __always_inline void __SetPage##uname(struct page *page) \ { \ - VM_BUG_ON_PAGE(!PageType(page, 0), page); \ - page->page_type &= ~PG_##lname; \ + VM_BUG_ON_PAGE(data_race(page->page_type) != UINT_MAX, page); \ + page->page_type = (unsigned int)PGTY_##lname << 24; \ } \ static __always_inline void __ClearPage##uname(struct page *page) \ { \ VM_BUG_ON_PAGE(!Page##uname(page), page); \ - page->page_type |= PG_##lname; \ + page->page_type = UINT_MAX; \ } /* @@ -1076,6 +1076,13 @@ FOLIO_TEST_FLAG_FALSE(hugetlb) PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc) +/* + * Mark pages that has to be accepted before touched for the first time. + * + * Serialized with zone lock. + */ +PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted) + /** * PageHuge - Determine if the page belongs to hugetlbfs * @page: The page to test. @@ -1175,25 +1182,20 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) */ #define PAGE_FLAGS_SECOND \ (0xffUL /* order */ | 1UL << PG_has_hwpoisoned | \ - 1UL << PG_large_rmappable) + 1UL << PG_large_rmappable | 1UL << PG_partially_mapped) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) /** - * page_has_private - Determine if page has private stuff - * @page: The page to be checked + * folio_has_private - Determine if folio has private stuff + * @folio: The folio to be checked * - * Determine if a page has private stuff, indicating that release routines + * Determine if a folio has private stuff, indicating that release routines * should be invoked upon it. */ -static inline int page_has_private(const struct page *page) +static inline int folio_has_private(const struct folio *folio) { - return !!(page->flags & PAGE_FLAGS_PRIVATE); -} - -static inline bool folio_has_private(const struct folio *folio) -{ - return page_has_private(&folio->page); + return !!(folio->flags & PAGE_FLAGS_PRIVATE); } #undef PF_ANY diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 904c52f97284..79dbd8bc35a7 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -26,11 +26,14 @@ struct page_counter { atomic_long_t children_low_usage; unsigned long watermark; + /* Latest cg2 reset watermark */ + unsigned long local_watermark; unsigned long failcnt; /* Keep all the read most fields in a separete cacheline. */ CACHELINE_PADDING(_pad2_); + bool protection_support; unsigned long min; unsigned long low; unsigned long high; @@ -44,12 +47,17 @@ struct page_counter { #define PAGE_COUNTER_MAX (LONG_MAX / PAGE_SIZE) #endif +/* + * Protection is supported only for the first counter (with id 0). + */ static inline void page_counter_init(struct page_counter *counter, - struct page_counter *parent) + struct page_counter *parent, + bool protection_support) { - atomic_long_set(&counter->usage, 0); + counter->usage = (atomic_long_t)ATOMIC_LONG_INIT(0); counter->max = PAGE_COUNTER_MAX; counter->parent = parent; + counter->protection_support = protection_support; } static inline unsigned long page_counter_read(struct page_counter *counter) @@ -78,11 +86,24 @@ int page_counter_memparse(const char *buf, const char *max, static inline void page_counter_reset_watermark(struct page_counter *counter) { - counter->watermark = page_counter_read(counter); + unsigned long usage = page_counter_read(counter); + + /* + * Update local_watermark first, so it's always <= watermark + * (modulo CPU/compiler re-ordering) + */ + counter->local_watermark = usage; + counter->watermark = usage; } +#ifdef CONFIG_MEMCG void page_counter_calculate_protection(struct page_counter *root, struct page_counter *counter, bool recursive_protection); +#else +static inline void page_counter_calculate_protection(struct page_counter *root, + struct page_counter *counter, + bool recursive_protection) {} +#endif #endif /* _LINUX_PAGE_COUNTER_H */ diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index 27cd1e59ccf7..f5eb5a32aeed 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -130,4 +130,62 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, pgoff_t nr, const struct mm_walk_ops *ops, void *private); +typedef int __bitwise folio_walk_flags_t; + +/* + * Walk migration entries as well. Careful: a large folio might get split + * concurrently. + */ +#define FW_MIGRATION ((__force folio_walk_flags_t)BIT(0)) + +/* Walk shared zeropages (small + huge) as well. */ +#define FW_ZEROPAGE ((__force folio_walk_flags_t)BIT(1)) + +enum folio_walk_level { + FW_LEVEL_PTE, + FW_LEVEL_PMD, + FW_LEVEL_PUD, +}; + +/** + * struct folio_walk - folio_walk_start() / folio_walk_end() data + * @page: exact folio page referenced (if applicable) + * @level: page table level identifying the entry type + * @pte: pointer to the page table entry (FW_LEVEL_PTE). + * @pmd: pointer to the page table entry (FW_LEVEL_PMD). + * @pud: pointer to the page table entry (FW_LEVEL_PUD). + * @ptl: pointer to the page table lock. + * + * (see folio_walk_start() documentation for more details) + */ +struct folio_walk { + /* public */ + struct page *page; + enum folio_walk_level level; + union { + pte_t *ptep; + pud_t *pudp; + pmd_t *pmdp; + }; + union { + pte_t pte; + pud_t pud; + pmd_t pmd; + }; + /* private */ + struct vm_area_struct *vma; + spinlock_t *ptl; +}; + +struct folio *folio_walk_start(struct folio_walk *fw, + struct vm_area_struct *vma, unsigned long addr, + folio_walk_flags_t flags); + +#define folio_walk_end(__fw, __vma) do { \ + spin_unlock((__fw)->ptl); \ + if (likely((__fw)->level == FW_LEVEL_PTE)) \ + pte_unmap((__fw)->ptep); \ + vma_pgtable_walk_end(__vma); \ +} while (0) + #endif /* _LINUX_PAGEWALK_H */ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 4b2047b78b67..b6321fc49159 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -135,7 +135,6 @@ extern void __init setup_per_cpu_areas(void); extern void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, gfp_t gfp) __alloc_size(1); -extern size_t pcpu_alloc_size(void __percpu *__pdata); #define __alloc_percpu_gfp(_size, _align, _gfp) \ alloc_hooks(pcpu_alloc_noprof(_size, _align, false, _gfp)) diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 207f0c83c8e9..59a3deb792a8 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -80,36 +80,6 @@ static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) } } -static inline void pgalloc_tag_split(struct page *page, unsigned int nr) -{ - int i; - struct page_ext *first_page_ext; - struct page_ext *page_ext; - union codetag_ref *ref; - struct alloc_tag *tag; - - if (!mem_alloc_profiling_enabled()) - return; - - first_page_ext = page_ext = page_ext_get(page); - if (unlikely(!page_ext)) - return; - - ref = codetag_ref_from_page_ext(page_ext); - if (!ref->ct) - goto out; - - tag = ct_to_alloc_tag(ref->ct); - page_ext = page_ext_next(page_ext); - for (i = 1; i < nr; i++) { - /* Set new reference to point to the original tag */ - alloc_tag_ref_set(codetag_ref_from_page_ext(page_ext), tag); - page_ext = page_ext_next(page_ext); - } -out: - page_ext_put(first_page_ext); -} - static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { struct alloc_tag *tag = NULL; @@ -142,7 +112,6 @@ static inline void clear_page_tag_ref(struct page *page) {} static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, unsigned int nr) {} static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} -static inline void pgalloc_tag_split(struct page *page, unsigned int nr) {} static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; } static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {} diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 2a6a3cccfc36..e8b2ac6bd2ae 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -447,6 +447,12 @@ static inline void arch_check_zapped_pmd(struct vm_area_struct *vma, } #endif +#ifndef arch_check_zapped_pud +static inline void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud) +{ +} +#endif + #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, @@ -1950,6 +1956,18 @@ typedef unsigned int pgtbl_mod_mask; #define MAX_PTRS_PER_P4D PTRS_PER_P4D #endif +#ifndef pte_pgprot +#define pte_pgprot(x) ((pgprot_t) {0}) +#endif + +#ifndef pmd_pgprot +#define pmd_pgprot(x) ((pgprot_t) {0}) +#endif + +#ifndef pud_pgprot +#define pud_pgprot(x) ((pgprot_t) {0}) +#endif + /* description of effects of mapping type and prot in current implementation. * this is due to the limited x86 page protection hardware. The expected * behavior is in parens: diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 0978c64f49d8..d5e93e44322e 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -331,7 +331,7 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio, switch (level) { case RMAP_LEVEL_PTE: if (!folio_test_large(folio)) { - atomic_inc(&page->_mapcount); + atomic_inc(&folio->_mapcount); break; } @@ -425,7 +425,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, if (!folio_test_large(folio)) { if (PageAnonExclusive(page)) ClearPageAnonExclusive(page); - atomic_inc(&page->_mapcount); + atomic_inc(&folio->_mapcount); break; } @@ -745,7 +745,12 @@ int folio_mkclean(struct folio *); int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, struct vm_area_struct *vma); -void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked); +enum rmp_flags { + RMP_LOCKED = 1 << 0, + RMP_USE_SHARED_ZEROPAGE = 1 << 1, +}; + +void remove_migration_ptes(struct folio *src, struct folio *dst, int flags); /* * rmap_walk_control: To control rmap traversing for specific needs diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 91546493c43d..07bb8d4181d7 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -179,27 +179,20 @@ static inline void mm_update_next_owner(struct mm_struct *mm) extern void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack); -extern unsigned long -arch_get_unmapped_area(struct file *, unsigned long, unsigned long, - unsigned long, unsigned long); -extern unsigned long + +unsigned long +arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t vm_flags); +unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags); + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t); unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); -unsigned long -arch_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags, vm_flags_t vm_flags); -unsigned long -arch_get_unmapped_area_topdown_vmflags(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags, vm_flags_t); - unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *filp, unsigned long addr, @@ -211,11 +204,11 @@ unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, unsigned long generic_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags); + unsigned long flags, vm_flags_t vm_flags); unsigned long generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags); + unsigned long flags, vm_flags_t vm_flags); #else static inline void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) {} diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h index ccd72b978e1f..bf10bdb487dd 100644 --- a/include/linux/sched/task_stack.h +++ b/include/linux/sched/task_stack.h @@ -95,23 +95,11 @@ static inline int object_is_on_stack(const void *obj) extern void thread_stack_cache_init(void); #ifdef CONFIG_DEBUG_STACK_USAGE +unsigned long stack_not_used(struct task_struct *p); +#else static inline unsigned long stack_not_used(struct task_struct *p) { - unsigned long *n = end_of_stack(p); - - do { /* Skip over canary */ -# ifdef CONFIG_STACK_GROWSUP - n--; -# else - n++; -# endif - } while (!*n); - -# ifdef CONFIG_STACK_GROWSUP - return (unsigned long)end_of_stack(p) - (unsigned long)n; -# else - return (unsigned long)n - (unsigned long)end_of_stack(p); -# endif + return 0; } #endif extern void set_task_stack_end_magic(struct task_struct *tsk); diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h index 95ac8398ee72..e7aec20fb44f 100644 --- a/include/linux/set_memory.h +++ b/include/linux/set_memory.h @@ -8,10 +8,10 @@ #ifdef CONFIG_ARCH_HAS_SET_MEMORY #include #else -static inline int set_memory_ro(unsigned long addr, int numpages) { return 0; } -static inline int set_memory_rw(unsigned long addr, int numpages) { return 0; } -static inline int set_memory_x(unsigned long addr, int numpages) { return 0; } -static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } +static inline int __must_check set_memory_ro(unsigned long addr, int numpages) { return 0; } +static inline int __must_check set_memory_rw(unsigned long addr, int numpages) { return 0; } +static inline int __must_check set_memory_x(unsigned long addr, int numpages) { return 0; } +static inline int __must_check set_memory_nx(unsigned long addr, int numpages) { return 0; } #endif #ifndef set_memory_rox diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 1d06b1e5408a..515a9a6a3c6f 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -111,20 +111,13 @@ extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); int shmem_unuse(unsigned int type); #ifdef CONFIG_TRANSPARENT_HUGEPAGE -extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, - struct mm_struct *mm, unsigned long vm_flags); unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, - bool global_huge); + loff_t write_end, bool shmem_huge_force); #else -static __always_inline bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, - struct mm_struct *mm, unsigned long vm_flags) -{ - return false; -} static inline unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, - bool global_huge) + loff_t write_end, bool shmem_huge_force) { return 0; } @@ -150,8 +143,8 @@ enum sgp_type { SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; -int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, - enum sgp_type sgp); +int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end, + struct folio **foliop, enum sgp_type sgp); struct folio *shmem_read_folio_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp); diff --git a/include/linux/slab.h b/include/linux/slab.h index da3a546571e7..b35e2db7eb0e 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -930,6 +930,16 @@ static inline __alloc_size(1, 2) void *kmalloc_array_noprof(size_t n, size_t siz * @new_n: new number of elements to alloc * @new_size: new size of a single member of the array * @flags: the type of memory to allocate (see kmalloc) + * + * If __GFP_ZERO logic is requested, callers must ensure that, starting with the + * initial memory allocation, every subsequent call to this API for the same + * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that + * __GFP_ZERO is not fully honored by this API. + * + * See krealloc_noprof() for further details. + * + * In any case, the contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. */ static inline __realloc_size(2, 3) void * __must_check krealloc_array_noprof(void *p, size_t new_n, @@ -1038,8 +1048,8 @@ kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node) #define kvcalloc_node(...) alloc_hooks(kvcalloc_node_noprof(__VA_ARGS__)) #define kvcalloc(...) alloc_hooks(kvcalloc_noprof(__VA_ARGS__)) -extern void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags) - __realloc_size(3); +void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) + __realloc_size(2); #define kvrealloc(...) alloc_hooks(kvrealloc_noprof(__VA_ARGS__)) extern void kvfree(const void *addr); diff --git a/include/linux/swap.h b/include/linux/swap.h index ba7ea95d1c57..ca533b478c21 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -243,22 +243,24 @@ enum { * free clusters are organized into a list. We fetch an entry from the list to * get a free cluster. * - * The data field stores next cluster if the cluster is free or cluster usage - * counter otherwise. The flags field determines if a cluster is free. This is - * protected by swap_info_struct.lock. + * The flags field determines if a cluster is free. This is + * protected by cluster lock. */ struct swap_cluster_info { spinlock_t lock; /* * Protect swap_cluster_info fields - * and swap_info_struct->swap_map - * elements correspond to the swap - * cluster + * other than list, and swap_info_struct->swap_map + * elements corresponding to the swap cluster. */ - unsigned int data:24; - unsigned int flags:8; + u16 count; + u8 flags; + u8 order; + struct list_head list; }; #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ -#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ +#define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */ +#define CLUSTER_FLAG_FRAG 4 /* This cluster is on nonfull list */ +#define CLUSTER_FLAG_FULL 8 /* This cluster is on full list */ /* * The first page in the swap file is the swap header, which is always marked @@ -283,11 +285,6 @@ struct percpu_cluster { unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ }; -struct swap_cluster_list { - struct swap_cluster_info head; - struct swap_cluster_info tail; -}; - /* * The in-memory structure used to track swap areas. */ @@ -299,8 +296,15 @@ struct swap_info_struct { signed char type; /* strange name for an index */ unsigned int max; /* extent of the swap_map */ unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + unsigned long *zeromap; /* kvmalloc'ed bitmap to track zero pages */ struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ - struct swap_cluster_list free_clusters; /* free clusters list */ + struct list_head free_clusters; /* free clusters list */ + struct list_head full_clusters; /* full clusters list */ + struct list_head nonfull_clusters[SWAP_NR_ORDERS]; + /* list of cluster that contains at least one free slot */ + struct list_head frag_clusters[SWAP_NR_ORDERS]; + /* list of cluster that are fragmented or contented */ + unsigned int frag_cluster_nr[SWAP_NR_ORDERS]; unsigned int lowest_bit; /* index of first free in swap_map */ unsigned int highest_bit; /* index of last free in swap_map */ unsigned int pages; /* total of usable pages of swap */ @@ -331,7 +335,7 @@ struct swap_info_struct { * list. */ struct work_struct discard_work; /* discard worker */ - struct swap_cluster_list discard_clusters; /* discard clusters list */ + struct list_head discard_clusters; /* discard clusters list */ struct plist_node avail_lists[]; /* * entries in swap_avail_heads, one * entry per node. @@ -478,9 +482,9 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order); extern int add_swap_count_continuation(swp_entry_t, gfp_t); -extern void swap_shmem_alloc(swp_entry_t); +extern void swap_shmem_alloc(swp_entry_t, int); extern int swap_duplicate(swp_entry_t); -extern int swapcache_prepare(swp_entry_t); +extern int swapcache_prepare(swp_entry_t entry, int nr); extern void swap_free_nr(swp_entry_t entry, int nr_pages); extern void swapcache_free_entries(swp_entry_t *entries, int n); extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); @@ -545,7 +549,7 @@ static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) return 0; } -static inline void swap_shmem_alloc(swp_entry_t swp) +static inline void swap_shmem_alloc(swp_entry_t swp, int nr) { } @@ -554,7 +558,7 @@ static inline int swap_duplicate(swp_entry_t swp) return 0; } -static inline int swapcache_prepare(swp_entry_t swp) +static inline int swapcache_prepare(swp_entry_t swp, int nr) { return 0; } diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index a12bcf042551..9fc6ce15c499 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -267,6 +267,25 @@ extern void userfaultfd_unmap_complete(struct mm_struct *mm, extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma); extern bool userfaultfd_wp_async(struct vm_area_struct *vma); +void userfaultfd_reset_ctx(struct vm_area_struct *vma); + +struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, + unsigned long end); + +int userfaultfd_register_range(struct userfaultfd_ctx *ctx, + struct vm_area_struct *vma, + unsigned long vm_flags, + unsigned long start, unsigned long end, + bool wp_async); + +void userfaultfd_release_new(struct userfaultfd_ctx *ctx); + +void userfaultfd_release_all(struct mm_struct *mm, + struct userfaultfd_ctx *ctx); + #else /* CONFIG_USERFAULTFD */ /* mm helpers */ diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 747943bc8cc2..aed952d04132 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -50,6 +50,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGSTEAL_ANON, PGSTEAL_FILE, #ifdef CONFIG_NUMA + PGSCAN_ZONE_RECLAIM_SUCCESS, PGSCAN_ZONE_RECLAIM_FAILED, #endif PGINODESTEAL, SLABS_SCANNED, KSWAPD_INODESTEAL, @@ -104,6 +105,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_SPLIT_PAGE, THP_SPLIT_PAGE_FAILED, THP_DEFERRED_SPLIT_PAGE, + THP_UNDERUSED_SPLIT_PAGE, THP_SPLIT_PMD, THP_SCAN_EXCEED_NONE_PTE, THP_SCAN_EXCEED_SWAP_PTE, @@ -154,6 +156,30 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, VMA_LOCK_RETRY, VMA_LOCK_MISS, #endif +#ifdef CONFIG_DEBUG_STACK_USAGE + KSTACK_1K, +#if THREAD_SIZE > 1024 + KSTACK_2K, +#endif +#if THREAD_SIZE > 2048 + KSTACK_4K, +#endif +#if THREAD_SIZE > 4096 + KSTACK_8K, +#endif +#if THREAD_SIZE > 8192 + KSTACK_16K, +#endif +#if THREAD_SIZE > 16384 + KSTACK_32K, +#endif +#if THREAD_SIZE > 32768 + KSTACK_64K, +#endif +#if THREAD_SIZE > 65536 + KSTACK_REST, +#endif +#endif /* CONFIG_DEBUG_STACK_USAGE */ NR_VM_EVENT_ITEMS }; diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index e4a631ec430b..ad2ce7a6ab7a 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -189,6 +189,10 @@ extern void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1 extern void *vcalloc_noprof(size_t n, size_t size) __alloc_size(1, 2); #define vcalloc(...) alloc_hooks(vcalloc_noprof(__VA_ARGS__)) +void * __must_check vrealloc_noprof(const void *p, size_t size, gfp_t flags) + __realloc_size(2); +#define vrealloc(...) alloc_hooks(vrealloc_noprof(__VA_ARGS__)) + extern void vfree(const void *addr); extern void vfree_atomic(const void *addr); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 9eb77c9007e6..d2761bf8ff32 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -32,6 +32,7 @@ struct reclaim_stat { unsigned nr_ref_keep; unsigned nr_unmap_fail; unsigned nr_lazyfree_fail; + unsigned nr_demoted; }; /* Stat data for system wide items */ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 8f651bb0a1a5..d6db822e4bb3 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -79,6 +79,9 @@ struct writeback_control { */ struct swap_iocb **swap_plug; + /* Target list for splitting a large folio */ + struct list_head *list; + /* internal fields used by the ->writepages implementation: */ struct folio_batch fbatch; pgoff_t index; diff --git a/include/linux/zstd.h b/include/linux/zstd.h index 113408eef6ec..b2c7cf310c8f 100644 --- a/include/linux/zstd.h +++ b/include/linux/zstd.h @@ -77,6 +77,30 @@ int zstd_min_clevel(void); */ int zstd_max_clevel(void); +/** + * zstd_default_clevel() - default compression level + * + * Return: Default compression level. + */ +int zstd_default_clevel(void); + +/** + * struct zstd_custom_mem - custom memory allocation + */ +typedef ZSTD_customMem zstd_custom_mem; + +/** + * struct zstd_dict_load_method - Dictionary load method. + * See zstd_lib.h. + */ +typedef ZSTD_dictLoadMethod_e zstd_dict_load_method; + +/** + * struct zstd_dict_content_type - Dictionary context type. + * See zstd_lib.h. + */ +typedef ZSTD_dictContentType_e zstd_dict_content_type; + /* ====== Parameter Selection ====== */ /** @@ -136,6 +160,19 @@ typedef ZSTD_parameters zstd_parameters; zstd_parameters zstd_get_params(int level, unsigned long long estimated_src_size); + +/** + * zstd_get_cparams() - returns zstd_compression_parameters for selected level + * @level: The compression level + * @estimated_src_size: The estimated source size to compress or 0 + * if unknown. + * @dict_size: Dictionary size. + * + * Return: The selected zstd_compression_parameters. + */ +zstd_compression_parameters zstd_get_cparams(int level, + unsigned long long estimated_src_size, size_t dict_size); + /* ====== Single-pass Compression ====== */ typedef ZSTD_CCtx zstd_cctx; @@ -180,6 +217,71 @@ zstd_cctx *zstd_init_cctx(void *workspace, size_t workspace_size); size_t zstd_compress_cctx(zstd_cctx *cctx, void *dst, size_t dst_capacity, const void *src, size_t src_size, const zstd_parameters *parameters); +/** + * zstd_create_cctx_advanced() - Create compression context + * @custom_mem: Custom allocator. + * + * Return: NULL on error, pointer to compression context otherwise. + */ +zstd_cctx *zstd_create_cctx_advanced(zstd_custom_mem custom_mem); + +/** + * zstd_free_cctx() - Free compression context + * @cdict: Pointer to compression context. + * + * Return: Always 0. + */ +size_t zstd_free_cctx(zstd_cctx* cctx); + +/** + * struct zstd_cdict - Compression dictionary. + * See zstd_lib.h. + */ +typedef ZSTD_CDict zstd_cdict; + +/** + * zstd_create_cdict_byreference() - Create compression dictionary + * @dict: Pointer to dictionary buffer. + * @dict_size: Size of the dictionary buffer. + * @dict_load_method: Dictionary load method. + * @dict_content_type: Dictionary content type. + * @custom_mem: Memory allocator. + * + * Note, this uses @dict by reference (ZSTD_dlm_byRef), so it should be + * free before zstd_cdict is destroyed. + * + * Return: NULL on error, pointer to compression dictionary + * otherwise. + */ +zstd_cdict *zstd_create_cdict_byreference(const void *dict, size_t dict_size, + zstd_compression_parameters cparams, + zstd_custom_mem custom_mem); + +/** + * zstd_free_cdict() - Free compression dictionary + * @cdict: Pointer to compression dictionary. + * + * Return: Always 0. + */ +size_t zstd_free_cdict(zstd_cdict* cdict); + +/** + * zstd_compress_using_cdict() - compress src into dst using a dictionary + * @cctx: The context. Must have been initialized with zstd_init_cctx(). + * @dst: The buffer to compress src into. + * @dst_capacity: The size of the destination buffer. May be any size, but + * ZSTD_compressBound(srcSize) is guaranteed to be large enough. + * @src: The data to compress. + * @src_size: The size of the data to compress. + * @cdict: The dictionary to be used. + * + * Return: The compressed size or an error, which can be checked using + * zstd_is_error(). + */ +size_t zstd_compress_using_cdict(zstd_cctx *cctx, void *dst, + size_t dst_capacity, const void *src, size_t src_size, + const zstd_cdict *cdict); + /* ====== Single-pass Decompression ====== */ typedef ZSTD_DCtx zstd_dctx; @@ -220,6 +322,71 @@ zstd_dctx *zstd_init_dctx(void *workspace, size_t workspace_size); size_t zstd_decompress_dctx(zstd_dctx *dctx, void *dst, size_t dst_capacity, const void *src, size_t src_size); +/** + * struct zstd_ddict - Decompression dictionary. + * See zstd_lib.h. + */ +typedef ZSTD_DDict zstd_ddict; + +/** + * zstd_create_ddict_byreference() - Create decompression dictionary + * @dict: Pointer to dictionary buffer. + * @dict_size: Size of the dictionary buffer. + * @dict_load_method: Dictionary load method. + * @dict_content_type: Dictionary content type. + * @custom_mem: Memory allocator. + * + * Note, this uses @dict by reference (ZSTD_dlm_byRef), so it should be + * free before zstd_ddict is destroyed. + * + * Return: NULL on error, pointer to decompression dictionary + * otherwise. + */ +zstd_ddict *zstd_create_ddict_byreference(const void *dict, size_t dict_size, + zstd_custom_mem custom_mem); +/** + * zstd_free_ddict() - Free decompression dictionary + * @dict: Pointer to the dictionary. + * + * Return: Always 0. + */ +size_t zstd_free_ddict(zstd_ddict *ddict); + +/** + * zstd_create_dctx_advanced() - Create decompression context + * @custom_mem: Custom allocator. + * + * Return: NULL on error, pointer to decompression context otherwise. + */ +zstd_dctx *zstd_create_dctx_advanced(zstd_custom_mem custom_mem); + +/** + * zstd_free_dctx() -- Free decompression context + * @dctx: Pointer to decompression context. + * Return: Always 0. + */ +size_t zstd_free_dctx(zstd_dctx *dctx); + +/** + * zstd_decompress_using_ddict() - decompress src into dst using a dictionary + * @dctx: The decompression context. + * @dst: The buffer to decompress src into. + * @dst_capacity: The size of the destination buffer. Must be at least as large + * as the decompressed size. If the caller cannot upper bound the + * decompressed size, then it's better to use the streaming API. + * @src: The zstd compressed data to decompress. Multiple concatenated + * frames and skippable frames are allowed. + * @src_size: The exact size of the data to decompress. + * @ddict: The dictionary to be used. + * + * Return: The decompressed size or an error, which can be checked using + * zstd_is_error(). + */ +size_t zstd_decompress_using_ddict(zstd_dctx *dctx, + void *dst, size_t dst_capacity, const void *src, size_t src_size, + const zstd_ddict *ddict); + + /* ====== Streaming Buffers ====== */ /** diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 6cecb4a4f68b..9cd1beef0654 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -13,17 +13,15 @@ extern atomic_t zswap_stored_pages; struct zswap_lruvec_state { /* - * Number of pages in zswap that should be protected from the shrinker. - * This number is an estimate of the following counts: + * Number of swapped in pages from disk, i.e not found in the zswap pool. * - * a) Recent page faults. - * b) Recent insertion to the zswap LRU. This includes new zswap stores, - * as well as recent zswap LRU rotations. - * - * These pages are likely to be warm, and might incur IO if the are written - * to swap. + * This is consumed and subtracted from the lru size in + * zswap_shrinker_count() to penalize past overshrinking that led to disk + * swapins. The idea is that had we considered this many more pages in the + * LRU active/protected and not written them back, we would not have had to + * swapped them in. */ - atomic_long_t nr_zswap_protected; + atomic_long_t nr_disk_swapins; }; unsigned long zswap_total_pages(void); diff --git a/include/trace/events/filemap.h b/include/trace/events/filemap.h index 46c89c1e460c..f48fe637bfd2 100644 --- a/include/trace/events/filemap.h +++ b/include/trace/events/filemap.h @@ -56,6 +56,90 @@ DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_add_to_page_cache, TP_ARGS(folio) ); +DECLARE_EVENT_CLASS(mm_filemap_op_page_cache_range, + + TP_PROTO( + struct address_space *mapping, + pgoff_t index, + pgoff_t last_index + ), + + TP_ARGS(mapping, index, last_index), + + TP_STRUCT__entry( + __field(unsigned long, i_ino) + __field(dev_t, s_dev) + __field(unsigned long, index) + __field(unsigned long, last_index) + ), + + TP_fast_assign( + __entry->i_ino = mapping->host->i_ino; + if (mapping->host->i_sb) + __entry->s_dev = + mapping->host->i_sb->s_dev; + else + __entry->s_dev = mapping->host->i_rdev; + __entry->index = index; + __entry->last_index = last_index; + ), + + TP_printk( + "dev=%d:%d ino=%lx ofs=%lld-%lld", + MAJOR(__entry->s_dev), + MINOR(__entry->s_dev), __entry->i_ino, + ((loff_t)__entry->index) << PAGE_SHIFT, + ((((loff_t)__entry->last_index + 1) << PAGE_SHIFT) - 1) + ) +); + +DEFINE_EVENT(mm_filemap_op_page_cache_range, mm_filemap_get_pages, + TP_PROTO( + struct address_space *mapping, + pgoff_t index, + pgoff_t last_index + ), + TP_ARGS(mapping, index, last_index) +); + +DEFINE_EVENT(mm_filemap_op_page_cache_range, mm_filemap_map_pages, + TP_PROTO( + struct address_space *mapping, + pgoff_t index, + pgoff_t last_index + ), + TP_ARGS(mapping, index, last_index) +); + +TRACE_EVENT(mm_filemap_fault, + TP_PROTO(struct address_space *mapping, pgoff_t index), + + TP_ARGS(mapping, index), + + TP_STRUCT__entry( + __field(unsigned long, i_ino) + __field(dev_t, s_dev) + __field(unsigned long, index) + ), + + TP_fast_assign( + __entry->i_ino = mapping->host->i_ino; + if (mapping->host->i_sb) + __entry->s_dev = + mapping->host->i_sb->s_dev; + else + __entry->s_dev = mapping->host->i_rdev; + __entry->index = index; + ), + + TP_printk( + "dev=%d:%d ino=%lx ofs=%lld", + MAJOR(__entry->s_dev), + MINOR(__entry->s_dev), __entry->i_ino, + ((loff_t)__entry->index) << PAGE_SHIFT + ) +); + TRACE_EVENT(filemap_set_wb_err, TP_PROTO(struct address_space *mapping, errseq_t eseq), diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 37265977d524..bb8a59c6caa2 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -71,12 +71,6 @@ #define IF_HAVE_PG_MLOCK(_name) #endif -#ifdef CONFIG_ARCH_USES_PG_UNCACHED -#define IF_HAVE_PG_UNCACHED(_name) ,{1UL << PG_##_name, __stringify(_name)} -#else -#define IF_HAVE_PG_UNCACHED(_name) -#endif - #ifdef CONFIG_MEMORY_FAILURE #define IF_HAVE_PG_HWPOISON(_name) ,{1UL << PG_##_name, __stringify(_name)} #else @@ -89,10 +83,16 @@ #define IF_HAVE_PG_IDLE(_name) #endif -#ifdef CONFIG_ARCH_USES_PG_ARCH_X -#define IF_HAVE_PG_ARCH_X(_name) ,{1UL << PG_##_name, __stringify(_name)} +#ifdef CONFIG_ARCH_USES_PG_ARCH_2 +#define IF_HAVE_PG_ARCH_2(_name) ,{1UL << PG_##_name, __stringify(_name)} #else -#define IF_HAVE_PG_ARCH_X(_name) +#define IF_HAVE_PG_ARCH_2(_name) +#endif + +#ifdef CONFIG_ARCH_USES_PG_ARCH_3 +#define IF_HAVE_PG_ARCH_3(_name) ,{1UL << PG_##_name, __stringify(_name)} +#else +#define IF_HAVE_PG_ARCH_3(_name) #endif #define DEF_PAGEFLAG_NAME(_name) { 1UL << PG_##_name, __stringify(_name) } @@ -100,7 +100,6 @@ #define __def_pageflag_names \ DEF_PAGEFLAG_NAME(locked), \ DEF_PAGEFLAG_NAME(waiters), \ - DEF_PAGEFLAG_NAME(error), \ DEF_PAGEFLAG_NAME(referenced), \ DEF_PAGEFLAG_NAME(uptodate), \ DEF_PAGEFLAG_NAME(dirty), \ @@ -108,39 +107,28 @@ DEF_PAGEFLAG_NAME(active), \ DEF_PAGEFLAG_NAME(workingset), \ DEF_PAGEFLAG_NAME(owner_priv_1), \ + DEF_PAGEFLAG_NAME(owner_2), \ DEF_PAGEFLAG_NAME(arch_1), \ DEF_PAGEFLAG_NAME(reserved), \ DEF_PAGEFLAG_NAME(private), \ DEF_PAGEFLAG_NAME(private_2), \ DEF_PAGEFLAG_NAME(writeback), \ DEF_PAGEFLAG_NAME(head), \ - DEF_PAGEFLAG_NAME(mappedtodisk), \ DEF_PAGEFLAG_NAME(reclaim), \ DEF_PAGEFLAG_NAME(swapbacked), \ DEF_PAGEFLAG_NAME(unevictable) \ IF_HAVE_PG_MLOCK(mlocked) \ -IF_HAVE_PG_UNCACHED(uncached) \ IF_HAVE_PG_HWPOISON(hwpoison) \ IF_HAVE_PG_IDLE(idle) \ IF_HAVE_PG_IDLE(young) \ -IF_HAVE_PG_ARCH_X(arch_2) \ -IF_HAVE_PG_ARCH_X(arch_3) +IF_HAVE_PG_ARCH_2(arch_2) \ +IF_HAVE_PG_ARCH_3(arch_3) #define show_page_flags(flags) \ (flags) ? __print_flags(flags, "|", \ __def_pageflag_names \ ) : "none" -#define DEF_PAGETYPE_NAME(_name) { PG_##_name, __stringify(_name) } - -#define __def_pagetype_names \ - DEF_PAGETYPE_NAME(slab), \ - DEF_PAGETYPE_NAME(hugetlb), \ - DEF_PAGETYPE_NAME(offline), \ - DEF_PAGETYPE_NAME(guard), \ - DEF_PAGETYPE_NAME(table), \ - DEF_PAGETYPE_NAME(buddy) - #if defined(CONFIG_X86) #define __VM_ARCH_SPECIFIC_1 {VM_PAT, "pat" } #elif defined(CONFIG_PPC64) diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h index a42be4c8563b..9f0a5d1482c4 100644 --- a/include/trace/events/oom.h +++ b/include/trace/events/oom.h @@ -55,8 +55,8 @@ TRACE_EVENT(reclaim_retry_zone, ), TP_fast_assign( - __entry->node = zone_to_nid(zoneref->zone); - __entry->zone_idx = zoneref->zone_idx; + __entry->node = zonelist_node_idx(zoneref); + __entry->zone_idx = zonelist_zone_idx(zoneref); __entry->order = order; __entry->reclaimable = reclaimable; __entry->available = available; diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h index 6f2f2720f3ac..ff8032227876 100644 --- a/include/uapi/linux/kernel-page-flags.h +++ b/include/uapi/linux/kernel-page-flags.h @@ -7,7 +7,7 @@ */ #define KPF_LOCKED 0 -#define KPF_ERROR 1 +#define KPF_ERROR 1 /* Now unused */ #define KPF_REFERENCED 2 #define KPF_UPTODATE 3 #define KPF_DIRTY 4 diff --git a/init/Kconfig b/init/Kconfig index 8afb993b09cc..c00926fb9887 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -986,7 +986,7 @@ config MEMCG_V1 going to shrink due to deprecation process. New deployments with v1 controller are highly discouraged. - San N is unsure. + Say N if unsure. config BLK_CGROUP bool "IO controller" diff --git a/ipc/shm.c b/ipc/shm.c index 3e3071252dac..99564c870084 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1778,8 +1778,8 @@ long ksys_shmdt(char __user *shmaddr) */ file = vma->vm_file; size = i_size_read(file_inode(vma->vm_file)); - do_vma_munmap(&vmi, vma, vma->vm_start, vma->vm_end, - NULL, false); + do_vmi_align_munmap(&vmi, vma, mm, vma->vm_start, + vma->vm_end, NULL, false); /* * We discovered the size of the shm segment, so * break out of here and fall through to the next @@ -1803,8 +1803,8 @@ long ksys_shmdt(char __user *shmaddr) if ((vma->vm_ops == &shm_vm_ops) && ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) && (vma->vm_file == file)) { - do_vma_munmap(&vmi, vma, vma->vm_start, vma->vm_end, - NULL, false); + do_vmi_align_munmap(&vmi, vma, mm, vma->vm_start, + vma->vm_end, NULL, false); } vma = vma_next(&vmi); diff --git a/kernel/Makefile b/kernel/Makefile index 3c13240dfc9f..87866b037fbe 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -116,7 +116,6 @@ obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o obj-$(CONFIG_CFI_CLANG) += cfi.o -obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 520b90dd97ec..c964dd7ff967 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -81,6 +81,8 @@ struct cgroup_file_ctx { struct { struct cgroup_pidlist *pidlist; } procs1; + + struct cgroup_of_peak peak; }; /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 90e50d6d3cf3..f38f61bc068a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1972,6 +1972,13 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param return -EINVAL; } +struct cgroup_of_peak *of_peak(struct kernfs_open_file *of) +{ + struct cgroup_file_ctx *ctx = of->priv; + + return &ctx->peak; +} + static void apply_cgroup_root_flags(unsigned int root_flags) { if (current->nsproxy->cgroup_ns == &init_cgroup_ns) { @@ -4623,8 +4630,9 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical - * section. This function will return the correct next descendant as long - * as both @pos and @root are accessible and @pos is a descendant of @root. + * section. Additionally, it isn't necessary to hold onto a reference to @pos. + * This function will return the correct next descendant as long as both @pos + * and @root are accessible and @pos is a descendant of @root. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the @@ -4672,8 +4680,9 @@ EXPORT_SYMBOL_GPL(css_next_descendant_pre); * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical - * section. This function will return the correct rightmost descendant as - * long as @pos is accessible. + * section. Additionally, it isn't necessary to hold onto a reference to @pos. + * This function will return the correct rightmost descendant as long as @pos + * is accessible. */ struct cgroup_subsys_state * css_rightmost_descendant(struct cgroup_subsys_state *pos) @@ -4717,9 +4726,9 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos) * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical - * section. This function will return the correct next descendant as long - * as both @pos and @cgroup are accessible and @pos is a descendant of - * @cgroup. + * section. Additionally, it isn't necessary to hold onto a reference to @pos. + * This function will return the correct next descendant as long as both @pos + * and @cgroup are accessible and @pos is a descendant of @cgroup. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4b7e590dc428..2ec796e2f055 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -103,8 +103,7 @@ struct xol_area { atomic_t slot_count; /* number of in-use slots */ unsigned long *bitmap; /* 0 = free slot */ - struct vm_special_mapping xol_mapping; - struct page *pages[2]; + struct page *page; /* * We keep the vma's vm_start rather than a pointer to the vma * itself. The probed process or a naughty kernel module could make @@ -1466,6 +1465,21 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags); } +static vm_fault_t xol_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct xol_area *area = vma->vm_mm->uprobes_state.xol_area; + + vmf->page = area->page; + get_page(vmf->page); + return 0; +} + +static const struct vm_special_mapping xol_mapping = { + .name = "[uprobes]", + .fault = xol_fault, +}; + /* Slot allocation for XOL */ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) { @@ -1492,7 +1506,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE, VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, - &area->xol_mapping); + &xol_mapping); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto fail; @@ -1531,12 +1545,9 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) if (!area->bitmap) goto free_area; - area->xol_mapping.name = "[uprobes]"; - area->xol_mapping.pages = area->pages; - area->pages[0] = alloc_page(GFP_HIGHUSER); - if (!area->pages[0]) + area->page = alloc_page(GFP_HIGHUSER); + if (!area->page) goto free_bitmap; - area->pages[1] = NULL; area->vaddr = vaddr; init_waitqueue_head(&area->wq); @@ -1544,12 +1555,12 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) set_bit(0, area->bitmap); atomic_set(&area->slot_count, 1); insns = arch_uprobe_trampoline(&insns_size); - arch_uprobe_copy_ixol(area->pages[0], 0, insns, insns_size); + arch_uprobe_copy_ixol(area->page, 0, insns, insns_size); if (!xol_add_vma(mm, area)) return area; - __free_page(area->pages[0]); + __free_page(area->page); free_bitmap: kfree(area->bitmap); free_area: @@ -1591,7 +1602,7 @@ void uprobe_clear_state(struct mm_struct *mm) if (!area) return; - put_page(area->pages[0]); + put_page(area->page); kfree(area->bitmap); kfree(area); } @@ -1658,7 +1669,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) if (unlikely(!xol_vaddr)) return 0; - arch_uprobe_copy_ixol(area->pages[0], xol_vaddr, + arch_uprobe_copy_ixol(area->page, xol_vaddr, &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); return xol_vaddr; diff --git a/kernel/exit.c b/kernel/exit.c index 0d62a53605df..619f0014c33b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -778,6 +778,62 @@ static void exit_notify(struct task_struct *tsk, int group_dead) } #ifdef CONFIG_DEBUG_STACK_USAGE +unsigned long stack_not_used(struct task_struct *p) +{ + unsigned long *n = end_of_stack(p); + + do { /* Skip over canary */ +# ifdef CONFIG_STACK_GROWSUP + n--; +# else + n++; +# endif + } while (!*n); + +# ifdef CONFIG_STACK_GROWSUP + return (unsigned long)end_of_stack(p) - (unsigned long)n; +# else + return (unsigned long)n - (unsigned long)end_of_stack(p); +# endif +} + +/* Count the maximum pages reached in kernel stacks */ +static inline void kstack_histogram(unsigned long used_stack) +{ +#ifdef CONFIG_VM_EVENT_COUNTERS + if (used_stack <= 1024) + count_vm_event(KSTACK_1K); +#if THREAD_SIZE > 1024 + else if (used_stack <= 2048) + count_vm_event(KSTACK_2K); +#endif +#if THREAD_SIZE > 2048 + else if (used_stack <= 4096) + count_vm_event(KSTACK_4K); +#endif +#if THREAD_SIZE > 4096 + else if (used_stack <= 8192) + count_vm_event(KSTACK_8K); +#endif +#if THREAD_SIZE > 8192 + else if (used_stack <= 16384) + count_vm_event(KSTACK_16K); +#endif +#if THREAD_SIZE > 16384 + else if (used_stack <= 32768) + count_vm_event(KSTACK_32K); +#endif +#if THREAD_SIZE > 32768 + else if (used_stack <= 65536) + count_vm_event(KSTACK_64K); +#endif +#if THREAD_SIZE > 65536 + else + count_vm_event(KSTACK_REST); +#endif +#endif /* CONFIG_VM_EVENT_COUNTERS */ +} + static void check_stack_usage(void) { static DEFINE_SPINLOCK(low_water_lock); @@ -785,6 +841,7 @@ static void check_stack_usage(void) unsigned long free; free = stack_not_used(current); + kstack_histogram(THREAD_SIZE - free); if (free >= lowest_to_date) return; diff --git a/kernel/fork.c b/kernel/fork.c index d4b2d543f48c..5ac41db8aa69 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -832,7 +832,7 @@ static void check_mm(struct mm_struct *mm) pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", mm_pgtables_bytes(mm)); -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS) VM_BUG_ON_MM(mm->pmd_huge_pte, mm); #endif } @@ -1276,7 +1276,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_subscriptions_init(mm); init_tlb_flush_pending(mm); -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS) mm->pmd_huge_pte = NULL; #endif mm_init_uprobes_state(mm); diff --git a/kernel/numa.c b/kernel/numa.c deleted file mode 100644 index 67ca6b8585c0..000000000000 --- a/kernel/numa.c +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later - -#include -#include - -/* Stub functions: */ - -#ifndef memory_add_physaddr_to_nid -int memory_add_physaddr_to_nid(u64 start) -{ - pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n", - start); - return 0; -} -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif - -#ifndef phys_to_target_node -int phys_to_target_node(u64 start) -{ - pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n", - start); - return 0; -} -EXPORT_SYMBOL_GPL(phys_to_target_node); -#endif diff --git a/kernel/resource.c b/kernel/resource.c index 1681ab5012e1..b730bd28b422 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -450,8 +450,7 @@ int walk_system_ram_res_rev(u64 start, u64 end, void *arg, /* re-alloc */ struct resource *rams_new; - rams_new = kvrealloc(rams, rams_size * sizeof(struct resource), - (rams_size + 16) * sizeof(struct resource), + rams_new = kvrealloc(rams, (rams_size + 16) * sizeof(struct resource), GFP_KERNEL); if (!rams_new) goto out; @@ -1860,7 +1859,11 @@ EXPORT_SYMBOL(resource_list_free); #ifdef CONFIG_GET_FREE_REGION #define GFR_DESCENDING (1UL << 0) #define GFR_REQUEST_REGION (1UL << 1) -#define GFR_DEFAULT_ALIGN (1UL << PA_SECTION_SHIFT) +#ifdef PA_SECTION_SHIFT +#define GFR_DEFAULT_ALIGN (1UL << PA_SECTION_SHIFT) +#else +#define GFR_DEFAULT_ALIGN PAGE_SIZE +#endif static resource_size_t gfr_start(struct resource *base, resource_size_t size, resource_size_t align, unsigned long flags) @@ -1872,7 +1875,7 @@ static resource_size_t gfr_start(struct resource *base, resource_size_t size, return end - size + 1; } - return ALIGN(base->start, align); + return ALIGN(max(base->start, align), align); } static bool gfr_continue(struct resource *base, resource_size_t addr, @@ -2046,7 +2049,7 @@ struct resource *alloc_free_mem_region(struct resource *base, return get_free_mem_region(NULL, base, size, align, name, IORES_DESC_NONE, flags); } -EXPORT_SYMBOL_NS_GPL(alloc_free_mem_region, CXL); +EXPORT_SYMBOL_GPL(alloc_free_mem_region); #endif /* CONFIG_GET_FREE_REGION */ static int __init strict_iomem(char *str) diff --git a/kernel/resource_kunit.c b/kernel/resource_kunit.c index 0e509985a44a..42d2d8d20f5d 100644 --- a/kernel/resource_kunit.c +++ b/kernel/resource_kunit.c @@ -7,6 +7,8 @@ #include #include #include +#include +#include #define R0_START 0x0000 #define R0_END 0xffff @@ -137,9 +139,150 @@ static void resource_test_intersection(struct kunit *test) } while (++i < ARRAY_SIZE(results_for_intersection)); } +/* + * The test resource tree for region_intersects() test: + * + * BASE-BASE+1M-1 : Test System RAM 0 + * # hole 0 (BASE+1M-BASE+2M) + * BASE+2M-BASE+3M-1 : Test CXL Window 0 + * BASE+3M-BASE+4M-1 : Test System RAM 1 + * BASE+4M-BASE+7M-1 : Test CXL Window 1 + * BASE+4M-BASE+5M-1 : Test System RAM 2 + * BASE+4M+128K-BASE+4M+256K-1: Test Code + * BASE+5M-BASE+6M-1 : Test System RAM 3 + */ +#define RES_TEST_RAM0_OFFSET 0 +#define RES_TEST_RAM0_SIZE SZ_1M +#define RES_TEST_HOLE0_OFFSET (RES_TEST_RAM0_OFFSET + RES_TEST_RAM0_SIZE) +#define RES_TEST_HOLE0_SIZE SZ_1M +#define RES_TEST_WIN0_OFFSET (RES_TEST_HOLE0_OFFSET + RES_TEST_HOLE0_SIZE) +#define RES_TEST_WIN0_SIZE SZ_1M +#define RES_TEST_RAM1_OFFSET (RES_TEST_WIN0_OFFSET + RES_TEST_WIN0_SIZE) +#define RES_TEST_RAM1_SIZE SZ_1M +#define RES_TEST_WIN1_OFFSET (RES_TEST_RAM1_OFFSET + RES_TEST_RAM1_SIZE) +#define RES_TEST_WIN1_SIZE (SZ_1M * 3) +#define RES_TEST_RAM2_OFFSET RES_TEST_WIN1_OFFSET +#define RES_TEST_RAM2_SIZE SZ_1M +#define RES_TEST_CODE_OFFSET (RES_TEST_RAM2_OFFSET + SZ_128K) +#define RES_TEST_CODE_SIZE SZ_128K +#define RES_TEST_RAM3_OFFSET (RES_TEST_RAM2_OFFSET + RES_TEST_RAM2_SIZE) +#define RES_TEST_RAM3_SIZE SZ_1M +#define RES_TEST_TOTAL_SIZE ((RES_TEST_WIN1_OFFSET + RES_TEST_WIN1_SIZE)) + +static void remove_free_resource(void *ctx) +{ + struct resource *res = (struct resource *)ctx; + + remove_resource(res); + kfree(res); +} + +static void resource_test_request_region(struct kunit *test, struct resource *parent, + resource_size_t start, resource_size_t size, + const char *name, unsigned long flags) +{ + struct resource *res; + + res = __request_region(parent, start, size, name, flags); + KUNIT_ASSERT_NOT_NULL(test, res); + kunit_add_action_or_reset(test, remove_free_resource, res); +} + +static void resource_test_insert_resource(struct kunit *test, struct resource *parent, + resource_size_t start, resource_size_t size, + const char *name, unsigned long flags) +{ + struct resource *res; + + res = kzalloc(sizeof(*res), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, res); + + res->name = name; + res->start = start; + res->end = start + size - 1; + res->flags = flags; + if (insert_resource(parent, res)) { + kfree(res); + KUNIT_FAIL_AND_ABORT(test, "Fail to insert resource %pR\n", res); + } + + kunit_add_action_or_reset(test, remove_free_resource, res); +} + +static void resource_test_region_intersects(struct kunit *test) +{ + unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + struct resource *parent; + resource_size_t start; + + /* Find an iomem_resource hole to hold test resources */ + parent = alloc_free_mem_region(&iomem_resource, RES_TEST_TOTAL_SIZE, SZ_1M, + "test resources"); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, parent); + start = parent->start; + kunit_add_action_or_reset(test, remove_free_resource, parent); + + resource_test_request_region(test, parent, start + RES_TEST_RAM0_OFFSET, + RES_TEST_RAM0_SIZE, "Test System RAM 0", flags); + resource_test_insert_resource(test, parent, start + RES_TEST_WIN0_OFFSET, + RES_TEST_WIN0_SIZE, "Test CXL Window 0", + IORESOURCE_MEM); + resource_test_request_region(test, parent, start + RES_TEST_RAM1_OFFSET, + RES_TEST_RAM1_SIZE, "Test System RAM 1", flags); + resource_test_insert_resource(test, parent, start + RES_TEST_WIN1_OFFSET, + RES_TEST_WIN1_SIZE, "Test CXL Window 1", + IORESOURCE_MEM); + resource_test_request_region(test, parent, start + RES_TEST_RAM2_OFFSET, + RES_TEST_RAM2_SIZE, "Test System RAM 2", flags); + resource_test_insert_resource(test, parent, start + RES_TEST_CODE_OFFSET, + RES_TEST_CODE_SIZE, "Test Code", flags); + resource_test_request_region(test, parent, start + RES_TEST_RAM3_OFFSET, + RES_TEST_RAM3_SIZE, "Test System RAM 3", flags); + kunit_release_action(test, remove_free_resource, parent); + + KUNIT_EXPECT_EQ(test, REGION_INTERSECTS, + region_intersects(start + RES_TEST_RAM0_OFFSET, PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); + KUNIT_EXPECT_EQ(test, REGION_INTERSECTS, + region_intersects(start + RES_TEST_RAM0_OFFSET + + RES_TEST_RAM0_SIZE - PAGE_SIZE, 2 * PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); + KUNIT_EXPECT_EQ(test, REGION_DISJOINT, + region_intersects(start + RES_TEST_HOLE0_OFFSET, PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); + KUNIT_EXPECT_EQ(test, REGION_DISJOINT, + region_intersects(start + RES_TEST_HOLE0_OFFSET + + RES_TEST_HOLE0_SIZE - PAGE_SIZE, 2 * PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); + KUNIT_EXPECT_EQ(test, REGION_MIXED, + region_intersects(start + RES_TEST_WIN0_OFFSET + + RES_TEST_WIN0_SIZE - PAGE_SIZE, 2 * PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); + KUNIT_EXPECT_EQ(test, REGION_INTERSECTS, + region_intersects(start + RES_TEST_RAM1_OFFSET + + RES_TEST_RAM1_SIZE - PAGE_SIZE, 2 * PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); + KUNIT_EXPECT_EQ(test, REGION_INTERSECTS, + region_intersects(start + RES_TEST_RAM2_OFFSET + + RES_TEST_RAM2_SIZE - PAGE_SIZE, 2 * PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); + KUNIT_EXPECT_EQ(test, REGION_INTERSECTS, + region_intersects(start + RES_TEST_CODE_OFFSET, PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); + KUNIT_EXPECT_EQ(test, REGION_INTERSECTS, + region_intersects(start + RES_TEST_RAM2_OFFSET, + RES_TEST_RAM2_SIZE + PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); + KUNIT_EXPECT_EQ(test, REGION_MIXED, + region_intersects(start + RES_TEST_RAM3_OFFSET, + RES_TEST_RAM3_SIZE + PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); +} + static struct kunit_case resource_test_cases[] = { KUNIT_CASE(resource_test_union), KUNIT_CASE(resource_test_intersection), + KUNIT_CASE(resource_test_region_intersects), {} }; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a7af49b3a337..c1a08035ea43 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7483,7 +7483,7 @@ EXPORT_SYMBOL(io_schedule); void sched_show_task(struct task_struct *p) { - unsigned long free = 0; + unsigned long free; int ppid; if (!try_get_task_stack(p)) @@ -7493,9 +7493,7 @@ void sched_show_task(struct task_struct *p) if (task_is_running(p)) pr_cont(" running task "); -#ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); -#endif ppid = 0; rcu_read_lock(); if (pid_alive(p)) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b9784e13e6b6..03d7c5e59a18 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1823,7 +1823,7 @@ static bool pgdat_free_space_enough(struct pglist_data *pgdat) continue; if (zone_watermark_ok(zone, 0, - wmark_pages(zone, WMARK_PROMO) + enough_wmark, + promo_wmark_pages(zone) + enough_wmark, ZONE_MOVABLE, 0)) return true; } @@ -1921,8 +1921,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, * The pages in slow memory node should be migrated according * to hot/cold instead of private/shared. */ - if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && - !node_is_toptier(src_nid)) { + if (folio_use_access_time(folio)) { struct pglist_data *pgdat; unsigned long rate_limit; unsigned int latency, th, def_th; @@ -3269,6 +3268,15 @@ static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma) return true; } + /* + * This vma has not been accessed for a while, and if the number + * the threads in the same process is low, which means no other + * threads can help scan this vma, force a vma scan. + */ + if (READ_ONCE(mm->numa_scan_seq) > + (vma->numab_state->prev_scan_seq + get_nr_threads(current))) + return true; + return false; } diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c index 8b4f8cc2e0ec..1fec61603ef3 100644 --- a/kernel/vmcore_info.c +++ b/kernel/vmcore_info.c @@ -198,17 +198,17 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PG_private); VMCOREINFO_NUMBER(PG_swapcache); VMCOREINFO_NUMBER(PG_swapbacked); -#define PAGE_SLAB_MAPCOUNT_VALUE (~PG_slab) +#define PAGE_SLAB_MAPCOUNT_VALUE (PGTY_slab << 24) VMCOREINFO_NUMBER(PAGE_SLAB_MAPCOUNT_VALUE); #ifdef CONFIG_MEMORY_FAILURE VMCOREINFO_NUMBER(PG_hwpoison); #endif VMCOREINFO_NUMBER(PG_head_mask); -#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy) +#define PAGE_BUDDY_MAPCOUNT_VALUE (PGTY_buddy << 24) VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); -#define PAGE_HUGETLB_MAPCOUNT_VALUE (~PG_hugetlb) +#define PAGE_HUGETLB_MAPCOUNT_VALUE (PGTY_hugetlb << 24) VMCOREINFO_NUMBER(PAGE_HUGETLB_MAPCOUNT_VALUE); -#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline) +#define PAGE_OFFLINE_MAPCOUNT_VALUE (PGTY_offline << 24) VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); #ifdef CONFIG_KALLSYMS diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 38e58f74fd2b..3b8a915558ef 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2625,6 +2625,7 @@ config RESOURCE_KUNIT_TEST tristate "KUnit test for resource API" if !KUNIT_ALL_TESTS depends on KUNIT default KUNIT_ALL_TESTS + select GET_FREE_REGION help This builds the resource API unit test. Tests the logic of API provided by resource.c and ioport.h. diff --git a/lib/fortify_kunit.c b/lib/fortify_kunit.c index f9ad60a9c7bd..ecb638d4cde1 100644 --- a/lib/fortify_kunit.c +++ b/lib/fortify_kunit.c @@ -306,8 +306,7 @@ DEFINE_ALLOC_SIZE_TEST_PAIR(vmalloc) orig = kvmalloc(prev_size, gfp); \ KUNIT_EXPECT_TRUE(test, orig != NULL); \ checker(((expected_pages) * PAGE_SIZE) * 2, \ - kvrealloc(orig, prev_size, \ - ((alloc_pages) * PAGE_SIZE) * 2, gfp), \ + kvrealloc(orig, ((alloc_pages) * PAGE_SIZE) * 2, gfp), \ kvfree(p)); \ } while (0) DEFINE_ALLOC_SIZE_TEST_PAIR(kvmalloc) diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c index e7ac8694b797..bc45594ad2a8 100644 --- a/lib/lz4/lz4hc_compress.c +++ b/lib/lz4/lz4hc_compress.c @@ -621,6 +621,7 @@ void LZ4_resetStreamHC(LZ4_streamHC_t *LZ4_streamHCPtr, int compressionLevel) LZ4_streamHCPtr->internal_donotuse.base = NULL; LZ4_streamHCPtr->internal_donotuse.compressionLevel = (unsigned int)compressionLevel; } +EXPORT_SYMBOL(LZ4_resetStreamHC); int LZ4_loadDictHC(LZ4_streamHC_t *LZ4_streamHCPtr, const char *dictionary, diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 6df3a8b95808..20990ecba2dd 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -348,17 +348,17 @@ static inline void *mte_safe_root(const struct maple_enode *node) return (void *)((unsigned long)node & ~MAPLE_ROOT_NODE); } -static inline void *mte_set_full(const struct maple_enode *node) +static inline void __maybe_unused *mte_set_full(const struct maple_enode *node) { return (void *)((unsigned long)node & ~MAPLE_ENODE_NULL); } -static inline void *mte_clear_full(const struct maple_enode *node) +static inline void __maybe_unused *mte_clear_full(const struct maple_enode *node) { return (void *)((unsigned long)node | MAPLE_ENODE_NULL); } -static inline bool mte_has_null(const struct maple_enode *node) +static inline bool __maybe_unused mte_has_null(const struct maple_enode *node) { return (unsigned long)node & MAPLE_ENODE_NULL; } @@ -474,6 +474,7 @@ enum maple_type mas_parent_type(struct ma_state *mas, struct maple_enode *enode) /* * mas_set_parent() - Set the parent node and encode the slot + * @mas: The maple state * @enode: The encoded maple node. * @parent: The encoded maple node that is the parent of @enode. * @slot: The slot that @enode resides in @parent. @@ -534,7 +535,7 @@ unsigned int mte_parent_slot(const struct maple_enode *enode) /* * mte_parent() - Get the parent of @node. - * @node: The encoded maple node. + * @enode: The encoded maple node. * * Return: The parent maple node. */ @@ -641,8 +642,8 @@ static inline unsigned int mas_alloc_req(const struct ma_state *mas) /* * ma_pivots() - Get a pointer to the maple node pivots. - * @node - the maple node - * @type - the node type + * @node: the maple node + * @type: the node type * * In the event of a dead node, this array may be %NULL * @@ -665,8 +666,8 @@ static inline unsigned long *ma_pivots(struct maple_node *node, /* * ma_gaps() - Get a pointer to the maple node gaps. - * @node - the maple node - * @type - the node type + * @node: the maple node + * @type: the node type * * Return: A pointer to the maple node gaps */ @@ -880,8 +881,6 @@ static inline void ma_set_meta(struct maple_node *mn, enum maple_type mt, * @mt: The maple tree * @mn: The maple node * @type: The maple node type - * @offset: The offset of the highest sub-gap in this node. - * @end: The end of the data in this node. */ static inline void mt_clear_meta(struct maple_tree *mt, struct maple_node *mn, enum maple_type type) @@ -939,7 +938,7 @@ static inline unsigned char ma_meta_gap(struct maple_node *mn) /* * ma_set_meta_gap() - Set the largest gap location in a nodes metadata * @mn: The maple node - * @mn: The maple node type + * @mt: The maple node type * @offset: The location of the largest gap. */ static inline void ma_set_meta_gap(struct maple_node *mn, enum maple_type mt, @@ -953,8 +952,8 @@ static inline void ma_set_meta_gap(struct maple_node *mn, enum maple_type mt, /* * mat_add() - Add a @dead_enode to the ma_topiary of a list of dead nodes. - * @mat - the ma_topiary, a linked list of dead nodes. - * @dead_enode - the node to be marked as dead and added to the tail of the list + * @mat: the ma_topiary, a linked list of dead nodes. + * @dead_enode: the node to be marked as dead and added to the tail of the list * * Add the @dead_enode to the linked list in @mat. */ @@ -977,8 +976,8 @@ static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt, bool free); /* * mas_mat_destroy() - Free all nodes and subtrees in a dead list. - * @mas - the maple state - * @mat - the ma_topiary linked list of dead nodes to free. + * @mas: the maple state + * @mat: the ma_topiary linked list of dead nodes to free. * * Destroy walk a dead list. */ @@ -999,7 +998,7 @@ static void mas_mat_destroy(struct ma_state *mas, struct ma_topiary *mat) } /* * mas_descend() - Descend into the slot stored in the ma_state. - * @mas - the maple state. + * @mas: the maple state. * * Note: Not RCU safe, only use in write side or debug code. */ @@ -1346,8 +1345,8 @@ static void mas_node_count(struct ma_state *mas, int count) * Return: * - If mas->node is an error or not mas_start, return NULL. * - If it's an empty tree: NULL & mas->status == ma_none - * - If it's a single entry: The entry & mas->status == mas_root - * - If it's a tree: NULL & mas->status == safe root node. + * - If it's a single entry: The entry & mas->status == ma_root + * - If it's a tree: NULL & mas->status == ma_active */ static inline struct maple_enode *mas_start(struct ma_state *mas) { @@ -1372,9 +1371,9 @@ static inline struct maple_enode *mas_start(struct ma_state *mas) return NULL; } + mas->node = NULL; /* empty tree */ if (unlikely(!root)) { - mas->node = NULL; mas->status = ma_none; mas->offset = MAPLE_NODE_SLOTS; return NULL; @@ -1462,7 +1461,7 @@ static inline unsigned char mas_data_end(struct ma_state *mas) /* * mas_leaf_max_gap() - Returns the largest gap in a leaf node - * @mas - the maple state + * @mas: the maple state * * Return: The maximum gap in the leaf. */ @@ -1544,7 +1543,7 @@ static unsigned long mas_leaf_max_gap(struct ma_state *mas) * @node: The maple node * @gaps: The pointer to the gaps * @mt: The maple node type - * @*off: Pointer to store the offset location of the gap. + * @off: Pointer to store the offset location of the gap. * * Uses the metadata data end to scan backwards across set gaps. * @@ -1651,7 +1650,7 @@ static inline void mas_parent_gap(struct ma_state *mas, unsigned char offset, /* * mas_update_gap() - Update a nodes gaps and propagate up if necessary. - * @mas - the maple state. + * @mas: the maple state. */ static inline void mas_update_gap(struct ma_state *mas) { @@ -1678,8 +1677,8 @@ static inline void mas_update_gap(struct ma_state *mas) /* * mas_adopt_children() - Set the parent pointer of all nodes in @parent to * @parent with the slot encoded. - * @mas - the maple state (for the tree) - * @parent - the maple encoded node containing the children. + * @mas: the maple state (for the tree) + * @parent: the maple encoded node containing the children. */ static inline void mas_adopt_children(struct ma_state *mas, struct maple_enode *parent) @@ -1701,8 +1700,8 @@ static inline void mas_adopt_children(struct ma_state *mas, /* * mas_put_in_tree() - Put a new node in the tree, smp_wmb(), and mark the old * node as dead. - * @mas - the maple state with the new node - * @old_enode - The old maple encoded node to replace. + * @mas: the maple state with the new node + * @old_enode: The old maple encoded node to replace. */ static inline void mas_put_in_tree(struct ma_state *mas, struct maple_enode *old_enode) @@ -1730,8 +1729,8 @@ static inline void mas_put_in_tree(struct ma_state *mas, * mas_replace_node() - Replace a node by putting it in the tree, marking it * dead, and freeing it. * the parent encoding to locate the maple node in the tree. - * @mas - the ma_state with @mas->node pointing to the new node. - * @old_enode - The old maple encoded node. + * @mas: the ma_state with @mas->node pointing to the new node. + * @old_enode: The old maple encoded node. */ static inline void mas_replace_node(struct ma_state *mas, struct maple_enode *old_enode) @@ -1796,7 +1795,6 @@ static inline void mab_shift_right(struct maple_big_node *b_node, /* * mab_middle_node() - Check if a middle node is needed (unlikely) * @b_node: the maple_big_node that contains the data. - * @size: the amount of data in the b_node * @split: the potential split location * @slot_count: the size that can be stored in a single node being considered. * @@ -1844,6 +1842,7 @@ static inline int mab_no_null_split(struct maple_big_node *b_node, /* * mab_calc_split() - Calculate the split location and if there needs to be two * splits. + * @mas: The maple state * @bn: The maple_big_node with the data * @mid_split: The second split, if required. 0 otherwise. * @@ -2177,7 +2176,8 @@ static inline bool mas_next_sibling(struct ma_state *mas) } /* - * mte_node_or_none() - Set the enode and state. + * mas_node_or_none() - Set the enode and state. + * @mas: the maple state * @enode: The encoded maple node. * * Set the node to the enode and the status. @@ -2228,7 +2228,6 @@ static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas) /* * mast_rebalance_next() - Rebalance against the next node * @mast: The maple subtree state - * @old_r: The encoded maple node to the right (next node). */ static inline void mast_rebalance_next(struct maple_subtree_state *mast) { @@ -2242,7 +2241,6 @@ static inline void mast_rebalance_next(struct maple_subtree_state *mast) /* * mast_rebalance_prev() - Rebalance against the previous node * @mast: The maple subtree state - * @old_l: The encoded maple node to the left (previous node) */ static inline void mast_rebalance_prev(struct maple_subtree_state *mast) { @@ -2393,9 +2391,9 @@ static inline unsigned char mas_mab_to_node(struct ma_state *mas, /* * mab_set_b_end() - Add entry to b_node at b_node->b_end and increment the end * pointer. - * @b_node - the big node to add the entry - * @mas - the maple state to get the pivot (mas->max) - * @entry - the entry to add, if NULL nothing happens. + * @b_node: the big node to add the entry + * @mas: the maple state to get the pivot (mas->max) + * @entry: the entry to add, if NULL nothing happens. */ static inline void mab_set_b_end(struct maple_big_node *b_node, struct ma_state *mas, @@ -2414,11 +2412,11 @@ static inline void mab_set_b_end(struct maple_big_node *b_node, * mas_set_split_parent() - combine_then_separate helper function. Sets the parent * of @mas->node to either @left or @right, depending on @slot and @split * - * @mas - the maple state with the node that needs a parent - * @left - possible parent 1 - * @right - possible parent 2 - * @slot - the slot the mas->node was placed - * @split - the split location between @left and @right + * @mas: the maple state with the node that needs a parent + * @left: possible parent 1 + * @right: possible parent 2 + * @slot: the slot the mas->node was placed + * @split: the split location between @left and @right */ static inline void mas_set_split_parent(struct ma_state *mas, struct maple_enode *left, @@ -2438,11 +2436,11 @@ static inline void mas_set_split_parent(struct ma_state *mas, /* * mte_mid_split_check() - Check if the next node passes the mid-split - * @**l: Pointer to left encoded maple node. - * @**m: Pointer to middle encoded maple node. - * @**r: Pointer to right encoded maple node. + * @l: Pointer to left encoded maple node. + * @m: Pointer to middle encoded maple node. + * @r: Pointer to right encoded maple node. * @slot: The offset - * @*split: The split location. + * @split: The split location. * @mid_split: The middle split. */ static inline void mte_mid_split_check(struct maple_enode **l, @@ -2466,10 +2464,10 @@ static inline void mte_mid_split_check(struct maple_enode **l, /* * mast_set_split_parents() - Helper function to set three nodes parents. Slot * is taken from @mast->l. - * @mast - the maple subtree state - * @left - the left node - * @right - the right node - * @split - the split location. + * @mast: the maple subtree state + * @left: the left node + * @right: the right node + * @split: the split location. */ static inline void mast_set_split_parents(struct maple_subtree_state *mast, struct maple_enode *left, @@ -2503,7 +2501,6 @@ static inline void mast_set_split_parents(struct maple_subtree_state *mast, /* * mas_topiary_node() - Dispose of a single node * @mas: The maple state for pushing nodes - * @enode: The encoded maple node * @in_rcu: If the tree is in rcu mode * * The node will either be RCU freed or pushed back on the maple state. @@ -2635,7 +2632,7 @@ static inline void mas_topiary_replace(struct ma_state *mas, /* * mas_wmb_replace() - Write memory barrier and replace * @mas: The maple state - * @old: The old maple encoded node that is being replaced. + * @old_enode: The old maple encoded node that is being replaced. * * Updates gap as necessary. */ @@ -2823,10 +2820,8 @@ static inline void *mtree_range_walk(struct ma_state *mas) * orig_l_mas->last is used in mas_consume to find the slots that will need to * be either freed or destroyed. orig_l_mas->depth keeps track of the height of * the new sub-tree in case the sub-tree becomes the full tree. - * - * Return: the number of elements in b_node during the last loop. */ -static int mas_spanning_rebalance(struct ma_state *mas, +static void mas_spanning_rebalance(struct ma_state *mas, struct maple_subtree_state *mast, unsigned char count) { unsigned char split, mid_split; @@ -2942,7 +2937,7 @@ static int mas_spanning_rebalance(struct ma_state *mas, mas->offset = l_mas.offset; mas_wmb_replace(mas, old_enode); mtree_range_walk(mas); - return mast->bn->b_end; + return; } /* @@ -2952,10 +2947,8 @@ static int mas_spanning_rebalance(struct ma_state *mas, * * Rebalance two nodes into a single node or two new nodes that are sufficient. * Continue upwards until tree is sufficient. - * - * Return: the number of elements in b_node during the last loop. */ -static inline int mas_rebalance(struct ma_state *mas, +static inline void mas_rebalance(struct ma_state *mas, struct maple_big_node *b_node) { char empty_count = mas_mt_height(mas); @@ -2976,9 +2969,6 @@ static inline int mas_rebalance(struct ma_state *mas, * tries to combine the data in the same way. If one node contains the * entire range of the tree, then that node is used as a new root node. */ - mas_node_count(mas, empty_count * 2 - 1); - if (mas_is_err(mas)) - return 0; mast.orig_l = &l_mas; mast.orig_r = &r_mas; @@ -3029,11 +3019,6 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end /* set up node. */ if (in_rcu) { - /* Allocate for both left and right as well as parent. */ - mas_node_count(mas, 3); - if (mas_is_err(mas)) - return; - newnode = mas_pop_node(mas); } else { newnode = &reuse; @@ -3308,9 +3293,8 @@ static inline bool mas_push_data(struct ma_state *mas, int height, * mas_split() - Split data that is too big for one node into two. * @mas: The maple state * @b_node: The maple big node - * Return: 1 on success, 0 on failure. */ -static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) +static void mas_split(struct ma_state *mas, struct maple_big_node *b_node) { struct maple_subtree_state mast; int height = 0; @@ -3341,10 +3325,6 @@ static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) trace_ma_op(__func__, mas); mas->depth = mas_mt_height(mas); - /* Allocation failures will happen early. */ - mas_node_count(mas, 1 + mas->depth * 2); - if (mas_is_err(mas)) - return 0; mast.l = &l_mas; mast.r = &r_mas; @@ -3392,75 +3372,25 @@ static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) mas->node = l_mas.node; mas_wmb_replace(mas, old); mtree_range_walk(mas); - return 1; -} - -/* - * mas_reuse_node() - Reuse the node to store the data. - * @wr_mas: The maple write state - * @bn: The maple big node - * @end: The end of the data. - * - * Will always return false in RCU mode. - * - * Return: True if node was reused, false otherwise. - */ -static inline bool mas_reuse_node(struct ma_wr_state *wr_mas, - struct maple_big_node *bn, unsigned char end) -{ - /* Need to be rcu safe. */ - if (mt_in_rcu(wr_mas->mas->tree)) - return false; - - if (end > bn->b_end) { - int clear = mt_slots[wr_mas->type] - bn->b_end; - - memset(wr_mas->slots + bn->b_end, 0, sizeof(void *) * clear--); - memset(wr_mas->pivots + bn->b_end, 0, sizeof(void *) * clear); - } - mab_mas_cp(bn, 0, bn->b_end, wr_mas->mas, false); - return true; + return; } /* * mas_commit_b_node() - Commit the big node into the tree. * @wr_mas: The maple write state * @b_node: The maple big node - * @end: The end of the data. */ -static noinline_for_kasan int mas_commit_b_node(struct ma_wr_state *wr_mas, - struct maple_big_node *b_node, unsigned char end) +static noinline_for_kasan void mas_commit_b_node(struct ma_wr_state *wr_mas, + struct maple_big_node *b_node) { - struct maple_node *node; - struct maple_enode *old_enode; - unsigned char b_end = b_node->b_end; - enum maple_type b_type = b_node->type; + enum store_type type = wr_mas->mas->store_type; - old_enode = wr_mas->mas->node; - if ((b_end < mt_min_slots[b_type]) && - (!mte_is_root(old_enode)) && - (mas_mt_height(wr_mas->mas) > 1)) + WARN_ON_ONCE(type != wr_rebalance && type != wr_split_store); + + if (type == wr_rebalance) return mas_rebalance(wr_mas->mas, b_node); - if (b_end >= mt_slots[b_type]) - return mas_split(wr_mas->mas, b_node); - - if (mas_reuse_node(wr_mas, b_node, end)) - goto reuse_node; - - mas_node_count(wr_mas->mas, 1); - if (mas_is_err(wr_mas->mas)) - return 0; - - node = mas_pop_node(wr_mas->mas); - node->parent = mas_mn(wr_mas->mas)->parent; - wr_mas->mas->node = mt_mk_node(node, b_type); - mab_mas_cp(b_node, 0, b_end, wr_mas->mas, false); - mas_replace_node(wr_mas->mas, old_enode); -reuse_node: - mas_update_gap(wr_mas->mas); - wr_mas->mas->end = b_end; - return 1; + return mas_split(wr_mas->mas, b_node); } /* @@ -3477,10 +3407,6 @@ static inline int mas_root_expand(struct ma_state *mas, void *entry) unsigned long *pivots; int slot = 0; - mas_node_count(mas, 1); - if (unlikely(mas_is_err(mas))) - return 0; - node = mas_pop_node(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); @@ -3526,10 +3452,7 @@ static inline void mas_store_root(struct ma_state *mas, void *entry) /* * mas_is_span_wr() - Check if the write needs to be treated as a write that * spans the node. - * @mas: The maple state - * @piv: The pivot value being written - * @type: The maple node type - * @entry: The data to write + * @wr_mas: The maple write state * * Spanning writes are writes that start in one node and end in another OR if * the write of a %NULL will cause the node to end with a %NULL. @@ -3730,10 +3653,8 @@ static void mte_destroy_walk(struct maple_enode *, struct maple_tree *); * @entry: The entry to store. * * Only valid when the index == 0 and the last == ULONG_MAX - * - * Return 0 on error, 1 on success. */ -static inline int mas_new_root(struct ma_state *mas, void *entry) +static inline void mas_new_root(struct ma_state *mas, void *entry) { struct maple_enode *root = mas_root_locked(mas); enum maple_type type = maple_leaf_64; @@ -3749,10 +3670,6 @@ static inline int mas_new_root(struct ma_state *mas, void *entry) goto done; } - mas_node_count(mas, 1); - if (mas_is_err(mas)) - return 0; - node = mas_pop_node(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); @@ -3769,7 +3686,7 @@ static inline int mas_new_root(struct ma_state *mas, void *entry) if (xa_is_node(root)) mte_destroy_walk(root, mas->tree); - return 1; + return; } /* * mas_wr_spanning_store() - Create a subtree with the store operation completed @@ -3777,10 +3694,8 @@ static inline int mas_new_root(struct ma_state *mas, void *entry) * Note that mas is expected to point to the node which caused the store to * span. * @wr_mas: The maple write state - * - * Return: 0 on error, positive on success. */ -static inline int mas_wr_spanning_store(struct ma_wr_state *wr_mas) +static noinline void mas_wr_spanning_store(struct ma_wr_state *wr_mas) { struct maple_subtree_state mast; struct maple_big_node b_node; @@ -3815,9 +3730,6 @@ static inline int mas_wr_spanning_store(struct ma_wr_state *wr_mas) * entries per level plus a new root. */ height = mas_mt_height(mas); - mas_node_count(mas, 1 + height * 3); - if (mas_is_err(mas)) - return 0; /* * Set up right side. Need to get to the next offset after the spanning @@ -3875,10 +3787,8 @@ static inline int mas_wr_spanning_store(struct ma_wr_state *wr_mas) * @wr_mas: The maple write state * * Attempts to reuse the node, but may allocate. - * - * Return: True if stored, false otherwise */ -static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas, +static inline void mas_wr_node_store(struct ma_wr_state *wr_mas, unsigned char new_end) { struct ma_state *mas = wr_mas->mas; @@ -3889,11 +3799,6 @@ static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas, unsigned char copy_size, node_pivots = mt_pivots[wr_mas->type]; bool in_rcu = mt_in_rcu(mas->tree); - /* Check if there is enough data. The room is enough. */ - if (!mte_is_root(mas->node) && (new_end <= mt_min_slots[wr_mas->type]) && - !(mas->mas_flags & MA_STATE_BULK)) - return false; - if (mas->last == wr_mas->end_piv) offset_end++; /* don't copy this offset */ else if (unlikely(wr_mas->r_max == ULONG_MAX)) @@ -3901,10 +3806,6 @@ static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas, /* set up node. */ if (in_rcu) { - mas_node_count(mas, 1); - if (mas_is_err(mas)) - return false; - newnode = mas_pop_node(mas); } else { memset(&reuse, 0, sizeof(struct maple_node)); @@ -3960,16 +3861,14 @@ static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas, trace_ma_write(__func__, mas, 0, wr_mas->entry); mas_update_gap(mas); mas->end = new_end; - return true; + return; } /* * mas_wr_slot_store: Attempt to store a value in a slot. * @wr_mas: the maple write state - * - * Return: True if stored, false otherwise */ -static inline bool mas_wr_slot_store(struct ma_wr_state *wr_mas) +static inline void mas_wr_slot_store(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char offset = mas->offset; @@ -4001,7 +3900,7 @@ static inline bool mas_wr_slot_store(struct ma_wr_state *wr_mas) wr_mas->pivots[offset + 1] = mas->last; mas->offset++; /* Keep mas accurate. */ } else { - return false; + return; } trace_ma_write(__func__, mas, 0, wr_mas->entry); @@ -4012,7 +3911,7 @@ static inline bool mas_wr_slot_store(struct ma_wr_state *wr_mas) if (!wr_mas->entry || gap) mas_update_gap(mas); - return true; + return; } static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas) @@ -4061,9 +3960,6 @@ static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas) wr_mas->end_piv = wr_mas->pivots[wr_mas->offset_end]; else wr_mas->end_piv = wr_mas->mas->max; - - if (!wr_mas->entry) - mas_wr_extend_null(wr_mas); } static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas) @@ -4089,23 +3985,13 @@ static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas) * This is currently unsafe in rcu mode since the end of the node may be cached * by readers while the node contents may be updated which could result in * inaccurate information. - * - * Return: True if appended, false otherwise */ -static inline bool mas_wr_append(struct ma_wr_state *wr_mas, +static inline void mas_wr_append(struct ma_wr_state *wr_mas, unsigned char new_end) { - struct ma_state *mas; + struct ma_state *mas = wr_mas->mas; void __rcu **slots; - unsigned char end; - - mas = wr_mas->mas; - if (mt_in_rcu(mas->tree)) - return false; - - end = mas->end; - if (mas->offset != end) - return false; + unsigned char end = mas->end; if (new_end < mt_pivots[wr_mas->type]) { wr_mas->pivots[new_end] = wr_mas->pivots[end]; @@ -4139,7 +4025,7 @@ static inline bool mas_wr_append(struct ma_wr_state *wr_mas, mas->end = new_end; trace_ma_write(__func__, mas, new_end, wr_mas->entry); - return true; + return; } /* @@ -4155,76 +4041,235 @@ static void mas_wr_bnode(struct ma_wr_state *wr_mas) trace_ma_write(__func__, wr_mas->mas, 0, wr_mas->entry); memset(&b_node, 0, sizeof(struct maple_big_node)); mas_store_b_node(wr_mas, &b_node, wr_mas->offset_end); - mas_commit_b_node(wr_mas, &b_node, wr_mas->mas->end); -} - -static inline void mas_wr_modify(struct ma_wr_state *wr_mas) -{ - struct ma_state *mas = wr_mas->mas; - unsigned char new_end; - - /* Direct replacement */ - if (wr_mas->r_min == mas->index && wr_mas->r_max == mas->last) { - rcu_assign_pointer(wr_mas->slots[mas->offset], wr_mas->entry); - if (!!wr_mas->entry ^ !!wr_mas->content) - mas_update_gap(mas); - return; - } - - /* - * new_end exceeds the size of the maple node and cannot enter the fast - * path. - */ - new_end = mas_wr_new_end(wr_mas); - if (new_end >= mt_slots[wr_mas->type]) - goto slow_path; - - /* Attempt to append */ - if (mas_wr_append(wr_mas, new_end)) - return; - - if (new_end == mas->end && mas_wr_slot_store(wr_mas)) - return; - - if (mas_wr_node_store(wr_mas, new_end)) - return; - - if (mas_is_err(mas)) - return; - -slow_path: - mas_wr_bnode(wr_mas); + mas_commit_b_node(wr_mas, &b_node); } /* * mas_wr_store_entry() - Internal call to store a value - * @mas: The maple state - * @entry: The entry to store. - * - * Return: The contents that was stored at the index. + * @wr_mas: The maple write state */ static inline void mas_wr_store_entry(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; + unsigned char new_end = mas_wr_new_end(wr_mas); - wr_mas->content = mas_start(mas); - if (mas_is_none(mas) || mas_is_ptr(mas)) { + switch (mas->store_type) { + case wr_invalid: + MT_BUG_ON(mas->tree, 1); + return; + case wr_new_root: + mas_new_root(mas, wr_mas->entry); + break; + case wr_store_root: mas_store_root(mas, wr_mas->entry); + break; + case wr_exact_fit: + rcu_assign_pointer(wr_mas->slots[mas->offset], wr_mas->entry); + if (!!wr_mas->entry ^ !!wr_mas->content) + mas_update_gap(mas); + break; + case wr_append: + mas_wr_append(wr_mas, new_end); + break; + case wr_slot_store: + mas_wr_slot_store(wr_mas); + break; + case wr_node_store: + mas_wr_node_store(wr_mas, new_end); + break; + case wr_spanning_store: + mas_wr_spanning_store(wr_mas); + break; + case wr_split_store: + case wr_rebalance: + mas_wr_bnode(wr_mas); + break; + } + + return; +} + +static inline void mas_wr_prealloc_setup(struct ma_wr_state *wr_mas) +{ + struct ma_state *mas = wr_mas->mas; + + if (!mas_is_active(mas)) { + if (mas_is_start(mas)) + goto set_content; + + if (unlikely(mas_is_paused(mas))) + goto reset; + + if (unlikely(mas_is_none(mas))) + goto reset; + + if (unlikely(mas_is_overflow(mas))) + goto reset; + + if (unlikely(mas_is_underflow(mas))) + goto reset; + } + + /* + * A less strict version of mas_is_span_wr() where we allow spanning + * writes within this node. This is to stop partial walks in + * mas_prealloc() from being reset. + */ + if (mas->last > mas->max) + goto reset; + + if (wr_mas->entry) + goto set_content; + + if (mte_is_leaf(mas->node) && mas->last == mas->max) + goto reset; + + goto set_content; + +reset: + mas_reset(mas); +set_content: + wr_mas->content = mas_start(mas); +} + +/** + * mas_prealloc_calc() - Calculate number of nodes needed for a + * given store oepration + * @mas: The maple state + * @entry: The entry to store into the tree + * + * Return: Number of nodes required for preallocation. + */ +static inline int mas_prealloc_calc(struct ma_state *mas, void *entry) +{ + int ret = mas_mt_height(mas) * 3 + 1; + + switch (mas->store_type) { + case wr_invalid: + WARN_ON_ONCE(1); + break; + case wr_new_root: + ret = 1; + break; + case wr_store_root: + if (likely((mas->last != 0) || (mas->index != 0))) + ret = 1; + else if (((unsigned long) (entry) & 3) == 2) + ret = 1; + else + ret = 0; + break; + case wr_spanning_store: + ret = mas_mt_height(mas) * 3 + 1; + break; + case wr_split_store: + ret = mas_mt_height(mas) * 2 + 1; + break; + case wr_rebalance: + ret = mas_mt_height(mas) * 2 - 1; + break; + case wr_node_store: + ret = mt_in_rcu(mas->tree) ? 1 : 0; + break; + case wr_append: + case wr_exact_fit: + case wr_slot_store: + ret = 0; + } + + return ret; +} + +/* + * mas_wr_store_type() - Set the store type for a given + * store operation. + * @wr_mas: The maple write state + */ +static inline void mas_wr_store_type(struct ma_wr_state *wr_mas) +{ + struct ma_state *mas = wr_mas->mas; + unsigned char new_end; + + if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) { + mas->store_type = wr_store_root; return; } if (unlikely(!mas_wr_walk(wr_mas))) { - mas_wr_spanning_store(wr_mas); + mas->store_type = wr_spanning_store; return; } /* At this point, we are at the leaf node that needs to be altered. */ mas_wr_end_piv(wr_mas); - /* New root for a single pointer */ - if (unlikely(!mas->index && mas->last == ULONG_MAX)) - mas_new_root(mas, wr_mas->entry); - else - mas_wr_modify(wr_mas); + if (!wr_mas->entry) + mas_wr_extend_null(wr_mas); + + new_end = mas_wr_new_end(wr_mas); + if ((wr_mas->r_min == mas->index) && (wr_mas->r_max == mas->last)) { + mas->store_type = wr_exact_fit; + return; + } + + if (unlikely(!mas->index && mas->last == ULONG_MAX)) { + mas->store_type = wr_new_root; + return; + } + + /* Potential spanning rebalance collapsing a node */ + if (new_end < mt_min_slots[wr_mas->type]) { + if (!mte_is_root(mas->node)) { + mas->store_type = wr_rebalance; + return; + } + mas->store_type = wr_node_store; + return; + } + + if (new_end >= mt_slots[wr_mas->type]) { + mas->store_type = wr_split_store; + return; + } + + if (!mt_in_rcu(mas->tree) && (mas->offset == mas->end)) { + mas->store_type = wr_append; + return; + } + + if ((new_end == mas->end) && (!mt_in_rcu(mas->tree) || + (wr_mas->offset_end - mas->offset == 1))) { + mas->store_type = wr_slot_store; + return; + } + + if (mte_is_root(mas->node) || (new_end >= mt_min_slots[wr_mas->type]) || + (mas->mas_flags & MA_STATE_BULK)) { + mas->store_type = wr_node_store; + return; + } + + mas->store_type = wr_invalid; + MAS_WARN_ON(mas, 1); +} + +/** + * mas_wr_preallocate() - Preallocate enough nodes for a store operation + * @wr_mas: The maple write state + * @entry: The entry that will be stored + * + */ +static inline void mas_wr_preallocate(struct ma_wr_state *wr_mas, void *entry) +{ + struct ma_state *mas = wr_mas->mas; + int request; + + mas_wr_prealloc_setup(wr_mas); + mas_wr_store_type(wr_mas); + request = mas_prealloc_calc(mas, entry); + if (!request) + return; + + mas_node_count(mas, request); } /** @@ -4257,26 +4302,24 @@ static inline void *mas_insert(struct ma_state *mas, void *entry) if (wr_mas.content) goto exists; - if (mas_is_none(mas) || mas_is_ptr(mas)) { - mas_store_root(mas, entry); + mas_wr_preallocate(&wr_mas, entry); + if (mas_is_err(mas)) return NULL; - } /* spanning writes always overwrite something */ - if (!mas_wr_walk(&wr_mas)) + if (mas->store_type == wr_spanning_store) goto exists; /* At this point, we are at the leaf node that needs to be altered. */ - wr_mas.offset_end = mas->offset; - wr_mas.end_piv = wr_mas.r_max; + if (mas->store_type != wr_new_root && mas->store_type != wr_store_root) { + wr_mas.offset_end = mas->offset; + wr_mas.end_piv = wr_mas.r_max; - if (wr_mas.content || (mas->last > wr_mas.r_max)) - goto exists; + if (wr_mas.content || (mas->last > wr_mas.r_max)) + goto exists; + } - if (!entry) - return NULL; - - mas_wr_modify(&wr_mas); + mas_wr_store_entry(&wr_mas); return wr_mas.content; exists: @@ -4331,6 +4374,7 @@ int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, if (*next == 0) mas->tree->ma_flags |= MT_FLAGS_ALLOC_WRAPPED; + mas_destroy(mas); return ret; } EXPORT_SYMBOL(mas_alloc_cyclic); @@ -4440,9 +4484,8 @@ static int mas_prev_node(struct ma_state *mas, unsigned long min) * mas_prev_slot() - Get the entry in the previous slot * * @mas: The maple state - * @max: The minimum starting range + * @min: The minimum starting range * @empty: Can be empty - * @set_underflow: Set the @mas->node to underflow state on limit. * * Return: The entry in the previous slot which is possibly NULL */ @@ -4525,6 +4568,7 @@ static void *mas_prev_slot(struct ma_state *mas, unsigned long min, bool empty) /* * mas_next_node() - Get the next node at the same level in the tree. * @mas: The maple state + * @node: The maple node * @max: The maximum pivot value to check. * * The next value will be mas->node[mas->offset] or the status will have @@ -4615,8 +4659,6 @@ static int mas_next_node(struct ma_state *mas, struct maple_node *node, * @mas: The maple state * @max: The maximum starting range * @empty: Can be empty - * @set_overflow: Should @mas->node be set to overflow when the limit is - * reached. * * Return: The entry in the next slot which is possibly NULL */ @@ -5150,9 +5192,9 @@ EXPORT_SYMBOL_GPL(mas_empty_area_rev); /* * mte_dead_leaves() - Mark all leaves of a node as dead. - * @mas: The maple state + * @enode: the encoded node + * @mt: the maple tree * @slots: Pointer to the slot array - * @type: The maple node type * * Must hold the write lock. * @@ -5358,47 +5400,6 @@ static inline void mte_destroy_walk(struct maple_enode *enode, mt_destroy_walk(enode, mt, true); } } - -static void mas_wr_store_setup(struct ma_wr_state *wr_mas) -{ - if (!mas_is_active(wr_mas->mas)) { - if (mas_is_start(wr_mas->mas)) - return; - - if (unlikely(mas_is_paused(wr_mas->mas))) - goto reset; - - if (unlikely(mas_is_none(wr_mas->mas))) - goto reset; - - if (unlikely(mas_is_overflow(wr_mas->mas))) - goto reset; - - if (unlikely(mas_is_underflow(wr_mas->mas))) - goto reset; - } - - /* - * A less strict version of mas_is_span_wr() where we allow spanning - * writes within this node. This is to stop partial walks in - * mas_prealloc() from being reset. - */ - if (wr_mas->mas->last > wr_mas->mas->max) - goto reset; - - if (wr_mas->entry) - return; - - if (mte_is_leaf(wr_mas->mas->node) && - wr_mas->mas->last == wr_mas->mas->max) - goto reset; - - return; - -reset: - mas_reset(wr_mas->mas); -} - /* Interface */ /** @@ -5407,13 +5408,12 @@ static void mas_wr_store_setup(struct ma_wr_state *wr_mas) * @entry: The entry to store. * * The @mas->index and @mas->last is used to set the range for the @entry. - * Note: The @mas should have pre-allocated entries to ensure there is memory to - * store the entry. Please see mas_expected_entries()/mas_destroy() for more details. * * Return: the first entry between mas->index and mas->last or %NULL. */ void *mas_store(struct ma_state *mas, void *entry) { + int request; MA_WR_STATE(wr_mas, mas, entry); trace_ma_write(__func__, mas, 0, entry); @@ -5434,8 +5434,25 @@ void *mas_store(struct ma_state *mas, void *entry) * want to examine what happens if a single store operation was to * overwrite multiple entries within a self-balancing B-Tree. */ - mas_wr_store_setup(&wr_mas); + mas_wr_prealloc_setup(&wr_mas); + mas_wr_store_type(&wr_mas); + if (mas->mas_flags & MA_STATE_PREALLOC) { + mas_wr_store_entry(&wr_mas); + MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas)); + return wr_mas.content; + } + + request = mas_prealloc_calc(mas, entry); + if (!request) + goto store; + + mas_node_count(mas, request); + if (mas_is_err(mas)) + return NULL; + +store: mas_wr_store_entry(&wr_mas); + mas_destroy(mas); return wr_mas.content; } EXPORT_SYMBOL_GPL(mas_store); @@ -5451,19 +5468,28 @@ EXPORT_SYMBOL_GPL(mas_store); */ int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp) { + unsigned long index = mas->index; + unsigned long last = mas->last; MA_WR_STATE(wr_mas, mas, entry); + int ret = 0; - mas_wr_store_setup(&wr_mas); - trace_ma_write(__func__, mas, 0, entry); retry: - mas_wr_store_entry(&wr_mas); - if (unlikely(mas_nomem(mas, gfp))) + mas_wr_preallocate(&wr_mas, entry); + if (unlikely(mas_nomem(mas, gfp))) { + if (!entry) + __mas_set_range(mas, index, last); goto retry; + } - if (unlikely(mas_is_err(mas))) - return xa_err(mas->node); + if (mas_is_err(mas)) { + ret = xa_err(mas->node); + goto out; + } - return 0; + mas_wr_store_entry(&wr_mas); +out: + mas_destroy(mas); + return ret; } EXPORT_SYMBOL_GPL(mas_store_gfp); @@ -5477,7 +5503,19 @@ void mas_store_prealloc(struct ma_state *mas, void *entry) { MA_WR_STATE(wr_mas, mas, entry); - mas_wr_store_setup(&wr_mas); + if (mas->store_type == wr_store_root) { + mas_wr_prealloc_setup(&wr_mas); + goto store; + } + + mas_wr_walk_descend(&wr_mas); + if (mas->store_type != wr_spanning_store) { + /* set wr_mas->content to current slot */ + wr_mas.content = mas_slot_locked(mas, wr_mas.slots, mas->offset); + mas_wr_end_piv(&wr_mas); + } + +store: trace_ma_write(__func__, mas, 0, entry); mas_wr_store_entry(&wr_mas); MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas)); @@ -5496,70 +5534,25 @@ EXPORT_SYMBOL_GPL(mas_store_prealloc); int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) { MA_WR_STATE(wr_mas, mas, entry); - unsigned char node_size; - int request = 1; - int ret; + int ret = 0; + int request; + mas_wr_prealloc_setup(&wr_mas); + mas_wr_store_type(&wr_mas); + request = mas_prealloc_calc(mas, entry); + if (!request) + return ret; - if (unlikely(!mas->index && mas->last == ULONG_MAX)) - goto ask_now; - - mas_wr_store_setup(&wr_mas); - wr_mas.content = mas_start(mas); - /* Root expand */ - if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) - goto ask_now; - - if (unlikely(!mas_wr_walk(&wr_mas))) { - /* Spanning store, use worst case for now */ - request = 1 + mas_mt_height(mas) * 3; - goto ask_now; - } - - /* At this point, we are at the leaf node that needs to be altered. */ - /* Exact fit, no nodes needed. */ - if (wr_mas.r_min == mas->index && wr_mas.r_max == mas->last) - return 0; - - mas_wr_end_piv(&wr_mas); - node_size = mas_wr_new_end(&wr_mas); - - /* Slot store, does not require additional nodes */ - if (node_size == mas->end) { - /* reuse node */ - if (!mt_in_rcu(mas->tree)) - return 0; - /* shifting boundary */ - if (wr_mas.offset_end - mas->offset == 1) - return 0; - } - - if (node_size >= mt_slots[wr_mas.type]) { - /* Split, worst case for now. */ - request = 1 + mas_mt_height(mas) * 2; - goto ask_now; - } - - /* New root needs a single node */ - if (unlikely(mte_is_root(mas->node))) - goto ask_now; - - /* Potential spanning rebalance collapsing a node, use worst-case */ - if (node_size - 1 <= mt_min_slots[wr_mas.type]) - request = mas_mt_height(mas) * 2 - 1; - - /* node store, slot store needs one node */ -ask_now: mas_node_count_gfp(mas, request, gfp); - mas->mas_flags |= MA_STATE_PREALLOC; - if (likely(!mas_is_err(mas))) - return 0; + if (mas_is_err(mas)) { + mas_set_alloc_req(mas, 0); + ret = xa_err(mas->node); + mas_destroy(mas); + mas_reset(mas); + return ret; + } - mas_set_alloc_req(mas, 0); - ret = xa_err(mas->node); - mas_reset(mas); - mas_destroy(mas); - mas_reset(mas); + mas->mas_flags |= MA_STATE_PREALLOC; return ret; } EXPORT_SYMBOL_GPL(mas_preallocate); @@ -5585,7 +5578,8 @@ void mas_destroy(struct ma_state *mas) */ if (mas->mas_flags & MA_STATE_REBALANCE) { unsigned char end; - + if (mas_is_err(mas)) + mas_reset(mas); mas_start(mas); mtree_range_walk(mas); end = mas->end + 1; @@ -6245,24 +6239,32 @@ EXPORT_SYMBOL_GPL(mas_find_range_rev); void *mas_erase(struct ma_state *mas) { void *entry; + unsigned long index = mas->index; MA_WR_STATE(wr_mas, mas, NULL); if (!mas_is_active(mas) || !mas_is_start(mas)) mas->status = ma_start; - /* Retry unnecessary when holding the write lock. */ +write_retry: entry = mas_state_walk(mas); if (!entry) return NULL; -write_retry: /* Must reset to ensure spanning writes of last slot are detected */ mas_reset(mas); - mas_wr_store_setup(&wr_mas); - mas_wr_store_entry(&wr_mas); - if (mas_nomem(mas, GFP_KERNEL)) + mas_wr_preallocate(&wr_mas, NULL); + if (mas_nomem(mas, GFP_KERNEL)) { + /* in case the range of entry changed when unlocked */ + mas->index = mas->last = index; goto write_retry; + } + if (mas_is_err(mas)) + goto out; + + mas_wr_store_entry(&wr_mas); +out: + mas_destroy(mas); return entry; } EXPORT_SYMBOL_GPL(mas_erase); @@ -6277,10 +6279,8 @@ EXPORT_SYMBOL_GPL(mas_erase); bool mas_nomem(struct ma_state *mas, gfp_t gfp) __must_hold(mas->tree->ma_lock) { - if (likely(mas->node != MA_ERROR(-ENOMEM))) { - mas_destroy(mas); + if (likely(mas->node != MA_ERROR(-ENOMEM))) return false; - } if (gfpflags_allow_blocking(gfp) && !mt_external_lock(mas->tree)) { mtree_unlock(mas->tree); @@ -6357,7 +6357,7 @@ int mtree_store_range(struct maple_tree *mt, unsigned long index, unsigned long last, void *entry, gfp_t gfp) { MA_STATE(mas, mt, index, last); - MA_WR_STATE(wr_mas, &mas, entry); + int ret = 0; trace_ma_write(__func__, &mas, 0, entry); if (WARN_ON_ONCE(xa_is_advanced(entry))) @@ -6367,16 +6367,10 @@ int mtree_store_range(struct maple_tree *mt, unsigned long index, return -EINVAL; mtree_lock(mt); -retry: - mas_wr_store_entry(&wr_mas); - if (mas_nomem(&mas, gfp)) - goto retry; - + ret = mas_store_gfp(&mas, entry, gfp); mtree_unlock(mt); - if (mas_is_err(&mas)) - return xa_err(mas.node); - return 0; + return ret; } EXPORT_SYMBOL(mtree_store_range); @@ -6412,6 +6406,7 @@ int mtree_insert_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp) { MA_STATE(ms, mt, first, last); + int ret = 0; if (WARN_ON_ONCE(xa_is_advanced(entry))) return -EINVAL; @@ -6427,9 +6422,10 @@ int mtree_insert_range(struct maple_tree *mt, unsigned long first, mtree_unlock(mt); if (mas_is_err(&ms)) - return xa_err(ms.node); + ret = xa_err(ms.node); - return 0; + mas_destroy(&ms); + return ret; } EXPORT_SYMBOL(mtree_insert_range); @@ -6484,6 +6480,7 @@ int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp, unlock: mtree_unlock(mt); + mas_destroy(&mas); return ret; } EXPORT_SYMBOL(mtree_alloc_range); @@ -6565,6 +6562,7 @@ int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp, unlock: mtree_unlock(mt); + mas_destroy(&mas); return ret; } EXPORT_SYMBOL(mtree_alloc_rrange); @@ -6997,6 +6995,19 @@ void mt_set_non_kernel(unsigned int val) kmem_cache_set_non_kernel(maple_node_cache, val); } +extern void kmem_cache_set_callback(struct kmem_cache *cachep, + void (*callback)(void *)); +void mt_set_callback(void (*callback)(void *)) +{ + kmem_cache_set_callback(maple_node_cache, callback); +} + +extern void kmem_cache_set_private(struct kmem_cache *cachep, void *private); +void mt_set_private(void *private) +{ + kmem_cache_set_private(maple_node_cache, private); +} + extern unsigned long kmem_cache_get_alloc(struct kmem_cache *); unsigned long mt_get_alloc_size(void) { @@ -7181,7 +7192,6 @@ static void mt_dump_arange64(const struct maple_tree *mt, void *entry, enum mt_dump_format format) { struct maple_arange_64 *node = &mte_to_node(entry)->ma64; - bool leaf = mte_is_leaf(entry); unsigned long first = min; int i; @@ -7215,19 +7225,22 @@ static void mt_dump_arange64(const struct maple_tree *mt, void *entry, break; if (last == 0 && i > 0) break; - if (leaf) - mt_dump_entry(mt_slot(mt, node->slot, i), - first, last, depth + 1, format); - else if (node->slot[i]) + if (node->slot[i]) mt_dump_node(mt, mt_slot(mt, node->slot, i), first, last, depth + 1, format); if (last == max) break; if (last > max) { - pr_err("node %p last (%lu) > max (%lu) at pivot %d!\n", + switch(format) { + case mt_dump_hex: + pr_err("node %p last (%lx) > max (%lx) at pivot %d!\n", node, last, max, i); - break; + break; + case mt_dump_dec: + pr_err("node %p last (%lu) > max (%lu) at pivot %d!\n", + node, last, max, i); + } } first = last + 1; } @@ -7627,6 +7640,40 @@ void mas_dump(const struct ma_state *mas) break; } + pr_err("Store Type: "); + switch (mas->store_type) { + case wr_invalid: + pr_err("invalid store type\n"); + break; + case wr_new_root: + pr_err("new_root\n"); + break; + case wr_store_root: + pr_err("store_root\n"); + break; + case wr_exact_fit: + pr_err("exact_fit\n"); + break; + case wr_split_store: + pr_err("split_store\n"); + break; + case wr_slot_store: + pr_err("slot_store\n"); + break; + case wr_append: + pr_err("append\n"); + break; + case wr_node_store: + pr_err("node_store\n"); + break; + case wr_spanning_store: + pr_err("spanning_store\n"); + break; + case wr_rebalance: + pr_err("rebalance\n"); + break; + } + pr_err("[%u/%u] index=%lx last=%lx\n", mas->offset, mas->end, mas->index, mas->last); pr_err(" min=%lx max=%lx alloc=%p, depth=%u, flags=%x\n", diff --git a/lib/test_hmm.c b/lib/test_hmm.c index ee20e1f9bae9..056f2e411d7b 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -799,10 +799,7 @@ static int dmirror_exclusive(struct dmirror *dmirror, unsigned long mapped = 0; int i; - if (end < addr + (ARRAY_SIZE(pages) << PAGE_SHIFT)) - next = end; - else - next = addr + (ARRAY_SIZE(pages) << PAGE_SHIFT); + next = min(end, addr + (ARRAY_SIZE(pages) << PAGE_SHIFT)); ret = make_device_exclusive_range(mm, addr, next, pages, NULL); /* diff --git a/lib/test_printf.c b/lib/test_printf.c index 965cb6f28527..8448b6d02bd9 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -641,26 +641,12 @@ page_flags_test(int section, int node, int zone, int last_cpupid, test(cmp_buf, "%pGp", &flags); } -static void __init page_type_test(unsigned int page_type, const char *name, - char *cmp_buf) -{ - unsigned long size; - - size = scnprintf(cmp_buf, BUF_SIZE, "%#x(", page_type); - if (page_type_has_type(page_type)) - size += scnprintf(cmp_buf + size, BUF_SIZE - size, "%s", name); - - snprintf(cmp_buf + size, BUF_SIZE - size, ")"); - test(cmp_buf, "%pGt", &page_type); -} - static void __init flags(void) { unsigned long flags; char *cmp_buffer; gfp_t gfp; - unsigned int page_type; cmp_buffer = kmalloc(BUF_SIZE, GFP_KERNEL); if (!cmp_buffer) @@ -700,18 +686,6 @@ flags(void) gfp |= __GFP_HIGH; test(cmp_buffer, "%pGg", &gfp); - page_type = ~0; - page_type_test(page_type, "", cmp_buffer); - - page_type = 10; - page_type_test(page_type, "", cmp_buffer); - - page_type = ~PG_buddy; - page_type_test(page_type, "buddy", cmp_buffer); - - page_type = ~(PG_table | PG_buddy); - page_type_test(page_type, "table|buddy", cmp_buffer); - kfree(cmp_buffer); } diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 2d71b1115916..09f022ba1c05 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -2054,25 +2054,6 @@ char *format_page_flags(char *buf, char *end, unsigned long flags) return buf; } -static -char *format_page_type(char *buf, char *end, unsigned int page_type) -{ - buf = number(buf, end, page_type, default_flag_spec); - - if (buf < end) - *buf = '('; - buf++; - - if (page_type_has_type(page_type)) - buf = format_flags(buf, end, ~page_type, pagetype_names); - - if (buf < end) - *buf = ')'; - buf++; - - return buf; -} - static noinline_for_stack char *flags_string(char *buf, char *end, void *flags_ptr, struct printf_spec spec, const char *fmt) @@ -2086,8 +2067,6 @@ char *flags_string(char *buf, char *end, void *flags_ptr, switch (fmt[1]) { case 'p': return format_page_flags(buf, end, *(unsigned long *)flags_ptr); - case 't': - return format_page_type(buf, end, *(unsigned int *)flags_ptr); case 'v': flags = *(unsigned long *)flags_ptr; names = vmaflag_names; diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_compress.c index f620cafca633..16bb995bc6c4 100644 --- a/lib/zstd/compress/zstd_compress.c +++ b/lib/zstd/compress/zstd_compress.c @@ -4810,6 +4810,8 @@ ZSTD_CDict* ZSTD_createCDict_advanced2( dictLoadMethod, cctxParams.cParams, cctxParams.useRowMatchFinder, cctxParams.enableDedicatedDictSearch, customMem); + if (!cdict) + return NULL; if (ZSTD_isError( ZSTD_initCDict_internal(cdict, dict, dictSize, diff --git a/lib/zstd/zstd_compress_module.c b/lib/zstd/zstd_compress_module.c index 04e1b5c01d9b..bd8784449b31 100644 --- a/lib/zstd/zstd_compress_module.c +++ b/lib/zstd/zstd_compress_module.c @@ -66,6 +66,12 @@ int zstd_max_clevel(void) } EXPORT_SYMBOL(zstd_max_clevel); +int zstd_default_clevel(void) +{ + return ZSTD_defaultCLevel(); +} +EXPORT_SYMBOL(zstd_default_clevel); + size_t zstd_compress_bound(size_t src_size) { return ZSTD_compressBound(src_size); @@ -79,6 +85,13 @@ zstd_parameters zstd_get_params(int level, } EXPORT_SYMBOL(zstd_get_params); +zstd_compression_parameters zstd_get_cparams(int level, + unsigned long long estimated_src_size, size_t dict_size) +{ + return ZSTD_getCParams(level, estimated_src_size, dict_size); +} +EXPORT_SYMBOL(zstd_get_cparams); + size_t zstd_cctx_workspace_bound(const zstd_compression_parameters *cparams) { return ZSTD_estimateCCtxSize_usingCParams(*cparams); @@ -93,6 +106,33 @@ zstd_cctx *zstd_init_cctx(void *workspace, size_t workspace_size) } EXPORT_SYMBOL(zstd_init_cctx); +zstd_cctx *zstd_create_cctx_advanced(zstd_custom_mem custom_mem) +{ + return ZSTD_createCCtx_advanced(custom_mem); +} +EXPORT_SYMBOL(zstd_create_cctx_advanced); + +size_t zstd_free_cctx(zstd_cctx *cctx) +{ + return ZSTD_freeCCtx(cctx); +} +EXPORT_SYMBOL(zstd_free_cctx); + +zstd_cdict *zstd_create_cdict_byreference(const void *dict, size_t dict_size, + zstd_compression_parameters cparams, + zstd_custom_mem custom_mem) +{ + return ZSTD_createCDict_advanced(dict, dict_size, ZSTD_dlm_byRef, + ZSTD_dct_auto, cparams, custom_mem); +} +EXPORT_SYMBOL(zstd_create_cdict_byreference); + +size_t zstd_free_cdict(zstd_cdict *cdict) +{ + return ZSTD_freeCDict(cdict); +} +EXPORT_SYMBOL(zstd_free_cdict); + size_t zstd_compress_cctx(zstd_cctx *cctx, void *dst, size_t dst_capacity, const void *src, size_t src_size, const zstd_parameters *parameters) { @@ -101,6 +141,15 @@ size_t zstd_compress_cctx(zstd_cctx *cctx, void *dst, size_t dst_capacity, } EXPORT_SYMBOL(zstd_compress_cctx); +size_t zstd_compress_using_cdict(zstd_cctx *cctx, void *dst, + size_t dst_capacity, const void *src, size_t src_size, + const ZSTD_CDict *cdict) +{ + return ZSTD_compress_usingCDict(cctx, dst, dst_capacity, + src, src_size, cdict); +} +EXPORT_SYMBOL(zstd_compress_using_cdict); + size_t zstd_cstream_workspace_bound(const zstd_compression_parameters *cparams) { return ZSTD_estimateCStreamSize_usingCParams(*cparams); diff --git a/lib/zstd/zstd_decompress_module.c b/lib/zstd/zstd_decompress_module.c index f4ed952ed485..469fc3059be0 100644 --- a/lib/zstd/zstd_decompress_module.c +++ b/lib/zstd/zstd_decompress_module.c @@ -44,6 +44,33 @@ size_t zstd_dctx_workspace_bound(void) } EXPORT_SYMBOL(zstd_dctx_workspace_bound); +zstd_dctx *zstd_create_dctx_advanced(zstd_custom_mem custom_mem) +{ + return ZSTD_createDCtx_advanced(custom_mem); +} +EXPORT_SYMBOL(zstd_create_dctx_advanced); + +size_t zstd_free_dctx(zstd_dctx *dctx) +{ + return ZSTD_freeDCtx(dctx); +} +EXPORT_SYMBOL(zstd_free_dctx); + +zstd_ddict *zstd_create_ddict_byreference(const void *dict, size_t dict_size, + zstd_custom_mem custom_mem) +{ + return ZSTD_createDDict_advanced(dict, dict_size, ZSTD_dlm_byRef, + ZSTD_dct_auto, custom_mem); + +} +EXPORT_SYMBOL(zstd_create_ddict_byreference); + +size_t zstd_free_ddict(zstd_ddict *ddict) +{ + return ZSTD_freeDDict(ddict); +} +EXPORT_SYMBOL(zstd_free_ddict); + zstd_dctx *zstd_init_dctx(void *workspace, size_t workspace_size) { if (workspace == NULL) @@ -59,6 +86,15 @@ size_t zstd_decompress_dctx(zstd_dctx *dctx, void *dst, size_t dst_capacity, } EXPORT_SYMBOL(zstd_decompress_dctx); +size_t zstd_decompress_using_ddict(zstd_dctx *dctx, + void *dst, size_t dst_capacity, const void* src, size_t src_size, + const zstd_ddict* ddict) +{ + return ZSTD_decompress_usingDDict(dctx, dst, dst_capacity, src, + src_size, ddict); +} +EXPORT_SYMBOL(zstd_decompress_using_ddict); + size_t zstd_dstream_workspace_bound(size_t max_window_size) { return ZSTD_estimateDStreamSize(max_window_size); diff --git a/mm/Kconfig b/mm/Kconfig index b72e7d040f78..09aebca1cae3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -128,7 +128,7 @@ config ZSWAP_COMPRESSOR_DEFAULT choice prompt "Default allocator" depends on ZSWAP - default ZSWAP_ZPOOL_DEFAULT_ZSMALLOC if HAVE_ZSMALLOC + default ZSWAP_ZPOOL_DEFAULT_ZSMALLOC if MMU default ZSWAP_ZPOOL_DEFAULT_ZBUD help Selects the default allocator for the compressed cache for @@ -146,15 +146,17 @@ config ZSWAP_ZPOOL_DEFAULT_ZBUD help Use the zbud allocator as the default allocator. -config ZSWAP_ZPOOL_DEFAULT_Z3FOLD - bool "z3fold" - select Z3FOLD +config ZSWAP_ZPOOL_DEFAULT_Z3FOLD_DEPRECATED + bool "z3foldi (DEPRECATED)" + select Z3FOLD_DEPRECATED help Use the z3fold allocator as the default allocator. + Deprecated and scheduled for removal in a few cycles, + see CONFIG_Z3FOLD_DEPRECATED. + config ZSWAP_ZPOOL_DEFAULT_ZSMALLOC bool "zsmalloc" - depends on HAVE_ZSMALLOC select ZSMALLOC help Use the zsmalloc allocator as the default allocator. @@ -164,7 +166,7 @@ config ZSWAP_ZPOOL_DEFAULT string depends on ZSWAP default "zbud" if ZSWAP_ZPOOL_DEFAULT_ZBUD - default "z3fold" if ZSWAP_ZPOOL_DEFAULT_Z3FOLD + default "z3fold" if ZSWAP_ZPOOL_DEFAULT_Z3FOLD_DEPRECATED default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC default "" @@ -178,24 +180,29 @@ config ZBUD deterministic reclaim properties that make it preferable to a higher density approach when reclaim will be used. -config Z3FOLD - tristate "3:1 compression allocator (z3fold)" +config Z3FOLD_DEPRECATED + tristate "3:1 compression allocator (z3fold) (DEPRECATED)" depends on ZSWAP help + Deprecated and scheduled for removal in a few cycles. If you have + a good reason for using Z3FOLD over ZSMALLOC, please contact + linux-mm@kvack.org and the zswap maintainers. + A special purpose allocator for storing compressed pages. It is designed to store up to three compressed pages per physical page. It is a ZBUD derivative so the simplicity and determinism are still there. -config HAVE_ZSMALLOC - def_bool y - depends on MMU - depends on PAGE_SIZE_LESS_THAN_256KB # we want <= 64 KiB +config Z3FOLD + tristate + default y if Z3FOLD_DEPRECATED=y + default m if Z3FOLD_DEPRECATED=m + depends on Z3FOLD_DEPRECATED config ZSMALLOC tristate - prompt "N:1 compression allocator (zsmalloc)" if ZSWAP - depends on HAVE_ZSMALLOC + prompt "N:1 compression allocator (zsmalloc)" if (ZSWAP || ZRAM) + depends on MMU help zsmalloc is a slab-based memory allocator designed to store pages of various compression levels efficiently. It achieves @@ -585,17 +592,21 @@ config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE # at the same time (e.g. copy_page_range()). # DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page. # -config SPLIT_PTLOCK_CPUS - int - default "999999" if !MMU - default "999999" if ARM && !CPU_CACHE_VIPT - default "999999" if PARISC && !PA20 - default "999999" if SPARC32 - default "4" +config SPLIT_PTE_PTLOCKS + def_bool y + depends on MMU + depends on NR_CPUS >= 4 + depends on !ARM || CPU_CACHE_VIPT + depends on !PARISC || PA20 + depends on !SPARC32 config ARCH_ENABLE_SPLIT_PMD_PTLOCK bool +config SPLIT_PMD_PTLOCKS + def_bool y + depends on SPLIT_PTE_PTLOCKS && ARCH_ENABLE_SPLIT_PMD_PTLOCK + # # support for memory balloon config MEMORY_BALLOON @@ -877,6 +888,19 @@ endif # TRANSPARENT_HUGEPAGE config PGTABLE_HAS_HUGE_LEAVES def_bool TRANSPARENT_HUGEPAGE || HUGETLB_PAGE +# TODO: Allow to be enabled without THP +config ARCH_SUPPORTS_HUGE_PFNMAP + def_bool n + depends on TRANSPARENT_HUGEPAGE + +config ARCH_SUPPORTS_PMD_PFNMAP + def_bool y + depends on ARCH_SUPPORTS_HUGE_PFNMAP && HAVE_ARCH_TRANSPARENT_HUGEPAGE + +config ARCH_SUPPORTS_PUD_PFNMAP + def_bool y + depends on ARCH_SUPPORTS_HUGE_PFNMAP && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD + # # UP and nommu archs use km based percpu allocator # @@ -1081,13 +1105,10 @@ config ARCH_USES_HIGH_VMA_FLAGS config ARCH_HAS_PKEYS bool -config ARCH_USES_PG_ARCH_X +config ARCH_USES_PG_ARCH_2 + bool +config ARCH_USES_PG_ARCH_3 bool - help - Enable the definition of PG_arch_x page flags with x > 1. Only - suitable for 64-bit architectures with CONFIG_FLATMEM or - CONFIG_SPARSEMEM_VMEMMAP enabled, otherwise there may not be - enough room for additional bits in page->flags. config VM_EVENT_COUNTERS default y @@ -1263,6 +1284,17 @@ config IOMMU_MM_DATA config EXECMEM bool +config NUMA_MEMBLKS + bool + +config NUMA_EMU + bool "NUMA emulation" + depends on NUMA_MEMBLKS + help + Enable NUMA emulation. A flat machine will be split + into virtual nodes when booted with "numa=fake=N", where N is the + number of nodes. This is only useful for debugging. + source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index d2915f8c9dc0..d5639b036166 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -37,7 +37,7 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ msync.o page_vma_mapped.o pagewalk.o \ - pgtable-generic.o rmap.o vmalloc.o + pgtable-generic.o rmap.o vmalloc.o vma.o ifdef CONFIG_CROSS_MEMORY_ATTACH @@ -53,7 +53,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o swap.o truncate.o vmscan.o shrinker.o \ shmem.o util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o percpu.o slab_common.o \ - compaction.o show_mem.o shmem_quota.o\ + compaction.o show_mem.o \ interval_tree.o list_lru.o workingset.o \ debug.o gup.o mmap_lock.o $(mmu-y) @@ -117,6 +117,9 @@ obj-$(CONFIG_ZSMALLOC) += zsmalloc.o obj-$(CONFIG_Z3FOLD) += z3fold.o obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o obj-$(CONFIG_CMA) += cma.o +obj-$(CONFIG_NUMA) += numa.o +obj-$(CONFIG_NUMA_MEMBLKS) += numa_memblks.o +obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o @@ -141,3 +144,4 @@ obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o obj-$(CONFIG_EXECMEM) += execmem.o +obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o diff --git a/mm/cma.c b/mm/cma.c index 3e9724716bad..2d9fae939283 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -202,7 +202,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, cma->order_per_bit = order_per_bit; *res_cma = cma; cma_area_count++; - totalcma_pages += (size / PAGE_SIZE); + totalcma_pages += cma->count; return 0; } @@ -403,18 +403,8 @@ static void cma_debug_show_areas(struct cma *cma) spin_unlock_irq(&cma->lock); } -/** - * cma_alloc() - allocate pages from contiguous area - * @cma: Contiguous memory region for which the allocation is performed. - * @count: Requested number of pages. - * @align: Requested alignment of pages (in PAGE_SIZE order). - * @no_warn: Avoid printing message about failed allocation - * - * This function allocates part of contiguous memory on specific - * contiguous memory area. - */ -struct page *cma_alloc(struct cma *cma, unsigned long count, - unsigned int align, bool no_warn) +static struct page *__cma_alloc(struct cma *cma, unsigned long count, + unsigned int align, gfp_t gfp) { unsigned long mask, offset; unsigned long pfn = -1; @@ -463,8 +453,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); mutex_lock(&cma_mutex); - ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, - GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0)); + ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, gfp); mutex_unlock(&cma_mutex); if (ret == 0) { page = pfn_to_page(pfn); @@ -494,7 +483,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, page_kasan_tag_reset(nth_page(page, i)); } - if (ret && !no_warn) { + if (ret && !(gfp & __GFP_NOWARN)) { pr_err_ratelimited("%s: %s: alloc failed, req-size: %lu pages, ret: %d\n", __func__, cma->name, count, ret); cma_debug_show_areas(cma); @@ -513,6 +502,34 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, return page; } +/** + * cma_alloc() - allocate pages from contiguous area + * @cma: Contiguous memory region for which the allocation is performed. + * @count: Requested number of pages. + * @align: Requested alignment of pages (in PAGE_SIZE order). + * @no_warn: Avoid printing message about failed allocation + * + * This function allocates part of contiguous memory on specific + * contiguous memory area. + */ +struct page *cma_alloc(struct cma *cma, unsigned long count, + unsigned int align, bool no_warn) +{ + return __cma_alloc(cma, count, align, GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0)); +} + +struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp) +{ + struct page *page; + + if (WARN_ON(!order || !(gfp & __GFP_COMP))) + return NULL; + + page = __cma_alloc(cma, 1 << order, order, gfp); + + return page ? page_folio(page) : NULL; +} + bool cma_pages_valid(struct cma *cma, const struct page *pages, unsigned long count) { @@ -564,6 +581,14 @@ bool cma_release(struct cma *cma, const struct page *pages, return true; } +bool cma_free_folio(struct cma *cma, const struct folio *folio) +{ + if (WARN_ON(!folio_test_large(folio))) + return false; + + return cma_release(cma, &folio->page, folio_nr_pages(folio)); +} + int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data) { int i; diff --git a/mm/compaction.c b/mm/compaction.c index eb95e9b435d0..a2b16b08cbbf 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "internal.h" #ifdef CONFIG_COMPACTION @@ -86,33 +87,6 @@ static struct page *mark_allocated_noprof(struct page *page, unsigned int order, } #define mark_allocated(...) alloc_hooks(mark_allocated_noprof(__VA_ARGS__)) -static void split_map_pages(struct list_head *freepages) -{ - unsigned int i, order; - struct page *page, *next; - LIST_HEAD(tmp_list); - - for (order = 0; order < NR_PAGE_ORDERS; order++) { - list_for_each_entry_safe(page, next, &freepages[order], lru) { - unsigned int nr_pages; - - list_del(&page->lru); - - nr_pages = 1 << order; - - mark_allocated(page, order, __GFP_MOVABLE); - if (order) - split_page(page, order); - - for (i = 0; i < nr_pages; i++) { - list_add(&page->lru, &tmp_list); - page++; - } - } - list_splice_init(&tmp_list, &freepages[0]); - } -} - static unsigned long release_free_list(struct list_head *freepages) { int order; @@ -742,11 +716,11 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, * * Non-free pages, invalid PFNs, or zone boundaries within the * [start_pfn, end_pfn) range are considered errors, cause function to - * undo its actions and return zero. + * undo its actions and return zero. cc->freepages[] are empty. * * Otherwise, function returns one-past-the-last PFN of isolated page * (which may be greater then end_pfn if end fell in a middle of - * a free page). + * a free page). cc->freepages[] contain free pages isolated. */ unsigned long isolate_freepages_range(struct compact_control *cc, @@ -754,10 +728,9 @@ isolate_freepages_range(struct compact_control *cc, { unsigned long isolated, pfn, block_start_pfn, block_end_pfn; int order; - struct list_head tmp_freepages[NR_PAGE_ORDERS]; for (order = 0; order < NR_PAGE_ORDERS; order++) - INIT_LIST_HEAD(&tmp_freepages[order]); + INIT_LIST_HEAD(&cc->freepages[order]); pfn = start_pfn; block_start_pfn = pageblock_start_pfn(pfn); @@ -788,7 +761,7 @@ isolate_freepages_range(struct compact_control *cc, break; isolated = isolate_freepages_block(cc, &isolate_start_pfn, - block_end_pfn, tmp_freepages, 0, true); + block_end_pfn, cc->freepages, 0, true); /* * In strict mode, isolate_freepages_block() returns 0 if @@ -807,13 +780,10 @@ isolate_freepages_range(struct compact_control *cc, if (pfn < end_pfn) { /* Loop terminated early, cleanup. */ - release_free_list(tmp_freepages); + release_free_list(cc->freepages); return 0; } - /* __isolate_free_page() does not map the pages */ - split_map_pages(tmp_freepages); - /* We don't use freelists for anything. */ return pfn; } @@ -2853,6 +2823,11 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, ac->highest_zoneidx, ac->nodemask) { enum compact_result status; + if (cpusets_enabled() && + (alloc_flags & ALLOC_CPUSET) && + !__cpuset_zone_allowed(zone, gfp_mask)) + continue; + if (prio > MIN_COMPACT_PRIORITY && compaction_deferred(zone, order)) { rc = max_t(enum compact_result, COMPACT_DEFERRED, rc); diff --git a/mm/damon/core.c b/mm/damon/core.c index 7a87628b76ab..a83f3b736d51 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -552,7 +552,13 @@ static unsigned int damon_accesses_bp_to_nr_accesses( return accesses_bp * damon_max_nr_accesses(attrs) / 10000; } -/* convert nr_accesses to access ratio in bp (per 10,000) */ +/* + * Convert nr_accesses to access ratio in bp (per 10,000). + * + * Callers should ensure attrs.aggr_interval is not zero, like + * damon_update_monitoring_results() does . Otherwise, divide-by-zero would + * happen. + */ static unsigned int damon_nr_accesses_to_accesses_bp( unsigned int nr_accesses, struct damon_attrs *attrs) { @@ -1582,13 +1588,16 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) return; /* Fill up the score histogram */ - memset(quota->histogram, 0, sizeof(quota->histogram)); + memset(c->regions_score_histogram, 0, + sizeof(*c->regions_score_histogram) * + (DAMOS_MAX_SCORE + 1)); damon_for_each_target(t, c) { damon_for_each_region(r, t) { if (!__damos_valid_target(r, s)) continue; score = c->ops.get_scheme_score(c, t, r, s); - quota->histogram[score] += damon_sz_region(r); + c->regions_score_histogram[score] += + damon_sz_region(r); if (score > max_score) max_score = score; } @@ -1596,7 +1605,7 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) /* Set the min score limit */ for (cumulated_sz = 0, score = max_score; ; score--) { - cumulated_sz += quota->histogram[score]; + cumulated_sz += c->regions_score_histogram[score]; if (cumulated_sz >= quota->esz || !score) break; } @@ -1957,6 +1966,10 @@ static int kdamond_fn(void *data) ctx->ops.init(ctx); if (ctx->callback.before_start && ctx->callback.before_start(ctx)) goto done; + ctx->regions_score_histogram = kmalloc_array(DAMOS_MAX_SCORE + 1, + sizeof(*ctx->regions_score_histogram), GFP_KERNEL); + if (!ctx->regions_score_histogram) + goto done; sz_limit = damon_region_sz_limit(ctx); @@ -2034,6 +2047,7 @@ static int kdamond_fn(void *data) ctx->callback.before_terminate(ctx); if (ctx->ops.cleanup) ctx->ops.cleanup(ctx); + kfree(ctx->regions_score_histogram); pr_debug("kdamond (%d) finishes\n", current->pid); mutex_lock(&ctx->kdamond_lock); @@ -2205,4 +2219,4 @@ static int __init damon_init(void) subsys_initcall(damon_init); -#include "core-test.h" +#include "tests/core-kunit.h" diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 51a6f1cac385..b4213bc47e44 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -1145,4 +1145,4 @@ static int __init damon_dbgfs_init(void) module_init(damon_dbgfs_init); -#include "dbgfs-test.h" +#include "tests/dbgfs-kunit.h" diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index cffc755e7775..58145d59881d 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1882,4 +1882,4 @@ static int __init damon_sysfs_init(void) } subsys_initcall(damon_sysfs_init); -#include "sysfs-test.h" +#include "tests/sysfs-kunit.h" diff --git a/mm/damon/tests/.kunitconfig b/mm/damon/tests/.kunitconfig new file mode 100644 index 000000000000..a73be044fc9b --- /dev/null +++ b/mm/damon/tests/.kunitconfig @@ -0,0 +1,22 @@ +# for DAMON core +CONFIG_KUNIT=y +CONFIG_DAMON=y +CONFIG_DAMON_KUNIT_TEST=y + +# for DAMON vaddr ops +CONFIG_MMU=y +CONFIG_PAGE_IDLE_FLAG=y +CONFIG_DAMON_VADDR=y +CONFIG_DAMON_VADDR_KUNIT_TEST=y + +# for DAMON sysfs interface +CONFIG_SYSFS=y +CONFIG_DAMON_SYSFS=y +CONFIG_DAMON_SYSFS_KUNIT_TEST=y + +# for DAMON debugfs interface +CONFIG_DEBUG_FS=y +CONFIG_DAMON_PADDR=y +CONFIG_DAMON_DBGFS_DEPRECATED=y +CONFIG_DAMON_DBGFS=y +CONFIG_DAMON_DBGFS_KUNIT_TEST=y diff --git a/mm/damon/core-test.h b/mm/damon/tests/core-kunit.h similarity index 93% rename from mm/damon/core-test.h rename to mm/damon/tests/core-kunit.h index 0cee634f3544..cf22e09a3507 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/tests/core-kunit.h @@ -246,16 +246,20 @@ static void damon_test_split_regions_of(struct kunit *test) static void damon_test_ops_registration(struct kunit *test) { struct damon_ctx *c = damon_new_ctx(); - struct damon_operations ops, bak; + struct damon_operations ops = {.id = DAMON_OPS_VADDR}, bak; + bool need_cleanup = false; - /* DAMON_OPS_{V,P}ADDR are registered on subsys_initcall */ + /* DAMON_OPS_VADDR is registered only if CONFIG_DAMON_VADDR is set */ + if (!damon_is_registered_ops(DAMON_OPS_VADDR)) { + bak.id = DAMON_OPS_VADDR; + KUNIT_EXPECT_EQ(test, damon_register_ops(&bak), 0); + need_cleanup = true; + } + + /* DAMON_OPS_VADDR is ensured to be registered */ KUNIT_EXPECT_EQ(test, damon_select_ops(c, DAMON_OPS_VADDR), 0); - KUNIT_EXPECT_EQ(test, damon_select_ops(c, DAMON_OPS_PADDR), 0); /* Double-registration is prohibited */ - ops.id = DAMON_OPS_VADDR; - KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL); - ops.id = DAMON_OPS_PADDR; KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL); /* Unknown ops id cannot be registered */ @@ -278,6 +282,13 @@ static void damon_test_ops_registration(struct kunit *test) KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL); damon_destroy_ctx(c); + + if (need_cleanup) { + mutex_lock(&damon_ops_lock); + damon_registered_ops[DAMON_OPS_VADDR] = + (struct damon_operations){}; + mutex_unlock(&damon_ops_lock); + } } static void damon_test_set_regions(struct kunit *test) @@ -309,6 +320,18 @@ static void damon_test_nr_accesses_to_accesses_bp(struct kunit *test) .aggr_interval = ((unsigned long)UINT_MAX + 1) * 10 }; + /* + * In some cases such as 32bit architectures where UINT_MAX is + * ULONG_MAX, attrs.aggr_interval becomes zero. Calling + * damon_nr_accesses_to_accesses_bp() in the case will cause + * divide-by-zero. Such case is prohibited in normal execution since + * the caution is documented on the comment for the function, and + * damon_update_monitoring_results() does the check. Skip the test in + * the case. + */ + if (!attrs.aggr_interval) + kunit_skip(test, "aggr_interval is zero."); + KUNIT_EXPECT_EQ(test, damon_nr_accesses_to_accesses_bp(123, &attrs), 0); } diff --git a/mm/damon/dbgfs-test.h b/mm/damon/tests/dbgfs-kunit.h similarity index 94% rename from mm/damon/dbgfs-test.h rename to mm/damon/tests/dbgfs-kunit.h index 2d85217f5ba4..d2ecfcc8db86 100644 --- a/mm/damon/dbgfs-test.h +++ b/mm/damon/tests/dbgfs-kunit.h @@ -73,6 +73,11 @@ static void damon_dbgfs_test_set_targets(struct kunit *test) struct damon_ctx *ctx = dbgfs_new_ctx(); char buf[64]; + if (!damon_is_registered_ops(DAMON_OPS_PADDR)) { + dbgfs_destroy_ctx(ctx); + kunit_skip(test, "PADDR not registered"); + } + /* Make DAMON consider target has no pid */ damon_select_ops(ctx, DAMON_OPS_PADDR); @@ -111,6 +116,11 @@ static void damon_dbgfs_test_set_init_regions(struct kunit *test) int i, rc; char buf[256]; + if (!damon_is_registered_ops(DAMON_OPS_PADDR)) { + damon_destroy_ctx(ctx); + kunit_skip(test, "PADDR not registered"); + } + damon_select_ops(ctx, DAMON_OPS_PADDR); dbgfs_set_targets(ctx, 3, NULL); diff --git a/mm/damon/sysfs-test.h b/mm/damon/tests/sysfs-kunit.h similarity index 100% rename from mm/damon/sysfs-test.h rename to mm/damon/tests/sysfs-kunit.h diff --git a/mm/damon/vaddr-test.h b/mm/damon/tests/vaddr-kunit.h similarity index 99% rename from mm/damon/vaddr-test.h rename to mm/damon/tests/vaddr-kunit.h index 83626483f82b..a339d117150f 100644 --- a/mm/damon/vaddr-test.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -77,7 +77,7 @@ static void damon_test_three_regions_in_vmas(struct kunit *test) (struct vm_area_struct) {.vm_start = 307, .vm_end = 330}, }; - mt_init_flags(&mm.mm_mt, MM_MT_FLAGS); + mt_init_flags(&mm.mm_mt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_USE_RCU); if (__link_vmas(&mm.mm_mt, vmas, ARRAY_SIZE(vmas))) kunit_skip(test, "Failed to create VMA tree"); diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index a0036dc78a3b..08cfd22b5249 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -732,4 +732,4 @@ static int __init damon_va_initcall(void) subsys_initcall(damon_va_initcall); -#include "vaddr-test.h" +#include "tests/vaddr-kunit.h" diff --git a/mm/debug.c b/mm/debug.c index 69e524c3e601..aa57d3ffd4ed 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -36,11 +36,6 @@ const struct trace_print_flags pageflag_names[] = { {0, NULL} }; -const struct trace_print_flags pagetype_names[] = { - __def_pagetype_names, - {0, NULL} -}; - const struct trace_print_flags gfpflag_names[] = { __def_gfpflag_names, {0, NULL} @@ -51,6 +46,27 @@ const struct trace_print_flags vmaflag_names[] = { {0, NULL} }; +#define DEF_PAGETYPE_NAME(_name) [PGTY_##_name - 0xf0] = __stringify(_name) + +static const char *page_type_names[] = { + DEF_PAGETYPE_NAME(slab), + DEF_PAGETYPE_NAME(hugetlb), + DEF_PAGETYPE_NAME(offline), + DEF_PAGETYPE_NAME(guard), + DEF_PAGETYPE_NAME(table), + DEF_PAGETYPE_NAME(buddy), + DEF_PAGETYPE_NAME(unaccepted), +}; + +static const char *page_type_name(unsigned int page_type) +{ + unsigned i = (page_type >> 24) - 0xf0; + + if (i >= ARRAY_SIZE(page_type_names)) + return "unknown"; + return page_type_names[i]; +} + static void __dump_folio(struct folio *folio, struct page *page, unsigned long pfn, unsigned long idx) { @@ -58,7 +74,7 @@ static void __dump_folio(struct folio *folio, struct page *page, int mapcount = atomic_read(&page->_mapcount); char *type = ""; - mapcount = page_type_has_type(mapcount) ? 0 : mapcount + 1; + mapcount = page_mapcount_is_type(mapcount) ? 0 : mapcount + 1; pr_warn("page: refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n", folio_ref_count(folio), mapcount, mapping, folio->index + idx, pfn); @@ -92,7 +108,8 @@ static void __dump_folio(struct folio *folio, struct page *page, pr_warn("%sflags: %pGp%s\n", type, &folio->flags, is_migrate_cma_folio(folio, pfn) ? " CMA" : ""); if (page_has_type(&folio->page)) - pr_warn("page_type: %pGt\n", &folio->page.page_type); + pr_warn("page_type: %x(%s)\n", folio->page.page_type >> 24, + page_type_name(folio->page.page_type)); print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32, sizeof(unsigned long), page, diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index e4969fb54da3..bc748f700a9e 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -231,10 +231,10 @@ static void __init pmd_advanced_tests(struct pgtable_debug_args *args) set_pmd_at(args->mm, vaddr, args->pmdp, pmd); flush_dcache_page(page); pmdp_set_wrprotect(args->mm, vaddr, args->pmdp); - pmd = READ_ONCE(*args->pmdp); + pmd = pmdp_get(args->pmdp); WARN_ON(pmd_write(pmd)); pmdp_huge_get_and_clear(args->mm, vaddr, args->pmdp); - pmd = READ_ONCE(*args->pmdp); + pmd = pmdp_get(args->pmdp); WARN_ON(!pmd_none(pmd)); pmd = pfn_pmd(args->pmd_pfn, args->page_prot); @@ -245,10 +245,10 @@ static void __init pmd_advanced_tests(struct pgtable_debug_args *args) pmd = pmd_mkwrite(pmd, args->vma); pmd = pmd_mkdirty(pmd); pmdp_set_access_flags(args->vma, vaddr, args->pmdp, pmd, 1); - pmd = READ_ONCE(*args->pmdp); + pmd = pmdp_get(args->pmdp); WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd))); pmdp_huge_get_and_clear_full(args->vma, vaddr, args->pmdp, 1); - pmd = READ_ONCE(*args->pmdp); + pmd = pmdp_get(args->pmdp); WARN_ON(!pmd_none(pmd)); pmd = pmd_mkhuge(pfn_pmd(args->pmd_pfn, args->page_prot)); @@ -256,7 +256,7 @@ static void __init pmd_advanced_tests(struct pgtable_debug_args *args) set_pmd_at(args->mm, vaddr, args->pmdp, pmd); flush_dcache_page(page); pmdp_test_and_clear_young(args->vma, vaddr, args->pmdp); - pmd = READ_ONCE(*args->pmdp); + pmd = pmdp_get(args->pmdp); WARN_ON(pmd_young(pmd)); /* Clear the pte entries */ @@ -357,12 +357,12 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args) set_pud_at(args->mm, vaddr, args->pudp, pud); flush_dcache_page(page); pudp_set_wrprotect(args->mm, vaddr, args->pudp); - pud = READ_ONCE(*args->pudp); + pud = pudp_get(args->pudp); WARN_ON(pud_write(pud)); #ifndef __PAGETABLE_PMD_FOLDED pudp_huge_get_and_clear(args->mm, vaddr, args->pudp); - pud = READ_ONCE(*args->pudp); + pud = pudp_get(args->pudp); WARN_ON(!pud_none(pud)); #endif /* __PAGETABLE_PMD_FOLDED */ pud = pfn_pud(args->pud_pfn, args->page_prot); @@ -374,12 +374,12 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args) pud = pud_mkwrite(pud); pud = pud_mkdirty(pud); pudp_set_access_flags(args->vma, vaddr, args->pudp, pud, 1); - pud = READ_ONCE(*args->pudp); + pud = pudp_get(args->pudp); WARN_ON(!(pud_write(pud) && pud_dirty(pud))); #ifndef __PAGETABLE_PMD_FOLDED pudp_huge_get_and_clear_full(args->vma, vaddr, args->pudp, 1); - pud = READ_ONCE(*args->pudp); + pud = pudp_get(args->pudp); WARN_ON(!pud_none(pud)); #endif /* __PAGETABLE_PMD_FOLDED */ @@ -389,7 +389,7 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args) set_pud_at(args->mm, vaddr, args->pudp, pud); flush_dcache_page(page); pudp_test_and_clear_young(args->vma, vaddr, args->pudp); - pud = READ_ONCE(*args->pudp); + pud = pudp_get(args->pudp); WARN_ON(pud_young(pud)); pudp_huge_get_and_clear(args->mm, vaddr, args->pudp); @@ -441,7 +441,7 @@ static void __init pmd_huge_tests(struct pgtable_debug_args *args) WRITE_ONCE(*args->pmdp, __pmd(0)); WARN_ON(!pmd_set_huge(args->pmdp, __pfn_to_phys(args->fixed_pmd_pfn), args->page_prot)); WARN_ON(!pmd_clear_huge(args->pmdp)); - pmd = READ_ONCE(*args->pmdp); + pmd = pmdp_get(args->pmdp); WARN_ON(!pmd_none(pmd)); } @@ -461,7 +461,7 @@ static void __init pud_huge_tests(struct pgtable_debug_args *args) WRITE_ONCE(*args->pudp, __pud(0)); WARN_ON(!pud_set_huge(args->pudp, __pfn_to_phys(args->fixed_pud_pfn), args->page_prot)); WARN_ON(!pud_clear_huge(args->pudp)); - pud = READ_ONCE(*args->pudp); + pud = pudp_get(args->pudp); WARN_ON(!pud_none(pud)); } #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ @@ -490,7 +490,7 @@ static void __init pgd_basic_tests(struct pgtable_debug_args *args) #ifndef __PAGETABLE_PUD_FOLDED static void __init pud_clear_tests(struct pgtable_debug_args *args) { - pud_t pud = READ_ONCE(*args->pudp); + pud_t pud = pudp_get(args->pudp); if (mm_pmd_folded(args->mm)) return; @@ -498,7 +498,7 @@ static void __init pud_clear_tests(struct pgtable_debug_args *args) pr_debug("Validating PUD clear\n"); WARN_ON(pud_none(pud)); pud_clear(args->pudp); - pud = READ_ONCE(*args->pudp); + pud = pudp_get(args->pudp); WARN_ON(!pud_none(pud)); } @@ -515,7 +515,7 @@ static void __init pud_populate_tests(struct pgtable_debug_args *args) * Hence this must not qualify as pud_bad(). */ pud_populate(args->mm, args->pudp, args->start_pmdp); - pud = READ_ONCE(*args->pudp); + pud = pudp_get(args->pudp); WARN_ON(pud_bad(pud)); } #else /* !__PAGETABLE_PUD_FOLDED */ @@ -526,7 +526,7 @@ static void __init pud_populate_tests(struct pgtable_debug_args *args) { } #ifndef __PAGETABLE_P4D_FOLDED static void __init p4d_clear_tests(struct pgtable_debug_args *args) { - p4d_t p4d = READ_ONCE(*args->p4dp); + p4d_t p4d = p4dp_get(args->p4dp); if (mm_pud_folded(args->mm)) return; @@ -534,7 +534,7 @@ static void __init p4d_clear_tests(struct pgtable_debug_args *args) pr_debug("Validating P4D clear\n"); WARN_ON(p4d_none(p4d)); p4d_clear(args->p4dp); - p4d = READ_ONCE(*args->p4dp); + p4d = p4dp_get(args->p4dp); WARN_ON(!p4d_none(p4d)); } @@ -553,13 +553,13 @@ static void __init p4d_populate_tests(struct pgtable_debug_args *args) pud_clear(args->pudp); p4d_clear(args->p4dp); p4d_populate(args->mm, args->p4dp, args->start_pudp); - p4d = READ_ONCE(*args->p4dp); + p4d = p4dp_get(args->p4dp); WARN_ON(p4d_bad(p4d)); } static void __init pgd_clear_tests(struct pgtable_debug_args *args) { - pgd_t pgd = READ_ONCE(*(args->pgdp)); + pgd_t pgd = pgdp_get(args->pgdp); if (mm_p4d_folded(args->mm)) return; @@ -567,7 +567,7 @@ static void __init pgd_clear_tests(struct pgtable_debug_args *args) pr_debug("Validating PGD clear\n"); WARN_ON(pgd_none(pgd)); pgd_clear(args->pgdp); - pgd = READ_ONCE(*args->pgdp); + pgd = pgdp_get(args->pgdp); WARN_ON(!pgd_none(pgd)); } @@ -586,7 +586,7 @@ static void __init pgd_populate_tests(struct pgtable_debug_args *args) p4d_clear(args->p4dp); pgd_clear(args->pgdp); pgd_populate(args->mm, args->pgdp, args->start_p4dp); - pgd = READ_ONCE(*args->pgdp); + pgd = pgdp_get(args->pgdp); WARN_ON(pgd_bad(pgd)); } #else /* !__PAGETABLE_P4D_FOLDED */ @@ -627,12 +627,12 @@ static void __init pte_clear_tests(struct pgtable_debug_args *args) static void __init pmd_clear_tests(struct pgtable_debug_args *args) { - pmd_t pmd = READ_ONCE(*args->pmdp); + pmd_t pmd = pmdp_get(args->pmdp); pr_debug("Validating PMD clear\n"); WARN_ON(pmd_none(pmd)); pmd_clear(args->pmdp); - pmd = READ_ONCE(*args->pmdp); + pmd = pmdp_get(args->pmdp); WARN_ON(!pmd_none(pmd)); } @@ -646,7 +646,7 @@ static void __init pmd_populate_tests(struct pgtable_debug_args *args) * Hence this must not qualify as pmd_bad(). */ pmd_populate(args->mm, args->pmdp, args->start_ptep); - pmd = READ_ONCE(*args->pmdp); + pmd = pmdp_get(args->pmdp); WARN_ON(pmd_bad(pmd)); } @@ -1251,7 +1251,7 @@ static int __init init_args(struct pgtable_debug_args *args) ret = -ENOMEM; goto error; } - args->start_ptep = pmd_pgtable(READ_ONCE(*args->pmdp)); + args->start_ptep = pmd_pgtable(pmdp_get(args->pmdp)); WARN_ON(!args->start_ptep); init_fixed_pfns(args); diff --git a/mm/filemap.c b/mm/filemap.c index 65c515e7bbf0..4f3753f0a158 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -112,8 +113,8 @@ * ->swap_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) * ->i_pages lock (try_to_unmap_one) - * ->lruvec->lru_lock (follow_page->mark_page_accessed) - * ->lruvec->lru_lock (check_pte_range->isolate_lru_page) + * ->lruvec->lru_lock (follow_page_mask->mark_page_accessed) + * ->lruvec->lru_lock (check_pte_range->folio_isolate_lru) * ->private_lock (folio_remove_rmap_pte->set_page_dirty) * ->i_pages lock (folio_remove_rmap_pte->set_page_dirty) * bdi.wb->list_lock (folio_remove_rmap_pte->set_page_dirty) @@ -530,7 +531,6 @@ static void __filemap_fdatawait_range(struct address_space *mapping, struct folio *folio = fbatch.folios[i]; folio_wait_writeback(folio); - folio_clear_error(folio); } folio_batch_release(&fbatch); cond_resched(); @@ -2049,17 +2049,20 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, if (!folio_batch_add(fbatch, folio)) break; } - rcu_read_unlock(); if (folio_batch_count(fbatch)) { - unsigned long nr = 1; + unsigned long nr; int idx = folio_batch_count(fbatch) - 1; folio = fbatch->folios[idx]; if (!xa_is_value(folio)) nr = folio_nr_pages(folio); - *start = indices[idx] + nr; + else + nr = 1 << xa_get_order(&mapping->i_pages, indices[idx]); + *start = round_down(indices[idx] + nr, nr); } + rcu_read_unlock(); + return folio_batch_count(fbatch); } @@ -2091,10 +2094,17 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, rcu_read_lock(); while ((folio = find_get_entry(&xas, end, XA_PRESENT))) { + unsigned long base; + unsigned long nr; + if (!xa_is_value(folio)) { - if (folio->index < *start) + nr = folio_nr_pages(folio); + base = folio->index; + /* Omit large folio which begins before the start */ + if (base < *start) goto put; - if (folio_next_index(folio) - 1 > end) + /* Omit large folio which extends beyond the end */ + if (base + nr - 1 > end) goto put; if (!folio_trylock(folio)) goto put; @@ -2103,7 +2113,19 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, goto unlock; VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index), folio); + } else { + nr = 1 << xas_get_order(&xas); + base = xas.xa_index & ~(nr - 1); + /* Omit order>0 value which begins before the start */ + if (base < *start) + continue; + /* Omit order>0 value which extends beyond the end */ + if (base + nr - 1 > end) + break; } + + /* Update start now so that last update is correct on return */ + *start = base + nr; indices[fbatch->nr] = xas.xa_index; if (!folio_batch_add(fbatch, folio)) break; @@ -2115,15 +2137,6 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, } rcu_read_unlock(); - if (folio_batch_count(fbatch)) { - unsigned long nr = 1; - int idx = folio_batch_count(fbatch) - 1; - - folio = fbatch->folios[idx]; - if (!xa_is_value(folio)) - nr = folio_nr_pages(folio); - *start = indices[idx] + nr; - } return folio_batch_count(fbatch); } @@ -2344,13 +2357,6 @@ static int filemap_read_folio(struct file *file, filler_t filler, unsigned long pflags; int error; - /* - * A previous I/O error may have been due to temporary failures, - * eg. multipath errors. PG_error will be set again if read_folio - * fails. - */ - folio_clear_error(folio); - /* Start the actual read. The read will unlock the page. */ if (unlikely(workingset)) psi_memstall_enter(&pflags); @@ -2519,6 +2525,7 @@ static int filemap_get_pages(struct kiocb *iocb, size_t count, pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; pgoff_t last_index; struct folio *folio; + unsigned int flags; int err = 0; /* "last_index" is the index of the page beyond the end of the read */ @@ -2531,8 +2538,12 @@ static int filemap_get_pages(struct kiocb *iocb, size_t count, if (!folio_batch_count(fbatch)) { if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; + if (iocb->ki_flags & IOCB_NOWAIT) + flags = memalloc_noio_save(); page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); + if (iocb->ki_flags & IOCB_NOWAIT) + memalloc_noio_restore(flags); filemap_get_read_batch(mapping, index, last_index - 1, fbatch); } if (!folio_batch_count(fbatch)) { @@ -2560,6 +2571,7 @@ static int filemap_get_pages(struct kiocb *iocb, size_t count, goto err; } + trace_mm_filemap_get_pages(mapping, index, last_index - 1); return 0; err: if (err < 0) @@ -3000,7 +3012,7 @@ static inline loff_t folio_seek_hole_data(struct xa_state *xas, static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio) { if (xa_is_value(folio)) - return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index); + return PAGE_SIZE << xas_get_order(xas); return folio_size(folio); } @@ -3298,6 +3310,8 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) if (unlikely(index >= max_idx)) return VM_FAULT_SIGBUS; + trace_mm_filemap_fault(mapping, index); + /* * Do we have something in the page cache already? */ @@ -3668,6 +3682,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, } while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL); add_mm_counter(vma->vm_mm, folio_type, rss); pte_unmap_unlock(vmf->pte, vmf->ptl); + trace_mm_filemap_map_pages(mapping, start_pgoff, end_pgoff); out: rcu_read_unlock(); @@ -4297,7 +4312,7 @@ static void filemap_cachestat(struct address_space *mapping, if (xas_retry(&xas, folio)) continue; - order = xa_get_order(xas.xa, xas.xa_index); + order = xas_get_order(&xas); nr_pages = 1 << order; folio_first_index = round_down(xas.xa_index, 1 << order); folio_last_index = folio_first_index + nr_pages - 1; diff --git a/mm/folio-compat.c b/mm/folio-compat.c index f05906006b3c..80746182e9e8 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -92,15 +92,3 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, mapping_gfp_mask(mapping)); } EXPORT_SYMBOL(grab_cache_page_write_begin); - -bool isolate_lru_page(struct page *page) -{ - if (WARN_RATELIMIT(PageTail(page), "trying to isolate tail page")) - return false; - return folio_isolate_lru((struct folio *)page); -} - -void putback_lru_page(struct page *page) -{ - folio_putback_lru(page_folio(page)); -} diff --git a/mm/gup.c b/mm/gup.c index 02c46ae33028..8232c8c9c372 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -832,6 +832,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, struct dev_pagemap **pgmap) { struct mm_struct *mm = vma->vm_mm; + struct folio *folio; struct page *page; spinlock_t *ptl; pte_t *ptep, pte; @@ -889,6 +890,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, goto out; } } + folio = page_folio(page); if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) { page = ERR_PTR(-EMLINK); @@ -899,7 +901,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, !PageAnonExclusive(page), page); /* try_grab_folio() does nothing unless FOLL_GET or FOLL_PIN is set. */ - ret = try_grab_folio(page_folio(page), 1, flags); + ret = try_grab_folio(folio, 1, flags); if (unlikely(ret)) { page = ERR_PTR(ret); goto out; @@ -911,7 +913,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, * Documentation/core-api/pin_user_pages.rst for details. */ if (flags & FOLL_PIN) { - ret = arch_make_page_accessible(page); + ret = arch_make_folio_accessible(folio); if (ret) { unpin_user_page(page); page = ERR_PTR(ret); @@ -1083,28 +1085,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma, return page; } -struct page *follow_page(struct vm_area_struct *vma, unsigned long address, - unsigned int foll_flags) -{ - struct follow_page_context ctx = { NULL }; - struct page *page; - - if (vma_is_secretmem(vma)) - return NULL; - - if (WARN_ON_ONCE(foll_flags & FOLL_PIN)) - return NULL; - - /* - * We never set FOLL_HONOR_NUMA_FAULT because callers don't expect - * to fail on PROT_NONE-mapped pages. - */ - page = follow_page_mask(vma, address, foll_flags, &ctx); - if (ctx.pgmap) - put_dev_pagemap(ctx.pgmap); - return page; -} - static int get_gate_page(struct mm_struct *mm, unsigned long address, unsigned int gup_flags, struct vm_area_struct **vma, struct page **page) @@ -1166,19 +1146,19 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, * to 0 and -EBUSY returned. */ static int faultin_page(struct vm_area_struct *vma, - unsigned long address, unsigned int *flags, bool unshare, + unsigned long address, unsigned int flags, bool unshare, int *locked) { unsigned int fault_flags = 0; vm_fault_t ret; - if (*flags & FOLL_NOFAULT) + if (flags & FOLL_NOFAULT) return -EFAULT; - if (*flags & FOLL_WRITE) + if (flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; - if (*flags & FOLL_REMOTE) + if (flags & FOLL_REMOTE) fault_flags |= FAULT_FLAG_REMOTE; - if (*flags & FOLL_UNLOCKABLE) { + if (flags & FOLL_UNLOCKABLE) { fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; /* * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set @@ -1186,12 +1166,12 @@ static int faultin_page(struct vm_area_struct *vma, * That's because some callers may not be prepared to * handle early exits caused by non-fatal signals. */ - if (*flags & FOLL_INTERRUPTIBLE) + if (flags & FOLL_INTERRUPTIBLE) fault_flags |= FAULT_FLAG_INTERRUPTIBLE; } - if (*flags & FOLL_NOWAIT) + if (flags & FOLL_NOWAIT) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; - if (*flags & FOLL_TRIED) { + if (flags & FOLL_TRIED) { /* * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED * can co-exist @@ -1225,7 +1205,7 @@ static int faultin_page(struct vm_area_struct *vma, } if (ret & VM_FAULT_ERROR) { - int err = vm_fault_to_errno(ret, *flags); + int err = vm_fault_to_errno(ret, flags); if (err) return err; @@ -1450,7 +1430,6 @@ static long __get_user_pages(struct mm_struct *mm, do { struct page *page; - unsigned int foll_flags = gup_flags; unsigned int page_increm; /* first iteration or cross vma bound */ @@ -1501,9 +1480,9 @@ static long __get_user_pages(struct mm_struct *mm, } cond_resched(); - page = follow_page_mask(vma, start, foll_flags, &ctx); + page = follow_page_mask(vma, start, gup_flags, &ctx); if (!page || PTR_ERR(page) == -EMLINK) { - ret = faultin_page(vma, start, &foll_flags, + ret = faultin_page(vma, start, gup_flags, PTR_ERR(page) == -EMLINK, locked); switch (ret) { case 0: @@ -1560,13 +1539,12 @@ static long __get_user_pages(struct mm_struct *mm, * large folio, this should never fail. */ if (try_grab_folio(folio, page_increm - 1, - foll_flags)) { + gup_flags)) { /* * Release the 1st page ref if the * folio is problematic, fail hard. */ - gup_put_folio(folio, 1, - foll_flags); + gup_put_folio(folio, 1, gup_flags); ret = -EFAULT; goto out; } @@ -2370,7 +2348,7 @@ static int migrate_longterm_unpinnable_folios( folio_get(folio); gup_put_folio(folio, 1, FOLL_PIN); - if (migrate_device_coherent_page(&folio->page)) { + if (migrate_device_coherent_folio(folio)) { ret = -EBUSY; goto err; } @@ -2532,7 +2510,7 @@ static bool is_valid_gup_args(struct page **pages, int *locked, * These flags not allowed to be specified externally to the gup * interfaces: * - FOLL_TOUCH/FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only - * - FOLL_REMOTE is internal only and used on follow_page() + * - FOLL_REMOTE is internal only, set in (get|pin)_user_pages_remote() * - FOLL_UNLOCKABLE is internal only and used if locked is !NULL */ if (WARN_ON_ONCE(gup_flags & INTERNAL_GUP_FLAGS)) @@ -2934,7 +2912,7 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, * details. */ if (flags & FOLL_PIN) { - ret = arch_make_page_accessible(page); + ret = arch_make_folio_accessible(folio); if (ret) { gup_put_folio(folio, 1, flags); goto pte_unmap; @@ -3073,6 +3051,9 @@ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, if (!pmd_access_permitted(orig, flags & FOLL_WRITE)) return 0; + if (pmd_special(orig)) + return 0; + if (pmd_devmap(orig)) { if (unlikely(flags & FOLL_LONGTERM)) return 0; @@ -3117,6 +3098,9 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, if (!pud_access_permitted(orig, flags & FOLL_WRITE)) return 0; + if (pud_special(orig)) + return 0; + if (pud_devmap(orig)) { if (unlikely(flags & FOLL_LONGTERM)) return 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b71f744d360c..0580ac9e47b9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -73,6 +74,7 @@ static unsigned long deferred_split_count(struct shrinker *shrink, struct shrink_control *sc); static unsigned long deferred_split_scan(struct shrinker *shrink, struct shrink_control *sc); +static bool split_underused_thp = true; static atomic_t huge_zero_refcount; struct folio *huge_zero_folio __read_mostly; @@ -80,6 +82,7 @@ unsigned long huge_zero_pfn __read_mostly = ~0UL; unsigned long huge_anon_orders_always __read_mostly; unsigned long huge_anon_orders_madvise __read_mostly; unsigned long huge_anon_orders_inherit __read_mostly; +static bool anon_orders_configured __initdata; unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, unsigned long vm_flags, @@ -94,8 +97,8 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, /* Check the intersection of requested and supported orders. */ if (vma_is_anonymous(vma)) supported_orders = THP_ORDERS_ALL_ANON; - else if (vma_is_dax(vma)) - supported_orders = THP_ORDERS_ALL_FILE_DAX; + else if (vma_is_special_huge(vma)) + supported_orders = THP_ORDERS_ALL_SPECIAL; else supported_orders = THP_ORDERS_ALL_FILE_DEFAULT; @@ -159,15 +162,10 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, * Must be done before hugepage flags check since shmem has its * own flags. */ - if (!in_pf && shmem_file(vma->vm_file)) { - bool global_huge = shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff, - !enforce_sysfs, vma->vm_mm, vm_flags); - - if (!vma_is_anon_shmem(vma)) - return global_huge ? orders : 0; + if (!in_pf && shmem_file(vma->vm_file)) return shmem_allowable_huge_orders(file_inode(vma->vm_file), - vma, vma->vm_pgoff, global_huge); - } + vma, vma->vm_pgoff, 0, + !enforce_sysfs); if (!vma_is_anonymous(vma)) { /* @@ -445,6 +443,27 @@ static ssize_t hpage_pmd_size_show(struct kobject *kobj, static struct kobj_attribute hpage_pmd_size_attr = __ATTR_RO(hpage_pmd_size); +static ssize_t split_underused_thp_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", split_underused_thp); +} + +static ssize_t split_underused_thp_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err = kstrtobool(buf, &split_underused_thp); + + if (err < 0) + return err; + + return count; +} + +static struct kobj_attribute split_underused_thp_attr = __ATTR( + shrink_underused, 0644, split_underused_thp_show, split_underused_thp_store); + static struct attribute *hugepage_attr[] = { &enabled_attr.attr, &defrag_attr.attr, @@ -453,6 +472,7 @@ static struct attribute *hugepage_attr[] = { #ifdef CONFIG_SHMEM &shmem_enabled_attr.attr, #endif + &split_underused_thp_attr.attr, NULL, }; @@ -465,8 +485,8 @@ static void thpsize_release(struct kobject *kobj); static DEFINE_SPINLOCK(huge_anon_orders_lock); static LIST_HEAD(thpsize_list); -static ssize_t thpsize_enabled_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t anon_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) { int order = to_thpsize(kobj)->order; const char *output; @@ -483,9 +503,9 @@ static ssize_t thpsize_enabled_show(struct kobject *kobj, return sysfs_emit(buf, "%s\n", output); } -static ssize_t thpsize_enabled_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) +static ssize_t anon_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) { int order = to_thpsize(kobj)->order; ssize_t ret = count; @@ -527,19 +547,35 @@ static ssize_t thpsize_enabled_store(struct kobject *kobj, return ret; } -static struct kobj_attribute thpsize_enabled_attr = - __ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store); +static struct kobj_attribute anon_enabled_attr = + __ATTR(enabled, 0644, anon_enabled_show, anon_enabled_store); -static struct attribute *thpsize_attrs[] = { - &thpsize_enabled_attr.attr, +static struct attribute *anon_ctrl_attrs[] = { + &anon_enabled_attr.attr, + NULL, +}; + +static const struct attribute_group anon_ctrl_attr_grp = { + .attrs = anon_ctrl_attrs, +}; + +static struct attribute *file_ctrl_attrs[] = { #ifdef CONFIG_SHMEM &thpsize_shmem_enabled_attr.attr, #endif NULL, }; -static const struct attribute_group thpsize_attr_group = { - .attrs = thpsize_attrs, +static const struct attribute_group file_ctrl_attr_grp = { + .attrs = file_ctrl_attrs, +}; + +static struct attribute *any_ctrl_attrs[] = { + NULL, +}; + +static const struct attribute_group any_ctrl_attr_grp = { + .attrs = any_ctrl_attrs, }; static const struct kobj_type thpsize_ktype = { @@ -578,64 +614,136 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT); DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); +#ifdef CONFIG_SHMEM DEFINE_MTHP_STAT_ATTR(shmem_alloc, MTHP_STAT_SHMEM_ALLOC); DEFINE_MTHP_STAT_ATTR(shmem_fallback, MTHP_STAT_SHMEM_FALLBACK); DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE); +#endif DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT); DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED); DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED); +DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON); +DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED); -static struct attribute *stats_attrs[] = { +static struct attribute *anon_stats_attrs[] = { &anon_fault_alloc_attr.attr, &anon_fault_fallback_attr.attr, &anon_fault_fallback_charge_attr.attr, +#ifndef CONFIG_SHMEM &swpout_attr.attr, &swpout_fallback_attr.attr, - &shmem_alloc_attr.attr, - &shmem_fallback_attr.attr, - &shmem_fallback_charge_attr.attr, - &split_attr.attr, - &split_failed_attr.attr, +#endif &split_deferred_attr.attr, + &nr_anon_attr.attr, + &nr_anon_partially_mapped_attr.attr, NULL, }; -static struct attribute_group stats_attr_group = { +static struct attribute_group anon_stats_attr_grp = { .name = "stats", - .attrs = stats_attrs, + .attrs = anon_stats_attrs, }; +static struct attribute *file_stats_attrs[] = { +#ifdef CONFIG_SHMEM + &shmem_alloc_attr.attr, + &shmem_fallback_attr.attr, + &shmem_fallback_charge_attr.attr, +#endif + NULL, +}; + +static struct attribute_group file_stats_attr_grp = { + .name = "stats", + .attrs = file_stats_attrs, +}; + +static struct attribute *any_stats_attrs[] = { +#ifdef CONFIG_SHMEM + &swpout_attr.attr, + &swpout_fallback_attr.attr, +#endif + &split_attr.attr, + &split_failed_attr.attr, + NULL, +}; + +static struct attribute_group any_stats_attr_grp = { + .name = "stats", + .attrs = any_stats_attrs, +}; + +static int sysfs_add_group(struct kobject *kobj, + const struct attribute_group *grp) +{ + int ret = -ENOENT; + + /* + * If the group is named, try to merge first, assuming the subdirectory + * was already created. This avoids the warning emitted by + * sysfs_create_group() if the directory already exists. + */ + if (grp->name) + ret = sysfs_merge_group(kobj, grp); + if (ret) + ret = sysfs_create_group(kobj, grp); + + return ret; +} + static struct thpsize *thpsize_create(int order, struct kobject *parent) { unsigned long size = (PAGE_SIZE << order) / SZ_1K; struct thpsize *thpsize; - int ret; + int ret = -ENOMEM; thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL); if (!thpsize) - return ERR_PTR(-ENOMEM); + goto err; + + thpsize->order = order; ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent, "hugepages-%lukB", size); if (ret) { kfree(thpsize); - return ERR_PTR(ret); + goto err; } - ret = sysfs_create_group(&thpsize->kobj, &thpsize_attr_group); - if (ret) { - kobject_put(&thpsize->kobj); - return ERR_PTR(ret); + + ret = sysfs_add_group(&thpsize->kobj, &any_ctrl_attr_grp); + if (ret) + goto err_put; + + ret = sysfs_add_group(&thpsize->kobj, &any_stats_attr_grp); + if (ret) + goto err_put; + + if (BIT(order) & THP_ORDERS_ALL_ANON) { + ret = sysfs_add_group(&thpsize->kobj, &anon_ctrl_attr_grp); + if (ret) + goto err_put; + + ret = sysfs_add_group(&thpsize->kobj, &anon_stats_attr_grp); + if (ret) + goto err_put; } - ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group); - if (ret) { - kobject_put(&thpsize->kobj); - return ERR_PTR(ret); + if (BIT(order) & THP_ORDERS_ALL_FILE_DEFAULT) { + ret = sysfs_add_group(&thpsize->kobj, &file_ctrl_attr_grp); + if (ret) + goto err_put; + + ret = sysfs_add_group(&thpsize->kobj, &file_stats_attr_grp); + if (ret) + goto err_put; } - thpsize->order = order; return thpsize; +err_put: + kobject_put(&thpsize->kobj); +err: + return ERR_PTR(ret); } static void thpsize_release(struct kobject *kobj) @@ -655,7 +763,8 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time * constant so we have to do this here. */ - huge_anon_orders_inherit = BIT(PMD_ORDER); + if (!anon_orders_configured) + huge_anon_orders_inherit = BIT(PMD_ORDER); *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); if (unlikely(!*hugepage_kobj)) { @@ -675,7 +784,7 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) goto remove_hp_group; } - orders = THP_ORDERS_ALL_ANON; + orders = THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DEFAULT; order = highest_order(orders); while (orders) { thpsize = thpsize_create(order, *hugepage_kobj); @@ -840,6 +949,100 @@ static int __init setup_transparent_hugepage(char *str) } __setup("transparent_hugepage=", setup_transparent_hugepage); +static inline int get_order_from_str(const char *size_str) +{ + unsigned long size; + char *endptr; + int order; + + size = memparse(size_str, &endptr); + + if (!is_power_of_2(size)) + goto err; + order = get_order(size); + if (BIT(order) & ~THP_ORDERS_ALL_ANON) + goto err; + + return order; +err: + pr_err("invalid size %s in thp_anon boot parameter\n", size_str); + return -EINVAL; +} + +static char str_dup[PAGE_SIZE] __initdata; +static int __init setup_thp_anon(char *str) +{ + char *token, *range, *policy, *subtoken; + unsigned long always, inherit, madvise; + char *start_size, *end_size; + int start, end, nr; + char *p; + + if (!str || strlen(str) + 1 > PAGE_SIZE) + goto err; + strcpy(str_dup, str); + + always = huge_anon_orders_always; + madvise = huge_anon_orders_madvise; + inherit = huge_anon_orders_inherit; + p = str_dup; + while ((token = strsep(&p, ";")) != NULL) { + range = strsep(&token, ":"); + policy = token; + + if (!policy) + goto err; + + while ((subtoken = strsep(&range, ",")) != NULL) { + if (strchr(subtoken, '-')) { + start_size = strsep(&subtoken, "-"); + end_size = subtoken; + + start = get_order_from_str(start_size); + end = get_order_from_str(end_size); + } else { + start = end = get_order_from_str(subtoken); + } + + if (start < 0 || end < 0 || start > end) + goto err; + + nr = end - start + 1; + if (!strcmp(policy, "always")) { + bitmap_set(&always, start, nr); + bitmap_clear(&inherit, start, nr); + bitmap_clear(&madvise, start, nr); + } else if (!strcmp(policy, "madvise")) { + bitmap_set(&madvise, start, nr); + bitmap_clear(&inherit, start, nr); + bitmap_clear(&always, start, nr); + } else if (!strcmp(policy, "inherit")) { + bitmap_set(&inherit, start, nr); + bitmap_clear(&madvise, start, nr); + bitmap_clear(&always, start, nr); + } else if (!strcmp(policy, "never")) { + bitmap_clear(&inherit, start, nr); + bitmap_clear(&madvise, start, nr); + bitmap_clear(&always, start, nr); + } else { + pr_err("invalid policy %s in thp_anon boot parameter\n", policy); + goto err; + } + } + } + + huge_anon_orders_always = always; + huge_anon_orders_madvise = madvise; + huge_anon_orders_inherit = inherit; + anon_orders_configured = true; + return 1; + +err: + pr_warn("thp_anon=%s: error parsing string, ignoring setting\n", str); + return 0; +} +__setup("thp_anon=", setup_thp_anon); + pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) @@ -1009,6 +1212,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); mm_inc_nr_ptes(vma->vm_mm); + deferred_split_folio(folio, false); spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); @@ -1168,6 +1372,8 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, entry = pmd_mkhuge(pfn_t_pmd(pfn, prot)); if (pfn_t_devmap(pfn)) entry = pmd_mkdevmap(entry); + else + entry = pmd_mkspecial(entry); if (write) { entry = pmd_mkyoung(pmd_mkdirty(entry)); entry = maybe_pmd_mkwrite(entry, vma); @@ -1251,10 +1457,8 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, ptl = pud_lock(mm, pud); if (!pud_none(*pud)) { if (write) { - if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) { - WARN_ON_ONCE(!is_huge_zero_pud(*pud)); + if (WARN_ON_ONCE(pud_pfn(*pud) != pfn_t_to_pfn(pfn))) goto out_unlock; - } entry = pud_mkyoung(*pud); entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma); if (pudp_set_access_flags(vma, addr, pud, entry, 1)) @@ -1266,6 +1470,8 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, entry = pud_mkhuge(pfn_t_pud(pfn, prot)); if (pfn_t_devmap(pfn)) entry = pud_mkdevmap(entry); + else + entry = pud_mkspecial(entry); if (write) { entry = pud_mkyoung(pud_mkdirty(entry)); entry = maybe_pud_mkwrite(entry, vma); @@ -1379,6 +1585,24 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pgtable_t pgtable = NULL; int ret = -ENOMEM; + pmd = pmdp_get_lockless(src_pmd); + if (unlikely(pmd_special(pmd))) { + dst_ptl = pmd_lock(dst_mm, dst_pmd); + src_ptl = pmd_lockptr(src_mm, src_pmd); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + /* + * No need to recheck the pmd, it can't change with write + * mmap lock held here. + * + * Meanwhile, making sure it's not a CoW VMA with writable + * mapping, otherwise it means either the anon page wrongly + * applied special bit, or we made the PRIVATE mapping be + * able to wrongly write to the backend MMIO. + */ + VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd)); + goto set_pmd; + } + /* Skip if can be re-fill on fault */ if (!vma_is_anonymous(dst_vma)) return 0; @@ -1460,7 +1684,9 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmdp_set_wrprotect(src_mm, addr, src_pmd); if (!userfaultfd_wp(dst_vma)) pmd = pmd_clear_uffd_wp(pmd); - pmd = pmd_mkold(pmd_wrprotect(pmd)); + pmd = pmd_wrprotect(pmd); +set_pmd: + pmd = pmd_mkold(pmd); set_pmd_at(dst_mm, addr, dst_pmd, pmd); ret = 0; @@ -1502,21 +1728,15 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud))) goto out_unlock; - /* - * When page table lock is held, the huge zero pud should not be - * under splitting since we don't split the page itself, only pud to - * a page table. - */ - if (is_huge_zero_pud(pud)) { - /* No huge zero pud yet */ - } - /* * TODO: once we support anonymous pages, use * folio_try_dup_anon_rmap_*() and split if duplicating fails. */ - pudp_set_wrprotect(src_mm, addr, src_pud); - pud = pud_mkold(pud_wrprotect(pud)); + if (is_cow_mapping(vma->vm_flags) && pud_write(pud)) { + pudp_set_wrprotect(src_mm, addr, src_pud); + pud = pud_wrprotect(pud); + } + pud = pud_mkold(pud); set_pud_at(dst_mm, addr, dst_pud, pud); ret = 0; @@ -1675,22 +1895,23 @@ static inline bool can_change_pmd_writable(struct vm_area_struct *vma, vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - pmd_t oldpmd = vmf->orig_pmd; - pmd_t pmd; struct folio *folio; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; int nid = NUMA_NO_NODE; - int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK); + int target_nid, last_cpupid; + pmd_t pmd, old_pmd; bool writable = false; int flags = 0; vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); - if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) { + old_pmd = pmdp_get(vmf->pmd); + + if (unlikely(!pmd_same(old_pmd, vmf->orig_pmd))) { spin_unlock(vmf->ptl); return 0; } - pmd = pmd_modify(oldpmd, vma->vm_page_prot); + pmd = pmd_modify(old_pmd, vma->vm_page_prot); /* * Detect now whether the PMD could be writable; this information @@ -1705,18 +1926,10 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) if (!folio) goto out_map; - /* See similar comment in do_numa_page for explanation */ - if (!writable) - flags |= TNF_NO_GROUP; - nid = folio_nid(folio); - /* - * For memory tiering mode, cpupid of slow memory page is used - * to record page access time. So use default value. - */ - if (node_is_toptier(nid)) - last_cpupid = folio_last_cpupid(folio); - target_nid = numa_migrate_prep(folio, vmf, haddr, nid, &flags); + + target_nid = numa_migrate_check(folio, vmf, haddr, &flags, writable, + &last_cpupid); if (target_nid == NUMA_NO_NODE) goto out_map; if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) { @@ -1736,13 +1949,13 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) flags |= TNF_MIGRATE_FAIL; vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); - if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) { + if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd))) { spin_unlock(vmf->ptl); return 0; } out_map: /* Restore the PMD */ - pmd = pmd_modify(oldpmd, vma->vm_page_prot); + pmd = pmd_modify(pmdp_get(vmf->pmd), vma->vm_page_prot); pmd = pmd_mkyoung(pmd); if (writable) pmd = pmd_mkwrite(pmd, vma); @@ -2064,8 +2277,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, toptier) goto unlock; - if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && - !toptier) + if (folio_use_access_time(folio)) folio_xchg_access_time(folio, jiffies_to_msecs(jiffies)); } @@ -2118,6 +2330,53 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, return ret; } +/* + * Returns: + * + * - 0: if pud leaf changed from under us + * - 1: if pud can be skipped + * - HPAGE_PUD_NR: if pud was successfully processed + */ +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +int change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, + pud_t *pudp, unsigned long addr, pgprot_t newprot, + unsigned long cp_flags) +{ + struct mm_struct *mm = vma->vm_mm; + pud_t oldpud, entry; + spinlock_t *ptl; + + tlb_change_page_size(tlb, HPAGE_PUD_SIZE); + + /* NUMA balancing doesn't apply to dax */ + if (cp_flags & MM_CP_PROT_NUMA) + return 1; + + /* + * Huge entries on userfault-wp only works with anonymous, while we + * don't have anonymous PUDs yet. + */ + if (WARN_ON_ONCE(cp_flags & MM_CP_UFFD_WP_ALL)) + return 1; + + ptl = __pud_trans_huge_lock(pudp, vma); + if (!ptl) + return 0; + + /* + * Can't clear PUD or it can race with concurrent zapping. See + * change_huge_pmd(). + */ + oldpud = pudp_invalidate(vma, addr, pudp); + entry = pud_modify(oldpud, newprot); + set_pud_at(mm, addr, pudp, entry); + tlb_flush_pud_range(tlb, addr, HPAGE_PUD_SIZE); + + spin_unlock(ptl); + return HPAGE_PUD_NR; +} +#endif + #ifdef CONFIG_USERFAULTFD /* * The PT lock for src_pmd and dst_vma/src_vma (for reading) are locked by @@ -2297,12 +2556,14 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr) { spinlock_t *ptl; + pud_t orig_pud; ptl = __pud_trans_huge_lock(pud, vma); if (!ptl) return 0; - pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm); + orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm); + arch_check_zapped_pud(vma, orig_pud); tlb_remove_pud_tlb_entry(tlb, pud, addr); if (vma_is_special_huge(vma)) { spin_unlock(ptl); @@ -2346,6 +2607,11 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, spin_unlock(ptl); mmu_notifier_invalidate_range_end(&range); } +#else +void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, + unsigned long address) +{ +} #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, @@ -2780,7 +3046,7 @@ bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr, return false; } -static void remap_page(struct folio *folio, unsigned long nr) +static void remap_page(struct folio *folio, unsigned long nr, int flags) { int i = 0; @@ -2788,7 +3054,7 @@ static void remap_page(struct folio *folio, unsigned long nr) if (!folio_test_anon(folio)) return; for (;;) { - remove_migration_ptes(folio, folio, true); + remove_migration_ptes(folio, folio, RMP_LOCKED | flags); i += folio_nr_pages(folio); if (i >= nr) break; @@ -2796,25 +3062,25 @@ static void remap_page(struct folio *folio, unsigned long nr) } } -static void lru_add_page_tail(struct page *head, struct page *tail, +static void lru_add_page_tail(struct folio *folio, struct page *tail, struct lruvec *lruvec, struct list_head *list) { - VM_BUG_ON_PAGE(!PageHead(head), head); - VM_BUG_ON_PAGE(PageLRU(tail), head); + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + VM_BUG_ON_FOLIO(PageLRU(tail), folio); lockdep_assert_held(&lruvec->lru_lock); if (list) { /* page reclaim is reclaiming a huge page */ - VM_WARN_ON(PageLRU(head)); + VM_WARN_ON(folio_test_lru(folio)); get_page(tail); list_add_tail(&tail->lru, list); } else { /* head is still on lru (and we have it frozen) */ - VM_WARN_ON(!PageLRU(head)); - if (PageUnevictable(tail)) + VM_WARN_ON(!folio_test_lru(folio)); + if (folio_test_unevictable(folio)) tail->mlock_count = 0; else - list_add_tail(&tail->lru, &head->lru); + list_add_tail(&tail->lru, &folio->lru); SetPageLRU(tail); } } @@ -2857,8 +3123,10 @@ static void __split_huge_page_tail(struct folio *folio, int tail, (1L << PG_workingset) | (1L << PG_locked) | (1L << PG_unevictable) | -#ifdef CONFIG_ARCH_USES_PG_ARCH_X +#ifdef CONFIG_ARCH_USES_PG_ARCH_2 (1L << PG_arch_2) | +#endif +#ifdef CONFIG_ARCH_USES_PG_ARCH_3 (1L << PG_arch_3) | #endif (1L << PG_dirty) | @@ -2913,7 +3181,7 @@ static void __split_huge_page_tail(struct folio *folio, int tail, * pages to show after the currently processed elements - e.g. * migrate_pages */ - lru_add_page_tail(head, page_tail, lruvec, list); + lru_add_page_tail(folio, page_tail, lruvec, list); } static void __split_huge_page(struct page *page, struct list_head *list, @@ -2976,7 +3244,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, /* Caller disabled irqs, so they are still disabled here */ split_page_owner(head, order, new_order); - pgalloc_tag_split(head, 1 << order); + pgalloc_tag_split(folio, order, new_order); /* See comment in __split_huge_page_tail() */ if (folio_test_anon(folio)) { @@ -2996,7 +3264,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, if (nr_dropped) shmem_uncharge(folio->mapping->host, nr_dropped); - remap_page(folio, nr); + remap_page(folio, nr, PageAnon(head) ? RMP_USE_SHARED_ZEROPAGE : 0); /* * set page to its compound_head when split to non order-0 pages, so @@ -3025,7 +3293,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, } /* Racy check whether the huge page can be split */ -bool can_split_folio(struct folio *folio, int *pextra_pins) +bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins) { int extra_pins; @@ -3037,7 +3305,8 @@ bool can_split_folio(struct folio *folio, int *pextra_pins) extra_pins = folio_nr_pages(folio); if (pextra_pins) *pextra_pins = extra_pins; - return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1; + return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - + caller_pins; } /* @@ -3094,8 +3363,9 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, struct deferred_split *ds_queue = get_deferred_split_queue(folio); /* reset xarray order to new order after split */ XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order); - struct anon_vma *anon_vma = NULL; + bool is_anon = folio_test_anon(folio); struct address_space *mapping = NULL; + struct anon_vma *anon_vma = NULL; int order = folio_order(folio); int extra_pins, ret; pgoff_t end; @@ -3107,7 +3377,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, if (new_order >= folio_order(folio)) return -EINVAL; - if (folio_test_anon(folio)) { + if (is_anon) { /* order-1 is not supported for anonymous THP. */ if (new_order == 1) { VM_WARN_ONCE(1, "Cannot split to order-1 folio"); @@ -3147,7 +3417,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, if (folio_test_writeback(folio)) return -EBUSY; - if (folio_test_anon(folio)) { + if (is_anon) { /* * The caller does not necessarily hold an mmap_lock that would * prevent the anon_vma disappearing so we first we take a @@ -3217,7 +3487,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, * Racy check if we can split the page, before unmap_folio() will * split PMDs */ - if (!can_split_folio(folio, &extra_pins)) { + if (!can_split_folio(folio, 1, &extra_pins)) { ret = -EAGAIN; goto out_unlock; } @@ -3243,6 +3513,11 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, if (folio_order(folio) > 1 && !list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; + if (folio_test_partially_mapped(folio)) { + __folio_clear_partially_mapped(folio); + mod_mthp_stat(folio_order(folio), + MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); + } /* * Reinitialize page_deferred_list after removing the * page from the split_queue, otherwise a subsequent @@ -3269,6 +3544,10 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, } } + if (is_anon) { + mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); + mod_mthp_stat(new_order, MTHP_STAT_NR_ANON, 1 << (order - new_order)); + } __split_huge_page(page, list, end, new_order); ret = 0; } else { @@ -3277,7 +3556,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, if (mapping) xas_unlock(&xas); local_irq_enable(); - remap_page(folio, folio_nr_pages(folio)); + remap_page(folio, folio_nr_pages(folio), 0); ret = -EAGAIN; } @@ -3329,12 +3608,18 @@ void __folio_undo_large_rmappable(struct folio *folio) spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (!list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; + if (folio_test_partially_mapped(folio)) { + __folio_clear_partially_mapped(folio); + mod_mthp_stat(folio_order(folio), + MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); + } list_del_init(&folio->_deferred_list); } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); } -void deferred_split_folio(struct folio *folio) +/* partially_mapped=false won't clear PG_partially_mapped folio flag */ +void deferred_split_folio(struct folio *folio, bool partially_mapped) { struct deferred_split *ds_queue = get_deferred_split_queue(folio); #ifdef CONFIG_MEMCG @@ -3349,6 +3634,9 @@ void deferred_split_folio(struct folio *folio) if (folio_order(folio) <= 1) return; + if (!partially_mapped && !split_underused_thp) + return; + /* * The try_to_unmap() in page reclaim path might reach here too, * this may cause a race condition to corrupt deferred split queue. @@ -3362,14 +3650,21 @@ void deferred_split_folio(struct folio *folio) if (folio_test_swapcache(folio)) return; - if (!list_empty(&folio->_deferred_list)) - return; - spin_lock_irqsave(&ds_queue->split_queue_lock, flags); + if (partially_mapped) { + if (!folio_test_partially_mapped(folio)) { + __folio_set_partially_mapped(folio); + if (folio_test_pmd_mappable(folio)) + count_vm_event(THP_DEFERRED_SPLIT_PAGE); + count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED); + mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, 1); + + } + } else { + /* partially mapped folios cannot become non-partially mapped */ + VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio); + } if (list_empty(&folio->_deferred_list)) { - if (folio_test_pmd_mappable(folio)) - count_vm_event(THP_DEFERRED_SPLIT_PAGE); - count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED); list_add_tail(&folio->_deferred_list, &ds_queue->split_queue); ds_queue->split_queue_len++; #ifdef CONFIG_MEMCG @@ -3394,6 +3689,39 @@ static unsigned long deferred_split_count(struct shrinker *shrink, return READ_ONCE(ds_queue->split_queue_len); } +static bool thp_underused(struct folio *folio) +{ + int num_zero_pages = 0, num_filled_pages = 0; + void *kaddr; + int i; + + if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1) + return false; + + for (i = 0; i < folio_nr_pages(folio); i++) { + kaddr = kmap_local_folio(folio, i * PAGE_SIZE); + if (!memchr_inv(kaddr, 0, PAGE_SIZE)) { + num_zero_pages++; + if (num_zero_pages > khugepaged_max_ptes_none) { + kunmap_local(kaddr); + return true; + } + } else { + /* + * Another path for early exit once the number + * of non-zero filled pages exceeds threshold. + */ + num_filled_pages++; + if (num_filled_pages >= HPAGE_PMD_NR - khugepaged_max_ptes_none) { + kunmap_local(kaddr); + return false; + } + } + kunmap_local(kaddr); + } + return false; +} + static unsigned long deferred_split_scan(struct shrinker *shrink, struct shrink_control *sc) { @@ -3417,6 +3745,11 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, list_move(&folio->_deferred_list, &list); } else { /* We lost race with folio_put() */ + if (folio_test_partially_mapped(folio)) { + __folio_clear_partially_mapped(folio); + mod_mthp_stat(folio_order(folio), + MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); + } list_del_init(&folio->_deferred_list); ds_queue->split_queue_len--; } @@ -3426,13 +3759,35 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); list_for_each_entry_safe(folio, next, &list, _deferred_list) { + bool did_split = false; + bool underused = false; + + if (!folio_test_partially_mapped(folio)) { + underused = thp_underused(folio); + if (!underused) + goto next; + } if (!folio_trylock(folio)) goto next; - /* split_huge_page() removes page from list on success */ - if (!split_folio(folio)) + if (!split_folio(folio)) { + did_split = true; + if (underused) + count_vm_event(THP_UNDERUSED_SPLIT_PAGE); split++; + } folio_unlock(folio); next: + /* + * split_folio() removes folio from list on success. + * Only add back to the queue if folio is partially mapped. + * If thp_underused returns false, or if split_folio fails + * in the case it was underused, then consider it used and + * don't add it back to split_queue. + */ + if (!did_split && !folio_test_partially_mapped(folio)) { + list_del_init(&folio->_deferred_list); + ds_queue->split_queue_len--; + } folio_put(folio); } @@ -3518,16 +3873,11 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, vaddr_start &= PAGE_MASK; vaddr_end &= PAGE_MASK; - /* Find the task_struct from pid */ - rcu_read_lock(); - task = find_task_by_vpid(pid); + task = find_get_task_by_vpid(pid); if (!task) { - rcu_read_unlock(); ret = -ESRCH; goto out; } - get_task_struct(task); - rcu_read_unlock(); /* Find the mm_struct */ mm = get_task_mm(task); @@ -3548,7 +3898,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, */ for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) { struct vm_area_struct *vma = vma_lookup(mm, addr); - struct page *page; + struct folio_walk fw; struct folio *folio; struct address_space *mapping; unsigned int target_order = new_order; @@ -3562,13 +3912,10 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, continue; } - /* FOLL_DUMP to ignore special (like zero) pages */ - page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); - - if (IS_ERR_OR_NULL(page)) + folio = folio_walk_start(&fw, vma, addr, 0); + if (!folio) continue; - folio = page_folio(page); if (!is_transparent_hugepage(folio)) goto next; @@ -3588,11 +3935,13 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, * can be split or not. So skip the check here. */ if (!folio_test_private(folio) && - !can_split_folio(folio, NULL)) + !can_split_folio(folio, 0, NULL)) goto next; if (!folio_trylock(folio)) goto next; + folio_get(folio); + folio_walk_end(&fw, vma); if (!folio_test_anon(folio) && folio->mapping != mapping) goto unlock; @@ -3603,8 +3952,12 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, unlock: folio_unlock(folio); -next: folio_put(folio); + + cond_resched(); + continue; +next: + folio_walk_end(&fw, vma); cond_resched(); } mmap_read_unlock(mm); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9a3a6e2dee97..def84d8bcf2d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -56,16 +56,6 @@ struct hstate hstates[HUGE_MAX_HSTATE]; #ifdef CONFIG_CMA static struct cma *hugetlb_cma[MAX_NUMNODES]; static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata; -static bool hugetlb_cma_folio(struct folio *folio, unsigned int order) -{ - return cma_pages_valid(hugetlb_cma[folio_nid(folio)], &folio->page, - 1 << order); -} -#else -static bool hugetlb_cma_folio(struct folio *folio, unsigned int order) -{ - return false; -} #endif static unsigned long hugetlb_cma_size __initdata; @@ -82,14 +72,14 @@ static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata; * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, * free_huge_pages, and surplus_huge_pages. */ -DEFINE_SPINLOCK(hugetlb_lock); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(hugetlb_lock); /* * Serializes faults on the same logical page. This is used to * prevent spurious OOMs when the hugepage pool is fully utilized. */ -static int num_fault_mutexes; -struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; +static int num_fault_mutexes __ro_after_init; +struct mutex *hugetlb_fault_mutex_table __ro_after_init; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); @@ -100,6 +90,17 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, unsigned long end); static struct resv_map *vma_resv_map(struct vm_area_struct *vma); +static void hugetlb_free_folio(struct folio *folio) +{ +#ifdef CONFIG_CMA + int nid = folio_nid(folio); + + if (cma_free_folio(hugetlb_cma[nid], folio)) + return; +#endif + folio_put(folio); +} + static inline bool subpool_is_free(struct hugepage_subpool *spool) { if (spool->count) @@ -1512,95 +1513,54 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) ((node = hstate_next_node_to_free(hs, mask)) || 1); \ nr_nodes--) -/* used to demote non-gigantic_huge pages as well */ -static void __destroy_compound_gigantic_folio(struct folio *folio, - unsigned int order, bool demote) -{ - int i; - int nr_pages = 1 << order; - struct page *p; - - atomic_set(&folio->_entire_mapcount, 0); - atomic_set(&folio->_large_mapcount, 0); - atomic_set(&folio->_pincount, 0); - - for (i = 1; i < nr_pages; i++) { - p = folio_page(folio, i); - p->flags &= ~PAGE_FLAGS_CHECK_AT_FREE; - p->mapping = NULL; - clear_compound_head(p); - if (!demote) - set_page_refcounted(p); - } - - __folio_clear_head(folio); -} - -static void destroy_compound_hugetlb_folio_for_demote(struct folio *folio, - unsigned int order) -{ - __destroy_compound_gigantic_folio(folio, order, true); -} - #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static void destroy_compound_gigantic_folio(struct folio *folio, - unsigned int order) -{ - __destroy_compound_gigantic_folio(folio, order, false); -} - -static void free_gigantic_folio(struct folio *folio, unsigned int order) -{ - /* - * If the page isn't allocated using the cma allocator, - * cma_release() returns false. - */ -#ifdef CONFIG_CMA - int nid = folio_nid(folio); - - if (cma_release(hugetlb_cma[nid], &folio->page, 1 << order)) - return; -#endif - - free_contig_range(folio_pfn(folio), 1 << order); -} - #ifdef CONFIG_CONTIG_ALLOC static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { - struct page *page; - unsigned long nr_pages = pages_per_huge_page(h); + struct folio *folio; + int order = huge_page_order(h); + bool retried = false; + if (nid == NUMA_NO_NODE) nid = numa_mem_id(); - +retry: + folio = NULL; #ifdef CONFIG_CMA { int node; - if (hugetlb_cma[nid]) { - page = cma_alloc(hugetlb_cma[nid], nr_pages, - huge_page_order(h), true); - if (page) - return page_folio(page); - } + if (hugetlb_cma[nid]) + folio = cma_alloc_folio(hugetlb_cma[nid], order, gfp_mask); - if (!(gfp_mask & __GFP_THISNODE)) { + if (!folio && !(gfp_mask & __GFP_THISNODE)) { for_each_node_mask(node, *nodemask) { if (node == nid || !hugetlb_cma[node]) continue; - page = cma_alloc(hugetlb_cma[node], nr_pages, - huge_page_order(h), true); - if (page) - return page_folio(page); + folio = cma_alloc_folio(hugetlb_cma[node], order, gfp_mask); + if (folio) + break; } } } #endif + if (!folio) { + folio = folio_alloc_gigantic(order, gfp_mask, nid, nodemask); + if (!folio) + return NULL; + } - page = alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); - return page ? page_folio(page) : NULL; + if (folio_ref_freeze(folio, 1)) + return folio; + + pr_warn("HugeTLB: unexpected refcount on PFN %lu\n", folio_pfn(folio)); + hugetlb_free_folio(folio); + if (!retried) { + retried = true; + goto retry; + } + return NULL; } #else /* !CONFIG_CONTIG_ALLOC */ @@ -1617,10 +1577,6 @@ static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, { return NULL; } -static inline void free_gigantic_folio(struct folio *folio, - unsigned int order) { } -static inline void destroy_compound_gigantic_folio(struct folio *folio, - unsigned int order) { } #endif /* @@ -1748,18 +1704,8 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, folio_ref_unfreeze(folio, 1); - /* - * Non-gigantic pages demoted from CMA allocated gigantic pages - * need to be given back to CMA in free_gigantic_folio. - */ - if (hstate_is_gigantic(h) || - hugetlb_cma_folio(folio, huge_page_order(h))) { - destroy_compound_gigantic_folio(folio, huge_page_order(h)); - free_gigantic_folio(folio, huge_page_order(h)); - } else { - INIT_LIST_HEAD(&folio->_deferred_list); - folio_put(folio); - } + INIT_LIST_HEAD(&folio->_deferred_list); + hugetlb_free_folio(folio); } /* @@ -2032,95 +1978,6 @@ static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int ni spin_unlock_irq(&hugetlb_lock); } -static bool __prep_compound_gigantic_folio(struct folio *folio, - unsigned int order, bool demote) -{ - int i, j; - int nr_pages = 1 << order; - struct page *p; - - __folio_clear_reserved(folio); - for (i = 0; i < nr_pages; i++) { - p = folio_page(folio, i); - - /* - * For gigantic hugepages allocated through bootmem at - * boot, it's safer to be consistent with the not-gigantic - * hugepages and clear the PG_reserved bit from all tail pages - * too. Otherwise drivers using get_user_pages() to access tail - * pages may get the reference counting wrong if they see - * PG_reserved set on a tail page (despite the head page not - * having PG_reserved set). Enforcing this consistency between - * head and tail pages allows drivers to optimize away a check - * on the head page when they need know if put_page() is needed - * after get_user_pages(). - */ - if (i != 0) /* head page cleared above */ - __ClearPageReserved(p); - /* - * Subtle and very unlikely - * - * Gigantic 'page allocators' such as memblock or cma will - * return a set of pages with each page ref counted. We need - * to turn this set of pages into a compound page with tail - * page ref counts set to zero. Code such as speculative page - * cache adding could take a ref on a 'to be' tail page. - * We need to respect any increased ref count, and only set - * the ref count to zero if count is currently 1. If count - * is not 1, we return an error. An error return indicates - * the set of pages can not be converted to a gigantic page. - * The caller who allocated the pages should then discard the - * pages using the appropriate free interface. - * - * In the case of demote, the ref count will be zero. - */ - if (!demote) { - if (!page_ref_freeze(p, 1)) { - pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n"); - goto out_error; - } - } else { - VM_BUG_ON_PAGE(page_count(p), p); - } - if (i != 0) - set_compound_head(p, &folio->page); - } - __folio_set_head(folio); - /* we rely on prep_new_hugetlb_folio to set the hugetlb flag */ - folio_set_order(folio, order); - atomic_set(&folio->_entire_mapcount, -1); - atomic_set(&folio->_large_mapcount, -1); - atomic_set(&folio->_pincount, 0); - return true; - -out_error: - /* undo page modifications made above */ - for (j = 0; j < i; j++) { - p = folio_page(folio, j); - if (j != 0) - clear_compound_head(p); - set_page_refcounted(p); - } - /* need to clear PG_reserved on remaining tail pages */ - for (; j < nr_pages; j++) { - p = folio_page(folio, j); - __ClearPageReserved(p); - } - return false; -} - -static bool prep_compound_gigantic_folio(struct folio *folio, - unsigned int order) -{ - return __prep_compound_gigantic_folio(folio, order, false); -} - -static bool prep_compound_gigantic_folio_for_demote(struct folio *folio, - unsigned int order) -{ - return __prep_compound_gigantic_folio(folio, order, true); -} - /* * Find and lock address space (mapping) in write mode. * @@ -2159,7 +2016,6 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, */ if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry)) alloc_try_hard = false; - gfp_mask |= __GFP_COMP|__GFP_NOWARN; if (alloc_try_hard) gfp_mask |= __GFP_RETRY_MAYFAIL; if (nid == NUMA_NO_NODE) @@ -2206,48 +2062,16 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, return folio; } -static struct folio *__alloc_fresh_hugetlb_folio(struct hstate *h, - gfp_t gfp_mask, int nid, nodemask_t *nmask, - nodemask_t *node_alloc_noretry) -{ - struct folio *folio; - bool retry = false; - -retry: - if (hstate_is_gigantic(h)) - folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); - else - folio = alloc_buddy_hugetlb_folio(h, gfp_mask, - nid, nmask, node_alloc_noretry); - if (!folio) - return NULL; - - if (hstate_is_gigantic(h)) { - if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) { - /* - * Rare failure to convert pages to compound page. - * Free pages and try again - ONCE! - */ - free_gigantic_folio(folio, huge_page_order(h)); - if (!retry) { - retry = true; - goto retry; - } - return NULL; - } - } - - return folio; -} - static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { struct folio *folio; - folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, - node_alloc_noretry); + if (hstate_is_gigantic(h)) + folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); + else + folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, node_alloc_noretry); if (folio) init_new_hugetlb_folio(h, folio); return folio; @@ -2265,7 +2089,10 @@ static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, { struct folio *folio; - folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); + if (hstate_is_gigantic(h)) + folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); + else + folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); if (!folio) return NULL; @@ -2549,9 +2376,8 @@ struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h, nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); if (mpol_is_preferred_many(mpol)) { - gfp_t gfp = gfp_mask | __GFP_NOWARN; + gfp_t gfp = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); folio = alloc_surplus_hugetlb_folio(h, gfp, nid, nodemask); /* Fallback to all nodes if page==NULL */ @@ -3333,6 +3159,7 @@ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio, for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) { struct page *page = pfn_to_page(pfn); + __ClearPageReserved(folio_page(folio, pfn - head_pfn)); __init_single_page(page, pfn, zone, nid); prep_compound_tail((struct page *)folio, pfn - head_pfn); ret = page_ref_freeze(page, 1); @@ -3921,100 +3748,119 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, return 0; } -static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio) +static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst, + struct list_head *src_list) { - int i, nid = folio_nid(folio); - struct hstate *target_hstate; - struct page *subpage; - struct folio *inner_folio; - int rc = 0; + long rc; + struct folio *folio, *next; + LIST_HEAD(dst_list); + LIST_HEAD(ret_list); - target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order); - - remove_hugetlb_folio(h, folio, false); - spin_unlock_irq(&hugetlb_lock); - - /* - * If vmemmap already existed for folio, the remove routine above would - * have cleared the hugetlb folio flag. Hence the folio is technically - * no longer a hugetlb folio. hugetlb_vmemmap_restore_folio can only be - * passed hugetlb folios and will BUG otherwise. - */ - if (folio_test_hugetlb(folio)) { - rc = hugetlb_vmemmap_restore_folio(h, folio); - if (rc) { - /* Allocation of vmemmmap failed, we can not demote folio */ - spin_lock_irq(&hugetlb_lock); - add_hugetlb_folio(h, folio, false); - return rc; - } - } - - /* - * Use destroy_compound_hugetlb_folio_for_demote for all huge page - * sizes as it will not ref count folios. - */ - destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(h)); + rc = hugetlb_vmemmap_restore_folios(src, src_list, &ret_list); + list_splice_init(&ret_list, src_list); /* * Taking target hstate mutex synchronizes with set_max_huge_pages. * Without the mutex, pages added to target hstate could be marked * as surplus. * - * Note that we already hold h->resize_lock. To prevent deadlock, + * Note that we already hold src->resize_lock. To prevent deadlock, * use the convention of always taking larger size hstate mutex first. */ - mutex_lock(&target_hstate->resize_lock); - for (i = 0; i < pages_per_huge_page(h); - i += pages_per_huge_page(target_hstate)) { - subpage = folio_page(folio, i); - inner_folio = page_folio(subpage); - if (hstate_is_gigantic(target_hstate)) - prep_compound_gigantic_folio_for_demote(inner_folio, - target_hstate->order); - else - prep_compound_page(subpage, target_hstate->order); - folio_change_private(inner_folio, NULL); - prep_new_hugetlb_folio(target_hstate, inner_folio, nid); - free_huge_folio(inner_folio); - } - mutex_unlock(&target_hstate->resize_lock); + mutex_lock(&dst->resize_lock); - spin_lock_irq(&hugetlb_lock); + list_for_each_entry_safe(folio, next, src_list, lru) { + int i; + + if (folio_test_hugetlb_vmemmap_optimized(folio)) + continue; + + list_del(&folio->lru); + + split_page_owner(&folio->page, huge_page_order(src), huge_page_order(dst)); + pgalloc_tag_split(folio, huge_page_order(src), huge_page_order(dst)); + + for (i = 0; i < pages_per_huge_page(src); i += pages_per_huge_page(dst)) { + struct page *page = folio_page(folio, i); + + page->mapping = NULL; + clear_compound_head(page); + prep_compound_page(page, dst->order); + + init_new_hugetlb_folio(dst, page_folio(page)); + list_add(&page->lru, &dst_list); + } + } + + prep_and_add_allocated_folios(dst, &dst_list); + + mutex_unlock(&dst->resize_lock); + + return rc; +} + +static long demote_pool_huge_page(struct hstate *src, nodemask_t *nodes_allowed, + unsigned long nr_to_demote) + __must_hold(&hugetlb_lock) +{ + int nr_nodes, node; + struct hstate *dst; + long rc = 0; + long nr_demoted = 0; + + lockdep_assert_held(&hugetlb_lock); + + /* We should never get here if no demote order */ + if (!src->demote_order) { + pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n"); + return -EINVAL; /* internal error */ + } + dst = size_to_hstate(PAGE_SIZE << src->demote_order); + + for_each_node_mask_to_free(src, nr_nodes, node, nodes_allowed) { + LIST_HEAD(list); + struct folio *folio, *next; + + list_for_each_entry_safe(folio, next, &src->hugepage_freelists[node], lru) { + if (folio_test_hwpoison(folio)) + continue; + + remove_hugetlb_folio(src, folio, false); + list_add(&folio->lru, &list); + + if (++nr_demoted == nr_to_demote) + break; + } + + spin_unlock_irq(&hugetlb_lock); + + rc = demote_free_hugetlb_folios(src, dst, &list); + + spin_lock_irq(&hugetlb_lock); + + list_for_each_entry_safe(folio, next, &list, lru) { + list_del(&folio->lru); + add_hugetlb_folio(src, folio, false); + + nr_demoted--; + } + + if (rc < 0 || nr_demoted == nr_to_demote) + break; + } /* * Not absolutely necessary, but for consistency update max_huge_pages * based on pool changes for the demoted page. */ - h->max_huge_pages--; - target_hstate->max_huge_pages += - pages_per_huge_page(h) / pages_per_huge_page(target_hstate); + src->max_huge_pages -= nr_demoted; + dst->max_huge_pages += nr_demoted << (huge_page_order(src) - huge_page_order(dst)); - return rc; -} - -static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) - __must_hold(&hugetlb_lock) -{ - int nr_nodes, node; - struct folio *folio; - - lockdep_assert_held(&hugetlb_lock); - - /* We should never get here if no demote order */ - if (!h->demote_order) { - pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n"); - return -EINVAL; /* internal error */ - } - - for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { - list_for_each_entry(folio, &h->hugepage_freelists[node], lru) { - if (folio_test_hwpoison(folio)) - continue; - return demote_free_hugetlb_folio(h, folio); - } - } + if (rc < 0) + return rc; + if (nr_demoted) + return nr_demoted; /* * Only way to get here is if all pages on free lists are poisoned. * Return -EBUSY so that caller will not retry. @@ -4249,6 +4095,8 @@ static ssize_t demote_store(struct kobject *kobj, spin_lock_irq(&hugetlb_lock); while (nr_demote) { + long rc; + /* * Check for available pages to demote each time thorough the * loop as demote_pool_huge_page will drop hugetlb_lock. @@ -4261,11 +4109,13 @@ static ssize_t demote_store(struct kobject *kobj, if (!nr_available) break; - err = demote_pool_huge_page(h, n_mask); - if (err) + rc = demote_pool_huge_page(h, n_mask, nr_demote); + if (rc < 0) { + err = rc; break; + } - nr_demote--; + nr_demote -= rc; } spin_unlock_irq(&hugetlb_lock); @@ -7227,7 +7077,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, return 0; } -#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING static unsigned long page_table_shareable(struct vm_area_struct *svma, struct vm_area_struct *vma, unsigned long addr, pgoff_t idx) @@ -7389,7 +7239,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, return 1; } -#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ +#else /* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud) @@ -7412,7 +7262,7 @@ bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) { return false; } -#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ +#endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, @@ -7510,7 +7360,7 @@ unsigned long hugetlb_mask_last_page(struct hstate *h) /* See description above. Architectures can provide their own version. */ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) { -#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING if (huge_page_size(h) == PMD_SIZE) return PUD_SIZE - PMD_SIZE; #endif @@ -7519,10 +7369,6 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ -/* - * These functions are overwritable if your architecture needs its own - * behavior. - */ bool isolate_hugetlb(struct folio *folio, struct list_head *list) { bool ret = true; diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 4ff238ba1250..e716c4671a15 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -114,10 +114,10 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, } page_counter_init(hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx), - fault_parent); + fault_parent, false); page_counter_init( hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx), - rsvd_parent); + rsvd_parent, false); limit = round_down(PAGE_COUNTER_MAX, pages_per_huge_page(&hstates[idx])); diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 0c3f56b3578e..57b7f591eee8 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -43,6 +43,8 @@ struct vmemmap_remap_walk { #define VMEMMAP_SPLIT_NO_TLB_FLUSH BIT(0) /* Skip the TLB flush when we remap the PTE */ #define VMEMMAP_REMAP_NO_TLB_FLUSH BIT(1) +/* synchronize_rcu() to avoid writes from page_ref_add_unless() */ +#define VMEMMAP_SYNCHRONIZE_RCU BIT(2) unsigned long flags; }; @@ -457,6 +459,9 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, if (!folio_test_hugetlb_vmemmap_optimized(folio)) return 0; + if (flags & VMEMMAP_SYNCHRONIZE_RCU) + synchronize_rcu(); + vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); vmemmap_reuse = vmemmap_start; vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; @@ -489,10 +494,7 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, */ int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio) { - /* avoid writes from page_ref_add_unless() while unfolding vmemmap */ - synchronize_rcu(); - - return __hugetlb_vmemmap_restore_folio(h, folio, 0); + return __hugetlb_vmemmap_restore_folio(h, folio, VMEMMAP_SYNCHRONIZE_RCU); } /** @@ -515,14 +517,14 @@ long hugetlb_vmemmap_restore_folios(const struct hstate *h, struct folio *folio, *t_folio; long restored = 0; long ret = 0; - - /* avoid writes from page_ref_add_unless() while unfolding vmemmap */ - synchronize_rcu(); + unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU; list_for_each_entry_safe(folio, t_folio, folio_list, lru) { if (folio_test_hugetlb_vmemmap_optimized(folio)) { - ret = __hugetlb_vmemmap_restore_folio(h, folio, - VMEMMAP_REMAP_NO_TLB_FLUSH); + ret = __hugetlb_vmemmap_restore_folio(h, folio, flags); + /* only need to synchronize_rcu() once for each batch */ + flags &= ~VMEMMAP_SYNCHRONIZE_RCU; + if (ret) break; restored++; @@ -570,6 +572,9 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, return ret; static_branch_inc(&hugetlb_optimize_vmemmap_key); + + if (flags & VMEMMAP_SYNCHRONIZE_RCU) + synchronize_rcu(); /* * Very Subtle * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed @@ -617,10 +622,7 @@ void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio) { LIST_HEAD(vmemmap_pages); - /* avoid writes from page_ref_add_unless() while folding vmemmap */ - synchronize_rcu(); - - __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0); + __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_SYNCHRONIZE_RCU); free_vmemmap_page_list(&vmemmap_pages); } @@ -647,6 +649,7 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l { struct folio *folio; LIST_HEAD(vmemmap_pages); + unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU; list_for_each_entry(folio, folio_list, lru) { int ret = hugetlb_vmemmap_split_folio(h, folio); @@ -663,14 +666,12 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l flush_tlb_all(); - /* avoid writes from page_ref_add_unless() while folding vmemmap */ - synchronize_rcu(); - list_for_each_entry(folio, folio_list, lru) { int ret; - ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, - VMEMMAP_REMAP_NO_TLB_FLUSH); + ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags); + /* only need to synchronize_rcu() once for each batch */ + flags &= ~VMEMMAP_SYNCHRONIZE_RCU; /* * Pages to be freed may have been accumulated. If we @@ -684,8 +685,7 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l flush_tlb_all(); free_vmemmap_page_list(&vmemmap_pages); INIT_LIST_HEAD(&vmemmap_pages); - __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, - VMEMMAP_REMAP_NO_TLB_FLUSH); + __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags); } } diff --git a/mm/internal.h b/mm/internal.h index a963f67d3452..93083bbeeefa 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -8,13 +8,19 @@ #define __MM_INTERNAL_H #include +#include #include +#include #include #include #include #include +#include #include +/* Internal core VMA manipulation functions. */ +#include "vma.h" + struct folio_batch; /* @@ -270,18 +276,22 @@ static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte) { pte_t expected_pte = pte_next_swp_offset(pte); const pte_t *end_ptep = start_ptep + max_nr; + swp_entry_t entry = pte_to_swp_entry(pte); pte_t *ptep = start_ptep + 1; + unsigned short cgroup_id; VM_WARN_ON(max_nr < 1); VM_WARN_ON(!is_swap_pte(pte)); - VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte))); + VM_WARN_ON(non_swap_entry(entry)); + cgroup_id = lookup_swap_cgroup_id(entry); while (ptep < end_ptep) { pte = ptep_get(ptep); if (!pte_same(pte, expected_pte)) break; - + if (lookup_swap_cgroup_id(pte_to_swp_entry(pte)) != cgroup_id) + break; expected_pte = pte_next_swp_offset(expected_pte); ptep++; } @@ -415,9 +425,7 @@ extern unsigned long highest_memmap_pfn; /* * in mm/vmscan.c: */ -bool isolate_lru_page(struct page *page); bool folio_isolate_lru(struct folio *folio); -void putback_lru_page(struct page *page); void folio_putback_lru(struct folio *folio); extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); @@ -787,37 +795,6 @@ static inline bool free_area_empty(struct free_area *area, int migratetype) return list_empty(&area->free_list[migratetype]); } -/* - * These three helpers classifies VMAs for virtual memory accounting. - */ - -/* - * Executable code area - executable, not writable, not stack - */ -static inline bool is_exec_mapping(vm_flags_t flags) -{ - return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC; -} - -/* - * Stack area (including shadow stacks) - * - * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous: - * do_mmap() forbids all other combinations. - */ -static inline bool is_stack_mapping(vm_flags_t flags) -{ - return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK); -} - -/* - * Data area - private, writable, not stack - */ -static inline bool is_data_mapping(vm_flags_t flags) -{ - return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; -} - /* mm/util.c */ struct anon_vma *folio_anon_vma(struct folio *folio); @@ -1078,6 +1055,8 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask) /* * mm/memory-failure.c */ +#ifdef CONFIG_MEMORY_FAILURE +void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu); void shake_folio(struct folio *folio); extern int hwpoison_filter(struct page *p); @@ -1098,6 +1077,12 @@ void add_to_kill_ksm(struct task_struct *tsk, struct page *p, unsigned long ksm_addr); unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); +#else +static inline void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) +{ +} +#endif + extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); @@ -1174,7 +1159,6 @@ static inline void flush_tlb_batched_pending(struct mm_struct *mm) #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ extern const struct trace_print_flags pageflag_names[]; -extern const struct trace_print_flags pagetype_names[]; extern const struct trace_print_flags vmaflag_names[]; extern const struct trace_print_flags gfpflag_names[]; @@ -1226,11 +1210,12 @@ void vunmap_range_noflush(unsigned long start, unsigned long end); void __vunmap_range_noflush(unsigned long start, unsigned long end); -int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf, - unsigned long addr, int page_nid, int *flags); +int numa_migrate_check(struct folio *folio, struct vm_fault *vmf, + unsigned long addr, int *flags, bool writable, + int *last_cpupid); void free_zone_device_folio(struct folio *folio); -int migrate_device_coherent_page(struct page *page); +int migrate_device_coherent_folio(struct folio *folio); /* * mm/gup.c @@ -1246,13 +1231,6 @@ void touch_pud(struct vm_area_struct *vma, unsigned long addr, void touch_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, bool write); -/* - * mm/mmap.c - */ -struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, - struct vm_area_struct *vma, - unsigned long delta); - enum { /* mark page accessed */ FOLL_TOUCH = 1 << 16, @@ -1379,117 +1357,6 @@ static inline bool pte_needs_soft_dirty_wp(struct vm_area_struct *vma, pte_t pte return vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte); } -static inline void vma_iter_config(struct vma_iterator *vmi, - unsigned long index, unsigned long last) -{ - __mas_set_range(&vmi->mas, index, last - 1); -} - -static inline void vma_iter_reset(struct vma_iterator *vmi) -{ - mas_reset(&vmi->mas); -} - -static inline -struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min) -{ - return mas_prev_range(&vmi->mas, min); -} - -static inline -struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max) -{ - return mas_next_range(&vmi->mas, max); -} - -static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min, - unsigned long max, unsigned long size) -{ - return mas_empty_area(&vmi->mas, min, max - 1, size); -} - -static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min, - unsigned long max, unsigned long size) -{ - return mas_empty_area_rev(&vmi->mas, min, max - 1, size); -} - -/* - * VMA Iterator functions shared between nommu and mmap - */ -static inline int vma_iter_prealloc(struct vma_iterator *vmi, - struct vm_area_struct *vma) -{ - return mas_preallocate(&vmi->mas, vma, GFP_KERNEL); -} - -static inline void vma_iter_clear(struct vma_iterator *vmi) -{ - mas_store_prealloc(&vmi->mas, NULL); -} - -static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi) -{ - return mas_walk(&vmi->mas); -} - -/* Store a VMA with preallocated memory */ -static inline void vma_iter_store(struct vma_iterator *vmi, - struct vm_area_struct *vma) -{ - -#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) - if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && - vmi->mas.index > vma->vm_start)) { - pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n", - vmi->mas.index, vma->vm_start, vma->vm_start, - vma->vm_end, vmi->mas.index, vmi->mas.last); - } - if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && - vmi->mas.last < vma->vm_start)) { - pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n", - vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end, - vmi->mas.index, vmi->mas.last); - } -#endif - - if (vmi->mas.status != ma_start && - ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) - vma_iter_invalidate(vmi); - - __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); - mas_store_prealloc(&vmi->mas, vma); -} - -static inline int vma_iter_store_gfp(struct vma_iterator *vmi, - struct vm_area_struct *vma, gfp_t gfp) -{ - if (vmi->mas.status != ma_start && - ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) - vma_iter_invalidate(vmi); - - __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); - mas_store_gfp(&vmi->mas, vma, gfp); - if (unlikely(mas_is_err(&vmi->mas))) - return -ENOMEM; - - return 0; -} - -/* - * VMA lock generalization - */ -struct vma_prepare { - struct vm_area_struct *vma; - struct vm_area_struct *adj_next; - struct file *file; - struct address_space *mapping; - struct anon_vma *anon_vma; - struct vm_area_struct *insert; - struct vm_area_struct *remove; - struct vm_area_struct *remove2; -}; - void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long zone, int nid); @@ -1506,27 +1373,11 @@ static inline int can_do_mseal(unsigned long flags) return 0; } -bool can_modify_mm(struct mm_struct *mm, unsigned long start, - unsigned long end); -bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, - unsigned long end, int behavior); #else static inline int can_do_mseal(unsigned long flags) { return -EPERM; } - -static inline bool can_modify_mm(struct mm_struct *mm, unsigned long start, - unsigned long end) -{ - return true; -} - -static inline bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, - unsigned long end, int behavior) -{ - return true; -} #endif #ifdef CONFIG_SHRINKER_DEBUG @@ -1578,13 +1429,18 @@ static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry, void workingset_update_node(struct xa_node *node); extern struct list_lru shadow_nodes; -struct unlink_vma_file_batch { - int count; - struct vm_area_struct *vmas[8]; -}; +/* mremap.c */ +unsigned long move_page_tables(struct vm_area_struct *vma, + unsigned long old_addr, struct vm_area_struct *new_vma, + unsigned long new_addr, unsigned long len, + bool need_rmap_locks, bool for_stack); -void unlink_file_vma_batch_init(struct unlink_vma_file_batch *); -void unlink_file_vma_batch_add(struct unlink_vma_file_batch *, struct vm_area_struct *); -void unlink_file_vma_batch_final(struct unlink_vma_file_batch *); +#ifdef CONFIG_UNACCEPTED_MEMORY +void accept_page(struct page *page); +#else /* CONFIG_UNACCEPTED_MEMORY */ +static inline void accept_page(struct page *page) +{ +} +#endif /* CONFIG_UNACCEPTED_MEMORY */ #endif /* __MM_INTERNAL_H */ diff --git a/mm/kfence/core.c b/mm/kfence/core.c index c5cb54fc696d..67fc321db79b 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -99,6 +99,10 @@ module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_inte static unsigned long kfence_skip_covered_thresh __read_mostly = 75; module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644); +/* Allocation burst count: number of excess KFENCE allocations per sample. */ +static unsigned int kfence_burst __read_mostly; +module_param_named(burst, kfence_burst, uint, 0644); + /* If true, use a deferrable timer. */ static bool kfence_deferrable __read_mostly = IS_ENABLED(CONFIG_KFENCE_DEFERRABLE); module_param_named(deferrable, kfence_deferrable, bool, 0444); @@ -269,6 +273,13 @@ static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *m return pageaddr; } +static inline bool kfence_obj_allocated(const struct kfence_metadata *meta) +{ + enum kfence_object_state state = READ_ONCE(meta->state); + + return state == KFENCE_OBJECT_ALLOCATED || state == KFENCE_OBJECT_RCU_FREEING; +} + /* * Update the object's metadata state, including updating the alloc/free stacks * depending on the state transition. @@ -278,10 +289,14 @@ metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state nex unsigned long *stack_entries, size_t num_stack_entries) { struct kfence_track *track = - next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track; + next == KFENCE_OBJECT_ALLOCATED ? &meta->alloc_track : &meta->free_track; lockdep_assert_held(&meta->lock); + /* Stack has been saved when calling rcu, skip. */ + if (READ_ONCE(meta->state) == KFENCE_OBJECT_RCU_FREEING) + goto out; + if (stack_entries) { memcpy(track->stack_entries, stack_entries, num_stack_entries * sizeof(stack_entries[0])); @@ -297,6 +312,7 @@ metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state nex track->cpu = raw_smp_processor_id(); track->ts_nsec = local_clock(); /* Same source as printk timestamps. */ +out: /* * Pairs with READ_ONCE() in * kfence_shutdown_cache(), @@ -502,7 +518,7 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z raw_spin_lock_irqsave(&meta->lock, flags); - if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) { + if (!kfence_obj_allocated(meta) || meta->addr != (unsigned long)addr) { /* Invalid or double-free, bail out. */ atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); kfence_report_error((unsigned long)addr, false, NULL, meta, @@ -780,7 +796,7 @@ static void kfence_check_all_canary(void) for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { struct kfence_metadata *meta = &kfence_metadata[i]; - if (meta->state == KFENCE_OBJECT_ALLOCATED) + if (kfence_obj_allocated(meta)) check_canary(meta); } } @@ -827,12 +843,12 @@ static void toggle_allocation_gate(struct work_struct *work) if (!READ_ONCE(kfence_enabled)) return; - atomic_set(&kfence_allocation_gate, 0); + atomic_set(&kfence_allocation_gate, -kfence_burst); #ifdef CONFIG_KFENCE_STATIC_KEYS /* Enable static key, and await allocation to happen. */ static_branch_enable(&kfence_allocation_key); - wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate)); + wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate) > 0); /* Disable static key and reset timer. */ static_branch_disable(&kfence_allocation_key); @@ -1006,12 +1022,11 @@ void kfence_shutdown_cache(struct kmem_cache *s) * the lock will not help, as different critical section * serialization will have the same outcome. */ - if (READ_ONCE(meta->cache) != s || - READ_ONCE(meta->state) != KFENCE_OBJECT_ALLOCATED) + if (READ_ONCE(meta->cache) != s || !kfence_obj_allocated(meta)) continue; raw_spin_lock_irqsave(&meta->lock, flags); - in_use = meta->cache == s && meta->state == KFENCE_OBJECT_ALLOCATED; + in_use = meta->cache == s && kfence_obj_allocated(meta); raw_spin_unlock_irqrestore(&meta->lock, flags); if (in_use) { @@ -1052,6 +1067,7 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) unsigned long stack_entries[KFENCE_STACK_DEPTH]; size_t num_stack_entries; u32 alloc_stack_hash; + int allocation_gate; /* * Perform size check before switching kfence_allocation_gate, so that @@ -1080,14 +1096,15 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) if (s->flags & SLAB_SKIP_KFENCE) return NULL; - if (atomic_inc_return(&kfence_allocation_gate) > 1) + allocation_gate = atomic_inc_return(&kfence_allocation_gate); + if (allocation_gate > 1) return NULL; #ifdef CONFIG_KFENCE_STATIC_KEYS /* * waitqueue_active() is fully ordered after the update of * kfence_allocation_gate per atomic_inc_return(). */ - if (waitqueue_active(&allocation_wait)) { + if (allocation_gate == 1 && waitqueue_active(&allocation_wait)) { /* * Calling wake_up() here may deadlock when allocations happen * from within timer code. Use an irq_work to defer it. @@ -1154,11 +1171,19 @@ void __kfence_free(void *addr) * the object, as the object page may be recycled for other-typed * objects once it has been freed. meta->cache may be NULL if the cache * was destroyed. + * Save the stack trace here so that reports show where the user freed + * the object. */ - if (unlikely(meta->cache && (meta->cache->flags & SLAB_TYPESAFE_BY_RCU))) + if (unlikely(meta->cache && (meta->cache->flags & SLAB_TYPESAFE_BY_RCU))) { + unsigned long flags; + + raw_spin_lock_irqsave(&meta->lock, flags); + metadata_update_state(meta, KFENCE_OBJECT_RCU_FREEING, NULL, 0); + raw_spin_unlock_irqrestore(&meta->lock, flags); call_rcu(&meta->rcu_head, rcu_guarded_free); - else + } else { kfence_guarded_free(addr, meta, false); + } } bool kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs) @@ -1182,14 +1207,14 @@ bool kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs int distance = 0; meta = addr_to_metadata(addr - PAGE_SIZE); - if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) { + if (meta && kfence_obj_allocated(meta)) { to_report = meta; /* Data race ok; distance calculation approximate. */ distance = addr - data_race(meta->addr + meta->size); } meta = addr_to_metadata(addr + PAGE_SIZE); - if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) { + if (meta && kfence_obj_allocated(meta)) { /* Data race ok; distance calculation approximate. */ if (!to_report || distance > data_race(meta->addr) - addr) to_report = meta; diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h index db87a05047bd..dfba5ea06b01 100644 --- a/mm/kfence/kfence.h +++ b/mm/kfence/kfence.h @@ -38,6 +38,7 @@ enum kfence_object_state { KFENCE_OBJECT_UNUSED, /* Object is unused. */ KFENCE_OBJECT_ALLOCATED, /* Object is currently allocated. */ + KFENCE_OBJECT_RCU_FREEING, /* Object was allocated, and then being freed by rcu. */ KFENCE_OBJECT_FREED, /* Object was allocated, and then freed. */ }; diff --git a/mm/kfence/report.c b/mm/kfence/report.c index c509aed326ce..451991a3a8f2 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -108,11 +109,15 @@ static void kfence_print_stack(struct seq_file *seq, const struct kfence_metadat const struct kfence_track *track = show_alloc ? &meta->alloc_track : &meta->free_track; u64 ts_sec = track->ts_nsec; unsigned long rem_nsec = do_div(ts_sec, NSEC_PER_SEC); + u64 interval_nsec = local_clock() - meta->alloc_track.ts_nsec; + unsigned long rem_interval_nsec = do_div(interval_nsec, NSEC_PER_SEC); /* Timestamp matches printk timestamp format. */ - seq_con_printf(seq, "%s by task %d on cpu %d at %lu.%06lus:\n", - show_alloc ? "allocated" : "freed", track->pid, - track->cpu, (unsigned long)ts_sec, rem_nsec / 1000); + seq_con_printf(seq, "%s by task %d on cpu %d at %lu.%06lus (%lu.%06lus ago):\n", + show_alloc ? "allocated" : meta->state == KFENCE_OBJECT_RCU_FREEING ? + "rcu freeing" : "freed", track->pid, + track->cpu, (unsigned long)ts_sec, rem_nsec / 1000, + (unsigned long)interval_nsec, rem_interval_nsec / 1000); if (track->num_stack_entries) { /* Skip allocation/free internals stack. */ @@ -145,7 +150,7 @@ void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *met kfence_print_stack(seq, meta, true); - if (meta->state == KFENCE_OBJECT_FREED) { + if (meta->state == KFENCE_OBJECT_FREED || meta->state == KFENCE_OBJECT_RCU_FREEING) { seq_con_printf(seq, "\n"); kfence_print_stack(seq, meta, false); } @@ -314,7 +319,7 @@ bool __kfence_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *sla kpp->kp_slab_cache = meta->cache; kpp->kp_objp = (void *)meta->addr; kfence_to_kp_stack(&meta->alloc_track, kpp->kp_stack); - if (meta->state == KFENCE_OBJECT_FREED) + if (meta->state == KFENCE_OBJECT_FREED || meta->state == KFENCE_OBJECT_RCU_FREEING) kfence_to_kp_stack(&meta->free_track, kpp->kp_free_stack); /* get_stack_skipnr() ensures the first entry is outside allocator. */ kpp->kp_ret = kpp->kp_stack[0]; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index cdd1d8655a76..f9c39898eaff 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -85,7 +85,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); * * Note that these are only respected if collapse was initiated by khugepaged. */ -static unsigned int khugepaged_max_ptes_none __read_mostly; +unsigned int khugepaged_max_ptes_none __read_mostly; static unsigned int khugepaged_max_ptes_swap __read_mostly; static unsigned int khugepaged_max_ptes_shared __read_mostly; @@ -546,12 +546,14 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte, static bool is_refcount_suitable(struct folio *folio) { - int expected_refcount; + int expected_refcount = folio_mapcount(folio); - expected_refcount = folio_mapcount(folio); - if (folio_test_swapcache(folio)) + if (!folio_test_anon(folio) || folio_test_swapcache(folio)) expected_refcount += folio_nr_pages(folio); + if (folio_test_private(folio)) + expected_refcount++; + return folio_ref_count(folio) == expected_refcount; } @@ -625,8 +627,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, } /* - * We can do it before isolate_lru_page because the - * page can't be freed from under us. NOTE: PG_lock + * We can do it before folio_isolate_lru because the + * folio can't be freed from under us. NOTE: PG_lock * is needed to serialize against split_huge_page * when invoked from the VM. */ @@ -1235,6 +1237,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); + deferred_split_folio(folio, false); spin_unlock(pmd_ptl); folio = NULL; @@ -1841,7 +1844,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, } } while (1); - for (index = start; index < end; index++) { + for (index = start; index < end;) { xas_set(&xas, index); folio = xas_load(&xas); @@ -1860,18 +1863,19 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, } } nr_none++; + index++; continue; } if (xa_is_value(folio) || !folio_test_uptodate(folio)) { xas_unlock_irq(&xas); /* swap in or instantiate fallocated page */ - if (shmem_get_folio(mapping->host, index, + if (shmem_get_folio(mapping->host, index, 0, &folio, SGP_NOALLOC)) { result = SCAN_FAIL; goto xa_unlocked; } - /* drain lru cache to help isolate_lru_page() */ + /* drain lru cache to help folio_isolate_lru() */ lru_add_drain(); } else if (folio_trylock(folio)) { folio_get(folio); @@ -1886,7 +1890,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, page_cache_sync_readahead(mapping, &file->f_ra, file, index, end - index); - /* drain lru cache to help isolate_lru_page() */ + /* drain lru cache to help folio_isolate_lru() */ lru_add_drain(); folio = filemap_lock_folio(mapping, index); if (IS_ERR(folio)) { @@ -1941,12 +1945,10 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, * we locked the first folio, then a THP might be there already. * This will be discovered on the first iteration. */ - if (folio_test_large(folio)) { - result = folio_order(folio) == HPAGE_PMD_ORDER && - folio->index == start - /* Maybe PMD-mapped */ - ? SCAN_PTE_MAPPED_HUGEPAGE - : SCAN_PAGE_COMPOUND; + if (folio_order(folio) == HPAGE_PMD_ORDER && + folio->index == start) { + /* Maybe PMD-mapped */ + result = SCAN_PTE_MAPPED_HUGEPAGE; goto out_unlock; } @@ -1986,9 +1988,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, VM_BUG_ON_FOLIO(folio != xa_load(xas.xa, index), folio); /* - * We control three references to the folio: + * We control 2 + nr_pages references to the folio: * - we hold a pin on it; - * - one reference from page cache; + * - nr_pages reference from page cache; * - one from lru_isolate_folio; * If those are the only references, then any new usage * of the folio will have to fetch it from the page @@ -1996,7 +1998,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, * truncate, so any new usage will be blocked until we * unlock folio after collapse/during rollback. */ - if (folio_ref_count(folio) != 3) { + if (folio_ref_count(folio) != 2 + folio_nr_pages(folio)) { result = SCAN_PAGE_COUNT; xas_unlock_irq(&xas); folio_putback_lru(folio); @@ -2007,6 +2009,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, * Accumulate the folios that are being collapsed. */ list_add_tail(&folio->lru, &pagelist); + index += folio_nr_pages(folio); continue; out_unlock: folio_unlock(folio); @@ -2054,17 +2057,22 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, index = start; dst = folio_page(new_folio, 0); list_for_each_entry(folio, &pagelist, lru) { + int i, nr_pages = folio_nr_pages(folio); + while (index < folio->index) { clear_highpage(dst); index++; dst++; } - if (copy_mc_highpage(dst, folio_page(folio, 0)) > 0) { - result = SCAN_COPY_MC; - goto rollback; + + for (i = 0; i < nr_pages; i++) { + if (copy_mc_highpage(dst, folio_page(folio, i)) > 0) { + result = SCAN_COPY_MC; + goto rollback; + } + index++; + dst++; } - index++; - dst++; } while (index < end) { clear_highpage(dst); @@ -2179,7 +2187,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, folio_clear_active(folio); folio_clear_unevictable(folio); folio_unlock(folio); - folio_put_refs(folio, 3); + folio_put_refs(folio, 2 + folio_nr_pages(folio)); } goto out; @@ -2254,16 +2262,10 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, continue; } - /* - * TODO: khugepaged should compact smaller compound pages - * into a PMD sized page - */ - if (folio_test_large(folio)) { - result = folio_order(folio) == HPAGE_PMD_ORDER && - folio->index == start - /* Maybe PMD-mapped */ - ? SCAN_PTE_MAPPED_HUGEPAGE - : SCAN_PAGE_COMPOUND; + if (folio_order(folio) == HPAGE_PMD_ORDER && + folio->index == start) { + /* Maybe PMD-mapped */ + result = SCAN_PTE_MAPPED_HUGEPAGE; /* * For SCAN_PTE_MAPPED_HUGEPAGE, further processing * by the caller won't touch the page cache, and so @@ -2285,8 +2287,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, break; } - if (folio_ref_count(folio) != - 1 + folio_mapcount(folio) + folio_test_private(folio)) { + if (!is_refcount_suitable(folio)) { result = SCAN_PAGE_COUNT; break; } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 764b08100570..0400f5e8ac60 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -224,6 +224,10 @@ static int kmemleak_error; static unsigned long min_addr = ULONG_MAX; static unsigned long max_addr; +/* minimum and maximum address that may be valid per-CPU pointers */ +static unsigned long min_percpu_addr = ULONG_MAX; +static unsigned long max_percpu_addr; + static struct task_struct *scan_thread; /* used to avoid reporting of recently allocated objects */ static unsigned long jiffies_min_age; @@ -294,13 +298,20 @@ static void hex_dump_object(struct seq_file *seq, const u8 *ptr = (const u8 *)object->pointer; size_t len; - if (WARN_ON_ONCE(object->flags & (OBJECT_PHYS | OBJECT_PERCPU))) + if (WARN_ON_ONCE(object->flags & OBJECT_PHYS)) return; + if (object->flags & OBJECT_PERCPU) + ptr = (const u8 *)this_cpu_ptr((void __percpu *)object->pointer); + /* limit the number of lines to HEX_MAX_LINES */ len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE); - warn_or_seq_printf(seq, " hex dump (first %zu bytes):\n", len); + if (object->flags & OBJECT_PERCPU) + warn_or_seq_printf(seq, " hex dump (first %zu bytes on cpu %d):\n", + len, raw_smp_processor_id()); + else + warn_or_seq_printf(seq, " hex dump (first %zu bytes):\n", len); kasan_disable_current(); warn_or_seq_hex_dump(seq, DUMP_PREFIX_NONE, HEX_ROW_SIZE, HEX_GROUP_SIZE, kasan_reset_tag((void *)ptr), len, HEX_ASCII); @@ -695,10 +706,14 @@ static int __link_object(struct kmemleak_object *object, unsigned long ptr, untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr); /* - * Only update min_addr and max_addr with object - * storing virtual address. + * Only update min_addr and max_addr with object storing virtual + * address. And update min_percpu_addr max_percpu_addr for per-CPU + * objects. */ - if (!(objflags & (OBJECT_PHYS | OBJECT_PERCPU))) { + if (objflags & OBJECT_PERCPU) { + min_percpu_addr = min(min_percpu_addr, untagged_ptr); + max_percpu_addr = max(max_percpu_addr, untagged_ptr + size); + } else if (!(objflags & OBJECT_PHYS)) { min_addr = min(min_addr, untagged_ptr); max_addr = max(max_addr, untagged_ptr + size); } @@ -1055,12 +1070,8 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, { pr_debug("%s(0x%px, %zu)\n", __func__, ptr, size); - /* - * Percpu allocations are only scanned and not reported as leaks - * (min_count is set to 0). - */ - if (kmemleak_enabled && ptr && !IS_ERR(ptr)) - create_object_percpu((unsigned long)ptr, size, 0, gfp); + if (kmemleak_enabled && ptr && !IS_ERR_PCPU(ptr)) + create_object_percpu((__force unsigned long)ptr, size, 0, gfp); } EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu); @@ -1134,8 +1145,8 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr) { pr_debug("%s(0x%px)\n", __func__, ptr); - if (kmemleak_free_enabled && ptr && !IS_ERR(ptr)) - delete_object_full((unsigned long)ptr, OBJECT_PERCPU); + if (kmemleak_free_enabled && ptr && !IS_ERR_PCPU(ptr)) + delete_object_full((__force unsigned long)ptr, OBJECT_PERCPU); } EXPORT_SYMBOL_GPL(kmemleak_free_percpu); @@ -1304,12 +1315,23 @@ static bool update_checksum(struct kmemleak_object *object) { u32 old_csum = object->checksum; - if (WARN_ON_ONCE(object->flags & (OBJECT_PHYS | OBJECT_PERCPU))) + if (WARN_ON_ONCE(object->flags & OBJECT_PHYS)) return false; kasan_disable_current(); kcsan_disable_current(); - object->checksum = crc32(0, kasan_reset_tag((void *)object->pointer), object->size); + if (object->flags & OBJECT_PERCPU) { + unsigned int cpu; + + object->checksum = 0; + for_each_possible_cpu(cpu) { + void *ptr = per_cpu_ptr((void __percpu *)object->pointer, cpu); + + object->checksum ^= crc32(0, kasan_reset_tag((void *)ptr), object->size); + } + } else { + object->checksum = crc32(0, kasan_reset_tag((void *)object->pointer), object->size); + } kasan_enable_current(); kcsan_enable_current(); @@ -1340,6 +1362,64 @@ static void update_refs(struct kmemleak_object *object) } } +static void pointer_update_refs(struct kmemleak_object *scanned, + unsigned long pointer, unsigned int objflags) +{ + struct kmemleak_object *object; + unsigned long untagged_ptr; + unsigned long excess_ref; + + untagged_ptr = (unsigned long)kasan_reset_tag((void *)pointer); + if (objflags & OBJECT_PERCPU) { + if (untagged_ptr < min_percpu_addr || untagged_ptr >= max_percpu_addr) + return; + } else { + if (untagged_ptr < min_addr || untagged_ptr >= max_addr) + return; + } + + /* + * No need for get_object() here since we hold kmemleak_lock. + * object->use_count cannot be dropped to 0 while the object + * is still present in object_tree_root and object_list + * (with updates protected by kmemleak_lock). + */ + object = __lookup_object(pointer, 1, objflags); + if (!object) + return; + if (object == scanned) + /* self referenced, ignore */ + return; + + /* + * Avoid the lockdep recursive warning on object->lock being + * previously acquired in scan_object(). These locks are + * enclosed by scan_mutex. + */ + raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); + /* only pass surplus references (object already gray) */ + if (color_gray(object)) { + excess_ref = object->excess_ref; + /* no need for update_refs() if object already gray */ + } else { + excess_ref = 0; + update_refs(object); + } + raw_spin_unlock(&object->lock); + + if (excess_ref) { + object = lookup_object(excess_ref, 0); + if (!object) + return; + if (object == scanned) + /* circular reference, ignore */ + return; + raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); + update_refs(object); + raw_spin_unlock(&object->lock); + } +} + /* * Memory scanning is a long process and it needs to be interruptible. This * function checks whether such interrupt condition occurred. @@ -1372,13 +1452,10 @@ static void scan_block(void *_start, void *_end, unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER); unsigned long *end = _end - (BYTES_PER_POINTER - 1); unsigned long flags; - unsigned long untagged_ptr; raw_spin_lock_irqsave(&kmemleak_lock, flags); for (ptr = start; ptr < end; ptr++) { - struct kmemleak_object *object; unsigned long pointer; - unsigned long excess_ref; if (scan_should_stop()) break; @@ -1387,50 +1464,8 @@ static void scan_block(void *_start, void *_end, pointer = *(unsigned long *)kasan_reset_tag((void *)ptr); kasan_enable_current(); - untagged_ptr = (unsigned long)kasan_reset_tag((void *)pointer); - if (untagged_ptr < min_addr || untagged_ptr >= max_addr) - continue; - - /* - * No need for get_object() here since we hold kmemleak_lock. - * object->use_count cannot be dropped to 0 while the object - * is still present in object_tree_root and object_list - * (with updates protected by kmemleak_lock). - */ - object = lookup_object(pointer, 1); - if (!object) - continue; - if (object == scanned) - /* self referenced, ignore */ - continue; - - /* - * Avoid the lockdep recursive warning on object->lock being - * previously acquired in scan_object(). These locks are - * enclosed by scan_mutex. - */ - raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); - /* only pass surplus references (object already gray) */ - if (color_gray(object)) { - excess_ref = object->excess_ref; - /* no need for update_refs() if object already gray */ - } else { - excess_ref = 0; - update_refs(object); - } - raw_spin_unlock(&object->lock); - - if (excess_ref) { - object = lookup_object(excess_ref, 0); - if (!object) - continue; - if (object == scanned) - /* circular reference, ignore */ - continue; - raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); - update_refs(object); - raw_spin_unlock(&object->lock); - } + pointer_update_refs(scanned, pointer, 0); + pointer_update_refs(scanned, pointer, OBJECT_PERCPU); } raw_spin_unlock_irqrestore(&kmemleak_lock, flags); } diff --git a/mm/ksm.c b/mm/ksm.c index 14d9e53b1ec2..a2e2a521df0a 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -608,47 +608,6 @@ static inline bool ksm_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; } -static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, - struct mm_walk *walk) -{ - struct page *page = NULL; - spinlock_t *ptl; - pte_t *pte; - pte_t ptent; - int ret; - - pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); - if (!pte) - return 0; - ptent = ptep_get(pte); - if (pte_present(ptent)) { - page = vm_normal_page(walk->vma, addr, ptent); - } else if (!pte_none(ptent)) { - swp_entry_t entry = pte_to_swp_entry(ptent); - - /* - * As KSM pages remain KSM pages until freed, no need to wait - * here for migration to end. - */ - if (is_migration_entry(entry)) - page = pfn_swap_entry_to_page(entry); - } - /* return 1 if the page is an normal ksm page or KSM-placed zero page */ - ret = (page && PageKsm(page)) || is_ksm_zero_pte(ptent); - pte_unmap_unlock(pte, ptl); - return ret; -} - -static const struct mm_walk_ops break_ksm_ops = { - .pmd_entry = break_ksm_pmd_entry, - .walk_lock = PGWALK_RDLOCK, -}; - -static const struct mm_walk_ops break_ksm_lock_vma_ops = { - .pmd_entry = break_ksm_pmd_entry, - .walk_lock = PGWALK_WRLOCK, -}; - /* * We use break_ksm to break COW on a ksm page by triggering unsharing, * such that the ksm page will get replaced by an exclusive anonymous page. @@ -665,16 +624,26 @@ static const struct mm_walk_ops break_ksm_lock_vma_ops = { static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma) { vm_fault_t ret = 0; - const struct mm_walk_ops *ops = lock_vma ? - &break_ksm_lock_vma_ops : &break_ksm_ops; + + if (lock_vma) + vma_start_write(vma); do { - int ksm_page; + bool ksm_page = false; + struct folio_walk fw; + struct folio *folio; cond_resched(); - ksm_page = walk_page_range_vma(vma, addr, addr + 1, ops, NULL); - if (WARN_ON_ONCE(ksm_page < 0)) - return ksm_page; + folio = folio_walk_start(&fw, vma, addr, + FW_MIGRATION | FW_ZEROPAGE); + if (folio) { + /* Small folio implies FW_LEVEL_PTE. */ + if (!folio_test_large(folio) && + (folio_test_ksm(folio) || is_ksm_zero_pte(fw.pte))) + ksm_page = true; + folio_walk_end(&fw, vma); + } + if (!ksm_page) return 0; ret = handle_mm_fault(vma, addr, @@ -767,26 +736,28 @@ static struct page *get_mergeable_page(struct ksm_rmap_item *rmap_item) struct mm_struct *mm = rmap_item->mm; unsigned long addr = rmap_item->address; struct vm_area_struct *vma; - struct page *page; + struct page *page = NULL; + struct folio_walk fw; + struct folio *folio; mmap_read_lock(mm); vma = find_mergeable_vma(mm, addr); if (!vma) goto out; - page = follow_page(vma, addr, FOLL_GET); - if (IS_ERR_OR_NULL(page)) - goto out; - if (is_zone_device_page(page)) - goto out_putpage; - if (PageAnon(page)) { + folio = folio_walk_start(&fw, vma, addr, 0); + if (folio) { + if (!folio_is_zone_device(folio) && + folio_test_anon(folio)) { + folio_get(folio); + page = fw.page; + } + folio_walk_end(&fw, vma); + } +out: + if (page) { flush_anon_page(vma, page, addr); flush_dcache_page(page); - } else { -out_putpage: - put_page(page); -out: - page = NULL; } mmap_read_unlock(mm); return page; @@ -938,12 +909,13 @@ static struct folio *ksm_get_folio(struct ksm_stable_node *stable_node, */ while (!folio_try_get(folio)) { /* - * Another check for page->mapping != expected_mapping would - * work here too. We have chosen the !PageSwapCache test to - * optimize the common case, when the page is or is about to - * be freed: PageSwapCache is cleared (under spin_lock_irq) - * in the ref_freeze section of __remove_mapping(); but Anon - * folio->mapping reset to NULL later, in free_pages_prepare(). + * Another check for folio->mapping != expected_mapping + * would work here too. We have chosen to test the + * swapcache flag to optimize the common case, when the + * folio is or is about to be freed: the swapcache flag + * is cleared (under spin_lock_irq) in the ref_freeze + * section of __remove_mapping(); but anon folio->mapping + * is reset to NULL later, in free_pages_prepare(). */ if (!folio_test_swapcache(folio)) goto stale; @@ -974,7 +946,7 @@ static struct folio *ksm_get_folio(struct ksm_stable_node *stable_node, stale: /* - * We come here from above when page->mapping or !PageSwapCache + * We come here from above when folio->mapping or the swapcache flag * suggests that the node is stale; but it might be under migration. * We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(), * before checking whether node->kpfn has been changed. @@ -1481,7 +1453,7 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, goto out; /* - * We need the page lock to read a stable PageSwapCache in + * We need the folio lock to read a stable swapcache flag in * write_protect_page(). We use trylock_page() instead of * lock_page() because we don't want to wait here - we * prefer to continue scanning and merging different pages, @@ -2562,36 +2534,46 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) ksm_scan.address = vma->vm_end; while (ksm_scan.address < vma->vm_end) { + struct page *tmp_page = NULL; + struct folio_walk fw; + struct folio *folio; + if (ksm_test_exit(mm)) break; - *page = follow_page(vma, ksm_scan.address, FOLL_GET); - if (IS_ERR_OR_NULL(*page)) { - ksm_scan.address += PAGE_SIZE; - cond_resched(); - continue; + + folio = folio_walk_start(&fw, vma, ksm_scan.address, 0); + if (folio) { + if (!folio_is_zone_device(folio) && + folio_test_anon(folio)) { + folio_get(folio); + tmp_page = fw.page; + } + folio_walk_end(&fw, vma); } - if (is_zone_device_page(*page)) - goto next_page; - if (PageAnon(*page)) { - flush_anon_page(vma, *page, ksm_scan.address); - flush_dcache_page(*page); + + if (tmp_page) { + flush_anon_page(vma, tmp_page, ksm_scan.address); + flush_dcache_page(tmp_page); rmap_item = get_next_rmap_item(mm_slot, ksm_scan.rmap_list, ksm_scan.address); if (rmap_item) { ksm_scan.rmap_list = &rmap_item->rmap_list; - if (should_skip_rmap_item(*page, rmap_item)) + if (should_skip_rmap_item(tmp_page, rmap_item)) { + folio_put(folio); goto next_page; + } ksm_scan.address += PAGE_SIZE; - } else - put_page(*page); + *page = tmp_page; + } else { + folio_put(folio); + } mmap_read_unlock(mm); return rmap_item; } next_page: - put_page(*page); ksm_scan.address += PAGE_SIZE; cond_resched(); } @@ -3142,7 +3124,7 @@ void folio_migrate_ksm(struct folio *newfolio, struct folio *folio) * newfolio->mapping was set in advance; now we need smp_wmb() * to make sure that the new stable_node->kpfn is visible * to ksm_get_folio() before it can see that folio->mapping - * has gone stale (or that folio_test_swapcache has been cleared). + * has gone stale (or that the swapcache flag has been cleared). */ smp_wmb(); folio_set_stable_node(folio, NULL); diff --git a/mm/madvise.c b/mm/madvise.c index 6e3a137b8e50..ff139e57cca2 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1031,6 +1031,9 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, struct anon_vma_name *anon_name; unsigned long new_flags = vma->vm_flags; + if (unlikely(!can_modify_vma_madv(vma, behavior))) + return -EPERM; + switch (behavior) { case MADV_REMOVE: return madvise_remove(vma, prev, start, end); @@ -1448,15 +1451,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh start = untagged_addr_remote(mm, start); end = start + len; - /* - * Check if the address range is sealed for do_madvise(). - * can_modify_mm_madv assumes we have acquired the lock on MM. - */ - if (unlikely(!can_modify_mm_madv(mm, start, end, behavior))) { - error = -EPERM; - goto out; - } - blk_start_plug(&plug); switch (behavior) { case MADV_POPULATE_READ: @@ -1470,7 +1464,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh } blk_finish_plug(&plug); -out: if (write) mmap_write_unlock(mm); else diff --git a/mm/memblock.c b/mm/memblock.c index 3b9dc2d89b8a..0a77a748a8eb 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1500,7 +1500,7 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, * * Accept the memory of the allocated buffer. */ - accept_memory(found, found + size); + accept_memory(found, size); return found; } diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 417c96f2da28..b37c0d870816 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -742,6 +742,9 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, return folio_file_page(folio, index); } +static void memcg1_check_events(struct mem_cgroup *memcg, int nid); +static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages); + /** * mem_cgroup_move_account - move account of the folio * @folio: The folio. @@ -853,9 +856,9 @@ static int mem_cgroup_move_account(struct folio *folio, nid = folio_nid(folio); local_irq_disable(); - mem_cgroup_charge_statistics(to, nr_pages); + memcg1_charge_statistics(to, nr_pages); memcg1_check_events(to, nid); - mem_cgroup_charge_statistics(from, -nr_pages); + memcg1_charge_statistics(from, -nr_pages); memcg1_check_events(from, nid); local_irq_enable(); out: @@ -1439,21 +1442,68 @@ static void mem_cgroup_threshold(struct mem_cgroup *memcg) } } +/* Cgroup1: threshold notifications & softlimit tree updates */ +struct memcg1_events_percpu { + unsigned long nr_page_events; + unsigned long targets[MEM_CGROUP_NTARGETS]; +}; + +static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages) +{ + /* pagein of a big page is an event. So, ignore page size */ + if (nr_pages > 0) + __count_memcg_events(memcg, PGPGIN, 1); + else { + __count_memcg_events(memcg, PGPGOUT, 1); + nr_pages = -nr_pages; /* for event */ + } + + __this_cpu_add(memcg->events_percpu->nr_page_events, nr_pages); +} + +#define THRESHOLDS_EVENTS_TARGET 128 +#define SOFTLIMIT_EVENTS_TARGET 1024 + +static bool memcg1_event_ratelimit(struct mem_cgroup *memcg, + enum mem_cgroup_events_target target) +{ + unsigned long val, next; + + val = __this_cpu_read(memcg->events_percpu->nr_page_events); + next = __this_cpu_read(memcg->events_percpu->targets[target]); + /* from time_after() in jiffies.h */ + if ((long)(next - val) < 0) { + switch (target) { + case MEM_CGROUP_TARGET_THRESH: + next = val + THRESHOLDS_EVENTS_TARGET; + break; + case MEM_CGROUP_TARGET_SOFTLIMIT: + next = val + SOFTLIMIT_EVENTS_TARGET; + break; + default: + break; + } + __this_cpu_write(memcg->events_percpu->targets[target], next); + return true; + } + return false; +} + /* * Check events in order. * */ -void memcg1_check_events(struct mem_cgroup *memcg, int nid) +static void memcg1_check_events(struct mem_cgroup *memcg, int nid) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) return; /* threshold event is triggered in finer grain than soft limit */ - if (unlikely(mem_cgroup_event_ratelimit(memcg, + if (unlikely(memcg1_event_ratelimit(memcg, MEM_CGROUP_TARGET_THRESH))) { bool do_softlimit; - do_softlimit = mem_cgroup_event_ratelimit(memcg, + do_softlimit = memcg1_event_ratelimit(memcg, MEM_CGROUP_TARGET_SOFTLIMIT); mem_cgroup_threshold(memcg); if (unlikely(do_softlimit)) @@ -1461,6 +1511,43 @@ void memcg1_check_events(struct mem_cgroup *memcg, int nid) } } +void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg) +{ + unsigned long flags; + + local_irq_save(flags); + memcg1_charge_statistics(memcg, folio_nr_pages(folio)); + memcg1_check_events(memcg, folio_nid(folio)); + local_irq_restore(flags); +} + +void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg) +{ + /* + * Interrupts should be disabled here because the caller holds the + * i_pages lock which is taken with interrupts-off. It is + * important here to have the interrupts disabled because it is the + * only synchronisation we have for updating the per-CPU variables. + */ + preempt_disable_nested(); + VM_WARN_ON_IRQS_ENABLED(); + memcg1_charge_statistics(memcg, -folio_nr_pages(folio)); + preempt_enable_nested(); + memcg1_check_events(memcg, folio_nid(folio)); +} + +void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, + unsigned long nr_memory, int nid) +{ + unsigned long flags; + + local_irq_save(flags); + __count_memcg_events(memcg, PGPGOUT, pgpgout); + __this_cpu_add(memcg->events_percpu->nr_page_events, nr_memory); + memcg1_check_events(memcg, nid); + local_irq_restore(flags); +} + static int compare_thresholds(const void *a, const void *b) { const struct mem_cgroup_threshold *_a = a; @@ -1907,9 +1994,15 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, event->register_event = mem_cgroup_usage_register_event; event->unregister_event = mem_cgroup_usage_unregister_event; } else if (!strcmp(name, "memory.oom_control")) { + pr_warn_once("oom_control is deprecated and will be removed. " + "Please report your usecase to linux-mm-@kvack.org" + " if you depend on this functionality. \n"); event->register_event = mem_cgroup_oom_register_event; event->unregister_event = mem_cgroup_oom_unregister_event; } else if (!strcmp(name, "memory.pressure_level")) { + pr_warn_once("pressure_level is deprecated and will be removed. " + "Please report your usecase to linux-mm-@kvack.org " + "if you depend on this functionality. \n"); event->register_event = vmpressure_register_event; event->unregister_event = vmpressure_unregister_event; } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { @@ -2447,6 +2540,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, ret = 0; break; case _TCP: + pr_warn_once("kmem.tcp.limit_in_bytes is deprecated and will be removed. " + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); ret = memcg_update_tcp_max(memcg, nr_pages); break; } @@ -2455,6 +2551,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, if (IS_ENABLED(CONFIG_PREEMPT_RT)) { ret = -EOPNOTSUPP; } else { + pr_warn_once("soft_limit_in_bytes is deprecated and will be removed. " + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); WRITE_ONCE(memcg->soft_limit, nr_pages); ret = 0; } @@ -2748,6 +2847,10 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + pr_warn_once("oom_control is deprecated and will be removed. " + "Please report your usecase to linux-mm-@kvack.org if you " + "depend on this functionality. \n"); + /* cannot set to root cgroup and only 0 and 1 are allowed */ if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1))) return -EINVAL; @@ -2952,6 +3055,19 @@ bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, return false; } +bool memcg1_alloc_events(struct mem_cgroup *memcg) +{ + memcg->events_percpu = alloc_percpu_gfp(struct memcg1_events_percpu, + GFP_KERNEL_ACCOUNT); + return !!memcg->events_percpu; +} + +void memcg1_free_events(struct mem_cgroup *memcg) +{ + if (memcg->events_percpu) + free_percpu(memcg->events_percpu); +} + static int __init memcg1_init(void) { int node; diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h index 56d7eaa98274..c0672e25bcdb 100644 --- a/mm/memcontrol-v1.h +++ b/mm/memcontrol-v1.h @@ -7,7 +7,6 @@ /* Cgroup v1 and v2 common declarations */ -void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, int nr_pages); int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned int nr_pages); @@ -56,8 +55,6 @@ enum mem_cgroup_events_target { MEM_CGROUP_NTARGETS, }; -bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, - enum mem_cgroup_events_target target); unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap); void drain_all_stock(struct mem_cgroup *root_memcg); @@ -71,6 +68,10 @@ int memory_stat_show(struct seq_file *m, void *v); /* Cgroup v1-specific declarations */ #ifdef CONFIG_MEMCG_V1 + +bool memcg1_alloc_events(struct mem_cgroup *memcg); +void memcg1_free_events(struct mem_cgroup *memcg); + void memcg1_memcg_init(struct mem_cgroup *memcg); void memcg1_remove_from_trees(struct mem_cgroup *memcg); @@ -99,7 +100,10 @@ bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked); void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked); void memcg1_oom_recover(struct mem_cgroup *memcg); -void memcg1_check_events(struct mem_cgroup *memcg, int nid); +void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg); +void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg); +void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, + unsigned long nr_memory, int nid); void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s); @@ -120,6 +124,9 @@ extern struct cftype mem_cgroup_legacy_files[]; #else /* CONFIG_MEMCG_V1 */ +static inline bool memcg1_alloc_events(struct mem_cgroup *memcg) { return true; } +static inline void memcg1_free_events(struct mem_cgroup *memcg) {} + static inline void memcg1_memcg_init(struct mem_cgroup *memcg) {} static inline void memcg1_remove_from_trees(struct mem_cgroup *memcg) {} static inline void memcg1_soft_limit_reset(struct mem_cgroup *memcg) {} @@ -130,7 +137,14 @@ static inline bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked) { static inline void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked) {} static inline void memcg1_oom_recover(struct mem_cgroup *memcg) {} -static inline void memcg1_check_events(struct mem_cgroup *memcg, int nid) {} +static inline void memcg1_commit_charge(struct folio *folio, + struct mem_cgroup *memcg) {} + +static inline void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg) {} + +static inline void memcg1_uncharge_batch(struct mem_cgroup *memcg, + unsigned long pgpgout, + unsigned long nr_memory, int nid) {} static inline void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) {} @@ -140,8 +154,6 @@ static inline bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr gfp_t gfp_mask) { return true; } static inline void memcg1_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) {} -extern struct cftype memsw_files[]; -extern struct cftype mem_cgroup_legacy_files[]; #endif /* CONFIG_MEMCG_V1 */ #endif /* __MM_MEMCONTROL_V1_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d563fb515766..7845c64a2c57 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -25,6 +25,7 @@ * Copyright (C) 2020 Alibaba, Inc, Alex Shi */ +#include #include #include #include @@ -41,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -93,9 +95,6 @@ static bool cgroup_memory_nobpf __ro_after_init; static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); #endif -#define THRESHOLDS_EVENTS_TARGET 128 -#define SOFTLIMIT_EVENTS_TARGET 1024 - static inline bool task_is_dying(void) { return tsk_is_oom_victim(current) || fatal_signal_pending(current) || @@ -305,6 +304,12 @@ static const unsigned int memcg_node_stat_items[] = { #ifdef CONFIG_SWAP NR_SWAPCACHE, #endif +#ifdef CONFIG_NUMA_BALANCING + PGPROMOTE_SUCCESS, +#endif + PGDEMOTE_KSWAPD, + PGDEMOTE_DIRECT, + PGDEMOTE_KHUGEPAGED, }; static const unsigned int memcg_stat_items[] = { @@ -320,24 +325,27 @@ static const unsigned int memcg_stat_items[] = { #define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items) #define MEMCG_VMSTAT_SIZE (NR_MEMCG_NODE_STAT_ITEMS + \ ARRAY_SIZE(memcg_stat_items)) -static int8_t mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly; +#define BAD_STAT_IDX(index) ((u32)(index) >= U8_MAX) +static u8 mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly; static void init_memcg_stats(void) { - int8_t i, j = 0; + u8 i, j = 0; - BUILD_BUG_ON(MEMCG_NR_STAT >= S8_MAX); + BUILD_BUG_ON(MEMCG_NR_STAT >= U8_MAX); - for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i) - mem_cgroup_stats_index[memcg_node_stat_items[i]] = ++j; + memset(mem_cgroup_stats_index, U8_MAX, sizeof(mem_cgroup_stats_index)); - for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i) - mem_cgroup_stats_index[memcg_stat_items[i]] = ++j; + for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i, ++j) + mem_cgroup_stats_index[memcg_node_stat_items[i]] = j; + + for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i, ++j) + mem_cgroup_stats_index[memcg_stat_items[i]] = j; } static inline int memcg_stats_index(int idx) { - return mem_cgroup_stats_index[idx] - 1; + return mem_cgroup_stats_index[idx]; } struct lruvec_stats_percpu { @@ -369,7 +377,7 @@ unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) return node_page_state(lruvec_pgdat(lruvec), idx); i = memcg_stats_index(idx); - if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return 0; pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); @@ -392,7 +400,7 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec, return node_page_state(lruvec_pgdat(lruvec), idx); i = memcg_stats_index(idx); - if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return 0; pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); @@ -406,8 +414,10 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec, /* Subset of vm_event_item to report for memcg event stats */ static const unsigned int memcg_vm_event_stat[] = { +#ifdef CONFIG_MEMCG_V1 PGPGIN, PGPGOUT, +#endif PGSCAN_KSWAPD, PGSCAN_DIRECT, PGSCAN_KHUGEPAGED, @@ -432,24 +442,32 @@ static const unsigned int memcg_vm_event_stat[] = { THP_SWPOUT, THP_SWPOUT_FALLBACK, #endif +#ifdef CONFIG_NUMA_BALANCING + NUMA_PAGE_MIGRATE, + NUMA_PTE_UPDATES, + NUMA_HINT_FAULTS, +#endif }; #define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat) -static int8_t mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly; +static u8 mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly; static void init_memcg_events(void) { - int8_t i; + u8 i; - BUILD_BUG_ON(NR_VM_EVENT_ITEMS >= S8_MAX); + BUILD_BUG_ON(NR_VM_EVENT_ITEMS >= U8_MAX); + + memset(mem_cgroup_events_index, U8_MAX, + sizeof(mem_cgroup_events_index)); for (i = 0; i < NR_MEMCG_EVENTS; ++i) - mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1; + mem_cgroup_events_index[memcg_vm_event_stat[i]] = i; } static inline int memcg_events_index(enum vm_event_item idx) { - return mem_cgroup_events_index[idx] - 1; + return mem_cgroup_events_index[idx]; } struct memcg_vmstats_percpu { @@ -469,10 +487,6 @@ struct memcg_vmstats_percpu { /* Delta calculation for lockless upward propagation */ long state_prev[MEMCG_VMSTAT_SIZE]; unsigned long events_prev[NR_MEMCG_EVENTS]; - - /* Cgroup1: threshold notifications & softlimit tree updates */ - unsigned long nr_page_events; - unsigned long targets[MEM_CGROUP_NTARGETS]; } ____cacheline_aligned; struct memcg_vmstats { @@ -621,7 +635,7 @@ unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) long x; int i = memcg_stats_index(idx); - if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return 0; x = READ_ONCE(memcg->vmstats->state[i]); @@ -662,7 +676,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, if (mem_cgroup_disabled()) return; - if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return; __this_cpu_add(memcg->vmstats_percpu->state[i], val); @@ -675,7 +689,7 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) long x; int i = memcg_stats_index(idx); - if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return 0; x = READ_ONCE(memcg->vmstats->state_local[i]); @@ -694,7 +708,7 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec, struct mem_cgroup *memcg; int i = memcg_stats_index(idx); - if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return; pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); @@ -810,7 +824,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, if (mem_cgroup_disabled()) return; - if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return; memcg_stats_lock(); @@ -823,7 +837,7 @@ unsigned long memcg_events(struct mem_cgroup *memcg, int event) { int i = memcg_events_index(event); - if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, event)) + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, event)) return 0; return READ_ONCE(memcg->vmstats->events[i]); @@ -833,50 +847,12 @@ unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) { int i = memcg_events_index(event); - if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, event)) + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, event)) return 0; return READ_ONCE(memcg->vmstats->events_local[i]); } -void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, int nr_pages) -{ - /* pagein of a big page is an event. So, ignore page size */ - if (nr_pages > 0) - __count_memcg_events(memcg, PGPGIN, 1); - else { - __count_memcg_events(memcg, PGPGOUT, 1); - nr_pages = -nr_pages; /* for event */ - } - - __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages); -} - -bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, - enum mem_cgroup_events_target target) -{ - unsigned long val, next; - - val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events); - next = __this_cpu_read(memcg->vmstats_percpu->targets[target]); - /* from time_after() in jiffies.h */ - if ((long)(next - val) < 0) { - switch (target) { - case MEM_CGROUP_TARGET_THRESH: - next = val + THRESHOLDS_EVENTS_TARGET; - break; - case MEM_CGROUP_TARGET_SOFTLIMIT: - next = val + SOFTLIMIT_EVENTS_TARGET; - break; - default: - break; - } - __this_cpu_write(memcg->vmstats_percpu->targets[target], next); - return true; - } - return false; -} - struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) { /* @@ -970,6 +946,24 @@ struct mem_cgroup *get_mem_cgroup_from_current(void) return memcg; } +/** + * get_mem_cgroup_from_folio - Obtain a reference on a given folio's memcg. + * @folio: folio from which memcg should be extracted. + */ +struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio) +{ + struct mem_cgroup *memcg = folio_memcg(folio); + + if (mem_cgroup_disabled()) + return NULL; + + rcu_read_lock(); + if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css))) + memcg = root_mem_cgroup; + rcu_read_unlock(); + return memcg; +} + /** * mem_cgroup_iter - iterate over memory cgroup hierarchy * @root: hierarchy root @@ -992,9 +986,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup_reclaim_cookie *reclaim) { struct mem_cgroup_reclaim_iter *iter; - struct cgroup_subsys_state *css = NULL; - struct mem_cgroup *memcg = NULL; - struct mem_cgroup *pos = NULL; + struct cgroup_subsys_state *css; + struct mem_cgroup *pos; + struct mem_cgroup *next; if (mem_cgroup_disabled()) return NULL; @@ -1003,81 +997,67 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, root = root_mem_cgroup; rcu_read_lock(); +restart: + next = NULL; if (reclaim) { - struct mem_cgroup_per_node *mz; + int gen; + int nid = reclaim->pgdat->node_id; - mz = root->nodeinfo[reclaim->pgdat->node_id]; - iter = &mz->iter; + iter = &root->nodeinfo[nid]->iter; + gen = atomic_read(&iter->generation); /* * On start, join the current reclaim iteration cycle. * Exit when a concurrent walker completes it. */ if (!prev) - reclaim->generation = iter->generation; - else if (reclaim->generation != iter->generation) + reclaim->generation = gen; + else if (reclaim->generation != gen) goto out_unlock; - while (1) { - pos = READ_ONCE(iter->position); - if (!pos || css_tryget(&pos->css)) - break; - /* - * css reference reached zero, so iter->position will - * be cleared by ->css_released. However, we should not - * rely on this happening soon, because ->css_released - * is called from a work queue, and by busy-waiting we - * might block it. So we clear iter->position right - * away. - */ - (void)cmpxchg(&iter->position, pos, NULL); - } - } else if (prev) { + pos = READ_ONCE(iter->position); + } else pos = prev; - } - if (pos) - css = &pos->css; - - for (;;) { - css = css_next_descendant_pre(css, &root->css); - if (!css) { - /* - * Reclaimers share the hierarchy walk, and a - * new one might jump in right at the end of - * the hierarchy - make sure they see at least - * one group and restart from the beginning. - */ - if (!prev) - continue; - break; - } + css = pos ? &pos->css : NULL; + while ((css = css_next_descendant_pre(css, &root->css))) { /* * Verify the css and acquire a reference. The root * is provided by the caller, so we know it's alive * and kicking, and don't take an extra reference. */ - if (css == &root->css || css_tryget(css)) { - memcg = mem_cgroup_from_css(css); + if (css == &root->css || css_tryget(css)) break; - } } + next = mem_cgroup_from_css(css); + if (reclaim) { /* * The position could have already been updated by a competing * thread, so check that the value hasn't changed since we read * it to avoid reclaiming from the same cgroup twice. */ - (void)cmpxchg(&iter->position, pos, memcg); + if (cmpxchg(&iter->position, pos, next) != pos) { + if (css && css != &root->css) + css_put(css); + goto restart; + } - if (pos) - css_put(&pos->css); + if (!next) { + atomic_inc(&iter->generation); - if (!memcg) - iter->generation++; + /* + * Reclaimers share the hierarchy walk, and a + * new one might jump in right at the end of + * the hierarchy - make sure they see at least + * one group and restart from the beginning. + */ + if (!prev) + goto restart; + } } out_unlock: @@ -1085,7 +1065,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (prev && prev != root) css_put(&prev->css); - return memcg; + return next; } /** @@ -1375,6 +1355,13 @@ static const struct memory_stat memory_stats[] = { { "workingset_restore_anon", WORKINGSET_RESTORE_ANON }, { "workingset_restore_file", WORKINGSET_RESTORE_FILE }, { "workingset_nodereclaim", WORKINGSET_NODERECLAIM }, + + { "pgdemote_kswapd", PGDEMOTE_KSWAPD }, + { "pgdemote_direct", PGDEMOTE_DIRECT }, + { "pgdemote_khugepaged", PGDEMOTE_KHUGEPAGED }, +#ifdef CONFIG_NUMA_BALANCING + { "pgpromote_success", PGPROMOTE_SUCCESS }, +#endif }; /* The actual unit of the state item, not the same as the output unit */ @@ -1399,6 +1386,9 @@ static int memcg_page_state_output_unit(int item) /* * Workingset state is actually in pages, but we export it to userspace * as a scalar count of events, so special case it here. + * + * Demotion and promotion activities are exported in pages, consistent + * with their global counterparts. */ switch (item) { case WORKINGSET_REFAULT_ANON: @@ -1408,6 +1398,12 @@ static int memcg_page_state_output_unit(int item) case WORKINGSET_RESTORE_ANON: case WORKINGSET_RESTORE_FILE: case WORKINGSET_NODERECLAIM: + case PGDEMOTE_KSWAPD: + case PGDEMOTE_DIRECT: + case PGDEMOTE_KHUGEPAGED: +#ifdef CONFIG_NUMA_BALANCING + case PGPROMOTE_SUCCESS: +#endif return 1; default: return memcg_page_state_unit(item); @@ -1466,10 +1462,11 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) memcg_events(memcg, PGSTEAL_KHUGEPAGED)); for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) { +#ifdef CONFIG_MEMCG_V1 if (memcg_vm_event_stat[i] == PGPGIN || memcg_vm_event_stat[i] == PGPGOUT) continue; - +#endif seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg_vm_event_stat[i]), memcg_events(memcg, memcg_vm_event_stat[i])); @@ -2366,7 +2363,7 @@ void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) { - VM_BUG_ON_FOLIO(folio_memcg(folio), folio); + VM_BUG_ON_FOLIO(folio_memcg_charged(folio), folio); /* * Any of the following ensures page's memcg stability: * @@ -2388,11 +2385,7 @@ void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg) { css_get(&memcg->css); commit_charge(folio, memcg); - - local_irq_disable(); - mem_cgroup_charge_statistics(memcg, folio_nr_pages(folio)); - memcg1_check_events(memcg, folio_nid(folio)); - local_irq_enable(); + memcg1_commit_charge(folio, memcg); } static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg, @@ -2446,37 +2439,7 @@ struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) /* * Returns a pointer to the memory cgroup to which the kernel object is charged. - * - * A passed kernel object can be a slab object, vmalloc object or a generic - * kernel page, so different mechanisms for getting the memory cgroup pointer - * should be used. - * - * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller - * can not know for sure how the kernel object is implemented. - * mem_cgroup_from_obj() can be safely used in such cases. - * - * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), - * cgroup_mutex, etc. - */ -struct mem_cgroup *mem_cgroup_from_obj(void *p) -{ - struct folio *folio; - - if (mem_cgroup_disabled()) - return NULL; - - if (unlikely(is_vmalloc_addr(p))) - folio = page_folio(vmalloc_to_page(p)); - else - folio = virt_to_folio(p); - - return mem_cgroup_from_obj_folio(folio, p); -} - -/* - * Returns a pointer to the memory cgroup to which the kernel object is charged. - * Similar to mem_cgroup_from_obj(), but faster and not suitable for objects, - * allocated using vmalloc(). + * It is not suitable for objects allocated using vmalloc(). * * A passed kernel object must be a slab object or a generic kernel page. * @@ -3057,12 +3020,11 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void split_page_memcg(struct page *head, int old_order, int new_order) { struct folio *folio = page_folio(head); - struct mem_cgroup *memcg = folio_memcg(folio); int i; unsigned int old_nr = 1 << old_order; unsigned int new_nr = 1 << new_order; - if (mem_cgroup_disabled() || !memcg) + if (mem_cgroup_disabled() || !folio_memcg_charged(folio)) return; for (i = new_nr; i < old_nr; i += new_nr) @@ -3071,7 +3033,7 @@ void split_page_memcg(struct page *head, int old_order, int new_order) if (folio_memcg_kmem(folio)) obj_cgroup_get_many(__folio_objcg(folio), old_nr / new_nr - 1); else - css_get_many(&memcg->css, old_nr / new_nr - 1); + css_get_many(&folio_memcg(folio)->css, old_nr / new_nr - 1); } unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) @@ -3385,29 +3347,12 @@ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) */ #define MEM_CGROUP_ID_MAX ((1UL << MEM_CGROUP_ID_SHIFT) - 1) -static DEFINE_IDR(mem_cgroup_idr); -static DEFINE_SPINLOCK(memcg_idr_lock); - -static int mem_cgroup_alloc_id(void) -{ - int ret; - - idr_preload(GFP_KERNEL); - spin_lock(&memcg_idr_lock); - ret = idr_alloc(&mem_cgroup_idr, NULL, 1, MEM_CGROUP_ID_MAX + 1, - GFP_NOWAIT); - spin_unlock(&memcg_idr_lock); - idr_preload_end(); - return ret; -} +static DEFINE_XARRAY_ALLOC1(mem_cgroup_ids); static void mem_cgroup_id_remove(struct mem_cgroup *memcg) { if (memcg->id.id > 0) { - spin_lock(&memcg_idr_lock); - idr_remove(&mem_cgroup_idr, memcg->id.id); - spin_unlock(&memcg_idr_lock); - + xa_erase(&mem_cgroup_ids, memcg->id.id); memcg->id.id = 0; } } @@ -3442,7 +3387,7 @@ static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) struct mem_cgroup *mem_cgroup_from_id(unsigned short id) { WARN_ON_ONCE(!rcu_read_lock_held()); - return idr_find(&mem_cgroup_idr, id); + return xa_load(&mem_cgroup_ids, id); } #ifdef CONFIG_SHRINKER_DEBUG @@ -3517,6 +3462,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); + memcg1_free_events(memcg); kfree(memcg->vmstats); free_percpu(memcg->vmstats_percpu); kfree(memcg); @@ -3535,17 +3481,17 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) struct mem_cgroup *memcg; int node, cpu; int __maybe_unused i; - long error = -ENOMEM; + long error; memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL); if (!memcg) - return ERR_PTR(error); + return ERR_PTR(-ENOMEM); - memcg->id.id = mem_cgroup_alloc_id(); - if (memcg->id.id < 0) { - error = memcg->id.id; + error = xa_alloc(&mem_cgroup_ids, &memcg->id.id, NULL, + XA_LIMIT(1, MEM_CGROUP_ID_MAX), GFP_KERNEL); + if (error) goto fail; - } + error = -ENOMEM; memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), GFP_KERNEL_ACCOUNT); @@ -3557,6 +3503,9 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) if (!memcg->vmstats_percpu) goto fail; + if (!memcg1_alloc_events(memcg)) + goto fail; + for_each_possible_cpu(cpu) { if (parent) pstatc = per_cpu_ptr(parent->vmstats_percpu, cpu); @@ -3574,6 +3523,9 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) INIT_WORK(&memcg->high_work, high_work_func); vmpressure_init(&memcg->vmpressure); + INIT_LIST_HEAD(&memcg->memory_peaks); + INIT_LIST_HEAD(&memcg->swap_peaks); + spin_lock_init(&memcg->peaks_lock); memcg->socket_pressure = jiffies; memcg1_memcg_init(memcg); memcg->kmemcg_id = -1; @@ -3619,21 +3571,21 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (parent) { WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent)); - page_counter_init(&memcg->memory, &parent->memory); - page_counter_init(&memcg->swap, &parent->swap); + page_counter_init(&memcg->memory, &parent->memory, true); + page_counter_init(&memcg->swap, &parent->swap, false); #ifdef CONFIG_MEMCG_V1 WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable)); - page_counter_init(&memcg->kmem, &parent->kmem); - page_counter_init(&memcg->tcpmem, &parent->tcpmem); + page_counter_init(&memcg->kmem, &parent->kmem, false); + page_counter_init(&memcg->tcpmem, &parent->tcpmem, false); #endif } else { init_memcg_stats(); init_memcg_events(); - page_counter_init(&memcg->memory, NULL); - page_counter_init(&memcg->swap, NULL); + page_counter_init(&memcg->memory, NULL, true); + page_counter_init(&memcg->swap, NULL, false); #ifdef CONFIG_MEMCG_V1 - page_counter_init(&memcg->kmem, NULL); - page_counter_init(&memcg->tcpmem, NULL); + page_counter_init(&memcg->kmem, NULL, false); + page_counter_init(&memcg->tcpmem, NULL, false); #endif root_mem_cgroup = memcg; return &memcg->css; @@ -3682,9 +3634,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) * publish it here at the end of onlining. This matches the * regular ID destruction during offlining. */ - spin_lock(&memcg_idr_lock); - idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); - spin_unlock(&memcg_idr_lock); + xa_store(&mem_cgroup_ids, memcg->id.id, memcg, GFP_KERNEL); return 0; offline_kmem: @@ -3967,14 +3917,91 @@ static u64 memory_current_read(struct cgroup_subsys_state *css, return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; } -static u64 memory_peak_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - struct mem_cgroup *memcg = mem_cgroup_from_css(css); +#define OFP_PEAK_UNSET (((-1UL))) - return (u64)memcg->memory.watermark * PAGE_SIZE; +static int peak_show(struct seq_file *sf, void *v, struct page_counter *pc) +{ + struct cgroup_of_peak *ofp = of_peak(sf->private); + u64 fd_peak = READ_ONCE(ofp->value), peak; + + /* User wants global or local peak? */ + if (fd_peak == OFP_PEAK_UNSET) + peak = pc->watermark; + else + peak = max(fd_peak, READ_ONCE(pc->local_watermark)); + + seq_printf(sf, "%llu\n", peak * PAGE_SIZE); + return 0; } +static int memory_peak_show(struct seq_file *sf, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); + + return peak_show(sf, v, &memcg->memory); +} + +static int peak_open(struct kernfs_open_file *of) +{ + struct cgroup_of_peak *ofp = of_peak(of); + + ofp->value = OFP_PEAK_UNSET; + return 0; +} + +static void peak_release(struct kernfs_open_file *of) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + struct cgroup_of_peak *ofp = of_peak(of); + + if (ofp->value == OFP_PEAK_UNSET) { + /* fast path (no writes on this fd) */ + return; + } + spin_lock(&memcg->peaks_lock); + list_del(&ofp->list); + spin_unlock(&memcg->peaks_lock); +} + +static ssize_t peak_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off, struct page_counter *pc, + struct list_head *watchers) +{ + unsigned long usage; + struct cgroup_of_peak *peer_ctx; + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + struct cgroup_of_peak *ofp = of_peak(of); + + spin_lock(&memcg->peaks_lock); + + usage = page_counter_read(pc); + WRITE_ONCE(pc->local_watermark, usage); + + list_for_each_entry(peer_ctx, watchers, list) + if (usage > peer_ctx->value) + WRITE_ONCE(peer_ctx->value, usage); + + /* initial write, register watcher */ + if (ofp->value == -1) + list_add(&ofp->list, watchers); + + WRITE_ONCE(ofp->value, usage); + spin_unlock(&memcg->peaks_lock); + + return nbytes; +} + +static ssize_t memory_peak_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + + return peak_write(of, buf, nbytes, off, &memcg->memory, + &memcg->memory_peaks); +} + +#undef OFP_PEAK_UNSET + static int memory_min_show(struct seq_file *m, void *v) { return seq_puts_memcg_tunable(m, @@ -4324,7 +4351,10 @@ static struct cftype memory_files[] = { { .name = "peak", .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = memory_peak_read, + .open = peak_open, + .release = peak_release, + .seq_show = memory_peak_show, + .write = memory_peak_write, }, { .name = "min", @@ -4528,14 +4558,15 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, /* * mem_cgroup_swapin_uncharge_swap - uncharge swap slot - * @entry: swap entry for which the page is charged + * @entry: the first swap entry for which the pages are charged + * @nr_pages: number of pages which will be uncharged * * Call this function after successfully adding the charged page to swapcache. * * Note: This function assumes the page for which swap slot is being uncharged * is order 0 page. */ -void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) +void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) { /* * Cgroup1's unified memory+swap counter has been charged with the @@ -4555,7 +4586,7 @@ void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) * let's not wait for it. The page already received a * memory+swap charge, drop the swap entry duplicate. */ - mem_cgroup_uncharge_swap(entry, 1); + mem_cgroup_uncharge_swap(entry, nr_pages); } } @@ -4574,8 +4605,6 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug) static void uncharge_batch(const struct uncharge_gather *ug) { - unsigned long flags; - if (ug->nr_memory) { page_counter_uncharge(&ug->memcg->memory, ug->nr_memory); if (do_memsw_account()) @@ -4587,11 +4616,7 @@ static void uncharge_batch(const struct uncharge_gather *ug) memcg1_oom_recover(ug->memcg); } - local_irq_save(flags); - __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); - __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory); - memcg1_check_events(ug->memcg, ug->nid); - local_irq_restore(flags); + memcg1_uncharge_batch(ug->memcg, ug->pgpgout, ug->nr_memory, ug->nid); /* drop reference from uncharge_folio */ css_put(&ug->memcg->css); @@ -4606,7 +4631,8 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); VM_BUG_ON_FOLIO(folio_order(folio) > 1 && !folio_test_hugetlb(folio) && - !list_empty(&folio->_deferred_list), folio); + !list_empty(&folio->_deferred_list) && + folio_test_partially_mapped(folio), folio); /* * Nobody should be changing or seriously looking at @@ -4664,7 +4690,7 @@ void __mem_cgroup_uncharge(struct folio *folio) struct uncharge_gather ug; /* Don't touch folio->lru of any random page, pre-check: */ - if (!folio_memcg(folio)) + if (!folio_memcg_charged(folio)) return; uncharge_gather_clear(&ug); @@ -4698,7 +4724,6 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new) { struct mem_cgroup *memcg; long nr_pages = folio_nr_pages(new); - unsigned long flags; VM_BUG_ON_FOLIO(!folio_test_locked(old), old); VM_BUG_ON_FOLIO(!folio_test_locked(new), new); @@ -4709,7 +4734,7 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new) return; /* Page cache replacement: new folio already charged? */ - if (folio_memcg(new)) + if (folio_memcg_charged(new)) return; memcg = folio_memcg(old); @@ -4726,11 +4751,7 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new) css_get(&memcg->css); commit_charge(new, memcg); - - local_irq_save(flags); - mem_cgroup_charge_statistics(memcg, nr_pages); - memcg1_check_events(memcg, folio_nid(new)); - local_irq_restore(flags); + memcg1_commit_charge(new, memcg); } /** @@ -4966,17 +4987,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) page_counter_uncharge(&memcg->memsw, nr_entries); } - /* - * Interrupts should be disabled here because the caller holds the - * i_pages lock which is taken with interrupts-off. It is - * important here to have the interrupts disabled because it is the - * only synchronisation we have for updating the per-CPU variables. - */ - memcg_stats_lock(); - mem_cgroup_charge_statistics(memcg, -nr_entries); - memcg_stats_unlock(); - memcg1_check_events(memcg, folio_nid(folio)); - + memcg1_swapout(folio, memcg); css_put(&memcg->css); } @@ -5116,12 +5127,20 @@ static u64 swap_current_read(struct cgroup_subsys_state *css, return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; } -static u64 swap_peak_read(struct cgroup_subsys_state *css, - struct cftype *cft) +static int swap_peak_show(struct seq_file *sf, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); - return (u64)memcg->swap.watermark * PAGE_SIZE; + return peak_show(sf, v, &memcg->swap); +} + +static ssize_t swap_peak_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + + return peak_write(of, buf, nbytes, off, &memcg->swap, + &memcg->swap_peaks); } static int swap_high_show(struct seq_file *m, void *v) @@ -5205,7 +5224,10 @@ static struct cftype swap_files[] = { { .name = "swap.peak", .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = swap_peak_read, + .open = peak_open, + .release = peak_release, + .seq_show = swap_peak_show, + .write = swap_peak_write, }, { .name = "swap.events", diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 7066fc84f351..96ce31e5a203 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1554,6 +1554,32 @@ static int get_hwpoison_page(struct page *p, unsigned long flags) return ret; } +void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) +{ + if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) { + struct address_space *mapping; + + /* + * For hugetlb folios in shared mappings, try_to_unmap + * could potentially call huge_pmd_unshare. Because of + * this, take semaphore in write mode here and set + * TTU_RMAP_LOCKED to indicate we have taken the lock + * at this higher level. + */ + mapping = hugetlb_folio_mapping_lock_write(folio); + if (!mapping) { + pr_info("%#lx: could not lock mapping for mapped hugetlb folio\n", + folio_pfn(folio)); + return; + } + + try_to_unmap(folio, ttu|TTU_RMAP_LOCKED); + i_mmap_unlock_write(mapping); + } else { + try_to_unmap(folio, ttu); + } +} + /* * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. @@ -1615,23 +1641,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p, */ collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED); - if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) { - /* - * For hugetlb pages in shared mappings, try_to_unmap - * could potentially call huge_pmd_unshare. Because of - * this, take semaphore in write mode here and set - * TTU_RMAP_LOCKED to indicate we have taken the lock - * at this higher level. - */ - mapping = hugetlb_folio_mapping_lock_write(folio); - if (mapping) { - try_to_unmap(folio, ttu|TTU_RMAP_LOCKED); - i_mmap_unlock_write(mapping); - } else - pr_info("%#lx: could not lock mapping for mapped huge page\n", pfn); - } else { - try_to_unmap(folio, ttu); - } + unmap_poisoned_folio(folio, ttu); unmap_success = !folio_mapped(folio); if (!unmap_success) @@ -2643,40 +2653,6 @@ EXPORT_SYMBOL(unpoison_memory); #undef pr_fmt #define pr_fmt(fmt) "Soft offline: " fmt -static bool mf_isolate_folio(struct folio *folio, struct list_head *pagelist) -{ - bool isolated = false; - - if (folio_test_hugetlb(folio)) { - isolated = isolate_hugetlb(folio, pagelist); - } else { - bool lru = !__folio_test_movable(folio); - - if (lru) - isolated = folio_isolate_lru(folio); - else - isolated = isolate_movable_page(&folio->page, - ISOLATE_UNEVICTABLE); - - if (isolated) { - list_add(&folio->lru, pagelist); - if (lru) - node_stat_add_folio(folio, NR_ISOLATED_ANON + - folio_is_file_lru(folio)); - } - } - - /* - * If we succeed to isolate the folio, we grabbed another refcount on - * the folio, so we can safely drop the one we got from get_any_page(). - * If we failed to isolate the folio, it means that we cannot go further - * and we will return an error, so drop the reference we got from - * get_any_page() as well. - */ - folio_put(folio); - return isolated; -} - /* * soft_offline_in_use_page handles hugetlb-pages and non-hugetlb pages. * If the page is a non-dirty unmapped page-cache page, it simply invalidates. @@ -2689,6 +2665,7 @@ static int soft_offline_in_use_page(struct page *page) struct folio *folio = page_folio(page); char const *msg_page[] = {"page", "hugepage"}; bool huge = folio_test_hugetlb(folio); + bool isolated; LIST_HEAD(pagelist); struct migration_target_control mtc = { .nid = NUMA_NO_NODE, @@ -2728,7 +2705,18 @@ static int soft_offline_in_use_page(struct page *page) return 0; } - if (mf_isolate_folio(folio, &pagelist)) { + isolated = isolate_folio_to_list(folio, &pagelist); + + /* + * If we succeed to isolate the folio, we grabbed another refcount on + * the folio, so we can safely drop the one we got from get_any_page(). + * If we failed to isolate the folio, it means that we cannot go further + * and we will return an error, so drop the reference we got from + * get_any_page() as well. + */ + folio_put(folio); + + if (isolated) { ret = migrate_pages(&pagelist, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE, NULL); if (!ret) { diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 4775b3a3dabe..ed7607f692bd 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "internal.h" @@ -50,6 +51,24 @@ static const struct bus_type memory_tier_subsys = { .dev_name = "memory_tier", }; +#ifdef CONFIG_NUMA_BALANCING +/** + * folio_use_access_time - check if a folio reuses cpupid for page access time + * @folio: folio to check + * + * folio's _last_cpupid field is repurposed by memory tiering. In memory + * tiering mode, cpupid of slow memory folio (not toptier memory) is used to + * record page access time. + * + * Return: the folio _last_cpupid is used to record page access time + */ +bool folio_use_access_time(struct folio *folio) +{ + return (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && + !node_is_toptier(folio_nid(folio)); +} +#endif + #ifdef CONFIG_MIGRATION static int top_tier_adistance; /* @@ -895,13 +914,14 @@ static int __init memory_tier_init(void) WARN_ON(!node_demotion); #endif - guard(mutex)(&memory_tier_lock); + mutex_lock(&memory_tier_lock); /* * For now we can have 4 faster memory tiers with smaller adistance * than default DRAM tier. */ default_dram_type = mt_find_alloc_memory_type(MEMTIER_ADISTANCE_DRAM, &default_memory_types); + mutex_unlock(&memory_tier_lock); if (IS_ERR(default_dram_type)) panic("%s() failed to allocate default DRAM tier\n", __func__); diff --git a/mm/memory.c b/mm/memory.c index cda2c12c500b..2366578015ad 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -666,17 +666,16 @@ struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, return NULL; } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { unsigned long pfn = pmd_pfn(pmd); - /* - * There is no pmd_special() but there may be special pmds, e.g. - * in a direct-access (dax) mapping, so let's just replicate the - * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here. - */ + /* Currently it's only used for huge pfnmaps */ + if (unlikely(pmd_special(pmd))) + return NULL; + if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { if (!pfn_valid(pfn)) @@ -927,8 +926,11 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma * We have a prealloc page, all good! Take it * over and copy the page & arm it. */ + + if (copy_mc_user_highpage(&new_folio->page, page, addr, src_vma)) + return -EHWPOISON; + *prealloc = NULL; - copy_user_highpage(&new_folio->page, page, addr, src_vma); __folio_mark_uptodate(new_folio); folio_add_new_anon_rmap(new_folio, dst_vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(new_folio, dst_vma); @@ -1167,8 +1169,9 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, /* * If we need a pre-allocated page for this pte, drop the * locks, allocate, and try again. + * If copy failed due to hwpoison in source page, break out. */ - if (unlikely(ret == -EAGAIN)) + if (unlikely(ret == -EAGAIN || ret == -EHWPOISON)) break; if (unlikely(prealloc)) { /* @@ -1198,7 +1201,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, goto out; } entry.val = 0; - } else if (ret == -EBUSY) { + } else if (ret == -EBUSY || unlikely(ret == -EHWPOISON)) { goto out; } else if (ret == -EAGAIN) { prealloc = folio_prealloc(src_mm, src_vma, addr, false); @@ -4001,6 +4004,194 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) return VM_FAULT_SIGBUS; } +static struct folio *__alloc_swap_folio(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct folio *folio; + swp_entry_t entry; + + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, + vmf->address, false); + if (!folio) + return NULL; + + entry = pte_to_swp_entry(vmf->orig_pte); + if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, + GFP_KERNEL, entry)) { + folio_put(folio); + return NULL; + } + + return folio; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) +{ + struct swap_info_struct *si = swp_swap_info(entry); + pgoff_t offset = swp_offset(entry); + int i; + + /* + * While allocating a large folio and doing swap_read_folio, which is + * the case the being faulted pte doesn't have swapcache. We need to + * ensure all PTEs have no cache as well, otherwise, we might go to + * swap devices while the content is in swapcache. + */ + for (i = 0; i < max_nr; i++) { + if ((si->swap_map[offset + i] & SWAP_HAS_CACHE)) + return i; + } + + return i; +} + +/* + * Check if the PTEs within a range are contiguous swap entries + * and have consistent swapcache, zeromap. + */ +static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages) +{ + unsigned long addr; + swp_entry_t entry; + int idx; + pte_t pte; + + addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE); + idx = (vmf->address - addr) / PAGE_SIZE; + pte = ptep_get(ptep); + + if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx))) + return false; + entry = pte_to_swp_entry(pte); + if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages) + return false; + + /* + * swap_read_folio() can't handle the case a large folio is hybridly + * from different backends. And they are likely corner cases. Similar + * things might be added once zswap support large folios. + */ + if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages)) + return false; + if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages)) + return false; + + return true; +} + +static inline unsigned long thp_swap_suitable_orders(pgoff_t swp_offset, + unsigned long addr, + unsigned long orders) +{ + int order, nr; + + order = highest_order(orders); + + /* + * To swap in a THP with nr pages, we require that its first swap_offset + * is aligned with that number, as it was when the THP was swapped out. + * This helps filter out most invalid entries. + */ + while (orders) { + nr = 1 << order; + if ((addr >> PAGE_SHIFT) % nr == swp_offset % nr) + break; + order = next_order(&orders, order); + } + + return orders; +} + +static struct folio *alloc_swap_folio(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + unsigned long orders; + struct folio *folio; + unsigned long addr; + swp_entry_t entry; + spinlock_t *ptl; + pte_t *pte; + gfp_t gfp; + int order; + + /* + * If uffd is active for the vma we need per-page fault fidelity to + * maintain the uffd semantics. + */ + if (unlikely(userfaultfd_armed(vma))) + goto fallback; + + /* + * A large swapped out folio could be partially or fully in zswap. We + * lack handling for such cases, so fallback to swapping in order-0 + * folio. + */ + if (!zswap_never_enabled()) + goto fallback; + + entry = pte_to_swp_entry(vmf->orig_pte); + /* + * Get a list of all the (large) orders below PMD_ORDER that are enabled + * and suitable for swapping THP. + */ + orders = thp_vma_allowable_orders(vma, vma->vm_flags, + TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); + orders = thp_vma_suitable_orders(vma, vmf->address, orders); + orders = thp_swap_suitable_orders(swp_offset(entry), + vmf->address, orders); + + if (!orders) + goto fallback; + + pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, + vmf->address & PMD_MASK, &ptl); + if (unlikely(!pte)) + goto fallback; + + /* + * For do_swap_page, find the highest order where the aligned range is + * completely swap entries with contiguous swap offsets. + */ + order = highest_order(orders); + while (orders) { + addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); + if (can_swapin_thp(vmf, pte + pte_index(addr), 1 << order)) + break; + order = next_order(&orders, order); + } + + pte_unmap_unlock(pte, ptl); + + /* Try allocating the highest of the remaining orders. */ + gfp = vma_thp_gfp_mask(vma); + while (orders) { + addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); + folio = vma_alloc_folio(gfp, order, vma, addr, true); + if (folio) { + if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, + gfp, entry)) + return folio; + folio_put(folio); + } + order = next_order(&orders, order); + } + +fallback: + return __alloc_swap_folio(vmf); +} +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ +static inline bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages) +{ + return false; +} + +static struct folio *alloc_swap_folio(struct vm_fault *vmf) +{ + return __alloc_swap_folio(vmf); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. @@ -4089,35 +4280,34 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (!folio) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1) { - /* - * Prevent parallel swapin from proceeding with - * the cache flag. Otherwise, another thread may - * finish swapin first, free the entry, and swapout - * reusing the same entry. It's undetectable as - * pte_same() returns true due to entry reuse. - */ - if (swapcache_prepare(entry)) { - /* Relax a bit to prevent rapid repeated page faults */ - schedule_timeout_uninterruptible(1); - goto out; - } - need_clear_cache = true; - /* skip swapcache */ - folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, - vma, vmf->address, false); - page = &folio->page; + folio = alloc_swap_folio(vmf); if (folio) { __folio_set_locked(folio); __folio_set_swapbacked(folio); - if (mem_cgroup_swapin_charge_folio(folio, - vma->vm_mm, GFP_KERNEL, - entry)) { - ret = VM_FAULT_OOM; + nr_pages = folio_nr_pages(folio); + if (folio_test_large(folio)) + entry.val = ALIGN_DOWN(entry.val, nr_pages); + /* + * Prevent parallel swapin from proceeding with + * the cache flag. Otherwise, another thread + * may finish swapin first, free the entry, and + * swapout reusing the same entry. It's + * undetectable as pte_same() returns true due + * to entry reuse. + */ + if (swapcache_prepare(entry, nr_pages)) { + /* + * Relax a bit to prevent rapid + * repeated page faults. + */ + schedule_timeout_uninterruptible(1); goto out_page; } - mem_cgroup_swapin_uncharge_swap(entry); + need_clear_cache = true; + + mem_cgroup_swapin_uncharge_swap(entry, nr_pages); shadow = get_shadow_from_swap_cache(entry); if (shadow) @@ -4131,10 +4321,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) folio->private = NULL; } } else { - page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, + folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf); - if (page) - folio = page_folio(page); swapcache = folio; } @@ -4155,6 +4343,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); + page = folio_file_page(folio, swp_offset(entry)); } else if (PageHWPoison(page)) { /* * hwpoisoned dirty swapcache pages are kept for killing @@ -4224,6 +4413,24 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out_nomap; } + /* allocated large folios for SWP_SYNCHRONOUS_IO */ + if (folio_test_large(folio) && !folio_test_swapcache(folio)) { + unsigned long nr = folio_nr_pages(folio); + unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE); + unsigned long idx = (vmf->address - folio_start) / PAGE_SIZE; + pte_t *folio_ptep = vmf->pte - idx; + pte_t folio_pte = ptep_get(folio_ptep); + + if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) || + swap_pte_batch(folio_ptep, nr, folio_pte) != nr) + goto out_nomap; + + page_idx = idx; + address = folio_start; + ptep = folio_ptep; + goto check_folio; + } + nr_pages = 1; page_idx = 0; address = vmf->address; @@ -4355,11 +4562,12 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) folio_add_lru_vma(folio, vma); } else if (!folio_test_anon(folio)) { /* - * We currently only expect small !anon folios, which are either - * fully exclusive or fully shared. If we ever get large folios - * here, we have to be careful. + * We currently only expect small !anon folios which are either + * fully exclusive or fully shared, or new allocated large + * folios which are fully exclusive. If we ever get large + * folios within swapcache here, we have to be careful. */ - VM_WARN_ON_ONCE(folio_test_large(folio)); + VM_WARN_ON_ONCE(folio_test_large(folio) && folio_test_swapcache(folio)); VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); folio_add_new_anon_rmap(folio, vma, address, rmap_flags); } else { @@ -4402,7 +4610,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) out: /* Clear the swap cache pin for direct swapin after PTL unlock */ if (need_clear_cache) - swapcache_clear(si, entry); + swapcache_clear(si, entry, nr_pages); if (si) put_swap_device(si); return ret; @@ -4418,7 +4626,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) folio_put(swapcache); } if (need_clear_cache) - swapcache_clear(si, entry); + swapcache_clear(si, entry, nr_pages); if (si) put_swap_device(si); return ret; @@ -4612,9 +4820,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) folio_ref_add(folio, nr_pages - 1); add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC); -#endif folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); setpte: @@ -5109,10 +5315,14 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) if (ret & VM_FAULT_DONE_COW) return ret; - copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma); + if (copy_mc_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma)) { + ret = VM_FAULT_HWPOISON; + goto unlock; + } __folio_mark_uptodate(folio); ret |= finish_fault(vmf); +unlock: unlock_page(vmf->page); put_page(vmf->page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) @@ -5217,16 +5427,46 @@ static vm_fault_t do_fault(struct vm_fault *vmf) return ret; } -int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf, - unsigned long addr, int page_nid, int *flags) +int numa_migrate_check(struct folio *folio, struct vm_fault *vmf, + unsigned long addr, int *flags, + bool writable, int *last_cpupid) { struct vm_area_struct *vma = vmf->vma; + /* + * Avoid grouping on RO pages in general. RO pages shouldn't hurt as + * much anyway since they can be in shared cache state. This misses + * the case where a mapping is writable but the process never writes + * to it but pte_write gets cleared during protection updates and + * pte_dirty has unpredictable behaviour between PTE scan updates, + * background writeback, dirty balancing and application behaviour. + */ + if (!writable) + *flags |= TNF_NO_GROUP; + + /* + * Flag if the folio is shared between multiple address spaces. This + * is later used when determining whether to group tasks together + */ + if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED)) + *flags |= TNF_SHARED; + /* + * For memory tiering mode, cpupid of slow memory page is used + * to record page access time. So use default value. + */ + if (folio_use_access_time(folio)) + *last_cpupid = (-1 & LAST_CPUPID_MASK); + else + *last_cpupid = folio_last_cpupid(folio); + /* Record the current PID acceesing VMA */ vma_set_access_pid_bit(vma); count_vm_numa_event(NUMA_HINT_FAULTS); - if (page_nid == numa_node_id()) { +#ifdef CONFIG_NUMA_BALANCING + count_memcg_folio_events(folio, NUMA_HINT_FAULTS, 1); +#endif + if (folio_nid(folio) == numa_node_id()) { count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); *flags |= TNF_FAULT_LOCAL; } @@ -5328,36 +5568,11 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) if (!folio || folio_is_zone_device(folio)) goto out_map; - /* - * Avoid grouping on RO pages in general. RO pages shouldn't hurt as - * much anyway since they can be in shared cache state. This misses - * the case where a mapping is writable but the process never writes - * to it but pte_write gets cleared during protection updates and - * pte_dirty has unpredictable behaviour between PTE scan updates, - * background writeback, dirty balancing and application behaviour. - */ - if (!writable) - flags |= TNF_NO_GROUP; - - /* - * Flag if the folio is shared between multiple address spaces. This - * is later used when determining whether to group tasks together - */ - if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED)) - flags |= TNF_SHARED; - nid = folio_nid(folio); nr_pages = folio_nr_pages(folio); - /* - * For memory tiering mode, cpupid of slow memory page is used - * to record page access time. So use default value. - */ - if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && - !node_is_toptier(nid)) - last_cpupid = (-1 & LAST_CPUPID_MASK); - else - last_cpupid = folio_last_cpupid(folio); - target_nid = numa_migrate_prep(folio, vmf, vmf->address, nid, &flags); + + target_nid = numa_migrate_check(folio, vmf, vmf->address, &flags, + writable, &last_cpupid); if (target_nid == NUMA_NO_NODE) goto out_map; if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) { @@ -6013,10 +6228,6 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, if (!vma_start_read(vma)) goto inval; - /* Check since vm_start/vm_end might change before we lock the VMA */ - if (unlikely(address < vma->vm_start || address >= vma->vm_end)) - goto inval_end_read; - /* Check if the VMA got isolated after we found it */ if (vma->detached) { vma_end_read(vma); @@ -6024,6 +6235,16 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, /* The area was replaced with another one */ goto retry; } + /* + * At this point, we have a stable reference to a VMA: The VMA is + * locked and we know it hasn't already been isolated. + * From here on, we can access the VMA without worrying about which + * fields are accessible for RCU readers. + */ + + /* Check since vm_start/vm_end might change before we lock the VMA */ + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) + goto inval_end_read; rcu_read_unlock(); return vma; @@ -6108,78 +6329,155 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) } #endif /* __PAGETABLE_PMD_FOLDED */ +static inline void pfnmap_args_setup(struct follow_pfnmap_args *args, + spinlock_t *lock, pte_t *ptep, + pgprot_t pgprot, unsigned long pfn_base, + unsigned long addr_mask, bool writable, + bool special) +{ + args->lock = lock; + args->ptep = ptep; + args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT); + args->pgprot = pgprot; + args->writable = writable; + args->special = special; +} + +static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma) +{ +#ifdef CONFIG_LOCKDEP + struct address_space *mapping = vma->vm_file->f_mapping; + + if (mapping) + lockdep_assert(lockdep_is_held(&vma->vm_file->f_mapping->i_mmap_rwsem) || + lockdep_is_held(&vma->vm_mm->mmap_lock)); + else + lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock)); +#endif +} + /** - * follow_pte - look up PTE at a user virtual address - * @vma: the memory mapping - * @address: user virtual address - * @ptepp: location to store found PTE - * @ptlp: location to store the lock for the PTE + * follow_pfnmap_start() - Look up a pfn mapping at a user virtual address + * @args: Pointer to struct @follow_pfnmap_args * - * On a successful return, the pointer to the PTE is stored in @ptepp; - * the corresponding lock is taken and its location is stored in @ptlp. + * The caller needs to setup args->vma and args->address to point to the + * virtual address as the target of such lookup. On a successful return, + * the results will be put into other output fields. * - * The contents of the PTE are only stable until @ptlp is released using - * pte_unmap_unlock(). This function will fail if the PTE is non-present. - * Present PTEs may include PTEs that map refcounted pages, such as - * anonymous folios in COW mappings. + * After the caller finished using the fields, the caller must invoke + * another follow_pfnmap_end() to proper releases the locks and resources + * of such look up request. * - * Callers must be careful when relying on PTE content after - * pte_unmap_unlock(). Especially if the PTE maps a refcounted page, - * callers must protect against invalidation with MMU notifiers; otherwise - * access to the PFN at a later point in time can trigger use-after-free. + * During the start() and end() calls, the results in @args will be valid + * as proper locks will be held. After the end() is called, all the fields + * in @follow_pfnmap_args will be invalid to be further accessed. Further + * use of such information after end() may require proper synchronizations + * by the caller with page table updates, otherwise it can create a + * security bug. + * + * If the PTE maps a refcounted page, callers are responsible to protect + * against invalidation with MMU notifiers; otherwise access to the PFN at + * a later point in time can trigger use-after-free. * * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore - * should be taken for read. + * should be taken for read, and the mmap semaphore cannot be released + * before the end() is invoked. * * This function must not be used to modify PTE content. * - * Return: zero on success, -ve otherwise. + * Return: zero on success, negative otherwise. */ -int follow_pte(struct vm_area_struct *vma, unsigned long address, - pte_t **ptepp, spinlock_t **ptlp) +int follow_pfnmap_start(struct follow_pfnmap_args *args) { + struct vm_area_struct *vma = args->vma; + unsigned long address = args->address; struct mm_struct *mm = vma->vm_mm; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep; + spinlock_t *lock; + pgd_t *pgdp; + p4d_t *p4dp, p4d; + pud_t *pudp, pud; + pmd_t *pmdp, pmd; + pte_t *ptep, pte; + + pfnmap_lockdep_assert(vma); - mmap_assert_locked(mm); if (unlikely(address < vma->vm_start || address >= vma->vm_end)) goto out; if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) goto out; - - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) +retry: + pgdp = pgd_offset(mm, address); + if (pgd_none(*pgdp) || unlikely(pgd_bad(*pgdp))) goto out; - p4d = p4d_offset(pgd, address); - if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d))) + p4dp = p4d_offset(pgdp, address); + p4d = READ_ONCE(*p4dp); + if (p4d_none(p4d) || unlikely(p4d_bad(p4d))) goto out; - pud = pud_offset(p4d, address); - if (pud_none(*pud) || unlikely(pud_bad(*pud))) + pudp = pud_offset(p4dp, address); + pud = READ_ONCE(*pudp); + if (pud_none(pud)) goto out; + if (pud_leaf(pud)) { + lock = pud_lock(mm, pudp); + if (!unlikely(pud_leaf(pud))) { + spin_unlock(lock); + goto retry; + } + pfnmap_args_setup(args, lock, NULL, pud_pgprot(pud), + pud_pfn(pud), PUD_MASK, pud_write(pud), + pud_special(pud)); + return 0; + } - pmd = pmd_offset(pud, address); - VM_BUG_ON(pmd_trans_huge(*pmd)); + pmdp = pmd_offset(pudp, address); + pmd = pmdp_get_lockless(pmdp); + if (pmd_leaf(pmd)) { + lock = pmd_lock(mm, pmdp); + if (!unlikely(pmd_leaf(pmd))) { + spin_unlock(lock); + goto retry; + } + pfnmap_args_setup(args, lock, NULL, pmd_pgprot(pmd), + pmd_pfn(pmd), PMD_MASK, pmd_write(pmd), + pmd_special(pmd)); + return 0; + } - ptep = pte_offset_map_lock(mm, pmd, address, ptlp); + ptep = pte_offset_map_lock(mm, pmdp, address, &lock); if (!ptep) goto out; - if (!pte_present(ptep_get(ptep))) + pte = ptep_get(ptep); + if (!pte_present(pte)) goto unlock; - *ptepp = ptep; + pfnmap_args_setup(args, lock, ptep, pte_pgprot(pte), + pte_pfn(pte), PAGE_MASK, pte_write(pte), + pte_special(pte)); return 0; unlock: - pte_unmap_unlock(ptep, *ptlp); + pte_unmap_unlock(ptep, lock); out: return -EINVAL; } -EXPORT_SYMBOL_GPL(follow_pte); +EXPORT_SYMBOL_GPL(follow_pfnmap_start); + +/** + * follow_pfnmap_end(): End a follow_pfnmap_start() process + * @args: Pointer to struct @follow_pfnmap_args + * + * Must be used in pair of follow_pfnmap_start(). See the start() function + * above for more information. + */ +void follow_pfnmap_end(struct follow_pfnmap_args *args) +{ + if (args->lock) + spin_unlock(args->lock); + if (args->ptep) + pte_unmap(args->ptep); +} +EXPORT_SYMBOL_GPL(follow_pfnmap_end); #ifdef CONFIG_HAVE_IOREMAP_PROT /** @@ -6200,34 +6498,34 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, resource_size_t phys_addr; unsigned long prot = 0; void __iomem *maddr; - pte_t *ptep, pte; - spinlock_t *ptl; int offset = offset_in_page(addr); int ret = -EINVAL; + bool writable; + struct follow_pfnmap_args args = { .vma = vma, .address = addr }; retry: - if (follow_pte(vma, addr, &ptep, &ptl)) + if (follow_pfnmap_start(&args)) return -EINVAL; - pte = ptep_get(ptep); - pte_unmap_unlock(ptep, ptl); + prot = pgprot_val(args.pgprot); + phys_addr = (resource_size_t)args.pfn << PAGE_SHIFT; + writable = args.writable; + follow_pfnmap_end(&args); - prot = pgprot_val(pte_pgprot(pte)); - phys_addr = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; - - if ((write & FOLL_WRITE) && !pte_write(pte)) + if ((write & FOLL_WRITE) && !writable) return -EINVAL; maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot); if (!maddr) return -ENOMEM; - if (follow_pte(vma, addr, &ptep, &ptl)) + if (follow_pfnmap_start(&args)) goto out_unmap; - if (!pte_same(pte, ptep_get(ptep))) { - pte_unmap_unlock(ptep, ptl); + if ((prot != pgprot_val(args.pgprot)) || + (phys_addr != (args.pfn << PAGE_SHIFT)) || + (writable != args.writable)) { + follow_pfnmap_end(&args); iounmap(maddr); - goto retry; } @@ -6236,7 +6534,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, else memcpy_fromio(buf, maddr + offset, len); ret = len; - pte_unmap_unlock(ptep, ptl); + follow_pfnmap_end(&args); out_unmap: iounmap(maddr); @@ -6587,7 +6885,7 @@ long copy_folio_from_user(struct folio *dst_folio, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ -#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS +#if defined(CONFIG_SPLIT_PTE_PTLOCKS) && ALLOC_SPLIT_PTLOCKS static struct kmem_cache *page_ptl_cachep; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 951878ab627a..621ae1015106 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -366,7 +366,7 @@ struct page *pfn_to_online_page(unsigned long pfn) } EXPORT_SYMBOL_GPL(pfn_to_online_page); -int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, +int __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, struct mhp_params *params) { const unsigned long end_pfn = pfn + nr_pages; @@ -524,7 +524,7 @@ static void update_pgdat_span(struct pglist_data *pgdat) pgdat->node_spanned_pages = node_end_pfn - node_start_pfn; } -void __ref remove_pfn_range_from_zone(struct zone *zone, +void remove_pfn_range_from_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { @@ -629,7 +629,7 @@ int restore_online_page_callback(online_page_callback_t callback) EXPORT_SYMBOL_GPL(restore_online_page_callback); /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -void __ref generic_online_page(struct page *page, unsigned int order) +void generic_online_page(struct page *page, unsigned int order) { __free_pages_core(page, order, MEMINIT_HOTPLUG); } @@ -741,7 +741,7 @@ static inline void section_taint_zone_device(unsigned long pfn) * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related * zone stats (e.g., nr_isolate_pageblock) are touched. */ -void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, +void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap, int migratetype) { @@ -1143,7 +1143,7 @@ void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages) /* * Must be called with mem_hotplug_lock in write mode. */ -int __ref online_pages(unsigned long pfn, unsigned long nr_pages, +int online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group) { unsigned long flags; @@ -1233,7 +1233,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, } /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -static pg_data_t __ref *hotadd_init_pgdat(int nid) +static pg_data_t *hotadd_init_pgdat(int nid) { struct pglist_data *pgdat; @@ -1386,7 +1386,7 @@ bool mhp_supports_memmap_on_memory(void) } EXPORT_SYMBOL_GPL(mhp_supports_memmap_on_memory); -static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size) +static void remove_memory_blocks_and_altmaps(u64 start, u64 size) { unsigned long memblock_size = memory_block_size_bytes(); u64 cur_start; @@ -1473,7 +1473,7 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, * * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) +int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) { struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) }; enum memblock_flags memblock_flags = MEMBLOCK_NONE; @@ -1580,7 +1580,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) } /* requires device_hotplug_lock, see add_memory_resource() */ -int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags) +int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags) { struct resource *res; int ret; @@ -1772,67 +1772,59 @@ static int scan_movable_pages(unsigned long start, unsigned long end, static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) { + struct folio *folio; unsigned long pfn; - struct page *page, *head; LIST_HEAD(source); static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); for (pfn = start_pfn; pfn < end_pfn; pfn++) { - struct folio *folio; - bool isolated; + struct page *page; if (!pfn_valid(pfn)) continue; page = pfn_to_page(pfn); folio = page_folio(page); - head = &folio->page; - if (PageHuge(page)) { - pfn = page_to_pfn(head) + compound_nr(head) - 1; - isolate_hugetlb(folio, &source); - continue; - } else if (PageTransHuge(page)) - pfn = page_to_pfn(head) + thp_nr_pages(page) - 1; + /* + * No reference or lock is held on the folio, so it might + * be modified concurrently (e.g. split). As such, + * folio_nr_pages() may read garbage. This is fine as the outer + * loop will revisit the split folio later. + */ + if (folio_test_large(folio)) + pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1; /* * HWPoison pages have elevated reference counts so the migration would * fail on them. It also doesn't make any sense to migrate them in the * first place. Still try to unmap such a page in case it is still mapped - * (e.g. current hwpoison implementation doesn't unmap KSM pages but keep - * the unmap as the catch all safety net). + * (keep the unmap as the catch all safety net). */ - if (PageHWPoison(page)) { + if (folio_test_hwpoison(folio) || + (folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) { if (WARN_ON(folio_test_lru(folio))) folio_isolate_lru(folio); if (folio_mapped(folio)) - try_to_unmap(folio, TTU_IGNORE_MLOCK); + unmap_poisoned_folio(folio, TTU_IGNORE_MLOCK); continue; } - if (!get_page_unless_zero(page)) + if (!folio_try_get(folio)) continue; - /* - * We can skip free pages. And we can deal with pages on - * LRU and non-lru movable pages. - */ - if (PageLRU(page)) - isolated = isolate_lru_page(page); - else - isolated = isolate_movable_page(page, ISOLATE_UNEVICTABLE); - if (isolated) { - list_add_tail(&page->lru, &source); - if (!__PageMovable(page)) - inc_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_lru(page)); - } else { + if (unlikely(page_folio(page) != folio)) + goto put_folio; + + if (!isolate_folio_to_list(folio, &source)) { if (__ratelimit(&migrate_rs)) { - pr_warn("failed to isolate pfn %lx\n", pfn); + pr_warn("failed to isolate pfn %lx\n", + page_to_pfn(page)); dump_page(page, "isolation failed"); } } - put_page(page); +put_folio: + folio_put(folio); } if (!list_empty(&source)) { nodemask_t nmask = node_states[N_MEMORY]; @@ -1847,7 +1839,7 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) * We have checked that migration range is on a single zone so * we can use the nid of the first page to all the others. */ - mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru)); + mtc.nid = folio_nid(list_first_entry(&source, struct folio, lru)); /* * try to allocate from a different node but reuse this node @@ -1860,11 +1852,12 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) ret = migrate_pages(&source, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG, NULL); if (ret) { - list_for_each_entry(page, &source, lru) { + list_for_each_entry(folio, &source, lru) { if (__ratelimit(&migrate_rs)) { pr_warn("migrating pfn %lx failed ret:%d\n", - page_to_pfn(page), ret); - dump_page(page, "migration failure"); + folio_pfn(folio), ret); + dump_page(&folio->page, + "migration failure"); } } putback_movable_pages(&source); @@ -1939,7 +1932,7 @@ static int count_system_ram_pages_cb(unsigned long start_pfn, /* * Must be called with mem_hotplug_lock in write mode. */ -int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, +int offline_pages(unsigned long start_pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group) { const unsigned long end_pfn = start_pfn + nr_pages; @@ -2240,7 +2233,7 @@ static int memory_blocks_have_altmaps(u64 start, u64 size) return 1; } -static int __ref try_remove_memory(u64 start, u64 size) +static int try_remove_memory(u64 start, u64 size) { int rc, nid = NUMA_NO_NODE; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b858e22b259d..b646fab3e45e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -676,8 +676,10 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, tlb_gather_mmu(&tlb, vma->vm_mm); nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA); - if (nr_updated > 0) + if (nr_updated > 0) { count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); + count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated); + } tlb_finish_mmu(&tlb); @@ -1951,7 +1953,7 @@ unsigned int mempolicy_slab_node(void) zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, highest_zoneidx, &policy->nodes); - return z->zone ? zone_to_nid(z->zone) : node; + return zonelist_zone(z) ? zonelist_node_idx(z) : node; } case MPOL_LOCAL: return node; @@ -2809,7 +2811,7 @@ int mpol_misplaced(struct folio *folio, struct vm_fault *vmf, node_zonelist(thisnid, GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), &pol->nodes); - polnid = zone_to_nid(z->zone); + polnid = zonelist_node_idx(z); break; default: diff --git a/mm/migrate.c b/mm/migrate.c index 923ea80ba744..dfdb3a136bf8 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -35,21 +34,16 @@ #include #include #include -#include #include #include -#include -#include -#include #include #include #include #include -#include #include -#include #include #include +#include #include @@ -177,13 +171,83 @@ void putback_movable_pages(struct list_head *l) } } +/* Must be called with an elevated refcount on the non-hugetlb folio */ +bool isolate_folio_to_list(struct folio *folio, struct list_head *list) +{ + bool isolated, lru; + + if (folio_test_hugetlb(folio)) + return isolate_hugetlb(folio, list); + + lru = !__folio_test_movable(folio); + if (lru) + isolated = folio_isolate_lru(folio); + else + isolated = isolate_movable_page(&folio->page, + ISOLATE_UNEVICTABLE); + + if (!isolated) + return false; + + list_add(&folio->lru, list); + if (lru) + node_stat_add_folio(folio, NR_ISOLATED_ANON + + folio_is_file_lru(folio)); + + return true; +} + +static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw, + struct folio *folio, + unsigned long idx) +{ + struct page *page = folio_page(folio, idx); + bool contains_data; + pte_t newpte; + void *addr; + + VM_BUG_ON_PAGE(PageCompound(page), page); + VM_BUG_ON_PAGE(!PageAnon(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page); + + if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags & VM_LOCKED) || + mm_forbids_zeropage(pvmw->vma->vm_mm)) + return false; + + /* + * The pmd entry mapping the old thp was flushed and the pte mapping + * this subpage has been non present. If the subpage is only zero-filled + * then map it to the shared zeropage. + */ + addr = kmap_local_page(page); + contains_data = memchr_inv(addr, 0, PAGE_SIZE); + kunmap_local(addr); + + if (contains_data) + return false; + + newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address), + pvmw->vma->vm_page_prot)); + set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, newpte); + + dec_mm_counter(pvmw->vma->vm_mm, mm_counter(folio)); + return true; +} + +struct rmap_walk_arg { + struct folio *folio; + bool map_unused_to_zeropage; +}; + /* * Restore a potential migration pte to a working pte entry */ static bool remove_migration_pte(struct folio *folio, - struct vm_area_struct *vma, unsigned long addr, void *old) + struct vm_area_struct *vma, unsigned long addr, void *arg) { - DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION); + struct rmap_walk_arg *rmap_walk_arg = arg; + DEFINE_FOLIO_VMA_WALK(pvmw, rmap_walk_arg->folio, vma, addr, PVMW_SYNC | PVMW_MIGRATION); while (page_vma_mapped_walk(&pvmw)) { rmap_t rmap_flags = RMAP_NONE; @@ -207,6 +271,9 @@ static bool remove_migration_pte(struct folio *folio, continue; } #endif + if (rmap_walk_arg->map_unused_to_zeropage && + try_to_map_unused_to_zeropage(&pvmw, folio, idx)) + continue; folio_get(folio); pte = mk_pte(new, READ_ONCE(vma->vm_page_prot)); @@ -285,14 +352,21 @@ static bool remove_migration_pte(struct folio *folio, * Get rid of all migration entries and replace them by * references to the indicated page. */ -void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked) +void remove_migration_ptes(struct folio *src, struct folio *dst, int flags) { - struct rmap_walk_control rwc = { - .rmap_one = remove_migration_pte, - .arg = src, + struct rmap_walk_arg rmap_walk_arg = { + .folio = src, + .map_unused_to_zeropage = flags & RMP_USE_SHARED_ZEROPAGE, }; - if (locked) + struct rmap_walk_control rwc = { + .rmap_one = remove_migration_pte, + .arg = &rmap_walk_arg, + }; + + VM_BUG_ON_FOLIO((flags & RMP_USE_SHARED_ZEROPAGE) && (src != dst), src); + + if (flags & RMP_LOCKED) rmap_walk_locked(dst, &rwc); else rmap_walk(dst, &rwc); @@ -422,6 +496,8 @@ static int __folio_migrate_mapping(struct address_space *mapping, /* No turning back from here */ newfolio->index = folio->index; newfolio->mapping = folio->mapping; + if (folio_test_anon(folio) && folio_test_large(folio)) + mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1); if (folio_test_swapbacked(folio)) __folio_set_swapbacked(newfolio); @@ -446,6 +522,8 @@ static int __folio_migrate_mapping(struct address_space *mapping, */ newfolio->index = folio->index; newfolio->mapping = folio->mapping; + if (folio_test_anon(folio) && folio_test_large(folio)) + mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1); folio_ref_add(newfolio, nr); /* add cache reference */ if (folio_test_swapbacked(folio)) { __folio_set_swapbacked(newfolio); @@ -585,8 +663,6 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio) { int cpupid; - if (folio_test_error(folio)) - folio_set_error(newfolio); if (folio_test_referenced(folio)) folio_set_referenced(newfolio); if (folio_test_uptodate(folio)) @@ -640,7 +716,8 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio) folio_migrate_ksm(newfolio, folio); /* * Please do not reorder this without considering how mm/ksm.c's - * ksm_get_folio() depends upon ksm_migrate_page() and PageSwapCache(). + * ksm_get_folio() depends upon ksm_migrate_page() and the + * swapcache flag. */ if (folio_test_swapcache(folio)) folio_clear_swapcache(folio); @@ -666,6 +743,7 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio) folio_set_readahead(newfolio); folio_copy_owner(newfolio, folio); + pgalloc_tag_copy(newfolio, folio); mem_cgroup_migrate(folio, newfolio); } @@ -904,7 +982,7 @@ static int writeout(struct address_space *mapping, struct folio *folio) * At this point we know that the migration attempt cannot * be successful. */ - remove_migration_ptes(folio, folio, false); + remove_migration_ptes(folio, folio, 0); rc = mapping->a_ops->writepage(&folio->page, &wbc); @@ -1068,7 +1146,7 @@ static void migrate_folio_undo_src(struct folio *src, struct list_head *ret) { if (page_was_mapped) - remove_migration_ptes(src, src, false); + remove_migration_ptes(src, src, 0); /* Drop an anon_vma reference if we took one */ if (anon_vma) put_anon_vma(anon_vma); @@ -1306,7 +1384,7 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, lru_add_drain(); if (old_page_state & PAGE_WAS_MAPPED) - remove_migration_ptes(src, dst, false); + remove_migration_ptes(src, dst, 0); out_unlock_both: folio_unlock(dst); @@ -1444,7 +1522,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, if (page_was_mapped) remove_migration_ptes(src, - rc == MIGRATEPAGE_SUCCESS ? dst : src, false); + rc == MIGRATEPAGE_SUCCESS ? dst : src, 0); unlock_put_anon: folio_unlock(dst); @@ -1682,7 +1760,8 @@ static int migrate_pages_batch(struct list_head *from, * use _deferred_list. */ if (nr_pages > 2 && - !list_empty(&folio->_deferred_list)) { + !list_empty(&folio->_deferred_list) && + folio_test_partially_mapped(folio)) { if (!try_split_folio(folio, split_folios, mode)) { nr_failed++; stats->nr_thp_failed += is_thp; @@ -2111,76 +2190,66 @@ static int do_move_pages_to_node(struct list_head *pagelist, int node) return err; } -/* - * Resolves the given address to a struct page, isolates it from the LRU and - * puts it to the given pagelist. - * Returns: - * errno - if the page cannot be found/isolated - * 0 - when it doesn't have to be migrated because it is already on the - * target node - * 1 - when it has been queued - */ -static int add_page_for_migration(struct mm_struct *mm, const void __user *p, - int node, struct list_head *pagelist, bool migrate_all) +static int __add_folio_for_migration(struct folio *folio, int node, + struct list_head *pagelist, bool migrate_all) { - struct vm_area_struct *vma; - unsigned long addr; - struct page *page; - struct folio *folio; - int err; + if (is_zero_folio(folio) || is_huge_zero_folio(folio)) + return -EFAULT; - mmap_read_lock(mm); - addr = (unsigned long)untagged_addr_remote(mm, p); - - err = -EFAULT; - vma = vma_lookup(mm, addr); - if (!vma || !vma_migratable(vma)) - goto out; - - /* FOLL_DUMP to ignore special (like zero) pages */ - page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); - - err = PTR_ERR(page); - if (IS_ERR(page)) - goto out; - - err = -ENOENT; - if (!page) - goto out; - - folio = page_folio(page); if (folio_is_zone_device(folio)) - goto out_putfolio; + return -ENOENT; - err = 0; if (folio_nid(folio) == node) - goto out_putfolio; + return 0; - err = -EACCES; if (folio_likely_mapped_shared(folio) && !migrate_all) - goto out_putfolio; + return -EACCES; - err = -EBUSY; if (folio_test_hugetlb(folio)) { if (isolate_hugetlb(folio, pagelist)) - err = 1; - } else { - if (!folio_isolate_lru(folio)) - goto out_putfolio; - - err = 1; + return 1; + } else if (folio_isolate_lru(folio)) { list_add_tail(&folio->lru, pagelist); node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); + return 1; + } + return -EBUSY; +} + +/* + * Resolves the given address to a struct folio, isolates it from the LRU and + * puts it to the given pagelist. + * Returns: + * errno - if the folio cannot be found/isolated + * 0 - when it doesn't have to be migrated because it is already on the + * target node + * 1 - when it has been queued + */ +static int add_folio_for_migration(struct mm_struct *mm, const void __user *p, + int node, struct list_head *pagelist, bool migrate_all) +{ + struct vm_area_struct *vma; + struct folio_walk fw; + struct folio *folio; + unsigned long addr; + int err = -EFAULT; + + mmap_read_lock(mm); + addr = (unsigned long)untagged_addr_remote(mm, p); + + vma = vma_lookup(mm, addr); + if (vma && vma_migratable(vma)) { + folio = folio_walk_start(&fw, vma, addr, FW_ZEROPAGE); + if (folio) { + err = __add_folio_for_migration(folio, node, pagelist, + migrate_all); + folio_walk_end(&fw, vma); + } else { + err = -ENOENT; + } } -out_putfolio: - /* - * Either remove the duplicate refcount from folio_isolate_lru() - * or drop the folio ref if it was not isolated. - */ - folio_put(folio); -out: mmap_read_unlock(mm); return err; } @@ -2274,8 +2343,8 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, * Errors in the page lookup or isolation are not fatal and we simply * report them via status */ - err = add_page_for_migration(mm, p, current_node, &pagelist, - flags & MPOL_MF_MOVE_ALL); + err = add_folio_for_migration(mm, p, current_node, &pagelist, + flags & MPOL_MF_MOVE_ALL); if (err > 0) { /* The page is successfully queued for migration */ @@ -2331,28 +2400,26 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, for (i = 0; i < nr_pages; i++) { unsigned long addr = (unsigned long)(*pages); struct vm_area_struct *vma; - struct page *page; + struct folio_walk fw; + struct folio *folio; int err = -EFAULT; vma = vma_lookup(mm, addr); if (!vma) goto set_status; - /* FOLL_DUMP to ignore special (like zero) pages */ - page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); - - err = PTR_ERR(page); - if (IS_ERR(page)) - goto set_status; - - err = -ENOENT; - if (!page) - goto set_status; - - if (!is_zone_device_page(page)) - err = page_to_nid(page); - - put_page(page); + folio = folio_walk_start(&fw, vma, addr, FW_ZEROPAGE); + if (folio) { + if (is_zero_folio(folio) || is_huge_zero_folio(folio)) + err = -EFAULT; + else if (folio_is_zone_device(folio)) + err = -ENOENT; + else + err = folio_nid(folio); + folio_walk_end(&fw, vma); + } else { + err = -ENOENT; + } set_status: *status = err; @@ -2432,25 +2499,19 @@ static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes) return current->mm; } - /* Find the mm_struct */ - rcu_read_lock(); - task = find_task_by_vpid(pid); + task = find_get_task_by_vpid(pid); if (!task) { - rcu_read_unlock(); return ERR_PTR(-ESRCH); } - get_task_struct(task); /* * Check if this process has the right to modify the specified * process. Use the regular "ptrace_may_access()" checks. */ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { - rcu_read_unlock(); mm = ERR_PTR(-EPERM); goto out; } - rcu_read_unlock(); mm = ERR_PTR(security_task_movememory(task)); if (IS_ERR(mm)) @@ -2526,7 +2587,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, if (!zone_watermark_ok(zone, 0, high_wmark_pages(zone) + nr_migrate_pages, - ZONE_MOVABLE, 0)) + ZONE_MOVABLE, ALLOC_CMA)) continue; return true; } @@ -2627,6 +2688,8 @@ int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, int nr_remaining; unsigned int nr_succeeded; LIST_HEAD(migratepages); + struct mem_cgroup *memcg = get_mem_cgroup_from_folio(folio); + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); list_add(&folio->lru, &migratepages); nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_folio, @@ -2636,10 +2699,13 @@ int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, putback_movable_pages(&migratepages); if (nr_succeeded) { count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded); - if (!node_is_toptier(folio_nid(folio)) && node_is_toptier(node)) - mod_node_page_state(pgdat, PGPROMOTE_SUCCESS, - nr_succeeded); + count_memcg_events(memcg, NUMA_PAGE_MIGRATE, nr_succeeded); + if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) + && !node_is_toptier(folio_nid(folio)) + && node_is_toptier(node)) + mod_lruvec_state(lruvec, PGPROMOTE_SUCCESS, nr_succeeded); } + mem_cgroup_put(memcg); BUG_ON(!list_empty(&migratepages)); return nr_remaining ? -EAGAIN : 0; } diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 6d66dc1c6ffa..9cf26592ac93 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -328,8 +328,8 @@ static bool migrate_vma_check_page(struct page *page, struct page *fault_page) /* * One extra ref because caller holds an extra reference, either from - * isolate_lru_page() for a regular page, or migrate_vma_collect() for - * a device page. + * folio_isolate_lru() for a regular folio, or migrate_vma_collect() for + * a device folio. */ int extra = 1 + (page == fault_page); @@ -379,33 +379,33 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, continue; } - /* ZONE_DEVICE pages are not on LRU */ - if (!is_zone_device_page(page)) { - if (!PageLRU(page) && allow_drain) { + folio = page_folio(page); + /* ZONE_DEVICE folios are not on LRU */ + if (!folio_is_zone_device(folio)) { + if (!folio_test_lru(folio) && allow_drain) { /* Drain CPU's lru cache */ lru_add_drain_all(); allow_drain = false; } - if (!isolate_lru_page(page)) { + if (!folio_isolate_lru(folio)) { src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; restore++; continue; } /* Drop the reference we took in collect */ - put_page(page); + folio_put(folio); } - folio = page_folio(page); if (folio_mapped(folio)) try_to_migrate(folio, 0); - if (page_mapped(page) || + if (folio_mapped(folio) || !migrate_vma_check_page(page, fault_page)) { - if (!is_zone_device_page(page)) { - get_page(page); - putback_lru_page(page); + if (!folio_is_zone_device(folio)) { + folio_get(folio); + folio_putback_lru(folio); } src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; @@ -424,7 +424,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, continue; folio = page_folio(page); - remove_migration_ptes(folio, folio, false); + remove_migration_ptes(folio, folio, 0); src_pfns[i] = 0; folio_unlock(folio); @@ -708,7 +708,7 @@ static void __migrate_device_pages(unsigned long *src_pfns, /* * The only time there is no vma is when called from - * migrate_device_coherent_page(). However this isn't + * migrate_device_coherent_folio(). However this isn't * called if the page could not be unmapped. */ VM_BUG_ON(!migrate); @@ -815,42 +815,45 @@ void migrate_device_finalize(unsigned long *src_pfns, unsigned long i; for (i = 0; i < npages; i++) { - struct folio *dst, *src; + struct folio *dst = NULL, *src = NULL; struct page *newpage = migrate_pfn_to_page(dst_pfns[i]); struct page *page = migrate_pfn_to_page(src_pfns[i]); + if (newpage) + dst = page_folio(newpage); + if (!page) { - if (newpage) { - unlock_page(newpage); - put_page(newpage); + if (dst) { + folio_unlock(dst); + folio_put(dst); } continue; } - if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !newpage) { - if (newpage) { - unlock_page(newpage); - put_page(newpage); + src = page_folio(page); + + if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !dst) { + if (dst) { + folio_unlock(dst); + folio_put(dst); } - newpage = page; + dst = src; } - src = page_folio(page); - dst = page_folio(newpage); - remove_migration_ptes(src, dst, false); + remove_migration_ptes(src, dst, 0); folio_unlock(src); - if (is_zone_device_page(page)) - put_page(page); + if (folio_is_zone_device(src)) + folio_put(src); else - putback_lru_page(page); + folio_putback_lru(src); - if (newpage != page) { - unlock_page(newpage); - if (is_zone_device_page(newpage)) - put_page(newpage); + if (dst != src) { + folio_unlock(dst); + if (folio_is_zone_device(dst)) + folio_put(dst); else - putback_lru_page(newpage); + folio_putback_lru(dst); } } } @@ -898,16 +901,17 @@ int migrate_device_range(unsigned long *src_pfns, unsigned long start, unsigned long i, pfn; for (pfn = start, i = 0; i < npages; pfn++, i++) { - struct page *page = pfn_to_page(pfn); + struct folio *folio; - if (!get_page_unless_zero(page)) { + folio = folio_get_nontail_page(pfn_to_page(pfn)); + if (!folio) { src_pfns[i] = 0; continue; } - if (!trylock_page(page)) { + if (!folio_trylock(folio)) { src_pfns[i] = 0; - put_page(page); + folio_put(folio); continue; } @@ -921,38 +925,38 @@ int migrate_device_range(unsigned long *src_pfns, unsigned long start, EXPORT_SYMBOL(migrate_device_range); /* - * Migrate a device coherent page back to normal memory. The caller should have - * a reference on page which will be copied to the new page if migration is + * Migrate a device coherent folio back to normal memory. The caller should have + * a reference on folio which will be copied to the new folio if migration is * successful or dropped on failure. */ -int migrate_device_coherent_page(struct page *page) +int migrate_device_coherent_folio(struct folio *folio) { unsigned long src_pfn, dst_pfn = 0; - struct page *dpage; + struct folio *dfolio; - WARN_ON_ONCE(PageCompound(page)); + WARN_ON_ONCE(folio_test_large(folio)); - lock_page(page); - src_pfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE; + folio_lock(folio); + src_pfn = migrate_pfn(folio_pfn(folio)) | MIGRATE_PFN_MIGRATE; /* * We don't have a VMA and don't need to walk the page tables to find - * the source page. So call migrate_vma_unmap() directly to unmap the - * page as migrate_vma_setup() will fail if args.vma == NULL. + * the source folio. So call migrate_vma_unmap() directly to unmap the + * folio as migrate_vma_setup() will fail if args.vma == NULL. */ migrate_device_unmap(&src_pfn, 1, NULL); if (!(src_pfn & MIGRATE_PFN_MIGRATE)) return -EBUSY; - dpage = alloc_page(GFP_USER | __GFP_NOWARN); - if (dpage) { - lock_page(dpage); - dst_pfn = migrate_pfn(page_to_pfn(dpage)); + dfolio = folio_alloc(GFP_USER | __GFP_NOWARN, 0); + if (dfolio) { + folio_lock(dfolio); + dst_pfn = migrate_pfn(folio_pfn(dfolio)); } migrate_device_pages(&src_pfn, &dst_pfn, 1); if (src_pfn & MIGRATE_PFN_MIGRATE) - copy_highpage(dpage, page); + folio_copy(dfolio, folio); migrate_device_finalize(&src_pfn, &dst_pfn, 1); if (src_pfn & MIGRATE_PFN_MIGRATE) diff --git a/mm/mm_init.c b/mm/mm_init.c index 51960079875b..4ba5607aaf19 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1835,14 +1835,8 @@ void __init free_area_init(unsigned long *max_zone_pfn) for_each_node(nid) { pg_data_t *pgdat; - if (!node_online(nid)) { - /* Allocator not initialized yet */ - pgdat = arch_alloc_nodedata(nid); - if (!pgdat) - panic("Cannot allocate %zuB for node %d.\n", - sizeof(*pgdat), nid); - arch_refresh_nodedata(nid, pgdat); - } + if (!node_online(nid)) + alloc_offline_node_data(nid); pgdat = NODE_DATA(nid); free_area_init_node(nid); @@ -1939,7 +1933,7 @@ static void __init deferred_free_pages(unsigned long pfn, } /* Accept chunks smaller than MAX_PAGE_ORDER upfront */ - accept_memory(PFN_PHYS(pfn), PFN_PHYS(pfn + nr_pages)); + accept_memory(PFN_PHYS(pfn), nr_pages * PAGE_SIZE); for (i = 0; i < nr_pages; i++, page++, pfn++) { if (pageblock_aligned(pfn)) diff --git a/mm/mmap.c b/mm/mmap.c index 6ddb278a5ee8..ee8f91eaadb9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -76,16 +76,6 @@ int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS; static bool ignore_rlimit_data; core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); -static void unmap_region(struct mm_struct *mm, struct ma_state *mas, - struct vm_area_struct *vma, struct vm_area_struct *prev, - struct vm_area_struct *next, unsigned long start, - unsigned long end, unsigned long tree_end, bool mm_wr_locked); - -static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) -{ - return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); -} - /* Update vma->vm_page_prot to reflect vma->vm_flags. */ void vma_set_page_prot(struct vm_area_struct *vma) { @@ -101,100 +91,6 @@ void vma_set_page_prot(struct vm_area_struct *vma) WRITE_ONCE(vma->vm_page_prot, vm_page_prot); } -/* - * Requires inode->i_mapping->i_mmap_rwsem - */ -static void __remove_shared_vm_struct(struct vm_area_struct *vma, - struct address_space *mapping) -{ - if (vma_is_shared_maywrite(vma)) - mapping_unmap_writable(mapping); - - flush_dcache_mmap_lock(mapping); - vma_interval_tree_remove(vma, &mapping->i_mmap); - flush_dcache_mmap_unlock(mapping); -} - -/* - * Unlink a file-based vm structure from its interval tree, to hide - * vma from rmap and vmtruncate before freeing its page tables. - */ -void unlink_file_vma(struct vm_area_struct *vma) -{ - struct file *file = vma->vm_file; - - if (file) { - struct address_space *mapping = file->f_mapping; - i_mmap_lock_write(mapping); - __remove_shared_vm_struct(vma, mapping); - i_mmap_unlock_write(mapping); - } -} - -void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb) -{ - vb->count = 0; -} - -static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb) -{ - struct address_space *mapping; - int i; - - mapping = vb->vmas[0]->vm_file->f_mapping; - i_mmap_lock_write(mapping); - for (i = 0; i < vb->count; i++) { - VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping); - __remove_shared_vm_struct(vb->vmas[i], mapping); - } - i_mmap_unlock_write(mapping); - - unlink_file_vma_batch_init(vb); -} - -void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, - struct vm_area_struct *vma) -{ - if (vma->vm_file == NULL) - return; - - if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) || - vb->count == ARRAY_SIZE(vb->vmas)) - unlink_file_vma_batch_process(vb); - - vb->vmas[vb->count] = vma; - vb->count++; -} - -void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb) -{ - if (vb->count > 0) - unlink_file_vma_batch_process(vb); -} - -/* - * Close a vm structure and free it. - */ -static void remove_vma(struct vm_area_struct *vma, bool unreachable) -{ - might_sleep(); - if (vma->vm_ops && vma->vm_ops->close) - vma->vm_ops->close(vma); - if (vma->vm_file) - fput(vma->vm_file); - mpol_put(vma_policy(vma)); - if (unreachable) - __vm_area_free(vma); - else - vm_area_free(vma); -} - -static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi, - unsigned long min) -{ - return mas_prev(&vmi->mas, min); -} - /* * check_brk_limits() - Use platform specific check of range & verify mlock * limits. @@ -273,11 +169,12 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) goto out; /* mapping intersects with an existing non-brk vma. */ /* * mm->brk must be protected by write mmap_lock. - * do_vma_munmap() will drop the lock on success, so update it - * before calling do_vma_munmap(). + * do_vmi_align_munmap() will drop the lock on success, so + * update it before calling do_vma_munmap(). */ mm->brk = brk; - if (do_vma_munmap(&vmi, brkvma, newbrk, oldbrk, &uf, true)) + if (do_vmi_align_munmap(&vmi, brkvma, mm, newbrk, oldbrk, &uf, + /* unlock = */ true)) goto out; goto success_unlocked; @@ -318,875 +215,6 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) return origbrk; } -#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) -static void validate_mm(struct mm_struct *mm) -{ - int bug = 0; - int i = 0; - struct vm_area_struct *vma; - VMA_ITERATOR(vmi, mm, 0); - - mt_validate(&mm->mm_mt); - for_each_vma(vmi, vma) { -#ifdef CONFIG_DEBUG_VM_RB - struct anon_vma *anon_vma = vma->anon_vma; - struct anon_vma_chain *avc; -#endif - unsigned long vmi_start, vmi_end; - bool warn = 0; - - vmi_start = vma_iter_addr(&vmi); - vmi_end = vma_iter_end(&vmi); - if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm)) - warn = 1; - - if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm)) - warn = 1; - - if (warn) { - pr_emerg("issue in %s\n", current->comm); - dump_stack(); - dump_vma(vma); - pr_emerg("tree range: %px start %lx end %lx\n", vma, - vmi_start, vmi_end - 1); - vma_iter_dump_tree(&vmi); - } - -#ifdef CONFIG_DEBUG_VM_RB - if (anon_vma) { - anon_vma_lock_read(anon_vma); - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) - anon_vma_interval_tree_verify(avc); - anon_vma_unlock_read(anon_vma); - } -#endif - i++; - } - if (i != mm->map_count) { - pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i); - bug = 1; - } - VM_BUG_ON_MM(bug, mm); -} - -#else /* !CONFIG_DEBUG_VM_MAPLE_TREE */ -#define validate_mm(mm) do { } while (0) -#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ - -/* - * vma has some anon_vma assigned, and is already inserted on that - * anon_vma's interval trees. - * - * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the - * vma must be removed from the anon_vma's interval trees using - * anon_vma_interval_tree_pre_update_vma(). - * - * After the update, the vma will be reinserted using - * anon_vma_interval_tree_post_update_vma(). - * - * The entire update must be protected by exclusive mmap_lock and by - * the root anon_vma's mutex. - */ -static inline void -anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) -{ - struct anon_vma_chain *avc; - - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) - anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); -} - -static inline void -anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) -{ - struct anon_vma_chain *avc; - - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) - anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); -} - -static unsigned long count_vma_pages_range(struct mm_struct *mm, - unsigned long addr, unsigned long end) -{ - VMA_ITERATOR(vmi, mm, addr); - struct vm_area_struct *vma; - unsigned long nr_pages = 0; - - for_each_vma_range(vmi, vma, end) { - unsigned long vm_start = max(addr, vma->vm_start); - unsigned long vm_end = min(end, vma->vm_end); - - nr_pages += PHYS_PFN(vm_end - vm_start); - } - - return nr_pages; -} - -static void __vma_link_file(struct vm_area_struct *vma, - struct address_space *mapping) -{ - if (vma_is_shared_maywrite(vma)) - mapping_allow_writable(mapping); - - flush_dcache_mmap_lock(mapping); - vma_interval_tree_insert(vma, &mapping->i_mmap); - flush_dcache_mmap_unlock(mapping); -} - -static void vma_link_file(struct vm_area_struct *vma) -{ - struct file *file = vma->vm_file; - struct address_space *mapping; - - if (file) { - mapping = file->f_mapping; - i_mmap_lock_write(mapping); - __vma_link_file(vma, mapping); - i_mmap_unlock_write(mapping); - } -} - -static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) -{ - VMA_ITERATOR(vmi, mm, 0); - - vma_iter_config(&vmi, vma->vm_start, vma->vm_end); - if (vma_iter_prealloc(&vmi, vma)) - return -ENOMEM; - - vma_start_write(vma); - vma_iter_store(&vmi, vma); - vma_link_file(vma); - mm->map_count++; - validate_mm(mm); - return 0; -} - -/* - * init_multi_vma_prep() - Initializer for struct vma_prepare - * @vp: The vma_prepare struct - * @vma: The vma that will be altered once locked - * @next: The next vma if it is to be adjusted - * @remove: The first vma to be removed - * @remove2: The second vma to be removed - */ -static inline void init_multi_vma_prep(struct vma_prepare *vp, - struct vm_area_struct *vma, struct vm_area_struct *next, - struct vm_area_struct *remove, struct vm_area_struct *remove2) -{ - memset(vp, 0, sizeof(struct vma_prepare)); - vp->vma = vma; - vp->anon_vma = vma->anon_vma; - vp->remove = remove; - vp->remove2 = remove2; - vp->adj_next = next; - if (!vp->anon_vma && next) - vp->anon_vma = next->anon_vma; - - vp->file = vma->vm_file; - if (vp->file) - vp->mapping = vma->vm_file->f_mapping; - -} - -/* - * init_vma_prep() - Initializer wrapper for vma_prepare struct - * @vp: The vma_prepare struct - * @vma: The vma that will be altered once locked - */ -static inline void init_vma_prep(struct vma_prepare *vp, - struct vm_area_struct *vma) -{ - init_multi_vma_prep(vp, vma, NULL, NULL, NULL); -} - - -/* - * vma_prepare() - Helper function for handling locking VMAs prior to altering - * @vp: The initialized vma_prepare struct - */ -static inline void vma_prepare(struct vma_prepare *vp) -{ - if (vp->file) { - uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); - - if (vp->adj_next) - uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, - vp->adj_next->vm_end); - - i_mmap_lock_write(vp->mapping); - if (vp->insert && vp->insert->vm_file) { - /* - * Put into interval tree now, so instantiated pages - * are visible to arm/parisc __flush_dcache_page - * throughout; but we cannot insert into address - * space until vma start or end is updated. - */ - __vma_link_file(vp->insert, - vp->insert->vm_file->f_mapping); - } - } - - if (vp->anon_vma) { - anon_vma_lock_write(vp->anon_vma); - anon_vma_interval_tree_pre_update_vma(vp->vma); - if (vp->adj_next) - anon_vma_interval_tree_pre_update_vma(vp->adj_next); - } - - if (vp->file) { - flush_dcache_mmap_lock(vp->mapping); - vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); - if (vp->adj_next) - vma_interval_tree_remove(vp->adj_next, - &vp->mapping->i_mmap); - } - -} - -/* - * vma_complete- Helper function for handling the unlocking after altering VMAs, - * or for inserting a VMA. - * - * @vp: The vma_prepare struct - * @vmi: The vma iterator - * @mm: The mm_struct - */ -static inline void vma_complete(struct vma_prepare *vp, - struct vma_iterator *vmi, struct mm_struct *mm) -{ - if (vp->file) { - if (vp->adj_next) - vma_interval_tree_insert(vp->adj_next, - &vp->mapping->i_mmap); - vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); - flush_dcache_mmap_unlock(vp->mapping); - } - - if (vp->remove && vp->file) { - __remove_shared_vm_struct(vp->remove, vp->mapping); - if (vp->remove2) - __remove_shared_vm_struct(vp->remove2, vp->mapping); - } else if (vp->insert) { - /* - * split_vma has split insert from vma, and needs - * us to insert it before dropping the locks - * (it may either follow vma or precede it). - */ - vma_iter_store(vmi, vp->insert); - mm->map_count++; - } - - if (vp->anon_vma) { - anon_vma_interval_tree_post_update_vma(vp->vma); - if (vp->adj_next) - anon_vma_interval_tree_post_update_vma(vp->adj_next); - anon_vma_unlock_write(vp->anon_vma); - } - - if (vp->file) { - i_mmap_unlock_write(vp->mapping); - uprobe_mmap(vp->vma); - - if (vp->adj_next) - uprobe_mmap(vp->adj_next); - } - - if (vp->remove) { -again: - vma_mark_detached(vp->remove, true); - if (vp->file) { - uprobe_munmap(vp->remove, vp->remove->vm_start, - vp->remove->vm_end); - fput(vp->file); - } - if (vp->remove->anon_vma) - anon_vma_merge(vp->vma, vp->remove); - mm->map_count--; - mpol_put(vma_policy(vp->remove)); - if (!vp->remove2) - WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); - vm_area_free(vp->remove); - - /* - * In mprotect's case 6 (see comments on vma_merge), - * we are removing both mid and next vmas - */ - if (vp->remove2) { - vp->remove = vp->remove2; - vp->remove2 = NULL; - goto again; - } - } - if (vp->insert && vp->file) - uprobe_mmap(vp->insert); - validate_mm(mm); -} - -/* - * dup_anon_vma() - Helper function to duplicate anon_vma - * @dst: The destination VMA - * @src: The source VMA - * @dup: Pointer to the destination VMA when successful. - * - * Returns: 0 on success. - */ -static inline int dup_anon_vma(struct vm_area_struct *dst, - struct vm_area_struct *src, struct vm_area_struct **dup) -{ - /* - * Easily overlooked: when mprotect shifts the boundary, make sure the - * expanding vma has anon_vma set if the shrinking vma had, to cover any - * anon pages imported. - */ - if (src->anon_vma && !dst->anon_vma) { - int ret; - - vma_assert_write_locked(dst); - dst->anon_vma = src->anon_vma; - ret = anon_vma_clone(dst, src); - if (ret) - return ret; - - *dup = dst; - } - - return 0; -} - -/* - * vma_expand - Expand an existing VMA - * - * @vmi: The vma iterator - * @vma: The vma to expand - * @start: The start of the vma - * @end: The exclusive end of the vma - * @pgoff: The page offset of vma - * @next: The current of next vma. - * - * Expand @vma to @start and @end. Can expand off the start and end. Will - * expand over @next if it's different from @vma and @end == @next->vm_end. - * Checking if the @vma can expand and merge with @next needs to be handled by - * the caller. - * - * Returns: 0 on success - */ -int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff, - struct vm_area_struct *next) -{ - struct vm_area_struct *anon_dup = NULL; - bool remove_next = false; - struct vma_prepare vp; - - vma_start_write(vma); - if (next && (vma != next) && (end == next->vm_end)) { - int ret; - - remove_next = true; - vma_start_write(next); - ret = dup_anon_vma(vma, next, &anon_dup); - if (ret) - return ret; - } - - init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL); - /* Not merging but overwriting any part of next is not handled. */ - VM_WARN_ON(next && !vp.remove && - next != vma && end > next->vm_start); - /* Only handles expanding */ - VM_WARN_ON(vma->vm_start < start || vma->vm_end > end); - - /* Note: vma iterator must be pointing to 'start' */ - vma_iter_config(vmi, start, end); - if (vma_iter_prealloc(vmi, vma)) - goto nomem; - - vma_prepare(&vp); - vma_adjust_trans_huge(vma, start, end, 0); - vma_set_range(vma, start, end, pgoff); - vma_iter_store(vmi, vma); - - vma_complete(&vp, vmi, vma->vm_mm); - return 0; - -nomem: - if (anon_dup) - unlink_anon_vmas(anon_dup); - return -ENOMEM; -} - -/* - * vma_shrink() - Reduce an existing VMAs memory area - * @vmi: The vma iterator - * @vma: The VMA to modify - * @start: The new start - * @end: The new end - * - * Returns: 0 on success, -ENOMEM otherwise - */ -int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff) -{ - struct vma_prepare vp; - - WARN_ON((vma->vm_start != start) && (vma->vm_end != end)); - - if (vma->vm_start < start) - vma_iter_config(vmi, vma->vm_start, start); - else - vma_iter_config(vmi, end, vma->vm_end); - - if (vma_iter_prealloc(vmi, NULL)) - return -ENOMEM; - - vma_start_write(vma); - - init_vma_prep(&vp, vma); - vma_prepare(&vp); - vma_adjust_trans_huge(vma, start, end, 0); - - vma_iter_clear(vmi); - vma_set_range(vma, start, end, pgoff); - vma_complete(&vp, vmi, vma->vm_mm); - return 0; -} - -/* - * If the vma has a ->close operation then the driver probably needs to release - * per-vma resources, so we don't attempt to merge those if the caller indicates - * the current vma may be removed as part of the merge. - */ -static inline bool is_mergeable_vma(struct vm_area_struct *vma, - struct file *file, unsigned long vm_flags, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name, bool may_remove_vma) -{ - /* - * VM_SOFTDIRTY should not prevent from VMA merging, if we - * match the flags but dirty bit -- the caller should mark - * merged VMA as dirty. If dirty bit won't be excluded from - * comparison, we increase pressure on the memory system forcing - * the kernel to generate new VMAs when old one could be - * extended instead. - */ - if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY) - return false; - if (vma->vm_file != file) - return false; - if (may_remove_vma && vma->vm_ops && vma->vm_ops->close) - return false; - if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) - return false; - if (!anon_vma_name_eq(anon_vma_name(vma), anon_name)) - return false; - return true; -} - -static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1, - struct anon_vma *anon_vma2, struct vm_area_struct *vma) -{ - /* - * The list_is_singular() test is to avoid merging VMA cloned from - * parents. This can improve scalability caused by anon_vma lock. - */ - if ((!anon_vma1 || !anon_vma2) && (!vma || - list_is_singular(&vma->anon_vma_chain))) - return true; - return anon_vma1 == anon_vma2; -} - -/* - * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) - * in front of (at a lower virtual address and file offset than) the vma. - * - * We cannot merge two vmas if they have differently assigned (non-NULL) - * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. - * - * We don't check here for the merged mmap wrapping around the end of pagecache - * indices (16TB on ia32) because do_mmap() does not permit mmap's which - * wrap, nor mmaps which cover the final page at index -1UL. - * - * We assume the vma may be removed as part of the merge. - */ -static bool -can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name) -{ - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, true) && - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { - if (vma->vm_pgoff == vm_pgoff) - return true; - } - return false; -} - -/* - * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) - * beyond (at a higher virtual address and file offset than) the vma. - * - * We cannot merge two vmas if they have differently assigned (non-NULL) - * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. - * - * We assume that vma is not removed as part of the merge. - */ -static bool -can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name) -{ - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, false) && - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { - pgoff_t vm_pglen; - vm_pglen = vma_pages(vma); - if (vma->vm_pgoff + vm_pglen == vm_pgoff) - return true; - } - return false; -} - -/* - * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name), - * figure out whether that can be merged with its predecessor or its - * successor. Or both (it neatly fills a hole). - * - * In most cases - when called for mmap, brk or mremap - [addr,end) is - * certain not to be mapped by the time vma_merge is called; but when - * called for mprotect, it is certain to be already mapped (either at - * an offset within prev, or at the start of next), and the flags of - * this area are about to be changed to vm_flags - and the no-change - * case has already been eliminated. - * - * The following mprotect cases have to be considered, where **** is - * the area passed down from mprotect_fixup, never extending beyond one - * vma, PPPP is the previous vma, CCCC is a concurrent vma that starts - * at the same address as **** and is of the same or larger span, and - * NNNN the next vma after ****: - * - * **** **** **** - * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPCCCCCC - * cannot merge might become might become - * PPNNNNNNNNNN PPPPPPPPPPCC - * mmap, brk or case 4 below case 5 below - * mremap move: - * **** **** - * PPPP NNNN PPPPCCCCNNNN - * might become might become - * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or - * PPPPPPPPNNNN 2 or PPPPPPPPNNNN 7 or - * PPPPNNNNNNNN 3 PPPPNNNNNNNN 8 - * - * It is important for case 8 that the vma CCCC overlapping the - * region **** is never going to extended over NNNN. Instead NNNN must - * be extended in region **** and CCCC must be removed. This way in - * all cases where vma_merge succeeds, the moment vma_merge drops the - * rmap_locks, the properties of the merged vma will be already - * correct for the whole merged range. Some of those properties like - * vm_page_prot/vm_flags may be accessed by rmap_walks and they must - * be correct for the whole merged range immediately after the - * rmap_locks are released. Otherwise if NNNN would be removed and - * CCCC would be extended over the NNNN range, remove_migration_ptes - * or other rmap walkers (if working on addresses beyond the "end" - * parameter) may establish ptes with the wrong permissions of CCCC - * instead of the right permissions of NNNN. - * - * In the code below: - * PPPP is represented by *prev - * CCCC is represented by *curr or not represented at all (NULL) - * NNNN is represented by *next or not represented at all (NULL) - * **** is not represented - it will be merged and the vma containing the - * area is returned, or the function will return NULL - */ -static struct vm_area_struct -*vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev, - struct vm_area_struct *src, unsigned long addr, unsigned long end, - unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name) -{ - struct mm_struct *mm = src->vm_mm; - struct anon_vma *anon_vma = src->anon_vma; - struct file *file = src->vm_file; - struct vm_area_struct *curr, *next, *res; - struct vm_area_struct *vma, *adjust, *remove, *remove2; - struct vm_area_struct *anon_dup = NULL; - struct vma_prepare vp; - pgoff_t vma_pgoff; - int err = 0; - bool merge_prev = false; - bool merge_next = false; - bool vma_expanded = false; - unsigned long vma_start = addr; - unsigned long vma_end = end; - pgoff_t pglen = (end - addr) >> PAGE_SHIFT; - long adj_start = 0; - - /* - * We later require that vma->vm_flags == vm_flags, - * so this tests vma->vm_flags & VM_SPECIAL, too. - */ - if (vm_flags & VM_SPECIAL) - return NULL; - - /* Does the input range span an existing VMA? (cases 5 - 8) */ - curr = find_vma_intersection(mm, prev ? prev->vm_end : 0, end); - - if (!curr || /* cases 1 - 4 */ - end == curr->vm_end) /* cases 6 - 8, adjacent VMA */ - next = vma_lookup(mm, end); - else - next = NULL; /* case 5 */ - - if (prev) { - vma_start = prev->vm_start; - vma_pgoff = prev->vm_pgoff; - - /* Can we merge the predecessor? */ - if (addr == prev->vm_end && mpol_equal(vma_policy(prev), policy) - && can_vma_merge_after(prev, vm_flags, anon_vma, file, - pgoff, vm_userfaultfd_ctx, anon_name)) { - merge_prev = true; - vma_prev(vmi); - } - } - - /* Can we merge the successor? */ - if (next && mpol_equal(policy, vma_policy(next)) && - can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, - vm_userfaultfd_ctx, anon_name)) { - merge_next = true; - } - - /* Verify some invariant that must be enforced by the caller. */ - VM_WARN_ON(prev && addr <= prev->vm_start); - VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end)); - VM_WARN_ON(addr >= end); - - if (!merge_prev && !merge_next) - return NULL; /* Not mergeable. */ - - if (merge_prev) - vma_start_write(prev); - - res = vma = prev; - remove = remove2 = adjust = NULL; - - /* Can we merge both the predecessor and the successor? */ - if (merge_prev && merge_next && - is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { - vma_start_write(next); - remove = next; /* case 1 */ - vma_end = next->vm_end; - err = dup_anon_vma(prev, next, &anon_dup); - if (curr) { /* case 6 */ - vma_start_write(curr); - remove = curr; - remove2 = next; - /* - * Note that the dup_anon_vma below cannot overwrite err - * since the first caller would do nothing unless next - * has an anon_vma. - */ - if (!next->anon_vma) - err = dup_anon_vma(prev, curr, &anon_dup); - } - } else if (merge_prev) { /* case 2 */ - if (curr) { - vma_start_write(curr); - if (end == curr->vm_end) { /* case 7 */ - /* - * can_vma_merge_after() assumed we would not be - * removing prev vma, so it skipped the check - * for vm_ops->close, but we are removing curr - */ - if (curr->vm_ops && curr->vm_ops->close) - err = -EINVAL; - remove = curr; - } else { /* case 5 */ - adjust = curr; - adj_start = (end - curr->vm_start); - } - if (!err) - err = dup_anon_vma(prev, curr, &anon_dup); - } - } else { /* merge_next */ - vma_start_write(next); - res = next; - if (prev && addr < prev->vm_end) { /* case 4 */ - vma_start_write(prev); - vma_end = addr; - adjust = next; - adj_start = -(prev->vm_end - addr); - err = dup_anon_vma(next, prev, &anon_dup); - } else { - /* - * Note that cases 3 and 8 are the ONLY ones where prev - * is permitted to be (but is not necessarily) NULL. - */ - vma = next; /* case 3 */ - vma_start = addr; - vma_end = next->vm_end; - vma_pgoff = next->vm_pgoff - pglen; - if (curr) { /* case 8 */ - vma_pgoff = curr->vm_pgoff; - vma_start_write(curr); - remove = curr; - err = dup_anon_vma(next, curr, &anon_dup); - } - } - } - - /* Error in anon_vma clone. */ - if (err) - goto anon_vma_fail; - - if (vma_start < vma->vm_start || vma_end > vma->vm_end) - vma_expanded = true; - - if (vma_expanded) { - vma_iter_config(vmi, vma_start, vma_end); - } else { - vma_iter_config(vmi, adjust->vm_start + adj_start, - adjust->vm_end); - } - - if (vma_iter_prealloc(vmi, vma)) - goto prealloc_fail; - - init_multi_vma_prep(&vp, vma, adjust, remove, remove2); - VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma && - vp.anon_vma != adjust->anon_vma); - - vma_prepare(&vp); - vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start); - vma_set_range(vma, vma_start, vma_end, vma_pgoff); - - if (vma_expanded) - vma_iter_store(vmi, vma); - - if (adj_start) { - adjust->vm_start += adj_start; - adjust->vm_pgoff += adj_start >> PAGE_SHIFT; - if (adj_start < 0) { - WARN_ON(vma_expanded); - vma_iter_store(vmi, next); - } - } - - vma_complete(&vp, vmi, mm); - khugepaged_enter_vma(res, vm_flags); - return res; - -prealloc_fail: - if (anon_dup) - unlink_anon_vmas(anon_dup); - -anon_vma_fail: - vma_iter_set(vmi, addr); - vma_iter_load(vmi); - return NULL; -} - -/* - * Rough compatibility check to quickly see if it's even worth looking - * at sharing an anon_vma. - * - * They need to have the same vm_file, and the flags can only differ - * in things that mprotect may change. - * - * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that - * we can merge the two vma's. For example, we refuse to merge a vma if - * there is a vm_ops->close() function, because that indicates that the - * driver is doing some kind of reference counting. But that doesn't - * really matter for the anon_vma sharing case. - */ -static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) -{ - return a->vm_end == b->vm_start && - mpol_equal(vma_policy(a), vma_policy(b)) && - a->vm_file == b->vm_file && - !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) && - b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); -} - -/* - * Do some basic sanity checking to see if we can re-use the anon_vma - * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be - * the same as 'old', the other will be the new one that is trying - * to share the anon_vma. - * - * NOTE! This runs with mmap_lock held for reading, so it is possible that - * the anon_vma of 'old' is concurrently in the process of being set up - * by another page fault trying to merge _that_. But that's ok: if it - * is being set up, that automatically means that it will be a singleton - * acceptable for merging, so we can do all of this optimistically. But - * we do that READ_ONCE() to make sure that we never re-load the pointer. - * - * IOW: that the "list_is_singular()" test on the anon_vma_chain only - * matters for the 'stable anon_vma' case (ie the thing we want to avoid - * is to return an anon_vma that is "complex" due to having gone through - * a fork). - * - * We also make sure that the two vma's are compatible (adjacent, - * and with the same memory policies). That's all stable, even with just - * a read lock on the mmap_lock. - */ -static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) -{ - if (anon_vma_compatible(a, b)) { - struct anon_vma *anon_vma = READ_ONCE(old->anon_vma); - - if (anon_vma && list_is_singular(&old->anon_vma_chain)) - return anon_vma; - } - return NULL; -} - -/* - * find_mergeable_anon_vma is used by anon_vma_prepare, to check - * neighbouring vmas for a suitable anon_vma, before it goes off - * to allocate a new anon_vma. It checks because a repetitive - * sequence of mprotects and faults may otherwise lead to distinct - * anon_vmas being allocated, preventing vma merge in subsequent - * mprotect. - */ -struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) -{ - struct anon_vma *anon_vma = NULL; - struct vm_area_struct *prev, *next; - VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end); - - /* Try next first. */ - next = vma_iter_load(&vmi); - if (next) { - anon_vma = reusable_anon_vma(next, vma, next); - if (anon_vma) - return anon_vma; - } - - prev = vma_prev(&vmi); - VM_BUG_ON_VMA(prev != vma, vma); - prev = vma_prev(&vmi); - /* Try prev next. */ - if (prev) - anon_vma = reusable_anon_vma(prev, prev, vma); - - /* - * We might reach here with anon_vma == NULL if we can't find - * any reusable anon_vma. - * There's no absolute need to look only at touching neighbours: - * we could search further afield for "compatible" anon_vmas. - * But it would probably just be a waste of time searching, - * or lead to too many vmas hanging off the same anon_vma. - * We're trying to allow mprotect remerging later on, - * not trying to minimize memory used for anon_vmas. - */ - return anon_vma; -} - /* * If a hint addr is less than mmap_min_addr change hint to be as * low as possible but still greater than mmap_min_addr @@ -1549,85 +577,6 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) } #endif /* __ARCH_WANT_SYS_OLD_MMAP */ -static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops) -{ - return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite); -} - -static bool vma_is_shared_writable(struct vm_area_struct *vma) -{ - return (vma->vm_flags & (VM_WRITE | VM_SHARED)) == - (VM_WRITE | VM_SHARED); -} - -static bool vma_fs_can_writeback(struct vm_area_struct *vma) -{ - /* No managed pages to writeback. */ - if (vma->vm_flags & VM_PFNMAP) - return false; - - return vma->vm_file && vma->vm_file->f_mapping && - mapping_can_writeback(vma->vm_file->f_mapping); -} - -/* - * Does this VMA require the underlying folios to have their dirty state - * tracked? - */ -bool vma_needs_dirty_tracking(struct vm_area_struct *vma) -{ - /* Only shared, writable VMAs require dirty tracking. */ - if (!vma_is_shared_writable(vma)) - return false; - - /* Does the filesystem need to be notified? */ - if (vm_ops_needs_writenotify(vma->vm_ops)) - return true; - - /* - * Even if the filesystem doesn't indicate a need for writenotify, if it - * can writeback, dirty tracking is still required. - */ - return vma_fs_can_writeback(vma); -} - -/* - * Some shared mappings will want the pages marked read-only - * to track write events. If so, we'll downgrade vm_page_prot - * to the private version (using protection_map[] without the - * VM_SHARED bit). - */ -bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) -{ - /* If it was private or non-writable, the write bit is already clear */ - if (!vma_is_shared_writable(vma)) - return false; - - /* The backer wishes to know when pages are first written to? */ - if (vm_ops_needs_writenotify(vma->vm_ops)) - return true; - - /* The open routine did something to the protections that pgprot_modify - * won't preserve? */ - if (pgprot_val(vm_page_prot) != - pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags))) - return false; - - /* - * Do we need to track softdirty? hugetlb does not support softdirty - * tracking yet. - */ - if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) - return true; - - /* Do we need write faults for uffd-wp tracking? */ - if (userfaultfd_wp(vma)) - return true; - - /* Can the mapping track the dirty pages? */ - return vma_fs_can_writeback(vma); -} - /* * We account for memory if it's a private writeable mapping, * not hugepages and VM_NORESERVE wasn't set. @@ -1753,6 +702,18 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) return gap; } +/* + * Determine if the allocation needs to ensure that there is no + * existing mapping within it's guard gaps, for use as start_gap. + */ +static inline unsigned long stack_guard_placement(vm_flags_t vm_flags) +{ + if (vm_flags & VM_SHADOW_STACK) + return PAGE_SIZE; + + return 0; +} + /* * Search for an unmapped address range. * @@ -1789,7 +750,7 @@ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info) unsigned long generic_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) + unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; @@ -1814,6 +775,7 @@ generic_get_unmapped_area(struct file *filp, unsigned long addr, info.length = len; info.low_limit = mm->mmap_base; info.high_limit = mmap_end; + info.start_gap = stack_guard_placement(vm_flags); return vm_unmapped_area(&info); } @@ -1821,9 +783,10 @@ generic_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) + unsigned long flags, vm_flags_t vm_flags) { - return generic_get_unmapped_area(filp, addr, len, pgoff, flags); + return generic_get_unmapped_area(filp, addr, len, pgoff, flags, + vm_flags); } #endif @@ -1834,7 +797,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) + unsigned long flags, vm_flags_t vm_flags) { struct vm_area_struct *vma, *prev; struct mm_struct *mm = current->mm; @@ -1862,6 +825,7 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = arch_get_mmap_base(addr, mm->mmap_base); + info.start_gap = stack_guard_placement(vm_flags); addr = vm_unmapped_area(&info); /* @@ -1885,26 +849,10 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) + unsigned long flags, vm_flags_t vm_flags) { - return generic_get_unmapped_area_topdown(filp, addr, len, pgoff, flags); -} -#endif - -#ifndef HAVE_ARCH_UNMAPPED_AREA_VMFLAGS -unsigned long -arch_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) -{ - return arch_get_unmapped_area(filp, addr, len, pgoff, flags); -} - -unsigned long -arch_get_unmapped_area_topdown_vmflags(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags, vm_flags_t vm_flags) -{ - return arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags); + return generic_get_unmapped_area_topdown(filp, addr, len, pgoff, flags, + vm_flags); } #endif @@ -1914,9 +862,9 @@ unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *fi vm_flags_t vm_flags) { if (test_bit(MMF_TOPDOWN, &mm->flags)) - return arch_get_unmapped_area_topdown_vmflags(filp, addr, len, pgoff, - flags, vm_flags); - return arch_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, vm_flags); + return arch_get_unmapped_area_topdown(filp, addr, len, pgoff, + flags, vm_flags); + return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); } unsigned long @@ -1978,8 +926,8 @@ mm_get_unmapped_area(struct mm_struct *mm, struct file *file, unsigned long pgoff, unsigned long flags) { if (test_bit(MMF_TOPDOWN, &mm->flags)) - return arch_get_unmapped_area_topdown(file, addr, len, pgoff, flags); - return arch_get_unmapped_area(file, addr, len, pgoff, flags); + return arch_get_unmapped_area_topdown(file, addr, len, pgoff, flags, 0); + return arch_get_unmapped_area(file, addr, len, pgoff, flags, 0); } EXPORT_SYMBOL(mm_get_unmapped_area); @@ -2393,443 +1341,6 @@ struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr) return vma; } -/* - * Ok - we have the memory areas we should free on a maple tree so release them, - * and do the vma updates. - * - * Called with the mm semaphore held. - */ -static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) -{ - unsigned long nr_accounted = 0; - struct vm_area_struct *vma; - - /* Update high watermark before we lower total_vm */ - update_hiwater_vm(mm); - mas_for_each(mas, vma, ULONG_MAX) { - long nrpages = vma_pages(vma); - - if (vma->vm_flags & VM_ACCOUNT) - nr_accounted += nrpages; - vm_stat_account(mm, vma->vm_flags, -nrpages); - remove_vma(vma, false); - } - vm_unacct_memory(nr_accounted); -} - -/* - * Get rid of page table information in the indicated region. - * - * Called with the mm semaphore held. - */ -static void unmap_region(struct mm_struct *mm, struct ma_state *mas, - struct vm_area_struct *vma, struct vm_area_struct *prev, - struct vm_area_struct *next, unsigned long start, - unsigned long end, unsigned long tree_end, bool mm_wr_locked) -{ - struct mmu_gather tlb; - unsigned long mt_start = mas->index; - - lru_add_drain(); - tlb_gather_mmu(&tlb, mm); - update_hiwater_rss(mm); - unmap_vmas(&tlb, mas, vma, start, end, tree_end, mm_wr_locked); - mas_set(mas, mt_start); - free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, - next ? next->vm_start : USER_PGTABLES_CEILING, - mm_wr_locked); - tlb_finish_mmu(&tlb); -} - -/* - * __split_vma() bypasses sysctl_max_map_count checking. We use this where it - * has already been checked or doesn't make sense to fail. - * VMA Iterator will point to the end VMA. - */ -static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long addr, int new_below) -{ - struct vma_prepare vp; - struct vm_area_struct *new; - int err; - - WARN_ON(vma->vm_start >= addr); - WARN_ON(vma->vm_end <= addr); - - if (vma->vm_ops && vma->vm_ops->may_split) { - err = vma->vm_ops->may_split(vma, addr); - if (err) - return err; - } - - new = vm_area_dup(vma); - if (!new) - return -ENOMEM; - - if (new_below) { - new->vm_end = addr; - } else { - new->vm_start = addr; - new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); - } - - err = -ENOMEM; - vma_iter_config(vmi, new->vm_start, new->vm_end); - if (vma_iter_prealloc(vmi, new)) - goto out_free_vma; - - err = vma_dup_policy(vma, new); - if (err) - goto out_free_vmi; - - err = anon_vma_clone(new, vma); - if (err) - goto out_free_mpol; - - if (new->vm_file) - get_file(new->vm_file); - - if (new->vm_ops && new->vm_ops->open) - new->vm_ops->open(new); - - vma_start_write(vma); - vma_start_write(new); - - init_vma_prep(&vp, vma); - vp.insert = new; - vma_prepare(&vp); - vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); - - if (new_below) { - vma->vm_start = addr; - vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT; - } else { - vma->vm_end = addr; - } - - /* vma_complete stores the new vma */ - vma_complete(&vp, vmi, vma->vm_mm); - - /* Success. */ - if (new_below) - vma_next(vmi); - return 0; - -out_free_mpol: - mpol_put(vma_policy(new)); -out_free_vmi: - vma_iter_free(vmi); -out_free_vma: - vm_area_free(new); - return err; -} - -/* - * Split a vma into two pieces at address 'addr', a new vma is allocated - * either for the first part or the tail. - */ -static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long addr, int new_below) -{ - if (vma->vm_mm->map_count >= sysctl_max_map_count) - return -ENOMEM; - - return __split_vma(vmi, vma, addr, new_below); -} - -/* - * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd - * context and anonymous VMA name within the range [start, end). - * - * As a result, we might be able to merge the newly modified VMA range with an - * adjacent VMA with identical properties. - * - * If no merge is possible and the range does not span the entirety of the VMA, - * we then need to split the VMA to accommodate the change. - * - * The function returns either the merged VMA, the original VMA if a split was - * required instead, or an error if the split failed. - */ -struct vm_area_struct *vma_modify(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, unsigned long end, - unsigned long vm_flags, - struct mempolicy *policy, - struct vm_userfaultfd_ctx uffd_ctx, - struct anon_vma_name *anon_name) -{ - pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - struct vm_area_struct *merged; - - merged = vma_merge(vmi, prev, vma, start, end, vm_flags, - pgoff, policy, uffd_ctx, anon_name); - if (merged) - return merged; - - if (vma->vm_start < start) { - int err = split_vma(vmi, vma, start, 1); - - if (err) - return ERR_PTR(err); - } - - if (vma->vm_end > end) { - int err = split_vma(vmi, vma, end, 0); - - if (err) - return ERR_PTR(err); - } - - return vma; -} - -/* - * Attempt to merge a newly mapped VMA with those adjacent to it. The caller - * must ensure that [start, end) does not overlap any existing VMA. - */ -static struct vm_area_struct -*vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev, - struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff) -{ - return vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff, - vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); -} - -/* - * Expand vma by delta bytes, potentially merging with an immediately adjacent - * VMA with identical properties. - */ -struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, - struct vm_area_struct *vma, - unsigned long delta) -{ - pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma); - - /* vma is specified as prev, so case 1 or 2 will apply. */ - return vma_merge(vmi, vma, vma, vma->vm_end, vma->vm_end + delta, - vma->vm_flags, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); -} - -/* - * do_vmi_align_munmap() - munmap the aligned region from @start to @end. - * @vmi: The vma iterator - * @vma: The starting vm_area_struct - * @mm: The mm_struct - * @start: The aligned start address to munmap. - * @end: The aligned end address to munmap. - * @uf: The userfaultfd list_head - * @unlock: Set to true to drop the mmap_lock. unlocking only happens on - * success. - * - * Return: 0 on success and drops the lock if so directed, error and leaves the - * lock held otherwise. - */ -static int -do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, - struct mm_struct *mm, unsigned long start, - unsigned long end, struct list_head *uf, bool unlock) -{ - struct vm_area_struct *prev, *next = NULL; - struct maple_tree mt_detach; - int count = 0; - int error = -ENOMEM; - unsigned long locked_vm = 0; - MA_STATE(mas_detach, &mt_detach, 0, 0); - mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); - mt_on_stack(mt_detach); - - /* - * If we need to split any vma, do it now to save pain later. - * - * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially - * unmapped vm_area_struct will remain in use: so lower split_vma - * places tmp vma above, and higher split_vma places tmp vma below. - */ - - /* Does it split the first one? */ - if (start > vma->vm_start) { - - /* - * Make sure that map_count on return from munmap() will - * not exceed its limit; but let map_count go just above - * its limit temporarily, to help free resources as expected. - */ - if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) - goto map_count_exceeded; - - error = __split_vma(vmi, vma, start, 1); - if (error) - goto start_split_failed; - } - - /* - * Detach a range of VMAs from the mm. Using next as a temp variable as - * it is always overwritten. - */ - next = vma; - do { - /* Does it split the end? */ - if (next->vm_end > end) { - error = __split_vma(vmi, next, end, 0); - if (error) - goto end_split_failed; - } - vma_start_write(next); - mas_set(&mas_detach, count); - error = mas_store_gfp(&mas_detach, next, GFP_KERNEL); - if (error) - goto munmap_gather_failed; - vma_mark_detached(next, true); - if (next->vm_flags & VM_LOCKED) - locked_vm += vma_pages(next); - - count++; - if (unlikely(uf)) { - /* - * If userfaultfd_unmap_prep returns an error the vmas - * will remain split, but userland will get a - * highly unexpected error anyway. This is no - * different than the case where the first of the two - * __split_vma fails, but we don't undo the first - * split, despite we could. This is unlikely enough - * failure that it's not worth optimizing it for. - */ - error = userfaultfd_unmap_prep(next, start, end, uf); - - if (error) - goto userfaultfd_error; - } -#ifdef CONFIG_DEBUG_VM_MAPLE_TREE - BUG_ON(next->vm_start < start); - BUG_ON(next->vm_start > end); -#endif - } for_each_vma_range(*vmi, next, end); - -#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) - /* Make sure no VMAs are about to be lost. */ - { - MA_STATE(test, &mt_detach, 0, 0); - struct vm_area_struct *vma_mas, *vma_test; - int test_count = 0; - - vma_iter_set(vmi, start); - rcu_read_lock(); - vma_test = mas_find(&test, count - 1); - for_each_vma_range(*vmi, vma_mas, end) { - BUG_ON(vma_mas != vma_test); - test_count++; - vma_test = mas_next(&test, count - 1); - } - rcu_read_unlock(); - BUG_ON(count != test_count); - } -#endif - - while (vma_iter_addr(vmi) > start) - vma_iter_prev_range(vmi); - - error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL); - if (error) - goto clear_tree_failed; - - /* Point of no return */ - mm->locked_vm -= locked_vm; - mm->map_count -= count; - if (unlock) - mmap_write_downgrade(mm); - - prev = vma_iter_prev_range(vmi); - next = vma_next(vmi); - if (next) - vma_iter_prev_range(vmi); - - /* - * We can free page tables without write-locking mmap_lock because VMAs - * were isolated before we downgraded mmap_lock. - */ - mas_set(&mas_detach, 1); - unmap_region(mm, &mas_detach, vma, prev, next, start, end, count, - !unlock); - /* Statistics and freeing VMAs */ - mas_set(&mas_detach, 0); - remove_mt(mm, &mas_detach); - validate_mm(mm); - if (unlock) - mmap_read_unlock(mm); - - __mt_destroy(&mt_detach); - return 0; - -clear_tree_failed: -userfaultfd_error: -munmap_gather_failed: -end_split_failed: - mas_set(&mas_detach, 0); - mas_for_each(&mas_detach, next, end) - vma_mark_detached(next, false); - - __mt_destroy(&mt_detach); -start_split_failed: -map_count_exceeded: - validate_mm(mm); - return error; -} - -/* - * do_vmi_munmap() - munmap a given range. - * @vmi: The vma iterator - * @mm: The mm_struct - * @start: The start address to munmap - * @len: The length of the range to munmap - * @uf: The userfaultfd list_head - * @unlock: set to true if the user wants to drop the mmap_lock on success - * - * This function takes a @mas that is either pointing to the previous VMA or set - * to MA_START and sets it up to remove the mapping(s). The @len will be - * aligned and any arch_unmap work will be preformed. - * - * Return: 0 on success and drops the lock if so directed, error and leaves the - * lock held otherwise. - */ -int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, - unsigned long start, size_t len, struct list_head *uf, - bool unlock) -{ - unsigned long end; - struct vm_area_struct *vma; - - if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) - return -EINVAL; - - end = start + PAGE_ALIGN(len); - if (end == start) - return -EINVAL; - - /* - * Check if memory is sealed before arch_unmap. - * Prevent unmapping a sealed VMA. - * can_modify_mm assumes we have acquired the lock on MM. - */ - if (unlikely(!can_modify_mm(mm, start, end))) - return -EPERM; - - /* arch_unmap() might do unmaps itself. */ - arch_unmap(mm, start, end); - - /* Find the first overlapping VMA */ - vma = vma_find(vmi, end); - if (!vma) { - if (unlock) - mmap_write_unlock(mm); - return 0; - } - - return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock); -} - /* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls. * @mm: The mm_struct * @start: The start address to munmap @@ -2852,100 +1363,67 @@ unsigned long mmap_region(struct file *file, unsigned long addr, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; - struct vm_area_struct *next, *prev, *merge; - pgoff_t pglen = len >> PAGE_SHIFT; + pgoff_t pglen = PHYS_PFN(len); + struct vm_area_struct *merge; unsigned long charged = 0; + struct vma_munmap_struct vms; + struct ma_state mas_detach; + struct maple_tree mt_detach; unsigned long end = addr + len; - unsigned long merge_start = addr, merge_end = end; bool writable_file_mapping = false; - pgoff_t vm_pgoff; - int error; + int error = -ENOMEM; VMA_ITERATOR(vmi, mm, addr); + VMG_STATE(vmg, mm, &vmi, addr, end, vm_flags, pgoff); - /* Check against address space limit. */ - if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { - unsigned long nr_pages; + vmg.file = file; + /* Find the first overlapping VMA */ + vma = vma_find(&vmi, end); + init_vma_munmap(&vms, &vmi, vma, addr, end, uf, /* unlock = */ false); + if (vma) { + mt_init_flags(&mt_detach, vmi.mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); + mt_on_stack(mt_detach); + mas_init(&mas_detach, &mt_detach, /* addr = */ 0); + /* Prepare to unmap any existing mapping in the area */ + error = vms_gather_munmap_vmas(&vms, &mas_detach); + if (error) + goto gather_failed; - /* - * MAP_FIXED may remove pages of mappings that intersects with - * requested mapping. Account for the pages it would unmap. - */ - nr_pages = count_vma_pages_range(mm, addr, end); - - if (!may_expand_vm(mm, vm_flags, - (len >> PAGE_SHIFT) - nr_pages)) - return -ENOMEM; + vmg.next = vms.next; + vmg.prev = vms.prev; + vma = NULL; + } else { + vmg.next = vma_iter_next_rewind(&vmi, &vmg.prev); } - /* Unmap any existing mapping in the area */ - error = do_vmi_munmap(&vmi, mm, addr, len, uf, false); - if (error == -EPERM) - return error; - else if (error) - return -ENOMEM; + /* Check against address space limit. */ + if (!may_expand_vm(mm, vm_flags, pglen - vms.nr_pages)) + goto abort_munmap; /* * Private writable mapping: check memory availability */ if (accountable_mapping(file, vm_flags)) { - charged = len >> PAGE_SHIFT; - if (security_vm_enough_memory_mm(mm, charged)) - return -ENOMEM; + charged = pglen; + charged -= vms.nr_accounted; + if (charged && security_vm_enough_memory_mm(mm, charged)) + goto abort_munmap; + + vms.nr_accounted = 0; vm_flags |= VM_ACCOUNT; + vmg.flags = vm_flags; } - next = vma_next(&vmi); - prev = vma_prev(&vmi); - if (vm_flags & VM_SPECIAL) { - if (prev) - vma_iter_next_range(&vmi); - goto cannot_expand; - } - - /* Attempt to expand an old mapping */ - /* Check next */ - if (next && next->vm_start == end && !vma_policy(next) && - can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen, - NULL_VM_UFFD_CTX, NULL)) { - merge_end = next->vm_end; - vma = next; - vm_pgoff = next->vm_pgoff - pglen; - } - - /* Check prev */ - if (prev && prev->vm_end == addr && !vma_policy(prev) && - (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file, - pgoff, vma->vm_userfaultfd_ctx, NULL) : - can_vma_merge_after(prev, vm_flags, NULL, file, pgoff, - NULL_VM_UFFD_CTX, NULL))) { - merge_start = prev->vm_start; - vma = prev; - vm_pgoff = prev->vm_pgoff; - } else if (prev) { - vma_iter_next_range(&vmi); - } - - /* Actually expand, if possible */ - if (vma && - !vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) { - khugepaged_enter_vma(vma, vm_flags); + vma = vma_merge_new_range(&vmg); + if (vma) goto expanded; - } - - if (vma == prev) - vma_iter_set(&vmi, addr); -cannot_expand: - /* * Determine the object being mapped and call the appropriate * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. */ vma = vm_area_alloc(mm); - if (!vma) { - error = -ENOMEM; + if (!vma) goto unacct_error; - } vma_iter_config(&vmi, addr, end); vma_set_range(vma, addr, end, pgoff); @@ -2954,6 +1432,11 @@ unsigned long mmap_region(struct file *file, unsigned long addr, if (file) { vma->vm_file = get_file(file); + /* + * call_mmap() may map PTE, so ensure there are no existing PTEs + * and call the vm_ops close function if one exists. + */ + vms_clean_up_area(&vms, &mas_detach); error = call_mmap(file, vma); if (error) goto unmap_and_free_vma; @@ -2979,10 +1462,11 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * If vm_flags changed after call_mmap(), we should try merge * vma again as we may succeed this time. */ - if (unlikely(vm_flags != vma->vm_flags && prev)) { - merge = vma_merge_new_vma(&vmi, prev, vma, - vma->vm_start, vma->vm_end, - vma->vm_pgoff); + if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) { + vmg.flags = vma->vm_flags; + /* If this fails, state is reset ready for a reattempt. */ + merge = vma_merge_new_range(&vmg); + if (merge) { /* * ->mmap() can change vma->vm_file and fput @@ -2998,6 +1482,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vm_flags = vma->vm_flags; goto unmap_writable; } + vma_iter_config(&vmi, addr, end); } vm_flags = vma->vm_flags; @@ -3030,7 +1515,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vma_link_file(vma); /* - * vma_merge() calls khugepaged_enter_vma() either, the below + * vma_merge_new_range() calls khugepaged_enter_vma() too, the below * call covers the non-merge case. */ khugepaged_enter_vma(vma, vma->vm_flags); @@ -3044,14 +1529,17 @@ unsigned long mmap_region(struct file *file, unsigned long addr, expanded: perf_event_mmap(vma); - vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); + /* Unmap any existing mapping in the area */ + vms_complete_munmap_vmas(&vms, &mas_detach); + + vm_stat_account(mm, vm_flags, pglen); if (vm_flags & VM_LOCKED) { if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) vm_flags_clear(vma, VM_LOCKED_MASK); else - mm->locked_vm += (len >> PAGE_SHIFT); + mm->locked_vm += pglen; } if (file) @@ -3072,7 +1560,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, return addr; close_and_free_vma: - if (file && vma->vm_ops && vma->vm_ops->close) + if (file && !vms.closed_vm_ops && vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); if (file || vma->vm_file) { @@ -3082,8 +1570,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vma_iter_set(&vmi, vma->vm_end); /* Undo any partial mapping done by a device driver. */ - unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start, - vma->vm_end, vma->vm_end, true); + unmap_region(&vmi.mas, vma, vmg.prev, vmg.next); } if (writable_file_mapping) mapping_unmap_writable(file->f_mapping); @@ -3092,6 +1579,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr, unacct_error: if (charged) vm_unacct_memory(charged); + +abort_munmap: + vms_abort_munmap_vmas(&vms, &mas_detach); +gather_failed: validate_mm(mm); return error; } @@ -3210,39 +1701,6 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, return ret; } -/* - * do_vma_munmap() - Unmap a full or partial vma. - * @vmi: The vma iterator pointing at the vma - * @vma: The first vma to be munmapped - * @start: the start of the address to unmap - * @end: The end of the address to unmap - * @uf: The userfaultfd list_head - * @unlock: Drop the lock on success - * - * unmaps a VMA mapping when the vma iterator is already in position. - * Does not handle alignment. - * - * Return: 0 on success drops the lock of so directed, error on failure and will - * still hold the lock. - */ -int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, struct list_head *uf, - bool unlock) -{ - struct mm_struct *mm = vma->vm_mm; - - /* - * Check if memory is sealed before arch_unmap. - * Prevent unmapping a sealed VMA. - * can_modify_mm assumes we have acquired the lock on MM. - */ - if (unlikely(!can_modify_mm(mm, start, end))) - return -EPERM; - - arch_unmap(mm, start, end); - return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock); -} - /* * do_brk_flags() - Increase the brk vma if the flags match. * @vmi: The vma iterator @@ -3259,7 +1717,6 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, unsigned long len, unsigned long flags) { struct mm_struct *mm = current->mm; - struct vma_prepare vp; /* * Check against address space limits by the changed size @@ -3279,25 +1736,16 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, * Expand the existing vma if possible; Note that singular lists do not * occur after forking, so the expand will only happen on new VMAs. */ - if (vma && vma->vm_end == addr && !vma_policy(vma) && - can_vma_merge_after(vma, flags, NULL, NULL, - addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) { - vma_iter_config(vmi, vma->vm_start, addr + len); - if (vma_iter_prealloc(vmi, vma)) + if (vma && vma->vm_end == addr) { + VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr)); + + vmg.prev = vma; + vma_iter_next_range(vmi); + + if (vma_merge_new_range(&vmg)) + goto out; + else if (vmg_nomem(&vmg)) goto unacct_fail; - - vma_start_write(vma); - - init_vma_prep(&vp, vma); - vma_prepare(&vp); - vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); - vma->vm_end = addr + len; - vm_flags_set(vma, VM_SOFTDIRTY); - vma_iter_store(vmi, vma); - - vma_complete(&vp, vmi, mm); - khugepaged_enter_vma(vma, flags); - goto out; } if (vma) @@ -3433,7 +1881,7 @@ void exit_mmap(struct mm_struct *mm) do { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); - remove_vma(vma, true); + remove_vma(vma, /* unreachable = */ true, /* closed = */ false); count++; cond_resched(); vma = vma_next(&vmi); @@ -3490,92 +1938,6 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) return 0; } -/* - * Copy the vma structure to a new location in the same mm, - * prior to moving page table entries, to effect an mremap move. - */ -struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, - unsigned long addr, unsigned long len, pgoff_t pgoff, - bool *need_rmap_locks) -{ - struct vm_area_struct *vma = *vmap; - unsigned long vma_start = vma->vm_start; - struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *new_vma, *prev; - bool faulted_in_anon_vma = true; - VMA_ITERATOR(vmi, mm, addr); - - /* - * If anonymous vma has not yet been faulted, update new pgoff - * to match new location, to increase its chance of merging. - */ - if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) { - pgoff = addr >> PAGE_SHIFT; - faulted_in_anon_vma = false; - } - - new_vma = find_vma_prev(mm, addr, &prev); - if (new_vma && new_vma->vm_start < addr + len) - return NULL; /* should never get here */ - - new_vma = vma_merge_new_vma(&vmi, prev, vma, addr, addr + len, pgoff); - if (new_vma) { - /* - * Source vma may have been merged into new_vma - */ - if (unlikely(vma_start >= new_vma->vm_start && - vma_start < new_vma->vm_end)) { - /* - * The only way we can get a vma_merge with - * self during an mremap is if the vma hasn't - * been faulted in yet and we were allowed to - * reset the dst vma->vm_pgoff to the - * destination address of the mremap to allow - * the merge to happen. mremap must change the - * vm_pgoff linearity between src and dst vmas - * (in turn preventing a vma_merge) to be - * safe. It is only safe to keep the vm_pgoff - * linear if there are no pages mapped yet. - */ - VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma); - *vmap = vma = new_vma; - } - *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); - } else { - new_vma = vm_area_dup(vma); - if (!new_vma) - goto out; - vma_set_range(new_vma, addr, addr + len, pgoff); - if (vma_dup_policy(vma, new_vma)) - goto out_free_vma; - if (anon_vma_clone(new_vma, vma)) - goto out_free_mempol; - if (new_vma->vm_file) - get_file(new_vma->vm_file); - if (new_vma->vm_ops && new_vma->vm_ops->open) - new_vma->vm_ops->open(new_vma); - if (vma_link(mm, new_vma)) - goto out_vma_link; - *need_rmap_locks = false; - } - return new_vma; - -out_vma_link: - if (new_vma->vm_ops && new_vma->vm_ops->close) - new_vma->vm_ops->close(new_vma); - - if (new_vma->vm_file) - fput(new_vma->vm_file); - - unlink_anon_vmas(new_vma); -out_free_mempol: - mpol_put(vma_policy(new_vma)); -out_free_vma: - vm_area_free(new_vma); -out: - return NULL; -} - /* * Return true if the calling process may expand its vm space by the passed * number of pages @@ -3620,10 +1982,16 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) static vm_fault_t special_mapping_fault(struct vm_fault *vmf); /* + * Close hook, called for unmap() and on the old vma for mremap(). + * * Having a close hook prevents vma merging regardless of flags. */ static void special_mapping_close(struct vm_area_struct *vma) { + const struct vm_special_mapping *sm = vma->vm_private_data; + + if (sm->close) + sm->close(sm, vma); } static const char *special_mapping_name(struct vm_area_struct *vma) @@ -3665,27 +2033,17 @@ static const struct vm_operations_struct special_mapping_vmops = { .may_split = special_mapping_split, }; -static const struct vm_operations_struct legacy_special_mapping_vmops = { - .close = special_mapping_close, - .fault = special_mapping_fault, -}; - static vm_fault_t special_mapping_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; pgoff_t pgoff; struct page **pages; + struct vm_special_mapping *sm = vma->vm_private_data; - if (vma->vm_ops == &legacy_special_mapping_vmops) { - pages = vma->vm_private_data; - } else { - struct vm_special_mapping *sm = vma->vm_private_data; + if (sm->fault) + return sm->fault(sm, vmf->vma, vmf); - if (sm->fault) - return sm->fault(sm, vmf->vma, vmf); - - pages = sm->pages; - } + pages = sm->pages; for (pgoff = vmf->pgoff; pgoff && *pages; ++pages) pgoff--; @@ -3740,8 +2098,7 @@ bool vma_is_special_mapping(const struct vm_area_struct *vma, const struct vm_special_mapping *sm) { return vma->vm_private_data == sm && - (vma->vm_ops == &special_mapping_vmops || - vma->vm_ops == &legacy_special_mapping_vmops); + vma->vm_ops == &special_mapping_vmops; } /* @@ -3762,214 +2119,6 @@ struct vm_area_struct *_install_special_mapping( &special_mapping_vmops); } -int install_special_mapping(struct mm_struct *mm, - unsigned long addr, unsigned long len, - unsigned long vm_flags, struct page **pages) -{ - struct vm_area_struct *vma = __install_special_mapping( - mm, addr, len, vm_flags, (void *)pages, - &legacy_special_mapping_vmops); - - return PTR_ERR_OR_ZERO(vma); -} - -static DEFINE_MUTEX(mm_all_locks_mutex); - -static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) -{ - if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { - /* - * The LSB of head.next can't change from under us - * because we hold the mm_all_locks_mutex. - */ - down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock); - /* - * We can safely modify head.next after taking the - * anon_vma->root->rwsem. If some other vma in this mm shares - * the same anon_vma we won't take it again. - * - * No need of atomic instructions here, head.next - * can't change from under us thanks to the - * anon_vma->root->rwsem. - */ - if (__test_and_set_bit(0, (unsigned long *) - &anon_vma->root->rb_root.rb_root.rb_node)) - BUG(); - } -} - -static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) -{ - if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { - /* - * AS_MM_ALL_LOCKS can't change from under us because - * we hold the mm_all_locks_mutex. - * - * Operations on ->flags have to be atomic because - * even if AS_MM_ALL_LOCKS is stable thanks to the - * mm_all_locks_mutex, there may be other cpus - * changing other bitflags in parallel to us. - */ - if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) - BUG(); - down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock); - } -} - -/* - * This operation locks against the VM for all pte/vma/mm related - * operations that could ever happen on a certain mm. This includes - * vmtruncate, try_to_unmap, and all page faults. - * - * The caller must take the mmap_lock in write mode before calling - * mm_take_all_locks(). The caller isn't allowed to release the - * mmap_lock until mm_drop_all_locks() returns. - * - * mmap_lock in write mode is required in order to block all operations - * that could modify pagetables and free pages without need of - * altering the vma layout. It's also needed in write mode to avoid new - * anon_vmas to be associated with existing vmas. - * - * A single task can't take more than one mm_take_all_locks() in a row - * or it would deadlock. - * - * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in - * mapping->flags avoid to take the same lock twice, if more than one - * vma in this mm is backed by the same anon_vma or address_space. - * - * We take locks in following order, accordingly to comment at beginning - * of mm/rmap.c: - * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for - * hugetlb mapping); - * - all vmas marked locked - * - all i_mmap_rwsem locks; - * - all anon_vma->rwseml - * - * We can take all locks within these types randomly because the VM code - * doesn't nest them and we protected from parallel mm_take_all_locks() by - * mm_all_locks_mutex. - * - * mm_take_all_locks() and mm_drop_all_locks are expensive operations - * that may have to take thousand of locks. - * - * mm_take_all_locks() can fail if it's interrupted by signals. - */ -int mm_take_all_locks(struct mm_struct *mm) -{ - struct vm_area_struct *vma; - struct anon_vma_chain *avc; - VMA_ITERATOR(vmi, mm, 0); - - mmap_assert_write_locked(mm); - - mutex_lock(&mm_all_locks_mutex); - - /* - * vma_start_write() does not have a complement in mm_drop_all_locks() - * because vma_start_write() is always asymmetrical; it marks a VMA as - * being written to until mmap_write_unlock() or mmap_write_downgrade() - * is reached. - */ - for_each_vma(vmi, vma) { - if (signal_pending(current)) - goto out_unlock; - vma_start_write(vma); - } - - vma_iter_init(&vmi, mm, 0); - for_each_vma(vmi, vma) { - if (signal_pending(current)) - goto out_unlock; - if (vma->vm_file && vma->vm_file->f_mapping && - is_vm_hugetlb_page(vma)) - vm_lock_mapping(mm, vma->vm_file->f_mapping); - } - - vma_iter_init(&vmi, mm, 0); - for_each_vma(vmi, vma) { - if (signal_pending(current)) - goto out_unlock; - if (vma->vm_file && vma->vm_file->f_mapping && - !is_vm_hugetlb_page(vma)) - vm_lock_mapping(mm, vma->vm_file->f_mapping); - } - - vma_iter_init(&vmi, mm, 0); - for_each_vma(vmi, vma) { - if (signal_pending(current)) - goto out_unlock; - if (vma->anon_vma) - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) - vm_lock_anon_vma(mm, avc->anon_vma); - } - - return 0; - -out_unlock: - mm_drop_all_locks(mm); - return -EINTR; -} - -static void vm_unlock_anon_vma(struct anon_vma *anon_vma) -{ - if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { - /* - * The LSB of head.next can't change to 0 from under - * us because we hold the mm_all_locks_mutex. - * - * We must however clear the bitflag before unlocking - * the vma so the users using the anon_vma->rb_root will - * never see our bitflag. - * - * No need of atomic instructions here, head.next - * can't change from under us until we release the - * anon_vma->root->rwsem. - */ - if (!__test_and_clear_bit(0, (unsigned long *) - &anon_vma->root->rb_root.rb_root.rb_node)) - BUG(); - anon_vma_unlock_write(anon_vma); - } -} - -static void vm_unlock_mapping(struct address_space *mapping) -{ - if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { - /* - * AS_MM_ALL_LOCKS can't change to 0 from under us - * because we hold the mm_all_locks_mutex. - */ - i_mmap_unlock_write(mapping); - if (!test_and_clear_bit(AS_MM_ALL_LOCKS, - &mapping->flags)) - BUG(); - } -} - -/* - * The mmap_lock cannot be released by the caller until - * mm_drop_all_locks() returns. - */ -void mm_drop_all_locks(struct mm_struct *mm) -{ - struct vm_area_struct *vma; - struct anon_vma_chain *avc; - VMA_ITERATOR(vmi, mm, 0); - - mmap_assert_write_locked(mm); - BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); - - for_each_vma(vmi, vma) { - if (vma->anon_vma) - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) - vm_unlock_anon_vma(avc->anon_vma); - if (vma->vm_file && vma->vm_file->f_mapping) - vm_unlock_mapping(vma->vm_file->f_mapping); - } - - mutex_unlock(&mm_all_locks_mutex); -} - /* * initialise the percpu counter for VM */ @@ -4088,3 +2237,86 @@ static int __meminit init_reserve_notifier(void) return 0; } subsys_initcall(init_reserve_notifier); + +/* + * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between + * this VMA and its relocated range, which will now reside at [vma->vm_start - + * shift, vma->vm_end - shift). + * + * This function is almost certainly NOT what you want for anything other than + * early executable temporary stack relocation. + */ +int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) +{ + /* + * The process proceeds as follows: + * + * 1) Use shift to calculate the new vma endpoints. + * 2) Extend vma to cover both the old and new ranges. This ensures the + * arguments passed to subsequent functions are consistent. + * 3) Move vma's page tables to the new range. + * 4) Free up any cleared pgd range. + * 5) Shrink the vma to cover only the new range. + */ + + struct mm_struct *mm = vma->vm_mm; + unsigned long old_start = vma->vm_start; + unsigned long old_end = vma->vm_end; + unsigned long length = old_end - old_start; + unsigned long new_start = old_start - shift; + unsigned long new_end = old_end - shift; + VMA_ITERATOR(vmi, mm, new_start); + VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff); + struct vm_area_struct *next; + struct mmu_gather tlb; + + BUG_ON(new_start > new_end); + + /* + * ensure there are no vmas between where we want to go + * and where we are + */ + if (vma != vma_next(&vmi)) + return -EFAULT; + + vma_iter_prev_range(&vmi); + /* + * cover the whole range: [new_start, old_end) + */ + vmg.vma = vma; + if (vma_expand(&vmg)) + return -ENOMEM; + + /* + * move the page tables downwards, on failure we rely on + * process cleanup to remove whatever mess we made. + */ + if (length != move_page_tables(vma, old_start, + vma, new_start, length, false, true)) + return -ENOMEM; + + lru_add_drain(); + tlb_gather_mmu(&tlb, mm); + next = vma_next(&vmi); + if (new_end > old_start) { + /* + * when the old and new regions overlap clear from new_end. + */ + free_pgd_range(&tlb, new_end, old_end, new_end, + next ? next->vm_start : USER_PGTABLES_CEILING); + } else { + /* + * otherwise, clean from old_start; this is done to not touch + * the address space in [new_end, old_start) some architectures + * have constraints on va-space that make this illegal (IA64) - + * for the others its just a little faster. + */ + free_pgd_range(&tlb, old_start, old_end, new_end, + next ? next->vm_start : USER_PGTABLES_CEILING); + } + tlb_finish_mmu(&tlb); + + vma_prev(&vmi); + /* Shrink the vma to just the new range */ + return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff); +} diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 8982e6139d07..fc18fe274505 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -19,6 +19,8 @@ #include #include +#include "vma.h" + /* global SRCU for all MMs */ DEFINE_STATIC_SRCU(srcu); diff --git a/mm/mmzone.c b/mm/mmzone.c index c01896eca736..f9baa8882fbf 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -66,7 +66,7 @@ struct zoneref *__next_zones_zonelist(struct zoneref *z, z++; else while (zonelist_zone_idx(z) > highest_zoneidx || - (z->zone && !zref_in_nodemask(z, nodes))) + (zonelist_zone(z) && !zref_in_nodemask(z, nodes))) z++; return z; diff --git a/mm/mprotect.c b/mm/mprotect.c index 222ab434da54..0c5d6d06107d 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -161,8 +161,7 @@ static long change_pte_range(struct mmu_gather *tlb, if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && toptier) continue; - if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && - !toptier) + if (folio_use_access_time(folio)) folio_xchg_access_time(folio, jiffies_to_msecs(jiffies)); } @@ -303,8 +302,9 @@ pgtable_split_needed(struct vm_area_struct *vma, unsigned long cp_flags) { /* * pte markers only resides in pte level, if we need pte markers, - * we need to split. We cannot wr-protect shmem thp because file - * thp is handled differently when split by erasing the pmd so far. + * we need to split. For example, we cannot wr-protect a file thp + * (e.g. 2M shmem) because file thp is handled differently when + * split by erasing the pmd so far. */ return (cp_flags & MM_CP_UFFD_WP) && !vma_is_anonymous(vma); } @@ -364,9 +364,6 @@ static inline long change_pmd_range(struct mmu_gather *tlb, unsigned long next; long pages = 0; unsigned long nr_huge_updates = 0; - struct mmu_notifier_range range; - - range.start = 0; pmd = pmd_offset(pud, addr); do { @@ -384,14 +381,6 @@ static inline long change_pmd_range(struct mmu_gather *tlb, if (pmd_none(*pmd)) goto next; - /* invoke the mmu notifier if the pmd is populated */ - if (!range.start) { - mmu_notifier_range_init(&range, - MMU_NOTIFY_PROTECTION_VMA, 0, - vma->vm_mm, addr, end); - mmu_notifier_invalidate_range_start(&range); - } - _pmd = pmdp_get_lockless(pmd); if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd) || pmd_devmap(_pmd)) { if ((next - addr != HPAGE_PMD_SIZE) || @@ -432,9 +421,6 @@ static inline long change_pmd_range(struct mmu_gather *tlb, cond_resched(); } while (pmd++, addr = next, addr != end); - if (range.start) - mmu_notifier_invalidate_range_end(&range); - if (nr_huge_updates) count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); return pages; @@ -444,21 +430,57 @@ static inline long change_pud_range(struct mmu_gather *tlb, struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { - pud_t *pud; + struct mmu_notifier_range range; + pud_t *pudp, pud; unsigned long next; long pages = 0, ret; - pud = pud_offset(p4d, addr); + range.start = 0; + + pudp = pud_offset(p4d, addr); do { +again: next = pud_addr_end(addr, end); - ret = change_prepare(vma, pud, pmd, addr, cp_flags); - if (ret) - return ret; - if (pud_none_or_clear_bad(pud)) + ret = change_prepare(vma, pudp, pmd, addr, cp_flags); + if (ret) { + pages = ret; + break; + } + + pud = READ_ONCE(*pudp); + if (pud_none(pud)) continue; - pages += change_pmd_range(tlb, vma, pud, addr, next, newprot, + + if (!range.start) { + mmu_notifier_range_init(&range, + MMU_NOTIFY_PROTECTION_VMA, 0, + vma->vm_mm, addr, end); + mmu_notifier_invalidate_range_start(&range); + } + + if (pud_leaf(pud)) { + if ((next - addr != PUD_SIZE) || + pgtable_split_needed(vma, cp_flags)) { + __split_huge_pud(vma, pudp, addr); + goto again; + } else { + ret = change_huge_pud(tlb, vma, pudp, + addr, newprot, cp_flags); + if (ret == 0) + goto again; + /* huge pud was handled */ + if (ret == HPAGE_PUD_NR) + pages += HPAGE_PUD_NR; + continue; + } + } + + pages += change_pmd_range(tlb, vma, pudp, addr, next, newprot, cp_flags); - } while (pud++, addr = next, addr != end); + } while (pudp++, addr = next, addr != end); + + if (range.start) + mmu_notifier_invalidate_range_end(&range); return pages; } @@ -589,6 +611,9 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, unsigned long charged = 0; int error; + if (!can_modify_vma(vma)) + return -EPERM; + if (newflags == oldflags) { *pprev = vma; return 0; @@ -747,15 +772,6 @@ static int do_mprotect_pkey(unsigned long start, size_t len, } } - /* - * checking if memory is sealed. - * can_modify_mm assumes we have acquired the lock on MM. - */ - if (unlikely(!can_modify_mm(current->mm, start, end))) { - error = -EPERM; - goto out; - } - prev = vma_prev(&vmi); if (start > vma->vm_start) prev = vma; diff --git a/mm/mremap.c b/mm/mremap.c index e7ae140fc640..24712f8dbb6b 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -902,19 +902,6 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if ((mm->map_count + 2) >= sysctl_max_map_count - 3) return -ENOMEM; - /* - * In mremap_to(). - * Move a VMA to another location, check if src addr is sealed. - * - * Place can_modify_mm here because mremap_to() - * does its own checking for address range, and we only - * check the sealing after passing those checks. - * - * can_modify_mm assumes we have acquired the lock on MM. - */ - if (unlikely(!can_modify_mm(mm, addr, addr + old_len))) - return -EPERM; - if (flags & MREMAP_FIXED) { /* * In mremap_to(). @@ -1052,6 +1039,12 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, goto out; } + /* Don't allow remapping vmas when they have already been sealed */ + if (!can_modify_vma(vma)) { + ret = -EPERM; + goto out; + } + if (is_vm_hugetlb_page(vma)) { struct hstate *h __maybe_unused = hstate_vma(vma); @@ -1079,19 +1072,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, goto out; } - /* - * Below is shrink/expand case (not mremap_to()) - * Check if src address is sealed, if so, reject. - * In other words, prevent shrinking or expanding a sealed VMA. - * - * Place can_modify_mm here so we can keep the logic related to - * shrink/expand together. - */ - if (unlikely(!can_modify_mm(mm, addr, addr + old_len))) { - ret = -EPERM; - goto out; - } - /* * Always allow a shrinking remap: that just unmaps * the unnecessary pages.. diff --git a/mm/mseal.c b/mm/mseal.c index c8787cc6ba55..ece977bd21e1 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -16,28 +16,11 @@ #include #include "internal.h" -static inline bool vma_is_sealed(struct vm_area_struct *vma) -{ - return (vma->vm_flags & VM_SEALED); -} - static inline void set_vma_sealed(struct vm_area_struct *vma) { vm_flags_set(vma, VM_SEALED); } -/* - * check if a vma is sealed for modification. - * return true, if modification is allowed. - */ -static bool can_modify_vma(struct vm_area_struct *vma) -{ - if (unlikely(vma_is_sealed(vma))) - return false; - - return true; -} - static bool is_madv_discard(int behavior) { switch (behavior) { @@ -71,45 +54,15 @@ static bool is_ro_anon(struct vm_area_struct *vma) } /* - * Check if the vmas of a memory range are allowed to be modified. - * the memory ranger can have a gap (unallocated memory). - * return true, if it is allowed. + * Check if a vma is allowed to be modified by madvise. */ -bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end) +bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior) { - struct vm_area_struct *vma; - - VMA_ITERATOR(vmi, mm, start); - - /* going through each vma to check. */ - for_each_vma_range(vmi, vma, end) { - if (unlikely(!can_modify_vma(vma))) - return false; - } - - /* Allow by default. */ - return true; -} - -/* - * Check if the vmas of a memory range are allowed to be modified by madvise. - * the memory ranger can have a gap (unallocated memory). - * return true, if it is allowed. - */ -bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end, - int behavior) -{ - struct vm_area_struct *vma; - - VMA_ITERATOR(vmi, mm, start); - if (!is_madv_discard(behavior)) return true; - /* going through each vma to check. */ - for_each_vma_range(vmi, vma, end) - if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma))) - return false; + if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma))) + return false; /* Allow by default. */ return true; diff --git a/mm/nommu.c b/mm/nommu.c index 7296e775e04e..385b0c15add8 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -126,6 +126,11 @@ void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) } EXPORT_SYMBOL(__vmalloc_noprof); +void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) +{ + return krealloc_noprof(p, size, (flags | __GFP_COMP) & ~__GFP_HIGHMEM); +} + void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, @@ -1573,12 +1578,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, return ret; } -struct page *follow_page(struct vm_area_struct *vma, unsigned long address, - unsigned int foll_flags) -{ - return NULL; -} - int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { diff --git a/mm/numa.c b/mm/numa.c new file mode 100644 index 000000000000..e2eec07707d1 --- /dev/null +++ b/mm/numa.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include + +struct pglist_data *node_data[MAX_NUMNODES]; +EXPORT_SYMBOL(node_data); + +/* Allocate NODE_DATA for a node on the local memory */ +void __init alloc_node_data(int nid) +{ + const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES); + u64 nd_pa; + void *nd; + int tnid; + + /* Allocate node data. Try node-local memory and then any node. */ + nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid); + if (!nd_pa) + panic("Cannot allocate %zu bytes for node %d data\n", + nd_size, nid); + nd = __va(nd_pa); + + /* report and initialize */ + pr_info("NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid, + nd_pa, nd_pa + nd_size - 1); + tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); + if (tnid != nid) + pr_info(" NODE_DATA(%d) on node %d\n", nid, tnid); + + node_data[nid] = nd; + memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); +} + +void __init alloc_offline_node_data(int nid) +{ + pg_data_t *pgdat; + + pgdat = memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); + if (!pgdat) + panic("Cannot allocate %zuB for node %d.\n", + sizeof(*pgdat), nid); + + node_data[nid] = pgdat; +} + +/* Stub functions: */ + +#ifndef memory_add_physaddr_to_nid +int memory_add_physaddr_to_nid(u64 start) +{ + pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n", + start); + return 0; +} +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); +#endif + +#ifndef phys_to_target_node +int phys_to_target_node(u64 start) +{ + pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n", + start); + return 0; +} +EXPORT_SYMBOL_GPL(phys_to_target_node); +#endif diff --git a/arch/x86/mm/numa_emulation.c b/mm/numa_emulation.c similarity index 94% rename from arch/x86/mm/numa_emulation.c rename to mm/numa_emulation.c index 9a9305367fdd..031fb9961bf7 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/mm/numa_emulation.c @@ -6,9 +6,11 @@ #include #include #include -#include +#include +#include -#include "numa_internal.h" +#define FAKE_NODE_MIN_SIZE ((u64)32 << 20) +#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) static int emu_nid_to_phys[MAX_NUMNODES]; static char *emu_cmdline __initdata; @@ -125,7 +127,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, */ while (!nodes_empty(physnode_mask)) { for_each_node_mask(i, physnode_mask) { - u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); + u64 dma32_end = numa_emu_dma_end(); u64 start, limit, end; int phys_blk; @@ -272,7 +274,7 @@ static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei, */ while (!nodes_empty(physnode_mask)) { for_each_node_mask(i, physnode_mask) { - u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); + u64 dma32_end = numa_emu_dma_end(); u64 start, limit, end; int phys_blk; @@ -445,15 +447,11 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) /* copy the physical distance table */ if (numa_dist_cnt) { - u64 phys; - - phys = memblock_phys_alloc_range(phys_size, PAGE_SIZE, 0, - PFN_PHYS(max_pfn_mapped)); - if (!phys) { + phys_dist = memblock_alloc(phys_size, PAGE_SIZE); + if (!phys_dist) { pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); goto no_emu; } - phys_dist = __va(phys); for (i = 0; i < numa_dist_cnt; i++) for (j = 0; j < numa_dist_cnt; j++) @@ -477,19 +475,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) ei.blk[i].nid != NUMA_NO_NODE) node_set(ei.blk[i].nid, numa_nodes_parsed); - /* - * Transform __apicid_to_node table to use emulated nids by - * reverse-mapping phys_nid. The maps should always exist but fall - * back to zero just in case. - */ - for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) { - if (__apicid_to_node[i] == NUMA_NO_NODE) - continue; - for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) - if (__apicid_to_node[i] == emu_nid_to_phys[j]) - break; - __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0; - } + numa_emu_update_cpu_to_node(emu_nid_to_phys, ARRAY_SIZE(emu_nid_to_phys)); /* make sure all emulated nodes are mapped to a physical node */ for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) @@ -527,7 +513,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) } #ifndef CONFIG_DEBUG_PER_CPU_MAPS -void numa_add_cpu(int cpu) +void numa_add_cpu(unsigned int cpu) { int physnid, nid; @@ -545,7 +531,7 @@ void numa_add_cpu(int cpu) cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); } -void numa_remove_cpu(int cpu) +void numa_remove_cpu(unsigned int cpu) { int i; @@ -553,7 +539,7 @@ void numa_remove_cpu(int cpu) cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); } #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ -static void numa_set_cpumask(int cpu, bool enable) +static void numa_set_cpumask(unsigned int cpu, bool enable) { int nid, physnid; @@ -573,12 +559,12 @@ static void numa_set_cpumask(int cpu, bool enable) } } -void numa_add_cpu(int cpu) +void numa_add_cpu(unsigned int cpu) { numa_set_cpumask(cpu, true); } -void numa_remove_cpu(int cpu) +void numa_remove_cpu(unsigned int cpu) { numa_set_cpumask(cpu, false); } diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c new file mode 100644 index 000000000000..be52b93a9c58 --- /dev/null +++ b/mm/numa_memblks.c @@ -0,0 +1,571 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include +#include +#include + +static int numa_distance_cnt; +static u8 *numa_distance; + +nodemask_t numa_nodes_parsed __initdata; + +static struct numa_meminfo numa_meminfo __initdata_or_meminfo; +static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo; + +/* + * Set nodes, which have memory in @mi, in *@nodemask. + */ +static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, + const struct numa_meminfo *mi) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(mi->blk); i++) + if (mi->blk[i].start != mi->blk[i].end && + mi->blk[i].nid != NUMA_NO_NODE) + node_set(mi->blk[i].nid, *nodemask); +} + +/** + * numa_reset_distance - Reset NUMA distance table + * + * The current table is freed. The next numa_set_distance() call will + * create a new one. + */ +void __init numa_reset_distance(void) +{ + size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); + + /* numa_distance could be 1LU marking allocation failure, test cnt */ + if (numa_distance_cnt) + memblock_free(numa_distance, size); + numa_distance_cnt = 0; + numa_distance = NULL; /* enable table creation */ +} + +static int __init numa_alloc_distance(void) +{ + nodemask_t nodes_parsed; + size_t size; + int i, j, cnt = 0; + + /* size the new table and allocate it */ + nodes_parsed = numa_nodes_parsed; + numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); + + for_each_node_mask(i, nodes_parsed) + cnt = i; + cnt++; + size = cnt * cnt * sizeof(numa_distance[0]); + + numa_distance = memblock_alloc(size, PAGE_SIZE); + if (!numa_distance) { + pr_warn("Warning: can't allocate distance table!\n"); + /* don't retry until explicitly reset */ + numa_distance = (void *)1LU; + return -ENOMEM; + } + + numa_distance_cnt = cnt; + + /* fill with the default distances */ + for (i = 0; i < cnt; i++) + for (j = 0; j < cnt; j++) + numa_distance[i * cnt + j] = i == j ? + LOCAL_DISTANCE : REMOTE_DISTANCE; + printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); + + return 0; +} + +/** + * numa_set_distance - Set NUMA distance from one NUMA to another + * @from: the 'from' node to set distance + * @to: the 'to' node to set distance + * @distance: NUMA distance + * + * Set the distance from node @from to @to to @distance. If distance table + * doesn't exist, one which is large enough to accommodate all the currently + * known nodes will be created. + * + * If such table cannot be allocated, a warning is printed and further + * calls are ignored until the distance table is reset with + * numa_reset_distance(). + * + * If @from or @to is higher than the highest known node or lower than zero + * at the time of table creation or @distance doesn't make sense, the call + * is ignored. + * This is to allow simplification of specific NUMA config implementations. + */ +void __init numa_set_distance(int from, int to, int distance) +{ + if (!numa_distance && numa_alloc_distance() < 0) + return; + + if (from >= numa_distance_cnt || to >= numa_distance_cnt || + from < 0 || to < 0) { + pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + if ((u8)distance != distance || + (from == to && distance != LOCAL_DISTANCE)) { + pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + numa_distance[from * numa_distance_cnt + to] = distance; +} + +int __node_distance(int from, int to) +{ + if (from >= numa_distance_cnt || to >= numa_distance_cnt) + return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; + return numa_distance[from * numa_distance_cnt + to]; +} +EXPORT_SYMBOL(__node_distance); + +static int __init numa_add_memblk_to(int nid, u64 start, u64 end, + struct numa_meminfo *mi) +{ + /* ignore zero length blks */ + if (start == end) + return 0; + + /* whine about and ignore invalid blks */ + if (start > end || nid < 0 || nid >= MAX_NUMNODES) { + pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n", + nid, start, end - 1); + return 0; + } + + if (mi->nr_blks >= NR_NODE_MEMBLKS) { + pr_err("too many memblk ranges\n"); + return -EINVAL; + } + + mi->blk[mi->nr_blks].start = start; + mi->blk[mi->nr_blks].end = end; + mi->blk[mi->nr_blks].nid = nid; + mi->nr_blks++; + return 0; +} + +/** + * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo + * @idx: Index of memblk to remove + * @mi: numa_meminfo to remove memblk from + * + * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and + * decrementing @mi->nr_blks. + */ +void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) +{ + mi->nr_blks--; + memmove(&mi->blk[idx], &mi->blk[idx + 1], + (mi->nr_blks - idx) * sizeof(mi->blk[0])); +} + +/** + * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another + * @dst: numa_meminfo to append block to + * @idx: Index of memblk to remove + * @src: numa_meminfo to remove memblk from + */ +static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx, + struct numa_meminfo *src) +{ + dst->blk[dst->nr_blks++] = src->blk[idx]; + numa_remove_memblk_from(idx, src); +} + +/** + * numa_add_memblk - Add one numa_memblk to numa_meminfo + * @nid: NUMA node ID of the new memblk + * @start: Start address of the new memblk + * @end: End address of the new memblk + * + * Add a new memblk to the default numa_meminfo. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init numa_add_memblk(int nid, u64 start, u64 end) +{ + return numa_add_memblk_to(nid, start, end, &numa_meminfo); +} + +/** + * numa_cleanup_meminfo - Cleanup a numa_meminfo + * @mi: numa_meminfo to clean up + * + * Sanitize @mi by merging and removing unnecessary memblks. Also check for + * conflicts and clear unused memblks. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init numa_cleanup_meminfo(struct numa_meminfo *mi) +{ + const u64 low = memblock_start_of_DRAM(); + const u64 high = memblock_end_of_DRAM(); + int i, j, k; + + /* first, trim all entries */ + for (i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *bi = &mi->blk[i]; + + /* move / save reserved memory ranges */ + if (!memblock_overlaps_region(&memblock.memory, + bi->start, bi->end - bi->start)) { + numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi); + continue; + } + + /* make sure all non-reserved blocks are inside the limits */ + bi->start = max(bi->start, low); + + /* preserve info for non-RAM areas above 'max_pfn': */ + if (bi->end > high) { + numa_add_memblk_to(bi->nid, high, bi->end, + &numa_reserved_meminfo); + bi->end = high; + } + + /* and there's no empty block */ + if (bi->start >= bi->end) + numa_remove_memblk_from(i--, mi); + } + + /* merge neighboring / overlapping entries */ + for (i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *bi = &mi->blk[i]; + + for (j = i + 1; j < mi->nr_blks; j++) { + struct numa_memblk *bj = &mi->blk[j]; + u64 start, end; + + /* + * See whether there are overlapping blocks. Whine + * about but allow overlaps of the same nid. They + * will be merged below. + */ + if (bi->end > bj->start && bi->start < bj->end) { + if (bi->nid != bj->nid) { + pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n", + bi->nid, bi->start, bi->end - 1, + bj->nid, bj->start, bj->end - 1); + return -EINVAL; + } + pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n", + bi->nid, bi->start, bi->end - 1, + bj->start, bj->end - 1); + } + + /* + * Join together blocks on the same node, holes + * between which don't overlap with memory on other + * nodes. + */ + if (bi->nid != bj->nid) + continue; + start = min(bi->start, bj->start); + end = max(bi->end, bj->end); + for (k = 0; k < mi->nr_blks; k++) { + struct numa_memblk *bk = &mi->blk[k]; + + if (bi->nid == bk->nid) + continue; + if (start < bk->end && end > bk->start) + break; + } + if (k < mi->nr_blks) + continue; + pr_info("NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n", + bi->nid, bi->start, bi->end - 1, bj->start, + bj->end - 1, start, end - 1); + bi->start = start; + bi->end = end; + numa_remove_memblk_from(j--, mi); + } + } + + /* clear unused ones */ + for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { + mi->blk[i].start = mi->blk[i].end = 0; + mi->blk[i].nid = NUMA_NO_NODE; + } + + return 0; +} + +/* + * Mark all currently memblock-reserved physical memory (which covers the + * kernel's own memory ranges) as hot-unswappable. + */ +static void __init numa_clear_kernel_node_hotplug(void) +{ + nodemask_t reserved_nodemask = NODE_MASK_NONE; + struct memblock_region *mb_region; + int i; + + /* + * We have to do some preprocessing of memblock regions, to + * make them suitable for reservation. + * + * At this time, all memory regions reserved by memblock are + * used by the kernel, but those regions are not split up + * along node boundaries yet, and don't necessarily have their + * node ID set yet either. + * + * So iterate over all parsed memory blocks and use those ranges to + * set the nid in memblock.reserved. This will split up the + * memblock regions along node boundaries and will set the node IDs + * as well. + */ + for (i = 0; i < numa_meminfo.nr_blks; i++) { + struct numa_memblk *mb = numa_meminfo.blk + i; + int ret; + + ret = memblock_set_node(mb->start, mb->end - mb->start, + &memblock.reserved, mb->nid); + WARN_ON_ONCE(ret); + } + + /* + * Now go over all reserved memblock regions, to construct a + * node mask of all kernel reserved memory areas. + * + * [ Note, when booting with mem=nn[kMG] or in a kdump kernel, + * numa_meminfo might not include all memblock.reserved + * memory ranges, because quirks such as trim_snb_memory() + * reserve specific pages for Sandy Bridge graphics. ] + */ + for_each_reserved_mem_region(mb_region) { + int nid = memblock_get_region_node(mb_region); + + if (nid != MAX_NUMNODES) + node_set(nid, reserved_nodemask); + } + + /* + * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory + * belonging to the reserved node mask. + * + * Note that this will include memory regions that reside + * on nodes that contain kernel memory - entire nodes + * become hot-unpluggable: + */ + for (i = 0; i < numa_meminfo.nr_blks; i++) { + struct numa_memblk *mb = numa_meminfo.blk + i; + + if (!node_isset(mb->nid, reserved_nodemask)) + continue; + + memblock_clear_hotplug(mb->start, mb->end - mb->start); + } +} + +static int __init numa_register_meminfo(struct numa_meminfo *mi) +{ + int i; + + /* Account for nodes with cpus and no memory */ + node_possible_map = numa_nodes_parsed; + numa_nodemask_from_meminfo(&node_possible_map, mi); + if (WARN_ON(nodes_empty(node_possible_map))) + return -EINVAL; + + for (i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *mb = &mi->blk[i]; + + memblock_set_node(mb->start, mb->end - mb->start, + &memblock.memory, mb->nid); + } + + /* + * At very early time, the kernel have to use some memory such as + * loading the kernel image. We cannot prevent this anyway. So any + * node the kernel resides in should be un-hotpluggable. + * + * And when we come here, alloc node data won't fail. + */ + numa_clear_kernel_node_hotplug(); + + /* + * If sections array is gonna be used for pfn -> nid mapping, check + * whether its granularity is fine enough. + */ + if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) { + unsigned long pfn_align = node_map_pfn_alignment(); + + if (pfn_align && pfn_align < PAGES_PER_SECTION) { + unsigned long node_align_mb = PFN_PHYS(pfn_align) >> 20; + + unsigned long sect_align_mb = PFN_PHYS(PAGES_PER_SECTION) >> 20; + + pr_warn("Node alignment %luMB < min %luMB, rejecting NUMA config\n", + node_align_mb, sect_align_mb); + return -EINVAL; + } + } + + return 0; +} + +int __init numa_memblks_init(int (*init_func)(void), + bool memblock_force_top_down) +{ + phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; + int ret; + + nodes_clear(numa_nodes_parsed); + nodes_clear(node_possible_map); + nodes_clear(node_online_map); + memset(&numa_meminfo, 0, sizeof(numa_meminfo)); + WARN_ON(memblock_set_node(0, max_addr, &memblock.memory, NUMA_NO_NODE)); + WARN_ON(memblock_set_node(0, max_addr, &memblock.reserved, + NUMA_NO_NODE)); + /* In case that parsing SRAT failed. */ + WARN_ON(memblock_clear_hotplug(0, max_addr)); + numa_reset_distance(); + + ret = init_func(); + if (ret < 0) + return ret; + + /* + * We reset memblock back to the top-down direction + * here because if we configured ACPI_NUMA, we have + * parsed SRAT in init_func(). It is ok to have the + * reset here even if we did't configure ACPI_NUMA + * or acpi numa init fails and fallbacks to dummy + * numa init. + */ + if (memblock_force_top_down) + memblock_set_bottom_up(false); + + ret = numa_cleanup_meminfo(&numa_meminfo); + if (ret < 0) + return ret; + + numa_emulation(&numa_meminfo, numa_distance_cnt); + + return numa_register_meminfo(&numa_meminfo); +} + +static int __init cmp_memblk(const void *a, const void *b) +{ + const struct numa_memblk *ma = *(const struct numa_memblk **)a; + const struct numa_memblk *mb = *(const struct numa_memblk **)b; + + return (ma->start > mb->start) - (ma->start < mb->start); +} + +static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata; + +/** + * numa_fill_memblks - Fill gaps in numa_meminfo memblks + * @start: address to begin fill + * @end: address to end fill + * + * Find and extend numa_meminfo memblks to cover the physical + * address range @start-@end + * + * RETURNS: + * 0 : Success + * NUMA_NO_MEMBLK : No memblks exist in address range @start-@end + */ + +int __init numa_fill_memblks(u64 start, u64 end) +{ + struct numa_memblk **blk = &numa_memblk_list[0]; + struct numa_meminfo *mi = &numa_meminfo; + int count = 0; + u64 prev_end; + + /* + * Create a list of pointers to numa_meminfo memblks that + * overlap start, end. The list is used to make in-place + * changes that fill out the numa_meminfo memblks. + */ + for (int i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *bi = &mi->blk[i]; + + if (memblock_addrs_overlap(start, end - start, bi->start, + bi->end - bi->start)) { + blk[count] = &mi->blk[i]; + count++; + } + } + if (!count) + return NUMA_NO_MEMBLK; + + /* Sort the list of pointers in memblk->start order */ + sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL); + + /* Make sure the first/last memblks include start/end */ + blk[0]->start = min(blk[0]->start, start); + blk[count - 1]->end = max(blk[count - 1]->end, end); + + /* + * Fill any gaps by tracking the previous memblks + * end address and backfilling to it if needed. + */ + prev_end = blk[0]->end; + for (int i = 1; i < count; i++) { + struct numa_memblk *curr = blk[i]; + + if (prev_end >= curr->start) { + if (prev_end < curr->end) + prev_end = curr->end; + } else { + curr->start = prev_end; + prev_end = curr->end; + } + } + return 0; +} + +#ifdef CONFIG_NUMA_KEEP_MEMINFO +static int meminfo_to_nid(struct numa_meminfo *mi, u64 start) +{ + int i; + + for (i = 0; i < mi->nr_blks; i++) + if (mi->blk[i].start <= start && mi->blk[i].end > start) + return mi->blk[i].nid; + return NUMA_NO_NODE; +} + +int phys_to_target_node(u64 start) +{ + int nid = meminfo_to_nid(&numa_meminfo, start); + + /* + * Prefer online nodes, but if reserved memory might be + * hot-added continue the search with reserved ranges. + */ + if (nid != NUMA_NO_NODE) + return nid; + + return meminfo_to_nid(&numa_reserved_meminfo, start); +} +EXPORT_SYMBOL_GPL(phys_to_target_node); + +int memory_add_physaddr_to_nid(u64 start) +{ + int nid = meminfo_to_nid(&numa_meminfo, start); + + if (nid == NUMA_NO_NODE) + nid = numa_meminfo.blk[0].nid; + return nid; +} +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); + +#endif /* CONFIG_NUMA_KEEP_MEMINFO */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7a04cb1918fd..fcd4c1439cb9 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2612,7 +2612,7 @@ struct folio *writeback_iter(struct address_space *mapping, done: if (wbc->range_cyclic) - mapping->writeback_index = folio->index + folio_nr_pages(folio); + mapping->writeback_index = folio_next_index(folio); folio_batch_release(&wbc->fbatch); return NULL; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0aefae4a26b2..8afab64814dc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -286,9 +286,7 @@ EXPORT_SYMBOL(nr_online_nodes); #endif static bool page_contains_unaccepted(struct page *page, unsigned int order); -static void accept_page(struct page *page, unsigned int order); static bool cond_accept_memory(struct zone *zone, unsigned int order); -static inline bool has_unaccepted_memory(void); static bool __free_unaccepted(struct page *page); int page_group_by_mobility_disabled __read_mostly; @@ -322,6 +320,11 @@ static inline bool deferred_pages_enabled(void) { return false; } + +static inline bool _deferred_grow_zone(struct zone *zone, unsigned int order) +{ + return false; +} #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ /* Return a pointer to the bitmap storing bits affecting a block of pages */ @@ -958,8 +961,9 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page) break; case 2: /* the second tail page: deferred_list overlaps ->mapping */ - if (unlikely(!list_empty(&folio->_deferred_list))) { - bad_page(page, "on deferred list"); + if (unlikely(!list_empty(&folio->_deferred_list) && + folio_test_partially_mapped(folio))) { + bad_page(page, "partially mapped folio on deferred list"); goto out; } break; @@ -1087,8 +1091,11 @@ __always_inline bool free_pages_prepare(struct page *page, (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; } } - if (PageMappingFlags(page)) + if (PageMappingFlags(page)) { + if (PageAnon(page)) + mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); page->mapping = NULL; + } if (is_check_pages_enabled()) { if (free_page_is_bad(page)) bad++; @@ -1199,17 +1206,39 @@ static void free_pcppages_bulk(struct zone *zone, int count, spin_unlock_irqrestore(&zone->lock, flags); } +/* Split a multi-block free page into its individual pageblocks. */ +static void split_large_buddy(struct zone *zone, struct page *page, + unsigned long pfn, int order, fpi_t fpi) +{ + unsigned long end = pfn + (1 << order); + + VM_WARN_ON_ONCE(!IS_ALIGNED(pfn, 1 << order)); + /* Caller removed page from freelist, buddy info cleared! */ + VM_WARN_ON_ONCE(PageBuddy(page)); + + if (order > pageblock_order) + order = pageblock_order; + + while (pfn != end) { + int mt = get_pfnblock_migratetype(page, pfn); + + __free_one_page(page, pfn, zone, order, mt, fpi); + pfn += 1 << order; + page = pfn_to_page(pfn); + } +} + static void free_one_page(struct zone *zone, struct page *page, unsigned long pfn, unsigned int order, fpi_t fpi_flags) { unsigned long flags; - int migratetype; spin_lock_irqsave(&zone->lock, flags); - migratetype = get_pfnblock_migratetype(page, pfn); - __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); + split_large_buddy(zone, page, pfn, order, fpi_flags); spin_unlock_irqrestore(&zone->lock, flags); + + __count_vm_events(PGFREE, 1 << order); } static void __free_pages_ok(struct page *page, unsigned int order, @@ -1218,12 +1247,8 @@ static void __free_pages_ok(struct page *page, unsigned int order, unsigned long pfn = page_to_pfn(page); struct zone *zone = page_zone(page); - if (!free_pages_prepare(page, order)) - return; - - free_one_page(zone, page, pfn, order, fpi_flags); - - __count_vm_events(PGFREE, 1 << order); + if (free_pages_prepare(page, order)) + free_one_page(zone, page, pfn, order, fpi_flags); } void __meminit __free_pages_core(struct page *page, unsigned int order, @@ -1270,7 +1295,7 @@ void __meminit __free_pages_core(struct page *page, unsigned int order, if (order == MAX_PAGE_ORDER && __free_unaccepted(page)) return; - accept_page(page, order); + accept_memory(page_to_phys(page), PAGE_SIZE << order); } /* @@ -1346,11 +1371,11 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn, * * -- nyc */ -static inline void expand(struct zone *zone, struct page *page, - int low, int high, int migratetype) +static inline unsigned int expand(struct zone *zone, struct page *page, int low, + int high, int migratetype) { - unsigned long size = 1 << high; - unsigned long nr_added = 0; + unsigned int size = 1 << high; + unsigned int nr_added = 0; while (high > low) { high--; @@ -1370,7 +1395,19 @@ static inline void expand(struct zone *zone, struct page *page, set_buddy_order(&page[size], high); nr_added += size; } - account_freepages(zone, nr_added, migratetype); + + return nr_added; +} + +static __always_inline void page_del_and_expand(struct zone *zone, + struct page *page, int low, + int high, int migratetype) +{ + int nr_pages = 1 << high; + + __del_page_from_free_list(page, zone, high, migratetype); + nr_pages -= expand(zone, page, low, high, migratetype); + account_freepages(zone, -nr_pages, migratetype); } static void check_new_page_bad(struct page *page) @@ -1540,8 +1577,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, page = get_page_from_free_area(area, migratetype); if (!page) continue; - del_page_from_free_list(page, zone, current_order, migratetype); - expand(zone, page, order, current_order, migratetype); + + page_del_and_expand(zone, page, order, current_order, + migratetype); trace_mm_page_alloc_zone_locked(page, order, migratetype, pcp_allowed_order(order) && migratetype < MIGRATE_PCPTYPES); @@ -1700,27 +1738,6 @@ static unsigned long find_large_buddy(unsigned long start_pfn) return start_pfn; } -/* Split a multi-block free page into its individual pageblocks */ -static void split_large_buddy(struct zone *zone, struct page *page, - unsigned long pfn, int order) -{ - unsigned long end_pfn = pfn + (1 << order); - - VM_WARN_ON_ONCE(order <= pageblock_order); - VM_WARN_ON_ONCE(pfn & (pageblock_nr_pages - 1)); - - /* Caller removed page from freelist, buddy info cleared! */ - VM_WARN_ON_ONCE(PageBuddy(page)); - - while (pfn != end_pfn) { - int mt = get_pfnblock_migratetype(page, pfn); - - __free_one_page(page, pfn, zone, pageblock_order, mt, FPI_NONE); - pfn += pageblock_nr_pages; - page = pfn_to_page(pfn); - } -} - /** * move_freepages_block_isolate - move free pages in block for page isolation * @zone: the zone @@ -1761,7 +1778,7 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page, del_page_from_free_list(buddy, zone, order, get_pfnblock_migratetype(buddy, pfn)); set_pageblock_migratetype(page, migratetype); - split_large_buddy(zone, buddy, pfn, order); + split_large_buddy(zone, buddy, pfn, order, FPI_NONE); return true; } @@ -1772,7 +1789,7 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page, del_page_from_free_list(page, zone, order, get_pfnblock_migratetype(page, pfn)); set_pageblock_migratetype(page, migratetype); - split_large_buddy(zone, page, pfn, order); + split_large_buddy(zone, page, pfn, order, FPI_NONE); return true; } move: @@ -1892,9 +1909,12 @@ steal_suitable_fallback(struct zone *zone, struct page *page, /* Take ownership for orders >= pageblock_order */ if (current_order >= pageblock_order) { + unsigned int nr_added; + del_page_from_free_list(page, zone, current_order, block_type); change_pageblock_range(page, current_order, start_type); - expand(zone, page, order, current_order, start_type); + nr_added = expand(zone, page, order, current_order, start_type); + account_freepages(zone, nr_added, start_type); return page; } @@ -1947,8 +1967,7 @@ steal_suitable_fallback(struct zone *zone, struct page *page, } single_page: - del_page_from_free_list(page, zone, current_order, block_type); - expand(zone, page, order, current_order, block_type); + page_del_and_expand(zone, page, order, current_order, block_type); return page; } @@ -2764,7 +2783,7 @@ void split_page(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); split_page_owner(page, order, 0); - pgalloc_tag_split(page, 1 << order); + pgalloc_tag_split(page_folio(page), order, 0); split_page_memcg(page, order, 0); } EXPORT_SYMBOL_GPL(split_page); @@ -3033,12 +3052,6 @@ struct page *rmqueue(struct zone *preferred_zone, { struct page *page; - /* - * We most definitely don't want callers attempting to - * allocate greater than order-1 page units with __GFP_NOFAIL. - */ - WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); - if (likely(pcp_allowed_order(order))) { page = rmqueue_pcplist(preferred_zone, zone, order, migratetype, alloc_flags); @@ -3357,7 +3370,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, } if (no_fallback && nr_online_nodes > 1 && - zone != ac->preferred_zoneref->zone) { + zone != zonelist_zone(ac->preferred_zoneref)) { int local_nid; /* @@ -3365,7 +3378,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, * fragmenting fallbacks. Locality is more important * than fragmentation avoidance. */ - local_nid = zone_to_nid(ac->preferred_zoneref->zone); + local_nid = zonelist_node_idx(ac->preferred_zoneref); if (zone_to_nid(zone) != local_nid) { alloc_flags &= ~ALLOC_NOFRAGMENT; goto retry; @@ -3402,7 +3415,6 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, if (cond_accept_memory(zone, order)) goto try_this_zone; -#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* * Watermark failed for this zone, but see if we can * grow this zone if it contains deferred pages. @@ -3411,14 +3423,13 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, if (_deferred_grow_zone(zone, order)) goto try_this_zone; } -#endif /* Checked here to keep the fast path fast */ BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (alloc_flags & ALLOC_NO_WATERMARKS) goto try_this_zone; if (!node_reclaim_enabled() || - !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) + !zone_allows_reclaim(zonelist_zone(ac->preferred_zoneref), zone)) continue; ret = node_reclaim(zone->zone_pgdat, gfp_mask, order); @@ -3440,7 +3451,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, } try_this_zone: - page = rmqueue(ac->preferred_zoneref->zone, zone, order, + page = rmqueue(zonelist_zone(ac->preferred_zoneref), zone, order, gfp_mask, alloc_flags, ac->migratetype); if (page) { prep_new_page(page, order, gfp_mask, alloc_flags); @@ -3457,13 +3468,11 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, if (cond_accept_memory(zone, order)) goto try_this_zone; -#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* Try again if zone has deferred pages */ if (deferred_pages_enabled()) { if (_deferred_grow_zone(zone, order)) goto try_this_zone; } -#endif } } @@ -4100,6 +4109,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, unsigned long min_wmark = min_wmark_pages(zone); bool wmark; + if (cpusets_enabled() && + (alloc_flags & ALLOC_CPUSET) && + !__cpuset_zone_allowed(zone, gfp_mask)) + continue; + available = reclaimable = zone_reclaimable_pages(zone); available += zone_page_state_snapshot(zone, NR_FREE_PAGES); @@ -4175,6 +4189,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, { bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; bool can_compact = gfp_compaction_allowed(gfp_mask); + bool nofail = gfp_mask & __GFP_NOFAIL; const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; struct page *page = NULL; unsigned int alloc_flags; @@ -4187,6 +4202,25 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned int zonelist_iter_cookie; int reserve_flags; + if (unlikely(nofail)) { + /* + * We most definitely don't want callers attempting to + * allocate greater than order-1 page units with __GFP_NOFAIL. + */ + WARN_ON_ONCE(order > 1); + /* + * Also we don't support __GFP_NOFAIL without __GFP_DIRECT_RECLAIM, + * otherwise, we may result in lockup. + */ + WARN_ON_ONCE(!can_direct_reclaim); + /* + * PF_MEMALLOC request from this context is rather bizarre + * because we cannot reclaim anything and only can loop waiting + * for somebody to do a work for us. + */ + WARN_ON_ONCE(current->flags & PF_MEMALLOC); + } + restart: compaction_retries = 0; no_progress_loops = 0; @@ -4209,7 +4243,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, */ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->highest_zoneidx, ac->nodemask); - if (!ac->preferred_zoneref->zone) + if (!zonelist_zone(ac->preferred_zoneref)) goto nopage; /* @@ -4221,7 +4255,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct zoneref *z = first_zones_zonelist(ac->zonelist, ac->highest_zoneidx, &cpuset_current_mems_allowed); - if (!z->zone) + if (!zonelist_zone(z)) goto nopage; } @@ -4404,29 +4438,15 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * Make sure that __GFP_NOFAIL request doesn't leak out and make sure * we always retry */ - if (gfp_mask & __GFP_NOFAIL) { + if (unlikely(nofail)) { /* - * All existing users of the __GFP_NOFAIL are blockable, so warn - * of any new users that actually require GFP_NOWAIT + * Lacking direct_reclaim we can't do anything to reclaim memory, + * we disregard these unreasonable nofail requests and still + * return NULL */ - if (WARN_ON_ONCE_GFP(!can_direct_reclaim, gfp_mask)) + if (!can_direct_reclaim) goto fail; - /* - * PF_MEMALLOC request from this context is rather bizarre - * because we cannot reclaim anything and only can loop waiting - * for somebody to do a work for us - */ - WARN_ON_ONCE_GFP(current->flags & PF_MEMALLOC, gfp_mask); - - /* - * non failing costly orders are a hard requirement which we - * are not prepared for much so let's warn about these users - * so that we can identify them and convert them to something - * else. - */ - WARN_ON_ONCE_GFP(costly_order, gfp_mask); - /* * Help non-failing allocations by giving some access to memory * reserves normally used for high priority non-blocking @@ -4578,17 +4598,28 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, continue; } - if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone && - zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) { + if (nr_online_nodes > 1 && zone != zonelist_zone(ac.preferred_zoneref) && + zone_to_nid(zone) != zonelist_node_idx(ac.preferred_zoneref)) { goto failed; } + cond_accept_memory(zone, 0); +retry_this_zone: mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages; if (zone_watermark_fast(zone, 0, mark, zonelist_zone_idx(ac.preferred_zoneref), alloc_flags, gfp)) { break; } + + if (cond_accept_memory(zone, 0)) + goto retry_this_zone; + + /* Try again if zone has deferred pages */ + if (deferred_pages_enabled()) { + if (_deferred_grow_zone(zone, 0)) + goto retry_this_zone; + } } /* @@ -4638,7 +4669,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, pcp_trylock_finish(UP_flags); __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); - zone_statistics(ac.preferred_zoneref->zone, zone, nr_account); + zone_statistics(zonelist_zone(ac.preferred_zoneref), zone, nr_account); out: return nr_populated; @@ -4696,7 +4727,7 @@ struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, * Forbid the first pass from falling back to types that fragment * memory until all local zones are considered. */ - alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp); + alloc_flags |= alloc_flags_nofragment(zonelist_zone(ac.preferred_zoneref), gfp); /* First allocation attempt */ page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac); @@ -4950,7 +4981,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, struct page *last = page + nr; split_page_owner(page, order, 0); - pgalloc_tag_split(page, 1 << order); + pgalloc_tag_split(page_folio(page), order, 0); split_page_memcg(page, order, 0); while (page < --last) set_page_refcounted(last); @@ -5301,7 +5332,7 @@ int local_memory_node(int node) z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL), gfp_zone(GFP_KERNEL), NULL); - return zone_to_nid(z->zone); + return zonelist_node_idx(z); } #endif @@ -6433,6 +6464,31 @@ int __alloc_contig_migrate_range(struct compact_control *cc, return (ret < 0) ? ret : 0; } +static void split_free_pages(struct list_head *list) +{ + int order; + + for (order = 0; order < NR_PAGE_ORDERS; order++) { + struct page *page, *next; + int nr_pages = 1 << order; + + list_for_each_entry_safe(page, next, &list[order], lru) { + int i; + + post_alloc_hook(page, order, __GFP_MOVABLE); + if (!order) + continue; + + split_page(page, order); + + /* Add all subpages to the order-0 head, in sequence. */ + list_del(&page->lru); + for (i = 0; i < nr_pages; i++) + list_add_tail(&page[i].lru, &list[0]); + } + } +} + /** * alloc_contig_range() -- tries to allocate given range of pages * @start: start PFN to allocate @@ -6545,12 +6601,25 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, goto done; } - /* Free head and tail (if any) */ - if (start != outer_start) - free_contig_range(outer_start, start - outer_start); - if (end != outer_end) - free_contig_range(end, outer_end - end); + if (!(gfp_mask & __GFP_COMP)) { + split_free_pages(cc.freepages); + /* Free head and tail (if any) */ + if (start != outer_start) + free_contig_range(outer_start, start - outer_start); + if (end != outer_end) + free_contig_range(end, outer_end - end); + } else if (start == outer_start && end == outer_end && is_power_of_2(end - start)) { + struct page *head = pfn_to_page(start); + int order = ilog2(end - start); + + check_new_pages(head, order); + prep_new_page(head, order, gfp_mask, 0); + } else { + ret = -EINVAL; + WARN(true, "PFN range: requested [%lu, %lu), allocated [%lu, %lu)\n", + start, end, outer_start, outer_end); + } done: undo_isolate_page_range(start, end, migratetype); return ret; @@ -6659,6 +6728,18 @@ struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, void free_contig_range(unsigned long pfn, unsigned long nr_pages) { unsigned long count = 0; + struct folio *folio = pfn_folio(pfn); + + if (folio_test_large(folio)) { + int expected = folio_nr_pages(folio); + + if (nr_pages == expected) + folio_put(folio); + else + WARN(true, "PFN %lu: nr_pages %lu != expected %d\n", + pfn, nr_pages, expected); + return; + } for (; nr_pages--; pfn++) { struct page *page = pfn_to_page(pfn); @@ -6927,23 +7008,50 @@ early_param("accept_memory", accept_memory_parse); static bool page_contains_unaccepted(struct page *page, unsigned int order) { phys_addr_t start = page_to_phys(page); - phys_addr_t end = start + (PAGE_SIZE << order); - return range_contains_unaccepted_memory(start, end); + return range_contains_unaccepted_memory(start, PAGE_SIZE << order); } -static void accept_page(struct page *page, unsigned int order) +static void __accept_page(struct zone *zone, unsigned long *flags, + struct page *page) { - phys_addr_t start = page_to_phys(page); + bool last; - accept_memory(start, start + (PAGE_SIZE << order)); + list_del(&page->lru); + last = list_empty(&zone->unaccepted_pages); + + account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); + __ClearPageUnaccepted(page); + spin_unlock_irqrestore(&zone->lock, *flags); + + accept_memory(page_to_phys(page), PAGE_SIZE << MAX_PAGE_ORDER); + + __free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL); + + if (last) + static_branch_dec(&zones_with_unaccepted_pages); +} + +void accept_page(struct page *page) +{ + struct zone *zone = page_zone(page); + unsigned long flags; + + spin_lock_irqsave(&zone->lock, flags); + if (!PageUnaccepted(page)) { + spin_unlock_irqrestore(&zone->lock, flags); + return; + } + + /* Unlocks zone->lock */ + __accept_page(zone, &flags, page); } static bool try_to_accept_memory_one(struct zone *zone) { unsigned long flags; struct page *page; - bool last; spin_lock_irqsave(&zone->lock, flags); page = list_first_entry_or_null(&zone->unaccepted_pages, @@ -6953,23 +7061,17 @@ static bool try_to_accept_memory_one(struct zone *zone) return false; } - list_del(&page->lru); - last = list_empty(&zone->unaccepted_pages); - - account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); - __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); - spin_unlock_irqrestore(&zone->lock, flags); - - accept_page(page, MAX_PAGE_ORDER); - - __free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL); - - if (last) - static_branch_dec(&zones_with_unaccepted_pages); + /* Unlocks zone->lock */ + __accept_page(zone, &flags, page); return true; } +static inline bool has_unaccepted_memory(void) +{ + return static_branch_unlikely(&zones_with_unaccepted_pages); +} + static bool cond_accept_memory(struct zone *zone, unsigned int order) { long to_accept; @@ -6981,8 +7083,8 @@ static bool cond_accept_memory(struct zone *zone, unsigned int order) if (list_empty(&zone->unaccepted_pages)) return false; - /* How much to accept to get to high watermark? */ - to_accept = high_wmark_pages(zone) - + /* How much to accept to get to promo watermark? */ + to_accept = promo_wmark_pages(zone) - (zone_page_state(zone, NR_FREE_PAGES) - __zone_watermark_unusable_free(zone, order, 0) - zone_page_state(zone, NR_UNACCEPTED)); @@ -6997,11 +7099,6 @@ static bool cond_accept_memory(struct zone *zone, unsigned int order) return ret; } -static inline bool has_unaccepted_memory(void) -{ - return static_branch_unlikely(&zones_with_unaccepted_pages); -} - static bool __free_unaccepted(struct page *page) { struct zone *zone = page_zone(page); @@ -7016,6 +7113,7 @@ static bool __free_unaccepted(struct page *page) list_add_tail(&page->lru, &zone->unaccepted_pages); account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES); + __SetPageUnaccepted(page); spin_unlock_irqrestore(&zone->lock, flags); if (first) @@ -7031,20 +7129,11 @@ static bool page_contains_unaccepted(struct page *page, unsigned int order) return false; } -static void accept_page(struct page *page, unsigned int order) -{ -} - static bool cond_accept_memory(struct zone *zone, unsigned int order) { return false; } -static inline bool has_unaccepted_memory(void) -{ - return false; -} - static bool __free_unaccepted(struct page *page) { BUILD_BUG(); diff --git a/mm/page_counter.c b/mm/page_counter.c index 0153f5bb3161..b249d15af9dd 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -13,6 +13,11 @@ #include #include +static bool track_protection(struct page_counter *c) +{ + return c->protection_support; +} + static void propagate_protected_usage(struct page_counter *c, unsigned long usage) { @@ -57,7 +62,8 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) new = 0; atomic_long_set(&counter->usage, new); } - propagate_protected_usage(counter, new); + if (track_protection(counter)) + propagate_protected_usage(counter, new); } /** @@ -70,18 +76,33 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; + bool protection = track_protection(counter); for (c = counter; c; c = c->parent) { long new; new = atomic_long_add_return(nr_pages, &c->usage); - propagate_protected_usage(c, new); + if (protection) + propagate_protected_usage(c, new); /* * This is indeed racy, but we can live with some * inaccuracy in the watermark. + * + * Notably, we have two watermarks to allow for both a globally + * visible peak and one that can be reset at a smaller scope. + * + * Since we reset both watermarks when the global reset occurs, + * we can guarantee that watermark >= local_watermark, so we + * don't need to do both comparisons every time. + * + * On systems with branch predictors, the inner condition should + * be almost free. */ - if (new > READ_ONCE(c->watermark)) - WRITE_ONCE(c->watermark, new); + if (new > READ_ONCE(c->local_watermark)) { + WRITE_ONCE(c->local_watermark, new); + if (new > READ_ONCE(c->watermark)) + WRITE_ONCE(c->watermark, new); + } } } @@ -99,6 +120,7 @@ bool page_counter_try_charge(struct page_counter *counter, struct page_counter **fail) { struct page_counter *c; + bool protection = track_protection(counter); for (c = counter; c; c = c->parent) { long new; @@ -128,13 +150,15 @@ bool page_counter_try_charge(struct page_counter *counter, *fail = c; goto failed; } - propagate_protected_usage(c, new); - /* - * Just like with failcnt, we can live with some - * inaccuracy in the watermark. - */ - if (new > READ_ONCE(c->watermark)) - WRITE_ONCE(c->watermark, new); + if (protection) + propagate_protected_usage(c, new); + + /* see comment on page_counter_charge */ + if (new > READ_ONCE(c->local_watermark)) { + WRITE_ONCE(c->local_watermark, new); + if (new > READ_ONCE(c->watermark)) + WRITE_ONCE(c->watermark, new); + } } return true; @@ -264,6 +288,7 @@ int page_counter_memparse(const char *buf, const char *max, } +#ifdef CONFIG_MEMCG /* * This function calculates an individual page counter's effective * protection which is derived from its own memory.min/low, its @@ -435,3 +460,4 @@ void page_counter_calculate_protection(struct page_counter *root, atomic_long_read(&parent->children_low_usage), recursive_protection)); } +#endif /* CONFIG_MEMCG */ diff --git a/mm/page_io.c b/mm/page_io.c index ff8c99ee3af7..78bc88acee79 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -172,6 +172,60 @@ int generic_swapfile_activate(struct swap_info_struct *sis, goto out; } +static bool is_folio_zero_filled(struct folio *folio) +{ + unsigned int pos, last_pos; + unsigned long *data; + unsigned int i; + + last_pos = PAGE_SIZE / sizeof(*data) - 1; + for (i = 0; i < folio_nr_pages(folio); i++) { + data = kmap_local_folio(folio, i * PAGE_SIZE); + /* + * Check last word first, incase the page is zero-filled at + * the start and has non-zero data at the end, which is common + * in real-world workloads. + */ + if (data[last_pos]) { + kunmap_local(data); + return false; + } + for (pos = 0; pos < last_pos; pos++) { + if (data[pos]) { + kunmap_local(data); + return false; + } + } + kunmap_local(data); + } + + return true; +} + +static void swap_zeromap_folio_set(struct folio *folio) +{ + struct swap_info_struct *sis = swp_swap_info(folio->swap); + swp_entry_t entry; + unsigned int i; + + for (i = 0; i < folio_nr_pages(folio); i++) { + entry = page_swap_entry(folio_page(folio, i)); + set_bit(swp_offset(entry), sis->zeromap); + } +} + +static void swap_zeromap_folio_clear(struct folio *folio) +{ + struct swap_info_struct *sis = swp_swap_info(folio->swap); + swp_entry_t entry; + unsigned int i; + + for (i = 0; i < folio_nr_pages(folio); i++) { + entry = page_swap_entry(folio_page(folio, i)); + clear_bit(swp_offset(entry), sis->zeromap); + } +} + /* * We may have stale swap cache pages in memory: notice * them here and get rid of the unnecessary final write. @@ -195,6 +249,25 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) folio_unlock(folio); return ret; } + + /* + * Use a bitmap (zeromap) to avoid doing IO for zero-filled pages. + * The bits in zeromap are protected by the locked swapcache folio + * and atomic updates are used to protect against read-modify-write + * corruption due to other zero swap entries seeing concurrent updates. + */ + if (is_folio_zero_filled(folio)) { + swap_zeromap_folio_set(folio); + folio_unlock(folio); + return 0; + } else { + /* + * Clear bits this folio occupies in the zeromap to prevent + * zero data being read in from any previous zero writes that + * occupied the same swap entries. + */ + swap_zeromap_folio_clear(folio); + } if (zswap_store(folio)) { folio_unlock(folio); return 0; @@ -273,9 +346,7 @@ static void sio_write_complete(struct kiocb *iocb, long ret) * memory for allocating transmit buffers. * Mark the page dirty and avoid * folio_rotate_reclaimable but rate-limit the - * messages but do not flag PageError like - * the normal direct-to-bio case as it could - * be temporary. + * messages. */ pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n", ret, swap_dev_pos(page_swap_entry(page))); @@ -429,6 +500,28 @@ static void sio_read_complete(struct kiocb *iocb, long ret) mempool_free(sio, sio_pool); } +static bool swap_read_folio_zeromap(struct folio *folio) +{ + int nr_pages = folio_nr_pages(folio); + bool is_zeromap; + + /* + * Swapping in a large folio that is partially in the zeromap is not + * currently handled. Return true without marking the folio uptodate so + * that an IO error is emitted (e.g. do_swap_page() will sigbus). + */ + if (WARN_ON_ONCE(swap_zeromap_batch(folio->swap, nr_pages, + &is_zeromap) != nr_pages)) + return true; + + if (!is_zeromap) + return false; + + folio_zero_range(folio, 0, folio_size(folio)); + folio_mark_uptodate(folio); + return true; +} + static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug) { struct swap_info_struct *sis = swp_swap_info(folio->swap); @@ -519,9 +612,18 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug) } delayacct_swapin_start(); - if (zswap_load(folio)) { + if (swap_read_folio_zeromap(folio)) { folio_unlock(folio); - } else if (data_race(sis->flags & SWP_FS_OPS)) { + goto finish; + } else if (zswap_load(folio)) { + folio_unlock(folio); + goto finish; + } + + /* We have to read from slower devices. Increase zswap protection. */ + zswap_folio_swapin(folio); + + if (data_race(sis->flags & SWP_FS_OPS)) { swap_read_folio_fs(folio, plug); } else if (synchronous) { swap_read_folio_bdev_sync(folio, sis); @@ -529,6 +631,7 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug) swap_read_folio_bdev_async(folio, sis); } +finish: if (workingset) { delayacct_thrashing_end(&in_thrashing); psi_memstall_leave(&pflags); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 042937d5abe4..7e04047977cf 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -152,6 +152,9 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ unsigned long flags; unsigned long check_unmovable_start, check_unmovable_end; + if (PageUnaccepted(page)) + accept_page(page); + spin_lock_irqsave(&zone->lock, flags); /* @@ -367,6 +370,11 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, VM_BUG_ON(!page); pfn = page_to_pfn(page); + if (PageUnaccepted(page)) { + pfn += MAX_ORDER_NR_PAGES; + continue; + } + if (PageBuddy(page)) { int order = buddy_order(page); @@ -395,30 +403,8 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, unsigned long head_pfn = page_to_pfn(head); unsigned long nr_pages = compound_nr(head); - if (head_pfn + nr_pages <= boundary_pfn) { - pfn = head_pfn + nr_pages; - continue; - } - -#if defined CONFIG_COMPACTION || defined CONFIG_CMA - if (PageHuge(page)) { - int page_mt = get_pageblock_migratetype(page); - struct compact_control cc = { - .nr_migratepages = 0, - .order = -1, - .zone = page_zone(pfn_to_page(head_pfn)), - .mode = MIGRATE_SYNC, - .ignore_skip_hint = true, - .no_set_skip_hint = true, - .gfp_mask = gfp_flags, - .alloc_contig = true, - }; - INIT_LIST_HEAD(&cc.migratepages); - - ret = __alloc_contig_migrate_range(&cc, head_pfn, - head_pfn + nr_pages, page_mt); - if (ret) - goto failed; + if (head_pfn + nr_pages <= boundary_pfn || + PageHuge(page)) { pfn = head_pfn + nr_pages; continue; } @@ -432,7 +418,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, */ VM_WARN_ON_ONCE_PAGE(PageLRU(page), page); VM_WARN_ON_ONCE_PAGE(__PageMovable(page), page); -#endif + goto failed; } diff --git a/mm/pagewalk.c b/mm/pagewalk.c index ae2f08ce991b..461ea3bbd8d9 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -3,6 +3,8 @@ #include #include #include +#include +#include /* * We want to know the real level where a entry is located ignoring any @@ -654,3 +656,203 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, return err; } + +/** + * folio_walk_start - walk the page tables to a folio + * @fw: filled with information on success. + * @vma: the VMA. + * @addr: the virtual address to use for the page table walk. + * @flags: flags modifying which folios to walk to. + * + * Walk the page tables using @addr in a given @vma to a mapped folio and + * return the folio, making sure that the page table entry referenced by + * @addr cannot change until folio_walk_end() was called. + * + * As default, this function returns only folios that are not special (e.g., not + * the zeropage) and never returns folios that are supposed to be ignored by the + * VM as documented by vm_normal_page(). If requested, zeropages will be + * returned as well. + * + * As default, this function only considers present page table entries. + * If requested, it will also consider migration entries. + * + * If this function returns NULL it might either indicate "there is nothing" or + * "there is nothing suitable". + * + * On success, @fw is filled and the function returns the folio while the PTL + * is still held and folio_walk_end() must be called to clean up, + * releasing any held locks. The returned folio must *not* be used after the + * call to folio_walk_end(), unless a short-term folio reference is taken before + * that call. + * + * @fw->page will correspond to the page that is effectively referenced by + * @addr. However, for migration entries and shared zeropages @fw->page is + * set to NULL. Note that large folios might be mapped by multiple page table + * entries, and this function will always only lookup a single entry as + * specified by @addr, which might or might not cover more than a single page of + * the returned folio. + * + * This function must *not* be used as a naive replacement for + * get_user_pages() / pin_user_pages(), especially not to perform DMA or + * to carelessly modify page content. This function may *only* be used to grab + * short-term folio references, never to grab long-term folio references. + * + * Using the page table entry pointers in @fw for reading or modifying the + * entry should be avoided where possible: however, there might be valid + * use cases. + * + * WARNING: Modifying page table entries in hugetlb VMAs requires a lot of care. + * For example, PMD page table sharing might require prior unsharing. Also, + * logical hugetlb entries might span multiple physical page table entries, + * which *must* be modified in a single operation (set_huge_pte_at(), + * huge_ptep_set_*, ...). Note that the page table entry stored in @fw might + * not correspond to the first physical entry of a logical hugetlb entry. + * + * The mmap lock must be held in read mode. + * + * Return: folio pointer on success, otherwise NULL. + */ +struct folio *folio_walk_start(struct folio_walk *fw, + struct vm_area_struct *vma, unsigned long addr, + folio_walk_flags_t flags) +{ + unsigned long entry_size; + bool expose_page = true; + struct page *page; + pud_t *pudp, pud; + pmd_t *pmdp, pmd; + pte_t *ptep, pte; + spinlock_t *ptl; + pgd_t *pgdp; + p4d_t *p4dp; + + mmap_assert_locked(vma->vm_mm); + vma_pgtable_walk_begin(vma); + + if (WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end)) + goto not_found; + + pgdp = pgd_offset(vma->vm_mm, addr); + if (pgd_none_or_clear_bad(pgdp)) + goto not_found; + + p4dp = p4d_offset(pgdp, addr); + if (p4d_none_or_clear_bad(p4dp)) + goto not_found; + + pudp = pud_offset(p4dp, addr); + pud = pudp_get(pudp); + if (pud_none(pud)) + goto not_found; + if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pud_leaf(pud)) { + ptl = pud_lock(vma->vm_mm, pudp); + pud = pudp_get(pudp); + + entry_size = PUD_SIZE; + fw->level = FW_LEVEL_PUD; + fw->pudp = pudp; + fw->pud = pud; + + if (!pud_present(pud) || pud_devmap(pud) || pud_special(pud)) { + spin_unlock(ptl); + goto not_found; + } else if (!pud_leaf(pud)) { + spin_unlock(ptl); + goto pmd_table; + } + /* + * TODO: vm_normal_page_pud() will be handy once we want to + * support PUD mappings in VM_PFNMAP|VM_MIXEDMAP VMAs. + */ + page = pud_page(pud); + goto found; + } + +pmd_table: + VM_WARN_ON_ONCE(pud_leaf(*pudp)); + pmdp = pmd_offset(pudp, addr); + pmd = pmdp_get_lockless(pmdp); + if (pmd_none(pmd)) + goto not_found; + if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pmd_leaf(pmd)) { + ptl = pmd_lock(vma->vm_mm, pmdp); + pmd = pmdp_get(pmdp); + + entry_size = PMD_SIZE; + fw->level = FW_LEVEL_PMD; + fw->pmdp = pmdp; + fw->pmd = pmd; + + if (pmd_none(pmd)) { + spin_unlock(ptl); + goto not_found; + } else if (!pmd_leaf(pmd)) { + spin_unlock(ptl); + goto pte_table; + } else if (pmd_present(pmd)) { + page = vm_normal_page_pmd(vma, addr, pmd); + if (page) { + goto found; + } else if ((flags & FW_ZEROPAGE) && + is_huge_zero_pmd(pmd)) { + page = pfn_to_page(pmd_pfn(pmd)); + expose_page = false; + goto found; + } + } else if ((flags & FW_MIGRATION) && + is_pmd_migration_entry(pmd)) { + swp_entry_t entry = pmd_to_swp_entry(pmd); + + page = pfn_swap_entry_to_page(entry); + expose_page = false; + goto found; + } + spin_unlock(ptl); + goto not_found; + } + +pte_table: + VM_WARN_ON_ONCE(pmd_leaf(pmdp_get_lockless(pmdp))); + ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl); + if (!ptep) + goto not_found; + pte = ptep_get(ptep); + + entry_size = PAGE_SIZE; + fw->level = FW_LEVEL_PTE; + fw->ptep = ptep; + fw->pte = pte; + + if (pte_present(pte)) { + page = vm_normal_page(vma, addr, pte); + if (page) + goto found; + if ((flags & FW_ZEROPAGE) && + is_zero_pfn(pte_pfn(pte))) { + page = pfn_to_page(pte_pfn(pte)); + expose_page = false; + goto found; + } + } else if (!pte_none(pte)) { + swp_entry_t entry = pte_to_swp_entry(pte); + + if ((flags & FW_MIGRATION) && + is_migration_entry(entry)) { + page = pfn_swap_entry_to_page(entry); + expose_page = false; + goto found; + } + } + pte_unmap_unlock(ptep, ptl); +not_found: + vma_pgtable_walk_end(vma); + return NULL; +found: + if (expose_page) + /* Note: Offset from the mapped page, not the folio start. */ + fw->page = nth_page(page, (addr & (entry_size - 1)) >> PAGE_SHIFT); + else + fw->page = NULL; + fw->ptl = ptl; + return page_folio(page); +} diff --git a/mm/percpu.c b/mm/percpu.c index 20d91af8c033..da21680ff294 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2216,37 +2216,6 @@ static void pcpu_balance_workfn(struct work_struct *work) mutex_unlock(&pcpu_alloc_mutex); } -/** - * pcpu_alloc_size - the size of the dynamic percpu area - * @ptr: pointer to the dynamic percpu area - * - * Returns the size of the @ptr allocation. This is undefined for statically - * defined percpu variables as there is no corresponding chunk->bound_map. - * - * RETURNS: - * The size of the dynamic percpu area. - * - * CONTEXT: - * Can be called from atomic context. - */ -size_t pcpu_alloc_size(void __percpu *ptr) -{ - struct pcpu_chunk *chunk; - unsigned long bit_off, end; - void *addr; - - if (!ptr) - return 0; - - addr = __pcpu_ptr_to_addr(ptr); - /* No pcpu_lock here: ptr has not been freed, so chunk is still alive */ - chunk = pcpu_chunk_addr_search(addr); - bit_off = (addr - chunk->base_addr) / PCPU_MIN_ALLOC_SIZE; - end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk), - bit_off + 1); - return (end - bit_off) * PCPU_MIN_ALLOC_SIZE; -} - /** * free_percpu - free percpu area * @ptr: pointer to area to free diff --git a/mm/rmap.c b/mm/rmap.c index 2490e727e2dc..a8797d1b3d49 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -75,6 +75,7 @@ #include #include #include +#include #include @@ -870,6 +871,20 @@ static bool folio_referenced_one(struct folio *folio, continue; } + /* + * Skip the non-shared swapbacked folio mapped solely by + * the exiting or OOM-reaped process. This avoids redundant + * swap-out followed by an immediate unmap. + */ + if ((!atomic_read(&vma->vm_mm->mm_users) || + check_stable_address_space(vma->vm_mm)) && + folio_test_anon(folio) && folio_test_swapbacked(folio) && + !folio_likely_mapped_shared(folio)) { + pra->referenced = -1; + page_vma_mapped_walk_done(&pvmw); + return false; + } + if (pvmw.pte) { if (lru_gen_enabled() && pte_young(ptep_get(pvmw.pte))) { @@ -1143,25 +1158,25 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio, { atomic_t *mapped = &folio->_nr_pages_mapped; const int orig_nr_pages = nr_pages; - int first, nr = 0; + int first = 0, nr = 0; __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { case RMAP_LEVEL_PTE: if (!folio_test_large(folio)) { - nr = atomic_inc_and_test(&page->_mapcount); + nr = atomic_inc_and_test(&folio->_mapcount); break; } do { - first = atomic_inc_and_test(&page->_mapcount); - if (first) { - first = atomic_inc_return_relaxed(mapped); - if (first < ENTIRELY_MAPPED) - nr++; - } + first += atomic_inc_and_test(&page->_mapcount); } while (page++, --nr_pages > 0); + + if (first && + atomic_add_return_relaxed(first, mapped) < ENTIRELY_MAPPED) + nr = first; + atomic_add(orig_nr_pages, &folio->_large_mapcount); break; case RMAP_LEVEL_PMD: @@ -1452,6 +1467,7 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, } __folio_mod_stat(folio, nr, nr_pmdmapped); + mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1); } static __always_inline void __folio_add_file_rmap(struct folio *folio, @@ -1512,7 +1528,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, enum rmap_level level) { atomic_t *mapped = &folio->_nr_pages_mapped; - int last, nr = 0, nr_pmdmapped = 0; + int last = 0, nr = 0, nr_pmdmapped = 0; bool partially_mapped = false; __folio_rmap_sanity_checks(folio, page, nr_pages, level); @@ -1520,20 +1536,19 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, switch (level) { case RMAP_LEVEL_PTE: if (!folio_test_large(folio)) { - nr = atomic_add_negative(-1, &page->_mapcount); + nr = atomic_add_negative(-1, &folio->_mapcount); break; } atomic_sub(nr_pages, &folio->_large_mapcount); do { - last = atomic_add_negative(-1, &page->_mapcount); - if (last) { - last = atomic_dec_return_relaxed(mapped); - if (last < ENTIRELY_MAPPED) - nr++; - } + last += atomic_add_negative(-1, &page->_mapcount); } while (page++, --nr_pages > 0); + if (last && + atomic_sub_return_relaxed(last, mapped) < ENTIRELY_MAPPED) + nr = last; + partially_mapped = nr && atomic_read(mapped); break; case RMAP_LEVEL_PMD: @@ -1553,22 +1568,20 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, } } - partially_mapped = nr < nr_pmdmapped; + partially_mapped = nr && nr < nr_pmdmapped; break; } - if (nr) { - /* - * Queue anon large folio for deferred split if at least one - * page of the folio is unmapped and at least one page - * is still mapped. - * - * Check partially_mapped first to ensure it is a large folio. - */ - if (folio_test_anon(folio) && partially_mapped && - list_empty(&folio->_deferred_list)) - deferred_split_folio(folio); - } + /* + * Queue anon large folio for deferred split if at least one page of + * the folio is unmapped and at least one page is still mapped. + * + * Check partially_mapped first to ensure it is a large folio. + */ + if (partially_mapped && folio_test_anon(folio) && + !folio_test_partially_mapped(folio)) + deferred_split_folio(folio, true); + __folio_mod_stat(folio, -nr, -nr_pmdmapped); /* diff --git a/mm/shmem.c b/mm/shmem.c index b875852df51f..6eff8771d9cb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -155,7 +155,7 @@ static unsigned long shmem_default_max_inodes(void) static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, gfp_t gfp, - struct mm_struct *fault_mm, vm_fault_t *fault_type); + struct vm_area_struct *vma, vm_fault_t *fault_type); static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) { @@ -502,8 +502,8 @@ static int shmem_replace_entry(struct address_space *mapping, * Sometimes, before we decide whether to proceed or to fail, we must check * that an entry was not already brought back from swap by a racing thread. * - * Checking page is not enough: by the time a SwapCache page is locked, it - * might be reused, and again be SwapCache, using the same swap as before. + * Checking folio is not enough: by the time a swapcache folio is locked, it + * might be reused, and again be swapcache, using the same swap as before. */ static bool shmem_confirm_swap(struct address_space *mapping, pgoff_t index, swp_entry_t swap) @@ -548,10 +548,12 @@ static bool shmem_confirm_swap(struct address_space *mapping, static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; -static bool __shmem_is_huge(struct inode *inode, pgoff_t index, - bool shmem_huge_force, struct mm_struct *mm, - unsigned long vm_flags) +static bool __shmem_huge_global_enabled(struct inode *inode, pgoff_t index, + loff_t write_end, bool shmem_huge_force, + struct vm_area_struct *vma, + unsigned long vm_flags) { + struct mm_struct *mm = vma ? vma->vm_mm : NULL; loff_t i_size; if (!S_ISREG(inode->i_mode)) @@ -568,7 +570,8 @@ static bool __shmem_is_huge(struct inode *inode, pgoff_t index, return true; case SHMEM_HUGE_WITHIN_SIZE: index = round_up(index + 1, HPAGE_PMD_NR); - i_size = round_up(i_size_read(inode), PAGE_SIZE); + i_size = max(write_end, i_size_read(inode)); + i_size = round_up(i_size, PAGE_SIZE); if (i_size >> PAGE_SHIFT >= index) return true; fallthrough; @@ -581,14 +584,15 @@ static bool __shmem_is_huge(struct inode *inode, pgoff_t index, } } -bool shmem_is_huge(struct inode *inode, pgoff_t index, - bool shmem_huge_force, struct mm_struct *mm, - unsigned long vm_flags) +static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, + loff_t write_end, bool shmem_huge_force, + struct vm_area_struct *vma, unsigned long vm_flags) { if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER) return false; - return __shmem_is_huge(inode, index, shmem_huge_force, mm, vm_flags); + return __shmem_huge_global_enabled(inode, index, write_end, + shmem_huge_force, vma, vm_flags); } #if defined(CONFIG_SYSFS) @@ -634,15 +638,14 @@ static const char *shmem_format_huge(int huge) #endif static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, - struct shrink_control *sc, unsigned long nr_to_split) + struct shrink_control *sc, unsigned long nr_to_free) { LIST_HEAD(list), *pos, *next; - LIST_HEAD(to_remove); struct inode *inode; struct shmem_inode_info *info; struct folio *folio; unsigned long batch = sc ? sc->nr_to_scan : 128; - int split = 0; + unsigned long split = 0, freed = 0; if (list_empty(&sbinfo->shrinklist)) return SHRINK_STOP; @@ -660,13 +663,6 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, goto next; } - /* Check if there's anything to gain */ - if (round_up(inode->i_size, PAGE_SIZE) == - round_up(inode->i_size, HPAGE_PMD_SIZE)) { - list_move(&info->shrinklist, &to_remove); - goto next; - } - list_move(&info->shrinklist, &list); next: sbinfo->shrinklist_len--; @@ -675,34 +671,36 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, } spin_unlock(&sbinfo->shrinklist_lock); - list_for_each_safe(pos, next, &to_remove) { - info = list_entry(pos, struct shmem_inode_info, shrinklist); - inode = &info->vfs_inode; - list_del_init(&info->shrinklist); - iput(inode); - } - list_for_each_safe(pos, next, &list) { + pgoff_t next, end; + loff_t i_size; int ret; - pgoff_t index; info = list_entry(pos, struct shmem_inode_info, shrinklist); inode = &info->vfs_inode; - if (nr_to_split && split >= nr_to_split) + if (nr_to_free && freed >= nr_to_free) goto move_back; - index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT; - folio = filemap_get_folio(inode->i_mapping, index); - if (IS_ERR(folio)) + i_size = i_size_read(inode); + folio = filemap_get_entry(inode->i_mapping, i_size / PAGE_SIZE); + if (!folio || xa_is_value(folio)) goto drop; - /* No huge page at the end of the file: nothing to split */ + /* No large folio at the end of the file: nothing to split */ if (!folio_test_large(folio)) { folio_put(folio); goto drop; } + /* Check if there is anything to gain from splitting */ + next = folio_next_index(folio); + end = shmem_fallocend(inode, DIV_ROUND_UP(i_size, PAGE_SIZE)); + if (end <= folio->index || end >= next) { + folio_put(folio); + goto drop; + } + /* * Move the inode on the list back to shrinklist if we failed * to lock the page at this time. @@ -723,6 +721,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, if (ret) goto move_back; + freed += next - end; split++; drop: list_del_init(&info->shrinklist); @@ -767,10 +766,17 @@ static long shmem_unused_huge_count(struct super_block *sb, #define shmem_huge SHMEM_HUGE_DENY static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, - struct shrink_control *sc, unsigned long nr_to_split) + struct shrink_control *sc, unsigned long nr_to_free) { return 0; } + +static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, + loff_t write_end, bool shmem_huge_force, + struct vm_area_struct *vma, unsigned long vm_flags) +{ + return false; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* @@ -786,7 +792,6 @@ static int shmem_add_to_page_cache(struct folio *folio, VM_BUG_ON_FOLIO(index != round_down(index, nr), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); - VM_BUG_ON(expected && folio_test_large(folio)); folio_ref_add(folio, nr); folio->mapping = mapping; @@ -842,23 +847,27 @@ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); xa_unlock_irq(&mapping->i_pages); - folio_put(folio); + folio_put_refs(folio, nr); BUG_ON(error); } /* - * Remove swap entry from page cache, free the swap and its page cache. + * Remove swap entry from page cache, free the swap and its page cache. Returns + * the number of pages being freed. 0 means entry not found in XArray (0 pages + * being freed). */ -static int shmem_free_swap(struct address_space *mapping, - pgoff_t index, void *radswap) +static long shmem_free_swap(struct address_space *mapping, + pgoff_t index, void *radswap) { + int order = xa_get_order(&mapping->i_pages, index); void *old; old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0); if (old != radswap) - return -ENOENT; - free_swap_and_cache(radix_to_swp_entry(radswap)); - return 0; + return 0; + free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << order); + + return 1 << order; } /* @@ -881,7 +890,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, if (xas_retry(&xas, page)) continue; if (xa_is_value(page)) - swapped++; + swapped += 1 << xas_get_order(&xas); if (xas.xa_index == max) break; if (need_resched()) { @@ -971,7 +980,7 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) * (although in some cases this is just a waste of time). */ folio = NULL; - shmem_get_folio(inode, index, &folio, SGP_READ); + shmem_get_folio(inode, index, 0, &folio, SGP_READ); return folio; } @@ -1010,7 +1019,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (xa_is_value(folio)) { if (unfalloc) continue; - nr_swaps_freed += !shmem_free_swap(mapping, + nr_swaps_freed += shmem_free_swap(mapping, indices[i], folio); continue; } @@ -1077,14 +1086,17 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, folio = fbatch.folios[i]; if (xa_is_value(folio)) { + long swaps_freed; + if (unfalloc) continue; - if (shmem_free_swap(mapping, indices[i], folio)) { + swaps_freed = shmem_free_swap(mapping, indices[i], folio); + if (!swaps_freed) { /* Swap was replaced by page: retry */ index = indices[i]; break; } - nr_swaps_freed++; + nr_swaps_freed += swaps_freed; continue; } @@ -1156,7 +1168,7 @@ static int shmem_getattr(struct mnt_idmap *idmap, STATX_ATTR_NODUMP); generic_fillattr(idmap, request_mask, inode, stat); - if (shmem_is_huge(inode, 0, false, NULL, 0)) + if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0)) stat->blksize = HPAGE_PMD_SIZE; if (request_mask & STATX_BTIME) { @@ -1443,6 +1455,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); swp_entry_t swap; pgoff_t index; + int nr_pages; + bool split = false; /* * Our capabilities prevent regular writeback or sync from ever calling @@ -1461,20 +1475,33 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) goto redirty; /* - * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or - * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages, - * and its shmem_writeback() needs them to be split when swapping. + * If CONFIG_THP_SWAP is not enabled, the large folio should be + * split when swapping. + * + * And shrinkage of pages beyond i_size does not split swap, so + * swapout of a large folio crossing i_size needs to split too + * (unless fallocate has been used to preallocate beyond EOF). */ if (folio_test_large(folio)) { + index = shmem_fallocend(inode, + DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE)); + if ((index > folio->index && index < folio_next_index(folio)) || + !IS_ENABLED(CONFIG_THP_SWAP)) + split = true; + } + + if (split) { +try_split: /* Ensure the subpages are still dirty */ folio_test_set_dirty(folio); - if (split_huge_page(page) < 0) + if (split_huge_page_to_list_to_order(page, wbc->list, 0)) goto redirty; folio = page_folio(page); folio_clear_dirty(folio); } index = folio->index; + nr_pages = folio_nr_pages(folio); /* * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC @@ -1509,8 +1536,12 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) } swap = folio_alloc_swap(folio); - if (!swap.val) + if (!swap.val) { + if (nr_pages > 1) + goto try_split; + goto redirty; + } /* * Add inode to shmem_unuse()'s list of swapped-out inodes, @@ -1527,8 +1558,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (add_to_swap_cache(folio, swap, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN, NULL) == 0) { - shmem_recalc_inode(inode, 0, 1); - swap_shmem_alloc(swap); + shmem_recalc_inode(inode, 0, nr_pages); + swap_shmem_alloc(swap, nr_pages); shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap)); mutex_unlock(&shmem_swaplist_mutex); @@ -1624,22 +1655,33 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) #ifdef CONFIG_TRANSPARENT_HUGEPAGE unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, - bool global_huge) + loff_t write_end, bool shmem_huge_force) { unsigned long mask = READ_ONCE(huge_shmem_orders_always); unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); - unsigned long vm_flags = vma->vm_flags; + unsigned long vm_flags = vma ? vma->vm_flags : 0; + bool global_huge; loff_t i_size; int order; - if ((vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) + if (vma && ((vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))) return 0; /* If the hardware/firmware marked hugepage support disabled. */ if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED)) return 0; + global_huge = shmem_huge_global_enabled(inode, index, write_end, + shmem_huge_force, vma, vm_flags); + if (!vma || !vma_is_anon_shmem(vma)) { + /* + * For tmpfs, we now only support PMD sized THP if huge page + * is enabled, otherwise fallback to order 0. + */ + return global_huge ? BIT(HPAGE_PMD_ORDER) : 0; + } + /* * Following the 'deny' semantics of the top level, force the huge * option off from all mounts. @@ -1680,20 +1722,30 @@ static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault struct address_space *mapping, pgoff_t index, unsigned long orders) { - struct vm_area_struct *vma = vmf->vma; + struct vm_area_struct *vma = vmf ? vmf->vma : NULL; pgoff_t aligned_index; unsigned long pages; int order; - orders = thp_vma_suitable_orders(vma, vmf->address, orders); - if (!orders) - return 0; + if (vma) { + orders = thp_vma_suitable_orders(vma, vmf->address, orders); + if (!orders) + return 0; + } /* Find the highest order that can add into the page cache */ order = highest_order(orders); while (orders) { pages = 1UL << order; aligned_index = round_down(index, pages); + /* + * Check for conflict before waiting on a huge allocation. + * Conflict might be that a huge page has just been allocated + * and added to page cache by a racing thread, or that there + * is already at least one small page in the huge extent. + * Be careful to retry when appropriate, but not forever! + * Elsewhere -EEXIST would be the right code, but not here. + */ if (!xa_find(&mapping->i_pages, &aligned_index, aligned_index + pages - 1, XA_PRESENT)) break; @@ -1731,7 +1783,6 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); - struct vm_area_struct *vma = vmf ? vmf->vma : NULL; unsigned long suitable_orders = 0; struct folio *folio = NULL; long pages; @@ -1741,26 +1792,8 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, orders = 0; if (orders > 0) { - if (vma && vma_is_anon_shmem(vma)) { - suitable_orders = shmem_suitable_orders(inode, vmf, + suitable_orders = shmem_suitable_orders(inode, vmf, mapping, index, orders); - } else if (orders & BIT(HPAGE_PMD_ORDER)) { - pages = HPAGE_PMD_NR; - suitable_orders = BIT(HPAGE_PMD_ORDER); - index = round_down(index, HPAGE_PMD_NR); - - /* - * Check for conflict before waiting on a huge allocation. - * Conflict might be that a huge page has just been allocated - * and added to page cache by a racing thread, or that there - * is already at least one small page in the huge extent. - * Be careful to retry when appropriate, but not forever! - * Elsewhere -EEXIST would be the right code, but not here. - */ - if (xa_find(&mapping->i_pages, &index, - index + HPAGE_PMD_NR - 1, XA_PRESENT)) - return ERR_PTR(-E2BIG); - } order = highest_order(suitable_orders); while (suitable_orders) { @@ -1772,9 +1805,7 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, if (pages == HPAGE_PMD_NR) count_vm_event(THP_FILE_FALLBACK); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE count_mthp_stat(order, MTHP_STAT_SHMEM_FALLBACK); -#endif order = next_order(&suitable_orders, order); } } else { @@ -1799,10 +1830,8 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, count_vm_event(THP_FILE_FALLBACK); count_vm_event(THP_FILE_FALLBACK_CHARGE); } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK); count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK_CHARGE); -#endif } goto unlock; } @@ -1819,7 +1848,7 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, * Try to reclaim some space by splitting a few * large folios beyond i_size on the filesystem. */ - shmem_unused_huge_shrink(sbinfo, NULL, 2); + shmem_unused_huge_shrink(sbinfo, NULL, pages); /* * And do a shmem_recalc_inode() to account for freed pages: * except our folio is there in cache, so not quite balanced. @@ -1867,30 +1896,35 @@ static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp) } static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) + struct shmem_inode_info *info, pgoff_t index, + struct vm_area_struct *vma) { - struct folio *old, *new; - struct address_space *swap_mapping; - swp_entry_t entry; - pgoff_t swap_index; - int error; - - old = *foliop; - entry = old->swap; - swap_index = swap_cache_index(entry); - swap_mapping = swap_address_space(entry); + struct folio *new, *old = *foliop; + swp_entry_t entry = old->swap; + struct address_space *swap_mapping = swap_address_space(entry); + pgoff_t swap_index = swap_cache_index(entry); + XA_STATE(xas, &swap_mapping->i_pages, swap_index); + int nr_pages = folio_nr_pages(old); + int error = 0, i; /* * We have arrived here because our zones are constrained, so don't * limit chance of success by further cpuset and node constraints. */ gfp &= ~GFP_CONSTRAINT_MASK; - VM_BUG_ON_FOLIO(folio_test_large(old), old); - new = shmem_alloc_folio(gfp, 0, info, index); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (nr_pages > 1) { + gfp_t huge_gfp = vma_thp_gfp_mask(vma); + + gfp = limit_gfp_mask(huge_gfp, gfp); + } +#endif + + new = shmem_alloc_folio(gfp, folio_order(old), info, index); if (!new) return -ENOMEM; - folio_get(new); + folio_ref_add(new, nr_pages); folio_copy(new, old); flush_dcache_folio(new); @@ -1900,26 +1934,34 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, new->swap = entry; folio_set_swapcache(new); - /* - * Our caller will very soon move newpage out of swapcache, but it's - * a nice clean interface for us to replace oldpage by newpage there. - */ + /* Swap cache still stores N entries instead of a high-order entry */ xa_lock_irq(&swap_mapping->i_pages); - error = shmem_replace_entry(swap_mapping, swap_index, old, new); + for (i = 0; i < nr_pages; i++) { + void *item = xas_load(&xas); + + if (item != old) { + error = -ENOENT; + break; + } + + xas_store(&xas, new); + xas_next(&xas); + } if (!error) { mem_cgroup_replace_folio(old, new); - __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1); - __lruvec_stat_mod_folio(new, NR_SHMEM, 1); - __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1); - __lruvec_stat_mod_folio(old, NR_SHMEM, -1); + __lruvec_stat_mod_folio(new, NR_FILE_PAGES, nr_pages); + __lruvec_stat_mod_folio(new, NR_SHMEM, nr_pages); + __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -nr_pages); + __lruvec_stat_mod_folio(old, NR_SHMEM, -nr_pages); } xa_unlock_irq(&swap_mapping->i_pages); if (unlikely(error)) { /* - * Is this possible? I think not, now that our callers check - * both PageSwapCache and page_private after getting page lock; - * but be defensive. Reverse old to newpage for clear and free. + * Is this possible? I think not, now that our callers + * check both the swapcache flag and folio->private + * after getting the folio lock; but be defensive. + * Reverse old to newpage for clear and free. */ old = new; } else { @@ -1931,7 +1973,12 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, old->private = NULL; folio_unlock(old); - folio_put_refs(old, 2); + /* + * The old folio are removed from swap cache, drop the 'nr_pages' + * reference, as well as one temporary reference getting from swap + * cache. + */ + folio_put_refs(old, nr_pages + 1); return error; } @@ -1941,6 +1988,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; swp_entry_t swapin_error; void *old; + int nr_pages; swapin_error = make_poisoned_swp_entry(); old = xa_cmpxchg_irq(&mapping->i_pages, index, @@ -1949,6 +1997,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, if (old != swp_to_radix_entry(swap)) return; + nr_pages = folio_nr_pages(folio); folio_wait_writeback(folio); delete_from_swap_cache(folio); /* @@ -1956,8 +2005,86 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks) * in shmem_evict_inode(). */ - shmem_recalc_inode(inode, -1, -1); - swap_free(swap); + shmem_recalc_inode(inode, -nr_pages, -nr_pages); + swap_free_nr(swap, nr_pages); +} + +static int shmem_split_large_entry(struct inode *inode, pgoff_t index, + swp_entry_t swap, gfp_t gfp) +{ + struct address_space *mapping = inode->i_mapping; + XA_STATE_ORDER(xas, &mapping->i_pages, index, 0); + void *alloced_shadow = NULL; + int alloced_order = 0, i; + + /* Convert user data gfp flags to xarray node gfp flags */ + gfp &= GFP_RECLAIM_MASK; + + for (;;) { + int order = -1, split_order = 0; + void *old = NULL; + + xas_lock_irq(&xas); + old = xas_load(&xas); + if (!xa_is_value(old) || swp_to_radix_entry(swap) != old) { + xas_set_err(&xas, -EEXIST); + goto unlock; + } + + order = xas_get_order(&xas); + + /* Swap entry may have changed before we re-acquire the lock */ + if (alloced_order && + (old != alloced_shadow || order != alloced_order)) { + xas_destroy(&xas); + alloced_order = 0; + } + + /* Try to split large swap entry in pagecache */ + if (order > 0) { + if (!alloced_order) { + split_order = order; + goto unlock; + } + xas_split(&xas, old, order); + + /* + * Re-set the swap entry after splitting, and the swap + * offset of the original large entry must be continuous. + */ + for (i = 0; i < 1 << order; i++) { + pgoff_t aligned_index = round_down(index, 1 << order); + swp_entry_t tmp; + + tmp = swp_entry(swp_type(swap), swp_offset(swap) + i); + __xa_store(&mapping->i_pages, aligned_index + i, + swp_to_radix_entry(tmp), 0); + } + } + +unlock: + xas_unlock_irq(&xas); + + /* split needed, alloc here and retry. */ + if (split_order) { + xas_split_alloc(&xas, old, split_order, gfp); + if (xas_error(&xas)) + goto error; + alloced_shadow = old; + alloced_order = split_order; + xas_reset(&xas); + continue; + } + + if (!xas_nomem(&xas, gfp)) + break; + } + +error: + if (xas_error(&xas)) + return xas_error(&xas); + + return alloced_order; } /* @@ -1968,15 +2095,16 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, */ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, - gfp_t gfp, struct mm_struct *fault_mm, + gfp_t gfp, struct vm_area_struct *vma, vm_fault_t *fault_type) { struct address_space *mapping = inode->i_mapping; + struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL; struct shmem_inode_info *info = SHMEM_I(inode); struct swap_info_struct *si; struct folio *folio = NULL; swp_entry_t swap; - int error; + int error, nr_pages; VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); swap = radix_to_swp_entry(*foliop); @@ -1996,12 +2124,37 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, /* Look it up and read it in.. */ folio = swap_cache_get_folio(swap, NULL, 0); if (!folio) { + int split_order; + /* Or update major stats only when swapin succeeds?? */ if (fault_type) { *fault_type |= VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); count_memcg_event_mm(fault_mm, PGMAJFAULT); } + + /* + * Now swap device can only swap in order 0 folio, then we + * should split the large swap entry stored in the pagecache + * if necessary. + */ + split_order = shmem_split_large_entry(inode, index, swap, gfp); + if (split_order < 0) { + error = split_order; + goto failed; + } + + /* + * If the large swap entry has already been split, it is + * necessary to recalculate the new swap entry based on + * the old order alignment. + */ + if (split_order > 0) { + pgoff_t offset = index - round_down(index, 1 << split_order); + + swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); + } + /* Here we actually start the io */ folio = shmem_swapin_cluster(swap, gfp, info, index); if (!folio) { @@ -2023,6 +2176,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, goto failed; } folio_wait_writeback(folio); + nr_pages = folio_nr_pages(folio); /* * Some architectures may have to restore extra metadata to the @@ -2031,24 +2185,25 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, arch_swap_restore(folio_swap(swap, folio), folio); if (shmem_should_replace_folio(folio, gfp)) { - error = shmem_replace_folio(&folio, gfp, info, index); + error = shmem_replace_folio(&folio, gfp, info, index, vma); if (error) goto failed; } - error = shmem_add_to_page_cache(folio, mapping, index, + error = shmem_add_to_page_cache(folio, mapping, + round_down(index, nr_pages), swp_to_radix_entry(swap), gfp); if (error) goto failed; - shmem_recalc_inode(inode, 0, -1); + shmem_recalc_inode(inode, 0, -nr_pages); if (sgp == SGP_WRITE) folio_mark_accessed(folio); delete_from_swap_cache(folio); folio_mark_dirty(folio); - swap_free(swap); + swap_free_nr(swap, nr_pages); put_swap_device(si); *foliop = folio; @@ -2078,14 +2233,14 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL. */ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, - struct folio **foliop, enum sgp_type sgp, gfp_t gfp, - struct vm_fault *vmf, vm_fault_t *fault_type) + loff_t write_end, struct folio **foliop, enum sgp_type sgp, + gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type) { struct vm_area_struct *vma = vmf ? vmf->vma : NULL; struct mm_struct *fault_mm; struct folio *folio; int error; - bool alloced, huge; + bool alloced; unsigned long orders = 0; if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping))) @@ -2111,7 +2266,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, if (xa_is_value(folio)) { error = shmem_swapin_folio(inode, index, &folio, - sgp, gfp, fault_mm, fault_type); + sgp, gfp, vma, fault_type); if (error == -EEXIST) goto repeat; @@ -2158,14 +2313,8 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, return 0; } - huge = shmem_is_huge(inode, index, false, fault_mm, - vma ? vma->vm_flags : 0); - /* Find hugepage orders that are allowed for anonymous shmem. */ - if (vma && vma_is_anon_shmem(vma)) - orders = shmem_allowable_huge_orders(inode, vma, index, huge); - else if (huge) - orders = BIT(HPAGE_PMD_ORDER); - + /* Find hugepage orders that are allowed for anonymous shmem and tmpfs. */ + orders = shmem_allowable_huge_orders(inode, vma, index, write_end, false); if (orders > 0) { gfp_t huge_gfp; @@ -2176,9 +2325,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, if (!IS_ERR(folio)) { if (folio_test_pmd_mappable(folio)) count_vm_event(THP_FILE_ALLOC); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_ALLOC); -#endif goto alloced; } if (PTR_ERR(folio) == -EEXIST) @@ -2198,7 +2345,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, alloced = true; if (folio_test_large(folio) && DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < - folio_next_index(folio) - 1) { + folio_next_index(folio)) { struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct shmem_inode_info *info = SHMEM_I(inode); /* @@ -2268,6 +2415,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, * shmem_get_folio - find, and lock a shmem folio. * @inode: inode to search * @index: the page index. + * @write_end: end of a write, could extend inode size * @foliop: pointer to the folio if found * @sgp: SGP_* flags to control behavior * @@ -2287,10 +2435,10 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, * Context: May sleep. * Return: 0 if successful, else a negative error code. */ -int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, - enum sgp_type sgp) +int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end, + struct folio **foliop, enum sgp_type sgp) { - return shmem_get_folio_gfp(inode, index, foliop, sgp, + return shmem_get_folio_gfp(inode, index, write_end, foliop, sgp, mapping_gfp_mask(inode->i_mapping), NULL, NULL); } EXPORT_SYMBOL_GPL(shmem_get_folio); @@ -2385,7 +2533,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) } WARN_ON_ONCE(vmf->page != NULL); - err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE, + err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE, gfp, vmf, &ret); if (err) return vmf_error(err); @@ -2895,7 +3043,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping, return -EPERM; } - ret = shmem_get_folio(inode, index, &folio, SGP_WRITE); + ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE); if (ret) return ret; @@ -2965,7 +3113,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) break; } - error = shmem_get_folio(inode, index, &folio, SGP_READ); + error = shmem_get_folio(inode, index, 0, &folio, SGP_READ); if (error) { if (error == -EINVAL) error = 0; @@ -3141,7 +3289,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, if (*ppos >= i_size_read(inode)) break; - error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio, + error = shmem_get_folio(inode, *ppos / PAGE_SIZE, 0, &folio, SGP_READ); if (error) { if (error == -EINVAL) @@ -3331,8 +3479,8 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) error = -ENOMEM; else - error = shmem_get_folio(inode, index, &folio, - SGP_FALLOC); + error = shmem_get_folio(inode, index, offset + len, + &folio, SGP_FALLOC); if (error) { info->fallocend = undo_fallocend; /* Remove the !uptodate folios we added */ @@ -3683,7 +3831,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, } else { inode_nohighmem(inode); inode->i_mapping->a_ops = &shmem_aops; - error = shmem_get_folio(inode, 0, &folio, SGP_WRITE); + error = shmem_get_folio(inode, 0, 0, &folio, SGP_WRITE); if (error) goto out_remove_offset; inode->i_op = &shmem_symlink_inode_operations; @@ -3729,7 +3877,7 @@ static const char *shmem_get_link(struct dentry *dentry, struct inode *inode, return ERR_PTR(-ECHILD); } } else { - error = shmem_get_folio(inode, 0, &folio, SGP_READ); + error = shmem_get_folio(inode, 0, 0, &folio, SGP_READ); if (error) return ERR_PTR(error); if (!folio) @@ -5197,7 +5345,7 @@ struct folio *shmem_read_folio_gfp(struct address_space *mapping, struct folio *folio; int error; - error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE, + error = shmem_get_folio_gfp(inode, index, 0, &folio, SGP_CACHE, gfp, NULL, NULL); if (error) return ERR_PTR(error); diff --git a/mm/shmem_quota.c b/mm/shmem_quota.c index ce514e700d2f..d1e32ac01407 100644 --- a/mm/shmem_quota.c +++ b/mm/shmem_quota.c @@ -34,8 +34,6 @@ #include #include -#ifdef CONFIG_TMPFS_QUOTA - /* * The following constants define the amount of time given a user * before the soft limits are treated as hard limits (usually resulting @@ -351,4 +349,3 @@ const struct dquot_operations shmem_quota_operations = { .mark_dirty = shmem_mark_dquot_dirty, .get_next_id = shmem_get_next_id, }; -#endif /* CONFIG_TMPFS_QUOTA */ diff --git a/mm/show_mem.c b/mm/show_mem.c index bdb439551eef..ec885a398fa0 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -435,15 +435,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) struct codetag *ct = tags[i].ct; struct alloc_tag *tag = ct_to_alloc_tag(ct); struct alloc_tag_counters counter = alloc_tag_read(tag); + char bytes[10]; + + string_get_size(counter.bytes, 1, STRING_UNITS_2, bytes, sizeof(bytes)); /* Same as alloc_tag_to_text() but w/o intermediate buffer */ if (ct->modname) - pr_notice("%12lli %8llu %s:%u [%s] func:%s\n", - counter.bytes, counter.calls, ct->filename, + pr_notice("%12s %8llu %s:%u [%s] func:%s\n", + bytes, counter.calls, ct->filename, ct->lineno, ct->modname, ct->function); else - pr_notice("%12lli %8llu %s:%u func:%s\n", - counter.bytes, counter.calls, ct->filename, + pr_notice("%12s %8llu %s:%u func:%s\n", + bytes, counter.calls, ct->filename, ct->lineno, ct->function); } } diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index 12ea5486a3e9..4a85b94d12ce 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -114,7 +114,7 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, int nid; char kbuf[72]; - read_len = size < (sizeof(kbuf) - 1) ? size : (sizeof(kbuf) - 1); + read_len = min(size, sizeof(kbuf) - 1); if (copy_from_user(kbuf, buf, read_len)) return -EFAULT; kbuf[read_len] = '\0'; diff --git a/mm/slab_common.c b/mm/slab_common.c index 61f32420230a..744324465615 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1205,6 +1205,13 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) /* If the object still fits, repoison it precisely. */ if (ks >= new_size) { + /* Zero out spare memory. */ + if (want_init_on_alloc(flags)) { + kasan_disable_current(); + memset((void *)p + new_size, 0, ks - new_size); + kasan_enable_current(); + } + p = kasan_krealloc((void *)p, new_size, flags); return (void *)p; } @@ -1226,11 +1233,27 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) * @new_size: how many bytes of memory are required. * @flags: the type of memory to allocate. * - * The contents of the object pointed to are preserved up to the - * lesser of the new and old sizes (__GFP_ZERO flag is effectively ignored). * If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size * is 0 and @p is not a %NULL pointer, the object pointed to is freed. * + * If __GFP_ZERO logic is requested, callers must ensure that, starting with the + * initial memory allocation, every subsequent call to this API for the same + * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that + * __GFP_ZERO is not fully honored by this API. + * + * This is the case, since krealloc() only knows about the bucket size of an + * allocation (but not the exact size it was allocated with) and hence + * implements the following semantics for shrinking and growing buffers with + * __GFP_ZERO. + * + * new bucket + * 0 size size + * |--------|----------------| + * | keep | zero | + * + * In any case, the contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. + * * Return: pointer to the allocated memory or %NULL in case of error */ void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags) diff --git a/mm/swap.c b/mm/swap.c index 9caf6b017cf0..835bdf324b76 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -47,31 +47,27 @@ int page_cluster; const int page_cluster_max = 31; -/* Protecting only lru_rotate.fbatch which requires disabling interrupts */ -struct lru_rotate { - local_lock_t lock; - struct folio_batch fbatch; -}; -static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = { - .lock = INIT_LOCAL_LOCK(lock), -}; - -/* - * The following folio batches are grouped together because they are protected - * by disabling preemption (and interrupts remain enabled). - */ struct cpu_fbatches { + /* + * The following folio batches are grouped together because they are protected + * by disabling preemption (and interrupts remain enabled). + */ local_lock_t lock; struct folio_batch lru_add; struct folio_batch lru_deactivate_file; struct folio_batch lru_deactivate; struct folio_batch lru_lazyfree; #ifdef CONFIG_SMP - struct folio_batch activate; + struct folio_batch lru_activate; #endif + /* Protecting the following batches which require disabling interrupts */ + local_lock_t lock_irq; + struct folio_batch lru_move_tail; }; + static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = { .lock = INIT_LOCAL_LOCK(lock), + .lock_irq = INIT_LOCAL_LOCK(lock_irq), }; static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp, @@ -117,7 +113,9 @@ void __folio_put(struct folio *folio) if (unlikely(folio_is_zone_device(folio))) { free_zone_device_folio(folio); return; - } else if (folio_test_hugetlb(folio)) { + } + + if (folio_test_hugetlb(folio)) { free_huge_folio(folio); return; } @@ -162,7 +160,7 @@ EXPORT_SYMBOL(put_pages_list); typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio); -static void lru_add_fn(struct lruvec *lruvec, struct folio *folio) +static void lru_add(struct lruvec *lruvec, struct folio *folio) { int was_unevictable = folio_test_clear_unevictable(folio); long nr_pages = folio_nr_pages(folio); @@ -222,23 +220,50 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) folios_put(fbatch); } -static void folio_batch_add_and_move(struct folio_batch *fbatch, - struct folio *folio, move_fn_t move_fn) +static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch, + struct folio *folio, move_fn_t move_fn, + bool on_lru, bool disable_irq) { - if (folio_batch_add(fbatch, folio) && !folio_test_large(folio) && - !lru_cache_disabled()) + unsigned long flags; + + if (on_lru && !folio_test_clear_lru(folio)) return; - folio_batch_move_lru(fbatch, move_fn); + + folio_get(folio); + + if (disable_irq) + local_lock_irqsave(&cpu_fbatches.lock_irq, flags); + else + local_lock(&cpu_fbatches.lock); + + if (!folio_batch_add(this_cpu_ptr(fbatch), folio) || folio_test_large(folio) || + lru_cache_disabled()) + folio_batch_move_lru(this_cpu_ptr(fbatch), move_fn); + + if (disable_irq) + local_unlock_irqrestore(&cpu_fbatches.lock_irq, flags); + else + local_unlock(&cpu_fbatches.lock); } -static void lru_move_tail_fn(struct lruvec *lruvec, struct folio *folio) +#define folio_batch_add_and_move(folio, op, on_lru) \ + __folio_batch_add_and_move( \ + &cpu_fbatches.op, \ + folio, \ + op, \ + on_lru, \ + offsetof(struct cpu_fbatches, op) >= offsetof(struct cpu_fbatches, lock_irq) \ + ) + +static void lru_move_tail(struct lruvec *lruvec, struct folio *folio) { - if (!folio_test_unevictable(folio)) { - lruvec_del_folio(lruvec, folio); - folio_clear_active(folio); - lruvec_add_folio_tail(lruvec, folio); - __count_vm_events(PGROTATED, folio_nr_pages(folio)); - } + if (folio_test_unevictable(folio)) + return; + + lruvec_del_folio(lruvec, folio); + folio_clear_active(folio); + lruvec_add_folio_tail(lruvec, folio); + __count_vm_events(PGROTATED, folio_nr_pages(folio)); } /* @@ -250,22 +275,11 @@ static void lru_move_tail_fn(struct lruvec *lruvec, struct folio *folio) */ void folio_rotate_reclaimable(struct folio *folio) { - if (!folio_test_locked(folio) && !folio_test_dirty(folio) && - !folio_test_unevictable(folio)) { - struct folio_batch *fbatch; - unsigned long flags; + if (folio_test_locked(folio) || folio_test_dirty(folio) || + folio_test_unevictable(folio)) + return; - folio_get(folio); - if (!folio_test_clear_lru(folio)) { - folio_put(folio); - return; - } - - local_lock_irqsave(&lru_rotate.lock, flags); - fbatch = this_cpu_ptr(&lru_rotate.fbatch); - folio_batch_add_and_move(fbatch, folio, lru_move_tail_fn); - local_unlock_irqrestore(&lru_rotate.lock, flags); - } + folio_batch_add_and_move(folio, lru_move_tail, true); } void lru_note_cost(struct lruvec *lruvec, bool file, @@ -326,47 +340,38 @@ void lru_note_cost_refault(struct folio *folio) folio_nr_pages(folio), 0); } -static void folio_activate_fn(struct lruvec *lruvec, struct folio *folio) +static void lru_activate(struct lruvec *lruvec, struct folio *folio) { - if (!folio_test_active(folio) && !folio_test_unevictable(folio)) { - long nr_pages = folio_nr_pages(folio); + long nr_pages = folio_nr_pages(folio); - lruvec_del_folio(lruvec, folio); - folio_set_active(folio); - lruvec_add_folio(lruvec, folio); - trace_mm_lru_activate(folio); + if (folio_test_active(folio) || folio_test_unevictable(folio)) + return; - __count_vm_events(PGACTIVATE, nr_pages); - __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, - nr_pages); - } + + lruvec_del_folio(lruvec, folio); + folio_set_active(folio); + lruvec_add_folio(lruvec, folio); + trace_mm_lru_activate(folio); + + __count_vm_events(PGACTIVATE, nr_pages); + __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages); } #ifdef CONFIG_SMP static void folio_activate_drain(int cpu) { - struct folio_batch *fbatch = &per_cpu(cpu_fbatches.activate, cpu); + struct folio_batch *fbatch = &per_cpu(cpu_fbatches.lru_activate, cpu); if (folio_batch_count(fbatch)) - folio_batch_move_lru(fbatch, folio_activate_fn); + folio_batch_move_lru(fbatch, lru_activate); } void folio_activate(struct folio *folio) { - if (!folio_test_active(folio) && !folio_test_unevictable(folio)) { - struct folio_batch *fbatch; + if (folio_test_active(folio) || folio_test_unevictable(folio)) + return; - folio_get(folio); - if (!folio_test_clear_lru(folio)) { - folio_put(folio); - return; - } - - local_lock(&cpu_fbatches.lock); - fbatch = this_cpu_ptr(&cpu_fbatches.activate); - folio_batch_add_and_move(fbatch, folio, folio_activate_fn); - local_unlock(&cpu_fbatches.lock); - } + folio_batch_add_and_move(folio, lru_activate, true); } #else @@ -378,12 +383,13 @@ void folio_activate(struct folio *folio) { struct lruvec *lruvec; - if (folio_test_clear_lru(folio)) { - lruvec = folio_lruvec_lock_irq(folio); - folio_activate_fn(lruvec, folio); - unlock_page_lruvec_irq(lruvec); - folio_set_lru(folio); - } + if (!folio_test_clear_lru(folio)) + return; + + lruvec = folio_lruvec_lock_irq(folio); + lru_activate(lruvec, folio); + unlock_page_lruvec_irq(lruvec); + folio_set_lru(folio); } #endif @@ -482,7 +488,7 @@ void folio_mark_accessed(struct folio *folio) } else if (!folio_test_active(folio)) { /* * If the folio is on the LRU, queue it for activation via - * cpu_fbatches.activate. Otherwise, assume the folio is in a + * cpu_fbatches.lru_activate. Otherwise, assume the folio is in a * folio_batch, mark it active and it'll be moved to the active * LRU on the next drain. */ @@ -509,8 +515,6 @@ EXPORT_SYMBOL(folio_mark_accessed); */ void folio_add_lru(struct folio *folio) { - struct folio_batch *fbatch; - VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); @@ -520,11 +524,7 @@ void folio_add_lru(struct folio *folio) lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) folio_set_active(folio); - folio_get(folio); - local_lock(&cpu_fbatches.lock); - fbatch = this_cpu_ptr(&cpu_fbatches.lru_add); - folio_batch_add_and_move(fbatch, folio, lru_add_fn); - local_unlock(&cpu_fbatches.lock); + folio_batch_add_and_move(folio, lru_add, false); } EXPORT_SYMBOL(folio_add_lru); @@ -567,7 +567,7 @@ void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma) * written out by flusher threads as this is much more efficient * than the single-page writeout from reclaim. */ -static void lru_deactivate_file_fn(struct lruvec *lruvec, struct folio *folio) +static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio) { bool active = folio_test_active(folio); long nr_pages = folio_nr_pages(folio); @@ -608,43 +608,43 @@ static void lru_deactivate_file_fn(struct lruvec *lruvec, struct folio *folio) } } -static void lru_deactivate_fn(struct lruvec *lruvec, struct folio *folio) +static void lru_deactivate(struct lruvec *lruvec, struct folio *folio) { - if (!folio_test_unevictable(folio) && (folio_test_active(folio) || lru_gen_enabled())) { - long nr_pages = folio_nr_pages(folio); + long nr_pages = folio_nr_pages(folio); - lruvec_del_folio(lruvec, folio); - folio_clear_active(folio); - folio_clear_referenced(folio); - lruvec_add_folio(lruvec, folio); + if (folio_test_unevictable(folio) || !(folio_test_active(folio) || lru_gen_enabled())) + return; - __count_vm_events(PGDEACTIVATE, nr_pages); - __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, - nr_pages); - } + lruvec_del_folio(lruvec, folio); + folio_clear_active(folio); + folio_clear_referenced(folio); + lruvec_add_folio(lruvec, folio); + + __count_vm_events(PGDEACTIVATE, nr_pages); + __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } -static void lru_lazyfree_fn(struct lruvec *lruvec, struct folio *folio) +static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio) { - if (folio_test_anon(folio) && folio_test_swapbacked(folio) && - !folio_test_swapcache(folio) && !folio_test_unevictable(folio)) { - long nr_pages = folio_nr_pages(folio); + long nr_pages = folio_nr_pages(folio); - lruvec_del_folio(lruvec, folio); - folio_clear_active(folio); - folio_clear_referenced(folio); - /* - * Lazyfree folios are clean anonymous folios. They have - * the swapbacked flag cleared, to distinguish them from normal - * anonymous folios - */ - folio_clear_swapbacked(folio); - lruvec_add_folio(lruvec, folio); + if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) || + folio_test_swapcache(folio) || folio_test_unevictable(folio)) + return; - __count_vm_events(PGLAZYFREE, nr_pages); - __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, - nr_pages); - } + lruvec_del_folio(lruvec, folio); + folio_clear_active(folio); + folio_clear_referenced(folio); + /* + * Lazyfree folios are clean anonymous folios. They have + * the swapbacked flag cleared, to distinguish them from normal + * anonymous folios + */ + folio_clear_swapbacked(folio); + lruvec_add_folio(lruvec, folio); + + __count_vm_events(PGLAZYFREE, nr_pages); + __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages); } /* @@ -658,30 +658,30 @@ void lru_add_drain_cpu(int cpu) struct folio_batch *fbatch = &fbatches->lru_add; if (folio_batch_count(fbatch)) - folio_batch_move_lru(fbatch, lru_add_fn); + folio_batch_move_lru(fbatch, lru_add); - fbatch = &per_cpu(lru_rotate.fbatch, cpu); + fbatch = &fbatches->lru_move_tail; /* Disabling interrupts below acts as a compiler barrier. */ if (data_race(folio_batch_count(fbatch))) { unsigned long flags; /* No harm done if a racing interrupt already did this */ - local_lock_irqsave(&lru_rotate.lock, flags); - folio_batch_move_lru(fbatch, lru_move_tail_fn); - local_unlock_irqrestore(&lru_rotate.lock, flags); + local_lock_irqsave(&cpu_fbatches.lock_irq, flags); + folio_batch_move_lru(fbatch, lru_move_tail); + local_unlock_irqrestore(&cpu_fbatches.lock_irq, flags); } fbatch = &fbatches->lru_deactivate_file; if (folio_batch_count(fbatch)) - folio_batch_move_lru(fbatch, lru_deactivate_file_fn); + folio_batch_move_lru(fbatch, lru_deactivate_file); fbatch = &fbatches->lru_deactivate; if (folio_batch_count(fbatch)) - folio_batch_move_lru(fbatch, lru_deactivate_fn); + folio_batch_move_lru(fbatch, lru_deactivate); fbatch = &fbatches->lru_lazyfree; if (folio_batch_count(fbatch)) - folio_batch_move_lru(fbatch, lru_lazyfree_fn); + folio_batch_move_lru(fbatch, lru_lazyfree); folio_activate_drain(cpu); } @@ -698,22 +698,11 @@ void lru_add_drain_cpu(int cpu) */ void deactivate_file_folio(struct folio *folio) { - struct folio_batch *fbatch; - /* Deactivating an unevictable folio will not accelerate reclaim */ if (folio_test_unevictable(folio)) return; - folio_get(folio); - if (!folio_test_clear_lru(folio)) { - folio_put(folio); - return; - } - - local_lock(&cpu_fbatches.lock); - fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate_file); - folio_batch_add_and_move(fbatch, folio, lru_deactivate_file_fn); - local_unlock(&cpu_fbatches.lock); + folio_batch_add_and_move(folio, lru_deactivate_file, true); } /* @@ -726,21 +715,10 @@ void deactivate_file_folio(struct folio *folio) */ void folio_deactivate(struct folio *folio) { - if (!folio_test_unevictable(folio) && (folio_test_active(folio) || - lru_gen_enabled())) { - struct folio_batch *fbatch; + if (folio_test_unevictable(folio) || !(folio_test_active(folio) || lru_gen_enabled())) + return; - folio_get(folio); - if (!folio_test_clear_lru(folio)) { - folio_put(folio); - return; - } - - local_lock(&cpu_fbatches.lock); - fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate); - folio_batch_add_and_move(fbatch, folio, lru_deactivate_fn); - local_unlock(&cpu_fbatches.lock); - } + folio_batch_add_and_move(folio, lru_deactivate, true); } /** @@ -752,21 +730,11 @@ void folio_deactivate(struct folio *folio) */ void folio_mark_lazyfree(struct folio *folio) { - if (folio_test_anon(folio) && folio_test_swapbacked(folio) && - !folio_test_swapcache(folio) && !folio_test_unevictable(folio)) { - struct folio_batch *fbatch; + if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) || + folio_test_swapcache(folio) || folio_test_unevictable(folio)) + return; - folio_get(folio); - if (!folio_test_clear_lru(folio)) { - folio_put(folio); - return; - } - - local_lock(&cpu_fbatches.lock); - fbatch = this_cpu_ptr(&cpu_fbatches.lru_lazyfree); - folio_batch_add_and_move(fbatch, folio, lru_lazyfree_fn); - local_unlock(&cpu_fbatches.lock); - } + folio_batch_add_and_move(folio, lru_lazyfree, true); } void lru_add_drain(void) @@ -816,11 +784,11 @@ static bool cpu_needs_drain(unsigned int cpu) /* Check these in order of likelihood that they're not zero */ return folio_batch_count(&fbatches->lru_add) || - data_race(folio_batch_count(&per_cpu(lru_rotate.fbatch, cpu))) || + folio_batch_count(&fbatches->lru_move_tail) || folio_batch_count(&fbatches->lru_deactivate_file) || folio_batch_count(&fbatches->lru_deactivate) || folio_batch_count(&fbatches->lru_lazyfree) || - folio_batch_count(&fbatches->activate) || + folio_batch_count(&fbatches->lru_activate) || need_mlock_drain(cpu) || has_bh_in_lru(cpu, NULL); } @@ -938,8 +906,8 @@ atomic_t lru_disable_count = ATOMIC_INIT(0); /* * lru_cache_disable() needs to be called before we start compiling - * a list of pages to be migrated using isolate_lru_page(). - * It drains pages on LRU cache and then disable on all cpus until + * a list of folios to be migrated using folio_isolate_lru(). + * It drains folios on LRU cache and then disable on all cpus until * lru_cache_enable is called. * * Must be paired with a call to lru_cache_enable(). diff --git a/mm/swap.h b/mm/swap.h index baa1fa946b34..ad2f121de970 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -59,7 +59,7 @@ void __delete_from_swap_cache(struct folio *folio, void delete_from_swap_cache(struct folio *folio); void clear_shadow_from_swap_cache(int type, unsigned long begin, unsigned long end); -void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry); +void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr); struct folio *swap_cache_get_folio(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr); struct folio *filemap_get_incore_folio(struct address_space *mapping, @@ -73,13 +73,39 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_flags, bool skip_if_exists); struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, struct mempolicy *mpol, pgoff_t ilx); -struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, - struct vm_fault *vmf); +struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag, + struct vm_fault *vmf); static inline unsigned int folio_swap_flags(struct folio *folio) { return swp_swap_info(folio->swap)->flags; } + +/* + * Return the count of contiguous swap entries that share the same + * zeromap status as the starting entry. If is_zeromap is not NULL, + * it will return the zeromap status of the starting entry. + */ +static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, + bool *is_zeromap) +{ + struct swap_info_struct *sis = swp_swap_info(entry); + unsigned long start = swp_offset(entry); + unsigned long end = start + max_nr; + bool first_bit; + + first_bit = test_bit(start, sis->zeromap); + if (is_zeromap) + *is_zeromap = first_bit; + + if (max_nr <= 1) + return max_nr; + if (first_bit) + return find_next_zero_bit(sis->zeromap, end, start) - start; + else + return find_next_bit(sis->zeromap, end, start) - start; +} + #else /* CONFIG_SWAP */ struct swap_iocb; static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) @@ -109,7 +135,7 @@ static inline struct folio *swap_cluster_readahead(swp_entry_t entry, return NULL; } -static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, +static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, struct vm_fault *vmf) { return NULL; @@ -120,7 +146,7 @@ static inline int swap_writepage(struct page *p, struct writeback_control *wbc) return 0; } -static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry) +static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) { } @@ -171,5 +197,13 @@ static inline unsigned int folio_swap_flags(struct folio *folio) { return 0; } + +static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, + bool *has_zeromap) +{ + return 0; +} + #endif /* CONFIG_SWAP */ + #endif /* _MM_SWAP_H */ diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index db6c4a26cf59..da1278f0563b 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -161,6 +161,8 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, */ unsigned short lookup_swap_cgroup_id(swp_entry_t ent) { + if (mem_cgroup_disabled()) + return 0; return lookup_swap_cgroup(ent, NULL)->id; } diff --git a/mm/swap_state.c b/mm/swap_state.c index a1726e49a5eb..4669f29cf555 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -435,6 +435,8 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, { struct swap_info_struct *si; struct folio *folio; + struct folio *new_folio = NULL; + struct folio *result = NULL; void *shadow = NULL; *new_page_allocated = false; @@ -463,27 +465,28 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * else swap_off will be aborted if we return NULL. */ if (!swap_swapcount(si, entry) && swap_slot_cache_enabled) - goto fail_put_swap; + goto put_and_return; /* - * Get a new folio to read into from swap. Allocate it now, - * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will - * cause any racers to loop around until we add it to cache. + * Get a new folio to read into from swap. Allocate it now if + * new_folio not exist, before marking swap_map SWAP_HAS_CACHE, + * when -EEXIST will cause any racers to loop around until we + * add it to cache. */ - folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id()); - if (!folio) - goto fail_put_swap; + if (!new_folio) { + new_folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id()); + if (!new_folio) + goto put_and_return; + } /* * Swap entry may have been freed since our caller observed it. */ - err = swapcache_prepare(entry); + err = swapcache_prepare(entry, 1); if (!err) break; - - folio_put(folio); - if (err != -EEXIST) - goto fail_put_swap; + else if (err != -EEXIST) + goto put_and_return; /* * Protect against a recursive call to __read_swap_cache_async() @@ -494,7 +497,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * __read_swap_cache_async() in the writeback path. */ if (skip_if_exists) - goto fail_put_swap; + goto put_and_return; /* * We might race against __delete_from_swap_cache(), and @@ -509,36 +512,37 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * The swap entry is ours to swap in. Prepare the new folio. */ + __folio_set_locked(new_folio); + __folio_set_swapbacked(new_folio); - __folio_set_locked(folio); - __folio_set_swapbacked(folio); - - if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry)) + if (mem_cgroup_swapin_charge_folio(new_folio, NULL, gfp_mask, entry)) goto fail_unlock; /* May fail (-ENOMEM) if XArray node allocation failed. */ - if (add_to_swap_cache(folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) + if (add_to_swap_cache(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) goto fail_unlock; - mem_cgroup_swapin_uncharge_swap(entry); + mem_cgroup_swapin_uncharge_swap(entry, 1); if (shadow) - workingset_refault(folio, shadow); + workingset_refault(new_folio, shadow); - /* Caller will initiate read into locked folio */ - folio_add_lru(folio); + /* Caller will initiate read into locked new_folio */ + folio_add_lru(new_folio); *new_page_allocated = true; + folio = new_folio; got_folio: - put_swap_device(si); - return folio; + result = folio; + goto put_and_return; fail_unlock: - put_swap_folio(folio, entry); - folio_unlock(folio); - folio_put(folio); -fail_put_swap: + put_swap_folio(new_folio, entry); + folio_unlock(new_folio); +put_and_return: put_swap_device(si); - return NULL; + if (!(*new_page_allocated) && new_folio) + folio_put(new_folio); + return result; } /* @@ -698,10 +702,8 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, /* The page was likely read above, so no need for plugging here */ folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); - if (unlikely(page_allocated)) { - zswap_folio_swapin(folio); + if (unlikely(page_allocated)) swap_read_folio(folio, NULL); - } return folio; } @@ -850,10 +852,8 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, /* The folio was likely read above, so no need for plugging here */ folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, &page_allocated, false); - if (unlikely(page_allocated)) { - zswap_folio_swapin(folio); + if (unlikely(page_allocated)) swap_read_folio(folio, NULL); - } return folio; } @@ -863,13 +863,13 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, * @gfp_mask: memory allocation flags * @vmf: fault information * - * Returns the struct page for entry and addr, after queueing swapin. + * Returns the struct folio for entry and addr, after queueing swapin. * * It's a main entry function for swap readahead. By the configuration, * it will read ahead blocks by cluster-based(ie, physical disk based) * or vma-based(ie, virtual address based on faulty address) readahead. */ -struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, +struct folio *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct vm_fault *vmf) { struct mempolicy *mpol; @@ -882,9 +882,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, swap_cluster_readahead(entry, gfp_mask, mpol, ilx); mpol_cond_put(mpol); - if (!folio) - return NULL; - return folio_file_page(folio, swp_offset(entry)); + return folio; } #ifdef CONFIG_SYSFS diff --git a/mm/swapfile.c b/mm/swapfile.c index 38bdc439651a..0cded32414a1 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -53,6 +53,15 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); +static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, + unsigned int nr_pages); +static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, + unsigned int nr_entries); +static bool folio_swapcache_freeable(struct folio *folio); +static struct swap_cluster_info *lock_cluster_or_swap_info( + struct swap_info_struct *si, unsigned long offset); +static void unlock_cluster_or_swap_info(struct swap_info_struct *si, + struct swap_cluster_info *ci); static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; @@ -127,8 +136,44 @@ static inline unsigned char swap_count(unsigned char ent) * corresponding page */ #define TTRS_UNMAPPED 0x2 -/* Reclaim the swap entry if swap is getting full*/ +/* Reclaim the swap entry if swap is getting full */ #define TTRS_FULL 0x4 +/* Reclaim directly, bypass the slot cache and don't touch device lock */ +#define TTRS_DIRECT 0x8 + +static bool swap_is_has_cache(struct swap_info_struct *si, + unsigned long offset, int nr_pages) +{ + unsigned char *map = si->swap_map + offset; + unsigned char *map_end = map + nr_pages; + + do { + VM_BUG_ON(!(*map & SWAP_HAS_CACHE)); + if (*map != SWAP_HAS_CACHE) + return false; + } while (++map < map_end); + + return true; +} + +static bool swap_is_last_map(struct swap_info_struct *si, + unsigned long offset, int nr_pages, bool *has_cache) +{ + unsigned char *map = si->swap_map + offset; + unsigned char *map_end = map + nr_pages; + unsigned char count = *map; + + if (swap_count(count) != 1) + return false; + + while (++map < map_end) { + if (*map != count) + return false; + } + + *has_cache = !!(count & SWAP_HAS_CACHE); + return true; +} /* * returns number of pages in the folio that backs the swap entry. If positive, @@ -139,12 +184,22 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset, unsigned long flags) { swp_entry_t entry = swp_entry(si->type, offset); + struct address_space *address_space = swap_address_space(entry); + struct swap_cluster_info *ci; struct folio *folio; - int ret = 0; + int ret, nr_pages; + bool need_reclaim; - folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); + folio = filemap_get_folio(address_space, swap_cache_index(entry)); if (IS_ERR(folio)) return 0; + + /* offset could point to the middle of a large folio */ + entry = folio->swap; + offset = swp_offset(entry); + nr_pages = folio_nr_pages(folio); + ret = -nr_pages; + /* * When this function is called from scan_swap_map_slots() and it's * called by vmscan.c at reclaiming folios. So we hold a folio lock @@ -152,14 +207,50 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, * case and you should use folio_free_swap() with explicit folio_lock() * in usual operations. */ - if (folio_trylock(folio)) { - if ((flags & TTRS_ANYWAY) || - ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || - ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))) - ret = folio_free_swap(folio); - folio_unlock(folio); + if (!folio_trylock(folio)) + goto out; + + need_reclaim = ((flags & TTRS_ANYWAY) || + ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || + ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))); + if (!need_reclaim || !folio_swapcache_freeable(folio)) + goto out_unlock; + + /* + * It's safe to delete the folio from swap cache only if the folio's + * swap_map is HAS_CACHE only, which means the slots have no page table + * reference or pending writeback, and can't be allocated to others. + */ + ci = lock_cluster_or_swap_info(si, offset); + need_reclaim = swap_is_has_cache(si, offset, nr_pages); + unlock_cluster_or_swap_info(si, ci); + if (!need_reclaim) + goto out_unlock; + + if (!(flags & TTRS_DIRECT)) { + /* Free through slot cache */ + delete_from_swap_cache(folio); + folio_set_dirty(folio); + ret = nr_pages; + goto out_unlock; } - ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio); + + xa_lock_irq(&address_space->i_pages); + __delete_from_swap_cache(folio, entry, NULL); + xa_unlock_irq(&address_space->i_pages); + folio_ref_sub(folio, nr_pages); + folio_set_dirty(folio); + + spin_lock(&si->lock); + /* Only sinple page folio can be backed by zswap */ + if (nr_pages == 1) + zswap_invalidate(entry); + swap_entry_range_free(si, entry, nr_pages); + spin_unlock(&si->lock); + ret = nr_pages; +out_unlock: + folio_unlock(folio); +out: folio_put(folio); return ret; } @@ -290,62 +381,21 @@ static void discard_swap_cluster(struct swap_info_struct *si, #endif #define LATENCY_LIMIT 256 -static inline void cluster_set_flag(struct swap_cluster_info *info, - unsigned int flag) -{ - info->flags = flag; -} - -static inline unsigned int cluster_count(struct swap_cluster_info *info) -{ - return info->data; -} - -static inline void cluster_set_count(struct swap_cluster_info *info, - unsigned int c) -{ - info->data = c; -} - -static inline void cluster_set_count_flag(struct swap_cluster_info *info, - unsigned int c, unsigned int f) -{ - info->flags = f; - info->data = c; -} - -static inline unsigned int cluster_next(struct swap_cluster_info *info) -{ - return info->data; -} - -static inline void cluster_set_next(struct swap_cluster_info *info, - unsigned int n) -{ - info->data = n; -} - -static inline void cluster_set_next_flag(struct swap_cluster_info *info, - unsigned int n, unsigned int f) -{ - info->flags = f; - info->data = n; -} - static inline bool cluster_is_free(struct swap_cluster_info *info) { return info->flags & CLUSTER_FLAG_FREE; } -static inline bool cluster_is_null(struct swap_cluster_info *info) +static inline unsigned int cluster_index(struct swap_info_struct *si, + struct swap_cluster_info *ci) { - return info->flags & CLUSTER_FLAG_NEXT_NULL; + return ci - si->cluster_info; } -static inline void cluster_set_null(struct swap_cluster_info *info) +static inline unsigned int cluster_offset(struct swap_info_struct *si, + struct swap_cluster_info *ci) { - info->flags = CLUSTER_FLAG_NEXT_NULL; - info->data = 0; + return cluster_index(si, ci) * SWAPFILE_CLUSTER; } static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, @@ -394,65 +444,11 @@ static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si, spin_unlock(&si->lock); } -static inline bool cluster_list_empty(struct swap_cluster_list *list) -{ - return cluster_is_null(&list->head); -} - -static inline unsigned int cluster_list_first(struct swap_cluster_list *list) -{ - return cluster_next(&list->head); -} - -static void cluster_list_init(struct swap_cluster_list *list) -{ - cluster_set_null(&list->head); - cluster_set_null(&list->tail); -} - -static void cluster_list_add_tail(struct swap_cluster_list *list, - struct swap_cluster_info *ci, - unsigned int idx) -{ - if (cluster_list_empty(list)) { - cluster_set_next_flag(&list->head, idx, 0); - cluster_set_next_flag(&list->tail, idx, 0); - } else { - struct swap_cluster_info *ci_tail; - unsigned int tail = cluster_next(&list->tail); - - /* - * Nested cluster lock, but both cluster locks are - * only acquired when we held swap_info_struct->lock - */ - ci_tail = ci + tail; - spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING); - cluster_set_next(ci_tail, idx); - spin_unlock(&ci_tail->lock); - cluster_set_next_flag(&list->tail, idx, 0); - } -} - -static unsigned int cluster_list_del_first(struct swap_cluster_list *list, - struct swap_cluster_info *ci) -{ - unsigned int idx; - - idx = cluster_next(&list->head); - if (cluster_next(&list->tail) == idx) { - cluster_set_null(&list->head); - cluster_set_null(&list->tail); - } else - cluster_set_next_flag(&list->head, - cluster_next(&ci[idx]), 0); - - return idx; -} - /* Add a cluster to discard list and schedule it to do discard */ static void swap_cluster_schedule_discard(struct swap_info_struct *si, - unsigned int idx) + struct swap_cluster_info *ci) { + unsigned int idx = cluster_index(si, ci); /* * If scan_swap_map_slots() can't find a free cluster, it will check * si->swap_map directly. To make sure the discarding cluster isn't @@ -462,17 +458,23 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, memset(si->swap_map + idx * SWAPFILE_CLUSTER, SWAP_MAP_BAD, SWAPFILE_CLUSTER); - cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx); - + VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); + list_move_tail(&ci->list, &si->discard_clusters); + ci->flags = 0; schedule_work(&si->discard_work); } -static void __free_cluster(struct swap_info_struct *si, unsigned long idx) +static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { - struct swap_cluster_info *ci = si->cluster_info; + lockdep_assert_held(&si->lock); + lockdep_assert_held(&ci->lock); - cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE); - cluster_list_add_tail(&si->free_clusters, ci, idx); + if (ci->flags) + list_move_tail(&ci->list, &si->free_clusters); + else + list_add_tail(&ci->list, &si->free_clusters); + ci->flags = CLUSTER_FLAG_FREE; + ci->order = 0; } /* @@ -481,24 +483,24 @@ static void __free_cluster(struct swap_info_struct *si, unsigned long idx) */ static void swap_do_scheduled_discard(struct swap_info_struct *si) { - struct swap_cluster_info *info, *ci; + struct swap_cluster_info *ci; unsigned int idx; - info = si->cluster_info; - - while (!cluster_list_empty(&si->discard_clusters)) { - idx = cluster_list_del_first(&si->discard_clusters, info); + while (!list_empty(&si->discard_clusters)) { + ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list); + list_del(&ci->list); + idx = cluster_index(si, ci); spin_unlock(&si->lock); discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, SWAPFILE_CLUSTER); spin_lock(&si->lock); - ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); - __free_cluster(si, idx); + spin_lock(&ci->lock); + __free_cluster(si, ci); memset(si->swap_map + idx * SWAPFILE_CLUSTER, 0, SWAPFILE_CLUSTER); - unlock_cluster(ci); + spin_unlock(&ci->lock); } } @@ -521,20 +523,15 @@ static void swap_users_ref_free(struct percpu_ref *ref) complete(&si->comp); } -static void alloc_cluster(struct swap_info_struct *si, unsigned long idx) +static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { - struct swap_cluster_info *ci = si->cluster_info; + VM_BUG_ON(ci->count != 0); + lockdep_assert_held(&si->lock); + lockdep_assert_held(&ci->lock); - VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx); - cluster_list_del_first(&si->free_clusters, ci); - cluster_set_count_flag(ci + idx, 0, 0); -} + if (ci->flags & CLUSTER_FLAG_FRAG) + si->frag_cluster_nr[ci->order]--; -static void free_cluster(struct swap_info_struct *si, unsigned long idx) -{ - struct swap_cluster_info *ci = si->cluster_info + idx; - - VM_BUG_ON(cluster_count(ci) != 0); /* * If the swap is discardable, prepare discard the cluster * instead of free it immediately. The cluster will be freed @@ -542,175 +539,371 @@ static void free_cluster(struct swap_info_struct *si, unsigned long idx) */ if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == (SWP_WRITEOK | SWP_PAGE_DISCARD)) { - swap_cluster_schedule_discard(si, idx); + swap_cluster_schedule_discard(si, ci); return; } - __free_cluster(si, idx); + __free_cluster(si, ci); } /* - * The cluster corresponding to page_nr will be used. The cluster will be - * removed from free cluster list and its usage counter will be increased by - * count. + * The cluster corresponding to page_nr will be used. The cluster will not be + * added to free cluster list and its usage counter will be increased by 1. + * Only used for initialization. */ -static void add_cluster_info_page(struct swap_info_struct *p, - struct swap_cluster_info *cluster_info, unsigned long page_nr, - unsigned long count) -{ - unsigned long idx = page_nr / SWAPFILE_CLUSTER; - - if (!cluster_info) - return; - if (cluster_is_free(&cluster_info[idx])) - alloc_cluster(p, idx); - - VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER); - cluster_set_count(&cluster_info[idx], - cluster_count(&cluster_info[idx]) + count); -} - -/* - * The cluster corresponding to page_nr will be used. The cluster will be - * removed from free cluster list and its usage counter will be increased by 1. - */ -static void inc_cluster_info_page(struct swap_info_struct *p, - struct swap_cluster_info *cluster_info, unsigned long page_nr) -{ - add_cluster_info_page(p, cluster_info, page_nr, 1); -} - -/* - * The cluster corresponding to page_nr decreases one usage. If the usage - * counter becomes 0, which means no page in the cluster is in using, we can - * optionally discard the cluster and add it to free cluster list. - */ -static void dec_cluster_info_page(struct swap_info_struct *p, +static void inc_cluster_info_page(struct swap_info_struct *si, struct swap_cluster_info *cluster_info, unsigned long page_nr) { unsigned long idx = page_nr / SWAPFILE_CLUSTER; + struct swap_cluster_info *ci; if (!cluster_info) return; - VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0); - cluster_set_count(&cluster_info[idx], - cluster_count(&cluster_info[idx]) - 1); + ci = cluster_info + idx; + ci->count++; - if (cluster_count(&cluster_info[idx]) == 0) - free_cluster(p, idx); + VM_BUG_ON(ci->count > SWAPFILE_CLUSTER); + VM_BUG_ON(ci->flags); } /* - * It's possible scan_swap_map_slots() uses a free cluster in the middle of free - * cluster list. Avoiding such abuse to avoid list corruption. + * The cluster ci decreases @nr_pages usage. If the usage counter becomes 0, + * which means no page in the cluster is in use, we can optionally discard + * the cluster and add it to free cluster list. */ -static bool -scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, - unsigned long offset, int order) +static void dec_cluster_info_page(struct swap_info_struct *si, + struct swap_cluster_info *ci, int nr_pages) { - struct percpu_cluster *percpu_cluster; - bool conflict; + if (!si->cluster_info) + return; - offset /= SWAPFILE_CLUSTER; - conflict = !cluster_list_empty(&si->free_clusters) && - offset != cluster_list_first(&si->free_clusters) && - cluster_is_free(&si->cluster_info[offset]); + VM_BUG_ON(ci->count < nr_pages); + VM_BUG_ON(cluster_is_free(ci)); + lockdep_assert_held(&si->lock); + lockdep_assert_held(&ci->lock); + ci->count -= nr_pages; - if (!conflict) - return false; + if (!ci->count) { + free_cluster(si, ci); + return; + } - percpu_cluster = this_cpu_ptr(si->percpu_cluster); - percpu_cluster->next[order] = SWAP_NEXT_INVALID; - return true; + if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { + VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); + if (ci->flags & CLUSTER_FLAG_FRAG) + si->frag_cluster_nr[ci->order]--; + list_move_tail(&ci->list, &si->nonfull_clusters[ci->order]); + ci->flags = CLUSTER_FLAG_NONFULL; + } } -static inline bool swap_range_empty(char *swap_map, unsigned int start, - unsigned int nr_pages) +static bool cluster_reclaim_range(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned long start, unsigned long end) { - unsigned int i; + unsigned char *map = si->swap_map; + unsigned long offset; - for (i = 0; i < nr_pages; i++) { - if (swap_map[start + i]) + spin_unlock(&ci->lock); + spin_unlock(&si->lock); + + for (offset = start; offset < end; offset++) { + switch (READ_ONCE(map[offset])) { + case 0: + continue; + case SWAP_HAS_CACHE: + if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0) + continue; + goto out; + default: + goto out; + } + } +out: + spin_lock(&si->lock); + spin_lock(&ci->lock); + + /* + * Recheck the range no matter reclaim succeeded or not, the slot + * could have been be freed while we are not holding the lock. + */ + for (offset = start; offset < end; offset++) + if (READ_ONCE(map[offset])) return false; - } return true; } +static bool cluster_scan_range(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned long start, unsigned int nr_pages) +{ + unsigned long offset, end = start + nr_pages; + unsigned char *map = si->swap_map; + bool need_reclaim = false; + + for (offset = start; offset < end; offset++) { + switch (READ_ONCE(map[offset])) { + case 0: + continue; + case SWAP_HAS_CACHE: + if (!vm_swap_full()) + return false; + need_reclaim = true; + continue; + default: + return false; + } + } + + if (need_reclaim) + return cluster_reclaim_range(si, ci, start, end); + + return true; +} + +static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, + unsigned int start, unsigned char usage, + unsigned int order) +{ + unsigned int nr_pages = 1 << order; + + if (cluster_is_free(ci)) { + if (nr_pages < SWAPFILE_CLUSTER) { + list_move_tail(&ci->list, &si->nonfull_clusters[order]); + ci->flags = CLUSTER_FLAG_NONFULL; + } + ci->order = order; + } + + memset(si->swap_map + start, usage, nr_pages); + swap_range_alloc(si, start, nr_pages); + ci->count += nr_pages; + + if (ci->count == SWAPFILE_CLUSTER) { + VM_BUG_ON(!(ci->flags & + (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG))); + if (ci->flags & CLUSTER_FLAG_FRAG) + si->frag_cluster_nr[ci->order]--; + list_move_tail(&ci->list, &si->full_clusters); + ci->flags = CLUSTER_FLAG_FULL; + } +} + +static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset, + unsigned int *foundp, unsigned int order, + unsigned char usage) +{ + unsigned long start = offset & ~(SWAPFILE_CLUSTER - 1); + unsigned long end = min(start + SWAPFILE_CLUSTER, si->max); + unsigned int nr_pages = 1 << order; + struct swap_cluster_info *ci; + + if (end < nr_pages) + return SWAP_NEXT_INVALID; + end -= nr_pages; + + ci = lock_cluster(si, offset); + if (ci->count + nr_pages > SWAPFILE_CLUSTER) { + offset = SWAP_NEXT_INVALID; + goto done; + } + + while (offset <= end) { + if (cluster_scan_range(si, ci, offset, nr_pages)) { + cluster_alloc_range(si, ci, offset, usage, order); + *foundp = offset; + if (ci->count == SWAPFILE_CLUSTER) { + offset = SWAP_NEXT_INVALID; + goto done; + } + offset += nr_pages; + break; + } + offset += nr_pages; + } + if (offset > end) + offset = SWAP_NEXT_INVALID; +done: + unlock_cluster(ci); + return offset; +} + +static void swap_reclaim_full_clusters(struct swap_info_struct *si) +{ + long to_scan = 1; + unsigned long offset, end; + struct swap_cluster_info *ci; + unsigned char *map = si->swap_map; + int nr_reclaim, total_reclaimed = 0; + + if (atomic_long_read(&nr_swap_pages) <= SWAPFILE_CLUSTER) + to_scan = si->inuse_pages / SWAPFILE_CLUSTER; + + while (!list_empty(&si->full_clusters)) { + ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list); + list_move_tail(&ci->list, &si->full_clusters); + offset = cluster_offset(si, ci); + end = min(si->max, offset + SWAPFILE_CLUSTER); + to_scan--; + + while (offset < end) { + if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) { + spin_unlock(&si->lock); + nr_reclaim = __try_to_reclaim_swap(si, offset, + TTRS_ANYWAY | TTRS_DIRECT); + spin_lock(&si->lock); + if (nr_reclaim > 0) { + offset += nr_reclaim; + total_reclaimed += nr_reclaim; + continue; + } else if (nr_reclaim < 0) { + offset += -nr_reclaim; + continue; + } + } + offset++; + } + if (to_scan <= 0 || total_reclaimed) + break; + } +} + /* * Try to get swap entries with specified order from current cpu's swap entry * pool (a cluster). This might involve allocating a new cluster for current CPU * too. */ -static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, - unsigned long *offset, unsigned long *scan_base, int order) +static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order, + unsigned char usage) { - unsigned int nr_pages = 1 << order; struct percpu_cluster *cluster; struct swap_cluster_info *ci; - unsigned int tmp, max; + unsigned int offset, found = 0; new_cluster: + lockdep_assert_held(&si->lock); cluster = this_cpu_ptr(si->percpu_cluster); - tmp = cluster->next[order]; - if (tmp == SWAP_NEXT_INVALID) { - if (!cluster_list_empty(&si->free_clusters)) { - tmp = cluster_next(&si->free_clusters.head) * - SWAPFILE_CLUSTER; - } else if (!cluster_list_empty(&si->discard_clusters)) { - /* - * we don't have free cluster but have some clusters in - * discarding, do discard now and reclaim them, then - * reread cluster_next_cpu since we dropped si->lock - */ - swap_do_scheduled_discard(si); - *scan_base = this_cpu_read(*si->cluster_next_cpu); - *offset = *scan_base; - goto new_cluster; - } else - return false; + offset = cluster->next[order]; + if (offset) { + offset = alloc_swap_scan_cluster(si, offset, &found, order, usage); + if (found) + goto done; } - /* - * Other CPUs can use our cluster if they can't find a free cluster, - * check if there is still free entry in the cluster, maintaining - * natural alignment. - */ - max = min_t(unsigned long, si->max, ALIGN(tmp + 1, SWAPFILE_CLUSTER)); - if (tmp < max) { - ci = lock_cluster(si, tmp); - while (tmp < max) { - if (swap_range_empty(si->swap_map, tmp, nr_pages)) - break; - tmp += nr_pages; - } - unlock_cluster(ci); + if (!list_empty(&si->free_clusters)) { + ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage); + VM_BUG_ON(!found); + goto done; } - if (tmp >= max) { - cluster->next[order] = SWAP_NEXT_INVALID; + + if (order < PMD_ORDER) { + unsigned int frags = 0; + + while (!list_empty(&si->nonfull_clusters[order])) { + ci = list_first_entry(&si->nonfull_clusters[order], + struct swap_cluster_info, list); + list_move_tail(&ci->list, &si->frag_clusters[order]); + ci->flags = CLUSTER_FLAG_FRAG; + si->frag_cluster_nr[order]++; + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, order, usage); + frags++; + if (found) + break; + } + + if (!found) { + /* + * Nonfull clusters are moved to frag tail if we reached + * here, count them too, don't over scan the frag list. + */ + while (frags < si->frag_cluster_nr[order]) { + ci = list_first_entry(&si->frag_clusters[order], + struct swap_cluster_info, list); + /* + * Rotate the frag list to iterate, they were all failing + * high order allocation or moved here due to per-CPU usage, + * this help keeping usable cluster ahead. + */ + list_move_tail(&ci->list, &si->frag_clusters[order]); + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, order, usage); + frags++; + if (found) + break; + } + } + } + + if (found) + goto done; + + if (!list_empty(&si->discard_clusters)) { + /* + * we don't have free cluster but have some clusters in + * discarding, do discard now and reclaim them, then + * reread cluster_next_cpu since we dropped si->lock + */ + swap_do_scheduled_discard(si); goto new_cluster; } - *offset = tmp; - *scan_base = tmp; - tmp += nr_pages; - cluster->next[order] = tmp < max ? tmp : SWAP_NEXT_INVALID; - return true; + + if (order) + goto done; + + /* Order 0 stealing from higher order */ + for (int o = 1; o < SWAP_NR_ORDERS; o++) { + /* + * Clusters here have at least one usable slots and can't fail order 0 + * allocation, but reclaim may drop si->lock and race with another user. + */ + while (!list_empty(&si->frag_clusters[o])) { + ci = list_first_entry(&si->frag_clusters[o], + struct swap_cluster_info, list); + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, 0, usage); + if (found) + goto done; + } + + while (!list_empty(&si->nonfull_clusters[o])) { + ci = list_first_entry(&si->nonfull_clusters[o], + struct swap_cluster_info, list); + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, 0, usage); + if (found) + goto done; + } + } + +done: + /* Try reclaim from full clusters if device is nearfull */ + if (vm_swap_full() && (!found || (si->pages - si->inuse_pages) < SWAPFILE_CLUSTER)) { + swap_reclaim_full_clusters(si); + if (!found && !order && si->pages != si->inuse_pages) + goto new_cluster; + } + + cluster->next[order] = offset; + return found; } -static void __del_from_avail_list(struct swap_info_struct *p) +static void __del_from_avail_list(struct swap_info_struct *si) { int nid; - assert_spin_locked(&p->lock); + assert_spin_locked(&si->lock); for_each_node(nid) - plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]); + plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]); } -static void del_from_avail_list(struct swap_info_struct *p) +static void del_from_avail_list(struct swap_info_struct *si) { spin_lock(&swap_avail_lock); - __del_from_avail_list(p); + __del_from_avail_list(si); spin_unlock(&swap_avail_lock); } @@ -731,13 +924,13 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, } } -static void add_to_avail_list(struct swap_info_struct *p) +static void add_to_avail_list(struct swap_info_struct *si) { int nid; spin_lock(&swap_avail_lock); for_each_node(nid) - plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]); + plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]); spin_unlock(&swap_avail_lock); } @@ -747,6 +940,14 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, unsigned long begin = offset; unsigned long end = offset + nr_entries - 1; void (*swap_slot_free_notify)(struct block_device *, unsigned long); + unsigned int i; + + /* + * Use atomic clear_bit operations only on zeromap instead of non-atomic + * bitmap_clear to prevent adjacent bits corruption due to simultaneous writes. + */ + for (i = 0; i < nr_entries; i++) + clear_bit(offset + i, si->zeromap); if (offset < si->lowest_bit) si->lowest_bit = offset; @@ -822,11 +1023,29 @@ static bool swap_offset_available_and_locked(struct swap_info_struct *si, return false; } +static int cluster_alloc_swap(struct swap_info_struct *si, + unsigned char usage, int nr, + swp_entry_t slots[], int order) +{ + int n_ret = 0; + + VM_BUG_ON(!si->cluster_info); + + while (n_ret < nr) { + unsigned long offset = cluster_alloc_swap_entry(si, order, usage); + + if (!offset) + break; + slots[n_ret++] = swp_entry(si->type, offset); + } + + return n_ret; +} + static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[], int order) { - struct swap_cluster_info *ci; unsigned long offset; unsigned long scan_base; unsigned long last_in_cluster = 0; @@ -865,26 +1084,16 @@ static int scan_swap_map_slots(struct swap_info_struct *si, return 0; } + if (si->cluster_info) + return cluster_alloc_swap(si, usage, nr, slots, order); + si->flags += SWP_SCANNING; - /* - * Use percpu scan base for SSD to reduce lock contention on - * cluster and swap cache. For HDD, sequential access is more - * important. - */ - if (si->flags & SWP_SOLIDSTATE) - scan_base = this_cpu_read(*si->cluster_next_cpu); - else - scan_base = si->cluster_next; + + /* For HDD, sequential access is more important. */ + scan_base = si->cluster_next; offset = scan_base; - /* SSD algorithm */ - if (si->cluster_info) { - if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) { - if (order > 0) - goto no_page; - goto scan; - } - } else if (unlikely(!si->cluster_nr--)) { + if (unlikely(!si->cluster_nr--)) { if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; @@ -895,8 +1104,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, /* * If seek is expensive, start searching for new cluster from * start of partition, to minimize the span of allocated swap. - * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info - * case, just handled by scan_swap_map_try_ssd_cluster() above. */ scan_base = offset = si->lowest_bit; last_in_cluster = offset + SWAPFILE_CLUSTER - 1; @@ -924,19 +1131,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, } checks: - if (si->cluster_info) { - while (scan_swap_map_ssd_cluster_conflict(si, offset, order)) { - /* take a break if we already got some slots */ - if (n_ret) - goto done; - if (!scan_swap_map_try_ssd_cluster(si, &offset, - &scan_base, order)) { - if (order > 0) - goto no_page; - goto scan; - } - } - } if (!(si->flags & SWP_WRITEOK)) goto no_page; if (!si->highest_bit) @@ -944,13 +1138,11 @@ static int scan_swap_map_slots(struct swap_info_struct *si, if (offset > si->highest_bit) scan_base = offset = si->lowest_bit; - ci = lock_cluster(si, offset); /* reuse swap entry of cache-only swap if not busy. */ if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { int swap_was_freed; - unlock_cluster(ci); spin_unlock(&si->lock); - swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); + swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); spin_lock(&si->lock); /* entry was freed successfully, try to use this again */ if (swap_was_freed > 0) @@ -959,15 +1151,12 @@ static int scan_swap_map_slots(struct swap_info_struct *si, } if (si->swap_map[offset]) { - unlock_cluster(ci); if (!n_ret) goto scan; else goto done; } memset(si->swap_map + offset, usage, nr_pages); - add_cluster_info_page(si, si->cluster_info, offset, nr_pages); - unlock_cluster(ci); swap_range_alloc(si, offset, nr_pages); slots[n_ret++] = swp_entry(si->type, offset); @@ -988,13 +1177,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, latency_ration = LATENCY_LIMIT; } - /* try to get more slots in cluster */ - if (si->cluster_info) { - if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) - goto checks; - if (order > 0) - goto done; - } else if (si->cluster_nr && !si->swap_map[++offset]) { + if (si->cluster_nr && !si->swap_map[++offset]) { /* non-ssd case, still more slots in cluster? */ --si->cluster_nr; goto checks; @@ -1055,19 +1238,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, return n_ret; } -static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) -{ - unsigned long offset = idx * SWAPFILE_CLUSTER; - struct swap_cluster_info *ci; - - ci = lock_cluster(si, offset); - memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); - cluster_set_count_flag(ci, 0, 0); - free_cluster(si, idx); - unlock_cluster(ci); - swap_range_free(si, offset, SWAPFILE_CLUSTER); -} - int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order) { int order = swap_entry_order(entry_order); @@ -1148,22 +1318,22 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order) static struct swap_info_struct *_swap_info_get(swp_entry_t entry) { - struct swap_info_struct *p; + struct swap_info_struct *si; unsigned long offset; if (!entry.val) goto out; - p = swp_swap_info(entry); - if (!p) + si = swp_swap_info(entry); + if (!si) goto bad_nofile; - if (data_race(!(p->flags & SWP_USED))) + if (data_race(!(si->flags & SWP_USED))) goto bad_device; offset = swp_offset(entry); - if (offset >= p->max) + if (offset >= si->max) goto bad_offset; - if (data_race(!p->swap_map[swp_offset(entry)])) + if (data_race(!si->swap_map[swp_offset(entry)])) goto bad_free; - return p; + return si; bad_free: pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val); @@ -1196,14 +1366,14 @@ static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, return p; } -static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, +static unsigned char __swap_entry_free_locked(struct swap_info_struct *si, unsigned long offset, unsigned char usage) { unsigned char count; unsigned char has_cache; - count = p->swap_map[offset]; + count = si->swap_map[offset]; has_cache = count & SWAP_HAS_CACHE; count &= ~SWAP_HAS_CACHE; @@ -1219,7 +1389,7 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, count = 0; } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { if (count == COUNT_CONTINUED) { - if (swap_count_continued(p, offset, count)) + if (swap_count_continued(si, offset, count)) count = SWAP_MAP_MAX | COUNT_CONTINUED; else count = SWAP_MAP_MAX; @@ -1229,9 +1399,9 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, usage = count | has_cache; if (usage) - WRITE_ONCE(p->swap_map[offset], usage); + WRITE_ONCE(si->swap_map[offset], usage); else - WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE); + WRITE_ONCE(si->swap_map[offset], SWAP_HAS_CACHE); return usage; } @@ -1310,66 +1480,121 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry) return NULL; } -static unsigned char __swap_entry_free(struct swap_info_struct *p, +static unsigned char __swap_entry_free(struct swap_info_struct *si, swp_entry_t entry) { struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); unsigned char usage; - ci = lock_cluster_or_swap_info(p, offset); - usage = __swap_entry_free_locked(p, offset, 1); - unlock_cluster_or_swap_info(p, ci); + ci = lock_cluster_or_swap_info(si, offset); + usage = __swap_entry_free_locked(si, offset, 1); + unlock_cluster_or_swap_info(si, ci); if (!usage) free_swap_slot(entry); return usage; } -static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) +static bool __swap_entries_free(struct swap_info_struct *si, + swp_entry_t entry, int nr) { - struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); + unsigned int type = swp_type(entry); + struct swap_cluster_info *ci; + bool has_cache = false; unsigned char count; + int i; - ci = lock_cluster(p, offset); - count = p->swap_map[offset]; - VM_BUG_ON(count != SWAP_HAS_CACHE); - p->swap_map[offset] = 0; - dec_cluster_info_page(p, p->cluster_info, offset); - unlock_cluster(ci); + if (nr <= 1 || swap_count(data_race(si->swap_map[offset])) != 1) + goto fallback; + /* cross into another cluster */ + if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER) + goto fallback; - mem_cgroup_uncharge_swap(entry, 1); - swap_range_free(p, offset, 1); + ci = lock_cluster_or_swap_info(si, offset); + if (!swap_is_last_map(si, offset, nr, &has_cache)) { + unlock_cluster_or_swap_info(si, ci); + goto fallback; + } + for (i = 0; i < nr; i++) + WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); + unlock_cluster_or_swap_info(si, ci); + + if (!has_cache) { + for (i = 0; i < nr; i++) + zswap_invalidate(swp_entry(si->type, offset + i)); + spin_lock(&si->lock); + swap_entry_range_free(si, entry, nr); + spin_unlock(&si->lock); + } + return has_cache; + +fallback: + for (i = 0; i < nr; i++) { + if (data_race(si->swap_map[offset + i])) { + count = __swap_entry_free(si, swp_entry(type, offset + i)); + if (count == SWAP_HAS_CACHE) + has_cache = true; + } else { + WARN_ON_ONCE(1); + } + } + return has_cache; } -static void cluster_swap_free_nr(struct swap_info_struct *sis, - unsigned long offset, int nr_pages) +/* + * Drop the last HAS_CACHE flag of swap entries, caller have to + * ensure all entries belong to the same cgroup. + */ +static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, + unsigned int nr_pages) +{ + unsigned long offset = swp_offset(entry); + unsigned char *map = si->swap_map + offset; + unsigned char *map_end = map + nr_pages; + struct swap_cluster_info *ci; + + ci = lock_cluster(si, offset); + do { + VM_BUG_ON(*map != SWAP_HAS_CACHE); + *map = 0; + } while (++map < map_end); + dec_cluster_info_page(si, ci, nr_pages); + unlock_cluster(ci); + + mem_cgroup_uncharge_swap(entry, nr_pages); + swap_range_free(si, offset, nr_pages); +} + +static void cluster_swap_free_nr(struct swap_info_struct *si, + unsigned long offset, int nr_pages, + unsigned char usage) { struct swap_cluster_info *ci; DECLARE_BITMAP(to_free, BITS_PER_LONG) = { 0 }; int i, nr; - ci = lock_cluster_or_swap_info(sis, offset); + ci = lock_cluster_or_swap_info(si, offset); while (nr_pages) { nr = min(BITS_PER_LONG, nr_pages); for (i = 0; i < nr; i++) { - if (!__swap_entry_free_locked(sis, offset + i, 1)) + if (!__swap_entry_free_locked(si, offset + i, usage)) bitmap_set(to_free, i, 1); } if (!bitmap_empty(to_free, BITS_PER_LONG)) { - unlock_cluster_or_swap_info(sis, ci); + unlock_cluster_or_swap_info(si, ci); for_each_set_bit(i, to_free, BITS_PER_LONG) - free_swap_slot(swp_entry(sis->type, offset + i)); + free_swap_slot(swp_entry(si->type, offset + i)); if (nr == nr_pages) return; bitmap_clear(to_free, 0, BITS_PER_LONG); - ci = lock_cluster_or_swap_info(sis, offset); + ci = lock_cluster_or_swap_info(si, offset); } offset += nr; nr_pages -= nr; } - unlock_cluster_or_swap_info(sis, ci); + unlock_cluster_or_swap_info(si, ci); } /* @@ -1388,7 +1613,7 @@ void swap_free_nr(swp_entry_t entry, int nr_pages) while (nr_pages) { nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); - cluster_swap_free_nr(sis, offset, nr); + cluster_swap_free_nr(sis, offset, nr, 1); offset += nr; nr_pages -= nr; } @@ -1400,12 +1625,8 @@ void swap_free_nr(swp_entry_t entry, int nr_pages) void put_swap_folio(struct folio *folio, swp_entry_t entry) { unsigned long offset = swp_offset(entry); - unsigned long idx = offset / SWAPFILE_CLUSTER; struct swap_cluster_info *ci; struct swap_info_struct *si; - unsigned char *map; - unsigned int i, free_entries = 0; - unsigned char val; int size = 1 << swap_entry_order(folio_order(folio)); si = _swap_info_get(entry); @@ -1413,24 +1634,14 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) return; ci = lock_cluster_or_swap_info(si, offset); - if (size == SWAPFILE_CLUSTER) { - map = si->swap_map + offset; - for (i = 0; i < SWAPFILE_CLUSTER; i++) { - val = map[i]; - VM_BUG_ON(!(val & SWAP_HAS_CACHE)); - if (val == SWAP_HAS_CACHE) - free_entries++; - } - if (free_entries == SWAPFILE_CLUSTER) { - unlock_cluster_or_swap_info(si, ci); - spin_lock(&si->lock); - mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); - swap_free_cluster(si, idx); - spin_unlock(&si->lock); - return; - } + if (size > 1 && swap_is_has_cache(si, offset, size)) { + unlock_cluster_or_swap_info(si, ci); + spin_lock(&si->lock); + swap_entry_range_free(si, entry, size); + spin_unlock(&si->lock); + return; } - for (i = 0; i < size; i++, entry.val++) { + for (int i = 0; i < size; i++, entry.val++) { if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { unlock_cluster_or_swap_info(si, ci); free_swap_slot(entry); @@ -1470,7 +1681,7 @@ void swapcache_free_entries(swp_entry_t *entries, int n) for (i = 0; i < n; ++i) { p = swap_info_get_cont(entries[i], prev); if (p) - swap_entry_free(p, entries[i]); + swap_entry_range_free(p, entries[i], 1); prev = p; } if (p) @@ -1509,28 +1720,28 @@ int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) int swp_swapcount(swp_entry_t entry) { int count, tmp_count, n; - struct swap_info_struct *p; + struct swap_info_struct *si; struct swap_cluster_info *ci; struct page *page; pgoff_t offset; unsigned char *map; - p = _swap_info_get(entry); - if (!p) + si = _swap_info_get(entry); + if (!si) return 0; offset = swp_offset(entry); - ci = lock_cluster_or_swap_info(p, offset); + ci = lock_cluster_or_swap_info(si, offset); - count = swap_count(p->swap_map[offset]); + count = swap_count(si->swap_map[offset]); if (!(count & COUNT_CONTINUED)) goto out; count &= ~COUNT_CONTINUED; n = SWAP_MAP_MAX + 1; - page = vmalloc_to_page(p->swap_map + offset); + page = vmalloc_to_page(si->swap_map + offset); offset &= ~PAGE_MASK; VM_BUG_ON(page_private(page) != SWP_CONTINUED); @@ -1544,7 +1755,7 @@ int swp_swapcount(swp_entry_t entry) n *= (SWAP_CONT_MAX + 1); } while (tmp_count & COUNT_CONTINUED); out: - unlock_cluster_or_swap_info(p, ci); + unlock_cluster_or_swap_info(si, ci); return count; } @@ -1590,16 +1801,7 @@ static bool folio_swapped(struct folio *folio) return swap_page_trans_huge_swapped(si, entry, folio_order(folio)); } -/** - * folio_free_swap() - Free the swap space used for this folio. - * @folio: The folio to remove. - * - * If swap is getting full, or if there are no more mappings of this folio, - * then call folio_free_swap to free its swap space. - * - * Return: true if we were able to release the swap space. - */ -bool folio_free_swap(struct folio *folio) +static bool folio_swapcache_freeable(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); @@ -1607,8 +1809,6 @@ bool folio_free_swap(struct folio *folio) return false; if (folio_test_writeback(folio)) return false; - if (folio_swapped(folio)) - return false; /* * Once hibernation has begun to create its image of memory, @@ -1628,6 +1828,25 @@ bool folio_free_swap(struct folio *folio) if (pm_suspended_storage()) return false; + return true; +} + +/** + * folio_free_swap() - Free the swap space used for this folio. + * @folio: The folio to remove. + * + * If swap is getting full, or if there are no more mappings of this folio, + * then call folio_free_swap to free its swap space. + * + * Return: true if we were able to release the swap space. + */ +bool folio_free_swap(struct folio *folio) +{ + if (!folio_swapcache_freeable(folio)) + return false; + if (folio_swapped(folio)) + return false; + delete_from_swap_cache(folio); folio_set_dirty(folio); return true; @@ -1647,11 +1866,9 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) { const unsigned long start_offset = swp_offset(entry); const unsigned long end_offset = start_offset + nr; - unsigned int type = swp_type(entry); struct swap_info_struct *si; bool any_only_cache = false; unsigned long offset; - unsigned char count; if (non_swap_entry(entry)) return; @@ -1666,15 +1883,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) /* * First free all entries in the range. */ - for (offset = start_offset; offset < end_offset; offset++) { - if (data_race(si->swap_map[offset])) { - count = __swap_entry_free(si, swp_entry(type, offset)); - if (count == SWAP_HAS_CACHE) - any_only_cache = true; - } else { - WARN_ON_ONCE(1); - } - } + any_only_cache = __swap_entries_free(si, entry, nr); /* * Short-circuit the below loop if none of the entries had their @@ -1704,7 +1913,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) * to the next boundary. */ nr = __try_to_reclaim_swap(si, offset, - TTRS_UNMAPPED | TTRS_FULL); + TTRS_UNMAPPED | TTRS_FULL); if (nr == 0) nr = 1; else if (nr < 0) @@ -1979,7 +2188,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, folio = swap_cache_get_folio(entry, vma, addr); if (!folio) { - struct page *page; struct vm_fault vmf = { .vma = vma, .address = addr, @@ -1987,10 +2195,8 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, .pmd = pmd, }; - page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, + folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf); - if (page) - folio = page_folio(page); } if (!folio) { swp_count = READ_ONCE(si->swap_map[offset]); @@ -2397,52 +2603,54 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) return generic_swapfile_activate(sis, swap_file, span); } -static int swap_node(struct swap_info_struct *p) +static int swap_node(struct swap_info_struct *si) { struct block_device *bdev; - if (p->bdev) - bdev = p->bdev; + if (si->bdev) + bdev = si->bdev; else - bdev = p->swap_file->f_inode->i_sb->s_bdev; + bdev = si->swap_file->f_inode->i_sb->s_bdev; return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; } -static void setup_swap_info(struct swap_info_struct *p, int prio, +static void setup_swap_info(struct swap_info_struct *si, int prio, unsigned char *swap_map, - struct swap_cluster_info *cluster_info) + struct swap_cluster_info *cluster_info, + unsigned long *zeromap) { int i; if (prio >= 0) - p->prio = prio; + si->prio = prio; else - p->prio = --least_priority; + si->prio = --least_priority; /* * the plist prio is negated because plist ordering is * low-to-high, while swap ordering is high-to-low */ - p->list.prio = -p->prio; + si->list.prio = -si->prio; for_each_node(i) { - if (p->prio >= 0) - p->avail_lists[i].prio = -p->prio; + if (si->prio >= 0) + si->avail_lists[i].prio = -si->prio; else { - if (swap_node(p) == i) - p->avail_lists[i].prio = 1; + if (swap_node(si) == i) + si->avail_lists[i].prio = 1; else - p->avail_lists[i].prio = -p->prio; + si->avail_lists[i].prio = -si->prio; } } - p->swap_map = swap_map; - p->cluster_info = cluster_info; + si->swap_map = swap_map; + si->cluster_info = cluster_info; + si->zeromap = zeromap; } -static void _enable_swap_info(struct swap_info_struct *p) +static void _enable_swap_info(struct swap_info_struct *si) { - p->flags |= SWP_WRITEOK; - atomic_long_add(p->pages, &nr_swap_pages); - total_swap_pages += p->pages; + si->flags |= SWP_WRITEOK; + atomic_long_add(si->pages, &nr_swap_pages); + total_swap_pages += si->pages; assert_spin_locked(&swap_lock); /* @@ -2455,40 +2663,41 @@ static void _enable_swap_info(struct swap_info_struct *p) * which allocates swap pages from the highest available priority * swap_info_struct. */ - plist_add(&p->list, &swap_active_head); + plist_add(&si->list, &swap_active_head); /* add to available list iff swap device is not full */ - if (p->highest_bit) - add_to_avail_list(p); + if (si->highest_bit) + add_to_avail_list(si); } -static void enable_swap_info(struct swap_info_struct *p, int prio, +static void enable_swap_info(struct swap_info_struct *si, int prio, unsigned char *swap_map, - struct swap_cluster_info *cluster_info) + struct swap_cluster_info *cluster_info, + unsigned long *zeromap) { spin_lock(&swap_lock); - spin_lock(&p->lock); - setup_swap_info(p, prio, swap_map, cluster_info); - spin_unlock(&p->lock); + spin_lock(&si->lock); + setup_swap_info(si, prio, swap_map, cluster_info, zeromap); + spin_unlock(&si->lock); spin_unlock(&swap_lock); /* * Finished initializing swap device, now it's safe to reference it. */ - percpu_ref_resurrect(&p->users); + percpu_ref_resurrect(&si->users); spin_lock(&swap_lock); - spin_lock(&p->lock); - _enable_swap_info(p); - spin_unlock(&p->lock); + spin_lock(&si->lock); + _enable_swap_info(si); + spin_unlock(&si->lock); spin_unlock(&swap_lock); } -static void reinsert_swap_info(struct swap_info_struct *p) +static void reinsert_swap_info(struct swap_info_struct *si) { spin_lock(&swap_lock); - spin_lock(&p->lock); - setup_swap_info(p, p->prio, p->swap_map, p->cluster_info); - _enable_swap_info(p); - spin_unlock(&p->lock); + spin_lock(&si->lock); + setup_swap_info(si, si->prio, si->swap_map, si->cluster_info, si->zeromap); + _enable_swap_info(si); + spin_unlock(&si->lock); spin_unlock(&swap_lock); } @@ -2511,6 +2720,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; unsigned char *swap_map; + unsigned long *zeromap; struct swap_cluster_info *cluster_info; struct file *swap_file, *victim; struct address_space *mapping; @@ -2633,6 +2843,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->max = 0; swap_map = p->swap_map; p->swap_map = NULL; + zeromap = p->zeromap; + p->zeromap = NULL; cluster_info = p->cluster_info; p->cluster_info = NULL; spin_unlock(&p->lock); @@ -2645,6 +2857,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) free_percpu(p->cluster_next_cpu); p->cluster_next_cpu = NULL; vfree(swap_map); + kvfree(zeromap); kvfree(cluster_info); /* Destroy swap account information */ swap_cgroup_swapoff(p->type); @@ -2874,20 +3087,20 @@ static struct swap_info_struct *alloc_swap_info(void) return p; } -static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) +static int claim_swapfile(struct swap_info_struct *si, struct inode *inode) { if (S_ISBLK(inode->i_mode)) { - p->bdev = I_BDEV(inode); + si->bdev = I_BDEV(inode); /* * Zoned block devices contain zones that have a sequential * write only restriction. Hence zoned block devices are not * suitable for swapping. Disallow them here. */ - if (bdev_is_zoned(p->bdev)) + if (bdev_is_zoned(si->bdev)) return -EINVAL; - p->flags |= SWP_BLKDEV; + si->flags |= SWP_BLKDEV; } else if (S_ISREG(inode->i_mode)) { - p->bdev = inode->i_sb->s_bdev; + si->bdev = inode->i_sb->s_bdev; } return 0; @@ -2922,7 +3135,7 @@ __weak unsigned long arch_max_swapfile_size(void) return generic_max_swapfile_size(); } -static unsigned long read_swap_header(struct swap_info_struct *p, +static unsigned long read_swap_header(struct swap_info_struct *si, union swap_header *swap_header, struct inode *inode) { @@ -2953,9 +3166,9 @@ static unsigned long read_swap_header(struct swap_info_struct *p, return 0; } - p->lowest_bit = 1; - p->cluster_next = 1; - p->cluster_nr = 0; + si->lowest_bit = 1; + si->cluster_next = 1; + si->cluster_nr = 0; maxpages = swapfile_maximum_size; last_page = swap_header->info.last_page; @@ -2973,7 +3186,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p, if ((unsigned int)maxpages == 0) maxpages = UINT_MAX; } - p->highest_bit = maxpages - 1; + si->highest_bit = maxpages - 1; if (!maxpages) return 0; @@ -2997,25 +3210,18 @@ static unsigned long read_swap_header(struct swap_info_struct *p, #define SWAP_CLUSTER_COLS \ max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS) -static int setup_swap_map_and_extents(struct swap_info_struct *p, +static int setup_swap_map_and_extents(struct swap_info_struct *si, union swap_header *swap_header, unsigned char *swap_map, - struct swap_cluster_info *cluster_info, unsigned long maxpages, sector_t *span) { - unsigned int j, k; unsigned int nr_good_pages; + unsigned long i; int nr_extents; - unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); - unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS; - unsigned long i, idx; nr_good_pages = maxpages - 1; /* omit header page */ - cluster_list_init(&p->free_clusters); - cluster_list_init(&p->discard_clusters); - for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; if (page_nr == 0 || page_nr > swap_header->info.last_page) @@ -3023,40 +3229,87 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, if (page_nr < maxpages) { swap_map[page_nr] = SWAP_MAP_BAD; nr_good_pages--; - /* - * Haven't marked the cluster free yet, no list - * operation involved - */ - inc_cluster_info_page(p, cluster_info, page_nr); } } - /* Haven't marked the cluster free yet, no list operation involved */ - for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) - inc_cluster_info_page(p, cluster_info, i); - if (nr_good_pages) { swap_map[0] = SWAP_MAP_BAD; - /* - * Not mark the cluster free yet, no list - * operation involved - */ - inc_cluster_info_page(p, cluster_info, 0); - p->max = maxpages; - p->pages = nr_good_pages; - nr_extents = setup_swap_extents(p, span); + si->max = maxpages; + si->pages = nr_good_pages; + nr_extents = setup_swap_extents(si, span); if (nr_extents < 0) return nr_extents; - nr_good_pages = p->pages; + nr_good_pages = si->pages; } if (!nr_good_pages) { pr_warn("Empty swap-file\n"); return -EINVAL; } - if (!cluster_info) - return nr_extents; + return nr_extents; +} +static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, + union swap_header *swap_header, + unsigned long maxpages) +{ + unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); + unsigned long col = si->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS; + struct swap_cluster_info *cluster_info; + unsigned long i, j, k, idx; + int cpu, err = -ENOMEM; + + cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL); + if (!cluster_info) + goto err; + + for (i = 0; i < nr_clusters; i++) + spin_lock_init(&cluster_info[i].lock); + + si->cluster_next_cpu = alloc_percpu(unsigned int); + if (!si->cluster_next_cpu) + goto err_free; + + /* Random start position to help with wear leveling */ + for_each_possible_cpu(cpu) + per_cpu(*si->cluster_next_cpu, cpu) = + get_random_u32_inclusive(1, si->highest_bit); + + si->percpu_cluster = alloc_percpu(struct percpu_cluster); + if (!si->percpu_cluster) + goto err_free; + + for_each_possible_cpu(cpu) { + struct percpu_cluster *cluster; + + cluster = per_cpu_ptr(si->percpu_cluster, cpu); + for (i = 0; i < SWAP_NR_ORDERS; i++) + cluster->next[i] = SWAP_NEXT_INVALID; + } + + /* + * Mark unusable pages as unavailable. The clusters aren't + * marked free yet, so no list operations are involved yet. + * + * See setup_swap_map_and_extents(): header page, bad pages, + * and the EOF part of the last cluster. + */ + inc_cluster_info_page(si, cluster_info, 0); + for (i = 0; i < swap_header->info.nr_badpages; i++) + inc_cluster_info_page(si, cluster_info, + swap_header->info.badpages[i]); + for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) + inc_cluster_info_page(si, cluster_info, i); + + INIT_LIST_HEAD(&si->free_clusters); + INIT_LIST_HEAD(&si->full_clusters); + INIT_LIST_HEAD(&si->discard_clusters); + + for (i = 0; i < SWAP_NR_ORDERS; i++) { + INIT_LIST_HEAD(&si->nonfull_clusters[i]); + INIT_LIST_HEAD(&si->frag_clusters[i]); + si->frag_cluster_nr[i] = 0; + } /* * Reduce false cache line sharing between cluster_info and @@ -3065,22 +3318,32 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, for (k = 0; k < SWAP_CLUSTER_COLS; k++) { j = (k + col) % SWAP_CLUSTER_COLS; for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { + struct swap_cluster_info *ci; idx = i * SWAP_CLUSTER_COLS + j; + ci = cluster_info + idx; if (idx >= nr_clusters) continue; - if (cluster_count(&cluster_info[idx])) + if (ci->count) { + ci->flags = CLUSTER_FLAG_NONFULL; + list_add_tail(&ci->list, &si->nonfull_clusters[0]); continue; - cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); - cluster_list_add_tail(&p->free_clusters, cluster_info, - idx); + } + ci->flags = CLUSTER_FLAG_FREE; + list_add_tail(&ci->list, &si->free_clusters); } } - return nr_extents; + + return cluster_info; + +err_free: + kvfree(cluster_info); +err: + return ERR_PTR(err); } SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { - struct swap_info_struct *p; + struct swap_info_struct *si; struct filename *name; struct file *swap_file = NULL; struct address_space *mapping; @@ -3092,8 +3355,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) sector_t span; unsigned long maxpages; unsigned char *swap_map = NULL; + unsigned long *zeromap = NULL; struct swap_cluster_info *cluster_info = NULL; - struct page *page = NULL; + struct folio *folio = NULL; struct inode *inode = NULL; bool inced_nr_rotate_swap = false; @@ -3106,11 +3370,11 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (!swap_avail_heads) return -ENOMEM; - p = alloc_swap_info(); - if (IS_ERR(p)) - return PTR_ERR(p); + si = alloc_swap_info(); + if (IS_ERR(si)) + return PTR_ERR(si); - INIT_WORK(&p->discard_work, swap_discard_work); + INIT_WORK(&si->discard_work, swap_discard_work); name = getname(specialfile); if (IS_ERR(name)) { @@ -3125,12 +3389,12 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap; } - p->swap_file = swap_file; + si->swap_file = swap_file; mapping = swap_file->f_mapping; dentry = swap_file->f_path.dentry; inode = mapping->host; - error = claim_swapfile(p, inode); + error = claim_swapfile(si, inode); if (unlikely(error)) goto bad_swap; @@ -3151,14 +3415,14 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -EINVAL; goto bad_swap_unlock_inode; } - page = read_mapping_page(mapping, 0, swap_file); - if (IS_ERR(page)) { - error = PTR_ERR(page); + folio = read_mapping_folio(mapping, 0, swap_file); + if (IS_ERR(folio)) { + error = PTR_ERR(folio); goto bad_swap_unlock_inode; } - swap_header = kmap(page); + swap_header = kmap_local_folio(folio, 0); - maxpages = read_swap_header(p, swap_header, inode); + maxpages = read_swap_header(si, swap_header, inode); if (unlikely(!maxpages)) { error = -EINVAL; goto bad_swap_unlock_inode; @@ -3171,79 +3435,57 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap_unlock_inode; } - if (p->bdev && bdev_stable_writes(p->bdev)) - p->flags |= SWP_STABLE_WRITES; + error = swap_cgroup_swapon(si->type, maxpages); + if (error) + goto bad_swap_unlock_inode; - if (p->bdev && bdev_synchronous(p->bdev)) - p->flags |= SWP_SYNCHRONOUS_IO; + nr_extents = setup_swap_map_and_extents(si, swap_header, swap_map, + maxpages, &span); + if (unlikely(nr_extents < 0)) { + error = nr_extents; + goto bad_swap_unlock_inode; + } - if (p->bdev && bdev_nonrot(p->bdev)) { - int cpu, i; - unsigned long ci, nr_cluster; + /* + * Use kvmalloc_array instead of bitmap_zalloc as the allocation order might + * be above MAX_PAGE_ORDER incase of a large swap file. + */ + zeromap = kvmalloc_array(BITS_TO_LONGS(maxpages), sizeof(long), + GFP_KERNEL | __GFP_ZERO); + if (!zeromap) { + error = -ENOMEM; + goto bad_swap_unlock_inode; + } - p->flags |= SWP_SOLIDSTATE; - p->cluster_next_cpu = alloc_percpu(unsigned int); - if (!p->cluster_next_cpu) { - error = -ENOMEM; + if (si->bdev && bdev_stable_writes(si->bdev)) + si->flags |= SWP_STABLE_WRITES; + + if (si->bdev && bdev_synchronous(si->bdev)) + si->flags |= SWP_SYNCHRONOUS_IO; + + if (si->bdev && bdev_nonrot(si->bdev)) { + si->flags |= SWP_SOLIDSTATE; + + cluster_info = setup_clusters(si, swap_header, maxpages); + if (IS_ERR(cluster_info)) { + error = PTR_ERR(cluster_info); + cluster_info = NULL; goto bad_swap_unlock_inode; } - /* - * select a random position to start with to help wear leveling - * SSD - */ - for_each_possible_cpu(cpu) { - per_cpu(*p->cluster_next_cpu, cpu) = - get_random_u32_inclusive(1, p->highest_bit); - } - nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); - - cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info), - GFP_KERNEL); - if (!cluster_info) { - error = -ENOMEM; - goto bad_swap_unlock_inode; - } - - for (ci = 0; ci < nr_cluster; ci++) - spin_lock_init(&((cluster_info + ci)->lock)); - - p->percpu_cluster = alloc_percpu(struct percpu_cluster); - if (!p->percpu_cluster) { - error = -ENOMEM; - goto bad_swap_unlock_inode; - } - for_each_possible_cpu(cpu) { - struct percpu_cluster *cluster; - - cluster = per_cpu_ptr(p->percpu_cluster, cpu); - for (i = 0; i < SWAP_NR_ORDERS; i++) - cluster->next[i] = SWAP_NEXT_INVALID; - } } else { atomic_inc(&nr_rotate_swap); inced_nr_rotate_swap = true; } - error = swap_cgroup_swapon(p->type, maxpages); - if (error) - goto bad_swap_unlock_inode; - - nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map, - cluster_info, maxpages, &span); - if (unlikely(nr_extents < 0)) { - error = nr_extents; - goto bad_swap_unlock_inode; - } - if ((swap_flags & SWAP_FLAG_DISCARD) && - p->bdev && bdev_max_discard_sectors(p->bdev)) { + si->bdev && bdev_max_discard_sectors(si->bdev)) { /* * When discard is enabled for swap with no particular * policy flagged, we set all swap discard flags here in * order to sustain backward compatibility with older * swapon(8) releases. */ - p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | + si->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | SWP_PAGE_DISCARD); /* @@ -3253,24 +3495,24 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) * Now it's time to adjust the p->flags accordingly. */ if (swap_flags & SWAP_FLAG_DISCARD_ONCE) - p->flags &= ~SWP_PAGE_DISCARD; + si->flags &= ~SWP_PAGE_DISCARD; else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) - p->flags &= ~SWP_AREA_DISCARD; + si->flags &= ~SWP_AREA_DISCARD; /* issue a swapon-time discard if it's still required */ - if (p->flags & SWP_AREA_DISCARD) { - int err = discard_swap(p); + if (si->flags & SWP_AREA_DISCARD) { + int err = discard_swap(si); if (unlikely(err)) pr_err("swapon: discard_swap(%p): %d\n", - p, err); + si, err); } } - error = init_swap_address_space(p->type, maxpages); + error = init_swap_address_space(si->type, maxpages); if (error) goto bad_swap_unlock_inode; - error = zswap_swapon(p->type, maxpages); + error = zswap_swapon(si->type, maxpages); if (error) goto free_swap_address_space; @@ -3290,15 +3532,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (swap_flags & SWAP_FLAG_PREFER) prio = (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; - enable_swap_info(p, prio, swap_map, cluster_info); + enable_swap_info(si, prio, swap_map, cluster_info, zeromap); pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n", - K(p->pages), name->name, p->prio, nr_extents, + K(si->pages), name->name, si->prio, nr_extents, K((unsigned long long)span), - (p->flags & SWP_SOLIDSTATE) ? "SS" : "", - (p->flags & SWP_DISCARDABLE) ? "D" : "", - (p->flags & SWP_AREA_DISCARD) ? "s" : "", - (p->flags & SWP_PAGE_DISCARD) ? "c" : ""); + (si->flags & SWP_SOLIDSTATE) ? "SS" : "", + (si->flags & SWP_DISCARDABLE) ? "D" : "", + (si->flags & SWP_AREA_DISCARD) ? "s" : "", + (si->flags & SWP_PAGE_DISCARD) ? "c" : ""); mutex_unlock(&swapon_mutex); atomic_inc(&proc_poll_event); @@ -3307,34 +3549,33 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = 0; goto out; free_swap_zswap: - zswap_swapoff(p->type); + zswap_swapoff(si->type); free_swap_address_space: - exit_swap_address_space(p->type); + exit_swap_address_space(si->type); bad_swap_unlock_inode: inode_unlock(inode); bad_swap: - free_percpu(p->percpu_cluster); - p->percpu_cluster = NULL; - free_percpu(p->cluster_next_cpu); - p->cluster_next_cpu = NULL; + free_percpu(si->percpu_cluster); + si->percpu_cluster = NULL; + free_percpu(si->cluster_next_cpu); + si->cluster_next_cpu = NULL; inode = NULL; - destroy_swap_extents(p); - swap_cgroup_swapoff(p->type); + destroy_swap_extents(si); + swap_cgroup_swapoff(si->type); spin_lock(&swap_lock); - p->swap_file = NULL; - p->flags = 0; + si->swap_file = NULL; + si->flags = 0; spin_unlock(&swap_lock); vfree(swap_map); + kvfree(zeromap); kvfree(cluster_info); if (inced_nr_rotate_swap) atomic_dec(&nr_rotate_swap); if (swap_file) filp_close(swap_file, NULL); out: - if (page && !IS_ERR(page)) { - kunmap(page); - put_page(page); - } + if (!IS_ERR_OR_NULL(folio)) + folio_release_kmap(folio, swap_header); if (name) putname(name); if (inode) @@ -3362,7 +3603,7 @@ void si_swapinfo(struct sysinfo *val) } /* - * Verify that a swap entry is valid and increment its swap map count. + * Verify that nr swap entries are valid and increment their swap map counts. * * Returns error code in following case. * - success -> 0 @@ -3372,63 +3613,76 @@ void si_swapinfo(struct sysinfo *val) * - swap-cache reference is requested but the entry is not used. -> ENOENT * - swap-mapped reference requested but needs continued swap count. -> ENOMEM */ -static int __swap_duplicate(swp_entry_t entry, unsigned char usage) +static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) { - struct swap_info_struct *p; + struct swap_info_struct *si; struct swap_cluster_info *ci; unsigned long offset; unsigned char count; unsigned char has_cache; - int err; + int err, i; - p = swp_swap_info(entry); + si = swp_swap_info(entry); offset = swp_offset(entry); - ci = lock_cluster_or_swap_info(p, offset); + VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); + VM_WARN_ON(usage == 1 && nr > 1); + ci = lock_cluster_or_swap_info(si, offset); - count = p->swap_map[offset]; + err = 0; + for (i = 0; i < nr; i++) { + count = si->swap_map[offset + i]; - /* - * swapin_readahead() doesn't check if a swap entry is valid, so the - * swap entry could be SWAP_MAP_BAD. Check here with lock held. - */ - if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { - err = -ENOENT; - goto unlock_out; + /* + * swapin_readahead() doesn't check if a swap entry is valid, so the + * swap entry could be SWAP_MAP_BAD. Check here with lock held. + */ + if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { + err = -ENOENT; + goto unlock_out; + } + + has_cache = count & SWAP_HAS_CACHE; + count &= ~SWAP_HAS_CACHE; + + if (!count && !has_cache) { + err = -ENOENT; + } else if (usage == SWAP_HAS_CACHE) { + if (has_cache) + err = -EEXIST; + } else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) { + err = -EINVAL; + } + + if (err) + goto unlock_out; } - has_cache = count & SWAP_HAS_CACHE; - count &= ~SWAP_HAS_CACHE; - err = 0; + for (i = 0; i < nr; i++) { + count = si->swap_map[offset + i]; + has_cache = count & SWAP_HAS_CACHE; + count &= ~SWAP_HAS_CACHE; - if (usage == SWAP_HAS_CACHE) { - - /* set SWAP_HAS_CACHE if there is no cache and entry is used */ - if (!has_cache && count) + if (usage == SWAP_HAS_CACHE) has_cache = SWAP_HAS_CACHE; - else if (has_cache) /* someone else added cache */ - err = -EEXIST; - else /* no users remaining */ - err = -ENOENT; - - } else if (count || has_cache) { - - if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) + else if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) count += usage; - else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) - err = -EINVAL; - else if (swap_count_continued(p, offset, count)) + else if (swap_count_continued(si, offset + i, count)) count = COUNT_CONTINUED; - else + else { + /* + * Don't need to rollback changes, because if + * usage == 1, there must be nr == 1. + */ err = -ENOMEM; - } else - err = -ENOENT; /* unused swap entry */ + goto unlock_out; + } - if (!err) - WRITE_ONCE(p->swap_map[offset], count | has_cache); + WRITE_ONCE(si->swap_map[offset + i], count | has_cache); + } unlock_out: - unlock_cluster_or_swap_info(p, ci); + unlock_cluster_or_swap_info(si, ci); return err; } @@ -3436,9 +3690,9 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) * Help swapoff by noting that swap entry belongs to shmem/tmpfs * (in which case its reference count is never incremented). */ -void swap_shmem_alloc(swp_entry_t entry) +void swap_shmem_alloc(swp_entry_t entry, int nr) { - __swap_duplicate(entry, SWAP_MAP_SHMEM); + __swap_duplicate(entry, SWAP_MAP_SHMEM, nr); } /* @@ -3452,35 +3706,29 @@ int swap_duplicate(swp_entry_t entry) { int err = 0; - while (!err && __swap_duplicate(entry, 1) == -ENOMEM) + while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM) err = add_swap_count_continuation(entry, GFP_ATOMIC); return err; } /* - * @entry: swap entry for which we allocate swap cache. + * @entry: first swap entry from which we allocate nr swap cache. * - * Called when allocating swap cache for existing swap entry, + * Called when allocating swap cache for existing swap entries, * This can return error codes. Returns 0 at success. * -EEXIST means there is a swap cache. * Note: return code is different from swap_duplicate(). */ -int swapcache_prepare(swp_entry_t entry) +int swapcache_prepare(swp_entry_t entry, int nr) { - return __swap_duplicate(entry, SWAP_HAS_CACHE); + return __swap_duplicate(entry, SWAP_HAS_CACHE, nr); } -void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry) +void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) { - struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); - unsigned char usage; - ci = lock_cluster_or_swap_info(si, offset); - usage = __swap_entry_free_locked(si, offset, SWAP_HAS_CACHE); - unlock_cluster_or_swap_info(si, ci); - if (!usage) - free_swap_slot(entry); + cluster_swap_free_nr(si, offset, nr, SWAP_HAS_CACHE); } struct swap_info_struct *swp_swap_info(swp_entry_t entry) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index acc56c75ba99..ce13c4062647 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -391,7 +391,7 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd, struct page *page; int ret; - ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC); + ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC); /* Our caller expects us to return -EFAULT if we failed to find folio */ if (ret == -ENOENT) ret = -EFAULT; @@ -1763,3 +1763,171 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, VM_WARN_ON(!moved && !err); return moved ? moved : err; } + +static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, + vm_flags_t flags) +{ + const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP; + + vm_flags_reset(vma, flags); + /* + * For shared mappings, we want to enable writenotify while + * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply + * recalculate vma->vm_page_prot whenever userfaultfd-wp changes. + */ + if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed) + vma_set_page_prot(vma); +} + +static void userfaultfd_set_ctx(struct vm_area_struct *vma, + struct userfaultfd_ctx *ctx, + unsigned long flags) +{ + vma_start_write(vma); + vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx}; + userfaultfd_set_vm_flags(vma, + (vma->vm_flags & ~__VM_UFFD_FLAGS) | flags); +} + +void userfaultfd_reset_ctx(struct vm_area_struct *vma) +{ + userfaultfd_set_ctx(vma, NULL, 0); +} + +struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, + unsigned long end) +{ + struct vm_area_struct *ret; + + /* Reset ptes for the whole vma range if wr-protected */ + if (userfaultfd_wp(vma)) + uffd_wp_range(vma, start, end - start, false); + + ret = vma_modify_flags_uffd(vmi, prev, vma, start, end, + vma->vm_flags & ~__VM_UFFD_FLAGS, + NULL_VM_UFFD_CTX); + + /* + * In the vma_merge() successful mprotect-like case 8: + * the next vma was merged into the current one and + * the current one has not been updated yet. + */ + if (!IS_ERR(ret)) + userfaultfd_reset_ctx(ret); + + return ret; +} + +/* Assumes mmap write lock taken, and mm_struct pinned. */ +int userfaultfd_register_range(struct userfaultfd_ctx *ctx, + struct vm_area_struct *vma, + unsigned long vm_flags, + unsigned long start, unsigned long end, + bool wp_async) +{ + VMA_ITERATOR(vmi, ctx->mm, start); + struct vm_area_struct *prev = vma_prev(&vmi); + unsigned long vma_end; + unsigned long new_flags; + + if (vma->vm_start < start) + prev = vma; + + for_each_vma_range(vmi, vma, end) { + cond_resched(); + + BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async)); + BUG_ON(vma->vm_userfaultfd_ctx.ctx && + vma->vm_userfaultfd_ctx.ctx != ctx); + WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); + + /* + * Nothing to do: this vma is already registered into this + * userfaultfd and with the right tracking mode too. + */ + if (vma->vm_userfaultfd_ctx.ctx == ctx && + (vma->vm_flags & vm_flags) == vm_flags) + goto skip; + + if (vma->vm_start > start) + start = vma->vm_start; + vma_end = min(end, vma->vm_end); + + new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; + vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, + new_flags, + (struct vm_userfaultfd_ctx){ctx}); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + /* + * In the vma_merge() successful mprotect-like case 8: + * the next vma was merged into the current one and + * the current one has not been updated yet. + */ + userfaultfd_set_ctx(vma, ctx, vm_flags); + + if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma)) + hugetlb_unshare_all_pmds(vma); + +skip: + prev = vma; + start = vma->vm_end; + } + + return 0; +} + +void userfaultfd_release_new(struct userfaultfd_ctx *ctx) +{ + struct mm_struct *mm = ctx->mm; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); + + /* the various vma->vm_userfaultfd_ctx still points to it */ + mmap_write_lock(mm); + for_each_vma(vmi, vma) { + if (vma->vm_userfaultfd_ctx.ctx == ctx) + userfaultfd_reset_ctx(vma); + } + mmap_write_unlock(mm); +} + +void userfaultfd_release_all(struct mm_struct *mm, + struct userfaultfd_ctx *ctx) +{ + struct vm_area_struct *vma, *prev; + VMA_ITERATOR(vmi, mm, 0); + + if (!mmget_not_zero(mm)) + return; + + /* + * Flush page faults out of all CPUs. NOTE: all page faults + * must be retried without returning VM_FAULT_SIGBUS if + * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx + * changes while handle_userfault released the mmap_lock. So + * it's critical that released is set to true (above), before + * taking the mmap_lock for writing. + */ + mmap_write_lock(mm); + prev = NULL; + for_each_vma(vmi, vma) { + cond_resched(); + BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ + !!(vma->vm_flags & __VM_UFFD_FLAGS)); + if (vma->vm_userfaultfd_ctx.ctx != ctx) { + prev = vma; + continue; + } + + vma = userfaultfd_clear_vma(&vmi, prev, vma, + vma->vm_start, vma->vm_end); + prev = vma; + } + mmap_write_unlock(mm); + mmput(mm); +} diff --git a/mm/util.c b/mm/util.c index bd283e2132e0..4f1275023eb7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -463,7 +463,7 @@ static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) if (gap + pad > gap) gap += pad; - if (gap < MIN_GAP) + if (gap < MIN_GAP && MIN_GAP < MAX_GAP) gap = MIN_GAP; else if (gap > MAX_GAP) gap = MAX_GAP; @@ -608,6 +608,28 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, } EXPORT_SYMBOL(vm_mmap); +static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) +{ + /* + * We want to attempt a large physically contiguous block first because + * it is less likely to fragment multiple larger blocks and therefore + * contribute to a long term fragmentation less than vmalloc fallback. + * However make sure that larger requests are not too disruptive - no + * OOM killer and no allocation failure warnings as we have a fallback. + */ + if (size > PAGE_SIZE) { + flags |= __GFP_NOWARN; + + if (!(flags & __GFP_RETRY_MAYFAIL)) + flags |= __GFP_NORETRY; + + /* nofail semantic is implemented by the vmalloc fallback */ + flags &= ~__GFP_NOFAIL; + } + + return flags; +} + /** * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon * failure, fall back to non-contiguous (vmalloc) allocation. @@ -627,32 +649,15 @@ EXPORT_SYMBOL(vm_mmap); */ void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) { - gfp_t kmalloc_flags = flags; void *ret; - /* - * We want to attempt a large physically contiguous block first because - * it is less likely to fragment multiple larger blocks and therefore - * contribute to a long term fragmentation less than vmalloc fallback. - * However make sure that larger requests are not too disruptive - no - * OOM killer and no allocation failure warnings as we have a fallback. - */ - if (size > PAGE_SIZE) { - kmalloc_flags |= __GFP_NOWARN; - - if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL)) - kmalloc_flags |= __GFP_NORETRY; - - /* nofail semantic is implemented by the vmalloc fallback */ - kmalloc_flags &= ~__GFP_NOFAIL; - } - - ret = __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, b), kmalloc_flags, node); - /* * It doesn't really make sense to fallback to vmalloc for sub page * requests */ + ret = __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, b), + kmalloc_gfp_adjust(flags, size), + node); if (ret || size <= PAGE_SIZE) return ret; @@ -715,18 +720,53 @@ void kvfree_sensitive(const void *addr, size_t len) } EXPORT_SYMBOL(kvfree_sensitive); -void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags) +/** + * kvrealloc - reallocate memory; contents remain unchanged + * @p: object to reallocate memory for + * @size: the size to reallocate + * @flags: the flags for the page level allocator + * + * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0 + * and @p is not a %NULL pointer, the object pointed to is freed. + * + * If __GFP_ZERO logic is requested, callers must ensure that, starting with the + * initial memory allocation, every subsequent call to this API for the same + * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that + * __GFP_ZERO is not fully honored by this API. + * + * In any case, the contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. + * + * This function must not be called concurrently with itself or kvfree() for the + * same memory allocation. + * + * Return: pointer to the allocated memory or %NULL in case of error + */ +void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) { - void *newp; + void *n; - if (oldsize >= newsize) - return (void *)p; - newp = kvmalloc_noprof(newsize, flags); - if (!newp) - return NULL; - memcpy(newp, p, oldsize); - kvfree(p); - return newp; + if (is_vmalloc_addr(p)) + return vrealloc_noprof(p, size, flags); + + n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size)); + if (!n) { + /* We failed to krealloc(), fall back to kvmalloc(). */ + n = kvmalloc_noprof(size, flags); + if (!n) + return NULL; + + if (p) { + /* We already know that `p` is not a vmalloc address. */ + kasan_disable_current(); + memcpy(n, kasan_reset_tag(p), ksize(p)); + kasan_enable_current(); + + kfree(p); + } + } + + return n; } EXPORT_SYMBOL(kvrealloc_noprof); diff --git a/mm/vma.c b/mm/vma.c new file mode 100644 index 000000000000..4737afcb064c --- /dev/null +++ b/mm/vma.c @@ -0,0 +1,2068 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* + * VMA-specific functions. + */ + +#include "vma_internal.h" +#include "vma.h" + +static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next) +{ + struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev; + + if (!mpol_equal(vmg->policy, vma_policy(vma))) + return false; + /* + * VM_SOFTDIRTY should not prevent from VMA merging, if we + * match the flags but dirty bit -- the caller should mark + * merged VMA as dirty. If dirty bit won't be excluded from + * comparison, we increase pressure on the memory system forcing + * the kernel to generate new VMAs when old one could be + * extended instead. + */ + if ((vma->vm_flags ^ vmg->flags) & ~VM_SOFTDIRTY) + return false; + if (vma->vm_file != vmg->file) + return false; + if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx)) + return false; + if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name)) + return false; + return true; +} + +static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1, + struct anon_vma *anon_vma2, struct vm_area_struct *vma) +{ + /* + * The list_is_singular() test is to avoid merging VMA cloned from + * parents. This can improve scalability caused by anon_vma lock. + */ + if ((!anon_vma1 || !anon_vma2) && (!vma || + list_is_singular(&vma->anon_vma_chain))) + return true; + return anon_vma1 == anon_vma2; +} + +/* Are the anon_vma's belonging to each VMA compatible with one another? */ +static inline bool are_anon_vmas_compatible(struct vm_area_struct *vma1, + struct vm_area_struct *vma2) +{ + return is_mergeable_anon_vma(vma1->anon_vma, vma2->anon_vma, NULL); +} + +/* + * init_multi_vma_prep() - Initializer for struct vma_prepare + * @vp: The vma_prepare struct + * @vma: The vma that will be altered once locked + * @next: The next vma if it is to be adjusted + * @remove: The first vma to be removed + * @remove2: The second vma to be removed + */ +static void init_multi_vma_prep(struct vma_prepare *vp, + struct vm_area_struct *vma, + struct vm_area_struct *next, + struct vm_area_struct *remove, + struct vm_area_struct *remove2) +{ + memset(vp, 0, sizeof(struct vma_prepare)); + vp->vma = vma; + vp->anon_vma = vma->anon_vma; + vp->remove = remove; + vp->remove2 = remove2; + vp->adj_next = next; + if (!vp->anon_vma && next) + vp->anon_vma = next->anon_vma; + + vp->file = vma->vm_file; + if (vp->file) + vp->mapping = vma->vm_file->f_mapping; + +} + +/* + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) + * in front of (at a lower virtual address and file offset than) the vma. + * + * We cannot merge two vmas if they have differently assigned (non-NULL) + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. + * + * We don't check here for the merged mmap wrapping around the end of pagecache + * indices (16TB on ia32) because do_mmap() does not permit mmap's which + * wrap, nor mmaps which cover the final page at index -1UL. + * + * We assume the vma may be removed as part of the merge. + */ +static bool can_vma_merge_before(struct vma_merge_struct *vmg) +{ + pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); + + if (is_mergeable_vma(vmg, /* merge_next = */ true) && + is_mergeable_anon_vma(vmg->anon_vma, vmg->next->anon_vma, vmg->next)) { + if (vmg->next->vm_pgoff == vmg->pgoff + pglen) + return true; + } + + return false; +} + +/* + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) + * beyond (at a higher virtual address and file offset than) the vma. + * + * We cannot merge two vmas if they have differently assigned (non-NULL) + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. + * + * We assume that vma is not removed as part of the merge. + */ +static bool can_vma_merge_after(struct vma_merge_struct *vmg) +{ + if (is_mergeable_vma(vmg, /* merge_next = */ false) && + is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) { + if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff) + return true; + } + return false; +} + +static void __vma_link_file(struct vm_area_struct *vma, + struct address_space *mapping) +{ + if (vma_is_shared_maywrite(vma)) + mapping_allow_writable(mapping); + + flush_dcache_mmap_lock(mapping); + vma_interval_tree_insert(vma, &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); +} + +/* + * Requires inode->i_mapping->i_mmap_rwsem + */ +static void __remove_shared_vm_struct(struct vm_area_struct *vma, + struct address_space *mapping) +{ + if (vma_is_shared_maywrite(vma)) + mapping_unmap_writable(mapping); + + flush_dcache_mmap_lock(mapping); + vma_interval_tree_remove(vma, &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); +} + +/* + * vma_prepare() - Helper function for handling locking VMAs prior to altering + * @vp: The initialized vma_prepare struct + */ +static void vma_prepare(struct vma_prepare *vp) +{ + if (vp->file) { + uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); + + if (vp->adj_next) + uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, + vp->adj_next->vm_end); + + i_mmap_lock_write(vp->mapping); + if (vp->insert && vp->insert->vm_file) { + /* + * Put into interval tree now, so instantiated pages + * are visible to arm/parisc __flush_dcache_page + * throughout; but we cannot insert into address + * space until vma start or end is updated. + */ + __vma_link_file(vp->insert, + vp->insert->vm_file->f_mapping); + } + } + + if (vp->anon_vma) { + anon_vma_lock_write(vp->anon_vma); + anon_vma_interval_tree_pre_update_vma(vp->vma); + if (vp->adj_next) + anon_vma_interval_tree_pre_update_vma(vp->adj_next); + } + + if (vp->file) { + flush_dcache_mmap_lock(vp->mapping); + vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); + if (vp->adj_next) + vma_interval_tree_remove(vp->adj_next, + &vp->mapping->i_mmap); + } + +} + +/* + * vma_complete- Helper function for handling the unlocking after altering VMAs, + * or for inserting a VMA. + * + * @vp: The vma_prepare struct + * @vmi: The vma iterator + * @mm: The mm_struct + */ +static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi, + struct mm_struct *mm) +{ + if (vp->file) { + if (vp->adj_next) + vma_interval_tree_insert(vp->adj_next, + &vp->mapping->i_mmap); + vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); + flush_dcache_mmap_unlock(vp->mapping); + } + + if (vp->remove && vp->file) { + __remove_shared_vm_struct(vp->remove, vp->mapping); + if (vp->remove2) + __remove_shared_vm_struct(vp->remove2, vp->mapping); + } else if (vp->insert) { + /* + * split_vma has split insert from vma, and needs + * us to insert it before dropping the locks + * (it may either follow vma or precede it). + */ + vma_iter_store(vmi, vp->insert); + mm->map_count++; + } + + if (vp->anon_vma) { + anon_vma_interval_tree_post_update_vma(vp->vma); + if (vp->adj_next) + anon_vma_interval_tree_post_update_vma(vp->adj_next); + anon_vma_unlock_write(vp->anon_vma); + } + + if (vp->file) { + i_mmap_unlock_write(vp->mapping); + uprobe_mmap(vp->vma); + + if (vp->adj_next) + uprobe_mmap(vp->adj_next); + } + + if (vp->remove) { +again: + vma_mark_detached(vp->remove, true); + if (vp->file) { + uprobe_munmap(vp->remove, vp->remove->vm_start, + vp->remove->vm_end); + fput(vp->file); + } + if (vp->remove->anon_vma) + anon_vma_merge(vp->vma, vp->remove); + mm->map_count--; + mpol_put(vma_policy(vp->remove)); + if (!vp->remove2) + WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); + vm_area_free(vp->remove); + + /* + * In mprotect's case 6 (see comments on vma_merge), + * we are removing both mid and next vmas + */ + if (vp->remove2) { + vp->remove = vp->remove2; + vp->remove2 = NULL; + goto again; + } + } + if (vp->insert && vp->file) + uprobe_mmap(vp->insert); +} + +/* + * init_vma_prep() - Initializer wrapper for vma_prepare struct + * @vp: The vma_prepare struct + * @vma: The vma that will be altered once locked + */ +static void init_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma) +{ + init_multi_vma_prep(vp, vma, NULL, NULL, NULL); +} + +/* + * Can the proposed VMA be merged with the left (previous) VMA taking into + * account the start position of the proposed range. + */ +static bool can_vma_merge_left(struct vma_merge_struct *vmg) + +{ + return vmg->prev && vmg->prev->vm_end == vmg->start && + can_vma_merge_after(vmg); +} + +/* + * Can the proposed VMA be merged with the right (next) VMA taking into + * account the end position of the proposed range. + * + * In addition, if we can merge with the left VMA, ensure that left and right + * anon_vma's are also compatible. + */ +static bool can_vma_merge_right(struct vma_merge_struct *vmg, + bool can_merge_left) +{ + if (!vmg->next || vmg->end != vmg->next->vm_start || + !can_vma_merge_before(vmg)) + return false; + + if (!can_merge_left) + return true; + + /* + * If we can merge with prev (left) and next (right), indicating that + * each VMA's anon_vma is compatible with the proposed anon_vma, this + * does not mean prev and next are compatible with EACH OTHER. + * + * We therefore check this in addition to mergeability to either side. + */ + return are_anon_vmas_compatible(vmg->prev, vmg->next); +} + +/* + * Close a vm structure and free it. + */ +void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed) +{ + might_sleep(); + if (!closed && vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); + if (vma->vm_file) + fput(vma->vm_file); + mpol_put(vma_policy(vma)); + if (unreachable) + __vm_area_free(vma); + else + vm_area_free(vma); +} + +/* + * Get rid of page table information in the indicated region. + * + * Called with the mm semaphore held. + */ +void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, + struct vm_area_struct *prev, struct vm_area_struct *next) +{ + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather tlb; + + lru_add_drain(); + tlb_gather_mmu(&tlb, mm); + update_hiwater_rss(mm); + unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end, + /* mm_wr_locked = */ true); + mas_set(mas, vma->vm_end); + free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, + next ? next->vm_start : USER_PGTABLES_CEILING, + /* mm_wr_locked = */ true); + tlb_finish_mmu(&tlb); +} + +/* + * __split_vma() bypasses sysctl_max_map_count checking. We use this where it + * has already been checked or doesn't make sense to fail. + * VMA Iterator will point to the original VMA. + */ +static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long addr, int new_below) +{ + struct vma_prepare vp; + struct vm_area_struct *new; + int err; + + WARN_ON(vma->vm_start >= addr); + WARN_ON(vma->vm_end <= addr); + + if (vma->vm_ops && vma->vm_ops->may_split) { + err = vma->vm_ops->may_split(vma, addr); + if (err) + return err; + } + + new = vm_area_dup(vma); + if (!new) + return -ENOMEM; + + if (new_below) { + new->vm_end = addr; + } else { + new->vm_start = addr; + new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); + } + + err = -ENOMEM; + vma_iter_config(vmi, new->vm_start, new->vm_end); + if (vma_iter_prealloc(vmi, new)) + goto out_free_vma; + + err = vma_dup_policy(vma, new); + if (err) + goto out_free_vmi; + + err = anon_vma_clone(new, vma); + if (err) + goto out_free_mpol; + + if (new->vm_file) + get_file(new->vm_file); + + if (new->vm_ops && new->vm_ops->open) + new->vm_ops->open(new); + + vma_start_write(vma); + vma_start_write(new); + + init_vma_prep(&vp, vma); + vp.insert = new; + vma_prepare(&vp); + vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); + + if (new_below) { + vma->vm_start = addr; + vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT; + } else { + vma->vm_end = addr; + } + + /* vma_complete stores the new vma */ + vma_complete(&vp, vmi, vma->vm_mm); + validate_mm(vma->vm_mm); + + /* Success. */ + if (new_below) + vma_next(vmi); + else + vma_prev(vmi); + + return 0; + +out_free_mpol: + mpol_put(vma_policy(new)); +out_free_vmi: + vma_iter_free(vmi); +out_free_vma: + vm_area_free(new); + return err; +} + +/* + * Split a vma into two pieces at address 'addr', a new vma is allocated + * either for the first part or the tail. + */ +static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long addr, int new_below) +{ + if (vma->vm_mm->map_count >= sysctl_max_map_count) + return -ENOMEM; + + return __split_vma(vmi, vma, addr, new_below); +} + +/* + * vma has some anon_vma assigned, and is already inserted on that + * anon_vma's interval trees. + * + * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the + * vma must be removed from the anon_vma's interval trees using + * anon_vma_interval_tree_pre_update_vma(). + * + * After the update, the vma will be reinserted using + * anon_vma_interval_tree_post_update_vma(). + * + * The entire update must be protected by exclusive mmap_lock and by + * the root anon_vma's mutex. + */ +void +anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc; + + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); +} + +void +anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc; + + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); +} + +/* + * dup_anon_vma() - Helper function to duplicate anon_vma + * @dst: The destination VMA + * @src: The source VMA + * @dup: Pointer to the destination VMA when successful. + * + * Returns: 0 on success. + */ +static int dup_anon_vma(struct vm_area_struct *dst, + struct vm_area_struct *src, struct vm_area_struct **dup) +{ + /* + * Easily overlooked: when mprotect shifts the boundary, make sure the + * expanding vma has anon_vma set if the shrinking vma had, to cover any + * anon pages imported. + */ + if (src->anon_vma && !dst->anon_vma) { + int ret; + + vma_assert_write_locked(dst); + dst->anon_vma = src->anon_vma; + ret = anon_vma_clone(dst, src); + if (ret) + return ret; + + *dup = dst; + } + + return 0; +} + +#ifdef CONFIG_DEBUG_VM_MAPLE_TREE +void validate_mm(struct mm_struct *mm) +{ + int bug = 0; + int i = 0; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); + + mt_validate(&mm->mm_mt); + for_each_vma(vmi, vma) { +#ifdef CONFIG_DEBUG_VM_RB + struct anon_vma *anon_vma = vma->anon_vma; + struct anon_vma_chain *avc; +#endif + unsigned long vmi_start, vmi_end; + bool warn = 0; + + vmi_start = vma_iter_addr(&vmi); + vmi_end = vma_iter_end(&vmi); + if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm)) + warn = 1; + + if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm)) + warn = 1; + + if (warn) { + pr_emerg("issue in %s\n", current->comm); + dump_stack(); + dump_vma(vma); + pr_emerg("tree range: %px start %lx end %lx\n", vma, + vmi_start, vmi_end - 1); + vma_iter_dump_tree(&vmi); + } + +#ifdef CONFIG_DEBUG_VM_RB + if (anon_vma) { + anon_vma_lock_read(anon_vma); + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_verify(avc); + anon_vma_unlock_read(anon_vma); + } +#endif + i++; + } + if (i != mm->map_count) { + pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i); + bug = 1; + } + VM_BUG_ON_MM(bug, mm); +} +#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ + +/* Actually perform the VMA merge operation. */ +static int commit_merge(struct vma_merge_struct *vmg, + struct vm_area_struct *adjust, + struct vm_area_struct *remove, + struct vm_area_struct *remove2, + long adj_start, + bool expanded) +{ + struct vma_prepare vp; + + init_multi_vma_prep(&vp, vmg->vma, adjust, remove, remove2); + + VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma && + vp.anon_vma != adjust->anon_vma); + + if (expanded) { + /* Note: vma iterator must be pointing to 'start'. */ + vma_iter_config(vmg->vmi, vmg->start, vmg->end); + } else { + vma_iter_config(vmg->vmi, adjust->vm_start + adj_start, + adjust->vm_end); + } + + if (vma_iter_prealloc(vmg->vmi, vmg->vma)) + return -ENOMEM; + + vma_prepare(&vp); + vma_adjust_trans_huge(vmg->vma, vmg->start, vmg->end, adj_start); + vma_set_range(vmg->vma, vmg->start, vmg->end, vmg->pgoff); + + if (expanded) + vma_iter_store(vmg->vmi, vmg->vma); + + if (adj_start) { + adjust->vm_start += adj_start; + adjust->vm_pgoff += PHYS_PFN(adj_start); + if (adj_start < 0) { + WARN_ON(expanded); + vma_iter_store(vmg->vmi, adjust); + } + } + + vma_complete(&vp, vmg->vmi, vmg->vma->vm_mm); + + return 0; +} + +/* We can only remove VMAs when merging if they do not have a close hook. */ +static bool can_merge_remove_vma(struct vm_area_struct *vma) +{ + return !vma->vm_ops || !vma->vm_ops->close; +} + +/* + * vma_merge_existing_range - Attempt to merge VMAs based on a VMA having its + * attributes modified. + * + * @vmg: Describes the modifications being made to a VMA and associated + * metadata. + * + * When the attributes of a range within a VMA change, then it might be possible + * for immediately adjacent VMAs to be merged into that VMA due to having + * identical properties. + * + * This function checks for the existence of any such mergeable VMAs and updates + * the maple tree describing the @vmg->vma->vm_mm address space to account for + * this, as well as any VMAs shrunk/expanded/deleted as a result of this merge. + * + * As part of this operation, if a merge occurs, the @vmg object will have its + * vma, start, end, and pgoff fields modified to execute the merge. Subsequent + * calls to this function should reset these fields. + * + * Returns: The merged VMA if merge succeeds, or NULL otherwise. + * + * ASSUMPTIONS: + * - The caller must assign the VMA to be modifed to @vmg->vma. + * - The caller must have set @vmg->prev to the previous VMA, if there is one. + * - The caller must not set @vmg->next, as we determine this. + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. + * - vmi must be positioned within [@vmg->vma->vm_start, @vmg->vma->vm_end). + */ +static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct *vmg) +{ + struct vm_area_struct *vma = vmg->vma; + struct vm_area_struct *prev = vmg->prev; + struct vm_area_struct *next, *res; + struct vm_area_struct *anon_dup = NULL; + struct vm_area_struct *adjust = NULL; + unsigned long start = vmg->start; + unsigned long end = vmg->end; + bool left_side = vma && start == vma->vm_start; + bool right_side = vma && end == vma->vm_end; + int err = 0; + long adj_start = 0; + bool merge_will_delete_vma, merge_will_delete_next; + bool merge_left, merge_right, merge_both; + bool expanded; + + mmap_assert_write_locked(vmg->mm); + VM_WARN_ON(!vma); /* We are modifying a VMA, so caller must specify. */ + VM_WARN_ON(vmg->next); /* We set this. */ + VM_WARN_ON(prev && start <= prev->vm_start); + VM_WARN_ON(start >= end); + /* + * If vma == prev, then we are offset into a VMA. Otherwise, if we are + * not, we must span a portion of the VMA. + */ + VM_WARN_ON(vma && ((vma != prev && vmg->start != vma->vm_start) || + vmg->end > vma->vm_end)); + /* The vmi must be positioned within vmg->vma. */ + VM_WARN_ON(vma && !(vma_iter_addr(vmg->vmi) >= vma->vm_start && + vma_iter_addr(vmg->vmi) < vma->vm_end)); + + vmg->state = VMA_MERGE_NOMERGE; + + /* + * If a special mapping or if the range being modified is neither at the + * furthermost left or right side of the VMA, then we have no chance of + * merging and should abort. + */ + if (vmg->flags & VM_SPECIAL || (!left_side && !right_side)) + return NULL; + + if (left_side) + merge_left = can_vma_merge_left(vmg); + else + merge_left = false; + + if (right_side) { + next = vmg->next = vma_iter_next_range(vmg->vmi); + vma_iter_prev_range(vmg->vmi); + + merge_right = can_vma_merge_right(vmg, merge_left); + } else { + merge_right = false; + next = NULL; + } + + if (merge_left) /* If merging prev, position iterator there. */ + vma_prev(vmg->vmi); + else if (!merge_right) /* If we have nothing to merge, abort. */ + return NULL; + + merge_both = merge_left && merge_right; + /* If we span the entire VMA, a merge implies it will be deleted. */ + merge_will_delete_vma = left_side && right_side; + + /* + * If we need to remove vma in its entirety but are unable to do so, + * we have no sensible recourse but to abort the merge. + */ + if (merge_will_delete_vma && !can_merge_remove_vma(vma)) + return NULL; + + /* + * If we merge both VMAs, then next is also deleted. This implies + * merge_will_delete_vma also. + */ + merge_will_delete_next = merge_both; + + /* + * If we cannot delete next, then we can reduce the operation to merging + * prev and vma (thereby deleting vma). + */ + if (merge_will_delete_next && !can_merge_remove_vma(next)) { + merge_will_delete_next = false; + merge_right = false; + merge_both = false; + } + + /* No matter what happens, we will be adjusting vma. */ + vma_start_write(vma); + + if (merge_left) + vma_start_write(prev); + + if (merge_right) + vma_start_write(next); + + if (merge_both) { + /* + * |<----->| + * |-------*********-------| + * prev vma next + * extend delete delete + */ + + vmg->vma = prev; + vmg->start = prev->vm_start; + vmg->end = next->vm_end; + vmg->pgoff = prev->vm_pgoff; + + /* + * We already ensured anon_vma compatibility above, so now it's + * simply a case of, if prev has no anon_vma object, which of + * next or vma contains the anon_vma we must duplicate. + */ + err = dup_anon_vma(prev, next->anon_vma ? next : vma, &anon_dup); + } else if (merge_left) { + /* + * |<----->| OR + * |<--------->| + * |-------************* + * prev vma + * extend shrink/delete + */ + + vmg->vma = prev; + vmg->start = prev->vm_start; + vmg->pgoff = prev->vm_pgoff; + + if (!merge_will_delete_vma) { + adjust = vma; + adj_start = vmg->end - vma->vm_start; + } + + err = dup_anon_vma(prev, vma, &anon_dup); + } else { /* merge_right */ + /* + * |<----->| OR + * |<--------->| + * *************-------| + * vma next + * shrink/delete extend + */ + + pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); + + VM_WARN_ON(!merge_right); + /* If we are offset into a VMA, then prev must be vma. */ + VM_WARN_ON(vmg->start > vma->vm_start && prev && vma != prev); + + if (merge_will_delete_vma) { + vmg->vma = next; + vmg->end = next->vm_end; + vmg->pgoff = next->vm_pgoff - pglen; + } else { + /* + * We shrink vma and expand next. + * + * IMPORTANT: This is the ONLY case where the final + * merged VMA is NOT vmg->vma, but rather vmg->next. + */ + + vmg->start = vma->vm_start; + vmg->end = start; + vmg->pgoff = vma->vm_pgoff; + + adjust = next; + adj_start = -(vma->vm_end - start); + } + + err = dup_anon_vma(next, vma, &anon_dup); + } + + if (err) + goto abort; + + /* + * In nearly all cases, we expand vmg->vma. There is one exception - + * merge_right where we partially span the VMA. In this case we shrink + * the end of vmg->vma and adjust the start of vmg->next accordingly. + */ + expanded = !merge_right || merge_will_delete_vma; + + if (commit_merge(vmg, adjust, + merge_will_delete_vma ? vma : NULL, + merge_will_delete_next ? next : NULL, + adj_start, expanded)) { + if (anon_dup) + unlink_anon_vmas(anon_dup); + + vmg->state = VMA_MERGE_ERROR_NOMEM; + return NULL; + } + + res = merge_left ? prev : next; + khugepaged_enter_vma(res, vmg->flags); + + vmg->state = VMA_MERGE_SUCCESS; + return res; + +abort: + vma_iter_set(vmg->vmi, start); + vma_iter_load(vmg->vmi); + vmg->state = VMA_MERGE_ERROR_NOMEM; + return NULL; +} + +/* + * vma_merge_new_range - Attempt to merge a new VMA into address space + * + * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end + * (exclusive), which we try to merge with any adjacent VMAs if possible. + * + * We are about to add a VMA to the address space starting at @vmg->start and + * ending at @vmg->end. There are three different possible scenarios: + * + * 1. There is a VMA with identical properties immediately adjacent to the + * proposed new VMA [@vmg->start, @vmg->end) either before or after it - + * EXPAND that VMA: + * + * Proposed: |-----| or |-----| + * Existing: |----| |----| + * + * 2. There are VMAs with identical properties immediately adjacent to the + * proposed new VMA [@vmg->start, @vmg->end) both before AND after it - + * EXPAND the former and REMOVE the latter: + * + * Proposed: |-----| + * Existing: |----| |----| + * + * 3. There are no VMAs immediately adjacent to the proposed new VMA or those + * VMAs do not have identical attributes - NO MERGE POSSIBLE. + * + * In instances where we can merge, this function returns the expanded VMA which + * will have its range adjusted accordingly and the underlying maple tree also + * adjusted. + * + * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer + * to the VMA we expanded. + * + * This function adjusts @vmg to provide @vmg->next if not already specified, + * and adjusts [@vmg->start, @vmg->end) to span the expanded range. + * + * ASSUMPTIONS: + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. + * - The caller must have determined that [@vmg->start, @vmg->end) is empty, + other than VMAs that will be unmapped should the operation succeed. + * - The caller must have specified the previous vma in @vmg->prev. + * - The caller must have specified the next vma in @vmg->next. + * - The caller must have positioned the vmi at or before the gap. + */ +struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) +{ + struct vm_area_struct *prev = vmg->prev; + struct vm_area_struct *next = vmg->next; + unsigned long start = vmg->start; + unsigned long end = vmg->end; + pgoff_t pgoff = vmg->pgoff; + pgoff_t pglen = PHYS_PFN(end - start); + bool can_merge_left, can_merge_right; + + mmap_assert_write_locked(vmg->mm); + VM_WARN_ON(vmg->vma); + /* vmi must point at or before the gap. */ + VM_WARN_ON(vma_iter_addr(vmg->vmi) > end); + + vmg->state = VMA_MERGE_NOMERGE; + + /* Special VMAs are unmergeable, also if no prev/next. */ + if ((vmg->flags & VM_SPECIAL) || (!prev && !next)) + return NULL; + + can_merge_left = can_vma_merge_left(vmg); + can_merge_right = can_vma_merge_right(vmg, can_merge_left); + + /* If we can merge with the next VMA, adjust vmg accordingly. */ + if (can_merge_right) { + vmg->end = next->vm_end; + vmg->vma = next; + vmg->pgoff = next->vm_pgoff - pglen; + } + + /* If we can merge with the previous VMA, adjust vmg accordingly. */ + if (can_merge_left) { + vmg->start = prev->vm_start; + vmg->vma = prev; + vmg->pgoff = prev->vm_pgoff; + + /* + * If this merge would result in removal of the next VMA but we + * are not permitted to do so, reduce the operation to merging + * prev and vma. + */ + if (can_merge_right && !can_merge_remove_vma(next)) + vmg->end = end; + + vma_prev(vmg->vmi); /* Equivalent to going to the previous range */ + } + + /* + * Now try to expand adjacent VMA(s). This takes care of removing the + * following VMA if we have VMAs on both sides. + */ + if (vmg->vma && !vma_expand(vmg)) { + khugepaged_enter_vma(vmg->vma, vmg->flags); + vmg->state = VMA_MERGE_SUCCESS; + return vmg->vma; + } + + /* If expansion failed, reset state. Allows us to retry merge later. */ + vmg->vma = NULL; + vmg->start = start; + vmg->end = end; + vmg->pgoff = pgoff; + if (vmg->vma == prev) + vma_iter_set(vmg->vmi, start); + + return NULL; +} + +/* + * vma_expand - Expand an existing VMA + * + * @vmg: Describes a VMA expansion operation. + * + * Expand @vma to vmg->start and vmg->end. Can expand off the start and end. + * Will expand over vmg->next if it's different from vmg->vma and vmg->end == + * vmg->next->vm_end. Checking if the vmg->vma can expand and merge with + * vmg->next needs to be handled by the caller. + * + * Returns: 0 on success. + * + * ASSUMPTIONS: + * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock. + * - The caller must have set @vmg->vma and @vmg->next. + */ +int vma_expand(struct vma_merge_struct *vmg) +{ + struct vm_area_struct *anon_dup = NULL; + bool remove_next = false; + struct vm_area_struct *vma = vmg->vma; + struct vm_area_struct *next = vmg->next; + + mmap_assert_write_locked(vmg->mm); + + vma_start_write(vma); + if (next && (vma != next) && (vmg->end == next->vm_end)) { + int ret; + + remove_next = true; + /* This should already have been checked by this point. */ + VM_WARN_ON(!can_merge_remove_vma(next)); + vma_start_write(next); + ret = dup_anon_vma(vma, next, &anon_dup); + if (ret) + return ret; + } + + /* Not merging but overwriting any part of next is not handled. */ + VM_WARN_ON(next && !remove_next && + next != vma && vmg->end > next->vm_start); + /* Only handles expanding */ + VM_WARN_ON(vma->vm_start < vmg->start || vma->vm_end > vmg->end); + + if (commit_merge(vmg, NULL, remove_next ? next : NULL, NULL, 0, true)) + goto nomem; + + return 0; + +nomem: + vmg->state = VMA_MERGE_ERROR_NOMEM; + if (anon_dup) + unlink_anon_vmas(anon_dup); + return -ENOMEM; +} + +/* + * vma_shrink() - Reduce an existing VMAs memory area + * @vmi: The vma iterator + * @vma: The VMA to modify + * @start: The new start + * @end: The new end + * + * Returns: 0 on success, -ENOMEM otherwise + */ +int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff) +{ + struct vma_prepare vp; + + WARN_ON((vma->vm_start != start) && (vma->vm_end != end)); + + if (vma->vm_start < start) + vma_iter_config(vmi, vma->vm_start, start); + else + vma_iter_config(vmi, end, vma->vm_end); + + if (vma_iter_prealloc(vmi, NULL)) + return -ENOMEM; + + vma_start_write(vma); + + init_vma_prep(&vp, vma); + vma_prepare(&vp); + vma_adjust_trans_huge(vma, start, end, 0); + + vma_iter_clear(vmi); + vma_set_range(vma, start, end, pgoff); + vma_complete(&vp, vmi, vma->vm_mm); + validate_mm(vma->vm_mm); + return 0; +} + +static inline void vms_clear_ptes(struct vma_munmap_struct *vms, + struct ma_state *mas_detach, bool mm_wr_locked) +{ + struct mmu_gather tlb; + + if (!vms->clear_ptes) /* Nothing to do */ + return; + + /* + * We can free page tables without write-locking mmap_lock because VMAs + * were isolated before we downgraded mmap_lock. + */ + mas_set(mas_detach, 1); + lru_add_drain(); + tlb_gather_mmu(&tlb, vms->vma->vm_mm); + update_hiwater_rss(vms->vma->vm_mm); + unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end, + vms->vma_count, mm_wr_locked); + + mas_set(mas_detach, 1); + /* start and end may be different if there is no prev or next vma. */ + free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start, + vms->unmap_end, mm_wr_locked); + tlb_finish_mmu(&tlb); + vms->clear_ptes = false; +} + +void vms_clean_up_area(struct vma_munmap_struct *vms, + struct ma_state *mas_detach) +{ + struct vm_area_struct *vma; + + if (!vms->nr_pages) + return; + + vms_clear_ptes(vms, mas_detach, true); + mas_set(mas_detach, 0); + mas_for_each(mas_detach, vma, ULONG_MAX) + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); + vms->closed_vm_ops = true; +} + +/* + * vms_complete_munmap_vmas() - Finish the munmap() operation + * @vms: The vma munmap struct + * @mas_detach: The maple state of the detached vmas + * + * This updates the mm_struct, unmaps the region, frees the resources + * used for the munmap() and may downgrade the lock - if requested. Everything + * needed to be done once the vma maple tree is updated. + */ +void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, + struct ma_state *mas_detach) +{ + struct vm_area_struct *vma; + struct mm_struct *mm; + + mm = current->mm; + mm->map_count -= vms->vma_count; + mm->locked_vm -= vms->locked_vm; + if (vms->unlock) + mmap_write_downgrade(mm); + + if (!vms->nr_pages) + return; + + vms_clear_ptes(vms, mas_detach, !vms->unlock); + /* Update high watermark before we lower total_vm */ + update_hiwater_vm(mm); + /* Stat accounting */ + WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm) - vms->nr_pages); + /* Paranoid bookkeeping */ + VM_WARN_ON(vms->exec_vm > mm->exec_vm); + VM_WARN_ON(vms->stack_vm > mm->stack_vm); + VM_WARN_ON(vms->data_vm > mm->data_vm); + mm->exec_vm -= vms->exec_vm; + mm->stack_vm -= vms->stack_vm; + mm->data_vm -= vms->data_vm; + + /* Remove and clean up vmas */ + mas_set(mas_detach, 0); + mas_for_each(mas_detach, vma, ULONG_MAX) + remove_vma(vma, /* = */ false, vms->closed_vm_ops); + + vm_unacct_memory(vms->nr_accounted); + validate_mm(mm); + if (vms->unlock) + mmap_read_unlock(mm); + + __mt_destroy(mas_detach->tree); +} + +/* + * vms_gather_munmap_vmas() - Put all VMAs within a range into a maple tree + * for removal at a later date. Handles splitting first and last if necessary + * and marking the vmas as isolated. + * + * @vms: The vma munmap struct + * @mas_detach: The maple state tracking the detached tree + * + * Return: 0 on success, error otherwise + */ +int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, + struct ma_state *mas_detach) +{ + struct vm_area_struct *next = NULL; + int error; + + /* + * If we need to split any vma, do it now to save pain later. + * Does it split the first one? + */ + if (vms->start > vms->vma->vm_start) { + + /* + * Make sure that map_count on return from munmap() will + * not exceed its limit; but let map_count go just above + * its limit temporarily, to help free resources as expected. + */ + if (vms->end < vms->vma->vm_end && + vms->vma->vm_mm->map_count >= sysctl_max_map_count) { + error = -ENOMEM; + goto map_count_exceeded; + } + + /* Don't bother splitting the VMA if we can't unmap it anyway */ + if (!can_modify_vma(vms->vma)) { + error = -EPERM; + goto start_split_failed; + } + + error = __split_vma(vms->vmi, vms->vma, vms->start, 1); + if (error) + goto start_split_failed; + } + vms->prev = vma_prev(vms->vmi); + if (vms->prev) + vms->unmap_start = vms->prev->vm_end; + + /* + * Detach a range of VMAs from the mm. Using next as a temp variable as + * it is always overwritten. + */ + for_each_vma_range(*(vms->vmi), next, vms->end) { + long nrpages; + + if (!can_modify_vma(next)) { + error = -EPERM; + goto modify_vma_failed; + } + /* Does it split the end? */ + if (next->vm_end > vms->end) { + error = __split_vma(vms->vmi, next, vms->end, 0); + if (error) + goto end_split_failed; + } + vma_start_write(next); + mas_set(mas_detach, vms->vma_count++); + error = mas_store_gfp(mas_detach, next, GFP_KERNEL); + if (error) + goto munmap_gather_failed; + + vma_mark_detached(next, true); + nrpages = vma_pages(next); + + vms->nr_pages += nrpages; + if (next->vm_flags & VM_LOCKED) + vms->locked_vm += nrpages; + + if (next->vm_flags & VM_ACCOUNT) + vms->nr_accounted += nrpages; + + if (is_exec_mapping(next->vm_flags)) + vms->exec_vm += nrpages; + else if (is_stack_mapping(next->vm_flags)) + vms->stack_vm += nrpages; + else if (is_data_mapping(next->vm_flags)) + vms->data_vm += nrpages; + + if (unlikely(vms->uf)) { + /* + * If userfaultfd_unmap_prep returns an error the vmas + * will remain split, but userland will get a + * highly unexpected error anyway. This is no + * different than the case where the first of the two + * __split_vma fails, but we don't undo the first + * split, despite we could. This is unlikely enough + * failure that it's not worth optimizing it for. + */ + error = userfaultfd_unmap_prep(next, vms->start, + vms->end, vms->uf); + if (error) + goto userfaultfd_error; + } +#ifdef CONFIG_DEBUG_VM_MAPLE_TREE + BUG_ON(next->vm_start < vms->start); + BUG_ON(next->vm_start > vms->end); +#endif + } + + vms->next = vma_next(vms->vmi); + if (vms->next) + vms->unmap_end = vms->next->vm_start; + +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) + /* Make sure no VMAs are about to be lost. */ + { + MA_STATE(test, mas_detach->tree, 0, 0); + struct vm_area_struct *vma_mas, *vma_test; + int test_count = 0; + + vma_iter_set(vms->vmi, vms->start); + rcu_read_lock(); + vma_test = mas_find(&test, vms->vma_count - 1); + for_each_vma_range(*(vms->vmi), vma_mas, vms->end) { + BUG_ON(vma_mas != vma_test); + test_count++; + vma_test = mas_next(&test, vms->vma_count - 1); + } + rcu_read_unlock(); + BUG_ON(vms->vma_count != test_count); + } +#endif + + while (vma_iter_addr(vms->vmi) > vms->start) + vma_iter_prev_range(vms->vmi); + + vms->clear_ptes = true; + return 0; + +userfaultfd_error: +munmap_gather_failed: +end_split_failed: +modify_vma_failed: + reattach_vmas(mas_detach); +start_split_failed: +map_count_exceeded: + return error; +} + +/* + * do_vmi_align_munmap() - munmap the aligned region from @start to @end. + * @vmi: The vma iterator + * @vma: The starting vm_area_struct + * @mm: The mm_struct + * @start: The aligned start address to munmap. + * @end: The aligned end address to munmap. + * @uf: The userfaultfd list_head + * @unlock: Set to true to drop the mmap_lock. unlocking only happens on + * success. + * + * Return: 0 on success and drops the lock if so directed, error and leaves the + * lock held otherwise. + */ +int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, + struct mm_struct *mm, unsigned long start, unsigned long end, + struct list_head *uf, bool unlock) +{ + struct maple_tree mt_detach; + MA_STATE(mas_detach, &mt_detach, 0, 0); + mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); + mt_on_stack(mt_detach); + struct vma_munmap_struct vms; + int error; + + init_vma_munmap(&vms, vmi, vma, start, end, uf, unlock); + error = vms_gather_munmap_vmas(&vms, &mas_detach); + if (error) + goto gather_failed; + + error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL); + if (error) + goto clear_tree_failed; + + /* Point of no return */ + vms_complete_munmap_vmas(&vms, &mas_detach); + return 0; + +clear_tree_failed: + reattach_vmas(&mas_detach); +gather_failed: + validate_mm(mm); + return error; +} + +/* + * do_vmi_munmap() - munmap a given range. + * @vmi: The vma iterator + * @mm: The mm_struct + * @start: The start address to munmap + * @len: The length of the range to munmap + * @uf: The userfaultfd list_head + * @unlock: set to true if the user wants to drop the mmap_lock on success + * + * This function takes a @mas that is either pointing to the previous VMA or set + * to MA_START and sets it up to remove the mapping(s). The @len will be + * aligned. + * + * Return: 0 on success and drops the lock if so directed, error and leaves the + * lock held otherwise. + */ +int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, + unsigned long start, size_t len, struct list_head *uf, + bool unlock) +{ + unsigned long end; + struct vm_area_struct *vma; + + if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) + return -EINVAL; + + end = start + PAGE_ALIGN(len); + if (end == start) + return -EINVAL; + + /* Find the first overlapping VMA */ + vma = vma_find(vmi, end); + if (!vma) { + if (unlock) + mmap_write_unlock(mm); + return 0; + } + + return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock); +} + +/* + * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd + * context and anonymous VMA name within the range [start, end). + * + * As a result, we might be able to merge the newly modified VMA range with an + * adjacent VMA with identical properties. + * + * If no merge is possible and the range does not span the entirety of the VMA, + * we then need to split the VMA to accommodate the change. + * + * The function returns either the merged VMA, the original VMA if a split was + * required instead, or an error if the split failed. + */ +static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg) +{ + struct vm_area_struct *vma = vmg->vma; + struct vm_area_struct *merged; + + /* First, try to merge. */ + merged = vma_merge_existing_range(vmg); + if (merged) + return merged; + + /* Split any preceding portion of the VMA. */ + if (vma->vm_start < vmg->start) { + int err = split_vma(vmg->vmi, vma, vmg->start, 1); + + if (err) + return ERR_PTR(err); + } + + /* Split any trailing portion of the VMA. */ + if (vma->vm_end > vmg->end) { + int err = split_vma(vmg->vmi, vma, vmg->end, 0); + + if (err) + return ERR_PTR(err); + } + + return vma; +} + +struct vm_area_struct *vma_modify_flags( + struct vma_iterator *vmi, struct vm_area_struct *prev, + struct vm_area_struct *vma, unsigned long start, unsigned long end, + unsigned long new_flags) +{ + VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); + + vmg.flags = new_flags; + + return vma_modify(&vmg); +} + +struct vm_area_struct +*vma_modify_flags_name(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + unsigned long new_flags, + struct anon_vma_name *new_name) +{ + VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); + + vmg.flags = new_flags; + vmg.anon_name = new_name; + + return vma_modify(&vmg); +} + +struct vm_area_struct +*vma_modify_policy(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct mempolicy *new_pol) +{ + VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); + + vmg.policy = new_pol; + + return vma_modify(&vmg); +} + +struct vm_area_struct +*vma_modify_flags_uffd(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long new_flags, + struct vm_userfaultfd_ctx new_ctx) +{ + VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); + + vmg.flags = new_flags; + vmg.uffd_ctx = new_ctx; + + return vma_modify(&vmg); +} + +/* + * Expand vma by delta bytes, potentially merging with an immediately adjacent + * VMA with identical properties. + */ +struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, + struct vm_area_struct *vma, + unsigned long delta) +{ + VMG_VMA_STATE(vmg, vmi, vma, vma, vma->vm_end, vma->vm_end + delta); + + vmg.next = vma_iter_next_rewind(vmi, NULL); + vmg.vma = NULL; /* We use the VMA to populate VMG fields only. */ + + return vma_merge_new_range(&vmg); +} + +void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb) +{ + vb->count = 0; +} + +static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb) +{ + struct address_space *mapping; + int i; + + mapping = vb->vmas[0]->vm_file->f_mapping; + i_mmap_lock_write(mapping); + for (i = 0; i < vb->count; i++) { + VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping); + __remove_shared_vm_struct(vb->vmas[i], mapping); + } + i_mmap_unlock_write(mapping); + + unlink_file_vma_batch_init(vb); +} + +void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, + struct vm_area_struct *vma) +{ + if (vma->vm_file == NULL) + return; + + if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) || + vb->count == ARRAY_SIZE(vb->vmas)) + unlink_file_vma_batch_process(vb); + + vb->vmas[vb->count] = vma; + vb->count++; +} + +void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb) +{ + if (vb->count > 0) + unlink_file_vma_batch_process(vb); +} + +/* + * Unlink a file-based vm structure from its interval tree, to hide + * vma from rmap and vmtruncate before freeing its page tables. + */ +void unlink_file_vma(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + + if (file) { + struct address_space *mapping = file->f_mapping; + + i_mmap_lock_write(mapping); + __remove_shared_vm_struct(vma, mapping); + i_mmap_unlock_write(mapping); + } +} + +void vma_link_file(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct address_space *mapping; + + if (file) { + mapping = file->f_mapping; + i_mmap_lock_write(mapping); + __vma_link_file(vma, mapping); + i_mmap_unlock_write(mapping); + } +} + +int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) +{ + VMA_ITERATOR(vmi, mm, 0); + + vma_iter_config(&vmi, vma->vm_start, vma->vm_end); + if (vma_iter_prealloc(&vmi, vma)) + return -ENOMEM; + + vma_start_write(vma); + vma_iter_store(&vmi, vma); + vma_link_file(vma); + mm->map_count++; + validate_mm(mm); + return 0; +} + +/* + * Copy the vma structure to a new location in the same mm, + * prior to moving page table entries, to effect an mremap move. + */ +struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, + unsigned long addr, unsigned long len, pgoff_t pgoff, + bool *need_rmap_locks) +{ + struct vm_area_struct *vma = *vmap; + unsigned long vma_start = vma->vm_start; + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *new_vma; + bool faulted_in_anon_vma = true; + VMA_ITERATOR(vmi, mm, addr); + VMG_VMA_STATE(vmg, &vmi, NULL, vma, addr, addr + len); + + /* + * If anonymous vma has not yet been faulted, update new pgoff + * to match new location, to increase its chance of merging. + */ + if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) { + pgoff = addr >> PAGE_SHIFT; + faulted_in_anon_vma = false; + } + + new_vma = find_vma_prev(mm, addr, &vmg.prev); + if (new_vma && new_vma->vm_start < addr + len) + return NULL; /* should never get here */ + + vmg.vma = NULL; /* New VMA range. */ + vmg.pgoff = pgoff; + vmg.next = vma_iter_next_rewind(&vmi, NULL); + new_vma = vma_merge_new_range(&vmg); + + if (new_vma) { + /* + * Source vma may have been merged into new_vma + */ + if (unlikely(vma_start >= new_vma->vm_start && + vma_start < new_vma->vm_end)) { + /* + * The only way we can get a vma_merge with + * self during an mremap is if the vma hasn't + * been faulted in yet and we were allowed to + * reset the dst vma->vm_pgoff to the + * destination address of the mremap to allow + * the merge to happen. mremap must change the + * vm_pgoff linearity between src and dst vmas + * (in turn preventing a vma_merge) to be + * safe. It is only safe to keep the vm_pgoff + * linear if there are no pages mapped yet. + */ + VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma); + *vmap = vma = new_vma; + } + *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); + } else { + new_vma = vm_area_dup(vma); + if (!new_vma) + goto out; + vma_set_range(new_vma, addr, addr + len, pgoff); + if (vma_dup_policy(vma, new_vma)) + goto out_free_vma; + if (anon_vma_clone(new_vma, vma)) + goto out_free_mempol; + if (new_vma->vm_file) + get_file(new_vma->vm_file); + if (new_vma->vm_ops && new_vma->vm_ops->open) + new_vma->vm_ops->open(new_vma); + if (vma_link(mm, new_vma)) + goto out_vma_link; + *need_rmap_locks = false; + } + return new_vma; + +out_vma_link: + if (new_vma->vm_ops && new_vma->vm_ops->close) + new_vma->vm_ops->close(new_vma); + + if (new_vma->vm_file) + fput(new_vma->vm_file); + + unlink_anon_vmas(new_vma); +out_free_mempol: + mpol_put(vma_policy(new_vma)); +out_free_vma: + vm_area_free(new_vma); +out: + return NULL; +} + +/* + * Rough compatibility check to quickly see if it's even worth looking + * at sharing an anon_vma. + * + * They need to have the same vm_file, and the flags can only differ + * in things that mprotect may change. + * + * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that + * we can merge the two vma's. For example, we refuse to merge a vma if + * there is a vm_ops->close() function, because that indicates that the + * driver is doing some kind of reference counting. But that doesn't + * really matter for the anon_vma sharing case. + */ +static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) +{ + return a->vm_end == b->vm_start && + mpol_equal(vma_policy(a), vma_policy(b)) && + a->vm_file == b->vm_file && + !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) && + b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); +} + +/* + * Do some basic sanity checking to see if we can re-use the anon_vma + * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be + * the same as 'old', the other will be the new one that is trying + * to share the anon_vma. + * + * NOTE! This runs with mmap_lock held for reading, so it is possible that + * the anon_vma of 'old' is concurrently in the process of being set up + * by another page fault trying to merge _that_. But that's ok: if it + * is being set up, that automatically means that it will be a singleton + * acceptable for merging, so we can do all of this optimistically. But + * we do that READ_ONCE() to make sure that we never re-load the pointer. + * + * IOW: that the "list_is_singular()" test on the anon_vma_chain only + * matters for the 'stable anon_vma' case (ie the thing we want to avoid + * is to return an anon_vma that is "complex" due to having gone through + * a fork). + * + * We also make sure that the two vma's are compatible (adjacent, + * and with the same memory policies). That's all stable, even with just + * a read lock on the mmap_lock. + */ +static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, + struct vm_area_struct *a, + struct vm_area_struct *b) +{ + if (anon_vma_compatible(a, b)) { + struct anon_vma *anon_vma = READ_ONCE(old->anon_vma); + + if (anon_vma && list_is_singular(&old->anon_vma_chain)) + return anon_vma; + } + return NULL; +} + +/* + * find_mergeable_anon_vma is used by anon_vma_prepare, to check + * neighbouring vmas for a suitable anon_vma, before it goes off + * to allocate a new anon_vma. It checks because a repetitive + * sequence of mprotects and faults may otherwise lead to distinct + * anon_vmas being allocated, preventing vma merge in subsequent + * mprotect. + */ +struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) +{ + struct anon_vma *anon_vma = NULL; + struct vm_area_struct *prev, *next; + VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end); + + /* Try next first. */ + next = vma_iter_load(&vmi); + if (next) { + anon_vma = reusable_anon_vma(next, vma, next); + if (anon_vma) + return anon_vma; + } + + prev = vma_prev(&vmi); + VM_BUG_ON_VMA(prev != vma, vma); + prev = vma_prev(&vmi); + /* Try prev next. */ + if (prev) + anon_vma = reusable_anon_vma(prev, prev, vma); + + /* + * We might reach here with anon_vma == NULL if we can't find + * any reusable anon_vma. + * There's no absolute need to look only at touching neighbours: + * we could search further afield for "compatible" anon_vmas. + * But it would probably just be a waste of time searching, + * or lead to too many vmas hanging off the same anon_vma. + * We're trying to allow mprotect remerging later on, + * not trying to minimize memory used for anon_vmas. + */ + return anon_vma; +} + +static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops) +{ + return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite); +} + +static bool vma_is_shared_writable(struct vm_area_struct *vma) +{ + return (vma->vm_flags & (VM_WRITE | VM_SHARED)) == + (VM_WRITE | VM_SHARED); +} + +static bool vma_fs_can_writeback(struct vm_area_struct *vma) +{ + /* No managed pages to writeback. */ + if (vma->vm_flags & VM_PFNMAP) + return false; + + return vma->vm_file && vma->vm_file->f_mapping && + mapping_can_writeback(vma->vm_file->f_mapping); +} + +/* + * Does this VMA require the underlying folios to have their dirty state + * tracked? + */ +bool vma_needs_dirty_tracking(struct vm_area_struct *vma) +{ + /* Only shared, writable VMAs require dirty tracking. */ + if (!vma_is_shared_writable(vma)) + return false; + + /* Does the filesystem need to be notified? */ + if (vm_ops_needs_writenotify(vma->vm_ops)) + return true; + + /* + * Even if the filesystem doesn't indicate a need for writenotify, if it + * can writeback, dirty tracking is still required. + */ + return vma_fs_can_writeback(vma); +} + +/* + * Some shared mappings will want the pages marked read-only + * to track write events. If so, we'll downgrade vm_page_prot + * to the private version (using protection_map[] without the + * VM_SHARED bit). + */ +bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) +{ + /* If it was private or non-writable, the write bit is already clear */ + if (!vma_is_shared_writable(vma)) + return false; + + /* The backer wishes to know when pages are first written to? */ + if (vm_ops_needs_writenotify(vma->vm_ops)) + return true; + + /* The open routine did something to the protections that pgprot_modify + * won't preserve? */ + if (pgprot_val(vm_page_prot) != + pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags))) + return false; + + /* + * Do we need to track softdirty? hugetlb does not support softdirty + * tracking yet. + */ + if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) + return true; + + /* Do we need write faults for uffd-wp tracking? */ + if (userfaultfd_wp(vma)) + return true; + + /* Can the mapping track the dirty pages? */ + return vma_fs_can_writeback(vma); +} + +static DEFINE_MUTEX(mm_all_locks_mutex); + +static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) +{ + if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { + /* + * The LSB of head.next can't change from under us + * because we hold the mm_all_locks_mutex. + */ + down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock); + /* + * We can safely modify head.next after taking the + * anon_vma->root->rwsem. If some other vma in this mm shares + * the same anon_vma we won't take it again. + * + * No need of atomic instructions here, head.next + * can't change from under us thanks to the + * anon_vma->root->rwsem. + */ + if (__test_and_set_bit(0, (unsigned long *) + &anon_vma->root->rb_root.rb_root.rb_node)) + BUG(); + } +} + +static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) +{ + if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { + /* + * AS_MM_ALL_LOCKS can't change from under us because + * we hold the mm_all_locks_mutex. + * + * Operations on ->flags have to be atomic because + * even if AS_MM_ALL_LOCKS is stable thanks to the + * mm_all_locks_mutex, there may be other cpus + * changing other bitflags in parallel to us. + */ + if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) + BUG(); + down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock); + } +} + +/* + * This operation locks against the VM for all pte/vma/mm related + * operations that could ever happen on a certain mm. This includes + * vmtruncate, try_to_unmap, and all page faults. + * + * The caller must take the mmap_lock in write mode before calling + * mm_take_all_locks(). The caller isn't allowed to release the + * mmap_lock until mm_drop_all_locks() returns. + * + * mmap_lock in write mode is required in order to block all operations + * that could modify pagetables and free pages without need of + * altering the vma layout. It's also needed in write mode to avoid new + * anon_vmas to be associated with existing vmas. + * + * A single task can't take more than one mm_take_all_locks() in a row + * or it would deadlock. + * + * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in + * mapping->flags avoid to take the same lock twice, if more than one + * vma in this mm is backed by the same anon_vma or address_space. + * + * We take locks in following order, accordingly to comment at beginning + * of mm/rmap.c: + * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for + * hugetlb mapping); + * - all vmas marked locked + * - all i_mmap_rwsem locks; + * - all anon_vma->rwseml + * + * We can take all locks within these types randomly because the VM code + * doesn't nest them and we protected from parallel mm_take_all_locks() by + * mm_all_locks_mutex. + * + * mm_take_all_locks() and mm_drop_all_locks are expensive operations + * that may have to take thousand of locks. + * + * mm_take_all_locks() can fail if it's interrupted by signals. + */ +int mm_take_all_locks(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + struct anon_vma_chain *avc; + VMA_ITERATOR(vmi, mm, 0); + + mmap_assert_write_locked(mm); + + mutex_lock(&mm_all_locks_mutex); + + /* + * vma_start_write() does not have a complement in mm_drop_all_locks() + * because vma_start_write() is always asymmetrical; it marks a VMA as + * being written to until mmap_write_unlock() or mmap_write_downgrade() + * is reached. + */ + for_each_vma(vmi, vma) { + if (signal_pending(current)) + goto out_unlock; + vma_start_write(vma); + } + + vma_iter_init(&vmi, mm, 0); + for_each_vma(vmi, vma) { + if (signal_pending(current)) + goto out_unlock; + if (vma->vm_file && vma->vm_file->f_mapping && + is_vm_hugetlb_page(vma)) + vm_lock_mapping(mm, vma->vm_file->f_mapping); + } + + vma_iter_init(&vmi, mm, 0); + for_each_vma(vmi, vma) { + if (signal_pending(current)) + goto out_unlock; + if (vma->vm_file && vma->vm_file->f_mapping && + !is_vm_hugetlb_page(vma)) + vm_lock_mapping(mm, vma->vm_file->f_mapping); + } + + vma_iter_init(&vmi, mm, 0); + for_each_vma(vmi, vma) { + if (signal_pending(current)) + goto out_unlock; + if (vma->anon_vma) + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + vm_lock_anon_vma(mm, avc->anon_vma); + } + + return 0; + +out_unlock: + mm_drop_all_locks(mm); + return -EINTR; +} + +static void vm_unlock_anon_vma(struct anon_vma *anon_vma) +{ + if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { + /* + * The LSB of head.next can't change to 0 from under + * us because we hold the mm_all_locks_mutex. + * + * We must however clear the bitflag before unlocking + * the vma so the users using the anon_vma->rb_root will + * never see our bitflag. + * + * No need of atomic instructions here, head.next + * can't change from under us until we release the + * anon_vma->root->rwsem. + */ + if (!__test_and_clear_bit(0, (unsigned long *) + &anon_vma->root->rb_root.rb_root.rb_node)) + BUG(); + anon_vma_unlock_write(anon_vma); + } +} + +static void vm_unlock_mapping(struct address_space *mapping) +{ + if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { + /* + * AS_MM_ALL_LOCKS can't change to 0 from under us + * because we hold the mm_all_locks_mutex. + */ + i_mmap_unlock_write(mapping); + if (!test_and_clear_bit(AS_MM_ALL_LOCKS, + &mapping->flags)) + BUG(); + } +} + +/* + * The mmap_lock cannot be released by the caller until + * mm_drop_all_locks() returns. + */ +void mm_drop_all_locks(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + struct anon_vma_chain *avc; + VMA_ITERATOR(vmi, mm, 0); + + mmap_assert_write_locked(mm); + BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); + + for_each_vma(vmi, vma) { + if (vma->anon_vma) + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + vm_unlock_anon_vma(avc->anon_vma); + if (vma->vm_file && vma->vm_file->f_mapping) + vm_unlock_mapping(vma->vm_file->f_mapping); + } + + mutex_unlock(&mm_all_locks_mutex); +} diff --git a/mm/vma.h b/mm/vma.h new file mode 100644 index 000000000000..819f994cf727 --- /dev/null +++ b/mm/vma.h @@ -0,0 +1,558 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * vma.h + * + * Core VMA manipulation API implemented in vma.c. + */ +#ifndef __MM_VMA_H +#define __MM_VMA_H + +/* + * VMA lock generalization + */ +struct vma_prepare { + struct vm_area_struct *vma; + struct vm_area_struct *adj_next; + struct file *file; + struct address_space *mapping; + struct anon_vma *anon_vma; + struct vm_area_struct *insert; + struct vm_area_struct *remove; + struct vm_area_struct *remove2; +}; + +struct unlink_vma_file_batch { + int count; + struct vm_area_struct *vmas[8]; +}; + +/* + * vma munmap operation + */ +struct vma_munmap_struct { + struct vma_iterator *vmi; + struct vm_area_struct *vma; /* The first vma to munmap */ + struct vm_area_struct *prev; /* vma before the munmap area */ + struct vm_area_struct *next; /* vma after the munmap area */ + struct list_head *uf; /* Userfaultfd list_head */ + unsigned long start; /* Aligned start addr (inclusive) */ + unsigned long end; /* Aligned end addr (exclusive) */ + unsigned long unmap_start; /* Unmap PTE start */ + unsigned long unmap_end; /* Unmap PTE end */ + int vma_count; /* Number of vmas that will be removed */ + bool unlock; /* Unlock after the munmap */ + bool clear_ptes; /* If there are outstanding PTE to be cleared */ + bool closed_vm_ops; /* call_mmap() was encountered, so vmas may be closed */ + /* 1 byte hole */ + unsigned long nr_pages; /* Number of pages being removed */ + unsigned long locked_vm; /* Number of locked pages */ + unsigned long nr_accounted; /* Number of VM_ACCOUNT pages */ + unsigned long exec_vm; + unsigned long stack_vm; + unsigned long data_vm; +}; + +enum vma_merge_state { + VMA_MERGE_START, + VMA_MERGE_ERROR_NOMEM, + VMA_MERGE_NOMERGE, + VMA_MERGE_SUCCESS, +}; + +/* Represents a VMA merge operation. */ +struct vma_merge_struct { + struct mm_struct *mm; + struct vma_iterator *vmi; + pgoff_t pgoff; + struct vm_area_struct *prev; + struct vm_area_struct *next; /* Modified by vma_merge(). */ + struct vm_area_struct *vma; /* Either a new VMA or the one being modified. */ + unsigned long start; + unsigned long end; + unsigned long flags; + struct file *file; + struct anon_vma *anon_vma; + struct mempolicy *policy; + struct vm_userfaultfd_ctx uffd_ctx; + struct anon_vma_name *anon_name; + enum vma_merge_state state; +}; + +static inline bool vmg_nomem(struct vma_merge_struct *vmg) +{ + return vmg->state == VMA_MERGE_ERROR_NOMEM; +} + +/* Assumes addr >= vma->vm_start. */ +static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, + unsigned long addr) +{ + return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start); +} + +#define VMG_STATE(name, mm_, vmi_, start_, end_, flags_, pgoff_) \ + struct vma_merge_struct name = { \ + .mm = mm_, \ + .vmi = vmi_, \ + .start = start_, \ + .end = end_, \ + .flags = flags_, \ + .pgoff = pgoff_, \ + .state = VMA_MERGE_START, \ + } + +#define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_) \ + struct vma_merge_struct name = { \ + .mm = vma_->vm_mm, \ + .vmi = vmi_, \ + .prev = prev_, \ + .next = NULL, \ + .vma = vma_, \ + .start = start_, \ + .end = end_, \ + .flags = vma_->vm_flags, \ + .pgoff = vma_pgoff_offset(vma_, start_), \ + .file = vma_->vm_file, \ + .anon_vma = vma_->anon_vma, \ + .policy = vma_policy(vma_), \ + .uffd_ctx = vma_->vm_userfaultfd_ctx, \ + .anon_name = anon_vma_name(vma_), \ + .state = VMA_MERGE_START, \ + } + +#ifdef CONFIG_DEBUG_VM_MAPLE_TREE +void validate_mm(struct mm_struct *mm); +#else +#define validate_mm(mm) do { } while (0) +#endif + +/* Required for expand_downwards(). */ +void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma); + +/* Required for expand_downwards(). */ +void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma); + +int vma_expand(struct vma_merge_struct *vmg); +int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff); + +static inline int vma_iter_store_gfp(struct vma_iterator *vmi, + struct vm_area_struct *vma, gfp_t gfp) + +{ + if (vmi->mas.status != ma_start && + ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) + vma_iter_invalidate(vmi); + + __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); + mas_store_gfp(&vmi->mas, vma, gfp); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + return 0; +} + +#ifdef CONFIG_MMU +/* + * init_vma_munmap() - Initializer wrapper for vma_munmap_struct + * @vms: The vma munmap struct + * @vmi: The vma iterator + * @vma: The first vm_area_struct to munmap + * @start: The aligned start address to munmap + * @end: The aligned end address to munmap + * @uf: The userfaultfd list_head + * @unlock: Unlock after the operation. Only unlocked on success + */ +static inline void init_vma_munmap(struct vma_munmap_struct *vms, + struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, struct list_head *uf, + bool unlock) +{ + vms->vmi = vmi; + vms->vma = vma; + if (vma) { + vms->start = start; + vms->end = end; + } else { + vms->start = vms->end = 0; + } + vms->unlock = unlock; + vms->uf = uf; + vms->vma_count = 0; + vms->nr_pages = vms->locked_vm = vms->nr_accounted = 0; + vms->exec_vm = vms->stack_vm = vms->data_vm = 0; + vms->unmap_start = FIRST_USER_ADDRESS; + vms->unmap_end = USER_PGTABLES_CEILING; + vms->clear_ptes = false; + vms->closed_vm_ops = false; +} +#endif + +int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, + struct ma_state *mas_detach); + +void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, + struct ma_state *mas_detach); + +void vms_clean_up_area(struct vma_munmap_struct *vms, + struct ma_state *mas_detach); + +/* + * reattach_vmas() - Undo any munmap work and free resources + * @mas_detach: The maple state with the detached maple tree + * + * Reattach any detached vmas and free up the maple tree used to track the vmas. + */ +static inline void reattach_vmas(struct ma_state *mas_detach) +{ + struct vm_area_struct *vma; + + mas_set(mas_detach, 0); + mas_for_each(mas_detach, vma, ULONG_MAX) + vma_mark_detached(vma, false); + + __mt_destroy(mas_detach->tree); +} + +/* + * vms_abort_munmap_vmas() - Undo as much as possible from an aborted munmap() + * operation. + * @vms: The vma unmap structure + * @mas_detach: The maple state with the detached maple tree + * + * Reattach any detached vmas, free up the maple tree used to track the vmas. + * If that's not possible because the ptes are cleared (and vm_ops->closed() may + * have been called), then a NULL is written over the vmas and the vmas are + * removed (munmap() completed). + */ +static inline void vms_abort_munmap_vmas(struct vma_munmap_struct *vms, + struct ma_state *mas_detach) +{ + struct ma_state *mas = &vms->vmi->mas; + if (!vms->nr_pages) + return; + + if (vms->clear_ptes) + return reattach_vmas(mas_detach); + + /* + * Aborting cannot just call the vm_ops open() because they are often + * not symmetrical and state data has been lost. Resort to the old + * failure method of leaving a gap where the MAP_FIXED mapping failed. + */ + mas_set_range(mas, vms->start, vms->end - 1); + if (unlikely(mas_store_gfp(mas, NULL, GFP_KERNEL))) { + pr_warn_once("%s: (%d) Unable to abort munmap() operation\n", + current->comm, current->pid); + /* Leaving vmas detached and in-tree may hamper recovery */ + reattach_vmas(mas_detach); + } else { + /* Clean up the insertion of the unfortunate gap */ + vms_complete_munmap_vmas(vms, mas_detach); + } +} + +int +do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, + struct mm_struct *mm, unsigned long start, + unsigned long end, struct list_head *uf, bool unlock); + +int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, + unsigned long start, size_t len, struct list_head *uf, + bool unlock); + +void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed); + +void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, + struct vm_area_struct *prev, struct vm_area_struct *next); + +/* We are about to modify the VMA's flags. */ +struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, + struct vm_area_struct *prev, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long new_flags); + +/* We are about to modify the VMA's flags and/or anon_name. */ +struct vm_area_struct +*vma_modify_flags_name(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + unsigned long new_flags, + struct anon_vma_name *new_name); + +/* We are about to modify the VMA's memory policy. */ +struct vm_area_struct +*vma_modify_policy(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct mempolicy *new_pol); + +/* We are about to modify the VMA's flags and/or uffd context. */ +struct vm_area_struct +*vma_modify_flags_uffd(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long new_flags, + struct vm_userfaultfd_ctx new_ctx); + +struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg); + +struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, + struct vm_area_struct *vma, + unsigned long delta); + +void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb); + +void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb); + +void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, + struct vm_area_struct *vma); + +void unlink_file_vma(struct vm_area_struct *vma); + +void vma_link_file(struct vm_area_struct *vma); + +int vma_link(struct mm_struct *mm, struct vm_area_struct *vma); + +struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, + unsigned long addr, unsigned long len, pgoff_t pgoff, + bool *need_rmap_locks); + +struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma); + +bool vma_needs_dirty_tracking(struct vm_area_struct *vma); +bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); + +int mm_take_all_locks(struct mm_struct *mm); +void mm_drop_all_locks(struct mm_struct *mm); + +static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) +{ + /* + * We want to check manually if we can change individual PTEs writable + * if we can't do that automatically for all PTEs in a mapping. For + * private mappings, that's always the case when we have write + * permissions as we properly have to handle COW. + */ + if (vma->vm_flags & VM_SHARED) + return vma_wants_writenotify(vma, vma->vm_page_prot); + return !!(vma->vm_flags & VM_WRITE); +} + +#ifdef CONFIG_MMU +static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) +{ + return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); +} +#endif + +static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi, + unsigned long min) +{ + return mas_prev(&vmi->mas, min); +} + +/* + * These three helpers classifies VMAs for virtual memory accounting. + */ + +/* + * Executable code area - executable, not writable, not stack + */ +static inline bool is_exec_mapping(vm_flags_t flags) +{ + return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC; +} + +/* + * Stack area (including shadow stacks) + * + * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous: + * do_mmap() forbids all other combinations. + */ +static inline bool is_stack_mapping(vm_flags_t flags) +{ + return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK); +} + +/* + * Data area - private, writable, not stack + */ +static inline bool is_data_mapping(vm_flags_t flags) +{ + return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; +} + + +static inline void vma_iter_config(struct vma_iterator *vmi, + unsigned long index, unsigned long last) +{ + __mas_set_range(&vmi->mas, index, last - 1); +} + +static inline void vma_iter_reset(struct vma_iterator *vmi) +{ + mas_reset(&vmi->mas); +} + +static inline +struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min) +{ + return mas_prev_range(&vmi->mas, min); +} + +static inline +struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max) +{ + return mas_next_range(&vmi->mas, max); +} + +static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min, + unsigned long max, unsigned long size) +{ + return mas_empty_area(&vmi->mas, min, max - 1, size); +} + +static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min, + unsigned long max, unsigned long size) +{ + return mas_empty_area_rev(&vmi->mas, min, max - 1, size); +} + +/* + * VMA Iterator functions shared between nommu and mmap + */ +static inline int vma_iter_prealloc(struct vma_iterator *vmi, + struct vm_area_struct *vma) +{ + return mas_preallocate(&vmi->mas, vma, GFP_KERNEL); +} + +static inline void vma_iter_clear(struct vma_iterator *vmi) +{ + mas_store_prealloc(&vmi->mas, NULL); +} + +static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi) +{ + return mas_walk(&vmi->mas); +} + +/* Store a VMA with preallocated memory */ +static inline void vma_iter_store(struct vma_iterator *vmi, + struct vm_area_struct *vma) +{ + +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) + if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && + vmi->mas.index > vma->vm_start)) { + pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n", + vmi->mas.index, vma->vm_start, vma->vm_start, + vma->vm_end, vmi->mas.index, vmi->mas.last); + } + if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && + vmi->mas.last < vma->vm_start)) { + pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n", + vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end, + vmi->mas.index, vmi->mas.last); + } +#endif + + if (vmi->mas.status != ma_start && + ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) + vma_iter_invalidate(vmi); + + __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); + mas_store_prealloc(&vmi->mas, vma); +} + +static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) +{ + return vmi->mas.index; +} + +static inline unsigned long vma_iter_end(struct vma_iterator *vmi) +{ + return vmi->mas.last + 1; +} + +static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, + unsigned long count) +{ + return mas_expected_entries(&vmi->mas, count); +} + +static inline +struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi) +{ + return mas_prev_range(&vmi->mas, 0); +} + +/* + * Retrieve the next VMA and rewind the iterator to end of the previous VMA, or + * if no previous VMA, to index 0. + */ +static inline +struct vm_area_struct *vma_iter_next_rewind(struct vma_iterator *vmi, + struct vm_area_struct **pprev) +{ + struct vm_area_struct *next = vma_next(vmi); + struct vm_area_struct *prev = vma_prev(vmi); + + /* + * Consider the case where no previous VMA exists. We advance to the + * next VMA, skipping any gap, then rewind to the start of the range. + * + * If we were to unconditionally advance to the next range we'd wind up + * at the next VMA again, so we check to ensure there is a previous VMA + * to skip over. + */ + if (prev) + vma_iter_next_range(vmi); + + if (pprev) + *pprev = prev; + + return next; +} + +#ifdef CONFIG_64BIT + +static inline bool vma_is_sealed(struct vm_area_struct *vma) +{ + return (vma->vm_flags & VM_SEALED); +} + +/* + * check if a vma is sealed for modification. + * return true, if modification is allowed. + */ +static inline bool can_modify_vma(struct vm_area_struct *vma) +{ + if (unlikely(vma_is_sealed(vma))) + return false; + + return true; +} + +bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior); + +#else + +static inline bool can_modify_vma(struct vm_area_struct *vma) +{ + return true; +} + +static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior) +{ + return true; +} + +#endif + +#endif /* __MM_VMA_H */ diff --git a/mm/vma_internal.h b/mm/vma_internal.h new file mode 100644 index 000000000000..b930ab12a587 --- /dev/null +++ b/mm/vma_internal.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * vma_internal.h + * + * Headers required by vma.c, which can be substituted accordingly when testing + * VMA functionality. + */ + +#ifndef __MM_VMA_INTERNAL_H +#define __MM_VMA_INTERNAL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "internal.h" + +#endif /* __MM_VMA_INTERNAL_H */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a0df1e2e155a..634162271c00 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -105,7 +105,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (!pte) return -ENOMEM; do { - if (!pte_none(ptep_get(pte))) { + if (unlikely(!pte_none(ptep_get(pte)))) { if (pfn_valid(pfn)) { page = pfn_to_page(pfn); dump_page(page, "remapping already mapped page"); @@ -1940,7 +1940,7 @@ static inline void setup_vmalloc_vm(struct vm_struct *vm, { vm->flags = flags; vm->addr = (void *)va->va_start; - vm->size = va->va_end - va->va_start; + vm->size = va_size(va); vm->caller = caller; va->vm = vm; } @@ -2018,7 +2018,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, if (vm) { vm->addr = (void *)va->va_start; - vm->size = va->va_end - va->va_start; + vm->size = va_size(va); va->vm = vm; } @@ -2131,23 +2131,18 @@ reclaim_list_global(struct list_head *head) static void decay_va_pool_node(struct vmap_node *vn, bool full_decay) { + LIST_HEAD(decay_list); + struct rb_root decay_root = RB_ROOT; struct vmap_area *va, *nva; - struct list_head decay_list; - struct rb_root decay_root; unsigned long n_decay; int i; - decay_root = RB_ROOT; - INIT_LIST_HEAD(&decay_list); - for (i = 0; i < MAX_VA_SIZE_PAGES; i++) { - struct list_head tmp_list; + LIST_HEAD(tmp_list); if (list_empty(&vn->pool[i].head)) continue; - INIT_LIST_HEAD(&tmp_list); - /* Detach the pool, so no-one can access it. */ spin_lock(&vn->pool_lock); list_replace_init(&vn->pool[i].head, &tmp_list); @@ -2198,7 +2193,7 @@ static void purge_vmap_node(struct work_struct *work) vn->nr_purged = 0; list_for_each_entry_safe(va, n_va, &vn->purge_list, list) { - unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; + unsigned long nr = va_size(va) >> PAGE_SHIFT; unsigned long orig_start = va->va_start; unsigned long orig_end = va->va_end; unsigned int vn_id = decode_vn_id(va->flags); @@ -2344,8 +2339,8 @@ static void free_vmap_area_noflush(struct vmap_area *va) if (WARN_ON_ONCE(!list_empty(&va->list))) return; - nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> - PAGE_SHIFT, &vmap_lazy_nr); + nr_lazy = atomic_long_add_return(va_size(va) >> PAGE_SHIFT, + &vmap_lazy_nr); /* * If it was request by a certain node we would like to @@ -2941,8 +2936,7 @@ void vm_unmap_ram(const void *mem, unsigned int count) if (WARN_ON_ONCE(!va)) return; - debug_check_no_locks_freed((void *)va->va_start, - (va->va_end - va->va_start)); + debug_check_no_locks_freed((void *)va->va_start, va_size(va)); free_unmap_vmap_area(va); } EXPORT_SYMBOL(vm_unmap_ram); @@ -3518,8 +3512,6 @@ vm_area_alloc_pages(gfp_t gfp, int nid, unsigned int order, unsigned int nr_pages, struct page **pages) { unsigned int nr_allocated = 0; - gfp_t alloc_gfp = gfp; - bool nofail = gfp & __GFP_NOFAIL; struct page *page; int i; @@ -3530,9 +3522,6 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * more permissive. */ if (!order) { - /* bulk allocator doesn't support nofail req. officially */ - gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL; - while (nr_allocated < nr_pages) { unsigned int nr, nr_pages_request; @@ -3550,12 +3539,11 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * but mempolicy wants to alloc memory by interleaving. */ if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) - nr = alloc_pages_bulk_array_mempolicy_noprof(bulk_gfp, + nr = alloc_pages_bulk_array_mempolicy_noprof(gfp, nr_pages_request, pages + nr_allocated); - else - nr = alloc_pages_bulk_array_node_noprof(bulk_gfp, nid, + nr = alloc_pages_bulk_array_node_noprof(gfp, nid, nr_pages_request, pages + nr_allocated); @@ -3569,30 +3557,24 @@ vm_area_alloc_pages(gfp_t gfp, int nid, if (nr != nr_pages_request) break; } - } else if (gfp & __GFP_NOFAIL) { - /* - * Higher order nofail allocations are really expensive and - * potentially dangerous (pre-mature OOM, disruptive reclaim - * and compaction etc. - */ - alloc_gfp &= ~__GFP_NOFAIL; } /* High-order pages or fallback path if "bulk" fails. */ while (nr_allocated < nr_pages) { - if (!nofail && fatal_signal_pending(current)) + if (!(gfp & __GFP_NOFAIL) && fatal_signal_pending(current)) break; if (nid == NUMA_NO_NODE) - page = alloc_pages_noprof(alloc_gfp, order); + page = alloc_pages_noprof(gfp, order); else - page = alloc_pages_node_noprof(nid, alloc_gfp, order); + page = alloc_pages_node_noprof(nid, gfp, order); + if (unlikely(!page)) break; /* - * Higher order allocations must be able to be treated as - * indepdenent small pages by callers (as they can with + * High-order allocations must be able to be treated as + * independent small pages by callers (as they can with * small-page vmallocs). Some drivers do their own refcounting * on vmalloc_to_page() pages, some use page->mapping, * page->lru, etc. @@ -3653,7 +3635,16 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, set_vm_area_page_order(area, page_shift - PAGE_SHIFT); page_order = vm_area_page_order(area); - area->nr_pages = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN, + /* + * High-order nofail allocations are really expensive and + * potentially dangerous (pre-mature OOM, disruptive reclaim + * and compaction etc. + * + * Please note, the __vmalloc_node_range_noprof() falls-back + * to order-0 pages if high-order attempt is unsuccessful. + */ + area->nr_pages = vm_area_alloc_pages((page_order ? + gfp_mask & ~__GFP_NOFAIL : gfp_mask) | __GFP_NOWARN, node, page_order, nr_small_pages, area->pages); atomic_long_add(area->nr_pages, &nr_vmalloc_pages); @@ -4033,6 +4024,76 @@ void *vzalloc_node_noprof(unsigned long size, int node) } EXPORT_SYMBOL(vzalloc_node_noprof); +/** + * vrealloc - reallocate virtually contiguous memory; contents remain unchanged + * @p: object to reallocate memory for + * @size: the size to reallocate + * @flags: the flags for the page level allocator + * + * If @p is %NULL, vrealloc() behaves exactly like vmalloc(). If @size is 0 and + * @p is not a %NULL pointer, the object pointed to is freed. + * + * If __GFP_ZERO logic is requested, callers must ensure that, starting with the + * initial memory allocation, every subsequent call to this API for the same + * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that + * __GFP_ZERO is not fully honored by this API. + * + * In any case, the contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. + * + * This function must not be called concurrently with itself or vfree() for the + * same memory allocation. + * + * Return: pointer to the allocated memory; %NULL if @size is zero or in case of + * failure + */ +void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) +{ + size_t old_size = 0; + void *n; + + if (!size) { + vfree(p); + return NULL; + } + + if (p) { + struct vm_struct *vm; + + vm = find_vm_area(p); + if (unlikely(!vm)) { + WARN(1, "Trying to vrealloc() nonexistent vm area (%p)\n", p); + return NULL; + } + + old_size = get_vm_area_size(vm); + } + + /* + * TODO: Shrink the vm_area, i.e. unmap and free unused pages. What + * would be a good heuristic for when to shrink the vm_area? + */ + if (size <= old_size) { + /* Zero out spare memory. */ + if (want_init_on_alloc(flags)) + memset((void *)p + size, 0, old_size - size); + + return (void *)p; + } + + /* TODO: Grow the vm_area, i.e. allocate and map additional pages. */ + n = __vmalloc_noprof(size, flags); + if (!n) + return NULL; + + if (p) { + memcpy(n, p, old_size); + vfree(p); + } + + return n; +} + #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) @@ -4873,7 +4934,7 @@ static void show_purge_info(struct seq_file *m) list_for_each_entry(va, &vn->lazy.head, list) { seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", (void *)va->va_start, (void *)va->va_end, - va->va_end - va->va_start); + va_size(va)); } spin_unlock(&vn->lazy.lock); } @@ -4895,7 +4956,7 @@ static int vmalloc_info_show(struct seq_file *m, void *p) if (va->flags & VMAP_RAM) seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", (void *)va->va_start, (void *)va->va_end, - va->va_end - va->va_start); + va_size(va)); continue; } diff --git a/mm/vmscan.c b/mm/vmscan.c index a8d61a8b6894..749cdc110c74 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -628,7 +628,7 @@ typedef enum { * Calls ->writepage(). */ static pageout_t pageout(struct folio *folio, struct address_space *mapping, - struct swap_iocb **plug) + struct swap_iocb **plug, struct list_head *folio_list) { /* * If the folio is dirty, only perform writeback if that write @@ -676,6 +676,14 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping, .swap_plug = plug, }; + /* + * The large shmem folio can be split if CONFIG_THP_SWAP is + * not enabled or contiguous swap entries are failed to + * allocate. + */ + if (shmem_mapping(mapping) && folio_test_large(folio)) + wbc.list = folio_list; + folio_set_reclaim(folio); res = mapping->a_ops->writepage(&folio->page, &wbc); if (res < 0) @@ -863,7 +871,12 @@ static enum folio_references folio_check_references(struct folio *folio, if (vm_flags & VM_LOCKED) return FOLIOREF_ACTIVATE; - /* rmap lock contention: rotate */ + /* + * There are two cases to consider. + * 1) Rmap lock contention: rotate. + * 2) Skip the non-shared swapbacked folio mapped solely by + * the exiting or OOM-reaped process. + */ if (referenced_ptes == -1) return FOLIOREF_KEEP; @@ -1003,9 +1016,6 @@ static unsigned int demote_folio_list(struct list_head *demote_folios, (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION, &nr_succeeded); - mod_node_page_state(pgdat, PGDEMOTE_KSWAPD + reclaimer_offset(), - nr_succeeded); - return nr_succeeded; } @@ -1222,13 +1232,14 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, goto keep_locked; if (folio_test_large(folio)) { /* cannot split folio, skip it */ - if (!can_split_folio(folio, NULL)) + if (!can_split_folio(folio, 1, NULL)) goto activate_locked; /* * Split partially mapped folios right away. * We can free the unmapped pages without IO. */ - if (data_race(!list_empty(&folio->_deferred_list)) && + if (data_race(!list_empty(&folio->_deferred_list) && + folio_test_partially_mapped(folio)) && split_folio_to_list(folio, folio_list)) goto activate_locked; } @@ -1252,11 +1263,6 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, goto activate_locked_split; } } - } else if (folio_test_swapbacked(folio) && - folio_test_large(folio)) { - /* Split shmem folio */ - if (split_folio_to_list(folio, folio_list)) - goto keep_locked; } /* @@ -1357,12 +1363,25 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, * starts and then write it out here. */ try_to_unmap_flush_dirty(); - switch (pageout(folio, mapping, &plug)) { + switch (pageout(folio, mapping, &plug, folio_list)) { case PAGE_KEEP: goto keep_locked; case PAGE_ACTIVATE: + /* + * If shmem folio is split when writeback to swap, + * the tail pages will make their own pass through + * this function and be accounted then. + */ + if (nr_pages > 1 && !folio_test_large(folio)) { + sc->nr_scanned -= (nr_pages - 1); + nr_pages = 1; + } goto activate_locked; case PAGE_SUCCESS: + if (nr_pages > 1 && !folio_test_large(folio)) { + sc->nr_scanned -= (nr_pages - 1); + nr_pages = 1; + } stat->nr_pageout += nr_pages; if (folio_test_writeback(folio)) @@ -1495,7 +1514,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, /* 'folio_list' is always empty here */ /* Migrate folios selected for demotion */ - nr_reclaimed += demote_folio_list(&demote_folios, pgdat); + stat->nr_demoted = demote_folio_list(&demote_folios, pgdat); + nr_reclaimed += stat->nr_demoted; /* Folios that could not be demoted are still in @demote_folios */ if (!list_empty(&demote_folios)) { /* Folios which weren't demoted go back on @folio_list */ @@ -1941,6 +1961,8 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, spin_lock_irq(&lruvec->lru_lock); move_folios_to_lru(lruvec, &folio_list); + __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(), + stat.nr_demoted); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); item = PGSTEAL_KSWAPD + reclaimer_offset(); if (!cgroup_reclaim(sc)) @@ -2239,10 +2261,11 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc) target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); /* - * Flush the memory cgroup stats, so that we read accurate per-memcg - * lruvec stats for heuristics. + * Flush the memory cgroup stats in rate-limited way as we don't need + * most accurate stats here. We may switch to regular stats flushing + * in the future once it is cheap enough. */ - mem_cgroup_flush_stats(sc->target_mem_cgroup); + mem_cgroup_flush_stats_ratelimited(sc->target_mem_cgroup); /* * Determine the scan balance between anon and file LRUs. @@ -3456,7 +3479,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area goto next; if (!pmd_trans_huge(pmd[i])) { - if (should_clear_pmd_young()) + if (!walk->force_scan && should_clear_pmd_young()) pmdp_test_and_clear_young(vma, addr, pmd + i); goto next; } @@ -3543,7 +3566,7 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, walk->mm_stats[MM_NONLEAF_TOTAL]++; - if (should_clear_pmd_young()) { + if (!walk->force_scan && should_clear_pmd_young()) { if (!pmd_young(val)) continue; @@ -6644,7 +6667,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) continue; if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) - mark = wmark_pages(zone, WMARK_PROMO); + mark = promo_wmark_pages(zone); else mark = high_wmark_pages(zone); if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx)) @@ -7519,7 +7542,9 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) ret = __node_reclaim(pgdat, gfp_mask, order); clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags); - if (!ret) + if (ret) + count_vm_event(PGSCAN_ZONE_RECLAIM_SUCCESS); + else count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); return ret; diff --git a/mm/vmstat.c b/mm/vmstat.c index e875f2a4915f..b5a4cea423e1 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1314,6 +1314,7 @@ const char * const vmstat_text[] = { "pgsteal_file", #ifdef CONFIG_NUMA + "zone_reclaim_success", "zone_reclaim_failed", #endif "pginodesteal", @@ -1384,6 +1385,7 @@ const char * const vmstat_text[] = { "thp_split_page", "thp_split_page_failed", "thp_deferred_split_page", + "thp_underused_split_page", "thp_split_pmd", "thp_scan_exceed_none_pte", "thp_scan_exceed_swap_pte", @@ -1435,6 +1437,30 @@ const char * const vmstat_text[] = { "vma_lock_retry", "vma_lock_miss", #endif +#ifdef CONFIG_DEBUG_STACK_USAGE + "kstack_1k", +#if THREAD_SIZE > 1024 + "kstack_2k", +#endif +#if THREAD_SIZE > 2048 + "kstack_4k", +#endif +#if THREAD_SIZE > 4096 + "kstack_8k", +#endif +#if THREAD_SIZE > 8192 + "kstack_16k", +#endif +#if THREAD_SIZE > 16384 + "kstack_32k", +#endif +#if THREAD_SIZE > 32768 + "kstack_64k", +#endif +#if THREAD_SIZE > 65536 + "kstack_rest", +#endif +#endif #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */ @@ -1718,6 +1744,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n min %lu" "\n low %lu" "\n high %lu" + "\n promo %lu" "\n spanned %lu" "\n present %lu" "\n managed %lu" @@ -1727,6 +1754,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), + promo_wmark_pages(zone), zone->spanned_pages, zone->present_pages, zone_managed_pages(zone), diff --git a/mm/z3fold.c b/mm/z3fold.c index 2ebfed32871b..379d24b4fef9 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -144,7 +144,7 @@ struct z3fold_pool { const char *name; spinlock_t lock; spinlock_t stale_lock; - struct list_head *unbuddied; + struct list_head __percpu *unbuddied; struct list_head stale; atomic64_t pages_nr; struct kmem_cache *c_handle; diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index b572aa84823c..16a07def09c9 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -20,7 +20,7 @@ * page->index: links together all component pages of a zspage * For the huge page, this is always 0, so we use this field * to store handle. - * page->page_type: PG_zsmalloc, lower 16 bit locate the first object + * page->page_type: PGTY_zsmalloc, lower 24 bits locate the first object * offset in a subpage of a zspage * * Usage of struct page flags: @@ -463,13 +463,7 @@ static inline struct page *get_first_page(struct zspage *zspage) return first_page; } -#define FIRST_OBJ_PAGE_TYPE_MASK 0xffff - -static inline void reset_first_obj_offset(struct page *page) -{ - VM_WARN_ON_ONCE(!PageZsmalloc(page)); - page->page_type |= FIRST_OBJ_PAGE_TYPE_MASK; -} +#define FIRST_OBJ_PAGE_TYPE_MASK 0xffffff static inline unsigned int get_first_obj_offset(struct page *page) { @@ -479,8 +473,8 @@ static inline unsigned int get_first_obj_offset(struct page *page) static inline void set_first_obj_offset(struct page *page, unsigned int offset) { - /* With 16 bit available, we can support offsets into 64 KiB pages. */ - BUILD_BUG_ON(PAGE_SIZE > SZ_64K); + /* With 24 bits available, we can support offsets into 16 MiB pages. */ + BUILD_BUG_ON(PAGE_SIZE > SZ_16M); VM_WARN_ON_ONCE(!PageZsmalloc(page)); VM_WARN_ON_ONCE(offset & ~FIRST_OBJ_PAGE_TYPE_MASK); page->page_type &= ~FIRST_OBJ_PAGE_TYPE_MASK; @@ -819,7 +813,6 @@ static void reset_page(struct page *page) ClearPagePrivate(page); set_page_private(page, 0); page->index = 0; - reset_first_obj_offset(page); __ClearPageZsmalloc(page); } diff --git a/mm/zswap.c b/mm/zswap.c index adeaf9c97fde..449914ea9919 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -44,8 +44,6 @@ **********************************/ /* The number of compressed pages currently stored in zswap */ atomic_t zswap_stored_pages = ATOMIC_INIT(0); -/* The number of same-value filled pages currently stored in zswap */ -static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0); /* * The statistics below are not protected from concurrent access for @@ -185,8 +183,11 @@ static struct shrinker *zswap_shrinker; * * swpentry - associated swap entry, the offset indexes into the red-black tree * length - the length in bytes of the compressed page data. Needed during - * decompression. For a same value filled page length is 0, and both - * pool and lru are invalid and must be ignored. + * decompression. + * referenced - true if the entry recently entered the zswap pool. Unset by the + * writeback logic. The entry is only reclaimed by the writeback + * logic if referenced is unset. See comments in the shrinker + * section for context. * pool - the zswap_pool the entry's data is in * handle - zpool allocation handle that stores the compressed page data * value - value of the same-value filled pages which have same content @@ -196,11 +197,9 @@ static struct shrinker *zswap_shrinker; struct zswap_entry { swp_entry_t swpentry; unsigned int length; + bool referenced; struct zswap_pool *pool; - union { - unsigned long handle; - unsigned long value; - }; + unsigned long handle; struct obj_cgroup *objcg; struct list_head lru; }; @@ -700,11 +699,8 @@ static inline int entry_to_nid(struct zswap_entry *entry) static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) { - atomic_long_t *nr_zswap_protected; - unsigned long lru_size, old, new; int nid = entry_to_nid(entry); struct mem_cgroup *memcg; - struct lruvec *lruvec; /* * Note that it is safe to use rcu_read_lock() here, even in the face of @@ -722,19 +718,6 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) memcg = mem_cgroup_from_entry(entry); /* will always succeed */ list_lru_add(list_lru, &entry->lru, nid, memcg); - - /* Update the protection area */ - lru_size = list_lru_count_one(list_lru, nid, memcg); - lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); - nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected; - old = atomic_long_inc_return(nr_zswap_protected); - /* - * Decay to avoid overflow and adapt to changing workloads. - * This is based on LRU reclaim cost decaying heuristics. - */ - do { - new = old > lru_size / 4 ? old / 2 : old; - } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new)); rcu_read_unlock(); } @@ -752,7 +735,7 @@ static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) void zswap_lruvec_state_init(struct lruvec *lruvec) { - atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); + atomic_long_set(&lruvec->zswap_lruvec_state.nr_disk_swapins, 0); } void zswap_folio_swapin(struct folio *folio) @@ -761,16 +744,29 @@ void zswap_folio_swapin(struct folio *folio) if (folio) { lruvec = folio_lruvec(folio); - atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_disk_swapins); } } +/* + * This function should be called when a memcg is being offlined. + * + * Since the global shrinker shrink_worker() may hold a reference + * of the memcg, we must check and release the reference in + * zswap_next_shrink. + * + * shrink_worker() must handle the case where this function releases + * the reference of memcg being shrunk. + */ void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) { /* lock out zswap shrinker walking memcg tree */ spin_lock(&zswap_shrink_lock); - if (zswap_next_shrink == memcg) - zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); + if (zswap_next_shrink == memcg) { + do { + zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); + } while (zswap_next_shrink && !mem_cgroup_online(zswap_next_shrink)); + } spin_unlock(&zswap_shrink_lock); } @@ -799,13 +795,9 @@ static void zswap_entry_cache_free(struct zswap_entry *entry) */ static void zswap_entry_free(struct zswap_entry *entry) { - if (!entry->length) - atomic_dec(&zswap_same_filled_pages); - else { - zswap_lru_del(&zswap_list_lru, entry); - zpool_free(entry->pool->zpool, entry->handle); - zswap_pool_put(entry->pool); - } + zswap_lru_del(&zswap_list_lru, entry); + zpool_free(entry->pool->zpool, entry->handle); + zswap_pool_put(entry->pool); if (entry->objcg) { obj_cgroup_uncharge_zswap(entry->objcg, entry->length); obj_cgroup_put(entry->objcg); @@ -1082,6 +1074,28 @@ static int zswap_writeback_entry(struct zswap_entry *entry, /********************************* * shrinker functions **********************************/ +/* + * The dynamic shrinker is modulated by the following factors: + * + * 1. Each zswap entry has a referenced bit, which the shrinker unsets (giving + * the entry a second chance) before rotating it in the LRU list. If the + * entry is considered again by the shrinker, with its referenced bit unset, + * it is written back. The writeback rate as a result is dynamically + * adjusted by the pool activities - if the pool is dominated by new entries + * (i.e lots of recent zswapouts), these entries will be protected and + * the writeback rate will slow down. On the other hand, if the pool has a + * lot of stagnant entries, these entries will be reclaimed immediately, + * effectively increasing the writeback rate. + * + * 2. Swapins counter: If we observe swapins, it is a sign that we are + * overshrinking and should slow down. We maintain a swapins counter, which + * is consumed and subtract from the number of eligible objects on the LRU + * in zswap_shrinker_count(). + * + * 3. Compression ratio. The better the workload compresses, the less gains we + * can expect from writeback. We scale down the number of objects available + * for reclaim by this ratio. + */ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, spinlock_t *lock, void *arg) { @@ -1091,6 +1105,16 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o enum lru_status ret = LRU_REMOVED_RETRY; int writeback_result; + /* + * Second chance algorithm: if the entry has its referenced bit set, give it + * a second chance. Only clear the referenced bit and rotate it in the + * zswap's LRU list. + */ + if (entry->referenced) { + entry->referenced = false; + return LRU_ROTATE; + } + /* * As soon as we drop the LRU lock, the entry can be freed by * a concurrent invalidation. This means the following: @@ -1157,8 +1181,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) { - struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); - unsigned long shrink_ret, nr_protected, lru_size; + unsigned long shrink_ret; bool encountered_page_in_swapcache = false; if (!zswap_shrinker_enabled || @@ -1167,25 +1190,6 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, return SHRINK_STOP; } - nr_protected = - atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); - lru_size = list_lru_shrink_count(&zswap_list_lru, sc); - - /* - * Abort if we are shrinking into the protected region. - * - * This short-circuiting is necessary because if we have too many multiple - * concurrent reclaimers getting the freeable zswap object counts at the - * same time (before any of them made reasonable progress), the total - * number of reclaimed objects might be more than the number of unprotected - * objects (i.e the reclaimers will reclaim into the protected area of the - * zswap LRU). - */ - if (nr_protected >= lru_size - sc->nr_to_scan) { - sc->nr_scanned = 0; - return SHRINK_STOP; - } - shrink_ret = list_lru_shrink_walk(&zswap_list_lru, sc, &shrink_memcg_cb, &encountered_page_in_swapcache); @@ -1200,7 +1204,10 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, { struct mem_cgroup *memcg = sc->memcg; struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid)); - unsigned long nr_backing, nr_stored, nr_freeable, nr_protected; + atomic_long_t *nr_disk_swapins = + &lruvec->zswap_lruvec_state.nr_disk_swapins; + unsigned long nr_backing, nr_stored, nr_freeable, nr_disk_swapins_cur, + nr_remain; if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg)) return 0; @@ -1233,25 +1240,33 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, if (!nr_stored) return 0; - nr_protected = - atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); nr_freeable = list_lru_shrink_count(&zswap_list_lru, sc); + if (!nr_freeable) + return 0; + /* - * Subtract the lru size by an estimate of the number of pages - * that should be protected. + * Subtract from the lru size the number of pages that are recently swapped + * in from disk. The idea is that had we protect the zswap's LRU by this + * amount of pages, these disk swapins would not have happened. */ - nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0; + nr_disk_swapins_cur = atomic_long_read(nr_disk_swapins); + do { + if (nr_freeable >= nr_disk_swapins_cur) + nr_remain = 0; + else + nr_remain = nr_disk_swapins_cur - nr_freeable; + } while (!atomic_long_try_cmpxchg( + nr_disk_swapins, &nr_disk_swapins_cur, nr_remain)); + + nr_freeable -= nr_disk_swapins_cur - nr_remain; + if (!nr_freeable) + return 0; /* * Scale the number of freeable pages by the memory saving factor. * This ensures that the better zswap compresses memory, the fewer * pages we will evict to swap (as it will otherwise incur IO for * relatively small memory saving). - * - * The memory saving factor calculated here takes same-filled pages into - * account, but those are not freeable since they almost occupy no - * space. Hence, we may scale nr_freeable down a little bit more than we - * should if we have a lot of same-filled pages. */ return mult_frac(nr_freeable, nr_backing, nr_stored); } @@ -1274,10 +1289,10 @@ static struct shrinker *zswap_alloc_shrinker(void) static int shrink_memcg(struct mem_cgroup *memcg) { - int nid, shrunk = 0; + int nid, shrunk = 0, scanned = 0; if (!mem_cgroup_zswap_writeback_enabled(memcg)) - return -EINVAL; + return -ENOENT; /* * Skip zombies because their LRUs are reparented and we would be @@ -1291,63 +1306,94 @@ static int shrink_memcg(struct mem_cgroup *memcg) shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg, &shrink_memcg_cb, NULL, &nr_to_walk); + scanned += 1 - nr_to_walk; } + + if (!scanned) + return -ENOENT; + return shrunk ? 0 : -EAGAIN; } static void shrink_worker(struct work_struct *w) { struct mem_cgroup *memcg; - int ret, failures = 0; + int ret, failures = 0, attempts = 0; unsigned long thr; /* Reclaim down to the accept threshold */ thr = zswap_accept_thr_pages(); - /* global reclaim will select cgroup in a round-robin fashion. */ + /* + * Global reclaim will select cgroup in a round-robin fashion from all + * online memcgs, but memcgs that have no pages in zswap and + * writeback-disabled memcgs (memory.zswap.writeback=0) are not + * candidates for shrinking. + * + * Shrinking will be aborted if we encounter the following + * MAX_RECLAIM_RETRIES times: + * - No writeback-candidate memcgs found in a memcg tree walk. + * - Shrinking a writeback-candidate memcg failed. + * + * We save iteration cursor memcg into zswap_next_shrink, + * which can be modified by the offline memcg cleaner + * zswap_memcg_offline_cleanup(). + * + * Since the offline cleaner is called only once, we cannot leave an + * offline memcg reference in zswap_next_shrink. + * We can rely on the cleaner only if we get online memcg under lock. + * + * If we get an offline memcg, we cannot determine if the cleaner has + * already been called or will be called later. We must put back the + * reference before returning from this function. Otherwise, the + * offline memcg left in zswap_next_shrink will hold the reference + * until the next run of shrink_worker(). + */ do { - spin_lock(&zswap_shrink_lock); - zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); - memcg = zswap_next_shrink; - /* - * We need to retry if we have gone through a full round trip, or if we - * got an offline memcg (or else we risk undoing the effect of the - * zswap memcg offlining cleanup callback). This is not catastrophic - * per se, but it will keep the now offlined memcg hostage for a while. + * Start shrinking from the next memcg after zswap_next_shrink. + * When the offline cleaner has already advanced the cursor, + * advancing the cursor here overlooks one memcg, but this + * should be negligibly rare. * - * Note that if we got an online memcg, we will keep the extra - * reference in case the original reference obtained by mem_cgroup_iter - * is dropped by the zswap memcg offlining callback, ensuring that the - * memcg is not killed when we are reclaiming. + * If we get an online memcg, keep the extra reference in case + * the original one obtained by mem_cgroup_iter() is dropped by + * zswap_memcg_offline_cleanup() while we are shrinking the + * memcg. */ - if (!memcg) { - spin_unlock(&zswap_shrink_lock); - if (++failures == MAX_RECLAIM_RETRIES) - break; - - goto resched; - } - - if (!mem_cgroup_tryget_online(memcg)) { - /* drop the reference from mem_cgroup_iter() */ - mem_cgroup_iter_break(NULL, memcg); - zswap_next_shrink = NULL; - spin_unlock(&zswap_shrink_lock); - - if (++failures == MAX_RECLAIM_RETRIES) - break; - - goto resched; - } + spin_lock(&zswap_shrink_lock); + do { + memcg = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); + zswap_next_shrink = memcg; + } while (memcg && !mem_cgroup_tryget_online(memcg)); spin_unlock(&zswap_shrink_lock); + if (!memcg) { + /* + * Continue shrinking without incrementing failures if + * we found candidate memcgs in the last tree walk. + */ + if (!attempts && ++failures == MAX_RECLAIM_RETRIES) + break; + + attempts = 0; + goto resched; + } + ret = shrink_memcg(memcg); /* drop the extra reference */ mem_cgroup_put(memcg); - if (ret == -EINVAL) - break; + /* + * There are no writeback-candidate pages in the memcg. + * This is not an issue as long as we can find another memcg + * with pages in zswap. Skip this without incrementing attempts + * and failures. + */ + if (ret == -ENOENT) + continue; + ++attempts; + if (ret && ++failures == MAX_RECLAIM_RETRIES) break; resched: @@ -1355,42 +1401,6 @@ static void shrink_worker(struct work_struct *w) } while (zswap_total_pages() > thr); } -/********************************* -* same-filled functions -**********************************/ -static bool zswap_is_folio_same_filled(struct folio *folio, unsigned long *value) -{ - unsigned long *data; - unsigned long val; - unsigned int pos, last_pos = PAGE_SIZE / sizeof(*data) - 1; - bool ret = false; - - data = kmap_local_folio(folio, 0); - val = data[0]; - - if (val != data[last_pos]) - goto out; - - for (pos = 1; pos < last_pos; pos++) { - if (val != data[pos]) - goto out; - } - - *value = val; - ret = true; -out: - kunmap_local(data); - return ret; -} - -static void zswap_fill_folio(struct folio *folio, unsigned long value) -{ - unsigned long *data = kmap_local_folio(folio, 0); - - memset_l(data, value, PAGE_SIZE / sizeof(unsigned long)); - kunmap_local(data); -} - /********************************* * main API **********************************/ @@ -1402,7 +1412,6 @@ bool zswap_store(struct folio *folio) struct zswap_entry *entry, *old; struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; - unsigned long value; VM_WARN_ON_ONCE(!folio_test_locked(folio)); VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); @@ -1435,13 +1444,6 @@ bool zswap_store(struct folio *folio) goto reject; } - if (zswap_is_folio_same_filled(folio, &value)) { - entry->length = 0; - entry->value = value; - atomic_inc(&zswap_same_filled_pages); - goto store_entry; - } - /* if entry is successfully added, it keeps the reference */ entry->pool = zswap_pool_current_get(); if (!entry->pool) @@ -1459,9 +1461,9 @@ bool zswap_store(struct folio *folio) if (!zswap_compress(folio, entry)) goto put_pool; -store_entry: entry->swpentry = swp; entry->objcg = objcg; + entry->referenced = true; old = xa_store(tree, offset, entry, GFP_KERNEL); if (xa_is_err(old)) { @@ -1507,13 +1509,9 @@ bool zswap_store(struct folio *folio) return true; store_failed: - if (!entry->length) - atomic_dec(&zswap_same_filled_pages); - else { - zpool_free(entry->pool->zpool, entry->handle); + zpool_free(entry->pool->zpool, entry->handle); put_pool: - zswap_pool_put(entry->pool); - } + zswap_pool_put(entry->pool); freepage: zswap_entry_cache_free(entry); reject: @@ -1576,10 +1574,7 @@ bool zswap_load(struct folio *folio) if (!entry) return false; - if (entry->length) - zswap_decompress(entry, folio); - else - zswap_fill_folio(folio, entry->value); + zswap_decompress(entry, folio); count_vm_event(ZSWPIN); if (entry->objcg) @@ -1682,8 +1677,6 @@ static int zswap_debugfs_init(void) zswap_debugfs_root, NULL, &total_size_fops); debugfs_create_atomic_t("stored_pages", 0444, zswap_debugfs_root, &zswap_stored_pages); - debugfs_create_atomic_t("same_filled_pages", 0444, - zswap_debugfs_root, &zswap_same_filled_pages); return 0; } diff --git a/samples/kmemleak/kmemleak-test.c b/samples/kmemleak/kmemleak-test.c index f7470ed85a79..544c36d51d56 100644 --- a/samples/kmemleak/kmemleak-test.c +++ b/samples/kmemleak/kmemleak-test.c @@ -79,6 +79,8 @@ static int kmemleak_test_init(void) per_cpu(kmemleak_test_pointer, i)); } + pr_info("__alloc_percpu(64, 4) = %p\n", __alloc_percpu(64, 4)); + return 0; } module_init(kmemleak_test_init); diff --git a/scripts/Makefile.kasan b/scripts/Makefile.kasan index 390658a2d5b7..aab4154af00a 100644 --- a/scripts/Makefile.kasan +++ b/scripts/Makefile.kasan @@ -22,30 +22,31 @@ endif ifdef CONFIG_KASAN_GENERIC ifdef CONFIG_KASAN_INLINE + # When the number of memory accesses in a function is less than this + # call threshold number, the compiler will use inline instrumentation. + # 10000 is chosen offhand as a sufficiently large number to make all + # kernel functions to be instrumented inline. call_threshold := 10000 else call_threshold := 0 endif -CFLAGS_KASAN_MINIMAL := -fsanitize=kernel-address +# First, enable -fsanitize=kernel-address together with providing the shadow +# mapping offset, as for GCC, -fasan-shadow-offset fails without -fsanitize +# (GCC accepts the shadow mapping offset via -fasan-shadow-offset instead of +# a --param like the other KASAN parameters). +# Instead of ifdef-checking the compiler, rely on cc-option. +CFLAGS_KASAN := $(call cc-option, -fsanitize=kernel-address \ + -fasan-shadow-offset=$(KASAN_SHADOW_OFFSET), \ + $(call cc-option, -fsanitize=kernel-address \ + -mllvm -asan-mapping-offset=$(KASAN_SHADOW_OFFSET))) -# -fasan-shadow-offset fails without -fsanitize -CFLAGS_KASAN_SHADOW := $(call cc-option, -fsanitize=kernel-address \ - -fasan-shadow-offset=$(KASAN_SHADOW_OFFSET), \ - $(call cc-option, -fsanitize=kernel-address \ - -mllvm -asan-mapping-offset=$(KASAN_SHADOW_OFFSET))) - -ifeq ($(strip $(CFLAGS_KASAN_SHADOW)),) - CFLAGS_KASAN := $(CFLAGS_KASAN_MINIMAL) -else - # Now add all the compiler specific options that are valid standalone - CFLAGS_KASAN := $(CFLAGS_KASAN_SHADOW) \ - $(call cc-param,asan-globals=1) \ - $(call cc-param,asan-instrumentation-with-call-threshold=$(call_threshold)) \ - $(call cc-param,asan-instrument-allocas=1) -endif - -CFLAGS_KASAN += $(call cc-param,asan-stack=$(stack_enable)) +# Now, add other parameters enabled similarly in both GCC and Clang. +# As some of them are not supported by older compilers, use cc-param. +CFLAGS_KASAN += $(call cc-param,asan-instrumentation-with-call-threshold=$(call_threshold)) \ + $(call cc-param,asan-stack=$(stack_enable)) \ + $(call cc-param,asan-instrument-allocas=1) \ + $(call cc-param,asan-globals=1) # Instrument memcpy/memset/memmove calls by using instrumented __asan_mem*() # instead. With compilers that don't support this option, compiler-inserted @@ -57,9 +58,9 @@ endif # CONFIG_KASAN_GENERIC ifdef CONFIG_KASAN_SW_TAGS ifdef CONFIG_KASAN_INLINE - instrumentation_flags := $(call cc-param,hwasan-mapping-offset=$(KASAN_SHADOW_OFFSET)) + instrumentation_flags := $(call cc-param,hwasan-mapping-offset=$(KASAN_SHADOW_OFFSET)) else - instrumentation_flags := $(call cc-param,hwasan-instrument-with-calls=1) + instrumentation_flags := $(call cc-param,hwasan-instrument-with-calls=1) endif CFLAGS_KASAN := -fsanitize=kernel-hwaddress \ @@ -70,7 +71,7 @@ CFLAGS_KASAN := -fsanitize=kernel-hwaddress \ # Instrument memcpy/memset/memmove calls by using instrumented __hwasan_mem*(). ifeq ($(call clang-min-version, 150000)$(call gcc-min-version, 130000),y) -CFLAGS_KASAN += $(call cc-param,hwasan-kernel-mem-intrinsic-prefix=1) + CFLAGS_KASAN += $(call cc-param,hwasan-kernel-mem-intrinsic-prefix=1) endif endif # CONFIG_KASAN_SW_TAGS diff --git a/tools/mm/page-types.c b/tools/mm/page-types.c index 8d5595b6c59f..fa050d5a48cd 100644 --- a/tools/mm/page-types.c +++ b/tools/mm/page-types.c @@ -71,12 +71,12 @@ /* [32-] kernel hacking assistances */ #define KPF_RESERVED 32 #define KPF_MLOCKED 33 -#define KPF_MAPPEDTODISK 34 +#define KPF_OWNER_2 34 #define KPF_PRIVATE 35 #define KPF_PRIVATE_2 36 #define KPF_OWNER_PRIVATE 37 #define KPF_ARCH 38 -#define KPF_UNCACHED 39 +#define KPF_UNCACHED 39 /* unused */ #define KPF_SOFTDIRTY 40 #define KPF_ARCH_2 41 @@ -129,12 +129,11 @@ static const char * const page_flag_names[] = { [KPF_RESERVED] = "r:reserved", [KPF_MLOCKED] = "m:mlocked", - [KPF_MAPPEDTODISK] = "d:mappedtodisk", + [KPF_OWNER_2] = "d:owner_2", [KPF_PRIVATE] = "P:private", [KPF_PRIVATE_2] = "p:private_2", [KPF_OWNER_PRIVATE] = "O:owner_private", [KPF_ARCH] = "h:arch", - [KPF_UNCACHED] = "c:uncached", [KPF_SOFTDIRTY] = "f:softdirty", [KPF_ARCH_2] = "H:arch_2", @@ -472,9 +471,9 @@ static int bit_mask_ok(uint64_t flags) static uint64_t expand_overloaded_flags(uint64_t flags, uint64_t pme) { - /* Anonymous pages overload PG_mappedtodisk */ - if ((flags & BIT(ANON)) && (flags & BIT(MAPPEDTODISK))) - flags ^= BIT(MAPPEDTODISK) | BIT(ANON_EXCLUSIVE); + /* Anonymous pages use PG_owner_2 for anon_exclusive */ + if ((flags & BIT(ANON)) && (flags & BIT(OWNER_2))) + flags ^= BIT(OWNER_2) | BIT(ANON_EXCLUSIVE); /* SLUB overloads several page flags */ if (flags & BIT(SLAB)) { diff --git a/tools/testing/memblock/internal.h b/tools/testing/memblock/internal.h index f6c6e5474c3a..1cf82acb2a3e 100644 --- a/tools/testing/memblock/internal.h +++ b/tools/testing/memblock/internal.h @@ -20,7 +20,7 @@ void memblock_free_pages(struct page *page, unsigned long pfn, { } -static inline void accept_memory(phys_addr_t start, phys_addr_t end) +static inline void accept_memory(phys_addr_t start, unsigned long size) { } diff --git a/tools/testing/radix-tree/.gitignore b/tools/testing/radix-tree/.gitignore index 49bccb90c35b..ce167a761981 100644 --- a/tools/testing/radix-tree/.gitignore +++ b/tools/testing/radix-tree/.gitignore @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only +generated/autoconf.h generated/bit-length.h generated/map-shift.h idr.c diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile index d1acd7d58850..8b3591a51e1f 100644 --- a/tools/testing/radix-tree/Makefile +++ b/tools/testing/radix-tree/Makefile @@ -1,77 +1,29 @@ # SPDX-License-Identifier: GPL-2.0 -CFLAGS += -I. -I../../include -I../../../lib -g -Og -Wall \ - -D_LGPL_SOURCE -fsanitize=address -fsanitize=undefined -LDFLAGS += -fsanitize=address -fsanitize=undefined -LDLIBS+= -lpthread -lurcu +.PHONY: clean + TARGETS = main idr-test multiorder xarray maple -LIBS := slab.o find_bit.o bitmap.o hweight.o vsprintf.o -CORE_OFILES := xarray.o radix-tree.o idr.o linux.o test.o maple.o $(LIBS) -OFILES = main.o $(CORE_OFILES) regression1.o regression2.o regression3.o \ - regression4.o tag_check.o multiorder.o idr-test.o iteration_check.o \ - iteration_check_2.o benchmark.o - -ifndef SHIFT - SHIFT=3 -endif - -ifeq ($(BUILD), 32) - CFLAGS += -m32 - LDFLAGS += -m32 -LONG_BIT := 32 -endif - -ifndef LONG_BIT -LONG_BIT := $(shell getconf LONG_BIT) -endif +CORE_OFILES = $(SHARED_OFILES) xarray.o maple.o test.o +OFILES = main.o $(CORE_OFILES) regression1.o regression2.o \ + regression3.o regression4.o tag_check.o multiorder.o idr-test.o \ + iteration_check.o iteration_check_2.o benchmark.o targets: generated/map-shift.h generated/bit-length.h $(TARGETS) +include ../shared/shared.mk + main: $(OFILES) idr-test.o: ../../../lib/test_ida.c idr-test: idr-test.o $(CORE_OFILES) -xarray: $(CORE_OFILES) +xarray: $(CORE_OFILES) xarray.o -maple: $(CORE_OFILES) +maple: $(CORE_OFILES) maple.o multiorder: multiorder.o $(CORE_OFILES) clean: - $(RM) $(TARGETS) *.o radix-tree.c idr.c generated/map-shift.h generated/bit-length.h + $(RM) $(TARGETS) *.o radix-tree.c idr.c generated/* -vpath %.c ../../lib - -$(OFILES): Makefile *.h */*.h generated/map-shift.h generated/bit-length.h \ - ../../include/linux/*.h \ - ../../include/asm/*.h \ - ../../../include/linux/xarray.h \ - ../../../include/linux/maple_tree.h \ - ../../../include/linux/radix-tree.h \ - ../../../lib/radix-tree.h \ - ../../../include/linux/idr.h - -radix-tree.c: ../../../lib/radix-tree.c - sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@ - -idr.c: ../../../lib/idr.c - sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@ - -xarray.o: ../../../lib/xarray.c ../../../lib/test_xarray.c - -maple.o: ../../../lib/maple_tree.c ../../../lib/test_maple_tree.c - -generated/map-shift.h: - @if ! grep -qws $(SHIFT) generated/map-shift.h; then \ - echo "#define XA_CHUNK_SHIFT $(SHIFT)" > \ - generated/map-shift.h; \ - fi - -generated/bit-length.h: FORCE - @if ! grep -qws CONFIG_$(LONG_BIT)BIT generated/bit-length.h; then \ - echo "Generating $@"; \ - echo "#define CONFIG_$(LONG_BIT)BIT 1" > $@; \ - fi - -FORCE: ; +$(OFILES): $(SHARED_DEPS) *.h diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index cd1cf05503b4..c5b00aca9def 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -120,7 +120,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, mas.alloc->slot[0] == NULL); mas_push_node(&mas, mn); mas_reset(&mas); - mas_nomem(&mas, GFP_KERNEL); /* free */ + mas_destroy(&mas); mtree_unlock(mt); @@ -144,7 +144,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); mas.status = ma_start; - mas_nomem(&mas, GFP_KERNEL); + mas_destroy(&mas); /* Allocate 3 nodes, will fail. */ mas_node_count(&mas, 3); /* Drop the lock and allocate 3 nodes. */ @@ -161,7 +161,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, mas_allocated(&mas) != 3); /* Free. */ mas_reset(&mas); - mas_nomem(&mas, GFP_KERNEL); + mas_destroy(&mas); /* Set allocation request to 1. */ mas_set_alloc_req(&mas, 1); @@ -277,6 +277,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) } mas_reset(&mas); MT_BUG_ON(mt, mas_nomem(&mas, GFP_KERNEL)); + mas_destroy(&mas); } @@ -299,7 +300,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) } MT_BUG_ON(mt, mas_allocated(&mas) != total); mas_reset(&mas); - mas_nomem(&mas, GFP_KERNEL); /* Free. */ + mas_destroy(&mas); /* Free. */ MT_BUG_ON(mt, mas_allocated(&mas) != 0); for (i = 1; i < 128; i++) { @@ -35847,6 +35848,7 @@ static noinline void __init check_nomem(struct maple_tree *mt) mas_store(&ms, &ms); /* insert 1 -> &ms */ mas_nomem(&ms, GFP_KERNEL); /* Node allocated in here. */ mtree_unlock(mt); + mas_destroy(&ms); mtree_destroy(mt); } @@ -36224,6 +36226,97 @@ static noinline void __init check_mtree_dup(struct maple_tree *mt) extern void test_kmem_cache_bulk(void); +/* callback function used for check_nomem_writer_race() */ +static void writer2(void *maple_tree) +{ + struct maple_tree *mt = (struct maple_tree *)maple_tree; + MA_STATE(mas, mt, 6, 10); + + mtree_lock(mas.tree); + mas_store(&mas, xa_mk_value(0xC)); + mas_destroy(&mas); + mtree_unlock(mas.tree); +} + +/* + * check_nomem_writer_race() - test a possible race in the mas_nomem() path + * @mt: The tree to build. + * + * There is a possible race condition in low memory conditions when mas_nomem() + * gives up its lock. A second writer can chagne the entry that the primary + * writer executing the mas_nomem() path is modifying. This test recreates this + * scenario to ensure we are handling it correctly. + */ +static void check_nomem_writer_race(struct maple_tree *mt) +{ + MA_STATE(mas, mt, 0, 5); + + mt_set_non_kernel(0); + /* setup root with 2 values with NULL in between */ + mtree_store_range(mt, 0, 5, xa_mk_value(0xA), GFP_KERNEL); + mtree_store_range(mt, 6, 10, NULL, GFP_KERNEL); + mtree_store_range(mt, 11, 15, xa_mk_value(0xB), GFP_KERNEL); + + /* setup writer 2 that will trigger the race condition */ + mt_set_private(mt); + mt_set_callback(writer2); + + mtree_lock(mt); + /* erase 0-5 */ + mas_erase(&mas); + + /* index 6-10 should retain the value from writer 2 */ + check_load(mt, 6, xa_mk_value(0xC)); + mtree_unlock(mt); + + /* test for the same race but with mas_store_gfp() */ + mtree_store_range(mt, 0, 5, xa_mk_value(0xA), GFP_KERNEL); + mtree_store_range(mt, 6, 10, NULL, GFP_KERNEL); + + mas_set_range(&mas, 0, 5); + mtree_lock(mt); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + + /* ensure write made by writer 2 is retained */ + check_load(mt, 6, xa_mk_value(0xC)); + + mt_set_private(NULL); + mt_set_callback(NULL); + mtree_unlock(mt); +} + + /* test to simulate expanding a vma from [0x7fffffffe000, 0x7ffffffff000) + * to [0x7ffde4ca1000, 0x7ffffffff000) and then shrinking the vma to + * [0x7ffde4ca1000, 0x7ffde4ca2000) + */ +static inline int check_vma_modification(struct maple_tree *mt) +{ + MA_STATE(mas, mt, 0, 0); + + mtree_lock(mt); + /* vma with old start and old end */ + __mas_set_range(&mas, 0x7fffffffe000, 0x7ffffffff000 - 1); + mas_preallocate(&mas, xa_mk_value(1), GFP_KERNEL); + mas_store_prealloc(&mas, xa_mk_value(1)); + + /* next write occurs partly in previous range [0, 0x7fffffffe000)*/ + mas_prev_range(&mas, 0); + /* expand vma to {0x7ffde4ca1000, 0x7ffffffff000) */ + __mas_set_range(&mas, 0x7ffde4ca1000, 0x7ffffffff000 - 1); + mas_preallocate(&mas, xa_mk_value(1), GFP_KERNEL); + mas_store_prealloc(&mas, xa_mk_value(1)); + + /* shrink vma to [0x7ffde4ca1000, 7ffde4ca2000) */ + __mas_set_range(&mas, 0x7ffde4ca2000, 0x7ffffffff000 - 1); + mas_preallocate(&mas, NULL, GFP_KERNEL); + mas_store_prealloc(&mas, NULL); + mt_dump(mt, mt_dump_hex); + + mas_destroy(&mas); + mtree_unlock(mt); + return 0; +} + void farmer_tests(void) { struct maple_node *node; @@ -36231,6 +36324,10 @@ void farmer_tests(void) mt_dump(&tree, mt_dump_dec); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | MT_FLAGS_USE_RCU); + check_vma_modification(&tree); + mtree_destroy(&tree); + tree.ma_root = xa_mk_value(0); mt_dump(&tree, mt_dump_dec); @@ -36257,6 +36354,10 @@ void farmer_tests(void) check_dfs_preorder(&tree); mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_USE_RCU); + check_nomem_writer_race(&tree); + mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_prealloc(&tree); mtree_destroy(&tree); diff --git a/tools/testing/radix-tree/xarray.c b/tools/testing/radix-tree/xarray.c index d0e53bff1eb6..253208a8541b 100644 --- a/tools/testing/radix-tree/xarray.c +++ b/tools/testing/radix-tree/xarray.c @@ -4,17 +4,9 @@ * Copyright (c) 2018 Matthew Wilcox */ -#define XA_DEBUG +#include "xarray-shared.h" #include "test.h" -#define module_init(x) -#define module_exit(x) -#define MODULE_AUTHOR(x) -#define MODULE_DESCRIPTION(X) -#define MODULE_LICENSE(x) -#define dump_stack() assert(0) - -#include "../../../lib/xarray.c" #undef XA_DEBUG #include "../../../lib/test_xarray.c" diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c index 432db923bced..1e2d46636a0c 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.c +++ b/tools/testing/selftests/cgroup/cgroup_util.c @@ -141,6 +141,16 @@ long cg_read_long(const char *cgroup, const char *control) return atol(buf); } +long cg_read_long_fd(int fd) +{ + char buf[128]; + + if (pread(fd, buf, sizeof(buf), 0) <= 0) + return -1; + + return atol(buf); +} + long cg_read_key_long(const char *cgroup, const char *control, const char *key) { char buf[PAGE_SIZE]; @@ -183,6 +193,18 @@ int cg_write(const char *cgroup, const char *control, char *buf) return ret == len ? 0 : ret; } +/* + * Returns fd on success, or -1 on failure. + * (fd should be closed with close() as usual) + */ +int cg_open(const char *cgroup, const char *control, int flags) +{ + char path[PATH_MAX]; + + snprintf(path, sizeof(path), "%s/%s", cgroup, control); + return open(path, flags); +} + int cg_write_numeric(const char *cgroup, const char *control, long value) { char buf[64]; diff --git a/tools/testing/selftests/cgroup/cgroup_util.h b/tools/testing/selftests/cgroup/cgroup_util.h index e8d04ac9e3d2..19b131ee7707 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.h +++ b/tools/testing/selftests/cgroup/cgroup_util.h @@ -34,9 +34,11 @@ extern int cg_read_strcmp(const char *cgroup, const char *control, extern int cg_read_strstr(const char *cgroup, const char *control, const char *needle); extern long cg_read_long(const char *cgroup, const char *control); +extern long cg_read_long_fd(int fd); long cg_read_key_long(const char *cgroup, const char *control, const char *key); extern long cg_read_lc(const char *cgroup, const char *control); extern int cg_write(const char *cgroup, const char *control, char *buf); +extern int cg_open(const char *cgroup, const char *control, int flags); int cg_write_numeric(const char *cgroup, const char *control, long value); extern int cg_run(const char *cgroup, int (*fn)(const char *cgroup, void *arg), diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index 41ae8047b889..16f5d74ae762 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -161,13 +161,16 @@ static int alloc_pagecache_50M_check(const char *cgroup, void *arg) /* * This test create a memory cgroup, allocates * some anonymous memory and some pagecache - * and check memory.current and some memory.stat values. + * and checks memory.current, memory.peak, and some memory.stat values. */ -static int test_memcg_current(const char *root) +static int test_memcg_current_peak(const char *root) { int ret = KSFT_FAIL; - long current; + long current, peak, peak_reset; char *memcg; + bool fd2_closed = false, fd3_closed = false, fd4_closed = false; + int peak_fd = -1, peak_fd2 = -1, peak_fd3 = -1, peak_fd4 = -1; + struct stat ss; memcg = cg_name(root, "memcg_test"); if (!memcg) @@ -180,15 +183,124 @@ static int test_memcg_current(const char *root) if (current != 0) goto cleanup; + peak = cg_read_long(memcg, "memory.peak"); + if (peak != 0) + goto cleanup; + if (cg_run(memcg, alloc_anon_50M_check, NULL)) goto cleanup; + peak = cg_read_long(memcg, "memory.peak"); + if (peak < MB(50)) + goto cleanup; + + /* + * We'll open a few FDs for the same memory.peak file to exercise the free-path + * We need at least three to be closed in a different order than writes occurred to test + * the linked-list handling. + */ + peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC); + + if (peak_fd == -1) { + if (errno == ENOENT) + ret = KSFT_SKIP; + goto cleanup; + } + + /* + * Before we try to use memory.peak's fd, try to figure out whether + * this kernel supports writing to that file in the first place. (by + * checking the writable bit on the file's st_mode) + */ + if (fstat(peak_fd, &ss)) + goto cleanup; + + if ((ss.st_mode & S_IWUSR) == 0) { + ret = KSFT_SKIP; + goto cleanup; + } + + peak_fd2 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC); + + if (peak_fd2 == -1) + goto cleanup; + + peak_fd3 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC); + + if (peak_fd3 == -1) + goto cleanup; + + /* any non-empty string resets, but make it clear */ + static const char reset_string[] = "reset\n"; + + peak_reset = write(peak_fd, reset_string, sizeof(reset_string)); + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + peak_reset = write(peak_fd2, reset_string, sizeof(reset_string)); + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + peak_reset = write(peak_fd3, reset_string, sizeof(reset_string)); + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + /* Make sure a completely independent read isn't affected by our FD-local reset above*/ + peak = cg_read_long(memcg, "memory.peak"); + if (peak < MB(50)) + goto cleanup; + + fd2_closed = true; + if (close(peak_fd2)) + goto cleanup; + + peak_fd4 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC); + + if (peak_fd4 == -1) + goto cleanup; + + peak_reset = write(peak_fd4, reset_string, sizeof(reset_string)); + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + peak = cg_read_long_fd(peak_fd); + if (peak > MB(30) || peak < 0) + goto cleanup; + if (cg_run(memcg, alloc_pagecache_50M_check, NULL)) goto cleanup; + peak = cg_read_long(memcg, "memory.peak"); + if (peak < MB(50)) + goto cleanup; + + /* Make sure everything is back to normal */ + peak = cg_read_long_fd(peak_fd); + if (peak < MB(50)) + goto cleanup; + + peak = cg_read_long_fd(peak_fd4); + if (peak < MB(50)) + goto cleanup; + + fd3_closed = true; + if (close(peak_fd3)) + goto cleanup; + + fd4_closed = true; + if (close(peak_fd4)) + goto cleanup; + ret = KSFT_PASS; cleanup: + close(peak_fd); + if (!fd2_closed) + close(peak_fd2); + if (!fd3_closed) + close(peak_fd3); + if (!fd4_closed) + close(peak_fd4); cg_destroy(memcg); free(memcg); @@ -817,13 +929,19 @@ static int alloc_anon_50M_check_swap(const char *cgroup, void *arg) /* * This test checks that memory.swap.max limits the amount of - * anonymous memory which can be swapped out. + * anonymous memory which can be swapped out. Additionally, it verifies that + * memory.swap.peak reflects the high watermark and can be reset. */ -static int test_memcg_swap_max(const char *root) +static int test_memcg_swap_max_peak(const char *root) { int ret = KSFT_FAIL; char *memcg; - long max; + long max, peak; + struct stat ss; + int swap_peak_fd = -1, mem_peak_fd = -1; + + /* any non-empty string resets */ + static const char reset_string[] = "foobarbaz"; if (!is_swap_enabled()) return KSFT_SKIP; @@ -840,6 +958,61 @@ static int test_memcg_swap_max(const char *root) goto cleanup; } + swap_peak_fd = cg_open(memcg, "memory.swap.peak", + O_RDWR | O_APPEND | O_CLOEXEC); + + if (swap_peak_fd == -1) { + if (errno == ENOENT) + ret = KSFT_SKIP; + goto cleanup; + } + + /* + * Before we try to use memory.swap.peak's fd, try to figure out + * whether this kernel supports writing to that file in the first + * place. (by checking the writable bit on the file's st_mode) + */ + if (fstat(swap_peak_fd, &ss)) + goto cleanup; + + if ((ss.st_mode & S_IWUSR) == 0) { + ret = KSFT_SKIP; + goto cleanup; + } + + mem_peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC); + + if (mem_peak_fd == -1) + goto cleanup; + + if (cg_read_long(memcg, "memory.swap.peak")) + goto cleanup; + + if (cg_read_long_fd(swap_peak_fd)) + goto cleanup; + + /* switch the swap and mem fds into local-peak tracking mode*/ + int peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string)); + + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + if (cg_read_long_fd(swap_peak_fd)) + goto cleanup; + + if (cg_read_long(memcg, "memory.peak")) + goto cleanup; + + if (cg_read_long_fd(mem_peak_fd)) + goto cleanup; + + peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string)); + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + if (cg_read_long_fd(mem_peak_fd)) + goto cleanup; + if (cg_read_strcmp(memcg, "memory.max", "max\n")) goto cleanup; @@ -862,6 +1035,61 @@ static int test_memcg_swap_max(const char *root) if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1) goto cleanup; + peak = cg_read_long(memcg, "memory.peak"); + if (peak < MB(29)) + goto cleanup; + + peak = cg_read_long(memcg, "memory.swap.peak"); + if (peak < MB(29)) + goto cleanup; + + peak = cg_read_long_fd(mem_peak_fd); + if (peak < MB(29)) + goto cleanup; + + peak = cg_read_long_fd(swap_peak_fd); + if (peak < MB(29)) + goto cleanup; + + /* + * open, reset and close the peak swap on another FD to make sure + * multiple extant fds don't corrupt the linked-list + */ + peak_reset = cg_write(memcg, "memory.swap.peak", (char *)reset_string); + if (peak_reset) + goto cleanup; + + peak_reset = cg_write(memcg, "memory.peak", (char *)reset_string); + if (peak_reset) + goto cleanup; + + /* actually reset on the fds */ + peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string)); + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string)); + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + peak = cg_read_long_fd(swap_peak_fd); + if (peak > MB(10)) + goto cleanup; + + /* + * The cgroup is now empty, but there may be a page or two associated + * with the open FD accounted to it. + */ + peak = cg_read_long_fd(mem_peak_fd); + if (peak > MB(1)) + goto cleanup; + + if (cg_read_long(memcg, "memory.peak") < MB(29)) + goto cleanup; + + if (cg_read_long(memcg, "memory.swap.peak") < MB(29)) + goto cleanup; + if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30))) goto cleanup; @@ -869,9 +1097,29 @@ static int test_memcg_swap_max(const char *root) if (max <= 0) goto cleanup; + peak = cg_read_long(memcg, "memory.peak"); + if (peak < MB(29)) + goto cleanup; + + peak = cg_read_long(memcg, "memory.swap.peak"); + if (peak < MB(29)) + goto cleanup; + + peak = cg_read_long_fd(mem_peak_fd); + if (peak < MB(29)) + goto cleanup; + + peak = cg_read_long_fd(swap_peak_fd); + if (peak < MB(19)) + goto cleanup; + ret = KSFT_PASS; cleanup: + if (mem_peak_fd != -1 && close(mem_peak_fd)) + ret = KSFT_FAIL; + if (swap_peak_fd != -1 && close(swap_peak_fd)) + ret = KSFT_FAIL; cg_destroy(memcg); free(memcg); @@ -1295,7 +1543,7 @@ struct memcg_test { const char *name; } tests[] = { T(test_memcg_subtree_control), - T(test_memcg_current), + T(test_memcg_current_peak), T(test_memcg_min), T(test_memcg_low), T(test_memcg_high), @@ -1303,7 +1551,7 @@ struct memcg_test { T(test_memcg_max), T(test_memcg_reclaim), T(test_memcg_oom_events), - T(test_memcg_swap_max), + T(test_memcg_swap_max_peak), T(test_memcg_sock), T(test_memcg_oom_group_leaf_events), T(test_memcg_oom_group_parent_events), diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index 190096017f80..40de679248b8 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -263,15 +263,13 @@ static int test_zswapin(const char *root) static int attempt_writeback(const char *cgroup, void *arg) { long pagesize = sysconf(_SC_PAGESIZE); - char *test_group = arg; size_t memsize = MB(4); char buf[pagesize]; long zswap_usage; - bool wb_enabled; + bool wb_enabled = *(bool *) arg; int ret = -1; char *mem; - wb_enabled = cg_read_long(test_group, "memory.zswap.writeback"); mem = (char *)malloc(memsize); if (!mem) return ret; @@ -288,12 +286,12 @@ static int attempt_writeback(const char *cgroup, void *arg) memcpy(&mem[i], buf, pagesize); /* Try and reclaim allocated memory */ - if (cg_write_numeric(test_group, "memory.reclaim", memsize)) { + if (cg_write_numeric(cgroup, "memory.reclaim", memsize)) { ksft_print_msg("Failed to reclaim all of the requested memory\n"); goto out; } - zswap_usage = cg_read_long(test_group, "memory.zswap.current"); + zswap_usage = cg_read_long(cgroup, "memory.zswap.current"); /* zswpin */ for (int i = 0; i < memsize; i += pagesize) { @@ -303,7 +301,7 @@ static int attempt_writeback(const char *cgroup, void *arg) } } - if (cg_write_numeric(test_group, "memory.zswap.max", zswap_usage/2)) + if (cg_write_numeric(cgroup, "memory.zswap.max", zswap_usage/2)) goto out; /* @@ -312,7 +310,7 @@ static int attempt_writeback(const char *cgroup, void *arg) * If writeback is disabled, memory reclaim will fail as zswap is limited and * it can't writeback to swap. */ - ret = cg_write_numeric(test_group, "memory.reclaim", memsize); + ret = cg_write_numeric(cgroup, "memory.reclaim", memsize); if (!wb_enabled) ret = (ret == -EAGAIN) ? 0 : -1; @@ -321,12 +319,41 @@ static int attempt_writeback(const char *cgroup, void *arg) return ret; } +static int test_zswap_writeback_one(const char *cgroup, bool wb) +{ + long zswpwb_before, zswpwb_after; + + zswpwb_before = get_cg_wb_count(cgroup); + if (zswpwb_before != 0) { + ksft_print_msg("zswpwb_before = %ld instead of 0\n", zswpwb_before); + return -1; + } + + if (cg_run(cgroup, attempt_writeback, (void *) &wb)) + return -1; + + /* Verify that zswap writeback occurred only if writeback was enabled */ + zswpwb_after = get_cg_wb_count(cgroup); + if (zswpwb_after < 0) + return -1; + + if (wb != !!zswpwb_after) { + ksft_print_msg("zswpwb_after is %ld while wb is %s", + zswpwb_after, wb ? "enabled" : "disabled"); + return -1; + } + + return 0; +} + /* Test to verify the zswap writeback path */ static int test_zswap_writeback(const char *root, bool wb) { - long zswpwb_before, zswpwb_after; int ret = KSFT_FAIL; - char *test_group; + char *test_group, *test_group_child = NULL; + + if (cg_read_strcmp(root, "memory.zswap.writeback", "1")) + return KSFT_SKIP; test_group = cg_name(root, "zswap_writeback_test"); if (!test_group) @@ -336,29 +363,35 @@ static int test_zswap_writeback(const char *root, bool wb) if (cg_write(test_group, "memory.zswap.writeback", wb ? "1" : "0")) goto out; - zswpwb_before = get_cg_wb_count(test_group); - if (zswpwb_before != 0) { - ksft_print_msg("zswpwb_before = %ld instead of 0\n", zswpwb_before); - goto out; - } - - if (cg_run(test_group, attempt_writeback, (void *) test_group)) + if (test_zswap_writeback_one(test_group, wb)) goto out; - /* Verify that zswap writeback occurred only if writeback was enabled */ - zswpwb_after = get_cg_wb_count(test_group); - if (zswpwb_after < 0) + /* Reset memory.zswap.max to max (modified by attempt_writeback), and + * set up child cgroup, whose memory.zswap.writeback is hardcoded to 1. + * Thus, the parent's setting shall be what's in effect. */ + if (cg_write(test_group, "memory.zswap.max", "max")) + goto out; + if (cg_write(test_group, "cgroup.subtree_control", "+memory")) goto out; - if (wb != !!zswpwb_after) { - ksft_print_msg("zswpwb_after is %ld while wb is %s", - zswpwb_after, wb ? "enabled" : "disabled"); + test_group_child = cg_name(test_group, "zswap_writeback_test_child"); + if (!test_group_child) + goto out; + if (cg_create(test_group_child)) + goto out; + if (cg_write(test_group_child, "memory.zswap.writeback", "1")) + goto out; + + if (test_zswap_writeback_one(test_group_child, wb)) goto out; - } ret = KSFT_PASS; out: + if (test_group_child) { + cg_destroy(test_group_child); + free(test_group_child); + } cg_destroy(test_group); free(test_group); return ret; diff --git a/tools/testing/selftests/damon/.gitignore b/tools/testing/selftests/damon/.gitignore index e65ef9d9cedc..2ab675fecb6b 100644 --- a/tools/testing/selftests/damon/.gitignore +++ b/tools/testing/selftests/damon/.gitignore @@ -3,3 +3,4 @@ huge_count_read_write debugfs_target_ids_read_before_terminate_race debugfs_target_ids_pid_leak access_memory +access_memory_even diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 1e2e98cc809d..5b2a6a5dd1af 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -25,4 +25,6 @@ TEST_PROGS += debugfs_target_ids_pid_leak.sh TEST_PROGS += sysfs_update_removed_scheme_dir.sh TEST_PROGS += sysfs_update_schemes_tried_regions_hang.py +EXTRA_CLEAN = __pycache__ + include ../lib.mk diff --git a/tools/testing/selftests/damon/damon_nr_regions.py b/tools/testing/selftests/damon/damon_nr_regions.py old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/damon/damos_apply_interval.py b/tools/testing/selftests/damon/damos_apply_interval.py old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/damon/damos_quota.py b/tools/testing/selftests/damon/damos_quota.py old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/damon/damos_quota_goal.py b/tools/testing/selftests/damon/damos_quota_goal.py old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/damon/damos_tried_regions.py b/tools/testing/selftests/damon/damos_tried_regions.py old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh b/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh b/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_hang.py b/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_hang.py old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py b/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index e10b87376fde..02e1204971b0 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -113,7 +113,7 @@ endif endif -ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64 s390)) +ifneq (,$(filter $(ARCH),arm64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64 s390)) TEST_GEN_FILES += va_high_addr_switch TEST_GEN_FILES += virtual_address_range TEST_GEN_FILES += write_to_hugetlbfs diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh index d680c00d2853..67df7b47087f 100755 --- a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh +++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh @@ -254,7 +254,7 @@ function cleanup_hugetlb_memory() { local cgroup="$1" if [[ "$(pgrep -f write_to_hugetlbfs)" != "" ]]; then echo killing write_to_hugetlbfs - killall -2 write_to_hugetlbfs + killall -2 --wait write_to_hugetlbfs wait_for_hugetlb_memory_to_get_depleted $cgroup fi set -e diff --git a/tools/testing/selftests/mm/hugepage-mmap.c b/tools/testing/selftests/mm/hugepage-mmap.c index 267eea2e0e0b..3b1b532f1cbb 100644 --- a/tools/testing/selftests/mm/hugepage-mmap.c +++ b/tools/testing/selftests/mm/hugepage-mmap.c @@ -8,13 +8,6 @@ * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this * example, the app is requesting memory of size 256MB that is backed by * huge pages. - * - * For the ia64 architecture, the Linux kernel reserves Region number 4 for - * huge pages. That means that if one requires a fixed address, a huge page - * aligned address starting with 0x800000... will be required. If a fixed - * address is not required, the kernel will select an address in the proper - * range. - * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. */ #define _GNU_SOURCE #include @@ -27,15 +20,6 @@ #define LENGTH (256UL*1024*1024) #define PROTECTION (PROT_READ | PROT_WRITE) -/* Only ia64 requires this */ -#ifdef __ia64__ -#define ADDR (void *)(0x8000000000000000UL) -#define FLAGS (MAP_SHARED | MAP_FIXED) -#else -#define ADDR (void *)(0x0UL) -#define FLAGS (MAP_SHARED) -#endif - static void check_bytes(char *addr) { ksft_print_msg("First hex is %x\n", *((unsigned int *)addr)); @@ -74,7 +58,7 @@ int main(void) if (fd < 0) ksft_exit_fail_msg("memfd_create() failed: %s\n", strerror(errno)); - addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, fd, 0); + addr = mmap(NULL, LENGTH, PROTECTION, MAP_SHARED, fd, 0); if (addr == MAP_FAILED) { close(fd); ksft_exit_fail_msg("mmap(): %s\n", strerror(errno)); diff --git a/tools/testing/selftests/mm/hugepage-shm.c b/tools/testing/selftests/mm/hugepage-shm.c index 478bb1e989e9..ef06260802b5 100644 --- a/tools/testing/selftests/mm/hugepage-shm.c +++ b/tools/testing/selftests/mm/hugepage-shm.c @@ -8,13 +8,6 @@ * SHM_HUGETLB in the shmget system call to inform the kernel that it is * requesting huge pages. * - * For the ia64 architecture, the Linux kernel reserves Region number 4 for - * huge pages. That means that if one requires a fixed address, a huge page - * aligned address starting with 0x800000... will be required. If a fixed - * address is not required, the kernel will select an address in the proper - * range. - * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. - * * Note: The default shared memory limit is quite low on many kernels, * you may need to increase it via: * @@ -39,15 +32,6 @@ #define dprintf(x) printf(x) -/* Only ia64 requires this */ -#ifdef __ia64__ -#define ADDR (void *)(0x8000000000000000UL) -#define SHMAT_FLAGS (SHM_RND) -#else -#define ADDR (void *)(0x0UL) -#define SHMAT_FLAGS (0) -#endif - int main(void) { int shmid; @@ -61,7 +45,7 @@ int main(void) } printf("shmid: 0x%x\n", shmid); - shmaddr = shmat(shmid, ADDR, SHMAT_FLAGS); + shmaddr = shmat(shmid, NULL, 0); if (shmaddr == (char *)-1) { perror("Shared memory attach failure"); shmctl(shmid, IPC_RMID, NULL); diff --git a/tools/testing/selftests/mm/hugepage-vmemmap.c b/tools/testing/selftests/mm/hugepage-vmemmap.c index 894d28c3dd47..df366a4d1b92 100644 --- a/tools/testing/selftests/mm/hugepage-vmemmap.c +++ b/tools/testing/selftests/mm/hugepage-vmemmap.c @@ -22,20 +22,6 @@ #define PM_PFRAME_BITS 55 #define PM_PFRAME_MASK ~((1UL << PM_PFRAME_BITS) - 1) -/* - * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. - * That means the addresses starting with 0x800000... will need to be - * specified. Specifying a fixed address is not required on ppc64, i386 - * or x86_64. - */ -#ifdef __ia64__ -#define MAP_ADDR (void *)(0x8000000000000000UL) -#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) -#else -#define MAP_ADDR NULL -#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) -#endif - static size_t pagesize; static size_t maplength; @@ -113,7 +99,8 @@ int main(int argc, char **argv) exit(1); } - addr = mmap(MAP_ADDR, maplength, PROT_READ | PROT_WRITE, MAP_FLAGS, -1, 0); + addr = mmap(NULL, maplength, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); if (addr == MAP_FAILED) { perror("mmap"); exit(1); diff --git a/tools/testing/selftests/mm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c index 829320a519e7..56d4480e8d3c 100644 --- a/tools/testing/selftests/mm/khugepaged.c +++ b/tools/testing/selftests/mm/khugepaged.c @@ -1095,7 +1095,7 @@ static void usage(void) fprintf(stderr, "\n\tSupported Options:\n"); fprintf(stderr, "\t\t-h: This help message.\n"); fprintf(stderr, "\t\t-s: mTHP size, expressed as page order.\n"); - fprintf(stderr, "\t\t Defaults to 0. Use this size for anon allocations.\n"); + fprintf(stderr, "\t\t Defaults to 0. Use this size for anon or shmem allocations.\n"); exit(1); } @@ -1209,6 +1209,8 @@ int main(int argc, char **argv) default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8; default_settings.hugepages[hpage_pmd_order].enabled = THP_INHERIT; default_settings.hugepages[anon_order].enabled = THP_ALWAYS; + default_settings.shmem_hugepages[hpage_pmd_order].enabled = SHMEM_INHERIT; + default_settings.shmem_hugepages[anon_order].enabled = SHMEM_ALWAYS; save_settings(); thp_push_settings(&default_settings); diff --git a/tools/testing/selftests/mm/map_hugetlb.c b/tools/testing/selftests/mm/map_hugetlb.c index a1f005a90a4f..b47399feab53 100644 --- a/tools/testing/selftests/mm/map_hugetlb.c +++ b/tools/testing/selftests/mm/map_hugetlb.c @@ -4,11 +4,6 @@ * system call with MAP_HUGETLB flag. Before running this program make * sure the administrator has allocated enough default sized huge pages * to cover the 256 MB allocation. - * - * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. - * That means the addresses starting with 0x800000... will need to be - * specified. Specifying a fixed address is not required on ppc64, i386 - * or x86_64. */ #include #include @@ -21,15 +16,6 @@ #define LENGTH (256UL*1024*1024) #define PROTECTION (PROT_READ | PROT_WRITE) -/* Only ia64 requires this */ -#ifdef __ia64__ -#define ADDR (void *)(0x8000000000000000UL) -#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) -#else -#define ADDR (void *)(0x0UL) -#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) -#endif - static void check_bytes(char *addr) { ksft_print_msg("First hex is %x\n", *((unsigned int *)addr)); @@ -60,7 +46,7 @@ int main(int argc, char **argv) void *addr; size_t hugepage_size; size_t length = LENGTH; - int flags = FLAGS; + int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB; int shift = 0; hugepage_size = default_huge_page_size(); @@ -85,7 +71,7 @@ int main(int argc, char **argv) ksft_print_msg("Default size hugepages\n"); ksft_print_msg("Mapping %lu Mbytes\n", (unsigned long)length >> 20); - addr = mmap(ADDR, length, PROTECTION, flags, -1, 0); + addr = mmap(NULL, length, PROTECTION, flags, -1, 0); if (addr == MAP_FAILED) ksft_exit_fail_msg("mmap: %s\n", strerror(errno)); diff --git a/tools/testing/selftests/mm/migration.c b/tools/testing/selftests/mm/migration.c index 6908569ef406..64bcbb7151cf 100644 --- a/tools/testing/selftests/mm/migration.c +++ b/tools/testing/selftests/mm/migration.c @@ -15,10 +15,10 @@ #include #include -#define TWOMEG (2<<20) -#define RUNTIME (20) - -#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1))) +#define TWOMEG (2<<20) +#define RUNTIME (20) +#define MAX_RETRIES 100 +#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1))) FIXTURE(migration) { @@ -65,6 +65,7 @@ int migrate(uint64_t *ptr, int n1, int n2) int ret, tmp; int status = 0; struct timespec ts1, ts2; + int failures = 0; if (clock_gettime(CLOCK_MONOTONIC, &ts1)) return -1; @@ -79,13 +80,17 @@ int migrate(uint64_t *ptr, int n1, int n2) ret = move_pages(0, 1, (void **) &ptr, &n2, &status, MPOL_MF_MOVE_ALL); if (ret) { - if (ret > 0) + if (ret > 0) { + /* Migration is best effort; try again */ + if (++failures < MAX_RETRIES) + continue; printf("Didn't migrate %d pages\n", ret); + } else perror("Couldn't migrate pages"); return -2; } - + failures = 0; tmp = n2; n2 = n1; n1 = tmp; diff --git a/tools/testing/selftests/mm/mseal_test.c b/tools/testing/selftests/mm/mseal_test.c index bfcea5cf9a48..01675c412b2a 100644 --- a/tools/testing/selftests/mm/mseal_test.c +++ b/tools/testing/selftests/mm/mseal_test.c @@ -99,6 +99,16 @@ static int sys_madvise(void *start, size_t len, int types) return sret; } +static void *sys_mremap(void *addr, size_t old_len, size_t new_len, + unsigned long flags, void *new_addr) +{ + void *sret; + + errno = 0; + sret = (void *) syscall(__NR_mremap, addr, old_len, new_len, flags, new_addr); + return sret; +} + static int sys_pkey_alloc(unsigned long flags, unsigned long init_val) { int ret = syscall(__NR_pkey_alloc, flags, init_val); @@ -756,6 +766,42 @@ static void test_seal_mprotect_partial_mprotect(bool seal) REPORT_TEST_PASS(); } +static void test_seal_mprotect_partial_mprotect_tail(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 2 * page_size; + int ret; + int prot; + + /* + * Check if a partial mseal (that results in two vmas) works correctly. + * It might mprotect the first, but it'll never touch the second (msealed) vma. + */ + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = sys_mseal(ptr + page_size, page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + ret = sys_mprotect(ptr, size, PROT_EXEC); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + if (seal) { + FAIL_TEST_IF_FALSE(get_vma_size(ptr + page_size, &prot) > 0); + FAIL_TEST_IF_FALSE(prot == 0x4); + } + + REPORT_TEST_PASS(); +} + + static void test_seal_mprotect_two_vma_with_gap(bool seal) { void *ptr; @@ -973,6 +1019,36 @@ static void test_seal_munmap_vma_with_gap(bool seal) REPORT_TEST_PASS(); } +static void test_seal_munmap_partial_across_vmas(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 2 * page_size; + int ret; + int prot; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = sys_mseal(ptr + page_size, page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + if (seal) { + FAIL_TEST_IF_FALSE(get_vma_size(ptr + page_size, &prot) > 0); + FAIL_TEST_IF_FALSE(prot == 0x4); + } + + REPORT_TEST_PASS(); +} + static void test_munmap_start_freed(bool seal) { void *ptr; @@ -1104,12 +1180,12 @@ static void test_seal_mremap_shrink(bool seal) } /* shrink from 4 pages to 2 pages. */ - ret2 = mremap(ptr, size, 2 * page_size, 0, 0); + ret2 = sys_mremap(ptr, size, 2 * page_size, 0, 0); if (seal) { - FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(ret2 == (void *) MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); } else { - FAIL_TEST_IF_FALSE(ret2 != MAP_FAILED); + FAIL_TEST_IF_FALSE(ret2 != (void *) MAP_FAILED); } @@ -1136,7 +1212,7 @@ static void test_seal_mremap_expand(bool seal) } /* expand from 2 page to 4 pages. */ - ret2 = mremap(ptr, 2 * page_size, 4 * page_size, 0, 0); + ret2 = sys_mremap(ptr, 2 * page_size, 4 * page_size, 0, 0); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); @@ -1169,7 +1245,7 @@ static void test_seal_mremap_move(bool seal) } /* move from ptr to fixed address. */ - ret2 = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, newPtr); + ret2 = sys_mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, newPtr); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); @@ -1288,7 +1364,7 @@ static void test_seal_mremap_shrink_fixed(bool seal) } /* mremap to move and shrink to fixed address */ - ret2 = mremap(ptr, size, 2 * page_size, MREMAP_MAYMOVE | MREMAP_FIXED, + ret2 = sys_mremap(ptr, size, 2 * page_size, MREMAP_MAYMOVE | MREMAP_FIXED, newAddr); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); @@ -1319,7 +1395,7 @@ static void test_seal_mremap_expand_fixed(bool seal) } /* mremap to move and expand to fixed address */ - ret2 = mremap(ptr, page_size, size, MREMAP_MAYMOVE | MREMAP_FIXED, + ret2 = sys_mremap(ptr, page_size, size, MREMAP_MAYMOVE | MREMAP_FIXED, newAddr); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); @@ -1350,7 +1426,7 @@ static void test_seal_mremap_move_fixed(bool seal) } /* mremap to move to fixed address */ - ret2 = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, newAddr); + ret2 = sys_mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, newAddr); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); @@ -1379,14 +1455,13 @@ static void test_seal_mremap_move_fixed_zero(bool seal) /* * MREMAP_FIXED can move the mapping to zero address */ - ret2 = mremap(ptr, size, 2 * page_size, MREMAP_MAYMOVE | MREMAP_FIXED, + ret2 = sys_mremap(ptr, size, 2 * page_size, MREMAP_MAYMOVE | MREMAP_FIXED, 0); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); } else { FAIL_TEST_IF_FALSE(ret2 == 0); - } REPORT_TEST_PASS(); @@ -1409,13 +1484,13 @@ static void test_seal_mremap_move_dontunmap(bool seal) } /* mremap to move, and don't unmap src addr. */ - ret2 = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, 0); + ret2 = sys_mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, 0); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); } else { + /* kernel will allocate a new address */ FAIL_TEST_IF_FALSE(ret2 != MAP_FAILED); - } REPORT_TEST_PASS(); @@ -1423,7 +1498,7 @@ static void test_seal_mremap_move_dontunmap(bool seal) static void test_seal_mremap_move_dontunmap_anyaddr(bool seal) { - void *ptr; + void *ptr, *ptr2; unsigned long page_size = getpagesize(); unsigned long size = 4 * page_size; int ret; @@ -1438,24 +1513,30 @@ static void test_seal_mremap_move_dontunmap_anyaddr(bool seal) } /* - * The 0xdeaddead should not have effect on dest addr - * when MREMAP_DONTUNMAP is set. + * The new address is any address that not allocated. + * use allocate/free to similate that. */ - ret2 = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, - 0xdeaddead); + setup_single_address(size, &ptr2); + FAIL_TEST_IF_FALSE(ptr2 != (void *)-1); + ret = sys_munmap(ptr2, size); + FAIL_TEST_IF_FALSE(!ret); + + /* + * remap to any address. + */ + ret2 = sys_mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, + (void *) ptr2); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); } else { - FAIL_TEST_IF_FALSE(ret2 != MAP_FAILED); - FAIL_TEST_IF_FALSE((long)ret2 != 0xdeaddead); - + /* remap success and return ptr2 */ + FAIL_TEST_IF_FALSE(ret2 == ptr2); } REPORT_TEST_PASS(); } - static void test_seal_merge_and_split(void) { void *ptr; @@ -1720,6 +1801,69 @@ static void test_seal_discard_ro_anon(bool seal) REPORT_TEST_PASS(); } +static void test_seal_discard_across_vmas(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 2 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = seal_single_address(ptr + page_size, page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + ret = sys_madvise(ptr, size, MADV_DONTNEED); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + REPORT_TEST_PASS(); +} + + +static void test_seal_madvise_nodiscard(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = seal_single_address(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* + * Test a random madvise flag like MADV_RANDOM that does not touch page + * contents (and thus should work for msealed VMAs). RANDOM also happens to + * share bits with other discard-ish flags like REMOVE. + */ + ret = sys_madvise(ptr, size, MADV_RANDOM); + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + REPORT_TEST_PASS(); +} + int main(int argc, char **argv) { bool test_seal = seal_support(); @@ -1732,7 +1876,7 @@ int main(int argc, char **argv) if (!pkey_supported()) ksft_print_msg("PKEY not supported\n"); - ksft_set_plan(80); + ksft_set_plan(88); test_seal_addseal(); test_seal_unmapped_start(); @@ -1778,12 +1922,17 @@ int main(int argc, char **argv) test_seal_mprotect_split(false); test_seal_mprotect_split(true); + test_seal_mprotect_partial_mprotect_tail(false); + test_seal_mprotect_partial_mprotect_tail(true); + test_seal_munmap(false); test_seal_munmap(true); test_seal_munmap_two_vma(false); test_seal_munmap_two_vma(true); test_seal_munmap_vma_with_gap(false); test_seal_munmap_vma_with_gap(true); + test_seal_munmap_partial_across_vmas(false); + test_seal_munmap_partial_across_vmas(true); test_munmap_start_freed(false); test_munmap_start_freed(true); @@ -1811,8 +1960,12 @@ int main(int argc, char **argv) test_seal_mremap_move_fixed_zero(true); test_seal_mremap_move_dontunmap_anyaddr(false); test_seal_mremap_move_dontunmap_anyaddr(true); + test_seal_madvise_nodiscard(false); + test_seal_madvise_nodiscard(true); test_seal_discard_ro_anon(false); test_seal_discard_ro_anon(true); + test_seal_discard_across_vmas(false); + test_seal_discard_across_vmas(true); test_seal_discard_ro_anon_on_rw(false); test_seal_discard_ro_anon_on_rw(true); test_seal_discard_ro_anon_on_shared(false); diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 36045edb10de..c5797ad1d37b 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -189,7 +189,7 @@ else fi # filter 64bit architectures -ARCH64STR="arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sparc64 x86_64" +ARCH64STR="arm64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sparc64 x86_64" if [ -z "$ARCH" ]; then ARCH=$(uname -m 2>/dev/null | sed -e 's/aarch64.*/arm64/') fi diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index e5e8dafc9d94..eb6d1b9fc362 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -84,6 +84,76 @@ static void write_debugfs(const char *fmt, ...) write_file(SPLIT_DEBUGFS, input, ret + 1); } +static char *allocate_zero_filled_hugepage(size_t len) +{ + char *result; + size_t i; + + result = memalign(pmd_pagesize, len); + if (!result) { + printf("Fail to allocate memory\n"); + exit(EXIT_FAILURE); + } + + madvise(result, len, MADV_HUGEPAGE); + + for (i = 0; i < len; i++) + result[i] = (char)0; + + return result; +} + +static void verify_rss_anon_split_huge_page_all_zeroes(char *one_page, int nr_hpages, size_t len) +{ + unsigned long rss_anon_before, rss_anon_after; + size_t i; + + if (!check_huge_anon(one_page, 4, pmd_pagesize)) { + printf("No THP is allocated\n"); + exit(EXIT_FAILURE); + } + + rss_anon_before = rss_anon(); + if (!rss_anon_before) { + printf("No RssAnon is allocated before split\n"); + exit(EXIT_FAILURE); + } + + /* split all THPs */ + write_debugfs(PID_FMT, getpid(), (uint64_t)one_page, + (uint64_t)one_page + len, 0); + + for (i = 0; i < len; i++) + if (one_page[i] != (char)0) { + printf("%ld byte corrupted\n", i); + exit(EXIT_FAILURE); + } + + if (!check_huge_anon(one_page, 0, pmd_pagesize)) { + printf("Still AnonHugePages not split\n"); + exit(EXIT_FAILURE); + } + + rss_anon_after = rss_anon(); + if (rss_anon_after >= rss_anon_before) { + printf("Incorrect RssAnon value. Before: %ld After: %ld\n", + rss_anon_before, rss_anon_after); + exit(EXIT_FAILURE); + } +} + +void split_pmd_zero_pages(void) +{ + char *one_page; + int nr_hpages = 4; + size_t len = nr_hpages * pmd_pagesize; + + one_page = allocate_zero_filled_hugepage(len); + verify_rss_anon_split_huge_page_all_zeroes(one_page, nr_hpages, len); + printf("Split zero filled huge pages successful\n"); + free(one_page); +} + void split_pmd_thp(void) { char *one_page; @@ -431,6 +501,7 @@ int main(int argc, char **argv) fd_size = 2 * pmd_pagesize; + split_pmd_zero_pages(); split_pmd_thp(); split_pte_mapped_thp(); split_file_backed_thp(); diff --git a/tools/testing/selftests/mm/thp_settings.c b/tools/testing/selftests/mm/thp_settings.c index a4163438108e..577eaab6266f 100644 --- a/tools/testing/selftests/mm/thp_settings.c +++ b/tools/testing/selftests/mm/thp_settings.c @@ -33,10 +33,11 @@ static const char * const thp_defrag_strings[] = { }; static const char * const shmem_enabled_strings[] = { + "never", "always", "within_size", "advise", - "never", + "inherit", "deny", "force", NULL @@ -200,6 +201,7 @@ void thp_write_num(const char *name, unsigned long num) void thp_read_settings(struct thp_settings *settings) { unsigned long orders = thp_supported_orders(); + unsigned long shmem_orders = thp_shmem_supported_orders(); char path[PATH_MAX]; int i; @@ -234,12 +236,24 @@ void thp_read_settings(struct thp_settings *settings) settings->hugepages[i].enabled = thp_read_string(path, thp_enabled_strings); } + + for (i = 0; i < NR_ORDERS; i++) { + if (!((1 << i) & shmem_orders)) { + settings->shmem_hugepages[i].enabled = SHMEM_NEVER; + continue; + } + snprintf(path, PATH_MAX, "hugepages-%ukB/shmem_enabled", + (getpagesize() >> 10) << i); + settings->shmem_hugepages[i].enabled = + thp_read_string(path, shmem_enabled_strings); + } } void thp_write_settings(struct thp_settings *settings) { struct khugepaged_settings *khugepaged = &settings->khugepaged; unsigned long orders = thp_supported_orders(); + unsigned long shmem_orders = thp_shmem_supported_orders(); char path[PATH_MAX]; int enabled; int i; @@ -271,6 +285,15 @@ void thp_write_settings(struct thp_settings *settings) enabled = settings->hugepages[i].enabled; thp_write_string(path, thp_enabled_strings[enabled]); } + + for (i = 0; i < NR_ORDERS; i++) { + if (!((1 << i) & shmem_orders)) + continue; + snprintf(path, PATH_MAX, "hugepages-%ukB/shmem_enabled", + (getpagesize() >> 10) << i); + enabled = settings->shmem_hugepages[i].enabled; + thp_write_string(path, shmem_enabled_strings[enabled]); + } } struct thp_settings *thp_current_settings(void) @@ -324,17 +347,18 @@ void thp_set_read_ahead_path(char *path) dev_queue_read_ahead_path[sizeof(dev_queue_read_ahead_path) - 1] = '\0'; } -unsigned long thp_supported_orders(void) +static unsigned long __thp_supported_orders(bool is_shmem) { unsigned long orders = 0; char path[PATH_MAX]; char buf[256]; - int ret; - int i; + int ret, i; + char anon_dir[] = "enabled"; + char shmem_dir[] = "shmem_enabled"; for (i = 0; i < NR_ORDERS; i++) { - ret = snprintf(path, PATH_MAX, THP_SYSFS "hugepages-%ukB/enabled", - (getpagesize() >> 10) << i); + ret = snprintf(path, PATH_MAX, THP_SYSFS "hugepages-%ukB/%s", + (getpagesize() >> 10) << i, is_shmem ? shmem_dir : anon_dir); if (ret >= PATH_MAX) { printf("%s: Pathname is too long\n", __func__); exit(EXIT_FAILURE); @@ -347,3 +371,13 @@ unsigned long thp_supported_orders(void) return orders; } + +unsigned long thp_supported_orders(void) +{ + return __thp_supported_orders(false); +} + +unsigned long thp_shmem_supported_orders(void) +{ + return __thp_supported_orders(true); +} diff --git a/tools/testing/selftests/mm/thp_settings.h b/tools/testing/selftests/mm/thp_settings.h index 71cbff05f4c7..876235a23460 100644 --- a/tools/testing/selftests/mm/thp_settings.h +++ b/tools/testing/selftests/mm/thp_settings.h @@ -22,10 +22,11 @@ enum thp_defrag { }; enum shmem_enabled { + SHMEM_NEVER, SHMEM_ALWAYS, SHMEM_WITHIN_SIZE, SHMEM_ADVISE, - SHMEM_NEVER, + SHMEM_INHERIT, SHMEM_DENY, SHMEM_FORCE, }; @@ -46,6 +47,10 @@ struct khugepaged_settings { unsigned long pages_to_scan; }; +struct shmem_hugepages_settings { + enum shmem_enabled enabled; +}; + struct thp_settings { enum thp_enabled thp_enabled; enum thp_defrag thp_defrag; @@ -54,6 +59,7 @@ struct thp_settings { struct khugepaged_settings khugepaged; unsigned long read_ahead_kb; struct hugepages_settings hugepages[NR_ORDERS]; + struct shmem_hugepages_settings shmem_hugepages[NR_ORDERS]; }; int read_file(const char *path, char *buf, size_t buflen); @@ -76,5 +82,6 @@ void thp_save_settings(void); void thp_set_read_ahead_path(char *path); unsigned long thp_supported_orders(void); +unsigned long thp_shmem_supported_orders(void); #endif /* __THP_SETTINGS_H__ */ diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index 5a62530da3b5..d8d0cf04bb57 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -12,6 +12,7 @@ #define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" #define SMAP_FILE_PATH "/proc/self/smaps" +#define STATUS_FILE_PATH "/proc/self/status" #define MAX_LINE_LENGTH 500 unsigned int __page_size; @@ -171,6 +172,27 @@ uint64_t read_pmd_pagesize(void) return strtoul(buf, NULL, 10); } +unsigned long rss_anon(void) +{ + unsigned long rss_anon = 0; + FILE *fp; + char buffer[MAX_LINE_LENGTH]; + + fp = fopen(STATUS_FILE_PATH, "r"); + if (!fp) + ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, STATUS_FILE_PATH); + + if (!check_for_pattern(fp, "RssAnon:", buffer, sizeof(buffer))) + goto err_out; + + if (sscanf(buffer, "RssAnon:%10lu kB", &rss_anon) != 1) + ksft_exit_fail_msg("Reading status error\n"); + +err_out: + fclose(fp); + return rss_anon; +} + bool __check_huge(void *addr, char *pattern, int nr_hpages, uint64_t hpage_size) { diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index 9007c420d52c..2eaed8209925 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -39,6 +39,7 @@ unsigned long pagemap_get_pfn(int fd, char *start); void clear_softdirty(void); bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len); uint64_t read_pmd_pagesize(void); +unsigned long rss_anon(void); bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size); bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size); bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size); diff --git a/tools/testing/selftests/mm/write_to_hugetlbfs.c b/tools/testing/selftests/mm/write_to_hugetlbfs.c index 6a2caba19ee1..1289d311efd7 100644 --- a/tools/testing/selftests/mm/write_to_hugetlbfs.c +++ b/tools/testing/selftests/mm/write_to_hugetlbfs.c @@ -28,7 +28,7 @@ enum method { /* Global variables. */ static const char *self; -static char *shmaddr; +static int *shmaddr; static int shmid; /* @@ -47,15 +47,17 @@ void sig_handler(int signo) { printf("Received %d.\n", signo); if (signo == SIGINT) { - printf("Deleting the memory\n"); - if (shmdt((const void *)shmaddr) != 0) { - perror("Detach failure"); - shmctl(shmid, IPC_RMID, NULL); - exit(4); - } + if (shmaddr) { + printf("Deleting the memory\n"); + if (shmdt((const void *)shmaddr) != 0) { + perror("Detach failure"); + shmctl(shmid, IPC_RMID, NULL); + exit(4); + } - shmctl(shmid, IPC_RMID, NULL); - printf("Done deleting the memory\n"); + shmctl(shmid, IPC_RMID, NULL); + printf("Done deleting the memory\n"); + } } exit(2); } @@ -211,7 +213,8 @@ int main(int argc, char **argv) shmctl(shmid, IPC_RMID, NULL); exit(2); } - printf("shmaddr: %p\n", ptr); + shmaddr = ptr; + printf("shmaddr: %p\n", shmaddr); break; default: diff --git a/tools/testing/radix-tree/generated/autoconf.h b/tools/testing/shared/autoconf.h similarity index 100% rename from tools/testing/radix-tree/generated/autoconf.h rename to tools/testing/shared/autoconf.h diff --git a/tools/testing/radix-tree/linux.c b/tools/testing/shared/linux.c similarity index 91% rename from tools/testing/radix-tree/linux.c rename to tools/testing/shared/linux.c index 4eb442206d01..17263696b5d8 100644 --- a/tools/testing/radix-tree/linux.c +++ b/tools/testing/shared/linux.c @@ -26,8 +26,21 @@ struct kmem_cache { unsigned int non_kernel; unsigned long nr_allocated; unsigned long nr_tallocated; + bool exec_callback; + void (*callback)(void *); + void *private; }; +void kmem_cache_set_callback(struct kmem_cache *cachep, void (*callback)(void *)) +{ + cachep->callback = callback; +} + +void kmem_cache_set_private(struct kmem_cache *cachep, void *private) +{ + cachep->private = private; +} + void kmem_cache_set_non_kernel(struct kmem_cache *cachep, unsigned int val) { cachep->non_kernel = val; @@ -58,9 +71,17 @@ void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, { void *p; + if (cachep->exec_callback) { + if (cachep->callback) + cachep->callback(cachep->private); + cachep->exec_callback = false; + } + if (!(gfp & __GFP_DIRECT_RECLAIM)) { - if (!cachep->non_kernel) + if (!cachep->non_kernel) { + cachep->exec_callback = true; return NULL; + } cachep->non_kernel--; } @@ -223,6 +244,9 @@ kmem_cache_create(const char *name, unsigned int size, unsigned int align, ret->objs = NULL; ret->ctor = ctor; ret->non_kernel = 0; + ret->exec_callback = false; + ret->callback = NULL; + ret->private = NULL; return ret; } diff --git a/tools/testing/radix-tree/linux/bug.h b/tools/testing/shared/linux/bug.h similarity index 100% rename from tools/testing/radix-tree/linux/bug.h rename to tools/testing/shared/linux/bug.h diff --git a/tools/testing/radix-tree/linux/cpu.h b/tools/testing/shared/linux/cpu.h similarity index 100% rename from tools/testing/radix-tree/linux/cpu.h rename to tools/testing/shared/linux/cpu.h diff --git a/tools/testing/radix-tree/linux/idr.h b/tools/testing/shared/linux/idr.h similarity index 100% rename from tools/testing/radix-tree/linux/idr.h rename to tools/testing/shared/linux/idr.h diff --git a/tools/testing/radix-tree/linux/init.h b/tools/testing/shared/linux/init.h similarity index 100% rename from tools/testing/radix-tree/linux/init.h rename to tools/testing/shared/linux/init.h diff --git a/tools/testing/radix-tree/linux/kconfig.h b/tools/testing/shared/linux/kconfig.h similarity index 100% rename from tools/testing/radix-tree/linux/kconfig.h rename to tools/testing/shared/linux/kconfig.h diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/shared/linux/kernel.h similarity index 100% rename from tools/testing/radix-tree/linux/kernel.h rename to tools/testing/shared/linux/kernel.h diff --git a/tools/testing/radix-tree/linux/kmemleak.h b/tools/testing/shared/linux/kmemleak.h similarity index 100% rename from tools/testing/radix-tree/linux/kmemleak.h rename to tools/testing/shared/linux/kmemleak.h diff --git a/tools/testing/radix-tree/linux/local_lock.h b/tools/testing/shared/linux/local_lock.h similarity index 100% rename from tools/testing/radix-tree/linux/local_lock.h rename to tools/testing/shared/linux/local_lock.h diff --git a/tools/testing/radix-tree/linux/lockdep.h b/tools/testing/shared/linux/lockdep.h similarity index 100% rename from tools/testing/radix-tree/linux/lockdep.h rename to tools/testing/shared/linux/lockdep.h diff --git a/tools/testing/radix-tree/linux/maple_tree.h b/tools/testing/shared/linux/maple_tree.h similarity index 100% rename from tools/testing/radix-tree/linux/maple_tree.h rename to tools/testing/shared/linux/maple_tree.h diff --git a/tools/testing/radix-tree/linux/percpu.h b/tools/testing/shared/linux/percpu.h similarity index 100% rename from tools/testing/radix-tree/linux/percpu.h rename to tools/testing/shared/linux/percpu.h diff --git a/tools/testing/radix-tree/linux/preempt.h b/tools/testing/shared/linux/preempt.h similarity index 100% rename from tools/testing/radix-tree/linux/preempt.h rename to tools/testing/shared/linux/preempt.h diff --git a/tools/testing/radix-tree/linux/radix-tree.h b/tools/testing/shared/linux/radix-tree.h similarity index 100% rename from tools/testing/radix-tree/linux/radix-tree.h rename to tools/testing/shared/linux/radix-tree.h diff --git a/tools/testing/radix-tree/linux/rcupdate.h b/tools/testing/shared/linux/rcupdate.h similarity index 100% rename from tools/testing/radix-tree/linux/rcupdate.h rename to tools/testing/shared/linux/rcupdate.h diff --git a/tools/testing/radix-tree/linux/xarray.h b/tools/testing/shared/linux/xarray.h similarity index 100% rename from tools/testing/radix-tree/linux/xarray.h rename to tools/testing/shared/linux/xarray.h diff --git a/tools/testing/shared/maple-shared.h b/tools/testing/shared/maple-shared.h new file mode 100644 index 000000000000..3d847edd149d --- /dev/null +++ b/tools/testing/shared/maple-shared.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#define CONFIG_DEBUG_MAPLE_TREE +#define CONFIG_MAPLE_SEARCH +#define MAPLE_32BIT (MAPLE_NODE_SLOTS > 31) +#include "shared.h" +#include +#include +#include "linux/init.h" diff --git a/tools/testing/shared/maple-shim.c b/tools/testing/shared/maple-shim.c new file mode 100644 index 000000000000..640df76f483e --- /dev/null +++ b/tools/testing/shared/maple-shim.c @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* Very simple shim around the maple tree. */ + +#include "maple-shared.h" + +#include "../../../lib/maple_tree.c" diff --git a/tools/testing/shared/shared.h b/tools/testing/shared/shared.h new file mode 100644 index 000000000000..f08f683812ad --- /dev/null +++ b/tools/testing/shared/shared.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include +#include +#include + +#include +#include + +#ifndef module_init +#define module_init(x) +#endif + +#ifndef module_exit +#define module_exit(x) +#endif + +#ifndef MODULE_AUTHOR +#define MODULE_AUTHOR(x) +#endif + +#ifndef MODULE_LICENSE +#define MODULE_LICENSE(x) +#endif + +#ifndef MODULE_DESCRIPTION +#define MODULE_DESCRIPTION(x) +#endif + +#ifndef dump_stack +#define dump_stack() assert(0) +#endif diff --git a/tools/testing/shared/shared.mk b/tools/testing/shared/shared.mk new file mode 100644 index 000000000000..a05f0588513a --- /dev/null +++ b/tools/testing/shared/shared.mk @@ -0,0 +1,72 @@ +# SPDX-License-Identifier: GPL-2.0 + +CFLAGS += -I../shared -I. -I../../include -I../../../lib -g -Og -Wall \ + -D_LGPL_SOURCE -fsanitize=address -fsanitize=undefined +LDFLAGS += -fsanitize=address -fsanitize=undefined +LDLIBS += -lpthread -lurcu +LIBS := slab.o find_bit.o bitmap.o hweight.o vsprintf.o +SHARED_OFILES = xarray-shared.o radix-tree.o idr.o linux.o $(LIBS) + +SHARED_DEPS = Makefile ../shared/shared.mk ../shared/*.h generated/map-shift.h \ + generated/bit-length.h generated/autoconf.h \ + ../../include/linux/*.h \ + ../../include/asm/*.h \ + ../../../include/linux/xarray.h \ + ../../../include/linux/maple_tree.h \ + ../../../include/linux/radix-tree.h \ + ../../../lib/radix-tree.h \ + ../../../include/linux/idr.h + +ifndef SHIFT + SHIFT=3 +endif + +ifeq ($(BUILD), 32) + CFLAGS += -m32 + LDFLAGS += -m32 +LONG_BIT := 32 +endif + +ifndef LONG_BIT +LONG_BIT := $(shell getconf LONG_BIT) +endif + +%.o: ../shared/%.c + $(CC) -c $(CFLAGS) $< -o $@ + +vpath %.c ../../lib + +$(SHARED_OFILES): $(SHARED_DEPS) + +radix-tree.c: ../../../lib/radix-tree.c + sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@ + +idr.c: ../../../lib/idr.c + sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@ + +xarray-shared.o: ../shared/xarray-shared.c ../../../lib/xarray.c \ + ../../../lib/test_xarray.c + +maple-shared.o: ../shared/maple-shared.c ../../../lib/maple_tree.c \ + ../../../lib/test_maple_tree.c + +generated/autoconf.h: + @mkdir -p generated + cp ../shared/autoconf.h generated/autoconf.h + +generated/map-shift.h: + @mkdir -p generated + @if ! grep -qws $(SHIFT) generated/map-shift.h; then \ + echo "Generating $@"; \ + echo "#define XA_CHUNK_SHIFT $(SHIFT)" > \ + generated/map-shift.h; \ + fi + +generated/bit-length.h: FORCE + @mkdir -p generated + @if ! grep -qws CONFIG_$(LONG_BIT)BIT generated/bit-length.h; then \ + echo "Generating $@"; \ + echo "#define CONFIG_$(LONG_BIT)BIT 1" > $@; \ + fi + +FORCE: ; diff --git a/tools/testing/radix-tree/trace/events/maple_tree.h b/tools/testing/shared/trace/events/maple_tree.h similarity index 100% rename from tools/testing/radix-tree/trace/events/maple_tree.h rename to tools/testing/shared/trace/events/maple_tree.h diff --git a/tools/testing/shared/xarray-shared.c b/tools/testing/shared/xarray-shared.c new file mode 100644 index 000000000000..e90901958dcd --- /dev/null +++ b/tools/testing/shared/xarray-shared.c @@ -0,0 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "xarray-shared.h" + +#include "../../../lib/xarray.c" diff --git a/tools/testing/shared/xarray-shared.h b/tools/testing/shared/xarray-shared.h new file mode 100644 index 000000000000..ac2d16ff53ae --- /dev/null +++ b/tools/testing/shared/xarray-shared.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#define XA_DEBUG +#include "shared.h" diff --git a/tools/testing/vma/.gitignore b/tools/testing/vma/.gitignore new file mode 100644 index 000000000000..b003258eba79 --- /dev/null +++ b/tools/testing/vma/.gitignore @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +generated/bit-length.h +generated/map-shift.h +generated/autoconf.h +idr.c +radix-tree.c +vma diff --git a/tools/testing/vma/Makefile b/tools/testing/vma/Makefile new file mode 100644 index 000000000000..860fd2311dcc --- /dev/null +++ b/tools/testing/vma/Makefile @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +.PHONY: default clean + +default: vma + +include ../shared/shared.mk + +OFILES = $(SHARED_OFILES) vma.o maple-shim.o +TARGETS = vma + +vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma.h + +vma: $(OFILES) + $(CC) $(CFLAGS) -o $@ $(OFILES) $(LDLIBS) + +clean: + $(RM) $(TARGETS) *.o radix-tree.c idr.c generated/map-shift.h generated/bit-length.h generated/autoconf.h diff --git a/tools/testing/vma/linux/atomic.h b/tools/testing/vma/linux/atomic.h new file mode 100644 index 000000000000..e01f66f98982 --- /dev/null +++ b/tools/testing/vma/linux/atomic.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _LINUX_ATOMIC_H +#define _LINUX_ATOMIC_H + +#define atomic_t int32_t +#define atomic_inc(x) uatomic_inc(x) +#define atomic_read(x) uatomic_read(x) +#define atomic_set(x, y) do {} while (0) +#define U8_MAX UCHAR_MAX + +#endif /* _LINUX_ATOMIC_H */ diff --git a/tools/testing/vma/linux/mmzone.h b/tools/testing/vma/linux/mmzone.h new file mode 100644 index 000000000000..33cd1517f7a3 --- /dev/null +++ b/tools/testing/vma/linux/mmzone.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _LINUX_MMZONE_H +#define _LINUX_MMZONE_H + +#include + +struct pglist_data *first_online_pgdat(void); +struct pglist_data *next_online_pgdat(struct pglist_data *pgdat); + +#define for_each_online_pgdat(pgdat) \ + for (pgdat = first_online_pgdat(); \ + pgdat; \ + pgdat = next_online_pgdat(pgdat)) + +enum zone_type { + __MAX_NR_ZONES +}; + +#define MAX_NR_ZONES __MAX_NR_ZONES +#define MAX_PAGE_ORDER 10 +#define MAX_ORDER_NR_PAGES (1 << MAX_PAGE_ORDER) + +#define pageblock_order MAX_PAGE_ORDER +#define pageblock_nr_pages BIT(pageblock_order) +#define pageblock_align(pfn) ALIGN((pfn), pageblock_nr_pages) +#define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages) + +struct zone { + atomic_long_t managed_pages; +}; + +typedef struct pglist_data { + struct zone node_zones[MAX_NR_ZONES]; + +} pg_data_t; + +#endif /* _LINUX_MMZONE_H */ diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c new file mode 100644 index 000000000000..c53f220eb6cc --- /dev/null +++ b/tools/testing/vma/vma.c @@ -0,0 +1,1563 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include + +#include "maple-shared.h" +#include "vma_internal.h" + +/* Include so header guard set. */ +#include "../../../mm/vma.h" + +static bool fail_prealloc; + +/* Then override vma_iter_prealloc() so we can choose to fail it. */ +#define vma_iter_prealloc(vmi, vma) \ + (fail_prealloc ? -ENOMEM : mas_preallocate(&(vmi)->mas, (vma), GFP_KERNEL)) + +/* + * Directly import the VMA implementation here. Our vma_internal.h wrapper + * provides userland-equivalent functionality for everything vma.c uses. + */ +#include "../../../mm/vma.c" + +const struct vm_operations_struct vma_dummy_vm_ops; +static struct anon_vma dummy_anon_vma; + +#define ASSERT_TRUE(_expr) \ + do { \ + if (!(_expr)) { \ + fprintf(stderr, \ + "Assert FAILED at %s:%d:%s(): %s is FALSE.\n", \ + __FILE__, __LINE__, __FUNCTION__, #_expr); \ + return false; \ + } \ + } while (0) +#define ASSERT_FALSE(_expr) ASSERT_TRUE(!(_expr)) +#define ASSERT_EQ(_val1, _val2) ASSERT_TRUE((_val1) == (_val2)) +#define ASSERT_NE(_val1, _val2) ASSERT_TRUE((_val1) != (_val2)) + +static struct task_struct __current; + +struct task_struct *get_current(void) +{ + return &__current; +} + +/* Helper function to simply allocate a VMA. */ +static struct vm_area_struct *alloc_vma(struct mm_struct *mm, + unsigned long start, + unsigned long end, + pgoff_t pgoff, + vm_flags_t flags) +{ + struct vm_area_struct *ret = vm_area_alloc(mm); + + if (ret == NULL) + return NULL; + + ret->vm_start = start; + ret->vm_end = end; + ret->vm_pgoff = pgoff; + ret->__vm_flags = flags; + + return ret; +} + +/* Helper function to allocate a VMA and link it to the tree. */ +static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm, + unsigned long start, + unsigned long end, + pgoff_t pgoff, + vm_flags_t flags) +{ + struct vm_area_struct *vma = alloc_vma(mm, start, end, pgoff, flags); + + if (vma == NULL) + return NULL; + + if (vma_link(mm, vma)) { + vm_area_free(vma); + return NULL; + } + + /* + * Reset this counter which we use to track whether writes have + * begun. Linking to the tree will have caused this to be incremented, + * which means we will get a false positive otherwise. + */ + vma->vm_lock_seq = -1; + + return vma; +} + +/* Helper function which provides a wrapper around a merge new VMA operation. */ +static struct vm_area_struct *merge_new(struct vma_merge_struct *vmg) +{ + /* + * For convenience, get prev and next VMAs. Which the new VMA operation + * requires. + */ + vmg->next = vma_next(vmg->vmi); + vmg->prev = vma_prev(vmg->vmi); + vma_iter_next_range(vmg->vmi); + + return vma_merge_new_range(vmg); +} + +/* + * Helper function which provides a wrapper around a merge existing VMA + * operation. + */ +static struct vm_area_struct *merge_existing(struct vma_merge_struct *vmg) +{ + return vma_merge_existing_range(vmg); +} + +/* + * Helper function which provides a wrapper around the expansion of an existing + * VMA. + */ +static int expand_existing(struct vma_merge_struct *vmg) +{ + return vma_expand(vmg); +} + +/* + * Helper function to reset merge state the associated VMA iterator to a + * specified new range. + */ +static void vmg_set_range(struct vma_merge_struct *vmg, unsigned long start, + unsigned long end, pgoff_t pgoff, vm_flags_t flags) +{ + vma_iter_set(vmg->vmi, start); + + vmg->prev = NULL; + vmg->next = NULL; + vmg->vma = NULL; + + vmg->start = start; + vmg->end = end; + vmg->pgoff = pgoff; + vmg->flags = flags; +} + +/* + * Helper function to try to merge a new VMA. + * + * Update vmg and the iterator for it and try to merge, otherwise allocate a new + * VMA, link it to the maple tree and return it. + */ +static struct vm_area_struct *try_merge_new_vma(struct mm_struct *mm, + struct vma_merge_struct *vmg, + unsigned long start, unsigned long end, + pgoff_t pgoff, vm_flags_t flags, + bool *was_merged) +{ + struct vm_area_struct *merged; + + vmg_set_range(vmg, start, end, pgoff, flags); + + merged = merge_new(vmg); + if (merged) { + *was_merged = true; + ASSERT_EQ(vmg->state, VMA_MERGE_SUCCESS); + return merged; + } + + *was_merged = false; + + ASSERT_EQ(vmg->state, VMA_MERGE_NOMERGE); + + return alloc_and_link_vma(mm, start, end, pgoff, flags); +} + +/* + * Helper function to reset the dummy anon_vma to indicate it has not been + * duplicated. + */ +static void reset_dummy_anon_vma(void) +{ + dummy_anon_vma.was_cloned = false; + dummy_anon_vma.was_unlinked = false; +} + +/* + * Helper function to remove all VMAs and destroy the maple tree associated with + * a virtual address space. Returns a count of VMAs in the tree. + */ +static int cleanup_mm(struct mm_struct *mm, struct vma_iterator *vmi) +{ + struct vm_area_struct *vma; + int count = 0; + + fail_prealloc = false; + reset_dummy_anon_vma(); + + vma_iter_set(vmi, 0); + for_each_vma(*vmi, vma) { + vm_area_free(vma); + count++; + } + + mtree_destroy(&mm->mm_mt); + mm->map_count = 0; + return count; +} + +/* Helper function to determine if VMA has had vma_start_write() performed. */ +static bool vma_write_started(struct vm_area_struct *vma) +{ + int seq = vma->vm_lock_seq; + + /* We reset after each check. */ + vma->vm_lock_seq = -1; + + /* The vma_start_write() stub simply increments this value. */ + return seq > -1; +} + +/* Helper function providing a dummy vm_ops->close() method.*/ +static void dummy_close(struct vm_area_struct *) +{ +} + +static bool test_simple_merge(void) +{ + struct vm_area_struct *vma; + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + struct vm_area_struct *vma_left = alloc_vma(&mm, 0, 0x1000, 0, flags); + struct vm_area_struct *vma_right = alloc_vma(&mm, 0x2000, 0x3000, 2, flags); + VMA_ITERATOR(vmi, &mm, 0x1000); + struct vma_merge_struct vmg = { + .mm = &mm, + .vmi = &vmi, + .start = 0x1000, + .end = 0x2000, + .flags = flags, + .pgoff = 1, + }; + + ASSERT_FALSE(vma_link(&mm, vma_left)); + ASSERT_FALSE(vma_link(&mm, vma_right)); + + vma = merge_new(&vmg); + ASSERT_NE(vma, NULL); + + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0x3000); + ASSERT_EQ(vma->vm_pgoff, 0); + ASSERT_EQ(vma->vm_flags, flags); + + vm_area_free(vma); + mtree_destroy(&mm.mm_mt); + + return true; +} + +static bool test_simple_modify(void) +{ + struct vm_area_struct *vma; + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + struct vm_area_struct *init_vma = alloc_vma(&mm, 0, 0x3000, 0, flags); + VMA_ITERATOR(vmi, &mm, 0x1000); + + ASSERT_FALSE(vma_link(&mm, init_vma)); + + /* + * The flags will not be changed, the vma_modify_flags() function + * performs the merge/split only. + */ + vma = vma_modify_flags(&vmi, init_vma, init_vma, + 0x1000, 0x2000, VM_READ | VM_MAYREAD); + ASSERT_NE(vma, NULL); + /* We modify the provided VMA, and on split allocate new VMAs. */ + ASSERT_EQ(vma, init_vma); + + ASSERT_EQ(vma->vm_start, 0x1000); + ASSERT_EQ(vma->vm_end, 0x2000); + ASSERT_EQ(vma->vm_pgoff, 1); + + /* + * Now walk through the three split VMAs and make sure they are as + * expected. + */ + + vma_iter_set(&vmi, 0); + vma = vma_iter_load(&vmi); + + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0x1000); + ASSERT_EQ(vma->vm_pgoff, 0); + + vm_area_free(vma); + vma_iter_clear(&vmi); + + vma = vma_next(&vmi); + + ASSERT_EQ(vma->vm_start, 0x1000); + ASSERT_EQ(vma->vm_end, 0x2000); + ASSERT_EQ(vma->vm_pgoff, 1); + + vm_area_free(vma); + vma_iter_clear(&vmi); + + vma = vma_next(&vmi); + + ASSERT_EQ(vma->vm_start, 0x2000); + ASSERT_EQ(vma->vm_end, 0x3000); + ASSERT_EQ(vma->vm_pgoff, 2); + + vm_area_free(vma); + mtree_destroy(&mm.mm_mt); + + return true; +} + +static bool test_simple_expand(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x1000, 0, flags); + VMA_ITERATOR(vmi, &mm, 0); + struct vma_merge_struct vmg = { + .vmi = &vmi, + .vma = vma, + .start = 0, + .end = 0x3000, + .pgoff = 0, + }; + + ASSERT_FALSE(vma_link(&mm, vma)); + + ASSERT_FALSE(expand_existing(&vmg)); + + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0x3000); + ASSERT_EQ(vma->vm_pgoff, 0); + + vm_area_free(vma); + mtree_destroy(&mm.mm_mt); + + return true; +} + +static bool test_simple_shrink(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x3000, 0, flags); + VMA_ITERATOR(vmi, &mm, 0); + + ASSERT_FALSE(vma_link(&mm, vma)); + + ASSERT_FALSE(vma_shrink(&vmi, vma, 0, 0x1000, 0)); + + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0x1000); + ASSERT_EQ(vma->vm_pgoff, 0); + + vm_area_free(vma); + mtree_destroy(&mm.mm_mt); + + return true; +} + +static bool test_merge_new(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + VMA_ITERATOR(vmi, &mm, 0); + struct vma_merge_struct vmg = { + .mm = &mm, + .vmi = &vmi, + }; + struct anon_vma_chain dummy_anon_vma_chain_a = { + .anon_vma = &dummy_anon_vma, + }; + struct anon_vma_chain dummy_anon_vma_chain_b = { + .anon_vma = &dummy_anon_vma, + }; + struct anon_vma_chain dummy_anon_vma_chain_c = { + .anon_vma = &dummy_anon_vma, + }; + struct anon_vma_chain dummy_anon_vma_chain_d = { + .anon_vma = &dummy_anon_vma, + }; + const struct vm_operations_struct vm_ops = { + .close = dummy_close, + }; + int count; + struct vm_area_struct *vma, *vma_a, *vma_b, *vma_c, *vma_d; + bool merged; + + /* + * 0123456789abc + * AA B CC + */ + vma_a = alloc_and_link_vma(&mm, 0, 0x2000, 0, flags); + ASSERT_NE(vma_a, NULL); + /* We give each VMA a single avc so we can test anon_vma duplication. */ + INIT_LIST_HEAD(&vma_a->anon_vma_chain); + list_add(&dummy_anon_vma_chain_a.same_vma, &vma_a->anon_vma_chain); + + vma_b = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, flags); + ASSERT_NE(vma_b, NULL); + INIT_LIST_HEAD(&vma_b->anon_vma_chain); + list_add(&dummy_anon_vma_chain_b.same_vma, &vma_b->anon_vma_chain); + + vma_c = alloc_and_link_vma(&mm, 0xb000, 0xc000, 0xb, flags); + ASSERT_NE(vma_c, NULL); + INIT_LIST_HEAD(&vma_c->anon_vma_chain); + list_add(&dummy_anon_vma_chain_c.same_vma, &vma_c->anon_vma_chain); + + /* + * NO merge. + * + * 0123456789abc + * AA B ** CC + */ + vma_d = try_merge_new_vma(&mm, &vmg, 0x7000, 0x9000, 7, flags, &merged); + ASSERT_NE(vma_d, NULL); + INIT_LIST_HEAD(&vma_d->anon_vma_chain); + list_add(&dummy_anon_vma_chain_d.same_vma, &vma_d->anon_vma_chain); + ASSERT_FALSE(merged); + ASSERT_EQ(mm.map_count, 4); + + /* + * Merge BOTH sides. + * + * 0123456789abc + * AA*B DD CC + */ + vma_a->vm_ops = &vm_ops; /* This should have no impact. */ + vma_b->anon_vma = &dummy_anon_vma; + vma = try_merge_new_vma(&mm, &vmg, 0x2000, 0x3000, 2, flags, &merged); + ASSERT_EQ(vma, vma_a); + /* Merge with A, delete B. */ + ASSERT_TRUE(merged); + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0x4000); + ASSERT_EQ(vma->vm_pgoff, 0); + ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_EQ(mm.map_count, 3); + + /* + * Merge to PREVIOUS VMA. + * + * 0123456789abc + * AAAA* DD CC + */ + vma = try_merge_new_vma(&mm, &vmg, 0x4000, 0x5000, 4, flags, &merged); + ASSERT_EQ(vma, vma_a); + /* Extend A. */ + ASSERT_TRUE(merged); + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0x5000); + ASSERT_EQ(vma->vm_pgoff, 0); + ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_EQ(mm.map_count, 3); + + /* + * Merge to NEXT VMA. + * + * 0123456789abc + * AAAAA *DD CC + */ + vma_d->anon_vma = &dummy_anon_vma; + vma_d->vm_ops = &vm_ops; /* This should have no impact. */ + vma = try_merge_new_vma(&mm, &vmg, 0x6000, 0x7000, 6, flags, &merged); + ASSERT_EQ(vma, vma_d); + /* Prepend. */ + ASSERT_TRUE(merged); + ASSERT_EQ(vma->vm_start, 0x6000); + ASSERT_EQ(vma->vm_end, 0x9000); + ASSERT_EQ(vma->vm_pgoff, 6); + ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_EQ(mm.map_count, 3); + + /* + * Merge BOTH sides. + * + * 0123456789abc + * AAAAA*DDD CC + */ + vma_d->vm_ops = NULL; /* This would otherwise degrade the merge. */ + vma = try_merge_new_vma(&mm, &vmg, 0x5000, 0x6000, 5, flags, &merged); + ASSERT_EQ(vma, vma_a); + /* Merge with A, delete D. */ + ASSERT_TRUE(merged); + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0x9000); + ASSERT_EQ(vma->vm_pgoff, 0); + ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_EQ(mm.map_count, 2); + + /* + * Merge to NEXT VMA. + * + * 0123456789abc + * AAAAAAAAA *CC + */ + vma_c->anon_vma = &dummy_anon_vma; + vma = try_merge_new_vma(&mm, &vmg, 0xa000, 0xb000, 0xa, flags, &merged); + ASSERT_EQ(vma, vma_c); + /* Prepend C. */ + ASSERT_TRUE(merged); + ASSERT_EQ(vma->vm_start, 0xa000); + ASSERT_EQ(vma->vm_end, 0xc000); + ASSERT_EQ(vma->vm_pgoff, 0xa); + ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_EQ(mm.map_count, 2); + + /* + * Merge BOTH sides. + * + * 0123456789abc + * AAAAAAAAA*CCC + */ + vma = try_merge_new_vma(&mm, &vmg, 0x9000, 0xa000, 0x9, flags, &merged); + ASSERT_EQ(vma, vma_a); + /* Extend A and delete C. */ + ASSERT_TRUE(merged); + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0xc000); + ASSERT_EQ(vma->vm_pgoff, 0); + ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_EQ(mm.map_count, 1); + + /* + * Final state. + * + * 0123456789abc + * AAAAAAAAAAAAA + */ + + count = 0; + vma_iter_set(&vmi, 0); + for_each_vma(vmi, vma) { + ASSERT_NE(vma, NULL); + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0xc000); + ASSERT_EQ(vma->vm_pgoff, 0); + ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); + + vm_area_free(vma); + count++; + } + + /* Should only have one VMA left (though freed) after all is done.*/ + ASSERT_EQ(count, 1); + + mtree_destroy(&mm.mm_mt); + return true; +} + +static bool test_vma_merge_special_flags(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + VMA_ITERATOR(vmi, &mm, 0); + struct vma_merge_struct vmg = { + .mm = &mm, + .vmi = &vmi, + }; + vm_flags_t special_flags[] = { VM_IO, VM_DONTEXPAND, VM_PFNMAP, VM_MIXEDMAP }; + vm_flags_t all_special_flags = 0; + int i; + struct vm_area_struct *vma_left, *vma; + + /* Make sure there aren't new VM_SPECIAL flags. */ + for (i = 0; i < ARRAY_SIZE(special_flags); i++) { + all_special_flags |= special_flags[i]; + } + ASSERT_EQ(all_special_flags, VM_SPECIAL); + + /* + * 01234 + * AAA + */ + vma_left = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + ASSERT_NE(vma_left, NULL); + + /* 1. Set up new VMA with special flag that would otherwise merge. */ + + /* + * 01234 + * AAA* + * + * This should merge if not for the VM_SPECIAL flag. + */ + vmg_set_range(&vmg, 0x3000, 0x4000, 3, flags); + for (i = 0; i < ARRAY_SIZE(special_flags); i++) { + vm_flags_t special_flag = special_flags[i]; + + vma_left->__vm_flags = flags | special_flag; + vmg.flags = flags | special_flag; + vma = merge_new(&vmg); + ASSERT_EQ(vma, NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); + } + + /* 2. Modify VMA with special flag that would otherwise merge. */ + + /* + * 01234 + * AAAB + * + * Create a VMA to modify. + */ + vma = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, flags); + ASSERT_NE(vma, NULL); + vmg.vma = vma; + + for (i = 0; i < ARRAY_SIZE(special_flags); i++) { + vm_flags_t special_flag = special_flags[i]; + + vma_left->__vm_flags = flags | special_flag; + vmg.flags = flags | special_flag; + vma = merge_existing(&vmg); + ASSERT_EQ(vma, NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); + } + + cleanup_mm(&mm, &vmi); + return true; +} + +static bool test_vma_merge_with_close(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + VMA_ITERATOR(vmi, &mm, 0); + struct vma_merge_struct vmg = { + .mm = &mm, + .vmi = &vmi, + }; + const struct vm_operations_struct vm_ops = { + .close = dummy_close, + }; + struct vm_area_struct *vma_prev, *vma_next, *vma; + + /* + * When merging VMAs we are not permitted to remove any VMA that has a + * vm_ops->close() hook. + * + * Considering the two possible adjacent VMAs to which a VMA can be + * merged: + * + * [ prev ][ vma ][ next ] + * + * In no case will we need to delete prev. If the operation is + * mergeable, then prev will be extended with one or both of vma and + * next deleted. + * + * As a result, during initial mergeability checks, only + * can_vma_merge_before() (which implies the VMA being merged with is + * 'next' as shown above) bothers to check to see whether the next VMA + * has a vm_ops->close() callback that will need to be called when + * removed. + * + * If it does, then we cannot merge as the resources that the close() + * operation potentially clears down are tied only to the existing VMA + * range and we have no way of extending those to the nearly merged one. + * + * We must consider two scenarios: + * + * A. + * + * vm_ops->close: - - !NULL + * [ prev ][ vma ][ next ] + * + * Where prev may or may not be present/mergeable. + * + * This is picked up by a specific check in can_vma_merge_before(). + * + * B. + * + * vm_ops->close: - !NULL + * [ prev ][ vma ] + * + * Where prev and vma are present and mergeable. + * + * This is picked up by a specific check in the modified VMA merge. + * + * IMPORTANT NOTE: We make the assumption that the following case: + * + * - !NULL NULL + * [ prev ][ vma ][ next ] + * + * Cannot occur, because vma->vm_ops being the same implies the same + * vma->vm_file, and therefore this would mean that next->vm_ops->close + * would be set too, and thus scenario A would pick this up. + */ + + /* + * The only case of a new VMA merge that results in a VMA being deleted + * is one where both the previous and next VMAs are merged - in this + * instance the next VMA is deleted, and the previous VMA is extended. + * + * If we are unable to do so, we reduce the operation to simply + * extending the prev VMA and not merging next. + * + * 0123456789 + * PPP**NNNN + * -> + * 0123456789 + * PPPPPPNNN + */ + + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags); + vma_next->vm_ops = &vm_ops; + + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + ASSERT_EQ(merge_new(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x5000); + ASSERT_EQ(vma_prev->vm_pgoff, 0); + + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); + + /* + * When modifying an existing VMA there are further cases where we + * delete VMAs. + * + * <> + * 0123456789 + * PPPVV + * + * In this instance, if vma has a close hook, the merge simply cannot + * proceed. + */ + + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma->vm_ops = &vm_ops; + + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + + /* + * The VMA being modified in a way that would otherwise merge should + * also fail. + */ + ASSERT_EQ(merge_existing(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); + + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); + + /* + * This case is mirrored if merging with next. + * + * <> + * 0123456789 + * VVNNNN + * + * In this instance, if vma has a close hook, the merge simply cannot + * proceed. + */ + + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags); + vma->vm_ops = &vm_ops; + + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg.vma = vma; + ASSERT_EQ(merge_existing(&vmg), NULL); + /* + * Initially this is misapprehended as an out of memory report, as the + * close() check is handled in the same way as anon_vma duplication + * failures, however a subsequent patch resolves this. + */ + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); + + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); + + /* + * Finally, we consider two variants of the case where we modify a VMA + * to merge with both the previous and next VMAs. + * + * The first variant is where vma has a close hook. In this instance, no + * merge can proceed. + * + * <> + * 0123456789 + * PPPVVNNNN + */ + + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags); + vma->vm_ops = &vm_ops; + + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + + ASSERT_EQ(merge_existing(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); + + ASSERT_EQ(cleanup_mm(&mm, &vmi), 3); + + /* + * The second variant is where next has a close hook. In this instance, + * we reduce the operation to a merge between prev and vma. + * + * <> + * 0123456789 + * PPPVVNNNN + * -> + * 0123456789 + * PPPPPNNNN + */ + + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags); + vma_next->vm_ops = &vm_ops; + + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + + ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x5000); + ASSERT_EQ(vma_prev->vm_pgoff, 0); + + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); + + return true; +} + +static bool test_vma_merge_new_with_close(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + VMA_ITERATOR(vmi, &mm, 0); + struct vma_merge_struct vmg = { + .mm = &mm, + .vmi = &vmi, + }; + struct vm_area_struct *vma_prev = alloc_and_link_vma(&mm, 0, 0x2000, 0, flags); + struct vm_area_struct *vma_next = alloc_and_link_vma(&mm, 0x5000, 0x7000, 5, flags); + const struct vm_operations_struct vm_ops = { + .close = dummy_close, + }; + struct vm_area_struct *vma; + + /* + * We should allow the partial merge of a proposed new VMA if the + * surrounding VMAs have vm_ops->close() hooks (but are otherwise + * compatible), e.g.: + * + * New VMA + * A v-------v B + * |-----| |-----| + * close close + * + * Since the rule is to not DELETE a VMA with a close operation, this + * should be permitted, only rather than expanding A and deleting B, we + * should simply expand A and leave B intact, e.g.: + * + * New VMA + * A B + * |------------||-----| + * close close + */ + + /* Have prev and next have a vm_ops->close() hook. */ + vma_prev->vm_ops = &vm_ops; + vma_next->vm_ops = &vm_ops; + + vmg_set_range(&vmg, 0x2000, 0x5000, 2, flags); + vma = merge_new(&vmg); + ASSERT_NE(vma, NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0x5000); + ASSERT_EQ(vma->vm_pgoff, 0); + ASSERT_EQ(vma->vm_ops, &vm_ops); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_EQ(mm.map_count, 2); + + cleanup_mm(&mm, &vmi); + return true; +} + +static bool test_merge_existing(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + VMA_ITERATOR(vmi, &mm, 0); + struct vm_area_struct *vma, *vma_prev, *vma_next; + struct vma_merge_struct vmg = { + .mm = &mm, + .vmi = &vmi, + }; + const struct vm_operations_struct vm_ops = { + .close = dummy_close, + }; + + /* + * Merge right case - partial span. + * + * <-> + * 0123456789 + * VVVVNNN + * -> + * 0123456789 + * VNNNNNN + */ + vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, flags); + vma->vm_ops = &vm_ops; /* This should have no impact. */ + vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, flags); + vma_next->vm_ops = &vm_ops; /* This should have no impact. */ + vmg_set_range(&vmg, 0x3000, 0x6000, 3, flags); + vmg.vma = vma; + vmg.prev = vma; + vma->anon_vma = &dummy_anon_vma; + ASSERT_EQ(merge_existing(&vmg), vma_next); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); + ASSERT_EQ(vma_next->vm_start, 0x3000); + ASSERT_EQ(vma_next->vm_end, 0x9000); + ASSERT_EQ(vma_next->vm_pgoff, 3); + ASSERT_EQ(vma_next->anon_vma, &dummy_anon_vma); + ASSERT_EQ(vma->vm_start, 0x2000); + ASSERT_EQ(vma->vm_end, 0x3000); + ASSERT_EQ(vma->vm_pgoff, 2); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_TRUE(vma_write_started(vma_next)); + ASSERT_EQ(mm.map_count, 2); + + /* Clear down and reset. */ + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); + + /* + * Merge right case - full span. + * + * <--> + * 0123456789 + * VVVVNNN + * -> + * 0123456789 + * NNNNNNN + */ + vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, flags); + vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, flags); + vma_next->vm_ops = &vm_ops; /* This should have no impact. */ + vmg_set_range(&vmg, 0x2000, 0x6000, 2, flags); + vmg.vma = vma; + vma->anon_vma = &dummy_anon_vma; + ASSERT_EQ(merge_existing(&vmg), vma_next); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); + ASSERT_EQ(vma_next->vm_start, 0x2000); + ASSERT_EQ(vma_next->vm_end, 0x9000); + ASSERT_EQ(vma_next->vm_pgoff, 2); + ASSERT_EQ(vma_next->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_write_started(vma_next)); + ASSERT_EQ(mm.map_count, 1); + + /* Clear down and reset. We should have deleted vma. */ + ASSERT_EQ(cleanup_mm(&mm, &vmi), 1); + + /* + * Merge left case - partial span. + * + * <-> + * 0123456789 + * PPPVVVV + * -> + * 0123456789 + * PPPPPPV + */ + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ + vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); + vma->vm_ops = &vm_ops; /* This should have no impact. */ + vmg_set_range(&vmg, 0x3000, 0x6000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + vma->anon_vma = &dummy_anon_vma; + + ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x6000); + ASSERT_EQ(vma_prev->vm_pgoff, 0); + ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); + ASSERT_EQ(vma->vm_start, 0x6000); + ASSERT_EQ(vma->vm_end, 0x7000); + ASSERT_EQ(vma->vm_pgoff, 6); + ASSERT_TRUE(vma_write_started(vma_prev)); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_EQ(mm.map_count, 2); + + /* Clear down and reset. */ + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); + + /* + * Merge left case - full span. + * + * <--> + * 0123456789 + * PPPVVVV + * -> + * 0123456789 + * PPPPPPP + */ + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ + vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); + vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + vma->anon_vma = &dummy_anon_vma; + ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x7000); + ASSERT_EQ(vma_prev->vm_pgoff, 0); + ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_write_started(vma_prev)); + ASSERT_EQ(mm.map_count, 1); + + /* Clear down and reset. We should have deleted vma. */ + ASSERT_EQ(cleanup_mm(&mm, &vmi), 1); + + /* + * Merge both case. + * + * <--> + * 0123456789 + * PPPVVVVNNN + * -> + * 0123456789 + * PPPPPPPPPP + */ + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ + vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); + vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, flags); + vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + vma->anon_vma = &dummy_anon_vma; + ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x9000); + ASSERT_EQ(vma_prev->vm_pgoff, 0); + ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_write_started(vma_prev)); + ASSERT_EQ(mm.map_count, 1); + + /* Clear down and reset. We should have deleted prev and next. */ + ASSERT_EQ(cleanup_mm(&mm, &vmi), 1); + + /* + * Non-merge ranges. the modified VMA merge operation assumes that the + * caller always specifies ranges within the input VMA so we need only + * examine these cases. + * + * - + * - + * - + * <-> + * <> + * <> + * 0123456789a + * PPPVVVVVNNN + */ + + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, flags); + vma_next = alloc_and_link_vma(&mm, 0x8000, 0xa000, 8, flags); + + vmg_set_range(&vmg, 0x4000, 0x5000, 4, flags); + vmg.prev = vma; + vmg.vma = vma; + ASSERT_EQ(merge_existing(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); + + vmg_set_range(&vmg, 0x5000, 0x6000, 5, flags); + vmg.prev = vma; + vmg.vma = vma; + ASSERT_EQ(merge_existing(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); + + vmg_set_range(&vmg, 0x6000, 0x7000, 6, flags); + vmg.prev = vma; + vmg.vma = vma; + ASSERT_EQ(merge_existing(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); + + vmg_set_range(&vmg, 0x4000, 0x7000, 4, flags); + vmg.prev = vma; + vmg.vma = vma; + ASSERT_EQ(merge_existing(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); + + vmg_set_range(&vmg, 0x4000, 0x6000, 4, flags); + vmg.prev = vma; + vmg.vma = vma; + ASSERT_EQ(merge_existing(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); + + vmg_set_range(&vmg, 0x5000, 0x6000, 5, flags); + vmg.prev = vma; + vmg.vma = vma; + ASSERT_EQ(merge_existing(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); + + ASSERT_EQ(cleanup_mm(&mm, &vmi), 3); + + return true; +} + +static bool test_anon_vma_non_mergeable(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + VMA_ITERATOR(vmi, &mm, 0); + struct vm_area_struct *vma, *vma_prev, *vma_next; + struct vma_merge_struct vmg = { + .mm = &mm, + .vmi = &vmi, + }; + struct anon_vma_chain dummy_anon_vma_chain1 = { + .anon_vma = &dummy_anon_vma, + }; + struct anon_vma_chain dummy_anon_vma_chain2 = { + .anon_vma = &dummy_anon_vma, + }; + + /* + * In the case of modified VMA merge, merging both left and right VMAs + * but where prev and next have incompatible anon_vma objects, we revert + * to a merge of prev and VMA: + * + * <--> + * 0123456789 + * PPPVVVVNNN + * -> + * 0123456789 + * PPPPPPPNNN + */ + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); + vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, flags); + + /* + * Give both prev and next single anon_vma_chain fields, so they will + * merge with the NULL vmg->anon_vma. + * + * However, when prev is compared to next, the merge should fail. + */ + + INIT_LIST_HEAD(&vma_prev->anon_vma_chain); + list_add(&dummy_anon_vma_chain1.same_vma, &vma_prev->anon_vma_chain); + ASSERT_TRUE(list_is_singular(&vma_prev->anon_vma_chain)); + vma_prev->anon_vma = &dummy_anon_vma; + ASSERT_TRUE(is_mergeable_anon_vma(NULL, vma_prev->anon_vma, vma_prev)); + + INIT_LIST_HEAD(&vma_next->anon_vma_chain); + list_add(&dummy_anon_vma_chain2.same_vma, &vma_next->anon_vma_chain); + ASSERT_TRUE(list_is_singular(&vma_next->anon_vma_chain)); + vma_next->anon_vma = (struct anon_vma *)2; + ASSERT_TRUE(is_mergeable_anon_vma(NULL, vma_next->anon_vma, vma_next)); + + ASSERT_FALSE(is_mergeable_anon_vma(vma_prev->anon_vma, vma_next->anon_vma, NULL)); + + vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + + ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x7000); + ASSERT_EQ(vma_prev->vm_pgoff, 0); + ASSERT_TRUE(vma_write_started(vma_prev)); + ASSERT_FALSE(vma_write_started(vma_next)); + + /* Clear down and reset. */ + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); + + /* + * Now consider the new VMA case. This is equivalent, only adding a new + * VMA in a gap between prev and next. + * + * <--> + * 0123456789 + * PPP****NNN + * -> + * 0123456789 + * PPPPPPPNNN + */ + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, flags); + + INIT_LIST_HEAD(&vma_prev->anon_vma_chain); + list_add(&dummy_anon_vma_chain1.same_vma, &vma_prev->anon_vma_chain); + vma_prev->anon_vma = (struct anon_vma *)1; + + INIT_LIST_HEAD(&vma_next->anon_vma_chain); + list_add(&dummy_anon_vma_chain2.same_vma, &vma_next->anon_vma_chain); + vma_next->anon_vma = (struct anon_vma *)2; + + vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags); + vmg.prev = vma_prev; + + ASSERT_EQ(merge_new(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x7000); + ASSERT_EQ(vma_prev->vm_pgoff, 0); + ASSERT_TRUE(vma_write_started(vma_prev)); + ASSERT_FALSE(vma_write_started(vma_next)); + + /* Final cleanup. */ + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); + + return true; +} + +static bool test_dup_anon_vma(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + VMA_ITERATOR(vmi, &mm, 0); + struct vma_merge_struct vmg = { + .mm = &mm, + .vmi = &vmi, + }; + struct anon_vma_chain dummy_anon_vma_chain = { + .anon_vma = &dummy_anon_vma, + }; + struct vm_area_struct *vma_prev, *vma_next, *vma; + + reset_dummy_anon_vma(); + + /* + * Expanding a VMA delete the next one duplicates next's anon_vma and + * assigns it to the expanded VMA. + * + * This covers new VMA merging, as these operations amount to a VMA + * expand. + */ + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma_next = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma_next->anon_vma = &dummy_anon_vma; + + vmg_set_range(&vmg, 0, 0x5000, 0, flags); + vmg.vma = vma_prev; + vmg.next = vma_next; + + ASSERT_EQ(expand_existing(&vmg), 0); + + /* Will have been cloned. */ + ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_prev->anon_vma->was_cloned); + + /* Cleanup ready for next run. */ + cleanup_mm(&mm, &vmi); + + /* + * next has anon_vma, we assign to prev. + * + * |<----->| + * |-------*********-------| + * prev vma next + * extend delete delete + */ + + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, flags); + + /* Initialise avc so mergeability check passes. */ + INIT_LIST_HEAD(&vma_next->anon_vma_chain); + list_add(&dummy_anon_vma_chain.same_vma, &vma_next->anon_vma_chain); + + vma_next->anon_vma = &dummy_anon_vma; + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + + ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); + + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x8000); + + ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_prev->anon_vma->was_cloned); + + cleanup_mm(&mm, &vmi); + + /* + * vma has anon_vma, we assign to prev. + * + * |<----->| + * |-------*********-------| + * prev vma next + * extend delete delete + */ + + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, flags); + + vma->anon_vma = &dummy_anon_vma; + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + + ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); + + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x8000); + + ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_prev->anon_vma->was_cloned); + + cleanup_mm(&mm, &vmi); + + /* + * vma has anon_vma, we assign to prev. + * + * |<----->| + * |-------************* + * prev vma + * extend shrink/delete + */ + + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, flags); + + vma->anon_vma = &dummy_anon_vma; + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + + ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); + + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x5000); + + ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_prev->anon_vma->was_cloned); + + cleanup_mm(&mm, &vmi); + + /* + * vma has anon_vma, we assign to next. + * + * |<----->| + * *************-------| + * vma next + * shrink/delete extend + */ + + vma = alloc_and_link_vma(&mm, 0, 0x5000, 0, flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, flags); + + vma->anon_vma = &dummy_anon_vma; + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg.prev = vma; + vmg.vma = vma; + + ASSERT_EQ(merge_existing(&vmg), vma_next); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); + + ASSERT_EQ(vma_next->vm_start, 0x3000); + ASSERT_EQ(vma_next->vm_end, 0x8000); + + ASSERT_EQ(vma_next->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_next->anon_vma->was_cloned); + + cleanup_mm(&mm, &vmi); + return true; +} + +static bool test_vmi_prealloc_fail(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + VMA_ITERATOR(vmi, &mm, 0); + struct vma_merge_struct vmg = { + .mm = &mm, + .vmi = &vmi, + }; + struct vm_area_struct *vma_prev, *vma; + + /* + * We are merging vma into prev, with vma possessing an anon_vma, which + * will be duplicated. We cause the vmi preallocation to fail and assert + * the duplicated anon_vma is unlinked. + */ + + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma->anon_vma = &dummy_anon_vma; + + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + + fail_prealloc = true; + + /* This will cause the merge to fail. */ + ASSERT_EQ(merge_existing(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_ERROR_NOMEM); + /* We will already have assigned the anon_vma. */ + ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); + /* And it was both cloned and unlinked. */ + ASSERT_TRUE(dummy_anon_vma.was_cloned); + ASSERT_TRUE(dummy_anon_vma.was_unlinked); + + cleanup_mm(&mm, &vmi); /* Resets fail_prealloc too. */ + + /* + * We repeat the same operation for expanding a VMA, which is what new + * VMA merging ultimately uses too. This asserts that unlinking is + * performed in this case too. + */ + + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma->anon_vma = &dummy_anon_vma; + + vmg_set_range(&vmg, 0, 0x5000, 3, flags); + vmg.vma = vma_prev; + vmg.next = vma; + + fail_prealloc = true; + ASSERT_EQ(expand_existing(&vmg), -ENOMEM); + ASSERT_EQ(vmg.state, VMA_MERGE_ERROR_NOMEM); + + ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(dummy_anon_vma.was_cloned); + ASSERT_TRUE(dummy_anon_vma.was_unlinked); + + cleanup_mm(&mm, &vmi); + return true; +} + +static bool test_merge_extend(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + VMA_ITERATOR(vmi, &mm, 0x1000); + struct vm_area_struct *vma; + + vma = alloc_and_link_vma(&mm, 0, 0x1000, 0, flags); + alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, flags); + + /* + * Extend a VMA into the gap between itself and the following VMA. + * This should result in a merge. + * + * <-> + * * * + * + */ + + ASSERT_EQ(vma_merge_extend(&vmi, vma, 0x2000), vma); + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0x4000); + ASSERT_EQ(vma->vm_pgoff, 0); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_EQ(mm.map_count, 1); + + cleanup_mm(&mm, &vmi); + return true; +} + +static bool test_copy_vma(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + bool need_locks = false; + VMA_ITERATOR(vmi, &mm, 0); + struct vm_area_struct *vma, *vma_new, *vma_next; + + /* Move backwards and do not merge. */ + + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma_new = copy_vma(&vma, 0, 0x2000, 0, &need_locks); + + ASSERT_NE(vma_new, vma); + ASSERT_EQ(vma_new->vm_start, 0); + ASSERT_EQ(vma_new->vm_end, 0x2000); + ASSERT_EQ(vma_new->vm_pgoff, 0); + + cleanup_mm(&mm, &vmi); + + /* Move a VMA into position next to another and merge the two. */ + + vma = alloc_and_link_vma(&mm, 0, 0x2000, 0, flags); + vma_next = alloc_and_link_vma(&mm, 0x6000, 0x8000, 6, flags); + vma_new = copy_vma(&vma, 0x4000, 0x2000, 4, &need_locks); + + ASSERT_EQ(vma_new, vma_next); + + cleanup_mm(&mm, &vmi); + return true; +} + +int main(void) +{ + int num_tests = 0, num_fail = 0; + + maple_tree_init(); + +#define TEST(name) \ + do { \ + num_tests++; \ + if (!test_##name()) { \ + num_fail++; \ + fprintf(stderr, "Test " #name " FAILED\n"); \ + } \ + } while (0) + + /* Very simple tests to kick the tyres. */ + TEST(simple_merge); + TEST(simple_modify); + TEST(simple_expand); + TEST(simple_shrink); + + TEST(merge_new); + TEST(vma_merge_special_flags); + TEST(vma_merge_with_close); + TEST(vma_merge_new_with_close); + TEST(merge_existing); + TEST(anon_vma_non_mergeable); + TEST(dup_anon_vma); + TEST(vmi_prealloc_fail); + TEST(merge_extend); + TEST(copy_vma); + +#undef TEST + + printf("%d tests run, %d passed, %d failed.\n", + num_tests, num_tests - num_fail, num_fail); + + return num_fail == 0 ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h new file mode 100644 index 000000000000..c5b9da034511 --- /dev/null +++ b/tools/testing/vma/vma_internal.h @@ -0,0 +1,923 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * vma_internal.h + * + * Header providing userland wrappers and shims for the functionality provided + * by mm/vma_internal.h. + * + * We make the header guard the same as mm/vma_internal.h, so if this shim + * header is included, it precludes the inclusion of the kernel one. + */ + +#ifndef __MM_VMA_INTERNAL_H +#define __MM_VMA_INTERNAL_H + +#define __private +#define __bitwise +#define __randomize_layout + +#define CONFIG_MMU +#define CONFIG_PER_VMA_LOCK + +#include + +#include +#include +#include +#include +#include + +#define VM_WARN_ON(_expr) (WARN_ON(_expr)) +#define VM_WARN_ON_ONCE(_expr) (WARN_ON_ONCE(_expr)) +#define VM_BUG_ON(_expr) (BUG_ON(_expr)) +#define VM_BUG_ON_VMA(_expr, _vma) (BUG_ON(_expr)) + +#define VM_NONE 0x00000000 +#define VM_READ 0x00000001 +#define VM_WRITE 0x00000002 +#define VM_EXEC 0x00000004 +#define VM_SHARED 0x00000008 +#define VM_MAYREAD 0x00000010 +#define VM_MAYWRITE 0x00000020 +#define VM_GROWSDOWN 0x00000100 +#define VM_PFNMAP 0x00000400 +#define VM_LOCKED 0x00002000 +#define VM_IO 0x00004000 +#define VM_DONTEXPAND 0x00040000 +#define VM_ACCOUNT 0x00100000 +#define VM_MIXEDMAP 0x10000000 +#define VM_STACK VM_GROWSDOWN +#define VM_SHADOW_STACK VM_NONE +#define VM_SOFTDIRTY 0 + +#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) +#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) + +#define FIRST_USER_ADDRESS 0UL +#define USER_PGTABLES_CEILING 0UL + +#define vma_policy(vma) NULL + +#define down_write_nest_lock(sem, nest_lock) + +#define pgprot_val(x) ((x).pgprot) +#define __pgprot(x) ((pgprot_t) { (x) } ) + +#define for_each_vma(__vmi, __vma) \ + while (((__vma) = vma_next(&(__vmi))) != NULL) + +/* The MM code likes to work with exclusive end addresses */ +#define for_each_vma_range(__vmi, __vma, __end) \ + while (((__vma) = vma_find(&(__vmi), (__end))) != NULL) + +#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) + +#define PHYS_PFN(x) ((unsigned long)((x) >> PAGE_SHIFT)) + +#define test_and_set_bit(nr, addr) __test_and_set_bit(nr, addr) +#define test_and_clear_bit(nr, addr) __test_and_clear_bit(nr, addr) + +#define TASK_SIZE ((1ul << 47)-PAGE_SIZE) + +#define AS_MM_ALL_LOCKS 2 + +/* We hardcode this for now. */ +#define sysctl_max_map_count 0x1000000UL + +#define pgoff_t unsigned long +typedef unsigned long pgprotval_t; +typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; +typedef unsigned long vm_flags_t; +typedef __bitwise unsigned int vm_fault_t; + +/* + * The shared stubs do not implement this, it amounts to an fprintf(STDERR,...) + * either way :) + */ +#define pr_warn_once pr_err + +typedef struct refcount_struct { + atomic_t refs; +} refcount_t; + +struct kref { + refcount_t refcount; +}; + +/* + * Define the task command name length as enum, then it can be visible to + * BPF programs. + */ +enum { + TASK_COMM_LEN = 16, +}; + +struct task_struct { + char comm[TASK_COMM_LEN]; + pid_t pid; + struct mm_struct *mm; +}; + +struct task_struct *get_current(void); +#define current get_current() + +struct anon_vma { + struct anon_vma *root; + struct rb_root_cached rb_root; + + /* Test fields. */ + bool was_cloned; + bool was_unlinked; +}; + +struct anon_vma_chain { + struct anon_vma *anon_vma; + struct list_head same_vma; +}; + +struct anon_vma_name { + struct kref kref; + /* The name needs to be at the end because it is dynamically sized. */ + char name[]; +}; + +struct vma_iterator { + struct ma_state mas; +}; + +#define VMA_ITERATOR(name, __mm, __addr) \ + struct vma_iterator name = { \ + .mas = { \ + .tree = &(__mm)->mm_mt, \ + .index = __addr, \ + .node = NULL, \ + .status = ma_start, \ + }, \ + } + +struct address_space { + struct rb_root_cached i_mmap; + unsigned long flags; + atomic_t i_mmap_writable; +}; + +struct vm_userfaultfd_ctx {}; +struct mempolicy {}; +struct mmu_gather {}; +struct mutex {}; +#define DEFINE_MUTEX(mutexname) \ + struct mutex mutexname = {} + +struct mm_struct { + struct maple_tree mm_mt; + int map_count; /* number of VMAs */ + unsigned long total_vm; /* Total pages mapped */ + unsigned long locked_vm; /* Pages that have PG_mlocked set */ + unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ + unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ + unsigned long stack_vm; /* VM_STACK */ +}; + +struct vma_lock { + struct rw_semaphore lock; +}; + + +struct file { + struct address_space *f_mapping; +}; + +struct vm_area_struct { + /* The first cache line has the info for VMA tree walking. */ + + union { + struct { + /* VMA covers [vm_start; vm_end) addresses within mm */ + unsigned long vm_start; + unsigned long vm_end; + }; +#ifdef CONFIG_PER_VMA_LOCK + struct rcu_head vm_rcu; /* Used for deferred freeing. */ +#endif + }; + + struct mm_struct *vm_mm; /* The address space we belong to. */ + pgprot_t vm_page_prot; /* Access permissions of this VMA. */ + + /* + * Flags, see mm.h. + * To modify use vm_flags_{init|reset|set|clear|mod} functions. + */ + union { + const vm_flags_t vm_flags; + vm_flags_t __private __vm_flags; + }; + +#ifdef CONFIG_PER_VMA_LOCK + /* Flag to indicate areas detached from the mm->mm_mt tree */ + bool detached; + + /* + * Can only be written (using WRITE_ONCE()) while holding both: + * - mmap_lock (in write mode) + * - vm_lock->lock (in write mode) + * Can be read reliably while holding one of: + * - mmap_lock (in read or write mode) + * - vm_lock->lock (in read or write mode) + * Can be read unreliably (using READ_ONCE()) for pessimistic bailout + * while holding nothing (except RCU to keep the VMA struct allocated). + * + * This sequence counter is explicitly allowed to overflow; sequence + * counter reuse can only lead to occasional unnecessary use of the + * slowpath. + */ + int vm_lock_seq; + struct vma_lock *vm_lock; +#endif + + /* + * For areas with an address space and backing store, + * linkage into the address_space->i_mmap interval tree. + * + */ + struct { + struct rb_node rb; + unsigned long rb_subtree_last; + } shared; + + /* + * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma + * list, after a COW of one of the file pages. A MAP_SHARED vma + * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack + * or brk vma (with NULL file) can only be in an anon_vma list. + */ + struct list_head anon_vma_chain; /* Serialized by mmap_lock & + * page_table_lock */ + struct anon_vma *anon_vma; /* Serialized by page_table_lock */ + + /* Function pointers to deal with this struct. */ + const struct vm_operations_struct *vm_ops; + + /* Information about our backing store: */ + unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE + units */ + struct file * vm_file; /* File we map to (can be NULL). */ + void * vm_private_data; /* was vm_pte (shared mem) */ + +#ifdef CONFIG_ANON_VMA_NAME + /* + * For private and shared anonymous mappings, a pointer to a null + * terminated string containing the name given to the vma, or NULL if + * unnamed. Serialized by mmap_lock. Use anon_vma_name to access. + */ + struct anon_vma_name *anon_name; +#endif +#ifdef CONFIG_SWAP + atomic_long_t swap_readahead_info; +#endif +#ifndef CONFIG_MMU + struct vm_region *vm_region; /* NOMMU mapping region */ +#endif +#ifdef CONFIG_NUMA + struct mempolicy *vm_policy; /* NUMA policy for the VMA */ +#endif +#ifdef CONFIG_NUMA_BALANCING + struct vma_numab_state *numab_state; /* NUMA Balancing state */ +#endif + struct vm_userfaultfd_ctx vm_userfaultfd_ctx; +} __randomize_layout; + +struct vm_fault {}; + +struct vm_operations_struct { + void (*open)(struct vm_area_struct * area); + /** + * @close: Called when the VMA is being removed from the MM. + * Context: User context. May sleep. Caller holds mmap_lock. + */ + void (*close)(struct vm_area_struct * area); + /* Called any time before splitting to check if it's allowed */ + int (*may_split)(struct vm_area_struct *area, unsigned long addr); + int (*mremap)(struct vm_area_struct *area); + /* + * Called by mprotect() to make driver-specific permission + * checks before mprotect() is finalised. The VMA must not + * be modified. Returns 0 if mprotect() can proceed. + */ + int (*mprotect)(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long newflags); + vm_fault_t (*fault)(struct vm_fault *vmf); + vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order); + vm_fault_t (*map_pages)(struct vm_fault *vmf, + pgoff_t start_pgoff, pgoff_t end_pgoff); + unsigned long (*pagesize)(struct vm_area_struct * area); + + /* notification that a previously read-only page is about to become + * writable, if an error is returned it will cause a SIGBUS */ + vm_fault_t (*page_mkwrite)(struct vm_fault *vmf); + + /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */ + vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf); + + /* called by access_process_vm when get_user_pages() fails, typically + * for use by special VMAs. See also generic_access_phys() for a generic + * implementation useful for any iomem mapping. + */ + int (*access)(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write); + + /* Called by the /proc/PID/maps code to ask the vma whether it + * has a special name. Returning non-NULL will also cause this + * vma to be dumped unconditionally. */ + const char *(*name)(struct vm_area_struct *vma); + +#ifdef CONFIG_NUMA + /* + * set_policy() op must add a reference to any non-NULL @new mempolicy + * to hold the policy upon return. Caller should pass NULL @new to + * remove a policy and fall back to surrounding context--i.e. do not + * install a MPOL_DEFAULT policy, nor the task or system default + * mempolicy. + */ + int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new); + + /* + * get_policy() op must add reference [mpol_get()] to any policy at + * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure + * in mm/mempolicy.c will do this automatically. + * get_policy() must NOT add a ref if the policy at (vma,addr) is not + * marked as MPOL_SHARED. vma policies are protected by the mmap_lock. + * If no [shared/vma] mempolicy exists at the addr, get_policy() op + * must return NULL--i.e., do not "fallback" to task or system default + * policy. + */ + struct mempolicy *(*get_policy)(struct vm_area_struct *vma, + unsigned long addr, pgoff_t *ilx); +#endif + /* + * Called by vm_normal_page() for special PTEs to find the + * page for @addr. This is useful if the default behavior + * (using pte_page()) would not find the correct page. + */ + struct page *(*find_special_page)(struct vm_area_struct *vma, + unsigned long addr); +}; + +static inline void vma_iter_invalidate(struct vma_iterator *vmi) +{ + mas_pause(&vmi->mas); +} + +static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) +{ + return __pgprot(pgprot_val(oldprot) | pgprot_val(newprot)); +} + +static inline pgprot_t vm_get_page_prot(unsigned long vm_flags) +{ + return __pgprot(vm_flags); +} + +static inline bool is_shared_maywrite(vm_flags_t vm_flags) +{ + return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == + (VM_SHARED | VM_MAYWRITE); +} + +static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma) +{ + return is_shared_maywrite(vma->vm_flags); +} + +static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) +{ + /* + * Uses mas_find() to get the first VMA when the iterator starts. + * Calling mas_next() could skip the first entry. + */ + return mas_find(&vmi->mas, ULONG_MAX); +} + +static inline bool vma_lock_alloc(struct vm_area_struct *vma) +{ + vma->vm_lock = calloc(1, sizeof(struct vma_lock)); + + if (!vma->vm_lock) + return false; + + init_rwsem(&vma->vm_lock->lock); + vma->vm_lock_seq = -1; + + return true; +} + +static inline void vma_assert_write_locked(struct vm_area_struct *); +static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) +{ + /* When detaching vma should be write-locked */ + if (detached) + vma_assert_write_locked(vma); + vma->detached = detached; +} + +extern const struct vm_operations_struct vma_dummy_vm_ops; + +static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) +{ + memset(vma, 0, sizeof(*vma)); + vma->vm_mm = mm; + vma->vm_ops = &vma_dummy_vm_ops; + INIT_LIST_HEAD(&vma->anon_vma_chain); + vma_mark_detached(vma, false); +} + +static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) +{ + struct vm_area_struct *vma = calloc(1, sizeof(struct vm_area_struct)); + + if (!vma) + return NULL; + + vma_init(vma, mm); + if (!vma_lock_alloc(vma)) { + free(vma); + return NULL; + } + + return vma; +} + +static inline struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) +{ + struct vm_area_struct *new = calloc(1, sizeof(struct vm_area_struct)); + + if (!new) + return NULL; + + memcpy(new, orig, sizeof(*new)); + if (!vma_lock_alloc(new)) { + free(new); + return NULL; + } + INIT_LIST_HEAD(&new->anon_vma_chain); + + return new; +} + +/* + * These are defined in vma.h, but sadly vm_stat_account() is referenced by + * kernel/fork.c, so we have to these broadly available there, and temporarily + * define them here to resolve the dependency cycle. + */ + +#define is_exec_mapping(flags) \ + ((flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC) + +#define is_stack_mapping(flags) \ + (((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK)) + +#define is_data_mapping(flags) \ + ((flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE) + +static inline void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, + long npages) +{ + WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages); + + if (is_exec_mapping(flags)) + mm->exec_vm += npages; + else if (is_stack_mapping(flags)) + mm->stack_vm += npages; + else if (is_data_mapping(flags)) + mm->data_vm += npages; +} + +#undef is_exec_mapping +#undef is_stack_mapping +#undef is_data_mapping + +/* Currently stubbed but we may later wish to un-stub. */ +static inline void vm_acct_memory(long pages); +static inline void vm_unacct_memory(long pages) +{ + vm_acct_memory(-pages); +} + +static inline void mapping_allow_writable(struct address_space *mapping) +{ + atomic_inc(&mapping->i_mmap_writable); +} + +static inline void vma_set_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + pgoff_t pgoff) +{ + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; +} + +static inline +struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) +{ + return mas_find(&vmi->mas, max - 1); +} + +static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, + unsigned long start, unsigned long end, gfp_t gfp) +{ + __mas_set_range(&vmi->mas, start, end - 1); + mas_store_gfp(&vmi->mas, NULL, gfp); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + return 0; +} + +static inline void mmap_assert_locked(struct mm_struct *); +static inline struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, + unsigned long start_addr, + unsigned long end_addr) +{ + unsigned long index = start_addr; + + mmap_assert_locked(mm); + return mt_find(&mm->mm_mt, &index, end_addr - 1); +} + +static inline +struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) +{ + return mtree_load(&mm->mm_mt, addr); +} + +static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) +{ + return mas_prev(&vmi->mas, 0); +} + +static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr) +{ + mas_set(&vmi->mas, addr); +} + +static inline bool vma_is_anonymous(struct vm_area_struct *vma) +{ + return !vma->vm_ops; +} + +/* Defined in vma.h, so temporarily define here to avoid circular dependency. */ +#define vma_iter_load(vmi) \ + mas_walk(&(vmi)->mas) + +static inline struct vm_area_struct * +find_vma_prev(struct mm_struct *mm, unsigned long addr, + struct vm_area_struct **pprev) +{ + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, addr); + + vma = vma_iter_load(&vmi); + *pprev = vma_prev(&vmi); + if (!vma) + vma = vma_next(&vmi); + return vma; +} + +#undef vma_iter_load + +static inline void vma_iter_init(struct vma_iterator *vmi, + struct mm_struct *mm, unsigned long addr) +{ + mas_init(&vmi->mas, &mm->mm_mt, addr); +} + +/* Stubbed functions. */ + +static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) +{ + return NULL; +} + +static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, + struct vm_userfaultfd_ctx vm_ctx) +{ + return true; +} + +static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, + struct anon_vma_name *anon_name2) +{ + return true; +} + +static inline void might_sleep(void) +{ +} + +static inline unsigned long vma_pages(struct vm_area_struct *vma) +{ + return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; +} + +static inline void fput(struct file *) +{ +} + +static inline void mpol_put(struct mempolicy *) +{ +} + +static inline void vma_lock_free(struct vm_area_struct *vma) +{ + free(vma->vm_lock); +} + +static inline void __vm_area_free(struct vm_area_struct *vma) +{ + vma_lock_free(vma); + free(vma); +} + +static inline void vm_area_free(struct vm_area_struct *vma) +{ + __vm_area_free(vma); +} + +static inline void lru_add_drain(void) +{ +} + +static inline void tlb_gather_mmu(struct mmu_gather *, struct mm_struct *) +{ +} + +static inline void update_hiwater_rss(struct mm_struct *) +{ +} + +static inline void update_hiwater_vm(struct mm_struct *) +{ +} + +static inline void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, + struct vm_area_struct *vma, unsigned long start_addr, + unsigned long end_addr, unsigned long tree_end, + bool mm_wr_locked) +{ + (void)tlb; + (void)mas; + (void)vma; + (void)start_addr; + (void)end_addr; + (void)tree_end; + (void)mm_wr_locked; +} + +static inline void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, + struct vm_area_struct *vma, unsigned long floor, + unsigned long ceiling, bool mm_wr_locked) +{ + (void)tlb; + (void)mas; + (void)vma; + (void)floor; + (void)ceiling; + (void)mm_wr_locked; +} + +static inline void mapping_unmap_writable(struct address_space *) +{ +} + +static inline void flush_dcache_mmap_lock(struct address_space *) +{ +} + +static inline void tlb_finish_mmu(struct mmu_gather *) +{ +} + +static inline void get_file(struct file *) +{ +} + +static inline int vma_dup_policy(struct vm_area_struct *, struct vm_area_struct *) +{ + return 0; +} + +static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) +{ + /* For testing purposes. We indicate that an anon_vma has been cloned. */ + if (src->anon_vma != NULL) { + dst->anon_vma = src->anon_vma; + dst->anon_vma->was_cloned = true; + } + + return 0; +} + +static inline void vma_start_write(struct vm_area_struct *vma) +{ + /* Used to indicate to tests that a write operation has begun. */ + vma->vm_lock_seq++; +} + +static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next) +{ + (void)vma; + (void)start; + (void)end; + (void)adjust_next; +} + +static inline void vma_iter_free(struct vma_iterator *vmi) +{ + mas_destroy(&vmi->mas); +} + +static inline +struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi) +{ + return mas_next_range(&vmi->mas, ULONG_MAX); +} + +static inline void vm_acct_memory(long pages) +{ +} + +static inline void vma_interval_tree_insert(struct vm_area_struct *, + struct rb_root_cached *) +{ +} + +static inline void vma_interval_tree_remove(struct vm_area_struct *, + struct rb_root_cached *) +{ +} + +static inline void flush_dcache_mmap_unlock(struct address_space *) +{ +} + +static inline void anon_vma_interval_tree_insert(struct anon_vma_chain*, + struct rb_root_cached *) +{ +} + +static inline void anon_vma_interval_tree_remove(struct anon_vma_chain*, + struct rb_root_cached *) +{ +} + +static inline void uprobe_mmap(struct vm_area_struct *) +{ +} + +static inline void uprobe_munmap(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + (void)vma; + (void)start; + (void)end; +} + +static inline void i_mmap_lock_write(struct address_space *) +{ +} + +static inline void anon_vma_lock_write(struct anon_vma *) +{ +} + +static inline void vma_assert_write_locked(struct vm_area_struct *) +{ +} + +static inline void unlink_anon_vmas(struct vm_area_struct *vma) +{ + /* For testing purposes, indicate that the anon_vma was unlinked. */ + vma->anon_vma->was_unlinked = true; +} + +static inline void anon_vma_unlock_write(struct anon_vma *) +{ +} + +static inline void i_mmap_unlock_write(struct address_space *) +{ +} + +static inline void anon_vma_merge(struct vm_area_struct *, + struct vm_area_struct *) +{ +} + +static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + struct list_head *unmaps) +{ + (void)vma; + (void)start; + (void)end; + (void)unmaps; + + return 0; +} + +static inline void mmap_write_downgrade(struct mm_struct *) +{ +} + +static inline void mmap_read_unlock(struct mm_struct *) +{ +} + +static inline void mmap_write_unlock(struct mm_struct *) +{ +} + +static inline bool can_modify_mm(struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + (void)mm; + (void)start; + (void)end; + + return true; +} + +static inline void arch_unmap(struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + (void)mm; + (void)start; + (void)end; +} + +static inline void mmap_assert_locked(struct mm_struct *) +{ +} + +static inline bool mpol_equal(struct mempolicy *, struct mempolicy *) +{ + return true; +} + +static inline void khugepaged_enter_vma(struct vm_area_struct *vma, + unsigned long vm_flags) +{ + (void)vma; + (void)vm_flags; +} + +static inline bool mapping_can_writeback(struct address_space *) +{ + return true; +} + +static inline bool is_vm_hugetlb_page(struct vm_area_struct *) +{ + return false; +} + +static inline bool vma_soft_dirty_enabled(struct vm_area_struct *) +{ + return false; +} + +static inline bool userfaultfd_wp(struct vm_area_struct *) +{ + return false; +} + +static inline void mmap_assert_write_locked(struct mm_struct *) +{ +} + +static inline void mutex_lock(struct mutex *) +{ +} + +static inline void mutex_unlock(struct mutex *) +{ +} + +static inline bool mutex_is_locked(struct mutex *) +{ + return true; +} + +static inline bool signal_pending(void *) +{ + return false; +} + +#endif /* __MM_VMA_INTERNAL_H */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index cb2b78e92910..f416d5e3f9c0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2860,13 +2860,11 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, unsigned long addr, bool write_fault, bool *writable, kvm_pfn_t *p_pfn) { + struct follow_pfnmap_args args = { .vma = vma, .address = addr }; kvm_pfn_t pfn; - pte_t *ptep; - pte_t pte; - spinlock_t *ptl; int r; - r = follow_pte(vma, addr, &ptep, &ptl); + r = follow_pfnmap_start(&args); if (r) { /* * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does @@ -2881,21 +2879,19 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, if (r) return r; - r = follow_pte(vma, addr, &ptep, &ptl); + r = follow_pfnmap_start(&args); if (r) return r; } - pte = ptep_get(ptep); - - if (write_fault && !pte_write(pte)) { + if (write_fault && !args.writable) { pfn = KVM_PFN_ERR_RO_FAULT; goto out; } if (writable) - *writable = pte_write(pte); - pfn = pte_pfn(pte); + *writable = args.writable; + pfn = args.pfn; /* * Get a reference here because callers of *hva_to_pfn* and @@ -2916,9 +2912,8 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, */ if (!kvm_try_get_pfn(pfn)) r = -EFAULT; - out: - pte_unmap_unlock(ptep, ptl); + follow_pfnmap_end(&args); *p_pfn = pfn; return r;