diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index f1b90cf1249b..b057eddefbfc 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -355,10 +355,15 @@ Description: If 'target' is written to the 'type' file, writing to or What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//matching Date: Dec 2022 Contact: SeongJae Park -Description: Writing 'Y' or 'N' to this file sets whether to filter out - pages that do or do not match to the 'type' and 'memcg_path', - respectively. Filter out means the action of the scheme will - not be applied to. +Description: Writing 'Y' or 'N' to this file sets whether the filter is for + the memory of the 'type', or all except the 'type'. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//allow +Date: Jan 2025 +Contact: SeongJae Park +Description: Writing 'Y' or 'N' to this file sets whether to allow or reject + applying the scheme's action to the memory that satisfies the + 'type' and the 'matching' of the directory. What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//stats/nr_tried Date: Mar 2022 @@ -384,6 +389,12 @@ Contact: SeongJae Park Description: Reading this file returns the total size of regions that the action of the scheme has successfully applied in bytes. +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//stats/sz_ops_filter_passed +Date: Dec 2024 +Contact: SeongJae Park +Description: Reading this file returns the total size of memory that passed + DAMON operations layer-handled filters of the scheme in bytes. + What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//stats/qt_exceeds Date: Mar 2022 Contact: SeongJae Park @@ -424,3 +435,10 @@ Contact: SeongJae Park Description: Reading this file returns the 'age' of a memory region that corresponding DAMON-based Operation Scheme's action has tried to be applied. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//tried_regions//sz_filter_passed +Date: Dec 2024 +Contact: SeongJae Park +Description: Reading this file returns the size of the memory in the region + that passed DAMON operations layer-handled filters of the + scheme in bytes. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3872bc6ec49d..9138fcd18260 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3351,8 +3351,8 @@ [KNL] Set the initial state for the memory hotplug onlining policy. If not specified, the default value is set according to the - CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config - option. + CONFIG_MHP_DEFAULT_ONLINE_TYPE kernel config + options. See Documentation/admin-guide/mm/memory-hotplug.rst. memmap=exactmap [KNL,X86,EARLY] Enable setting of an exact @@ -6992,6 +6992,13 @@ See Documentation/admin-guide/mm/transhuge.rst for more details. + transparent_hugepage_tmpfs= [KNL] + Format: [always|within_size|advise|never] + Can be used to control the default hugepage allocation policy + for the tmpfs mount. + See Documentation/admin-guide/mm/transhuge.rst + for more details. 
+
	trusted.source=	[KEYS]
			Format: 
			This parameter identifies the trust source as a backend

diff --git a/Documentation/admin-guide/mm/damon/start.rst b/Documentation/admin-guide/mm/damon/start.rst
index c4dddf6733cd..ede14b679d02 100644
--- a/Documentation/admin-guide/mm/damon/start.rst
+++ b/Documentation/admin-guide/mm/damon/start.rst
@@ -42,32 +42,45 @@ the execution. ::

    $ git clone https://github.com/sjp38/masim; cd masim; make
    $ sudo damo start "./masim ./configs/stairs.cfg --quiet"
-    $ sudo ./damo show
-    0 addr [85.541 TiB , 85.541 TiB ) (57.707 MiB ) access 0 % age 10.400 s
-    1 addr [85.541 TiB , 85.542 TiB ) (413.285 MiB) access 0 % age 11.400 s
-    2 addr [127.649 TiB , 127.649 TiB) (57.500 MiB ) access 0 % age 1.600 s
-    3 addr [127.649 TiB , 127.649 TiB) (32.500 MiB ) access 0 % age 500 ms
-    4 addr [127.649 TiB , 127.649 TiB) (9.535 MiB ) access 100 % age 300 ms
-    5 addr [127.649 TiB , 127.649 TiB) (8.000 KiB ) access 60 % age 0 ns
-    6 addr [127.649 TiB , 127.649 TiB) (6.926 MiB ) access 0 % age 1 s
-    7 addr [127.998 TiB , 127.998 TiB) (120.000 KiB) access 0 % age 11.100 s
-    8 addr [127.998 TiB , 127.998 TiB) (8.000 KiB ) access 40 % age 100 ms
-    9 addr [127.998 TiB , 127.998 TiB) (4.000 KiB ) access 0 % age 11 s
-    total size: 577.590 MiB
-    $ sudo ./damo stop
+    $ sudo damo report access
+    heatmap: 641111111000000000000000000000000000000000000000000000[...]33333333333333335557984444[...]7
+    # min/max temperatures: -1,840,000,000, 370,010,000, column size: 3.925 MiB
+    0   addr 86.182 TiB size 8.000 KiB access 0 % age 14.900 s
+    1   addr 86.182 TiB size 8.000 KiB access 60 % age 0 ns
+    2   addr 86.182 TiB size 3.422 MiB access 0 % age 4.100 s
+    3   addr 86.182 TiB size 2.004 MiB access 95 % age 2.200 s
+    4   addr 86.182 TiB size 29.688 MiB access 0 % age 14.100 s
+    5   addr 86.182 TiB size 29.516 MiB access 0 % age 16.700 s
+    6   addr 86.182 TiB size 29.633 MiB access 0 % age 17.900 s
+    7   addr 86.182 TiB size 117.652 MiB access 0 % age 18.400 s
+    8   addr 126.990 TiB size 62.332 MiB access 0 % age 9.500 s
+    9   addr 126.990 TiB size 13.980 MiB access 0 % age 5.200 s
+    10  addr 126.990 TiB size 9.539 MiB access 100 % age 3.700 s
+    11  addr 126.990 TiB size 16.098 MiB access 0 % age 6.400 s
+    12  addr 127.987 TiB size 132.000 KiB access 0 % age 2.900 s
+    total size: 314.008 MiB
+    $ sudo damo stop

The first command of the above example downloads and builds an artificial
memory access generator program called ``masim``. The second command asks DAMO
-to execute the artificial generator process start via the given command and
-make DAMON monitors the generator process. The third command retrieves the
-current snapshot of the monitored access pattern of the process from DAMON and
-shows the pattern in a human readable format.
+to start the program via the given command and make DAMON monitor the newly
+started process. The third command retrieves the current snapshot of the
+monitored access pattern of the process from DAMON and shows the pattern in a
+human readable format.

-Each line of the output shows which virtual address range (``addr [XX, XX)``)
-of the process is how frequently (``access XX %``) accessed for how long time
-(``age XX``). For example, the fifth region of ~9 MiB size is being most
-frequently accessed for last 300 milliseconds. Finally, the fourth command
-stops DAMON.
+The first line of the output shows the relative access temperature (hotness)
+of the regions in a single-row heatmap format. Each column on the heatmap
+represents regions of the same size on the monitored virtual address space.
+The position of the column on the row and the number on the column represent
+the relative location and access temperature of the region. ``[...]`` means
+unmapped huge regions on the virtual address spaces. The second line shows
+additional information for better understanding of the heatmap.
+
+Each line of the output from the third line shows which virtual address range
+(``addr XX size XX``) of the process is how frequently (``access XX %``)
+accessed for how long (``age XX``). For example, the eleventh region of
+~9.5 MiB size is being most frequently accessed for the last 3.7 seconds.
+Finally, the fourth command stops DAMON.

Note that DAMON can monitor not only virtual address spaces but multiple types
of address spaces including the physical address space.

@@ -95,7 +108,7 @@ Visualizing Recorded Patterns

You can visualize the pattern in a heatmap, showing which memory region
(x-axis) got accessed when (y-axis) and how frequently (number).::

-    $ sudo damo report heats --heatmap stdout
+    $ sudo damo report heatmap
    22222222222222222222222222222222222222211111111111111111111111111111111111111100
    44444444444444444444444444444444444444434444444444444444444444444444444444443200
    44444444444444444444444444444444444444433444444444444444444444444444444444444200

@@ -160,6 +173,6 @@ Data Access Pattern Aware Memory Management

Below command makes every memory region of size >=4K that has not accessed for
>=60 seconds in your workload to be swapped out. ::

-    $ sudo damo schemes --damos_access_rate 0 0 --damos_sz_region 4K max \
-        --damos_age 60s max --damos_action pageout \
-
+    $ sudo damo start --damos_access_rate 0 0 --damos_sz_region 4K max \
+        --damos_age 60s max --damos_action pageout \
+

diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst
index d9be9f7caa7d..47a44bd348ab 100644
--- a/Documentation/admin-guide/mm/damon/usage.rst
+++ b/Documentation/admin-guide/mm/damon/usage.rst
@@ -26,12 +26,6 @@ DAMON provides below interfaces for different users.
  writing kernel space DAMON application programs for you. You can even extend
  DAMON for various address spaces. For detail, please refer to the interface
  :doc:`document `.
-- *debugfs interface. (DEPRECATED!)*
-  :ref:`This ` is almost identical to :ref:`sysfs interface
-  `. This is deprecated, so users should move to the
-  :ref:`sysfs interface `. If you depend on this and cannot
-  move, please report your usecase to damon@lists.linux.dev and
-  linux-mm@kvack.org.

.. _sysfs_interface:

@@ -89,10 +83,10 @@ comma (",").
    │ │ │ │ │ │ │ │ │ 0/target_metric,target_value,current_value
    │ │ │ │ │ │ │ :ref:`watermarks `/metric,interval_us,high,mid,low
    │ │ │ │ │ │ │ :ref:`filters `/nr_filters
-    │ │ │ │ │ │ │ │ 0/type,matching,memcg_id
-    │ │ │ │ │ │ │ :ref:`stats `/nr_tried,sz_tried,nr_applied,sz_applied,qt_exceeds
+    │ │ │ │ │ │ │ │ 0/type,matching,allow,memcg_path,addr_start,addr_end,target_idx
+    │ │ │ │ │ │ │ :ref:`stats `/nr_tried,sz_tried,nr_applied,sz_applied,sz_ops_filter_passed,qt_exceeds
    │ │ │ │ │ │ │ :ref:`tried_regions `/total_bytes
-    │ │ │ │ │ │ │ │ 0/start,end,nr_accesses,age
+    │ │ │ │ │ │ │ │ 0/start,end,nr_accesses,age,sz_filter_passed
    │ │ │ │ │ │ │ │ ...
    │ │ │ │ │ │ ...
    │ │ │ │ ...

@@ -412,59 +406,62 @@ number (``N``) to the file creates the number of child directories named ``0``
to ``N-1``. Each directory represents each filter. The filters are evaluated
in the numeric order.
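+For example, the below sketch lists the files of a filter directory. It
+assumes a single kdamond, context, and scheme, all at index ``0``; adjust the
+path for your setup::
+
+    # cd /sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/schemes/0/filters
+    # echo 1 > nr_filters
+    # ls 0/
+    addr_end  addr_start  allow  matching  memcg_path  target_idx  type
+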
-Each filter directory contains six files, namely ``type``, ``matcing``,
-``memcg_path``, ``addr_start``, ``addr_end``, and ``target_idx``. To ``type``
-file, you can write one of five special keywords: ``anon`` for anonymous pages,
-``memcg`` for specific memory cgroup, ``young`` for young pages, ``addr`` for
-specific address range (an open-ended interval), or ``target`` for specific
-DAMON monitoring target filtering. In case of the memory cgroup filtering, you
-can specify the memory cgroup of the interest by writing the path of the memory
-cgroup from the cgroups mount point to ``memcg_path`` file. In case of the
-address range filtering, you can specify the start and end address of the range
-to ``addr_start`` and ``addr_end`` files, respectively. For the DAMON
-monitoring target filtering, you can specify the index of the target between
-the list of the DAMON context's monitoring targets list to ``target_idx`` file.
-You can write ``Y`` or ``N`` to ``matching`` file to filter out pages that does
-or does not match to the type, respectively. Then, the scheme's action will
-not be applied to the pages that specified to be filtered out.
+Each filter directory contains seven files, namely ``type``, ``matching``,
+``allow``, ``memcg_path``, ``addr_start``, ``addr_end``, and ``target_idx``.
+To the ``type`` file, you can write one of five special keywords: ``anon`` for
+anonymous pages, ``memcg`` for specific memory cgroup, ``young`` for young
+pages, ``addr`` for specific address range (an open-ended interval), or
+``target`` for specific DAMON monitoring target filtering. The meaning of the
+types is the same as the description in the :ref:`design doc
+`.
+
+In case of the memory cgroup filtering, you can specify the memory cgroup of
+interest by writing the path of the memory cgroup from the cgroups mount
+point to the ``memcg_path`` file. In case of the address range filtering, you
+can specify the start and end address of the range to the ``addr_start`` and
+``addr_end`` files, respectively. For the DAMON monitoring target filtering,
+you can specify the index of the target in the DAMON context's monitoring
+targets list to the ``target_idx`` file.
+
+You can write ``Y`` or ``N`` to the ``matching`` file to specify whether the
+filter is for memory that matches the ``type``. You can write ``Y`` or ``N``
+to the ``allow`` file to specify whether applying the action to the memory
+that satisfies the ``type`` and ``matching`` should be allowed or not.

For example, below restricts a DAMOS action to be applied to only non-anonymous
pages of all memory cgroups except ``/having_care_already``.::

    # echo 2 > nr_filters
-    # # filter out anonymous pages
+    # # disallow anonymous pages
    echo anon > 0/type
    echo Y > 0/matching
+    echo N > 0/allow
    # # further filter out all cgroups except one at '/having_care_already'
    echo memcg > 1/type
    echo /having_care_already > 1/memcg_path
    echo Y > 1/matching
+    echo N > 1/allow

-Note that ``anon`` and ``memcg`` filters are currently supported only when
-``paddr`` :ref:`implementation ` is being used.
-
-Also, memory regions that are filtered out by ``addr`` or ``target`` filters
-are not counted as the scheme has tried to those, while regions that filtered
-out by other type filters are counted as the scheme has tried to. The
-difference is applied to :ref:`stats ` and
-:ref:`tried regions `.
+Refer to the :ref:`DAMOS filters design documentation
+` for more details, including how multiple filters
+of different ``allow`` settings work, when each of the filters is supported,
+and differences in the stats.

.. _sysfs_schemes_stats:

schemes//stats/
------------------

-DAMON counts the total number and bytes of regions that each scheme is tried to
-be applied, the two numbers for the regions that each scheme is successfully
-applied, and the total number of the quota limit exceeds. This statistics can
-be used for online analysis or tuning of the schemes.
+DAMON counts statistics for each scheme. These statistics can be used for
+online analysis or tuning of the schemes. Refer to :ref:`design doc
+` for more details about the stats.

The statistics can be retrieved by reading the files under ``stats`` directory
-(``nr_tried``, ``sz_tried``, ``nr_applied``, ``sz_applied``, and
-``qt_exceeds``), respectively. The files are not updated in real time, so you
-should ask DAMON sysfs interface to update the content of the files for the
-stats by writing a special keyword, ``update_schemes_stats`` to the relevant
-``kdamonds//state`` file.
+(``nr_tried``, ``sz_tried``, ``nr_applied``, ``sz_applied``,
+``sz_ops_filter_passed``, and ``qt_exceeds``), respectively. The files are not
+updated in real time, so you should ask the DAMON sysfs interface to update the
+content of the files for the stats by writing a special keyword,
+``update_schemes_stats`` to the relevant ``kdamonds//state`` file.

.. _sysfs_schemes_tried_regions:

@@ -501,10 +498,10 @@ set the ``access pattern`` as their interested pattern that they want to query.
tried_regions//
------------------

-In each region directory, you will find four files (``start``, ``end``,
-``nr_accesses``, and ``age``). Reading the files will show the start and end
-addresses, ``nr_accesses``, and ``age`` of the region that corresponding
-DAMON-based operation scheme ``action`` has tried to be applied.
+In each region directory, you will find five files (``start``, ``end``,
+``nr_accesses``, ``age``, and ``sz_filter_passed``). Reading the files will
+show the properties of the region that the corresponding DAMON-based operation
+scheme's ``action`` has tried to be applied to.

Example
~~~~~~~
@@ -600,306 +597,3 @@ fields are as usual. It shows the index of the DAMON context (``ctx_idx=X``)
of the scheme in the list of the contexts of the context's kdamond, the index
of the scheme (``scheme_idx=X``) in the list of the schemes of the context, in
addition to the output of ``damon_aggregated`` tracepoint.
-
-
-.. _debugfs_interface:
-
-debugfs Interface (DEPRECATED!)
-===============================
-
-.. note::
-
-  THIS IS DEPRECATED!
-
-  DAMON debugfs interface is deprecated, so users should move to the
-  :ref:`sysfs interface `. If you depend on this and cannot
-  move, please report your usecase to damon@lists.linux.dev and
-  linux-mm@kvack.org.
-
-DAMON exports nine files, ``DEPRECATED``, ``attrs``, ``target_ids``,
-``init_regions``, ``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``,
-``mk_contexts`` and ``rm_contexts`` under its debugfs directory,
-``/damon/``.
-
-
-``DEPRECATED`` is a read-only file for the DAMON debugfs interface deprecation
-notice. Reading it returns the deprecation notice, as below::
-
-    # cat DEPRECATED
-    DAMON debugfs interface is deprecated, so users should move to DAMON_SYSFS. If you cannot, please report your usecase to damon@lists.linux.dev and linux-mm@kvack.org.
- - -Attributes ----------- - -Users can get and set the ``sampling interval``, ``aggregation interval``, -``update interval``, and min/max number of monitoring target regions by -reading from and writing to the ``attrs`` file. To know about the monitoring -attributes in detail, please refer to the :doc:`/mm/damon/design`. For -example, below commands set those values to 5 ms, 100 ms, 1,000 ms, 10 and -1000, and then check it again:: - - # cd /damon - # echo 5000 100000 1000000 10 1000 > attrs - # cat attrs - 5000 100000 1000000 10 1000 - - -Target IDs ----------- - -Some types of address spaces supports multiple monitoring target. For example, -the virtual memory address spaces monitoring can have multiple processes as the -monitoring targets. Users can set the targets by writing relevant id values of -the targets to, and get the ids of the current targets by reading from the -``target_ids`` file. In case of the virtual address spaces monitoring, the -values should be pids of the monitoring target processes. For example, below -commands set processes having pids 42 and 4242 as the monitoring targets and -check it again:: - - # cd /damon - # echo 42 4242 > target_ids - # cat target_ids - 42 4242 - -Users can also monitor the physical memory address space of the system by -writing a special keyword, "``paddr\n``" to the file. Because physical address -space monitoring doesn't support multiple targets, reading the file will show a -fake value, ``42``, as below:: - - # cd /damon - # echo paddr > target_ids - # cat target_ids - 42 - -Note that setting the target ids doesn't start the monitoring. - - -Initial Monitoring Target Regions ---------------------------------- - -In case of the virtual address space monitoring, DAMON automatically sets and -updates the monitoring target regions so that entire memory mappings of target -processes can be covered. However, users can want to limit the monitoring -region to specific address ranges, such as the heap, the stack, or specific -file-mapped area. Or, some users can know the initial access pattern of their -workloads and therefore want to set optimal initial regions for the 'adaptive -regions adjustment'. - -In contrast, DAMON do not automatically sets and updates the monitoring target -regions in case of physical memory monitoring. Therefore, users should set the -monitoring target regions by themselves. - -In such cases, users can explicitly set the initial monitoring target regions -as they want, by writing proper values to the ``init_regions`` file. The input -should be a sequence of three integers separated by white spaces that represent -one region in below form.:: - - - -The ``target idx`` should be the index of the target in ``target_ids`` file, -starting from ``0``, and the regions should be passed in address order. For -example, below commands will set a couple of address ranges, ``1-100`` and -``100-200`` as the initial monitoring target region of pid 42, which is the -first one (index ``0``) in ``target_ids``, and another couple of address -ranges, ``20-40`` and ``50-100`` as that of pid 4242, which is the second one -(index ``1``) in ``target_ids``.:: - - # cd /damon - # cat target_ids - 42 4242 - # echo "0 1 100 \ - 0 100 200 \ - 1 20 40 \ - 1 50 100" > init_regions - -Note that this sets the initial monitoring target regions only. In case of -virtual memory monitoring, DAMON will automatically updates the boundary of the -regions after one ``update interval``. 
Therefore, users should set the -``update interval`` large enough in this case, if they don't want the -update. - - -Schemes -------- - -Users can get and set the DAMON-based operation :ref:`schemes -` by reading from and writing to ``schemes`` debugfs file. -Reading the file also shows the statistics of each scheme. To the file, each -of the schemes should be represented in each line in below form:: - - - -You can disable schemes by simply writing an empty string to the file. - -Target Access Pattern -~~~~~~~~~~~~~~~~~~~~~ - -The target access :ref:`pattern ` of the -scheme. The ```` is constructed with three ranges in -below form:: - - min-size max-size min-acc max-acc min-age max-age - -Specifically, bytes for the size of regions (``min-size`` and ``max-size``), -number of monitored accesses per aggregate interval for access frequency -(``min-acc`` and ``max-acc``), number of aggregate intervals for the age of -regions (``min-age`` and ``max-age``) are specified. Note that the ranges are -closed interval. - -Action -~~~~~~ - -The ```` is a predefined integer for memory management :ref:`actions -`. The mapping between the ```` values and -the memory management actions is as below. For the detailed meaning of the -action and DAMON operations set supporting each action, please refer to the -list on :ref:`design doc `. - - - 0: ``willneed`` - - 1: ``cold`` - - 2: ``pageout`` - - 3: ``hugepage`` - - 4: ``nohugepage`` - - 5: ``stat`` - -Quota -~~~~~ - -Users can set the :ref:`quotas ` of the given scheme -via the ```` in below form:: - - - -This makes DAMON to try to use only up to ```` milliseconds for applying -the action to memory regions of the ``target access pattern`` within the -```` milliseconds, and to apply the action to only up to -```` bytes of memory regions within the ````. Setting both -```` and ```` zero disables the quota limits. - -For the :ref:`prioritization `, users -can set the weights for the three properties in ```` in below -form:: - - - -Watermarks -~~~~~~~~~~ - -Users can specify :ref:`watermarks ` of the -given scheme via ```` in below form:: - - - -```` is a predefined integer for the metric to be checked. The -supported numbers and their meanings are as below. - - - 0: Ignore the watermarks - - 1: System's free memory rate (per thousand) - -The value of the metric is checked every ```` microseconds. - -If the value is higher than ```` or lower than ````, the -scheme is deactivated. If the value is lower than ````, the scheme -is activated. - -.. _damos_stats: - -Statistics -~~~~~~~~~~ - -It also counts the total number and bytes of regions that each scheme is tried -to be applied, the two numbers for the regions that each scheme is successfully -applied, and the total number of the quota limit exceeds. This statistics can -be used for online analysis or tuning of the schemes. - -The statistics can be shown by reading the ``schemes`` file. Reading the file -will show each scheme you entered in each line, and the five numbers for the -statistics will be added at the end of each line. - -Example -~~~~~~~ - -Below commands applies a scheme saying "If a memory region of size in [4KiB, -8KiB] is showing accesses per aggregate interval in [0, 5] for aggregate -interval in [10, 20], page out the region. For the paging out, use only up to -10ms per second, and also don't page out more than 1GiB per second. Under the -limitation, page out memory regions having longer age first. 
Also, check the -free memory rate of the system every 5 seconds, start the monitoring and paging -out when the free memory rate becomes lower than 50%, but stop it if the free -memory rate becomes larger than 60%, or lower than 30%".:: - - # cd /damon - # scheme="4096 8192 0 5 10 20 2" # target access pattern and action - # scheme+=" 10 $((1024*1024*1024)) 1000" # quotas - # scheme+=" 0 0 100" # prioritization weights - # scheme+=" 1 5000000 600 500 300" # watermarks - # echo "$scheme" > schemes - - -Turning On/Off --------------- - -Setting the files as described above doesn't incur effect unless you explicitly -start the monitoring. You can start, stop, and check the current status of the -monitoring by writing to and reading from the ``monitor_on_DEPRECATED`` file. -Writing ``on`` to the file starts the monitoring of the targets with the -attributes. Writing ``off`` to the file stops those. DAMON also stops if -every target process is terminated. Below example commands turn on, off, and -check the status of DAMON:: - - # cd /damon - # echo on > monitor_on_DEPRECATED - # echo off > monitor_on_DEPRECATED - # cat monitor_on_DEPRECATED - off - -Please note that you cannot write to the above-mentioned debugfs files while -the monitoring is turned on. If you write to the files while DAMON is running, -an error code such as ``-EBUSY`` will be returned. - - -Monitoring Thread PID ---------------------- - -DAMON does requested monitoring with a kernel thread called ``kdamond``. You -can get the pid of the thread by reading the ``kdamond_pid`` file. When the -monitoring is turned off, reading the file returns ``none``. :: - - # cd /damon - # cat monitor_on_DEPRECATED - off - # cat kdamond_pid - none - # echo on > monitor_on_DEPRECATED - # cat kdamond_pid - 18594 - - -Using Multiple Monitoring Threads ---------------------------------- - -One ``kdamond`` thread is created for each monitoring context. You can create -and remove monitoring contexts for multiple ``kdamond`` required use case using -the ``mk_contexts`` and ``rm_contexts`` files. - -Writing the name of the new context to the ``mk_contexts`` file creates a -directory of the name on the DAMON debugfs directory. The directory will have -DAMON debugfs files for the context. :: - - # cd /damon - # ls foo - # ls: cannot access 'foo': No such file or directory - # echo foo > mk_contexts - # ls foo - # attrs init_regions kdamond_pid schemes target_ids - -If the context is not needed anymore, you can remove it and the corresponding -directory by putting the name of the context to the ``rm_contexts`` file. :: - - # echo foo > rm_contexts - # ls foo - # ls: cannot access 'foo': No such file or directory - -Note that ``mk_contexts``, ``rm_contexts``, and ``monitor_on_DEPRECATED`` files -are in the root directory only. diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst index cb2c080f400c..33c886f3d198 100644 --- a/Documentation/admin-guide/mm/memory-hotplug.rst +++ b/Documentation/admin-guide/mm/memory-hotplug.rst @@ -280,8 +280,8 @@ The following files are currently defined: blocks; configure auto-onlining. The default value depends on the - CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel configuration - option. + CONFIG_MHP_DEFAULT_ONLINE_TYPE kernel configuration + options. See the ``state`` property of memory blocks for details. ``block_size_bytes`` read-only: the size in bytes of a memory block. 
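+For example, the below sketch shows how the auto-onlining policy can be read
+and adjusted at runtime via the same file (the accepted values are
+``offline``, ``online``, ``online_kernel`` and ``online_movable``; the default
+shown here is an assumption depending on the kernel config)::
+
+    # cat /sys/devices/system/memory/auto_online_blocks
+    offline
+    # echo online_movable > /sys/devices/system/memory/auto_online_blocks
+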
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 8872203df088..dff8d5985f0f 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -332,6 +332,12 @@ allocation policy for the internal shmem mount by using the kernel parameter
seven valid policies for shmem (``always``, ``within_size``, ``advise``,
``never``, ``deny``, and ``force``).

+Similarly to ``transparent_hugepage_shmem``, you can control the default
+hugepage allocation policy for the tmpfs mount by using the kernel parameter
+``transparent_hugepage_tmpfs=``, where ```` is one of the
+four valid policies for tmpfs (``always``, ``within_size``, ``advise``,
+``never``). The tmpfs mount default policy is ``never``.
+
In the same manner as ``thp_anon`` controls each supported anonymous THP
size, ``thp_shmem`` controls each supported shmem THP size.
``thp_shmem`` has the same format as ``thp_anon``, but also supports the policy
@@ -352,8 +358,21 @@ default to ``never``.

Hugepages in tmpfs/shmem
========================

-You can control hugepage allocation policy in tmpfs with mount option
-``huge=``. It can have following values:
+Traditionally, tmpfs only supported a single huge page size ("PMD"). Today,
+it also supports smaller sizes just like anonymous memory, often referred
+to as "multi-size THP" (mTHP). Huge pages of any size are commonly
+represented in the kernel as "large folios".
+
+While there is fine control over the huge page sizes to use for the internal
+shmem mount (see below), ordinary tmpfs mounts will make use of all available
+huge page sizes without any control over the exact sizes, behaving more like
+other file systems.
+
+tmpfs mounts
+------------
+
+The THP allocation policy for tmpfs mounts can be adjusted using the mount
+option: ``huge=``. It can have the following values:

always
    Attempt to allocate huge pages every time we need a new page;

@@ -363,24 +382,24 @@ never

within_size
    Only allocate huge page if it will be fully within i_size.
-    Also respect fadvise()/madvise() hints;
+    Also respect madvise() hints;

advise
-    Only allocate huge pages if requested with fadvise()/madvise();
+    Only allocate huge pages if requested with madvise();

-The default policy is ``never``.
+Remember that the kernel may use huge pages of all available sizes, and
+that no fine-grained control, as for the internal shmem mount, is available.
+
+The default policy in the past was ``never``, but it can now be adjusted
+using the kernel parameter ``transparent_hugepage_tmpfs=``.

``mount -o remount,huge= /mountpoint`` works fine after mount: remounting
``huge=never`` will not attempt to break up huge pages at all, just stop more
from being allocated.

-There's also sysfs knob to control hugepage allocation policy for internal
-shmem mount: /sys/kernel/mm/transparent_hugepage/shmem_enabled. The mount
-is used for SysV SHM, memfds, shared anonymous mmaps (of /dev/zero or
-MAP_ANONYMOUS), GPU drivers' DRM objects, Ashmem.
-
-In addition to policies listed above, shmem_enabled allows two further
-values:
+In addition to the policies listed above, the sysfs knob
+/sys/kernel/mm/transparent_hugepage/shmem_enabled will affect the
+allocation policy of tmpfs mounts, when set to the following values:

deny
    For use in emergencies, to force the huge option off from

@@ -388,13 +407,24 @@ deny
force
    Force the huge option on for all - very useful for testing;

-Shmem can also use "multi-size THP" (mTHP) by adding a new sysfs knob to
-control mTHP allocation:
-'/sys/kernel/mm/transparent_hugepage/hugepages-kB/shmem_enabled',
-and its value for each mTHP is essentially consistent with the global
-setting. An 'inherit' option is added to ensure compatibility with these
-global settings. Conversely, the options 'force' and 'deny' are dropped,
-which are rather testing artifacts from the old ages.
+shmem / internal tmpfs
+----------------------
+The internal tmpfs mount is used for SysV SHM, memfds, shared anonymous
+mmaps (of /dev/zero or MAP_ANONYMOUS), GPU drivers' DRM objects, Ashmem.
+
+To control the THP allocation policy for this internal tmpfs mount, the
+sysfs knob /sys/kernel/mm/transparent_hugepage/shmem_enabled and the knobs
+per THP size in
+'/sys/kernel/mm/transparent_hugepage/hugepages-kB/shmem_enabled'
+can be used.
+
+The global knob has the same semantics as the ``huge=`` mount option
+for tmpfs mounts, except that the different huge page sizes can be controlled
+individually, and will only use the setting of the global knob when the
+per-size knob is set to 'inherit'.
+
+The options 'force' and 'deny' are dropped for the individual sizes, which
+are rather testing artifacts from the old ages.

always
    Attempt to allocate huge pages every time we need a new page;

@@ -408,10 +438,10 @@ never

within_size
    Only allocate huge page if it will be fully within i_size.
-    Also respect fadvise()/madvise() hints;
+    Also respect madvise() hints;

advise
-    Only allocate huge pages if requested with fadvise()/madvise();
+    Only allocate huge pages if requested with madvise();

Need of application restart
===========================
@@ -561,6 +591,16 @@ swpin
    is incremented every time a huge page is swapped in from a non-zswap
    swap device in one piece.

+swpin_fallback
+    is incremented if swapin fails to allocate or charge a huge page
+    and instead falls back to using huge pages with lower orders or
+    small pages.
+
+swpin_fallback_charge
+    is incremented if swapin fails to charge a huge page and instead
+    falls back to using huge pages with lower orders or small pages
+    even though the allocation was successful.
+
swpout
    is incremented every time a huge page is swapped out to a non-zswap
    swap device in one piece without splitting.

diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 6a882c57a7e7..238afcb86d1f 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -48,6 +48,7 @@ fixes/update part 1.1  Stefani Seibold       June 9 2009
 3.11 /proc//patch_state - Livepatch patch operation state
 3.12 /proc//arch_status - Task architecture specific information
 3.13 /proc//fd - List of symlinks to open files
+ 3.14 /proc//fd for fast access. ------------------------------------------------------- +3.14 /proc//ksm_merging_pages shows.
+
+ksm_process_profit
+^^^^^^^^^^^^^^^^^^
+
+The profit that KSM brings (Saved bytes).
+KSM can save memory by merging
+identical pages, but also can consume additional memory, because it needs
+to generate a number of rmap_items to save each scanned page's brief rmap
+information. Some of these pages may be merged, but some may not be able
+to be merged even after being checked several times; such pages are
+unprofitable memory consumption.
+
+ksm_merge_any
+^^^^^^^^^^^^^
+
+It specifies whether the process's mm is added by prctl() into the
+candidate list of KSM or not, i.e., whether KSM scanning is fully enabled
+at the process level.
+
+ksm_mergeable
+^^^^^^^^^^^^^
+
+It specifies whether any VMAs of the process's mm are currently applicable
+to KSM.
+
+More information about KSM can be found at
+Documentation/admin-guide/mm/ksm.rst.
+
Chapter 4: Configuring procfs
=============================

@@ -2261,7 +2331,7 @@ arguments are now protected against local eavesdroppers.
hidepid=invisible or hidepid=2 means hidepid=1 plus all /proc// will be
fully invisible to other users. It doesn't mean that it hides a fact whether a
process with a specific pid value exists (it can be learned by other means, e.g.
-by "kill -0 $PID"), but it hides process' uid and gid, which may be learned by
+by "kill -0 $PID"), but it hides process's uid and gid, which may be learned by
stat()'ing /proc// otherwise. It greatly complicates an intruder's task of
gathering information about running processes, whether some daemon runs with
elevated privileges, whether other user runs some sensitive program, whether

diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst
index f9c50525bdbf..e28c6a1b40ae 100644
--- a/Documentation/mm/damon/design.rst
+++ b/Documentation/mm/damon/design.rst
@@ -203,6 +203,8 @@ This scheme, however, cannot preserve the quality of the output if the
assumption is not guaranteed.

+.. _damon_design_adaptive_regions_adjustment:
+
Adaptive Regions Adjustment
~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -264,6 +266,61 @@ tracepoints. For more details, please refer to the documentations for
respectively.

+.. _damon_design_monitoring_params_tuning_guide:
+
+Monitoring Parameters Tuning Guide
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In short, set ``aggregation interval`` to capture a meaningful amount of
+accesses for the purpose. The amount of accesses can be measured using
+``nr_accesses`` and ``age`` of regions in the aggregated monitoring results
+snapshot. The default value of the interval, ``100ms``, turns out to be too
+short in many cases. Set ``sampling interval`` proportional to ``aggregation
+interval``. By default, ``1/20`` is recommended as the ratio.
+
+``Aggregation interval`` should be set as a time interval within which the
+workload can make an amount of accesses sufficient for the monitoring purpose.
+If the interval is too short, only a small number of accesses is captured. As
+a result, the monitoring results look as if everything is accessed equally and
+only rarely. For many purposes, that would be useless. If it is too long,
+however, the time to converge regions with the :ref:`regions adjustment
+mechanism ` can be too long,
+depending on the time scale of the given purpose. This could happen if the
+workload is actually making only rare accesses but the user sets the target
+amount of accesses for the monitoring purpose too high. For such cases, the
+target amount of accesses to capture per ``aggregation interval`` should be
+carefully reconsidered. Also, note that the captured amount of accesses is
+represented with not only ``nr_accesses``, but also ``age``.
+For example, even if every region in the monitoring results shows zero
+``nr_accesses``, regions could still be distinguished using ``age`` values as
+the recency information.
+
+Hence the optimum value of ``aggregation interval`` depends on the access
+intensiveness of the workload. The user should tune the interval based on the
+amount of accesses that are captured in each aggregated snapshot of the
+monitoring results.
+
+Note that the default value of the interval is 100 milliseconds, which is too
+short in many cases, especially on large systems.
+
+``Sampling interval`` defines the resolution of each aggregation. If it is
+set too large, monitoring results will look like every region was equally
+rarely accessed, or equally frequently accessed. That is, regions become
+indistinguishable based on access pattern, and therefore the results will be
+useless in many use cases. If ``sampling interval`` is too small, it will not
+degrade the resolution, but will increase the monitoring overhead. If it is
+already appropriate enough to provide a resolution of the monitoring results
+that is sufficient for the given purpose, it shouldn't be unnecessarily
+further lowered. It is recommended to be set proportional to ``aggregation
+interval``. By default, the ratio is set as ``1/20``, and that is still
+recommended.
+
+Refer to the below document for an example tuning based on the above guide.
+
+.. toctree::
+   :maxdepth: 1
+
+   monitoring_intervals_tuning_example
+
+
.. _damon_design_damos:

Operation Schemes
@@ -504,9 +561,34 @@ have a list of latency-critical processes.

To let users optimize DAMOS schemes with such special knowledge, DAMOS provides
a feature called DAMOS filters. The feature allows users to set an arbitrary
-number of filters for each scheme. Each filter specifies the type of target
-memory, and whether it should exclude the memory of the type (filter-out), or
-all except the memory of the type (filter-in).
+number of filters for each scheme. Each filter specifies
+
+- a type of memory (``type``),
+- whether it is for the memory of the type or all except the type
+  (``matching``), and
+- whether it is to allow (include) or reject (exclude) applying
+  the scheme's action to the memory (``allow``).
+
+When multiple filters are installed, each filter is evaluated in the installed
+order. If a part of memory matches one of the filters, the remaining filters
+are ignored. If the memory passes through the filters evaluation stage
+because it matches none of the filters, applying the scheme's action to it is
+allowed, the same as when no filter exists.
+
+For example, let's assume 1) a filter for allowing anonymous pages and 2)
+another filter for rejecting young pages are installed in that order. If a
+page of a region that is eligible for the scheme's action is an anonymous
+page, the scheme's action will be applied to the page regardless of whether it
+is young or not, since it matches with the first allow-filter. If the page is
+not anonymous but young, the scheme's action will not be applied, since the
+second reject-filter blocks it. If the page is neither anonymous nor young,
+the page will pass through the filters evaluation stage since there is no
+matching filter, and the action will be applied to the page.
+
+Note that the action can equally be applied to memory that is either
+explicitly filter-allowed or has passed the filters evaluation stage. It
+means that installing allow-filters at the end of the list makes no practical
+difference, and only adds filter-checking overhead.
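+As a sketch of the above example on the DAMON sysfs interface (assuming a
+single kdamond, context, and scheme, all at index ``0``), the two filters
+could be installed like below::
+
+    # cd /sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/schemes/0/filters
+    # echo 2 > nr_filters
+    # # first filter: allow anonymous pages
+    # echo anon > 0/type; echo Y > 0/matching; echo Y > 0/allow
+    # # second filter: reject young pages
+    # echo young > 1/type; echo Y > 1/matching; echo N > 1/allow
+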
For efficient handling of filters, some types of filters are handled by the
core layer, while others are handled by operations set. In the latter case,
@@ -516,7 +598,7 @@ filter are not counted as the scheme has tried to the region. In contrast, if
a memory regions is filtered by an operations set layer-handled filter, it is
counted as the scheme has tried. This difference affects the statistics.

-Below types of filters are currently supported.
+Below ``type`` values of filters are currently supported.

- anonymous page - Applied to pages that containing data that not stored in files.
@@ -539,6 +621,60 @@ To know how user-space can set the watermarks via :ref:`DAMON sysfs interface
`, refer to :ref:`filters ` part of the
documentation.

+.. _damon_design_damos_stat:
+
+Statistics
+~~~~~~~~~~
+
+Statistics of DAMOS behaviors, designed to help with monitoring, tuning and
+debugging of DAMOS.
+
+DAMOS accounts the below statistics for each scheme, from the beginning of the
+scheme's execution.
+
+- ``nr_tried``: Total number of regions that the scheme is tried to be applied
+  to.
+- ``sz_tried``: Total size of regions that the scheme is tried to be applied
+  to.
+- ``sz_ops_filter_passed``: Total bytes that passed operations set
+  layer-handled DAMOS filters.
+- ``nr_applied``: Total number of regions that the scheme is applied to.
+- ``sz_applied``: Total size of regions that the scheme is applied to.
+- ``qt_exceeds``: Total number of times the quota of the scheme has been
+  exceeded.
+
+"A scheme is tried to be applied to a region" means DAMOS core logic determined
+the region is eligible to apply the scheme's :ref:`action
+`. The :ref:`access pattern
+`, :ref:`quotas
+`, :ref:`watermarks
+`, and :ref:`filters
+` that are handled by the core logic could affect
+this. The core logic will only ask the underlying :ref:`operation set
+` to apply the action to the region, so whether the
+action is really applied or not is unclear. That's why it is called "tried".
+
+"A scheme is applied to a region" means the :ref:`operation set
+` has applied the action to at least a part of the
+region. The :ref:`filters ` that are handled by
+the operation set, and the types of the :ref:`action `
+and the pages of the region can affect this. For example, if a filter is set
+to exclude anonymous pages and the region has only anonymous pages, or if the
+action is ``pageout`` while all pages of the region are unreclaimable, applying
+the action to the region will fail.
+
+To know how user-space can read the stats via :ref:`DAMON sysfs interface
+`, refer to the :ref:`stats ` part of the
+documentation.
+
+Regions Walking
+~~~~~~~~~~~~~~~
+
+A DAMOS feature that allows users to access each region that a DAMOS action
+has just been applied to. Using this feature, DAMON :ref:`API
+` allows users to access the full properties of the regions,
+including the access monitoring results and the amount of the region's
+internal memory that passed the DAMOS filters. :ref:`DAMON sysfs interface
+` also allows users to read the data via special
+:ref:`files `.
+
+.. _damon_design_api:

Application Programming Interface
---------------------------------
@@ -573,15 +709,11 @@ General Purpose User Interface Modules

DAMON modules that provide user space ABIs for general purpose DAMON usage in
runtime.

-DAMON user interface modules, namely 'DAMON sysfs interface' and 'DAMON debugfs
-interface' are DAMON API user kernel modules that provide ABIs to the
-user-space. Please note that DAMON debugfs interface is currently deprecated.
-
-Like many other ABIs, the modules create files on sysfs and debugfs, allow
-users to specify their requests to and get the answers from DAMON by writing to
-and reading from the files. As a response to such I/O, DAMON user interface
-modules control DAMON and retrieve the results as user requested via the DAMON
-API, and return the results to the user-space.
+Like many other ABIs, the modules create files on pseudo file systems like
+'sysfs', allow users to specify their requests to and get the answers from
+DAMON by writing to and reading from the files. As a response to such I/O,
+DAMON user interface modules control DAMON and retrieve the results as the
+user requested via the DAMON API, and return the results to the user-space.

The ABIs are designed to be used for user space applications development,
rather than human beings' fingers. Human users are recommended to use such
@@ -590,8 +722,9 @@ Github (https://github.com/damonitor/damo), Pypi
(https://pypistats.org/packages/damo), and Fedora
(https://packages.fedoraproject.org/pkgs/python-damo/damo/).

-Please refer to the ABI :doc:`document ` for
-details of the interfaces.
+Currently, one module of this type, namely 'DAMON sysfs interface', is
+available. Please refer to the ABI :ref:`doc ` for details of
+the interfaces.


Special-Purpose Access-aware Kernel Modules
@@ -599,8 +732,8 @@ Special-Purpose Access-aware Kernel Modules

DAMON modules that provide user space ABI for specific purpose DAMON usage.

-DAMON sysfs/debugfs user interfaces are for full control of all DAMON features
-in runtime. For each special-purpose system-wide data access-aware system
+DAMON user interface modules are for full control of all DAMON features in
+runtime. For each special-purpose system-wide data access-aware system
operations such as proactive reclamation or LRU lists balancing, the interfaces
could be simplified by removing unnecessary knobs for the specific purpose, and
extended for boot-time and even compile time control. Default values of DAMON

diff --git a/Documentation/mm/damon/monitoring_intervals_tuning_example.rst b/Documentation/mm/damon/monitoring_intervals_tuning_example.rst
new file mode 100644
index 000000000000..334a854efb40
--- /dev/null
+++ b/Documentation/mm/damon/monitoring_intervals_tuning_example.rst
@@ -0,0 +1,247 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===================================================
+DAMON Monitoring Interval Parameters Tuning Example
+===================================================
+
+DAMON's monitoring parameters need tuning based on the given workload and the
+monitoring purpose. There is a :ref:`tuning guide
+` for that. This document
+provides an example tuning based on the guide.
+
+Setup
+=====
+
+For the below example, DAMON of Linux kernel v6.11 and `damo
+`_ (DAMON user-space tool) v2.5.9 were used to
+monitor and visualize access patterns on the physical address space of a system
+running a real-world server workload.
+
+5ms/100ms intervals: Too Short Interval
+=======================================
+
+Let's start by capturing the access pattern snapshot on the physical address
+space of the system using DAMON, with the default interval parameters (5
+milliseconds and 100 milliseconds for the sampling and the aggregation
+intervals, respectively). We wait ten minutes between the start of DAMON and
+the capturing of the snapshot, to show meaningful time-wise access patterns.
+::
+
+    # damo start
+    # sleep 600
+    # damo record --snapshot 0 1
+    # damo stop
+
+Then, list the DAMON-found regions of different access patterns, sorted by the
+"access temperature". "Access temperature" is a metric representing the
+access-hotness of a region. It is calculated as a weighted sum of the access
+frequency and the age of the region. If the access frequency is 0 %, the
+temperature is multiplied by minus one. That is, if a region is not accessed,
+it gets a minus temperature, which gets lower the longer the region is not
+accessed. The sorting is in temperature-ascending order, so the region at the
+top of the list is the coldest, and the one at the bottom is the hottest
+one. ::
+
+    # damo report access --sort_regions_by temperature
+    0   addr 16.052 GiB size 5.985 GiB access 0 % age 5.900 s # coldest
+    1   addr 22.037 GiB size 6.029 GiB access 0 % age 5.300 s
+    2   addr 28.065 GiB size 6.045 GiB access 0 % age 5.200 s
+    3   addr 10.069 GiB size 5.983 GiB access 0 % age 4.500 s
+    4   addr 4.000 GiB size 6.069 GiB access 0 % age 4.400 s
+    5   addr 62.008 GiB size 3.992 GiB access 0 % age 3.700 s
+    6   addr 56.795 GiB size 5.213 GiB access 0 % age 3.300 s
+    7   addr 39.393 GiB size 6.096 GiB access 0 % age 2.800 s
+    8   addr 50.782 GiB size 6.012 GiB access 0 % age 2.800 s
+    9   addr 34.111 GiB size 5.282 GiB access 0 % age 2.300 s
+    10  addr 45.489 GiB size 5.293 GiB access 0 % age 1.800 s # hottest
+    total size: 62.000 GiB
+
+The list shows no seemingly hot regions, and only minimal access pattern
+diversity. Every region has zero access frequency. The number of regions is
+10, which is the default ``min_nr_regions`` value. The size of each region is
+also nearly identical. We can suspect this is because the “adaptive regions
+adjustment” mechanism was not working well. As the guide suggested, we can get
+the relative hotness of regions using ``age`` as the recency information. That
+would be better than nothing, but given the fact that the longest age is only
+about 6 seconds while we waited about ten minutes, it is unclear how useful
+this will be.
+
+The histogram visualization of the results, which shows the total size of
+regions in each temperature range, also shows no interesting distribution
+pattern. ::
+
+    # damo report access --style temperature-sz-hist
+
+    [-,590,000,000, -,549,000,000) 5.985 GiB |********** |
+    [-,549,000,000, -,508,000,000) 12.074 GiB |********************|
+    [-,508,000,000, -,467,000,000) 0 B | |
+    [-,467,000,000, -,426,000,000) 12.052 GiB |********************|
+    [-,426,000,000, -,385,000,000) 0 B | |
+    [-,385,000,000, -,344,000,000) 3.992 GiB |******* |
+    [-,344,000,000, -,303,000,000) 5.213 GiB |********* |
+    [-,303,000,000, -,262,000,000) 12.109 GiB |********************|
+    [-,262,000,000, -,221,000,000) 5.282 GiB |********* |
+    [-,221,000,000, -,180,000,000) 0 B | |
+    [-,180,000,000, -,139,000,000) 5.293 GiB |********* |
+    total size: 62.000 GiB
+
+In short, the parameters provide poor quality monitoring results for hot
+regions detection. According to the :ref:`guide
+`, this is due to the too short
+aggregation interval.
+
+100ms/2s intervals: Starts Showing Small Hot Regions
+====================================================
+
+Following the guide, increase the intervals 20 times (100 milliseconds and 2
+seconds for sampling and aggregation intervals, respectively). ::
+
+    # damo start -s 100ms -a 2s
+    # sleep 600
+    # damo record --snapshot 0 1
+    # damo stop
+    # damo report access --sort_regions_by temperature
+    0   addr 10.180 GiB size 6.117 GiB access 0 % age 7 m 8 s # coldest
+    1   addr 49.275 GiB size 6.195 GiB access 0 % age 6 m 14 s
+    2   addr 62.421 GiB size 3.579 GiB access 0 % age 6 m 4 s
+    3   addr 40.154 GiB size 6.127 GiB access 0 % age 5 m 40 s
+    4   addr 16.296 GiB size 6.182 GiB access 0 % age 5 m 32 s
+    5   addr 34.254 GiB size 5.899 GiB access 0 % age 5 m 24 s
+    6   addr 46.281 GiB size 2.995 GiB access 0 % age 5 m 20 s
+    7   addr 28.420 GiB size 5.835 GiB access 0 % age 5 m 6 s
+    8   addr 4.000 GiB size 6.180 GiB access 0 % age 4 m 16 s
+    9   addr 22.478 GiB size 5.942 GiB access 0 % age 3 m 58 s
+    10  addr 55.470 GiB size 915.645 MiB access 0 % age 3 m 6 s
+    11  addr 56.364 GiB size 6.056 GiB access 0 % age 2 m 8 s
+    12  addr 56.364 GiB size 4.000 KiB access 95 % age 16 s
+    13  addr 49.275 GiB size 4.000 KiB access 100 % age 8 m 24 s # hottest
+    total size: 62.000 GiB
+    # damo report access --style temperature-sz-hist
+
+    [-42,800,000,000, -33,479,999,000) 22.018 GiB |***************** |
+    [-33,479,999,000, -24,159,998,000) 27.090 GiB |********************|
+    [-24,159,998,000, -14,839,997,000) 6.836 GiB |****** |
+    [-14,839,997,000, -5,519,996,000) 6.056 GiB |***** |
+    [-5,519,996,000, 3,800,005,000) 4.000 KiB |* |
+    [3,800,005,000, 13,120,006,000) 0 B | |
+    [13,120,006,000, 22,440,007,000) 0 B | |
+    [22,440,007,000, 31,760,008,000) 0 B | |
+    [31,760,008,000, 41,080,009,000) 0 B | |
+    [41,080,009,000, 50,400,010,000) 4.000 KiB |* |
+    [50,400,010,000, 59,720,011,000) 4.000 KiB |* |
+    total size: 62.000 GiB
+
+DAMON found two distinct 4 KiB regions that are pretty hot. The regions are
+also well aged. The hottest 4 KiB region was keeping the access frequency for
+about 8 minutes, and the coldest region was keeping no access for about 7
+minutes. The distribution on the histogram also looks like it has a pattern.
+
+Especially, the finding of the 4 KiB regions among the 62 GiB total memory
+shows DAMON’s adaptive regions adjustment is working as designed.
+
+Still, the number of regions is close to ``min_nr_regions``, and the sizes of
+the cold regions are similar, though. Apparently it is improved, but it still
+has room to improve.
+
+400ms/8s intervals: Pretty Improved Results
+===========================================
+
+Increase the intervals four times (400 milliseconds and 8 seconds
+for sampling and aggregation intervals, respectively). ::
+
+    # damo start -s 400ms -a 8s
+    # sleep 600
+    # damo record --snapshot 0 1
+    # damo stop
+    # damo report access --sort_regions_by temperature
+    0   addr 64.492 GiB size 1.508 GiB access 0 % age 6 m 48 s # coldest
+    1   addr 21.749 GiB size 5.674 GiB access 0 % age 6 m 8 s
+    2   addr 27.422 GiB size 5.801 GiB access 0 % age 6 m
+    3   addr 49.431 GiB size 8.675 GiB access 0 % age 5 m 28 s
+    4   addr 33.223 GiB size 5.645 GiB access 0 % age 5 m 12 s
+    5   addr 58.321 GiB size 6.170 GiB access 0 % age 5 m 4 s
+    [...]
+    25  addr 6.615 GiB size 297.531 MiB access 15 % age 0 ns
+    26  addr 9.513 GiB size 12.000 KiB access 20 % age 0 ns
+    27  addr 9.511 GiB size 108.000 KiB access 25 % age 0 ns
+    28  addr 9.513 GiB size 20.000 KiB access 25 % age 0 ns
+    29  addr 9.511 GiB size 12.000 KiB access 30 % age 0 ns
+    30  addr 9.520 GiB size 4.000 KiB access 40 % age 0 ns
+    [...]
+    41  addr 9.520 GiB size 4.000 KiB access 80 % age 56 s
+    42  addr 9.511 GiB size 12.000 KiB access 100 % age 6 m 16 s
+    43  addr 58.321 GiB size 4.000 KiB access 100 % age 6 m 24 s
+    44  addr 9.512 GiB size 4.000 KiB access 100 % age 6 m 48 s
+    45  addr 58.106 GiB size 4.000 KiB access 100 % age 6 m 48 s # hottest
+    total size: 62.000 GiB
+    # damo report access --style temperature-sz-hist
+
+    [-40,800,000,000, -32,639,999,000) 21.657 GiB |********************|
+    [-32,639,999,000, -24,479,998,000) 17.938 GiB |***************** |
+    [-24,479,998,000, -16,319,997,000) 16.885 GiB |**************** |
+    [-16,319,997,000, -8,159,996,000) 586.879 MiB |* |
+    [-8,159,996,000, 5,000) 4.946 GiB |***** |
+    [5,000, 8,160,006,000) 260.000 KiB |* |
+    [8,160,006,000, 16,320,007,000) 0 B | |
+    [16,320,007,000, 24,480,008,000) 0 B | |
+    [24,480,008,000, 32,640,009,000) 0 B | |
+    [32,640,009,000, 40,800,010,000) 16.000 KiB |* |
+    [40,800,010,000, 48,960,011,000) 8.000 KiB |* |
+    total size: 62.000 GiB
+
+The number of regions having different access patterns has significantly
+increased. The size of each region is also more varied. The total size of
+non-zero access frequency regions is also significantly increased. Maybe this
+is already good enough to make some meaningful memory management efficiency
+changes.
+
+800ms/16s intervals: Another bias
+=================================
+
+Further double the intervals (800 milliseconds and 16 seconds for sampling
+and aggregation intervals, respectively). The results are further improved for
+hot regions detection, but cold regions detection starts looking degraded. ::
+
+    # damo start -s 800ms -a 16s
+    # sleep 600
+    # damo record --snapshot 0 1
+    # damo stop
+    # damo report access --sort_regions_by temperature
+    0   addr 64.781 GiB size 1.219 GiB access 0 % age 4 m 48 s
+    1   addr 24.505 GiB size 2.475 GiB access 0 % age 4 m 16 s
+    2   addr 26.980 GiB size 504.273 MiB access 0 % age 4 m
+    3   addr 29.443 GiB size 2.462 GiB access 0 % age 4 m
+    4   addr 37.264 GiB size 5.645 GiB access 0 % age 4 m
+    5   addr 31.905 GiB size 5.359 GiB access 0 % age 3 m 44 s
+    [...]
+    20  addr 8.711 GiB size 40.000 KiB access 5 % age 2 m 40 s
+    21  addr 27.473 GiB size 1.970 GiB access 5 % age 4 m
+    22  addr 48.185 GiB size 4.625 GiB access 5 % age 4 m
+    23  addr 47.304 GiB size 902.117 MiB access 10 % age 4 m
+    24  addr 8.711 GiB size 4.000 KiB access 100 % age 4 m
+    25  addr 20.793 GiB size 3.713 GiB access 5 % age 4 m 16 s
+    26  addr 8.773 GiB size 4.000 KiB access 100 % age 4 m 16 s
+    total size: 62.000 GiB
+    # damo report access --style temperature-sz-hist
+
+    [-28,800,000,000, -23,359,999,000) 12.294 GiB |***************** |
+    [-23,359,999,000, -17,919,998,000) 9.753 GiB |************* |
+    [-17,919,998,000, -12,479,997,000) 15.131 GiB |********************|
+    [-12,479,997,000, -7,039,996,000) 0 B | |
+    [-7,039,996,000, -1,599,995,000) 7.506 GiB |********** |
+    [-1,599,995,000, 3,840,006,000) 6.127 GiB |********* |
+    [3,840,006,000, 9,280,007,000) 0 B | |
+    [9,280,007,000, 14,720,008,000) 136.000 KiB |* |
+    [14,720,008,000, 20,160,009,000) 40.000 KiB |* |
+    [20,160,009,000, 25,600,010,000) 11.188 GiB |*************** |
+    [25,600,010,000, 31,040,011,000) 4.000 KiB |* |
+    total size: 62.000 GiB
+
+It found more non-zero access frequency regions. The number of regions is
+still much higher than ``min_nr_regions``, but it is reduced from that of the
+previous setup. And the distribution apparently seems a bit biased towards hot
+regions.
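+Once a suitable pair of intervals is found, it can also be applied to an
+already-running kdamond without the ``damo`` tool, using the DAMON sysfs
+interface directly. Below is a sketch, assuming the sysfs files were already
+set up (e.g., by a previous ``damo start``) and a single kdamond and context,
+both at index ``0``::
+
+    # cd /sys/kernel/mm/damon/admin/kdamonds/0
+    # echo 400000 > contexts/0/monitoring_attrs/intervals/sample_us
+    # echo 8000000 > contexts/0/monitoring_attrs/intervals/aggr_us
+    # echo commit > state
+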
+ +Conclusion +========== + +With the above experimental tuning results, we can conclude that the theory and +the guide make sense for at least this workload, and could be applied to +similar cases. diff --git a/Documentation/mm/process_addrs.rst b/Documentation/mm/process_addrs.rst index 1d416658d7f5..f573de936b5d 100644 --- a/Documentation/mm/process_addrs.rst +++ b/Documentation/mm/process_addrs.rst @@ -531,6 +531,10 @@ are extra requirements for accessing them: new page table has been installed in the same location and filled with entries. Writers normally need to take the PTE lock and revalidate that the PMD entry still refers to the same PTE-level page table. + If the writer does not care whether it is the same PTE-level page table, it + can take the PMD lock and revalidate that the contents of the PMD entry still + meet the requirements. In particular, this also happens in :c:func:`!retract_page_tables` + when handling :c:macro:`!MADV_COLLAPSE`. To access PTE-level page tables, a helper like :c:func:`!pte_offset_map_lock` or :c:func:`!pte_offset_map` can be used depending on stability requirements. @@ -712,9 +716,14 @@ calls :c:func:`!rcu_read_lock` to ensure that the VMA is looked up in an RCU critical section, then attempts to VMA lock it via :c:func:`!vma_start_read`, before releasing the RCU lock via :c:func:`!rcu_read_unlock`. -VMA read locks hold the read lock on the :c:member:`!vma->vm_lock` semaphore for -their duration and the caller of :c:func:`!lock_vma_under_rcu` must release it -via :c:func:`!vma_end_read`. +In cases where the caller already holds the mmap read lock, :c:func:`!vma_start_read_locked` +and :c:func:`!vma_start_read_locked_nested` can be used. These functions do not +fail due to lock contention, but the caller should still check their return +values in case they fail for other reasons. + +VMA read locks increment the :c:member:`!vma.vm_refcnt` reference counter for +their duration, and the caller of :c:func:`!lock_vma_under_rcu` must drop it via +:c:func:`!vma_end_read`. VMA **write** locks are acquired via :c:func:`!vma_start_write` in instances where a VMA is about to be modified, unlike :c:func:`!vma_start_read` the lock is always @@ -722,9 +731,9 @@ acquired. An mmap write lock **must** be held for the duration of the VMA write lock, releasing or downgrading the mmap write lock also releases the VMA write lock so there is no :c:func:`!vma_end_write` function. -Note that a semaphore write lock is not held across a VMA lock. Rather, a -sequence number is used for serialisation, and the write semaphore is only -acquired at the point of write lock to update this. +Note that when write-locking a VMA, the :c:member:`!vma.vm_refcnt` is temporarily +modified so that readers can detect the presence of a writer. The reference counter +is restored once the VMA sequence number used for serialisation is updated. This ensures the semantics we require - VMA write locks provide exclusive write access to the VMA. @@ -734,7 +743,7 @@ Implementation details The VMA lock mechanism is designed to be a lightweight means of avoiding the use of the heavily contended mmap lock. It is implemented using a combination of a -read/write semaphore and sequence numbers belonging to the containing +reference counter and sequence numbers belonging to the containing :c:struct:`!struct mm_struct` and the VMA.
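+
+As a concrete illustration of the read-side rules above, here is a minimal
+sketch of a lookup under the per-VMA lock. Only :c:func:`!lock_vma_under_rcu`
+and :c:func:`!vma_end_read` are interfaces named in this document; the
+fallback helper is hypothetical and stands in for whatever mmap-lock path the
+caller would otherwise take. ::
+
+	struct vm_area_struct *vma;
+
+	/* RCU-safe lookup which also takes the VMA read lock. */
+	vma = lock_vma_under_rcu(mm, address);
+	if (!vma)
+		/* Lookup or lock failed: fall back to the mmap read lock. */
+		return do_with_mmap_lock(mm, address); /* hypothetical helper */
+
+	/* ... the VMA can be used here; writers are excluded ... */
+
+	vma_end_read(vma);	/* drop the vma.vm_refcnt reference */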
Read locks are acquired via :c:func:`!vma_start_read`, which is an optimistic @@ -775,28 +784,31 @@ release of any VMA locks on its release makes sense, as you would never want to keep VMAs locked across entirely separate write operations. It also maintains correct lock ordering. -Each time a VMA read lock is acquired, we acquire a read lock on the -:c:member:`!vma->vm_lock` read/write semaphore and hold it, while checking that -the sequence count of the VMA does not match that of the mm. +Each time a VMA read lock is acquired, we increment the :c:member:`!vma.vm_refcnt` +reference counter and check that the sequence count of the VMA does not match +that of the mm. -If it does, the read lock fails. If it does not, we hold the lock, excluding -writers, but permitting other readers, who will also obtain this lock under RCU. +If it does, the read lock fails and :c:member:`!vma.vm_refcnt` is dropped. +If it does not, we keep the reference counter raised, excluding writers, but +permitting other readers, who can also obtain this lock under RCU. Importantly, maple tree operations performed in :c:func:`!lock_vma_under_rcu` are also RCU safe, so the whole read lock operation is guaranteed to function correctly. -On the write side, we acquire a write lock on the :c:member:`!vma->vm_lock` -read/write semaphore, before setting the VMA's sequence number under this lock, -also simultaneously holding the mmap write lock. +On the write side, we set a bit in :c:member:`!vma.vm_refcnt` which can't be +modified by readers and wait for all readers to drop their reference count. +Once there are no readers, the VMA's sequence number is set to match that of +the mm. During this entire operation the mmap write lock is held. This way, if any read locks are in effect, :c:func:`!vma_start_write` will sleep until these are finished and mutual exclusion is achieved. -After setting the VMA's sequence number, the lock is released, avoiding -complexity with a long-term held write lock. +After setting the VMA's sequence number, the bit in :c:member:`!vma.vm_refcnt` +indicating a writer is cleared. From this point on, the VMA's sequence number will +indicate the VMA's write-locked state until the mmap write lock is dropped or downgraded. -This clever combination of a read/write semaphore and sequence count allows for +This clever combination of a reference counter and sequence count allows for fast RCU-based per-VMA lock acquisition (especially on page fault, though utilised elsewhere) with minimal complexity around lock ordering. diff --git a/Documentation/mm/split_page_table_lock.rst b/Documentation/mm/split_page_table_lock.rst index 581446d4a4eb..8e1ceb0a6619 100644 --- a/Documentation/mm/split_page_table_lock.rst +++ b/Documentation/mm/split_page_table_lock.rst @@ -62,7 +62,7 @@ Support of split page table lock by an architecture =================================================== There's no need in special enabling of PTE split page table lock: everything -required is done by pagetable_pte_ctor() and pagetable_pte_dtor(), which +required is done by pagetable_pte_ctor() and pagetable_dtor(), which must be called on PTE table allocation / freeing. Make sure the architecture doesn't use slab allocator for page table @@ -73,7 +73,7 @@ PMD split lock only makes sense if you have more than two page table levels. PMD split lock enabling requires pagetable_pmd_ctor() call on PMD table -allocation and pagetable_pmd_dtor() on freeing. +allocation and pagetable_dtor() on freeing.
Allocation usually happens in pmd_alloc_one(), freeing in pmd_free() and pmd_free_tlb(), but make sure you cover all PMD table allocation / freeing diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst index 50f6f0b6bf11..9d7cb51be493 100644 --- a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst +++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst @@ -26,12 +26,7 @@ DAMON 为不同的用户提供了下面这些接口。 使用它,用户可以通过读取和写入特殊的sysfs文件来使用DAMON的主要功能。因此,你可以编写和使 用你个性化的DAMON sysfs包装程序,代替你读/写sysfs文件。 `DAMON用户空间工具 `_ 就是这种程序的一个例子 它同时支持虚拟和物理地址 - 空间的监测。注意,这个界面只提供简单的监测结果 :ref:`统计 `。对于详细的监测 - 结果,DAMON提供了一个:ref:`跟踪点 `。 -- *debugfs interface.* - :ref:`这 ` 几乎与:ref:`sysfs interface ` 接 - 口相同。这将在下一个LTS内核发布后被移除,所以用户应该转移到 - :ref:`sysfs interface `。 + 空间的监测。 - *内核空间编程接口。* :doc:`这 ` 这是为内核空间程序员准备的。使用它,用户可以通过为你编写内 核空间的DAMON应用程序,最灵活有效地利用DAMON的每一个功能。你甚至可以为各种地址空间扩展DAMON。 @@ -335,247 +330,6 @@ tried_regions// 请注意,我们强烈建议使用用户空间的工具,如 `damo `_ , 而不是像上面那样手动读写文件。以上只是一个例子。 -debugfs接口 -=========== - -.. note:: - - DAMON debugfs接口将在下一个LTS内核发布后被移除,所以用户应该转移到 - :ref:`sysfs接口`。 - -DAMON导出了八个文件, ``attrs``, ``target_ids``, ``init_regions``, -``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``, ``mk_contexts`` 和 -``rm_contexts`` under its debugfs directory, ``/damon/``. - - -属性 ----- - -用户可以通过读取和写入 ``attrs`` 文件获得和设置 ``采样间隔`` 、 ``聚集间隔`` 、 ``更新间隔`` -以及监测目标区域的最小/最大数量。要详细了解监测属性,请参考 `:doc:/mm/damon/design` 。例如, -下面的命令将这些值设置为5ms、100ms、1000ms、10和1000,然后再次检查:: - - # cd /damon - # echo 5000 100000 1000000 10 1000 > attrs - # cat attrs - 5000 100000 1000000 10 1000 - - -目标ID ------- - -一些类型的地址空间支持多个监测目标。例如,虚拟内存地址空间的监测可以有多个进程作为监测目标。用户 -可以通过写入目标的相关id值来设置目标,并通过读取 ``target_ids`` 文件来获得当前目标的id。在监 -测虚拟地址空间的情况下,这些值应该是监测目标进程的pid。例如,下面的命令将pid为42和4242的进程设 -为监测目标,并再次检查:: - - # cd /damon - # echo 42 4242 > target_ids - # cat target_ids - 42 4242 - -用户还可以通过在文件中写入一个特殊的关键字 "paddr\n" 来监测系统的物理内存地址空间。因为物理地 -址空间监测不支持多个目标,读取文件会显示一个假值,即 ``42`` ,如下图所示:: - - # cd /damon - # echo paddr > target_ids - # cat target_ids - 42 - -请注意,设置目标ID并不启动监测。 - - -初始监测目标区域 ----------------- - -在虚拟地址空间监测的情况下,DAMON自动设置和更新监测的目标区域,这样就可以覆盖目标进程的整个 -内存映射。然而,用户可能希望将监测区域限制在特定的地址范围内,如堆、栈或特定的文件映射区域。 -或者,一些用户可以知道他们工作负载的初始访问模式,因此希望为“自适应区域调整”设置最佳初始区域。 - -相比之下,DAMON在物理内存监测的情况下不会自动设置和更新监测目标区域。因此,用户应该自己设置 -监测目标区域。 - -在这种情况下,用户可以通过在 ``init_regions`` 文件中写入适当的值,明确地设置他们想要的初 -始监测目标区域。输入应该是一个由三个整数组成的队列,用空格隔开,代表一个区域的形式如下:: - - - -目标idx应该是 ``target_ids`` 文件中目标的索引,从 ``0`` 开始,区域应该按照地址顺序传递。 -例如,下面的命令将设置几个地址范围, ``1-100`` 和 ``100-200`` 作为pid 42的初始监测目标 -区域,这是 ``target_ids`` 中的第一个(索引 ``0`` ),另外几个地址范围, ``20-40`` 和 -``50-100`` 作为pid 4242的地址,这是 ``target_ids`` 中的第二个(索引 ``1`` ):: - - # cd /damon - # cat target_ids - 42 4242 - # echo "0 1 100 \ - 0 100 200 \ - 1 20 40 \ - 1 50 100" > init_regions - -请注意,这只是设置了初始的监测目标区域。在虚拟内存监测的情况下,DAMON会在一个 ``更新间隔`` -后自动更新区域的边界。因此,在这种情况下,如果用户不希望更新的话,应该把 ``更新间隔`` 设 -置得足够大。 - - -方案 ----- - -对于通常的基于DAMON的数据访问感知的内存管理优化,用户只是希望系统对特定访问模式的内存区域应用内 -存管理操作。DAMON从用户那里接收这种形式化的操作方案,并将这些方案应用到目标进程中。 - -用户可以通过读取和写入 ``scheme`` debugfs文件来获得和设置这些方案。读取该文件还可以显示每个 -方案的统计数据。在文件中,每一个方案都应该在每一行中以下列形式表示出来:: - - - -你可以通过简单地在文件中写入一个空字符串来禁用方案。 - -目标访问模式 -~~~~~~~~~~~~ - -``<目标访问模式>`` 是由三个范围构成的,形式如下:: - - min-size max-size min-acc max-acc min-age max-age - -具体来说,区域大小的字节数( `min-size` 和 `max-size` ),访问频率的每聚合区间的监测访问次 -数( `min-acc` 和 `max-acc` ),区域年龄的聚合区间数( `min-age` 和 `max-age` )都被指定。 -请注意,这些范围是封闭区间。 - -动作 -~~~~ - -```` 是一个预定义的内存管理动作的整数,DAMON将应用于具有目标访问模式的区域。支持 -的数字和它们的含义如下:: - - - 0: Call ``madvise()`` for the region 
with ``MADV_WILLNEED`` - - 1: Call ``madvise()`` for the region with ``MADV_COLD`` - - 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT`` - - 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE`` - - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE`` - - 5: Do nothing but count the statistics - -配额 -~~~~ - -每个 ``动作`` 的最佳 ``目标访问模式`` 取决于工作负载,所以不容易找到。更糟糕的是,将某个 -动作的方案设置得过于激进会导致严重的开销。为了避免这种开销,用户可以通过下面表格中的 ```` -来限制方案的时间和大小配额:: - - - -这使得DAMON在 ```` 毫秒内,尽量只用 ```` 毫秒的时间对 ``目标访 -问模式`` 的内存区域应用动作,并在 ```` 内只对最多字节的内存区域应 -用动作。将 ```` 和 ```` 都设置为零,可以禁用配额限制。 - -当预计超过配额限制时,DAMON会根据 ``目标访问模式`` 的大小、访问频率和年龄,对发现的内存 -区域进行优先排序。为了实现个性化的优先级,用户可以在 ``<优先级权重>`` 中设置这三个属性的 -权重,具体形式如下:: - - - -水位 -~~~~ - -有些方案需要根据系统特定指标的当前值来运行,如自由内存比率。对于这种情况,用户可以为该条 -件指定水位。:: - - - -```` 是一个预定义的整数,用于要检查的度量。支持的数字和它们的含义如下。 - - - 0: 忽视水位 - - 1: 系统空闲内存率 (千分比) - -每隔 ``<检查间隔>`` 微秒检查一次公制的值。 - -如果该值高于 ``<高标>`` 或低于 ``<低标>`` ,该方案被停用。如果该值低于 ``<中标>`` , -该方案将被激活。 - -统计数据 -~~~~~~~~ - -它还统计每个方案被尝试应用的区域的总数量和字节数,每个方案被成功应用的区域的两个数量,以 -及超过配额限制的总数量。这些统计数据可用于在线分析或调整方案。 - -统计数据可以通过读取方案文件来显示。读取该文件将显示你在每一行中输入的每个 ``方案`` , -统计的五个数字将被加在每一行的末尾。 - -例子 -~~~~ - -下面的命令应用了一个方案:”如果一个大小为[4KiB, 8KiB]的内存区域在[10, 20]的聚合时间 -间隔内显示出每一个聚合时间间隔[0, 5]的访问量,请分页出该区域。对于分页,每秒最多只能使 -用10ms,而且每秒分页不能超过1GiB。在这一限制下,首先分页出具有较长年龄的内存区域。另外, -每5秒钟检查一次系统的可用内存率,当可用内存率低于50%时开始监测和分页,但如果可用内存率 -大于60%,或低于30%,则停止监测“:: - - # cd /damon - # scheme="4096 8192 0 5 10 20 2" # target access pattern and action - # scheme+=" 10 $((1024*1024*1024)) 1000" # quotas - # scheme+=" 0 0 100" # prioritization weights - # scheme+=" 1 5000000 600 500 300" # watermarks - # echo "$scheme" > schemes - - -开关 ----- - -除非你明确地启动监测,否则如上所述的文件设置不会产生效果。你可以通过写入和读取 ``monitor_on_DEPRECATED`` -文件来启动、停止和检查监测的当前状态。写入 ``on`` 该文件可以启动对有属性的目标的监测。写入 -``off`` 该文件则停止这些目标。如果每个目标进程被终止,DAMON也会停止。下面的示例命令开启、关 -闭和检查DAMON的状态:: - - # cd /damon - # echo on > monitor_on_DEPRECATED - # echo off > monitor_on_DEPRECATED - # cat monitor_on_DEPRECATED - off - -请注意,当监测开启时,你不能写到上述的debugfs文件。如果你在DAMON运行时写到这些文件,将会返 -回一个错误代码,如 ``-EBUSY`` 。 - - -监测线程PID ------------ - -DAMON通过一个叫做kdamond的内核线程来进行请求监测。你可以通过读取 ``kdamond_pid`` 文件获 -得该线程的 ``pid`` 。当监测被 ``关闭`` 时,读取该文件不会返回任何信息:: - - # cd /damon - # cat monitor_on_DEPRECATED - off - # cat kdamond_pid - none - # echo on > monitor_on_DEPRECATED - # cat kdamond_pid - 18594 - - -使用多个监测线程 ----------------- - -每个监测上下文都会创建一个 ``kdamond`` 线程。你可以使用 ``mk_contexts`` 和 ``rm_contexts`` -文件为多个 ``kdamond`` 需要的用例创建和删除监测上下文。 - -将新上下文的名称写入 ``mk_contexts`` 文件,在 ``DAMON debugfs`` 目录上创建一个该名称的目录。 -该目录将有该上下文的 ``DAMON debugfs`` 文件:: - - # cd /damon - # ls foo - # ls: cannot access 'foo': No such file or directory - # echo foo > mk_contexts - # ls foo - # attrs init_regions kdamond_pid schemes target_ids - -如果不再需要上下文,你可以通过把上下文的名字放到 ``rm_contexts`` 文件中来删除它和相应的目录:: - - # echo foo > rm_contexts - # ls foo - # ls: cannot access 'foo': No such file or directory - -注意, ``mk_contexts`` 、 ``rm_contexts`` 和 ``monitor_on_DEPRECATED`` 文件只在根目录下。 - 监测结果的监测点 ================ diff --git a/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst b/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst index fbbbbad59ee4..d3fd4f850793 100644 --- a/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst +++ b/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst @@ -26,12 +26,7 @@ DAMON 爲不同的用戶提供了下面這些接口。 使用它,用戶可以通過讀取和寫入特殊的sysfs文件來使用DAMON的主要功能。因此,你可以編寫和使 用你個性化的DAMON sysfs包裝程序,代替你讀/寫sysfs文件。 `DAMON用戶空間工具 `_ 就是這種程序的一個例子 它同時支持虛擬和物理地址 - 空間的監測。注意,這個界面只提供簡單的監測結果 :ref:`統計 `。對於詳細的監測 - 結果,DAMON提供了一個:ref:`跟蹤點 `。 -- *debugfs 
interface.* - :ref:`這 ` 幾乎與:ref:`sysfs interface ` 接 - 口相同。這將在下一個LTS內核發佈後被移除,所以用戶應該轉移到 - :ref:`sysfs interface `。 + 空間的監測。 - *內核空間編程接口。* :doc:`這 ` 這是爲內核空間程序員準備的。使用它,用戶可以通過爲你編寫內 核空間的DAMON應用程序,最靈活有效地利用DAMON的每一個功能。你甚至可以爲各種地址空間擴展DAMON。 @@ -335,247 +330,6 @@ tried_regions// 請注意,我們強烈建議使用用戶空間的工具,如 `damo `_ , 而不是像上面那樣手動讀寫文件。以上只是一個例子。 -debugfs接口 -=========== - -.. note:: - - DAMON debugfs接口將在下一個LTS內核發佈後被移除,所以用戶應該轉移到 - :ref:`sysfs接口`。 - -DAMON導出了八個文件, ``attrs``, ``target_ids``, ``init_regions``, -``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``, ``mk_contexts`` 和 -``rm_contexts`` under its debugfs directory, ``/damon/``. - - -屬性 ----- - -用戶可以通過讀取和寫入 ``attrs`` 文件獲得和設置 ``採樣間隔`` 、 ``聚集間隔`` 、 ``更新間隔`` -以及監測目標區域的最小/最大數量。要詳細瞭解監測屬性,請參考 `:doc:/mm/damon/design` 。例如, -下面的命令將這些值設置爲5ms、100ms、1000ms、10和1000,然後再次檢查:: - - # cd /damon - # echo 5000 100000 1000000 10 1000 > attrs - # cat attrs - 5000 100000 1000000 10 1000 - - -目標ID ------- - -一些類型的地址空間支持多個監測目標。例如,虛擬內存地址空間的監測可以有多個進程作爲監測目標。用戶 -可以通過寫入目標的相關id值來設置目標,並通過讀取 ``target_ids`` 文件來獲得當前目標的id。在監 -測虛擬地址空間的情況下,這些值應該是監測目標進程的pid。例如,下面的命令將pid爲42和4242的進程設 -爲監測目標,並再次檢查:: - - # cd /damon - # echo 42 4242 > target_ids - # cat target_ids - 42 4242 - -用戶還可以通過在文件中寫入一個特殊的關鍵字 "paddr\n" 來監測系統的物理內存地址空間。因爲物理地 -址空間監測不支持多個目標,讀取文件會顯示一個假值,即 ``42`` ,如下圖所示:: - - # cd /damon - # echo paddr > target_ids - # cat target_ids - 42 - -請注意,設置目標ID並不啓動監測。 - - -初始監測目標區域 ----------------- - -在虛擬地址空間監測的情況下,DAMON自動設置和更新監測的目標區域,這樣就可以覆蓋目標進程的整個 -內存映射。然而,用戶可能希望將監測區域限制在特定的地址範圍內,如堆、棧或特定的文件映射區域。 -或者,一些用戶可以知道他們工作負載的初始訪問模式,因此希望爲“自適應區域調整”設置最佳初始區域。 - -相比之下,DAMON在物理內存監測的情況下不會自動設置和更新監測目標區域。因此,用戶應該自己設置 -監測目標區域。 - -在這種情況下,用戶可以通過在 ``init_regions`` 文件中寫入適當的值,明確地設置他們想要的初 -始監測目標區域。輸入應該是一個由三個整數組成的隊列,用空格隔開,代表一個區域的形式如下:: - - - -目標idx應該是 ``target_ids`` 文件中目標的索引,從 ``0`` 開始,區域應該按照地址順序傳遞。 -例如,下面的命令將設置幾個地址範圍, ``1-100`` 和 ``100-200`` 作爲pid 42的初始監測目標 -區域,這是 ``target_ids`` 中的第一個(索引 ``0`` ),另外幾個地址範圍, ``20-40`` 和 -``50-100`` 作爲pid 4242的地址,這是 ``target_ids`` 中的第二個(索引 ``1`` ):: - - # cd /damon - # cat target_ids - 42 4242 - # echo "0 1 100 \ - 0 100 200 \ - 1 20 40 \ - 1 50 100" > init_regions - -請注意,這只是設置了初始的監測目標區域。在虛擬內存監測的情況下,DAMON會在一個 ``更新間隔`` -後自動更新區域的邊界。因此,在這種情況下,如果用戶不希望更新的話,應該把 ``更新間隔`` 設 -置得足夠大。 - - -方案 ----- - -對於通常的基於DAMON的數據訪問感知的內存管理優化,用戶只是希望系統對特定訪問模式的內存區域應用內 -存管理操作。DAMON從用戶那裏接收這種形式化的操作方案,並將這些方案應用到目標進程中。 - -用戶可以通過讀取和寫入 ``scheme`` debugfs文件來獲得和設置這些方案。讀取該文件還可以顯示每個 -方案的統計數據。在文件中,每一個方案都應該在每一行中以下列形式表示出來:: - - - -你可以通過簡單地在文件中寫入一個空字符串來禁用方案。 - -目標訪問模式 -~~~~~~~~~~~~ - -``<目標訪問模式>`` 是由三個範圍構成的,形式如下:: - - min-size max-size min-acc max-acc min-age max-age - -具體來說,區域大小的字節數( `min-size` 和 `max-size` ),訪問頻率的每聚合區間的監測訪問次 -數( `min-acc` 和 `max-acc` ),區域年齡的聚合區間數( `min-age` 和 `max-age` )都被指定。 -請注意,這些範圍是封閉區間。 - -動作 -~~~~ - -```` 是一個預定義的內存管理動作的整數,DAMON將應用於具有目標訪問模式的區域。支持 -的數字和它們的含義如下:: - - - 0: Call ``madvise()`` for the region with ``MADV_WILLNEED`` - - 1: Call ``madvise()`` for the region with ``MADV_COLD`` - - 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT`` - - 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE`` - - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE`` - - 5: Do nothing but count the statistics - -配額 -~~~~ - -每個 ``動作`` 的最佳 ``目標訪問模式`` 取決於工作負載,所以不容易找到。更糟糕的是,將某個 -動作的方案設置得過於激進會導致嚴重的開銷。爲了避免這種開銷,用戶可以通過下面表格中的 ```` -來限制方案的時間和大小配額:: - - - -這使得DAMON在 ```` 毫秒內,儘量只用 ```` 毫秒的時間對 ``目標訪 -問模式`` 的內存區域應用動作,並在 ```` 內只對最多字節的內存區域應 -用動作。將 ```` 和 ```` 都設置爲零,可以禁用配額限制。 - -當預計超過配額限制時,DAMON會根據 ``目標訪問模式`` 的大小、訪問頻率和年齡,對發現的內存 -區域進行優先排序。爲了實現個性化的優先級,用戶可以在 ``<優先級權重>`` 中設置這三個屬性的 -權重,具體形式如下:: - - - -水位 -~~~~ - 
-有些方案需要根據系統特定指標的當前值來運行,如自由內存比率。對於這種情況,用戶可以爲該條 -件指定水位。:: - - - -```` 是一個預定義的整數,用於要檢查的度量。支持的數字和它們的含義如下。 - - - 0: 忽視水位 - - 1: 系統空閒內存率 (千分比) - -每隔 ``<檢查間隔>`` 微秒檢查一次公制的值。 - -如果該值高於 ``<高標>`` 或低於 ``<低標>`` ,該方案被停用。如果該值低於 ``<中標>`` , -該方案將被激活。 - -統計數據 -~~~~~~~~ - -它還統計每個方案被嘗試應用的區域的總數量和字節數,每個方案被成功應用的區域的兩個數量,以 -及超過配額限制的總數量。這些統計數據可用於在線分析或調整方案。 - -統計數據可以通過讀取方案文件來顯示。讀取該文件將顯示你在每一行中輸入的每個 ``方案`` , -統計的五個數字將被加在每一行的末尾。 - -例子 -~~~~ - -下面的命令應用了一個方案:”如果一個大小爲[4KiB, 8KiB]的內存區域在[10, 20]的聚合時間 -間隔內顯示出每一個聚合時間間隔[0, 5]的訪問量,請分頁出該區域。對於分頁,每秒最多隻能使 -用10ms,而且每秒分頁不能超過1GiB。在這一限制下,首先分頁出具有較長年齡的內存區域。另外, -每5秒鐘檢查一次系統的可用內存率,當可用內存率低於50%時開始監測和分頁,但如果可用內存率 -大於60%,或低於30%,則停止監測“:: - - # cd /damon - # scheme="4096 8192 0 5 10 20 2" # target access pattern and action - # scheme+=" 10 $((1024*1024*1024)) 1000" # quotas - # scheme+=" 0 0 100" # prioritization weights - # scheme+=" 1 5000000 600 500 300" # watermarks - # echo "$scheme" > schemes - - -開關 ----- - -除非你明確地啓動監測,否則如上所述的文件設置不會產生效果。你可以通過寫入和讀取 ``monitor_on_DEPRECATED`` -文件來啓動、停止和檢查監測的當前狀態。寫入 ``on`` 該文件可以啓動對有屬性的目標的監測。寫入 -``off`` 該文件則停止這些目標。如果每個目標進程被終止,DAMON也會停止。下面的示例命令開啓、關 -閉和檢查DAMON的狀態:: - - # cd /damon - # echo on > monitor_on_DEPRECATED - # echo off > monitor_on_DEPRECATED - # cat monitor_on_DEPRECATED - off - -請注意,當監測開啓時,你不能寫到上述的debugfs文件。如果你在DAMON運行時寫到這些文件,將會返 -回一個錯誤代碼,如 ``-EBUSY`` 。 - - -監測線程PID ------------ - -DAMON通過一個叫做kdamond的內核線程來進行請求監測。你可以通過讀取 ``kdamond_pid`` 文件獲 -得該線程的 ``pid`` 。當監測被 ``關閉`` 時,讀取該文件不會返回任何信息:: - - # cd /damon - # cat monitor_on_DEPRECATED - off - # cat kdamond_pid - none - # echo on > monitor_on_DEPRECATED - # cat kdamond_pid - 18594 - - -使用多個監測線程 ----------------- - -每個監測上下文都會創建一個 ``kdamond`` 線程。你可以使用 ``mk_contexts`` 和 ``rm_contexts`` -文件爲多個 ``kdamond`` 需要的用例創建和刪除監測上下文。 - -將新上下文的名稱寫入 ``mk_contexts`` 文件,在 ``DAMON debugfs`` 目錄上創建一個該名稱的目錄。 -該目錄將有該上下文的 ``DAMON debugfs`` 文件:: - - # cd /damon - # ls foo - # ls: cannot access 'foo': No such file or directory - # echo foo > mk_contexts - # ls foo - # attrs init_regions kdamond_pid schemes target_ids - -如果不再需要上下文,你可以通過把上下文的名字放到 ``rm_contexts`` 文件中來刪除它和相應的目錄:: - - # echo foo > rm_contexts - # ls foo - # ls: cannot access 'foo': No such file or directory - -注意, ``mk_contexts`` 、 ``rm_contexts`` 和 ``monitor_on_DEPRECATED`` 文件只在根目錄下。 - 監測結果的監測點 ================ diff --git a/MAINTAINERS b/MAINTAINERS index 19800134a9dd..ecb70ad017b8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6327,6 +6327,7 @@ F: Documentation/mm/damon/ F: include/linux/damon.h F: include/trace/events/damon.h F: mm/damon/ +F: samples/damon/ F: tools/testing/selftests/damon/ DAVICOM FAST ETHERNET (DMFE) NETWORK DRIVER @@ -15068,7 +15069,15 @@ L: linux-mm@kvack.org S: Maintained W: http://www.linux-mm.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +F: mm/mlock.c F: mm/mmap.c +F: mm/mprotect.c +F: mm/mremap.c +F: mm/mseal.c +F: mm/vma.c +F: mm/vma.h +F: mm/vma_internal.h +F: tools/testing/vma/ MEMORY TECHNOLOGY DEVICES (MTD) M: Miquel Raynal @@ -25018,21 +25027,6 @@ F: include/uapi/linux/vsockmon.h F: net/vmw_vsock/ F: tools/testing/vsock/ -VMA -M: Andrew Morton -M: Liam R. 
Howlett -M: Lorenzo Stoakes -R: Vlastimil Babka -R: Jann Horn -L: linux-mm@kvack.org -S: Maintained -W: https://www.linux-mm.org -T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm -F: mm/vma.c -F: mm/vma.h -F: mm/vma_internal.h -F: tools/testing/vma/ - VMALLOC M: Andrew Morton R: Uladzislau Rezki diff --git a/arch/alpha/kernel/core_cia.c b/arch/alpha/kernel/core_cia.c index ca3d9c732b61..6e577228e175 100644 --- a/arch/alpha/kernel/core_cia.c +++ b/arch/alpha/kernel/core_cia.c @@ -331,10 +331,7 @@ cia_prepare_tbia_workaround(int window) long i; /* Use minimal 1K map. */ - ppte = memblock_alloc(CIA_BROKEN_TBIA_SIZE, 32768); - if (!ppte) - panic("%s: Failed to allocate %u bytes align=0x%x\n", - __func__, CIA_BROKEN_TBIA_SIZE, 32768); + ppte = memblock_alloc_or_panic(CIA_BROKEN_TBIA_SIZE, 32768); pte = (virt_to_phys(ppte) >> (PAGE_SHIFT - 1)) | 1; for (i = 0; i < CIA_BROKEN_TBIA_SIZE / sizeof(unsigned long); ++i) diff --git a/arch/alpha/kernel/core_marvel.c b/arch/alpha/kernel/core_marvel.c index b22248044bf0..b1bfbd11980d 100644 --- a/arch/alpha/kernel/core_marvel.c +++ b/arch/alpha/kernel/core_marvel.c @@ -81,10 +81,7 @@ mk_resource_name(int pe, int port, char *str) char *name; sprintf(tmp, "PCI %s PE %d PORT %d", str, pe, port); - name = memblock_alloc(strlen(tmp) + 1, SMP_CACHE_BYTES); - if (!name) - panic("%s: Failed to allocate %zu bytes\n", __func__, - strlen(tmp) + 1); + name = memblock_alloc_or_panic(strlen(tmp) + 1, SMP_CACHE_BYTES); strcpy(name, tmp); return name; @@ -119,10 +116,7 @@ alloc_io7(unsigned int pe) return NULL; } - io7 = memblock_alloc(sizeof(*io7), SMP_CACHE_BYTES); - if (!io7) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*io7)); + io7 = memblock_alloc_or_panic(sizeof(*io7), SMP_CACHE_BYTES); io7->pe = pe; raw_spin_lock_init(&io7->irq_lock); diff --git a/arch/alpha/kernel/pci.c b/arch/alpha/kernel/pci.c index 4458eb7f44f0..8e9b4ac86b7e 100644 --- a/arch/alpha/kernel/pci.c +++ b/arch/alpha/kernel/pci.c @@ -391,10 +391,7 @@ alloc_pci_controller(void) { struct pci_controller *hose; - hose = memblock_alloc(sizeof(*hose), SMP_CACHE_BYTES); - if (!hose) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*hose)); + hose = memblock_alloc_or_panic(sizeof(*hose), SMP_CACHE_BYTES); *hose_tail = hose; hose_tail = &hose->next; @@ -405,13 +402,7 @@ alloc_pci_controller(void) struct resource * __init alloc_resource(void) { - void *ptr = memblock_alloc(sizeof(struct resource), SMP_CACHE_BYTES); - - if (!ptr) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(struct resource)); - - return ptr; + return memblock_alloc_or_panic(sizeof(struct resource), SMP_CACHE_BYTES); } diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c index 7fcf3e9b7103..681f56089d9c 100644 --- a/arch/alpha/kernel/pci_iommu.c +++ b/arch/alpha/kernel/pci_iommu.c @@ -71,14 +71,8 @@ iommu_arena_new_node(int nid, struct pci_controller *hose, dma_addr_t base, if (align < mem_size) align = mem_size; - arena = memblock_alloc(sizeof(*arena), SMP_CACHE_BYTES); - if (!arena) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*arena)); - arena->ptes = memblock_alloc(mem_size, align); - if (!arena->ptes) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, mem_size, align); + arena = memblock_alloc_or_panic(sizeof(*arena), SMP_CACHE_BYTES); + arena->ptes = memblock_alloc_or_panic(mem_size, align); spin_lock_init(&arena->lock); arena->hose = hose; diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 
4fe618446e4c..61c2198b1359 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -42,7 +42,7 @@ pgd_alloc(struct mm_struct *mm) { pgd_t *ret, *init; - ret = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + ret = __pgd_alloc(mm, 0); init = pgd_offset(&init_mm, 0UL); if (ret) { #ifdef CONFIG_ALPHA_LARGE_VMALLOC diff --git a/arch/arc/include/asm/pgalloc.h b/arch/arc/include/asm/pgalloc.h index 096b8ef58edb..dfae070fe8d5 100644 --- a/arch/arc/include/asm/pgalloc.h +++ b/arch/arc/include/asm/pgalloc.h @@ -53,19 +53,14 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte_ static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - pgd_t *ret = (pgd_t *) __get_free_page(GFP_KERNEL); + pgd_t *ret = __pgd_alloc(mm, 0); if (ret) { int num, num2; - num = USER_PTRS_PER_PGD + USER_KERNEL_GUTTER / PGDIR_SIZE; - memzero(ret, num * sizeof(pgd_t)); + num = USER_PTRS_PER_PGD + USER_KERNEL_GUTTER / PGDIR_SIZE; num2 = VMALLOC_SIZE / PGDIR_SIZE; memcpy(ret + num, swapper_pg_dir + num, num2 * sizeof(pgd_t)); - - memzero(ret + num + num2, - (PTRS_PER_PGD - num - num2) * sizeof(pgd_t)); - } return ret; } diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index f40d06ad5d2a..ea4fbe7b17f6 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -26,14 +26,7 @@ #else /* !CONFIG_MMU */ -#include #include - -static inline void __tlb_remove_table(void *_table) -{ - free_page_and_swap_cache((struct page *)_table); -} - #include static inline void @@ -41,8 +34,6 @@ __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr) { struct ptdesc *ptdesc = page_ptdesc(pte); - pagetable_pte_dtor(ptdesc); - #ifndef CONFIG_ARM_LPAE /* * With the classic ARM MMU, a pte page has two corresponding pmd @@ -61,7 +52,6 @@ __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr) #ifdef CONFIG_ARM_LPAE struct ptdesc *ptdesc = virt_to_ptdesc(pmdp); - pagetable_pmd_dtor(ptdesc); tlb_remove_ptdesc(tlb, ptdesc); #endif } diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index e6a857bf0ce6..a41c93988d2c 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -880,10 +880,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc) */ boot_alias_start = phys_to_idmap(start); if (arm_has_idmap_alias() && boot_alias_start != IDMAP_INVALID_ADDR) { - res = memblock_alloc(sizeof(*res), SMP_CACHE_BYTES); - if (!res) - panic("%s: Failed to allocate %zu bytes\n", - __func__, sizeof(*res)); + res = memblock_alloc_or_panic(sizeof(*res), SMP_CACHE_BYTES); res->name = "System RAM (boot alias)"; res->start = boot_alias_start; res->end = phys_to_idmap(res_end); @@ -891,10 +888,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc) request_resource(&iomem_resource, res); } - res = memblock_alloc(sizeof(*res), SMP_CACHE_BYTES); - if (!res) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*res)); + res = memblock_alloc_or_panic(sizeof(*res), SMP_CACHE_BYTES); res->name = "System RAM"; res->start = start; res->end = res_end; diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index f5b7a16c5803..f02f872ea8a9 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -726,13 +726,8 @@ EXPORT_SYMBOL(phys_mem_access_prot); static void __init *early_alloc(unsigned long sz) { - void *ptr = memblock_alloc(sz, sz); + return memblock_alloc_or_panic(sz, sz); - if (!ptr) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, sz, sz); - - return ptr; } static 
void *__init late_alloc(unsigned long sz) @@ -1027,10 +1022,7 @@ void __init iotable_init(struct map_desc *io_desc, int nr) if (!nr) return; - svm = memblock_alloc(sizeof(*svm) * nr, __alignof__(*svm)); - if (!svm) - panic("%s: Failed to allocate %zu bytes align=0x%zx\n", - __func__, sizeof(*svm) * nr, __alignof__(*svm)); + svm = memblock_alloc_or_panic(sizeof(*svm) * nr, __alignof__(*svm)); for (md = io_desc; nr; md++, nr--) { create_mapping(md); @@ -1052,10 +1044,7 @@ void __init vm_reserve_area_early(unsigned long addr, unsigned long size, struct vm_struct *vm; struct static_vm *svm; - svm = memblock_alloc(sizeof(*svm), __alignof__(*svm)); - if (!svm) - panic("%s: Failed to allocate %zu bytes align=0x%zx\n", - __func__, sizeof(*svm), __alignof__(*svm)); + svm = memblock_alloc_or_panic(sizeof(*svm), __alignof__(*svm)); vm = &svm->vm; vm->addr = (void *)addr; diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c index c415f3859b20..1a8f6914ee59 100644 --- a/arch/arm/mm/nommu.c +++ b/arch/arm/mm/nommu.c @@ -162,10 +162,7 @@ void __init paging_init(const struct machine_desc *mdesc) mpu_setup(); /* allocate the zero page. */ - zero_page = (void *)memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!zero_page) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + zero_page = (void *)memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); bootmem_init(); diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c index f8e9bc58a84f..4eb81b7ed03a 100644 --- a/arch/arm/mm/pgd.c +++ b/arch/arm/mm/pgd.c @@ -17,11 +17,11 @@ #include "mm.h" #ifdef CONFIG_ARM_LPAE -#define __pgd_alloc() kmalloc_array(PTRS_PER_PGD, sizeof(pgd_t), GFP_KERNEL) -#define __pgd_free(pgd) kfree(pgd) +#define _pgd_alloc(mm) kmalloc_array(PTRS_PER_PGD, sizeof(pgd_t), GFP_KERNEL | __GFP_ZERO) +#define _pgd_free(mm, pgd) kfree(pgd) #else -#define __pgd_alloc() (pgd_t *)__get_free_pages(GFP_KERNEL, 2) -#define __pgd_free(pgd) free_pages((unsigned long)pgd, 2) +#define _pgd_alloc(mm) __pgd_alloc(mm, 2) +#define _pgd_free(mm, pgd) __pgd_free(mm, pgd) #endif /* @@ -35,12 +35,10 @@ pgd_t *pgd_alloc(struct mm_struct *mm) pmd_t *new_pmd, *init_pmd; pte_t *new_pte, *init_pte; - new_pgd = __pgd_alloc(); + new_pgd = _pgd_alloc(mm); if (!new_pgd) goto no_pgd; - memset(new_pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); - /* * Copy over the kernel and IO PGD entries */ @@ -134,7 +132,7 @@ no_pmd: no_pud: p4d_free(mm, new_p4d); no_p4d: - __pgd_free(new_pgd); + _pgd_free(mm, new_pgd); no_pgd: return NULL; } @@ -207,5 +205,5 @@ no_pgd: p4d_free(mm, p4d); } #endif - __pgd_free(pgd_base); + _pgd_free(mm, pgd_base); } diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index e75422864d1b..1b4509d3382c 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -85,24 +85,6 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgdp, p4d_t *p4dp) __pgd_populate(pgdp, __pa(p4dp), pgdval); } -static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr) -{ - gfp_t gfp = GFP_PGTABLE_USER; - - if (mm == &init_mm) - gfp = GFP_PGTABLE_KERNEL; - return (p4d_t *)get_zeroed_page(gfp); -} - -static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) -{ - if (!pgtable_l5_enabled()) - return; - BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); - free_page((unsigned long)p4d); -} - -#define __p4d_free_tlb(tlb, p4d, addr) p4d_free((tlb)->mm, p4d) #else static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t p4dp, pgdval_t prot) { diff --git 
a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h index a947c6e784ed..8d762607285c 100644 --- a/arch/arm64/include/asm/tlb.h +++ b/arch/arm64/include/asm/tlb.h @@ -9,12 +9,7 @@ #define __ASM_TLB_H #include -#include -static inline void __tlb_remove_table(void *_table) -{ - free_page_and_swap_cache((struct page *)_table); -} #define tlb_flush tlb_flush static void tlb_flush(struct mmu_gather *tlb); @@ -82,7 +77,6 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, { struct ptdesc *ptdesc = page_ptdesc(pte); - pagetable_pte_dtor(ptdesc); tlb_remove_ptdesc(tlb, ptdesc); } @@ -92,7 +86,6 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, { struct ptdesc *ptdesc = virt_to_ptdesc(pmdp); - pagetable_pmd_dtor(ptdesc); tlb_remove_ptdesc(tlb, ptdesc); } #endif @@ -106,7 +99,19 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pudp, if (!pgtable_l4_enabled()) return; - pagetable_pud_dtor(ptdesc); + tlb_remove_ptdesc(tlb, ptdesc); +} +#endif + +#if CONFIG_PGTABLE_LEVELS > 4 +static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4dp, + unsigned long addr) +{ + struct ptdesc *ptdesc = virt_to_ptdesc(p4dp); + + if (!pgtable_l5_enabled()) + return; + tlb_remove_ptdesc(tlb, ptdesc); } #endif diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 4f613e8e0745..85104587f849 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -223,9 +223,7 @@ static void __init request_standard_resources(void) num_standard_resources = memblock.memory.cnt; res_size = num_standard_resources * sizeof(*standard_resources); - standard_resources = memblock_alloc(res_size, SMP_CACHE_BYTES); - if (!standard_resources) - panic("%s: Failed to allocate %zu bytes\n", __func__, res_size); + standard_resources = memblock_alloc_or_panic(res_size, SMP_CACHE_BYTES); for_each_mem_region(region) { res = &standard_resources[i++]; diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c index 0c501cabc238..8160cff35089 100644 --- a/arch/arm64/mm/pgd.c +++ b/arch/arm64/mm/pgd.c @@ -33,7 +33,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) gfp_t gfp = GFP_PGTABLE_USER; if (pgdir_is_page_size()) - return (pgd_t *)__get_free_page(gfp); + return __pgd_alloc(mm, 0); else return kmem_cache_alloc(pgd_cache, gfp); } @@ -41,7 +41,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) void pgd_free(struct mm_struct *mm, pgd_t *pgd) { if (pgdir_is_page_size()) - free_page((unsigned long)pgd); + __pgd_free(mm, pgd); else kmem_cache_free(pgd_cache, pgd); } diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h index 9c84c9012e53..bf8400c28b5a 100644 --- a/arch/csky/include/asm/pgalloc.h +++ b/arch/csky/include/asm/pgalloc.h @@ -44,7 +44,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) pgd_t *ret; pgd_t *init; - ret = (pgd_t *) __get_free_page(GFP_KERNEL); + ret = __pgd_alloc(mm, 0); if (ret) { init = pgd_offset(&init_mm, 0UL); pgd_init((unsigned long *)ret); @@ -63,7 +63,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) #define __pte_free_tlb(tlb, pte, address) \ do { \ - pagetable_pte_dtor(page_ptdesc(pte)); \ + pagetable_dtor(page_ptdesc(pte)); \ tlb_remove_page_ptdesc(tlb, page_ptdesc(pte)); \ } while (0) diff --git a/arch/hexagon/include/asm/pgalloc.h b/arch/hexagon/include/asm/pgalloc.h index 55988625e6fb..1ee5f5f157ca 100644 --- a/arch/hexagon/include/asm/pgalloc.h +++ b/arch/hexagon/include/asm/pgalloc.h @@ -22,7 +22,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; - pgd = (pgd_t 
*)__get_free_page(GFP_KERNEL | __GFP_ZERO); + pgd = __pgd_alloc(mm, 0); /* * There may be better ways to do this, but to ensure @@ -89,7 +89,7 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, #define __pte_free_tlb(tlb, pte, addr) \ do { \ - pagetable_pte_dtor((page_ptdesc(pte))); \ + pagetable_dtor((page_ptdesc(pte))); \ tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ } while (0) diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig index 4dffc90192f7..1cc6e8843680 100644 --- a/arch/loongarch/configs/loongson3_defconfig +++ b/arch/loongarch/configs/loongson3_defconfig @@ -113,7 +113,10 @@ CONFIG_ZBUD=y CONFIG_ZSMALLOC=m # CONFIG_COMPAT_BRK is not set CONFIG_MEMORY_HOTPLUG=y -CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y +# CONFIG_MHP_DEFAULT_ONLINE_TYPE_OFFLINE is not set +CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO=y +# CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_KERNEL is not set +# CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE is not set CONFIG_MEMORY_HOTREMOVE=y CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y diff --git a/arch/loongarch/include/asm/pgalloc.h b/arch/loongarch/include/asm/pgalloc.h index a7b9c9e73593..7211dff8c969 100644 --- a/arch/loongarch/include/asm/pgalloc.h +++ b/arch/loongarch/include/asm/pgalloc.h @@ -57,7 +57,7 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) #define __pte_free_tlb(tlb, pte, address) \ do { \ - pagetable_pte_dtor(page_ptdesc(pte)); \ + pagetable_dtor(page_ptdesc(pte)); \ tlb_remove_page_ptdesc((tlb), page_ptdesc(pte)); \ } while (0) diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index 56934fe58170..edcfdfcad7d2 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -431,7 +431,7 @@ static void __init resource_init(void) num_standard_resources = memblock.memory.cnt; res_size = num_standard_resources * sizeof(*standard_resources); - standard_resources = memblock_alloc(res_size, SMP_CACHE_BYTES); + standard_resources = memblock_alloc_or_panic(res_size, SMP_CACHE_BYTES); for_each_mem_region(region) { res = &standard_resources[i++]; diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index 188b52bbb254..ca5aa5f46a9f 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -174,9 +174,7 @@ pte_t * __init populate_kernel_pte(unsigned long addr) pmd_t *pmd; if (p4d_none(p4dp_get(p4d))) { - pud = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!pud) - panic("%s: Failed to allocate memory\n", __func__); + pud = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); p4d_populate(&init_mm, p4d, pud); #ifndef __PAGETABLE_PUD_FOLDED pud_init(pud); @@ -185,9 +183,7 @@ pte_t * __init populate_kernel_pte(unsigned long addr) pud = pud_offset(p4d, addr); if (pud_none(pudp_get(pud))) { - pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!pmd) - panic("%s: Failed to allocate memory\n", __func__); + pmd = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); pud_populate(&init_mm, pud, pmd); #ifndef __PAGETABLE_PMD_FOLDED pmd_init(pmd); @@ -198,10 +194,7 @@ pte_t * __init populate_kernel_pte(unsigned long addr) if (!pmd_present(pmdp_get(pmd))) { pte_t *pte; - pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!pte) - panic("%s: Failed to allocate memory\n", __func__); - + pte = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); pmd_populate_kernel(&init_mm, pmd, pte); kernel_pte_init(pte); } diff --git a/arch/loongarch/mm/pgtable.c b/arch/loongarch/mm/pgtable.c index 3fa69b23ff84..22a94bb3e6e8 100644 --- 
a/arch/loongarch/mm/pgtable.c +++ b/arch/loongarch/mm/pgtable.c @@ -23,11 +23,10 @@ EXPORT_SYMBOL(tlb_virt_to_page); pgd_t *pgd_alloc(struct mm_struct *mm) { - pgd_t *init, *ret = NULL; - struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0); + pgd_t *init, *ret; - if (ptdesc) { - ret = (pgd_t *)ptdesc_address(ptdesc); + ret = __pgd_alloc(mm, 0); + if (ret) { init = pgd_offset(&init_mm, 0UL); pgd_init(ret); memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD, diff --git a/arch/m68k/include/asm/mcf_pgalloc.h b/arch/m68k/include/asm/mcf_pgalloc.h index 302c5bf67179..4c648b51e7fd 100644 --- a/arch/m68k/include/asm/mcf_pgalloc.h +++ b/arch/m68k/include/asm/mcf_pgalloc.h @@ -37,7 +37,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pgtable, { struct ptdesc *ptdesc = virt_to_ptdesc(pgtable); - pagetable_pte_dtor(ptdesc); + pagetable_dtor(ptdesc); pagetable_free(ptdesc); } @@ -61,7 +61,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pgtable) { struct ptdesc *ptdesc = virt_to_ptdesc(pgtable); - pagetable_pte_dtor(ptdesc); + pagetable_dtor(ptdesc); pagetable_free(ptdesc); } @@ -73,7 +73,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pgtable) static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - pagetable_free(virt_to_ptdesc(pgd)); + pagetable_dtor_free(virt_to_ptdesc(pgd)); } static inline pgd_t *pgd_alloc(struct mm_struct *mm) @@ -84,6 +84,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) if (!ptdesc) return NULL; + pagetable_pgd_ctor(ptdesc); new_pgd = ptdesc_address(ptdesc); memcpy(new_pgd, swapper_pg_dir, PTRS_PER_PGD * sizeof(pgd_t)); diff --git a/arch/m68k/include/asm/motorola_pgalloc.h b/arch/m68k/include/asm/motorola_pgalloc.h index 74a817d9387f..5abe7da8ac5a 100644 --- a/arch/m68k/include/asm/motorola_pgalloc.h +++ b/arch/m68k/include/asm/motorola_pgalloc.h @@ -9,9 +9,9 @@ extern void mmu_page_ctor(void *page); extern void mmu_page_dtor(void *page); enum m68k_table_types { - TABLE_PGD = 0, - TABLE_PMD = 0, /* same size as PGD */ - TABLE_PTE = 1, + TABLE_PGD, + TABLE_PMD, + TABLE_PTE, }; extern void init_pointer_table(void *table, int type); diff --git a/arch/m68k/include/asm/sun3_pgalloc.h b/arch/m68k/include/asm/sun3_pgalloc.h index 4a137eecb6fe..f1ae4ed890db 100644 --- a/arch/m68k/include/asm/sun3_pgalloc.h +++ b/arch/m68k/include/asm/sun3_pgalloc.h @@ -19,7 +19,7 @@ extern const char bad_pmd_string[]; #define __pte_free_tlb(tlb, pte, addr) \ do { \ - pagetable_pte_dtor(page_ptdesc(pte)); \ + pagetable_dtor(page_ptdesc(pte)); \ tlb_remove_page_ptdesc((tlb), page_ptdesc(pte)); \ } while (0) @@ -43,7 +43,7 @@ static inline pgd_t * pgd_alloc(struct mm_struct *mm) { pgd_t *new_pgd; - new_pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL); + new_pgd = __pgd_alloc(mm, 0); memcpy(new_pgd, swapper_pg_dir, PAGE_SIZE); memset(new_pgd, 0, (PAGE_OFFSET >> PGDIR_SHIFT)); return new_pgd; diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index 1b47bec15832..8b11d0d545aa 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -68,10 +68,7 @@ void __init paging_init(void) high_memory = (void *) end_mem; - empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!empty_zero_page) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); max_zone_pfn[ZONE_DMA] = end_mem >> PAGE_SHIFT; free_area_init(max_zone_pfn); } diff --git a/arch/m68k/mm/mcfmmu.c b/arch/m68k/mm/mcfmmu.c index 
9a6fa342e872..19a75029036c 100644 --- a/arch/m68k/mm/mcfmmu.c +++ b/arch/m68k/mm/mcfmmu.c @@ -42,20 +42,14 @@ void __init paging_init(void) unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; int i; - empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!empty_zero_page) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); pg_dir = swapper_pg_dir; memset(swapper_pg_dir, 0, sizeof(swapper_pg_dir)); size = num_pages * sizeof(pte_t); size = (size + PAGE_SIZE) & ~(PAGE_SIZE-1); - next_pgtable = (unsigned long) memblock_alloc(size, PAGE_SIZE); - if (!next_pgtable) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, size, PAGE_SIZE); + next_pgtable = (unsigned long) memblock_alloc_or_panic(size, PAGE_SIZE); pg_dir += PAGE_OFFSET >> PGDIR_SHIFT; diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index c1761d309fc6..73651e093c4d 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -97,17 +97,19 @@ void mmu_page_dtor(void *page) typedef struct list_head ptable_desc; -static struct list_head ptable_list[2] = { +static struct list_head ptable_list[3] = { LIST_HEAD_INIT(ptable_list[0]), LIST_HEAD_INIT(ptable_list[1]), + LIST_HEAD_INIT(ptable_list[2]), }; #define PD_PTABLE(page) ((ptable_desc *)&(virt_to_page((void *)(page))->lru)) #define PD_PAGE(ptable) (list_entry(ptable, struct page, lru)) #define PD_MARKBITS(dp) (*(unsigned int *)&PD_PAGE(dp)->index) -static const int ptable_shift[2] = { - 7+2, /* PGD, PMD */ +static const int ptable_shift[3] = { + 7+2, /* PGD */ + 7+2, /* PMD */ 6+2, /* PTE */ }; @@ -156,12 +158,20 @@ void *get_pointer_table(int type) if (!(page = (void *)get_zeroed_page(GFP_KERNEL))) return NULL; - if (type == TABLE_PTE) { + switch (type) { + case TABLE_PTE: /* * m68k doesn't have SPLIT_PTE_PTLOCKS for not having * SMP. 
*/ pagetable_pte_ctor(virt_to_ptdesc(page)); + break; + case TABLE_PMD: + pagetable_pmd_ctor(virt_to_ptdesc(page)); + break; + case TABLE_PGD: + pagetable_pgd_ctor(virt_to_ptdesc(page)); + break; } mmu_page_ctor(page); @@ -200,8 +210,7 @@ int free_pointer_table(void *table, int type) /* all tables in page are free, free page */ list_del(dp); mmu_page_dtor((void *)page); - if (type == TABLE_PTE) - pagetable_pte_dtor(virt_to_ptdesc((void *)page)); + pagetable_dtor(virt_to_ptdesc((void *)page)); free_page (page); return 1; } else if (ptable_list[type].next != dp) { @@ -491,10 +500,7 @@ void __init paging_init(void) * initialize the bad page table and bad page to point * to a couple of allocated pages */ - empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!empty_zero_page) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); /* * Set up SFC/DFC registers diff --git a/arch/m68k/mm/sun3mmu.c b/arch/m68k/mm/sun3mmu.c index 494739c1783e..1ecf6bdd08bf 100644 --- a/arch/m68k/mm/sun3mmu.c +++ b/arch/m68k/mm/sun3mmu.c @@ -44,10 +44,7 @@ void __init paging_init(void) unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, }; unsigned long size; - empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!empty_zero_page) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); address = PAGE_OFFSET; pg_dir = swapper_pg_dir; @@ -57,10 +54,7 @@ void __init paging_init(void) size = num_pages * sizeof(pte_t); size = (size + PAGE_SIZE) & ~(PAGE_SIZE-1); - next_pgtable = (unsigned long)memblock_alloc(size, PAGE_SIZE); - if (!next_pgtable) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, size, PAGE_SIZE); + next_pgtable = (unsigned long)memblock_alloc_or_panic(size, PAGE_SIZE); bootmem_end = (next_pgtable + size + PAGE_SIZE) & PAGE_MASK; /* Map whole memory from PAGE_OFFSET (0x0E000000) */ diff --git a/arch/m68k/sun3/sun3dvma.c b/arch/m68k/sun3/sun3dvma.c index 6ebf52740ad7..225fc735e466 100644 --- a/arch/m68k/sun3/sun3dvma.c +++ b/arch/m68k/sun3/sun3dvma.c @@ -252,12 +252,8 @@ void __init dvma_init(void) list_add(&(hole->list), &hole_list); - iommu_use = memblock_alloc(IOMMU_TOTAL_ENTRIES * sizeof(unsigned long), + iommu_use = memblock_alloc_or_panic(IOMMU_TOTAL_ENTRIES * sizeof(unsigned long), SMP_CACHE_BYTES); - if (!iommu_use) - panic("%s: Failed to allocate %zu bytes\n", __func__, - IOMMU_TOTAL_ENTRIES * sizeof(unsigned long)); - dvma_unmap_iommu(DVMA_START, DVMA_SIZE); sun3_dvma_init(); diff --git a/arch/microblaze/include/asm/pgalloc.h b/arch/microblaze/include/asm/pgalloc.h index 6c33b05f730f..084a8a0dc239 100644 --- a/arch/microblaze/include/asm/pgalloc.h +++ b/arch/microblaze/include/asm/pgalloc.h @@ -21,12 +21,7 @@ extern void __bad_pte(pmd_t *pmd); -static inline pgd_t *get_pgd(void) -{ - return (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, 0); -} - -#define pgd_alloc(mm) get_pgd() +#define pgd_alloc(mm) __pgd_alloc(mm, 0) extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm); diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h index f4440edcd8fe..26c7a6ede983 100644 --- a/arch/mips/include/asm/pgalloc.h +++ b/arch/mips/include/asm/pgalloc.h @@ -15,7 +15,6 @@ #define __HAVE_ARCH_PMD_ALLOC_ONE #define __HAVE_ARCH_PUD_ALLOC_ONE -#define __HAVE_ARCH_PGD_FREE #include static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t 
*pmd, @@ -49,14 +48,9 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) extern void pgd_init(void *addr); extern pgd_t *pgd_alloc(struct mm_struct *mm); -static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ - pagetable_free(virt_to_ptdesc(pgd)); -} - #define __pte_free_tlb(tlb, pte, address) \ do { \ - pagetable_pte_dtor(page_ptdesc(pte)); \ + pagetable_dtor(page_ptdesc(pte)); \ tlb_remove_page_ptdesc((tlb), page_ptdesc(pte)); \ } while (0) diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 12a1a4ffb602..fbfe0771317e 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -704,10 +704,7 @@ static void __init resource_init(void) for_each_mem_range(i, &start, &end) { struct resource *res; - res = memblock_alloc(sizeof(struct resource), SMP_CACHE_BYTES); - if (!res) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(struct resource)); + res = memblock_alloc_or_panic(sizeof(struct resource), SMP_CACHE_BYTES); res->start = start; /* diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c index 4c8e3c0aa210..75c9d3618f58 100644 --- a/arch/mips/kernel/vdso.c +++ b/arch/mips/kernel/vdso.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -97,11 +98,12 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) return -EINTR; if (IS_ENABLED(CONFIG_MIPS_FP_SUPPORT)) { + unsigned long unused; + /* Map delay slot emulation page */ - base = mmap_region(NULL, STACK_TOP, PAGE_SIZE, - VM_READ | VM_EXEC | - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC, - 0, NULL); + base = do_mmap(NULL, STACK_TOP, PAGE_SIZE, PROT_READ | PROT_EXEC, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, 0, 0, &unused, + NULL); if (IS_ERR_VALUE(base)) { ret = base; goto out; diff --git a/arch/mips/mm/pgtable.c b/arch/mips/mm/pgtable.c index 1506e458040d..10835414819f 100644 --- a/arch/mips/mm/pgtable.c +++ b/arch/mips/mm/pgtable.c @@ -10,12 +10,10 @@ pgd_t *pgd_alloc(struct mm_struct *mm) { - pgd_t *init, *ret = NULL; - struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, - PGD_TABLE_ORDER); + pgd_t *init, *ret; - if (ptdesc) { - ret = ptdesc_address(ptdesc); + ret = __pgd_alloc(mm, PGD_TABLE_ORDER); + if (ret) { init = pgd_offset(&init_mm, 0UL); pgd_init(ret); memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD, diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h index ce6bb8e74271..12a536b7bfbd 100644 --- a/arch/nios2/include/asm/pgalloc.h +++ b/arch/nios2/include/asm/pgalloc.h @@ -30,7 +30,7 @@ extern pgd_t *pgd_alloc(struct mm_struct *mm); #define __pte_free_tlb(tlb, pte, addr) \ do { \ - pagetable_pte_dtor(page_ptdesc(pte)); \ + pagetable_dtor(page_ptdesc(pte)); \ tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ } while (0) diff --git a/arch/nios2/mm/pgtable.c b/arch/nios2/mm/pgtable.c index 7c76e8a7447a..6470ed378782 100644 --- a/arch/nios2/mm/pgtable.c +++ b/arch/nios2/mm/pgtable.c @@ -11,6 +11,7 @@ #include #include +#include /* pteaddr: * ptbase | vpn* | zero @@ -54,7 +55,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *ret, *init; - ret = (pgd_t *) __get_free_page(GFP_KERNEL); + ret = __pgd_alloc(mm, 0); if (ret) { init = pgd_offset(&init_mm, 0UL); pgd_init(ret); diff --git a/arch/openrisc/include/asm/pgalloc.h b/arch/openrisc/include/asm/pgalloc.h index c6a73772a546..3372f4e6ab4b 100644 --- a/arch/openrisc/include/asm/pgalloc.h +++ b/arch/openrisc/include/asm/pgalloc.h @@ -41,15 +41,13 @@ static inline void 
pmd_populate(struct mm_struct *mm, pmd_t *pmd, */ static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - pgd_t *ret = (pgd_t *)__get_free_page(GFP_KERNEL); + pgd_t *ret = __pgd_alloc(mm, 0); - if (ret) { - memset(ret, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); + if (ret) memcpy(ret + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); - } return ret; } @@ -68,7 +66,7 @@ extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm); #define __pte_free_tlb(tlb, pte, addr) \ do { \ - pagetable_pte_dtor(page_ptdesc(pte)); \ + pagetable_dtor(page_ptdesc(pte)); \ tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ } while (0) diff --git a/arch/openrisc/mm/ioremap.c b/arch/openrisc/mm/ioremap.c index f59ea4c10b0f..8e63e86251ca 100644 --- a/arch/openrisc/mm/ioremap.c +++ b/arch/openrisc/mm/ioremap.c @@ -38,10 +38,7 @@ pte_t __ref *pte_alloc_one_kernel(struct mm_struct *mm) if (likely(mem_init_done)) { pte = (pte_t *)get_zeroed_page(GFP_KERNEL); } else { - pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!pte) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + pte = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); } return pte; diff --git a/arch/parisc/include/asm/pgalloc.h b/arch/parisc/include/asm/pgalloc.h index e3e142b1c5c5..2ca74a56415c 100644 --- a/arch/parisc/include/asm/pgalloc.h +++ b/arch/parisc/include/asm/pgalloc.h @@ -11,27 +11,12 @@ #include #define __HAVE_ARCH_PMD_ALLOC_ONE -#define __HAVE_ARCH_PMD_FREE -#define __HAVE_ARCH_PGD_FREE #include /* Allocate the top level pgd (page directory) */ static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - pgd_t *pgd; - - pgd = (pgd_t *) __get_free_pages(GFP_KERNEL, PGD_TABLE_ORDER); - if (unlikely(pgd == NULL)) - return NULL; - - memset(pgd, 0, PAGE_SIZE << PGD_TABLE_ORDER); - - return pgd; -} - -static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ - free_pages((unsigned long)pgd, PGD_TABLE_ORDER); + return __pgd_alloc(mm, PGD_TABLE_ORDER); } #if CONFIG_PGTABLE_LEVELS == 3 @@ -46,17 +31,19 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) { - pmd_t *pmd; + struct ptdesc *ptdesc; + gfp_t gfp = GFP_PGTABLE_USER; - pmd = (pmd_t *)__get_free_pages(GFP_PGTABLE_KERNEL, PMD_TABLE_ORDER); - if (likely(pmd)) - memset ((void *)pmd, 0, PAGE_SIZE << PMD_TABLE_ORDER); - return pmd; -} - -static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) -{ - free_pages((unsigned long)pmd, PMD_TABLE_ORDER); + if (mm == &init_mm) + gfp = GFP_PGTABLE_KERNEL; + ptdesc = pagetable_alloc(gfp, PMD_TABLE_ORDER); + if (!ptdesc) + return NULL; + if (!pagetable_pmd_ctor(ptdesc)) { + pagetable_free(ptdesc); + return NULL; + } + return ptdesc_address(ptdesc); } #endif diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 96970fa75e4a..61c0a2477072 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -377,10 +377,8 @@ static void __ref map_pages(unsigned long start_vaddr, #if CONFIG_PGTABLE_LEVELS == 3 if (pud_none(*pud)) { - pmd = memblock_alloc(PAGE_SIZE << PMD_TABLE_ORDER, + pmd = memblock_alloc_or_panic(PAGE_SIZE << PMD_TABLE_ORDER, PAGE_SIZE << PMD_TABLE_ORDER); - if (!pmd) - panic("pmd allocation failed.\n"); pud_populate(NULL, pud, pmd); } #endif @@ -388,9 +386,7 @@ static void __ref map_pages(unsigned long start_vaddr, pmd = pmd_offset(pud, vaddr); for (tmp1 = start_pmd; tmp1 < PTRS_PER_PMD; tmp1++, pmd++) { if (pmd_none(*pmd)) { 
- pg_table = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!pg_table) - panic("page table allocation failed\n"); + pg_table = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); pmd_populate_kernel(NULL, pmd, pg_table); } @@ -648,9 +644,7 @@ static void __init pagetable_init(void) } #endif - empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!empty_zero_page) - panic("zero page allocation failed.\n"); + empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); } @@ -687,19 +681,15 @@ static void __init fixmap_init(void) #if CONFIG_PGTABLE_LEVELS == 3 if (pud_none(*pud)) { - pmd = memblock_alloc(PAGE_SIZE << PMD_TABLE_ORDER, + pmd = memblock_alloc_or_panic(PAGE_SIZE << PMD_TABLE_ORDER, PAGE_SIZE << PMD_TABLE_ORDER); - if (!pmd) - panic("fixmap: pmd allocation failed.\n"); pud_populate(NULL, pud, pmd); } #endif pmd = pmd_offset(pud, addr); do { - pte_t *pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!pte) - panic("fixmap: pte allocation failed.\n"); + pte_t *pte = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); pmd_populate_kernel(&init_mm, pmd, pte); diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h index 1ca7d4c4b90d..2058e8d3e013 100644 --- a/arch/powerpc/include/asm/tlb.h +++ b/arch/powerpc/include/asm/tlb.h @@ -37,6 +37,7 @@ extern void tlb_flush(struct mmu_gather *tlb); */ #define tlb_needs_table_invalidate() radix_enabled() +#define __HAVE_ARCH_TLB_REMOVE_TABLE /* Get the generic bits... */ #include diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 1bee15c013e7..3af6c06af02f 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -1087,12 +1087,10 @@ static int __init dt_cpu_ftrs_scan_callback(unsigned long node, const char /* Count and allocate space for cpu features */ of_scan_flat_dt_subnodes(node, count_cpufeatures_subnodes, &nr_dt_cpu_features); - dt_cpu_features = memblock_alloc(sizeof(struct dt_cpu_feature) * nr_dt_cpu_features, PAGE_SIZE); - if (!dt_cpu_features) - panic("%s: Failed to allocate %zu bytes align=0x%lx\n", - __func__, - sizeof(struct dt_cpu_feature) * nr_dt_cpu_features, - PAGE_SIZE); + dt_cpu_features = + memblock_alloc_or_panic( + sizeof(struct dt_cpu_feature) * nr_dt_cpu_features, + PAGE_SIZE); cpufeatures_setup_start(isa); diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index ce0c8623e563..f8a3bd8cfae4 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -213,11 +213,8 @@ pci_create_OF_bus_map(void) struct property* of_prop; struct device_node *dn; - of_prop = memblock_alloc(sizeof(struct property) + 256, + of_prop = memblock_alloc_or_panic(sizeof(struct property) + 256, SMP_CACHE_BYTES); - if (!of_prop) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(struct property) + 256); dn = of_find_node_by_path("/"); if (dn) { memset(of_prop, -1, sizeof(struct property) + 256); diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 6fa179448c33..f3ea1329c566 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -458,11 +458,8 @@ void __init smp_setup_cpu_maps(void) DBG("smp_setup_cpu_maps()\n"); - cpu_to_phys_id = memblock_alloc(nr_cpu_ids * sizeof(u32), + cpu_to_phys_id = memblock_alloc_or_panic(nr_cpu_ids * sizeof(u32), __alignof__(u32)); - if (!cpu_to_phys_id) - panic("%s: Failed to allocate %zu bytes align=0x%zx\n", - __func__, nr_cpu_ids * sizeof(u32), __alignof__(u32)); for_each_node_by_type(dn, "cpu") { const 
__be32 *intserv; diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index 75dbf3e0d9c4..5a1bf501fbe1 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -140,13 +140,7 @@ arch_initcall(ppc_init); static void *__init alloc_stack(void) { - void *ptr = memblock_alloc(THREAD_SIZE, THREAD_ALIGN); - - if (!ptr) - panic("cannot allocate %d bytes for stack at %pS\n", - THREAD_SIZE, (void *)_RET_IP_); - - return ptr; + return memblock_alloc_or_panic(THREAD_SIZE, THREAD_ALIGN); } void __init irqstack_early_init(void) diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 6978344edcb4..be9c4106e22f 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -377,10 +377,7 @@ void __init MMU_init_hw(void) * Find some memory for the hash table. */ if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322); - Hash = memblock_alloc(Hash_size, Hash_size); - if (!Hash) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, Hash_size, Hash_size); + Hash = memblock_alloc_or_panic(Hash_size, Hash_size); _SDR1 = __pa(Hash) | SDR1_LOW_BITS; pr_info("Total memory = %lldMB; using %ldkB for hash table\n", diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c index 1715b07c630c..4e1e45420bd4 100644 --- a/arch/powerpc/mm/book3s64/mmu_context.c +++ b/arch/powerpc/mm/book3s64/mmu_context.c @@ -253,7 +253,7 @@ static void pmd_frag_destroy(void *pmd_frag) count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT; /* We allow PTE_FRAG_NR fragments from a PTE page */ if (atomic_sub_and_test(PMD_FRAG_NR - count, &ptdesc->pt_frag_refcount)) { - pagetable_pmd_dtor(ptdesc); + pagetable_dtor(ptdesc); pagetable_free(ptdesc); } } diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 374542528080..ce64abea9e3e 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -330,11 +330,7 @@ void __init mmu_partition_table_init(void) unsigned long ptcr; /* Initialize the Partition Table with no entries */ - partition_tb = memblock_alloc(patb_size, patb_size); - if (!partition_tb) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, patb_size, patb_size); - + partition_tb = memblock_alloc_or_panic(patb_size, patb_size); ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12); set_ptcr_when_no_uv(ptcr); powernv_set_nmmu_ptcr(ptcr); @@ -477,7 +473,7 @@ void pmd_fragment_free(unsigned long *pmd) BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0); if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) { - pagetable_pmd_dtor(ptdesc); + pagetable_dtor(ptdesc); pagetable_free(ptdesc); } } diff --git a/arch/powerpc/mm/kasan/init_book3e_64.c b/arch/powerpc/mm/kasan/init_book3e_64.c index 43c03b84ff32..60c78aac0f63 100644 --- a/arch/powerpc/mm/kasan/init_book3e_64.c +++ b/arch/powerpc/mm/kasan/init_book3e_64.c @@ -40,19 +40,19 @@ static int __init kasan_map_kernel_page(unsigned long ea, unsigned long pa, pgpr pgdp = pgd_offset_k(ea); p4dp = p4d_offset(pgdp, ea); if (kasan_pud_table(*p4dp)) { - pudp = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE); + pudp = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE); memcpy(pudp, kasan_early_shadow_pud, PUD_TABLE_SIZE); p4d_populate(&init_mm, p4dp, pudp); } pudp = pud_offset(p4dp, ea); if (kasan_pmd_table(*pudp)) { - pmdp = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE); + pmdp = memblock_alloc_or_panic(PMD_TABLE_SIZE, PMD_TABLE_SIZE); 
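Almost every hunk in this series collapses the memblock_alloc()-then-panic() idiom into memblock_alloc_or_panic(). A sketch of the behaviour the converted callers assume (the real helper may be a macro so the panic message can report the caller):

	static inline void *memblock_alloc_or_panic(phys_addr_t size,
						    phys_addr_t align)
	{
		void *ptr = memblock_alloc(size, align);

		if (!ptr)	/* same fatal policy the open-coded callers had */
			panic("%s: Failed to allocate %pap bytes\n",
			      __func__, &size);
		return ptr;
	}

For callers that already panicked on failure the conversion is behaviour-preserving; note that s390's save_area_alloc() (later in this series) previously returned NULL on failure and now panics instead.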
memcpy(pmdp, kasan_early_shadow_pmd, PMD_TABLE_SIZE); pud_populate(&init_mm, pudp, pmdp); } pmdp = pmd_offset(pudp, ea); if (kasan_pte_table(*pmdp)) { - ptep = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE); + ptep = memblock_alloc_or_panic(PTE_TABLE_SIZE, PTE_TABLE_SIZE); memcpy(ptep, kasan_early_shadow_pte, PTE_TABLE_SIZE); pmd_populate_kernel(&init_mm, pmdp, ptep); } @@ -74,7 +74,7 @@ static void __init kasan_init_phys_region(void *start, void *end) k_start = ALIGN_DOWN((unsigned long)kasan_mem_to_shadow(start), PAGE_SIZE); k_end = ALIGN((unsigned long)kasan_mem_to_shadow(end), PAGE_SIZE); - va = memblock_alloc(k_end - k_start, PAGE_SIZE); + va = memblock_alloc_or_panic(k_end - k_start, PAGE_SIZE); for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE, va += PAGE_SIZE) kasan_map_kernel_page(k_cur, __pa(va), PAGE_KERNEL); } diff --git a/arch/powerpc/mm/kasan/init_book3s_64.c b/arch/powerpc/mm/kasan/init_book3s_64.c index 3fb5ce4f48f4..7d959544c077 100644 --- a/arch/powerpc/mm/kasan/init_book3s_64.c +++ b/arch/powerpc/mm/kasan/init_book3s_64.c @@ -32,7 +32,7 @@ static void __init kasan_init_phys_region(void *start, void *end) k_start = ALIGN_DOWN((unsigned long)kasan_mem_to_shadow(start), PAGE_SIZE); k_end = ALIGN((unsigned long)kasan_mem_to_shadow(end), PAGE_SIZE); - va = memblock_alloc(k_end - k_start, PAGE_SIZE); + va = memblock_alloc_or_panic(k_end - k_start, PAGE_SIZE); for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE, va += PAGE_SIZE) map_kernel_page(k_cur, __pa(va), PAGE_KERNEL); } diff --git a/arch/powerpc/mm/nohash/mmu_context.c b/arch/powerpc/mm/nohash/mmu_context.c index 0b181da40ddb..a1a4e697251a 100644 --- a/arch/powerpc/mm/nohash/mmu_context.c +++ b/arch/powerpc/mm/nohash/mmu_context.c @@ -385,21 +385,11 @@ void __init mmu_context_init(void) /* * Allocate the maps used by context management */ - context_map = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES); - if (!context_map) - panic("%s: Failed to allocate %zu bytes\n", __func__, - CTX_MAP_SIZE); - context_mm = memblock_alloc(sizeof(void *) * (LAST_CONTEXT + 1), + context_map = memblock_alloc_or_panic(CTX_MAP_SIZE, SMP_CACHE_BYTES); + context_mm = memblock_alloc_or_panic(sizeof(void *) * (LAST_CONTEXT + 1), SMP_CACHE_BYTES); - if (!context_mm) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(void *) * (LAST_CONTEXT + 1)); if (IS_ENABLED(CONFIG_SMP)) { - stale_map[boot_cpuid] = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES); - if (!stale_map[boot_cpuid]) - panic("%s: Failed to allocate %zu bytes\n", __func__, - CTX_MAP_SIZE); - + stale_map[boot_cpuid] = memblock_alloc_or_panic(CTX_MAP_SIZE, SMP_CACHE_BYTES); cpuhp_setup_state_nocalls(CPUHP_POWERPC_MMU_CTX_PREPARE, "powerpc/mmu/ctx:prepare", mmu_ctx_cpu_prepare, mmu_ctx_cpu_dead); diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c index e89f64a0f24a..713268ccb1a0 100644 --- a/arch/powerpc/mm/pgtable-frag.c +++ b/arch/powerpc/mm/pgtable-frag.c @@ -25,7 +25,7 @@ void pte_frag_destroy(void *pte_frag) count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT; /* We allow PTE_FRAG_NR fragments from a PTE page */ if (atomic_sub_and_test(PTE_FRAG_NR - count, &ptdesc->pt_frag_refcount)) { - pagetable_pte_dtor(ptdesc); + pagetable_dtor(ptdesc); pagetable_free(ptdesc); } } @@ -111,7 +111,7 @@ static void pte_free_now(struct rcu_head *head) struct ptdesc *ptdesc; ptdesc = container_of(head, struct ptdesc, pt_rcu_head); - pagetable_pte_dtor(ptdesc); + pagetable_dtor(ptdesc); pagetable_free(ptdesc); } diff --git 
a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 787b22206386..15276068f657 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -50,13 +50,8 @@ notrace void __init early_ioremap_init(void) void __init *early_alloc_pgtable(unsigned long size) { - void *ptr = memblock_alloc(size, size); + return memblock_alloc_or_panic(size, size); - if (!ptr) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, size, size); - - return ptr; } pte_t __init *early_pte_alloc_kernel(pmd_t *pmdp, unsigned long va) diff --git a/arch/powerpc/platforms/powermac/nvram.c b/arch/powerpc/platforms/powermac/nvram.c index fe2e0249cbc2..a112d26185a0 100644 --- a/arch/powerpc/platforms/powermac/nvram.c +++ b/arch/powerpc/platforms/powermac/nvram.c @@ -514,10 +514,7 @@ static int __init core99_nvram_setup(struct device_node *dp, unsigned long addr) printk(KERN_ERR "nvram: no address\n"); return -EINVAL; } - nvram_image = memblock_alloc(NVRAM_SIZE, SMP_CACHE_BYTES); - if (!nvram_image) - panic("%s: Failed to allocate %u bytes\n", __func__, - NVRAM_SIZE); + nvram_image = memblock_alloc_or_panic(NVRAM_SIZE, SMP_CACHE_BYTES); nvram_data = ioremap(addr, NVRAM_SIZE*2); nvram_naddrs = 1; /* Make sure we get the correct case */ diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c index 877720c64515..4ac9808e55a4 100644 --- a/arch/powerpc/platforms/powernv/memtrace.c +++ b/arch/powerpc/platforms/powernv/memtrace.c @@ -88,26 +88,6 @@ static void flush_dcache_range_chunked(unsigned long start, unsigned long stop, } } -static void memtrace_clear_range(unsigned long start_pfn, - unsigned long nr_pages) -{ - unsigned long pfn; - - /* As HIGHMEM does not apply, use clear_page() directly. */ - for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) { - if (IS_ALIGNED(pfn, PAGES_PER_SECTION)) - cond_resched(); - clear_page(__va(PFN_PHYS(pfn))); - } - /* - * Before we go ahead and use this range as cache inhibited range - * flush the cache. - */ - flush_dcache_range_chunked((unsigned long)pfn_to_kaddr(start_pfn), - (unsigned long)pfn_to_kaddr(start_pfn + nr_pages), - FLUSH_CHUNK_SIZE); -} - static u64 memtrace_alloc_node(u32 nid, u64 size) { const unsigned long nr_pages = PHYS_PFN(size); @@ -119,17 +99,18 @@ static u64 memtrace_alloc_node(u32 nid, u64 size) * by alloc_contig_pages(). */ page = alloc_contig_pages(nr_pages, GFP_KERNEL | __GFP_THISNODE | - __GFP_NOWARN, nid, NULL); + __GFP_NOWARN | __GFP_ZERO, nid, NULL); if (!page) return 0; start_pfn = page_to_pfn(page); /* - * Clear the range while we still have a linear mapping. - * - * TODO: use __GFP_ZERO with alloc_contig_pages() once supported. + * Before we go ahead and use this range as cache inhibited range + * flush the cache. */ - memtrace_clear_range(start_pfn, nr_pages); + flush_dcache_range_chunked((unsigned long)pfn_to_kaddr(start_pfn), + (unsigned long)pfn_to_kaddr(start_pfn + nr_pages), + FLUSH_CHUNK_SIZE); /* * Set pages PageOffline(), to indicate that nobody (e.g., hibernation, diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 5d0f35bb917e..09bd93464b4f 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -180,10 +180,7 @@ int __init early_init_dt_scan_recoverable_ranges(unsigned long node, /* * Allocate a buffer to hold the MC recoverable ranges. 
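In the memtrace hunk above, the hand-rolled memtrace_clear_range() loop goes away because alloc_contig_pages() now honours __GFP_ZERO, while the dcache flush stays: zeroing the pages does not make them safe to use as a cache-inhibited range. The allocation side reduces to:

	page = alloc_contig_pages(nr_pages, GFP_KERNEL | __GFP_THISNODE |
				  __GFP_NOWARN | __GFP_ZERO, nid, NULL);
	if (!page)	/* pages arrive already cleared on success */
		return 0;

so only the flush_dcache_range_chunked() pass over the linear-map addresses remains before the range is handed out.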
*/ - mc_recoverable_range = memblock_alloc(size, __alignof__(u64)); - if (!mc_recoverable_range) - panic("%s: Failed to allocate %u bytes align=0x%lx\n", - __func__, size, __alignof__(u64)); + mc_recoverable_range = memblock_alloc_or_panic(size, __alignof__(u64)); for (i = 0; i < mc_recoverable_range_len; i++) { mc_recoverable_range[i].start_addr = diff --git a/arch/powerpc/platforms/ps3/setup.c b/arch/powerpc/platforms/ps3/setup.c index 5144f11359f7..150c09b58ae8 100644 --- a/arch/powerpc/platforms/ps3/setup.c +++ b/arch/powerpc/platforms/ps3/setup.c @@ -115,10 +115,7 @@ static void __init prealloc(struct ps3_prealloc *p) if (!p->size) return; - p->address = memblock_alloc(p->size, p->align); - if (!p->address) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, p->size, p->align); + p->address = memblock_alloc_or_panic(p->size, p->align); printk(KERN_INFO "%s: %lu bytes at %p\n", p->name, p->size, p->address); diff --git a/arch/powerpc/sysdev/msi_bitmap.c b/arch/powerpc/sysdev/msi_bitmap.c index 0b6e37f3ffb8..456a4f64ae0a 100644 --- a/arch/powerpc/sysdev/msi_bitmap.c +++ b/arch/powerpc/sysdev/msi_bitmap.c @@ -124,10 +124,7 @@ int __ref msi_bitmap_alloc(struct msi_bitmap *bmp, unsigned int irq_count, if (bmp->bitmap_from_slab) bmp->bitmap = kzalloc(size, GFP_KERNEL); else { - bmp->bitmap = memblock_alloc(size, SMP_CACHE_BYTES); - if (!bmp->bitmap) - panic("%s: Failed to allocate %u bytes\n", __func__, - size); + bmp->bitmap = memblock_alloc_or_panic(size, SMP_CACHE_BYTES); /* the bitmap won't be freed from memblock allocator */ kmemleak_not_leak(bmp->bitmap); } diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h index f52264304f77..3e2aebea6312 100644 --- a/arch/riscv/include/asm/pgalloc.h +++ b/arch/riscv/include/asm/pgalloc.h @@ -12,16 +12,25 @@ #include #ifdef CONFIG_MMU -#define __HAVE_ARCH_PUD_ALLOC_ONE #define __HAVE_ARCH_PUD_FREE #include +/* + * While riscv platforms with riscv_ipi_for_rfence as true require an IPI to + * perform TLB shootdown, some platforms with riscv_ipi_for_rfence as false use + * SBI to perform TLB shootdown. To keep software pagetable walkers safe in this + * case we switch to RCU based table free (MMU_GATHER_RCU_TABLE_FREE). See the + * comment below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h + * for more details. 
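A conceptual sketch of the dispatch this comment motivates, with riscv_use_sbi_for_rfence() as the runtime predicate (paraphrasing riscv_tlb_remove_ptdesc() just below):

	if (riscv_use_sbi_for_rfence()) {
		/* no IPI fences lockless walkers: defer free via RCU */
		tlb_remove_ptdesc(tlb, pt);
	} else {
		/* the shootdown IPI already excludes GUP-fast walkers */
		pagetable_dtor(pt);
		tlb_remove_page_ptdesc(tlb, pt);
	}

On the RCU path the destructor is assumed to run from the generic __tlb_remove_table() once the grace period has elapsed, which is why only the immediate-free branch calls pagetable_dtor() directly.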
+ */ static inline void riscv_tlb_remove_ptdesc(struct mmu_gather *tlb, void *pt) { - if (riscv_use_sbi_for_rfence()) + if (riscv_use_sbi_for_rfence()) { tlb_remove_ptdesc(tlb, pt); - else + } else { + pagetable_dtor(pt); tlb_remove_page_ptdesc(tlb, pt); + } } static inline void pmd_populate_kernel(struct mm_struct *mm, @@ -88,15 +97,6 @@ static inline void pgd_populate_safe(struct mm_struct *mm, pgd_t *pgd, } } -#define pud_alloc_one pud_alloc_one -static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) -{ - if (pgtable_l4_enabled) - return __pud_alloc_one(mm, addr); - - return NULL; -} - #define pud_free pud_free static inline void pud_free(struct mm_struct *mm, pud_t *pud) { @@ -107,39 +107,8 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, unsigned long addr) { - if (pgtable_l4_enabled) { - struct ptdesc *ptdesc = virt_to_ptdesc(pud); - - pagetable_pud_dtor(ptdesc); - riscv_tlb_remove_ptdesc(tlb, ptdesc); - } -} - -#define p4d_alloc_one p4d_alloc_one -static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr) -{ - if (pgtable_l5_enabled) { - gfp_t gfp = GFP_PGTABLE_USER; - - if (mm == &init_mm) - gfp = GFP_PGTABLE_KERNEL; - return (p4d_t *)get_zeroed_page(gfp); - } - - return NULL; -} - -static inline void __p4d_free(struct mm_struct *mm, p4d_t *p4d) -{ - BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); - free_page((unsigned long)p4d); -} - -#define p4d_free p4d_free -static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) -{ - if (pgtable_l5_enabled) - __p4d_free(mm, p4d); + if (pgtable_l4_enabled) + riscv_tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud)); } static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, @@ -161,9 +130,8 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; - pgd = (pgd_t *)__get_free_page(GFP_KERNEL); + pgd = __pgd_alloc(mm, 0); if (likely(pgd != NULL)) { - memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); /* Copy kernel mappings */ sync_kernel_mappings(pgd); } @@ -175,10 +143,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, unsigned long addr) { - struct ptdesc *ptdesc = virt_to_ptdesc(pmd); - - pagetable_pmd_dtor(ptdesc); - riscv_tlb_remove_ptdesc(tlb, ptdesc); + riscv_tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd)); } #endif /* __PAGETABLE_PMD_FOLDED */ @@ -186,10 +151,7 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr) { - struct ptdesc *ptdesc = page_ptdesc(pte); - - pagetable_pte_dtor(ptdesc); - riscv_tlb_remove_ptdesc(tlb, ptdesc); + riscv_tlb_remove_ptdesc(tlb, page_ptdesc(pte)); } #endif /* CONFIG_MMU */ diff --git a/arch/riscv/include/asm/tlb.h b/arch/riscv/include/asm/tlb.h index 1f6c38420d8e..50b63b5c15bd 100644 --- a/arch/riscv/include/asm/tlb.h +++ b/arch/riscv/include/asm/tlb.h @@ -10,24 +10,6 @@ struct mmu_gather; static void tlb_flush(struct mmu_gather *tlb); -#ifdef CONFIG_MMU -#include - -/* - * While riscv platforms with riscv_ipi_for_rfence as true require an IPI to - * perform TLB shootdown, some platforms with riscv_ipi_for_rfence as false use - * SBI to perform TLB shootdown. To keep software pagetable walkers safe in this - * case we switch to RCU based table free (MMU_GATHER_RCU_TABLE_FREE). 
See the - * comment below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h - * for more details. - */ -static inline void __tlb_remove_table(void *table) -{ - free_page_and_swap_cache(table); -} - -#endif /* CONFIG_MMU */ - #define tlb_flush tlb_flush #include diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 45010e71df86..f1793630fc51 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -147,9 +147,7 @@ static void __init init_resources(void) res_idx = num_resources - 1; mem_res_sz = num_resources * sizeof(*mem_res); - mem_res = memblock_alloc(mem_res_sz, SMP_CACHE_BYTES); - if (!mem_res) - panic("%s: Failed to allocate %zu bytes\n", __func__, mem_res_sz); + mem_res = memblock_alloc_or_panic(mem_res_sz, SMP_CACHE_BYTES); /* * Start by adding the reserved regions, if they overlap diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 8d167e09f1fe..722178ae3488 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1573,7 +1573,7 @@ static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) return; } - pagetable_pte_dtor(ptdesc); + pagetable_dtor(ptdesc); if (PageReserved(page)) free_reserved_page(page); else @@ -1595,7 +1595,7 @@ static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud, bool is_vmemm } if (!is_vmemmap) - pagetable_pmd_dtor(ptdesc); + pagetable_dtor(ptdesc); if (PageReserved(page)) free_reserved_page(page); else diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index c301c8d291d2..41c635d6aca4 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -32,7 +32,7 @@ static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned pte_t *ptep, *p; if (pmd_none(pmdp_get(pmd))) { - p = memblock_alloc(PTRS_PER_PTE * sizeof(pte_t), PAGE_SIZE); + p = memblock_alloc_or_panic(PTRS_PER_PTE * sizeof(pte_t), PAGE_SIZE); set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa(p)), PAGE_TABLE)); } @@ -54,7 +54,7 @@ static void __init kasan_populate_pmd(pud_t *pud, unsigned long vaddr, unsigned unsigned long next; if (pud_none(pudp_get(pud))) { - p = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE); + p = memblock_alloc_or_panic(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE); set_pud(pud, pfn_pud(PFN_DOWN(__pa(p)), PAGE_TABLE)); } @@ -85,7 +85,7 @@ static void __init kasan_populate_pud(p4d_t *p4d, unsigned long next; if (p4d_none(p4dp_get(p4d))) { - p = memblock_alloc(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE); + p = memblock_alloc_or_panic(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE); set_p4d(p4d, pfn_p4d(PFN_DOWN(__pa(p)), PAGE_TABLE)); } @@ -116,7 +116,7 @@ static void __init kasan_populate_p4d(pgd_t *pgd, unsigned long next; if (pgd_none(pgdp_get(pgd))) { - p = memblock_alloc(PTRS_PER_P4D * sizeof(p4d_t), PAGE_SIZE); + p = memblock_alloc_or_panic(PTRS_PER_P4D * sizeof(p4d_t), PAGE_SIZE); set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE)); } @@ -385,7 +385,7 @@ static void __init kasan_shallow_populate_pud(p4d_t *p4d, next = pud_addr_end(vaddr, end); if (pud_none(pudp_get(pud_k))) { - p = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + p = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); set_pud(pud_k, pfn_pud(PFN_DOWN(__pa(p)), PAGE_TABLE)); continue; } @@ -405,7 +405,7 @@ static void __init kasan_shallow_populate_p4d(pgd_t *pgd, next = p4d_addr_end(vaddr, end); if (p4d_none(p4dp_get(p4d_k))) { - p = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + p = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); set_p4d(p4d_k, pfn_p4d(PFN_DOWN(__pa(p)), PAGE_TABLE)); continue; } @@ -424,7 
+424,7 @@ static void __init kasan_shallow_populate_pgd(unsigned long vaddr, unsigned long next = pgd_addr_end(vaddr, end); if (pgd_none(pgdp_get(pgd_k))) { - p = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + p = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); set_pgd(pgd_k, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE)); continue; } diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 7b84ef6dc4b6..b19b6ed2ab53 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -53,29 +53,42 @@ static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long address) { unsigned long *table = crst_table_alloc(mm); - if (table) - crst_table_init(table, _REGION2_ENTRY_EMPTY); + if (!table) + return NULL; + crst_table_init(table, _REGION2_ENTRY_EMPTY); + pagetable_p4d_ctor(virt_to_ptdesc(table)); + return (p4d_t *) table; } static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) { - if (!mm_p4d_folded(mm)) - crst_table_free(mm, (unsigned long *) p4d); + if (mm_p4d_folded(mm)) + return; + + pagetable_dtor(virt_to_ptdesc(p4d)); + crst_table_free(mm, (unsigned long *) p4d); } static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address) { unsigned long *table = crst_table_alloc(mm); - if (table) - crst_table_init(table, _REGION3_ENTRY_EMPTY); + + if (!table) + return NULL; + crst_table_init(table, _REGION3_ENTRY_EMPTY); + pagetable_pud_ctor(virt_to_ptdesc(table)); + return (pud_t *) table; } static inline void pud_free(struct mm_struct *mm, pud_t *pud) { - if (!mm_pud_folded(mm)) - crst_table_free(mm, (unsigned long *) pud); + if (mm_pud_folded(mm)) + return; + + pagetable_dtor(virt_to_ptdesc(pud)); + crst_table_free(mm, (unsigned long *) pud); } static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr) @@ -96,7 +109,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { if (mm_pmd_folded(mm)) return; - pagetable_pmd_dtor(virt_to_ptdesc(pmd)); + pagetable_dtor(virt_to_ptdesc(pmd)); crst_table_free(mm, (unsigned long *) pmd); } @@ -117,11 +130,18 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - return (pgd_t *) crst_table_alloc(mm); + unsigned long *table = crst_table_alloc(mm); + + if (!table) + return NULL; + pagetable_pgd_ctor(virt_to_ptdesc(table)); + + return (pgd_t *) table; } static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { + pagetable_dtor(virt_to_ptdesc(pgd)); crst_table_free(mm, (unsigned long *) pgd); } diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index e95b2c8081eb..f39f8c4723f1 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -22,7 +22,6 @@ * Pages used for the page tables is a different story. 
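The s390 pgalloc hunks above establish the invariant the rest of this series leans on: every table level runs its constructor at allocation time, and a single pagetable_dtor() replaces the per-level destructors at free time. Sketched for the PUD level using the calls from the hunk:

	unsigned long *table = crst_table_alloc(mm);

	if (table) {
		crst_table_init(table, _REGION3_ENTRY_EMPTY);
		pagetable_pud_ctor(virt_to_ptdesc(table));	/* ctor at alloc */
	}

	/* ... and later, on the free path: */
	pagetable_dtor(virt_to_ptdesc(table));	/* one dtor for all levels */
	crst_table_free(mm, table);

Since the ptdesc now records what was constructed, the free paths no longer need to know which level they are tearing down, which is what allows the level-aware __tlb_remove_table() to be deleted below.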
FIXME: more */ -void __tlb_remove_table(void *_table); static inline void tlb_flush(struct mmu_gather *tlb); static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, bool delay_rmap, int page_size); @@ -87,7 +86,7 @@ static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, tlb->cleared_pmds = 1; if (mm_alloc_pgste(tlb->mm)) gmap_unlink(tlb->mm, (unsigned long *)pte, address); - tlb_remove_ptdesc(tlb, pte); + tlb_remove_ptdesc(tlb, virt_to_ptdesc(pte)); } /* @@ -102,12 +101,11 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, { if (mm_pmd_folded(tlb->mm)) return; - pagetable_pmd_dtor(virt_to_ptdesc(pmd)); __tlb_adjust_range(tlb, address, PAGE_SIZE); tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; tlb->cleared_puds = 1; - tlb_remove_ptdesc(tlb, pmd); + tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd)); } /* @@ -125,7 +123,7 @@ static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, __tlb_adjust_range(tlb, address, PAGE_SIZE); tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; - tlb_remove_ptdesc(tlb, p4d); + tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d)); } /* @@ -143,7 +141,7 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; tlb->cleared_p4ds = 1; - tlb_remove_ptdesc(tlb, pud); + tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud)); } diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index cd0c93a8fb8b..dc7328fd2ec4 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -63,9 +63,7 @@ struct save_area * __init save_area_alloc(bool is_boot_cpu) { struct save_area *sa; - sa = memblock_alloc(sizeof(*sa), 8); - if (!sa) - return NULL; + sa = memblock_alloc_or_panic(sizeof(*sa), 8); if (is_boot_cpu) list_add(&sa->list, &dump_save_areas); diff --git a/arch/s390/kernel/numa.c b/arch/s390/kernel/numa.c index ddc1448ea2e1..2fc40f97c0ad 100644 --- a/arch/s390/kernel/numa.c +++ b/arch/s390/kernel/numa.c @@ -21,12 +21,8 @@ void __init numa_setup(void) nodes_clear(node_possible_map); node_set(0, node_possible_map); node_set_online(0); - for (nid = 0; nid < MAX_NUMNODES; nid++) { - NODE_DATA(nid) = memblock_alloc(sizeof(pg_data_t), 8); - if (!NODE_DATA(nid)) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(pg_data_t), 8); - } + for (nid = 0; nid < MAX_NUMNODES; nid++) + NODE_DATA(nid) = memblock_alloc_or_panic(sizeof(pg_data_t), 8); NODE_DATA(0)->node_spanned_pages = memblock_end_of_DRAM() >> PAGE_SHIFT; NODE_DATA(0)->node_id = 0; } diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index a3fea683b227..f873535eddd2 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -384,11 +384,7 @@ static unsigned long __init stack_alloc_early(void) { unsigned long stack; - stack = (unsigned long)memblock_alloc(THREAD_SIZE, THREAD_SIZE); - if (!stack) { - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, THREAD_SIZE, THREAD_SIZE); - } + stack = (unsigned long)memblock_alloc_or_panic(THREAD_SIZE, THREAD_SIZE); return stack; } @@ -512,10 +508,7 @@ static void __init setup_resources(void) bss_resource.end = __pa_symbol(__bss_stop) - 1; for_each_mem_range(i, &start, &end) { - res = memblock_alloc(sizeof(*res), 8); - if (!res) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(*res), 8); + res = memblock_alloc_or_panic(sizeof(*res), 8); res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; res->name = "System RAM"; @@ -534,10 +527,7 
@@ static void __init setup_resources(void) std_res->start > res->end) continue; if (std_res->end > res->end) { - sub_res = memblock_alloc(sizeof(*sub_res), 8); - if (!sub_res) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(*sub_res), 8); + sub_res = memblock_alloc_or_panic(sizeof(*sub_res), 8); *sub_res = *std_res; sub_res->end = res->end; std_res->start = res->end + 1; @@ -824,9 +814,7 @@ static void __init setup_randomness(void) { struct sysinfo_3_2_2 *vmms; - vmms = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!vmms) - panic("Failed to allocate memory for sysinfo structure\n"); + vmms = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); if (stsi(vmms, 3, 2, 2) == 0 && vmms->count) add_device_randomness(&vmms->vm, sizeof(vmms->vm[0]) * vmms->count); memblock_free(vmms, PAGE_SIZE); diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 822d8e6f8717..7b08399b0846 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -611,9 +611,7 @@ void __init smp_save_dump_ipl_cpu(void) if (!dump_available()) return; sa = save_area_alloc(true); - regs = memblock_alloc(512, 8); - if (!sa || !regs) - panic("could not allocate memory for boot CPU save area\n"); + regs = memblock_alloc_or_panic(512, 8); copy_oldmem_kernel(regs, __LC_FPREGS_SAVE_AREA, 512); save_area_add_regs(sa, regs); memblock_free(regs, 512); @@ -646,8 +644,6 @@ void __init smp_save_dump_secondary_cpus(void) SIGP_CC_NOT_OPERATIONAL) continue; sa = save_area_alloc(false); - if (!sa) - panic("could not allocate memory for save area\n"); __pcpu_sigp_relax(addr, SIGP_STORE_STATUS_AT_ADDRESS, __pa(page)); save_area_add_regs(sa, page); if (cpu_has_vx()) { @@ -792,10 +788,7 @@ void __init smp_detect_cpus(void) u16 address; /* Get CPU information */ - info = memblock_alloc(sizeof(*info), 8); - if (!info) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(*info), 8); + info = memblock_alloc_or_panic(sizeof(*info), 8); smp_get_core_info(info, 1); /* Find boot CPU type */ if (sclp.has_core_type) { diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 0fd56a1cadbd..cf5ee6032c0b 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -548,10 +548,7 @@ static void __init alloc_masks(struct sysinfo_15_1_x *info, nr_masks *= info->mag[TOPOLOGY_NR_MAG - offset - 1 - i]; nr_masks = max(nr_masks, 1); for (i = 0; i < nr_masks; i++) { - mask->next = memblock_alloc(sizeof(*mask->next), 8); - if (!mask->next) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(*mask->next), 8); + mask->next = memblock_alloc_or_panic(sizeof(*mask->next), 8); mask = mask->next; } } @@ -569,10 +566,7 @@ void __init topology_init_early(void) } if (!MACHINE_HAS_TOPOLOGY) goto out; - tl_info = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!tl_info) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + tl_info = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); info = tl_info; store_topology(info); pr_info("The CPU configuration topology of the machine is: %d %d %d %d %d %d / %d\n", diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 58696a0c4e4a..a4e761902093 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -180,30 +180,11 @@ unsigned long *page_table_alloc(struct mm_struct *mm) return table; } -static void pagetable_pte_dtor_free(struct ptdesc *ptdesc) -{ - pagetable_pte_dtor(ptdesc); - pagetable_free(ptdesc); -} - void page_table_free(struct mm_struct *mm, unsigned long 
*table) { struct ptdesc *ptdesc = virt_to_ptdesc(table); - pagetable_pte_dtor_free(ptdesc); -} - -void __tlb_remove_table(void *table) -{ - struct ptdesc *ptdesc = virt_to_ptdesc(table); - struct page *page = ptdesc_page(ptdesc); - - if (compound_order(page) == CRST_ALLOC_ORDER) { - /* pmd, pud, or p4d */ - pagetable_free(ptdesc); - return; - } - pagetable_pte_dtor_free(ptdesc); + pagetable_dtor_free(ptdesc); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -211,7 +192,7 @@ static void pte_free_now(struct rcu_head *head) { struct ptdesc *ptdesc = container_of(head, struct ptdesc, pt_rcu_head); - pagetable_pte_dtor_free(ptdesc); + pagetable_dtor_free(ptdesc); } void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h index 5d8577ab1591..96d938fdf224 100644 --- a/arch/sh/include/asm/pgalloc.h +++ b/arch/sh/include/asm/pgalloc.h @@ -34,7 +34,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, #define __pte_free_tlb(tlb, pte, addr) \ do { \ - pagetable_pte_dtor(page_ptdesc(pte)); \ + pagetable_dtor(page_ptdesc(pte)); \ tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ } while (0) diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 2a88b0c9e70f..289a2fecebef 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -137,10 +137,7 @@ static pmd_t * __init one_md_table_init(pud_t *pud) if (pud_none(*pud)) { pmd_t *pmd; - pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!pmd) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + pmd = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); pud_populate(&init_mm, pud, pmd); BUG_ON(pmd != pmd_offset(pud, 0)); } @@ -153,10 +150,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) if (pmd_none(*pmd)) { pte_t *pte; - pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!pte) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + pte = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); pmd_populate_kernel(&init_mm, pmd, pte); BUG_ON(pte != pte_offset_kernel(pmd, 0)); } diff --git a/arch/sparc/include/asm/tlb_64.h b/arch/sparc/include/asm/tlb_64.h index 3037187482db..1a6e694418e3 100644 --- a/arch/sparc/include/asm/tlb_64.h +++ b/arch/sparc/include/asm/tlb_64.h @@ -33,6 +33,7 @@ void flush_tlb_pending(void); #define tlb_needs_table_invalidate() (false) #endif +#define __HAVE_ARCH_TLB_REMOVE_TABLE #include #endif /* _SPARC64_TLB_H */ diff --git a/arch/sparc/kernel/prom_32.c b/arch/sparc/kernel/prom_32.c index 3df960c137f7..a67dd67f10c8 100644 --- a/arch/sparc/kernel/prom_32.c +++ b/arch/sparc/kernel/prom_32.c @@ -28,9 +28,7 @@ void * __init prom_early_alloc(unsigned long size) { void *ret; - ret = memblock_alloc(size, SMP_CACHE_BYTES); - if (!ret) - panic("%s: Failed to allocate %lu bytes\n", __func__, size); + ret = memblock_alloc_or_panic(size, SMP_CACHE_BYTES); prom_early_allocated += size; diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 21f8cbbd0581..05882bca5b73 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2915,7 +2915,7 @@ static void __pte_free(pgtable_t pte) { struct ptdesc *ptdesc = virt_to_ptdesc(pte); - pagetable_pte_dtor(ptdesc); + pagetable_dtor(ptdesc); pagetable_free(ptdesc); } diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index 9df51a62333d..dd32711022f5 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -277,19 +277,13 @@ static void __init srmmu_nocache_init(void) bitmap_bits = srmmu_nocache_size 
>> SRMMU_NOCACHE_BITMAP_SHIFT; - srmmu_nocache_pool = memblock_alloc(srmmu_nocache_size, + srmmu_nocache_pool = memblock_alloc_or_panic(srmmu_nocache_size, SRMMU_NOCACHE_ALIGN_MAX); - if (!srmmu_nocache_pool) - panic("%s: Failed to allocate %lu bytes align=0x%x\n", - __func__, srmmu_nocache_size, SRMMU_NOCACHE_ALIGN_MAX); memset(srmmu_nocache_pool, 0, srmmu_nocache_size); srmmu_nocache_bitmap = - memblock_alloc(BITS_TO_LONGS(bitmap_bits) * sizeof(long), + memblock_alloc_or_panic(BITS_TO_LONGS(bitmap_bits) * sizeof(long), SMP_CACHE_BYTES); - if (!srmmu_nocache_bitmap) - panic("%s: Failed to allocate %zu bytes\n", __func__, - BITS_TO_LONGS(bitmap_bits) * sizeof(long)); bit_map_init(&srmmu_nocache_map, srmmu_nocache_bitmap, bitmap_bits); srmmu_swapper_pg_dir = __srmmu_get_nocache(SRMMU_PGD_TABLE_SIZE, SRMMU_PGD_TABLE_SIZE); @@ -372,7 +366,7 @@ void pte_free(struct mm_struct *mm, pgtable_t ptep) page = pfn_to_page(__nocache_pa((unsigned long)ptep) >> PAGE_SHIFT); spin_lock(&mm->page_table_lock); if (page_ref_dec_return(page) == 1) - pagetable_pte_dtor(page_ptdesc(page)); + pagetable_dtor(page_ptdesc(page)); spin_unlock(&mm->page_table_lock); srmmu_free_nocache(ptep, SRMMU_PTE_TABLE_SIZE); @@ -452,9 +446,7 @@ static void __init sparc_context_init(int numctx) unsigned long size; size = numctx * sizeof(struct ctx_list); - ctx_list_pool = memblock_alloc(size, SMP_CACHE_BYTES); - if (!ctx_list_pool) - panic("%s: Failed to allocate %lu bytes\n", __func__, size); + ctx_list_pool = memblock_alloc_or_panic(size, SMP_CACHE_BYTES); for (ctx = 0; ctx < numctx; ctx++) { struct ctx_list *clist; diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c index 75d04fb4994a..d5a9c5aabaec 100644 --- a/arch/um/drivers/net_kern.c +++ b/arch/um/drivers/net_kern.c @@ -636,10 +636,7 @@ static int __init eth_setup(char *str) return 1; } - new = memblock_alloc(sizeof(*new), SMP_CACHE_BYTES); - if (!new) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*new)); + new = memblock_alloc_or_panic(sizeof(*new), SMP_CACHE_BYTES); INIT_LIST_HEAD(&new->list); new->index = n; diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index 64c09db392c1..85b129e2b70b 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -1694,10 +1694,7 @@ static int __init vector_setup(char *str) str, error); return 1; } - new = memblock_alloc(sizeof(*new), SMP_CACHE_BYTES); - if (!new) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*new)); + new = memblock_alloc_or_panic(sizeof(*new), SMP_CACHE_BYTES); INIT_LIST_HEAD(&new->list); new->unit = n; new->arguments = str; diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h index 04fb4e6969a4..f0af23c3aeb2 100644 --- a/arch/um/include/asm/pgalloc.h +++ b/arch/um/include/asm/pgalloc.h @@ -27,7 +27,7 @@ extern pgd_t *pgd_alloc(struct mm_struct *); #define __pte_free_tlb(tlb, pte, address) \ do { \ - pagetable_pte_dtor(page_ptdesc(pte)); \ + pagetable_dtor(page_ptdesc(pte)); \ tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ } while (0) @@ -35,7 +35,7 @@ do { \ #define __pmd_free_tlb(tlb, pmd, address) \ do { \ - pagetable_pmd_dtor(virt_to_ptdesc(pmd)); \ + pagetable_dtor(virt_to_ptdesc(pmd)); \ tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(pmd)); \ } while (0) @@ -43,7 +43,7 @@ do { \ #define __pud_free_tlb(tlb, pud, address) \ do { \ - pagetable_pud_dtor(virt_to_ptdesc(pud)); \ + pagetable_dtor(virt_to_ptdesc(pud)); \ tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(pud)); \ } while (0) diff 
--git a/arch/um/kernel/load_file.c b/arch/um/kernel/load_file.c index 5cecd0e291fb..cb9d178ab7d8 100644 --- a/arch/um/kernel/load_file.c +++ b/arch/um/kernel/load_file.c @@ -48,9 +48,7 @@ void *uml_load_file(const char *filename, unsigned long long *size) return NULL; } - area = memblock_alloc(*size, SMP_CACHE_BYTES); - if (!area) - panic("%s: Failed to allocate %llu bytes\n", __func__, *size); + area = memblock_alloc_or_panic(*size, SMP_CACHE_BYTES); if (__uml_load_file(filename, area, *size)) { memblock_free(area, *size); diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 53248ed04771..d98812907493 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -214,14 +214,13 @@ void free_initmem(void) pgd_t *pgd_alloc(struct mm_struct *mm) { - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL); + pgd_t *pgd = __pgd_alloc(mm, 0); - if (pgd) { - memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); + if (pgd) memcpy(pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); - } + return pgd; } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ef6cfea9df73..e95fecde30f0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -321,6 +321,7 @@ config X86 select FUNCTION_ALIGNMENT_4B imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE + select ARCH_SUPPORTS_PT_RECLAIM if X86_64 config INSTRUCTION_DECODER def_bool y diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index c5b0148b8c0a..a3c9b7c67640 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -1572,9 +1572,7 @@ static void __init alloc_runtime_data(int cpu) struct svsm_ca *caa; /* Allocate the SVSM CA page if an SVSM is present */ - caa = memblock_alloc(sizeof(*caa), PAGE_SIZE); - if (!caa) - panic("Can't allocate SVSM CA page\n"); + caa = memblock_alloc_or_panic(sizeof(*caa), PAGE_SIZE); per_cpu(svsm_caa, cpu) = caa; per_cpu(svsm_caa_pa, cpu) = __pa(caa); diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index e525cd85f999..1ef08289e667 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -73,10 +73,14 @@ unsigned long tcp_ptr__ = raw_cpu_read_long(this_cpu_off); \ \ tcp_ptr__ += (__force unsigned long)(_ptr); \ - (typeof(*(_ptr)) __kernel __force *)tcp_ptr__; \ + (TYPEOF_UNQUAL(*(_ptr)) __force __kernel *)tcp_ptr__; \ }) #else -#define arch_raw_cpu_ptr(_ptr) ({ BUILD_BUG(); (typeof(_ptr))0; }) +#define arch_raw_cpu_ptr(_ptr) \ +({ \ + BUILD_BUG(); \ + (TYPEOF_UNQUAL(*(_ptr)) __force __kernel *)0; \ +}) #endif #define PER_CPU_VAR(var) %__percpu_seg:(var)__percpu_rel @@ -91,9 +95,23 @@ #endif /* CONFIG_SMP */ -#define __my_cpu_type(var) typeof(var) __percpu_seg_override -#define __my_cpu_ptr(ptr) (__my_cpu_type(*(ptr))*)(__force uintptr_t)(ptr) -#define __my_cpu_var(var) (*__my_cpu_ptr(&(var))) +/* + * XXX: Remove test for __CHECKER__ once + * sparse learns about __typeof_unqual__. 
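These percpu changes rest on one assumption about TYPEOF_UNQUAL(): it behaves like typeof() but strips qualifiers, including the named address space that CONFIG_USE_X86_SEG_SUPPORT attaches to per-CPU variables, so local temporaries land in the generic address space. A sketch of the assumed mapping and its effect (this_cpu_var is a hypothetical __percpu variable):

	/* assumed definition when the compiler supports it */
	#define TYPEOF_UNQUAL(expr)	__typeof_unqual__(expr)

	typeof(this_cpu_var)        tmp1;	/* may carry __seg_gs and be
						   unusable as a plain local */
	TYPEOF_UNQUAL(this_cpu_var) tmp2;	/* plain unqualified type */

That is also why the macros keep a sparse escape hatch: until sparse learns __typeof_unqual__, __CHECKER__ builds fall back to the old typeof-based definitions.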
+ */ +#if defined(CONFIG_USE_X86_SEG_SUPPORT) && \ + defined(CONFIG_CC_HAS_TYPEOF_UNQUAL) && !defined(__CHECKER__) +# define __my_cpu_type(var) typeof(var) +# define __my_cpu_ptr(ptr) (ptr) +# define __my_cpu_var(var) (var) + +# define __percpu_qual __percpu_seg_override +#else +# define __my_cpu_type(var) typeof(var) __percpu_seg_override +# define __my_cpu_ptr(ptr) (__my_cpu_type(*(ptr))*)(__force uintptr_t)(ptr) +# define __my_cpu_var(var) (*__my_cpu_ptr(&(var))) +#endif + #define __percpu_arg(x) __percpu_prefix "%" #x #define __force_percpu_arg(x) __force_percpu_prefix "%" #x @@ -180,7 +198,7 @@ do { \ __pcpu_type_##size pto_val__ = __pcpu_cast_##size(_val); \ \ if (0) { \ - typeof(_var) pto_tmp__; \ + TYPEOF_UNQUAL(_var) pto_tmp__; \ pto_tmp__ = (_val); \ (void)pto_tmp__; \ } \ @@ -219,7 +237,7 @@ do { \ __pcpu_type_##size pto_val__ = __pcpu_cast_##size(_val); \ \ if (0) { \ - typeof(_var) pto_tmp__; \ + TYPEOF_UNQUAL(_var) pto_tmp__; \ pto_tmp__ = (_val); \ (void)pto_tmp__; \ } \ @@ -240,7 +258,7 @@ do { \ (val) == (typeof(val))-1)) ? (int)(val) : 0; \ \ if (0) { \ - typeof(var) pao_tmp__; \ + TYPEOF_UNQUAL(var) pao_tmp__; \ pao_tmp__ = (val); \ (void)pao_tmp__; \ } \ @@ -273,7 +291,7 @@ do { \ */ #define raw_percpu_xchg_op(_var, _nval) \ ({ \ - typeof(_var) pxo_old__ = raw_cpu_read(_var); \ + TYPEOF_UNQUAL(_var) pxo_old__ = raw_cpu_read(_var); \ \ raw_cpu_write(_var, _nval); \ \ @@ -287,7 +305,7 @@ do { \ */ #define this_percpu_xchg_op(_var, _nval) \ ({ \ - typeof(_var) pxo_old__ = this_cpu_read(_var); \ + TYPEOF_UNQUAL(_var) pxo_old__ = this_cpu_read(_var); \ \ do { } while (!this_cpu_try_cmpxchg(_var, &pxo_old__, _nval)); \ \ diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index dcd836b59beb..dd4841231bb9 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -147,24 +147,6 @@ static inline void pgd_populate_safe(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4 set_pgd_safe(pgd, __pgd(_PAGE_TABLE | __pa(p4d))); } -static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr) -{ - gfp_t gfp = GFP_KERNEL_ACCOUNT; - - if (mm == &init_mm) - gfp &= ~__GFP_ACCOUNT; - return (p4d_t *)get_zeroed_page(gfp); -} - -static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) -{ - if (!pgtable_l5_enabled()) - return; - - BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); - free_page((unsigned long)p4d); -} - extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d); static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h index 4d3c9d00d6b6..77f52bc1578a 100644 --- a/arch/x86/include/asm/tlb.h +++ b/arch/x86/include/asm/tlb.h @@ -20,22 +20,9 @@ static inline void tlb_flush(struct mmu_gather *tlb) flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables); } -/* - * While x86 architecture in general requires an IPI to perform TLB - * shootdown, enablement code for several hypervisors overrides - * .flush_tlb_others hook in pv_mmu_ops and implements it by issuing - * a hypercall. To keep software pagetable walkers safe in this case we - * switch to RCU based table free (MMU_GATHER_RCU_TABLE_FREE). See the comment - * below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h - * for more details. 
- */ -static inline void __tlb_remove_table(void *table) -{ - free_page_and_swap_cache(table); -} - static inline void invlpg(unsigned long addr) { asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); } + #endif /* _ASM_X86_TLB_H */ diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 3a44a9dc3fb7..7c15d6e83c37 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -911,11 +911,8 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) * the resource tree during the lateinit timeframe. */ #define HPET_RESOURCE_NAME_SIZE 9 - hpet_res = memblock_alloc(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE, + hpet_res = memblock_alloc_or_panic(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE, SMP_CACHE_BYTES); - if (!hpet_res) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE); hpet_res->name = (void *)&hpet_res[1]; hpet_res->flags = IORESOURCE_MEM; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1029ea4ac8ba..a57d3fa7c6b6 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2503,9 +2503,7 @@ static struct resource * __init ioapic_setup_resources(void) n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); n *= nr_ioapics; - mem = memblock_alloc(n, SMP_CACHE_BYTES); - if (!mem) - panic("%s: Failed to allocate %lu bytes\n", __func__, n); + mem = memblock_alloc_or_panic(n, SMP_CACHE_BYTES); res = (void *)mem; mem += sizeof(struct resource) * nr_ioapics; @@ -2564,11 +2562,8 @@ void __init io_apic_init_mappings(void) #ifdef CONFIG_X86_32 fake_ioapic_page: #endif - ioapic_phys = (unsigned long)memblock_alloc(PAGE_SIZE, + ioapic_phys = (unsigned long)memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); - if (!ioapic_phys) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); ioapic_phys = __pa(ioapic_phys); } io_apic_set_fixmap(idx, ioapic_phys); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 4893d30ce438..82b96ed9890a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1146,11 +1146,8 @@ void __init e820__reserve_resources(void) struct resource *res; u64 end; - res = memblock_alloc(sizeof(*res) * e820_table->nr_entries, + res = memblock_alloc_or_panic(sizeof(*res) * e820_table->nr_entries, SMP_CACHE_BYTES); - if (!res) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*res) * e820_table->nr_entries); e820_res = res; for (i = 0; i < e820_table->nr_entries; i++) { diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 9c9faa1634fb..102641fd2172 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -655,7 +655,7 @@ void kgdb_arch_late(void) if (breakinfo[i].pev) continue; breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL, NULL); - if (IS_ERR((void * __force)breakinfo[i].pev)) { + if (IS_ERR_PCPU(breakinfo[i].pev)) { printk(KERN_ERR "kgdb: Could not allocate hw" "breakpoints\nDisabling the kernel debugger\n"); breakinfo[i].pev = NULL; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index fec381533555..70161999c973 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -59,10 +59,20 @@ void __init native_pv_lock_init(void) static_branch_enable(&virt_spin_lock_key); } +#ifndef CONFIG_PT_RECLAIM static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) { - tlb_remove_page(tlb, table); + struct ptdesc *ptdesc = (struct ptdesc *)table; + + pagetable_dtor(ptdesc); + 
tlb_remove_page(tlb, ptdesc_page(ptdesc)); } +#else +static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) +{ + tlb_remove_table(tlb, table); +} +#endif struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f1fea506e20f..cebee310e200 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -259,6 +259,7 @@ static void __init relocate_initrd(void) u64 ramdisk_image = get_ramdisk_image(); u64 ramdisk_size = get_ramdisk_size(); u64 area_size = PAGE_ALIGN(ramdisk_size); + int ret = 0; /* We need to move the initrd down into directly mapped mem */ u64 relocated_ramdisk = memblock_phys_alloc_range(area_size, PAGE_SIZE, 0, @@ -272,7 +273,9 @@ static void __init relocate_initrd(void) printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n", relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1); - copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size); + ret = copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size); + if (ret) + panic("Copy RAMDISK failed\n"); printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" " [mem %#010llx-%#010llx]\n", diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 5745a354a241..1fef5ad32d5a 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -19,12 +19,23 @@ EXPORT_SYMBOL(physical_mask); #endif #ifndef CONFIG_PARAVIRT +#ifndef CONFIG_PT_RECLAIM static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) { - tlb_remove_page(tlb, table); + struct ptdesc *ptdesc = (struct ptdesc *)table; + + pagetable_dtor(ptdesc); + tlb_remove_page(tlb, ptdesc_page(ptdesc)); } -#endif +#else +static inline +void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) +{ + tlb_remove_table(tlb, table); +} +#endif /* !CONFIG_PT_RECLAIM */ +#endif /* !CONFIG_PARAVIRT */ gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM; @@ -52,15 +63,13 @@ early_param("userpte", setup_userpte); void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { - pagetable_pte_dtor(page_ptdesc(pte)); paravirt_release_pte(page_to_pfn(pte)); - paravirt_tlb_remove_table(tlb, pte); + paravirt_tlb_remove_table(tlb, page_ptdesc(pte)); } #if CONFIG_PGTABLE_LEVELS > 2 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { - struct ptdesc *ptdesc = virt_to_ptdesc(pmd); paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); /* * NOTE! 
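The CONFIG_PT_RECLAIM split in the paravirt hunks above encodes a division of labour: without PT_RECLAIM the table is freed as an ordinary page, so the destructor must run at the call site; with PT_RECLAIM the generic tlb_remove_table() path takes ownership, and the destructor is assumed to run in generic code once the table is really freed, roughly:

	/* sketch of the generic-side behaviour these hunks assume */
	static void __tlb_remove_table_sketch(void *table)
	{
		struct ptdesc *ptdesc = (struct ptdesc *)table;

		pagetable_dtor(ptdesc);
		pagetable_free(ptdesc);
	}

Once every architecture's __tlb_remove_table() reduced to this same dtor-plus-free shape, the per-arch definitions (x86, riscv, s390 earlier in this section) became removable.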
For PAE, any changes to the top page-directory-pointer-table @@ -69,25 +78,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) #ifdef CONFIG_X86_PAE tlb->need_flush_all = 1; #endif - pagetable_pmd_dtor(ptdesc); - paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc)); + paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pmd)); } #if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { - struct ptdesc *ptdesc = virt_to_ptdesc(pud); - - pagetable_pud_dtor(ptdesc); paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); - paravirt_tlb_remove_table(tlb, virt_to_page(pud)); + paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pud)); } #if CONFIG_PGTABLE_LEVELS > 4 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) { paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); - paravirt_tlb_remove_table(tlb, virt_to_page(p4d)); + paravirt_tlb_remove_table(tlb, virt_to_ptdesc(p4d)); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ @@ -222,7 +227,7 @@ static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) if (pmds[i]) { ptdesc = virt_to_ptdesc(pmds[i]); - pagetable_pmd_dtor(ptdesc); + pagetable_dtor(ptdesc); pagetable_free(ptdesc); mm_dec_nr_pmds(mm); } @@ -392,15 +397,14 @@ void __init pgtable_cache_init(void) SLAB_PANIC, NULL); } -static inline pgd_t *_pgd_alloc(void) +static inline pgd_t *_pgd_alloc(struct mm_struct *mm) { /* * If no SHARED_KERNEL_PMD, PAE kernel is running as a Xen domain. * We allocate one page for pgd. */ if (!SHARED_KERNEL_PMD) - return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER, - PGD_ALLOCATION_ORDER); + return __pgd_alloc(mm, PGD_ALLOCATION_ORDER); /* * Now PAE kernel is not running as a Xen domain. We can allocate @@ -409,24 +413,23 @@ static inline pgd_t *_pgd_alloc(void) return kmem_cache_alloc(pgd_cache, GFP_PGTABLE_USER); } -static inline void _pgd_free(pgd_t *pgd) +static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd) { if (!SHARED_KERNEL_PMD) - free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); + __pgd_free(mm, pgd); else kmem_cache_free(pgd_cache, pgd); } #else -static inline pgd_t *_pgd_alloc(void) +static inline pgd_t *_pgd_alloc(struct mm_struct *mm) { - return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER, - PGD_ALLOCATION_ORDER); + return __pgd_alloc(mm, PGD_ALLOCATION_ORDER); } -static inline void _pgd_free(pgd_t *pgd) +static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd) { - free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); + __pgd_free(mm, pgd); } #endif /* CONFIG_X86_PAE */ @@ -436,7 +439,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS]; pmd_t *pmds[MAX_PREALLOCATED_PMDS]; - pgd = _pgd_alloc(); + pgd = _pgd_alloc(mm); if (pgd == NULL) goto out; @@ -479,7 +482,7 @@ out_free_pmds: if (sizeof(pmds) != 0) free_pmds(mm, pmds, PREALLOCATED_PMDS); out_free_pgd: - _pgd_free(pgd); + _pgd_free(mm, pgd); out: return NULL; } @@ -489,7 +492,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) pgd_mop_up_pmds(mm, pgd); pgd_dtor(pgd); paravirt_pgd_free(mm, pgd); - _pgd_free(pgd); + _pgd_free(mm, pgd); } /* @@ -856,7 +859,7 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr) free_page((unsigned long)pmd_sv); - pagetable_pmd_dtor(virt_to_ptdesc(pmd)); + pagetable_dtor(virt_to_ptdesc(pmd)); free_page((unsigned long)pmd); return 1; diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c index 74ebd6882690..cf5dca2dbb91 100644 --- a/arch/x86/platform/olpc/olpc_dt.c +++ b/arch/x86/platform/olpc/olpc_dt.c @@ -136,11 +136,7 @@ 
void * __init prom_early_alloc(unsigned long size) * fast enough on the platforms we care about while minimizing * wasted bootmem) and hand off chunks of it to callers. */ - res = memblock_alloc(chunk_size, SMP_CACHE_BYTES); - if (!res) - panic("%s: Failed to allocate %zu bytes\n", __func__, - chunk_size); - BUG_ON(!res); + res = memblock_alloc_or_panic(chunk_size, SMP_CACHE_BYTES); prom_early_allocated += chunk_size; memset(res, 0, chunk_size); free_mem = chunk_size; diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index b52d3e17e2c1..56914e21e303 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -178,13 +178,7 @@ static void p2m_init_identity(unsigned long *p2m, unsigned long pfn) static void * __ref alloc_p2m_page(void) { if (unlikely(!slab_is_available())) { - void *ptr = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - - if (!ptr) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); - - return ptr; + return memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); } return (void *)__get_free_page(GFP_KERNEL); diff --git a/arch/xtensa/include/asm/pgalloc.h b/arch/xtensa/include/asm/pgalloc.h index 7fc0f9126dd3..1919ee9c3dd6 100644 --- a/arch/xtensa/include/asm/pgalloc.h +++ b/arch/xtensa/include/asm/pgalloc.h @@ -29,7 +29,7 @@ static inline pgd_t* pgd_alloc(struct mm_struct *mm) { - return (pgd_t*) __get_free_page(GFP_KERNEL | __GFP_ZERO); + return __pgd_alloc(mm, 0); } static inline void ptes_clear(pte_t *ptep) diff --git a/arch/xtensa/mm/kasan_init.c b/arch/xtensa/mm/kasan_init.c index f00d122aa806..f39c4d83173a 100644 --- a/arch/xtensa/mm/kasan_init.c +++ b/arch/xtensa/mm/kasan_init.c @@ -39,11 +39,7 @@ static void __init populate(void *start, void *end) unsigned long i, j; unsigned long vaddr = (unsigned long)start; pmd_t *pmd = pmd_off_k(vaddr); - pte_t *pte = memblock_alloc(n_pages * sizeof(pte_t), PAGE_SIZE); - - if (!pte) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, n_pages * sizeof(pte_t), PAGE_SIZE); + pte_t *pte = memblock_alloc_or_panic(n_pages * sizeof(pte_t), PAGE_SIZE); pr_debug("%s: %p - %p\n", __func__, start, end); diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c index ca87a0939135..f7fb7205028d 100644 --- a/drivers/acpi/acpi_extlog.c +++ b/drivers/acpi/acpi_extlog.c @@ -251,6 +251,10 @@ static int __init extlog_init(void) } extlog_l1_hdr = acpi_os_map_iomem(l1_dirbase, l1_hdr_size); + if (!extlog_l1_hdr) { + rc = -ENOMEM; + goto err_release_l1_hdr; + } l1_head = (struct extlog_l1_head *)extlog_l1_hdr; l1_size = l1_head->total_len; l1_percpu_entry = l1_head->entries; @@ -268,6 +272,10 @@ static int __init extlog_init(void) goto err; } extlog_l1_addr = acpi_os_map_iomem(l1_dirbase, l1_size); + if (!extlog_l1_addr) { + rc = -ENOMEM; + goto err_release_l1_dir; + } l1_entry_base = (u64 *)((u8 *)extlog_l1_addr + l1_hdr_size); /* remap elog table */ @@ -279,6 +287,10 @@ static int __init extlog_init(void) goto err_release_l1_dir; } elog_addr = acpi_os_map_iomem(elog_base, elog_size); + if (!elog_addr) { + rc = -ENOMEM; + goto err_release_elog; + } rc = -ENOMEM; /* allocate buffer to save elog record */ @@ -300,6 +312,8 @@ err_release_l1_dir: if (extlog_l1_addr) acpi_os_unmap_iomem(extlog_l1_addr, l1_size); release_mem_region(l1_dirbase, l1_size); +err_release_l1_hdr: + release_mem_region(l1_dirbase, l1_hdr_size); err: pr_warn(FW_BUG "Extended error log disabled because of problems parsing f/w tables\n"); return rc; diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index 
bec0dcd1f9c3..59fffe34c9d0 100644 --- a/drivers/acpi/numa/srat.c +++ b/drivers/acpi/numa/srat.c @@ -81,6 +81,92 @@ int acpi_map_pxm_to_node(int pxm) } EXPORT_SYMBOL(acpi_map_pxm_to_node); +#ifdef CONFIG_NUMA_EMU +/* + * Take max_nid - 1 fake-numa nodes into account in both + * pxm_to_node_map()/node_to_pxm_map[] tables. + */ +int __init fix_pxm_node_maps(int max_nid) +{ + static int pxm_to_node_map_copy[MAX_PXM_DOMAINS] __initdata + = { [0 ... MAX_PXM_DOMAINS - 1] = NUMA_NO_NODE }; + static int node_to_pxm_map_copy[MAX_NUMNODES] __initdata + = { [0 ... MAX_NUMNODES - 1] = PXM_INVAL }; + int i, j, index = -1, count = 0; + nodemask_t nodes_to_enable; + + if (numa_off || srat_disabled()) + return -1; + + /* find fake nodes PXM mapping */ + for (i = 0; i < MAX_NUMNODES; i++) { + if (node_to_pxm_map[i] != PXM_INVAL) { + for (j = 0; j <= max_nid; j++) { + if ((emu_nid_to_phys[j] == i) && + WARN(node_to_pxm_map_copy[j] != PXM_INVAL, + "Node %d is already bound to PXM %d\n", + j, node_to_pxm_map_copy[j])) + return -1; + if (emu_nid_to_phys[j] == i) { + node_to_pxm_map_copy[j] = + node_to_pxm_map[i]; + if (j > index) + index = j; + count++; + } + } + } + } + if (WARN(index != max_nid, "%d max nid when expected %d\n", + index, max_nid)) + return -1; + + nodes_clear(nodes_to_enable); + + /* map phys nodes not used for fake nodes */ + for (i = 0; i < MAX_NUMNODES; i++) { + if (node_to_pxm_map[i] != PXM_INVAL) { + for (j = 0; j <= max_nid; j++) + if (emu_nid_to_phys[j] == i) + break; + /* fake nodes PXM mapping has been done */ + if (j <= max_nid) + continue; + /* find first hole */ + for (j = 0; + j < MAX_NUMNODES && + node_to_pxm_map_copy[j] != PXM_INVAL; + j++) + ; + if (WARN(j == MAX_NUMNODES, + "Number of nodes exceeds MAX_NUMNODES\n")) + return -1; + node_to_pxm_map_copy[j] = node_to_pxm_map[i]; + node_set(j, nodes_to_enable); + count++; + } + } + + /* creating reverse mapping in pxm_to_node_map[] */ + for (i = 0; i < MAX_NUMNODES; i++) + if (node_to_pxm_map_copy[i] != PXM_INVAL && + pxm_to_node_map_copy[node_to_pxm_map_copy[i]] == NUMA_NO_NODE) + pxm_to_node_map_copy[node_to_pxm_map_copy[i]] = i; + + /* overwrite with new mapping */ + for (i = 0; i < MAX_NUMNODES; i++) { + node_to_pxm_map[i] = node_to_pxm_map_copy[i]; + pxm_to_node_map[i] = pxm_to_node_map_copy[i]; + } + + /* enable other nodes found in PXM for hotplug */ + nodes_or(numa_nodes_parsed, nodes_to_enable, numa_nodes_parsed); + + pr_debug("found %d total number of nodes\n", count); + return 0; +} +#endif + static void __init acpi_table_print_srat_entry(struct acpi_subtable_header *header) { diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 67858eeb92ed..348c5dbbfa68 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -512,7 +512,7 @@ static ssize_t auto_online_blocks_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", - online_type_to_str[mhp_default_online_type]); + online_type_to_str[mhp_get_default_online_type()]); } static ssize_t auto_online_blocks_store(struct device *dev, @@ -524,7 +524,7 @@ static ssize_t auto_online_blocks_store(struct device *dev, if (online_type < 0) return -EINVAL; - mhp_default_online_type = online_type; + mhp_set_default_online_type(online_type); return count; } diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 7903a4da40ac..70ecaee25c20 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -55,8 +55,8 @@ static size_t huge_class_size; static const struct
block_device_operations zram_devops; static void zram_free_page(struct zram *zram, size_t index); -static int zram_read_page(struct zram *zram, struct page *page, u32 index, - struct bio *parent); +static int zram_read_from_zspool(struct zram *zram, struct page *page, + u32 index); static int zram_slot_trylock(struct zram *zram, u32 index) { @@ -112,17 +112,6 @@ static void zram_clear_flag(struct zram *zram, u32 index, zram->table[index].flags &= ~BIT(flag); } -static inline void zram_set_element(struct zram *zram, u32 index, - unsigned long element) -{ - zram->table[index].element = element; -} - -static unsigned long zram_get_element(struct zram *zram, u32 index) -{ - return zram->table[index].element; -} - static size_t zram_get_obj_size(struct zram *zram, u32 index) { return zram->table[index].flags & (BIT(ZRAM_FLAG_SHIFT) - 1); @@ -143,6 +132,27 @@ static inline bool zram_allocated(struct zram *zram, u32 index) zram_test_flag(zram, index, ZRAM_WB); } +static inline void update_used_max(struct zram *zram, const unsigned long pages) +{ + unsigned long cur_max = atomic_long_read(&zram->stats.max_used_pages); + + do { + if (cur_max >= pages) + return; + } while (!atomic_long_try_cmpxchg(&zram->stats.max_used_pages, + &cur_max, pages)); +} + +static bool zram_can_store_page(struct zram *zram) +{ + unsigned long alloced_pages; + + alloced_pages = zs_get_total_pages(zram->mem_pool); + update_used_max(zram, alloced_pages); + + return !zram->limit_pages || alloced_pages <= zram->limit_pages; +} + #if PAGE_SIZE != 4096 static inline bool is_partial_io(struct bio_vec *bvec) { @@ -277,18 +287,6 @@ static struct zram_pp_slot *select_pp_slot(struct zram_pp_ctl *ctl) } #endif -static inline void update_used_max(struct zram *zram, - const unsigned long pages) -{ - unsigned long cur_max = atomic_long_read(&zram->stats.max_used_pages); - - do { - if (cur_max >= pages) - return; - } while (!atomic_long_try_cmpxchg(&zram->stats.max_used_pages, - &cur_max, pages)); -} - static inline void zram_fill_page(void *ptr, unsigned long len, unsigned long value) { @@ -833,13 +831,10 @@ static ssize_t writeback_store(struct device *dev, */ if (!zram_test_flag(zram, index, ZRAM_PP_SLOT)) goto next; + if (zram_read_from_zspool(zram, page, index)) + goto next; zram_slot_unlock(zram, index); - if (zram_read_page(zram, page, index, NULL)) { - release_pp_slot(zram, pps); - continue; - } - bio_init(&bio, zram->bdev, &bio_vec, 1, REQ_OP_WRITE | REQ_SYNC); bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9); @@ -879,7 +874,7 @@ static ssize_t writeback_store(struct device *dev, zram_free_page(zram, index); zram_set_flag(zram, index, ZRAM_WB); - zram_set_element(zram, index, blk_idx); + zram_set_handle(zram, index, blk_idx); blk_idx = 0; atomic64_inc(&zram->stats.pages_stored); spin_lock(&zram->wb_limit_lock); @@ -889,6 +884,8 @@ static ssize_t writeback_store(struct device *dev, next: zram_slot_unlock(zram, index); release_pp_slot(zram, pps); + + cond_resched(); } if (blk_idx) @@ -1505,7 +1502,7 @@ static void zram_free_page(struct zram *zram, size_t index) if (zram_test_flag(zram, index, ZRAM_WB)) { zram_clear_flag(zram, index, ZRAM_WB); - free_block_bdev(zram, zram_get_element(zram, index)); + free_block_bdev(zram, zram_get_handle(zram, index)); goto out; } @@ -1533,6 +1530,56 @@ out: zram_set_obj_size(zram, index, 0); } +static int read_same_filled_page(struct zram *zram, struct page *page, + u32 index) +{ + void *mem; + + mem = kmap_local_page(page); + zram_fill_page(mem, PAGE_SIZE, zram_get_handle(zram, index)); + 
kunmap_local(mem); + return 0; +} + +static int read_incompressible_page(struct zram *zram, struct page *page, + u32 index) +{ + unsigned long handle; + void *src, *dst; + + handle = zram_get_handle(zram, index); + src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); + dst = kmap_local_page(page); + copy_page(dst, src); + kunmap_local(dst); + zs_unmap_object(zram->mem_pool, handle); + + return 0; +} + +static int read_compressed_page(struct zram *zram, struct page *page, u32 index) +{ + struct zcomp_strm *zstrm; + unsigned long handle; + unsigned int size; + void *src, *dst; + int ret, prio; + + handle = zram_get_handle(zram, index); + size = zram_get_obj_size(zram, index); + prio = zram_get_priority(zram, index); + + zstrm = zcomp_stream_get(zram->comps[prio]); + src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); + dst = kmap_local_page(page); + ret = zcomp_decompress(zram->comps[prio], zstrm, src, size, dst); + kunmap_local(dst); + zs_unmap_object(zram->mem_pool, handle); + zcomp_stream_put(zram->comps[prio]); + + return ret; +} + /* * Reads (decompresses if needed) a page from zspool (zsmalloc). * Corresponding ZRAM slot should be locked. @@ -1540,47 +1587,14 @@ out: static int zram_read_from_zspool(struct zram *zram, struct page *page, u32 index) { - struct zcomp_strm *zstrm; - unsigned long handle; - unsigned int size; - void *src, *dst; - u32 prio; - int ret; + if (zram_test_flag(zram, index, ZRAM_SAME) || + !zram_get_handle(zram, index)) + return read_same_filled_page(zram, page, index); - handle = zram_get_handle(zram, index); - if (!handle || zram_test_flag(zram, index, ZRAM_SAME)) { - unsigned long value; - void *mem; - - value = handle ? zram_get_element(zram, index) : 0; - mem = kmap_local_page(page); - zram_fill_page(mem, PAGE_SIZE, value); - kunmap_local(mem); - return 0; - } - - size = zram_get_obj_size(zram, index); - - if (size != PAGE_SIZE) { - prio = zram_get_priority(zram, index); - zstrm = zcomp_stream_get(zram->comps[prio]); - } - - src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); - if (size == PAGE_SIZE) { - dst = kmap_local_page(page); - copy_page(dst, src); - kunmap_local(dst); - ret = 0; - } else { - dst = kmap_local_page(page); - ret = zcomp_decompress(zram->comps[prio], zstrm, - src, size, dst); - kunmap_local(dst); - zcomp_stream_put(zram->comps[prio]); - } - zs_unmap_object(zram->mem_pool, handle); - return ret; + if (!zram_test_flag(zram, index, ZRAM_HUGE)) + return read_compressed_page(zram, page, index); + else + return read_incompressible_page(zram, page, index); } static int zram_read_page(struct zram *zram, struct page *page, u32 index, @@ -1600,7 +1614,7 @@ static int zram_read_page(struct zram *zram, struct page *page, u32 index, */ zram_slot_unlock(zram, index); - ret = read_from_bdev(zram, page, zram_get_element(zram, index), + ret = read_from_bdev(zram, page, zram_get_handle(zram, index), parent); } @@ -1638,33 +1652,89 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, return zram_read_page(zram, bvec->bv_page, index, bio); } +static int write_same_filled_page(struct zram *zram, unsigned long fill, + u32 index) +{ + zram_slot_lock(zram, index); + zram_set_flag(zram, index, ZRAM_SAME); + zram_set_handle(zram, index, fill); + zram_slot_unlock(zram, index); + + atomic64_inc(&zram->stats.same_pages); + atomic64_inc(&zram->stats.pages_stored); + + return 0; +} + +static int write_incompressible_page(struct zram *zram, struct page *page, + u32 index) +{ + unsigned long handle; + void *src, *dst; + + /* + * This function is 
called from preemptible context, so we don't need + * the optimistic handle allocation with a fallback to the pessimistic + * one, like we do for compressible pages. + */ + handle = zs_malloc(zram->mem_pool, PAGE_SIZE, + GFP_NOIO | __GFP_HIGHMEM | __GFP_MOVABLE); + if (IS_ERR_VALUE(handle)) + return PTR_ERR((void *)handle); + + if (!zram_can_store_page(zram)) { + zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); + zs_free(zram->mem_pool, handle); + return -ENOMEM; + } + + dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO); + src = kmap_local_page(page); + memcpy(dst, src, PAGE_SIZE); + kunmap_local(src); + zs_unmap_object(zram->mem_pool, handle); + + zram_slot_lock(zram, index); + zram_set_flag(zram, index, ZRAM_HUGE); + zram_set_handle(zram, index, handle); + zram_set_obj_size(zram, index, PAGE_SIZE); + zram_slot_unlock(zram, index); + + atomic64_add(PAGE_SIZE, &zram->stats.compr_data_size); + atomic64_inc(&zram->stats.huge_pages); + atomic64_inc(&zram->stats.huge_pages_since); + atomic64_inc(&zram->stats.pages_stored); + + return 0; +} + static int zram_write_page(struct zram *zram, struct page *page, u32 index) { int ret = 0; - unsigned long alloced_pages; unsigned long handle = -ENOMEM; unsigned int comp_len = 0; - void *src, *dst, *mem; + void *dst, *mem; struct zcomp_strm *zstrm; unsigned long element = 0; - enum zram_pageflags flags = 0; + bool same_filled; + + /* First, free memory allocated to this slot (if any) */ + zram_slot_lock(zram, index); + zram_free_page(zram, index); + zram_slot_unlock(zram, index); mem = kmap_local_page(page); - if (page_same_filled(mem, &element)) { - kunmap_local(mem); - /* Free memory associated with this sector now. */ - flags = ZRAM_SAME; - atomic64_inc(&zram->stats.same_pages); - goto out; - } + same_filled = page_same_filled(mem, &element); kunmap_local(mem); + if (same_filled) + return write_same_filled_page(zram, element, index); compress_again: zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP]); - src = kmap_local_page(page); + mem = kmap_local_page(page); ret = zcomp_compress(zram->comps[ZRAM_PRIMARY_COMP], zstrm, - src, &comp_len); - kunmap_local(src); + mem, &comp_len); + kunmap_local(mem); if (unlikely(ret)) { zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); @@ -1673,8 +1743,11 @@ compress_again: return ret; } - if (comp_len >= huge_class_size) - comp_len = PAGE_SIZE; + if (comp_len >= huge_class_size) { + zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); + return write_incompressible_page(zram, page, index); + } + /* * handle allocation has 2 paths: * a) fast path is executed with preemption disabled (for @@ -1690,35 +1763,23 @@ compress_again: */ if (IS_ERR_VALUE(handle)) handle = zs_malloc(zram->mem_pool, comp_len, - __GFP_KSWAPD_RECLAIM | - __GFP_NOWARN | - __GFP_HIGHMEM | - __GFP_MOVABLE); + __GFP_KSWAPD_RECLAIM | + __GFP_NOWARN | + __GFP_HIGHMEM | + __GFP_MOVABLE); if (IS_ERR_VALUE(handle)) { zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); atomic64_inc(&zram->stats.writestall); handle = zs_malloc(zram->mem_pool, comp_len, - GFP_NOIO | __GFP_HIGHMEM | - __GFP_MOVABLE); + GFP_NOIO | __GFP_HIGHMEM | + __GFP_MOVABLE); if (IS_ERR_VALUE(handle)) return PTR_ERR((void *)handle); - if (comp_len != PAGE_SIZE) - goto compress_again; - /* - * If the page is not compressible, you need to acquire the - * lock and execute the code below. The zcomp_stream_get() - * call is needed to disable the cpu hotplug and grab the - * zstrm buffer back. It is necessary that the dereferencing - * of the zstrm variable below occurs correctly.
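[Note: after this refactor the write side dispatches much like the new read-side helpers above. A condensed sketch of the resulting zram_write_page() flow, with locking and error paths elided:

        if (page_same_filled(mem, &element))
                return write_same_filled_page(zram, element, index);

        ret = zcomp_compress(zram->comps[ZRAM_PRIMARY_COMP], zstrm,
                             mem, &comp_len);
        if (comp_len >= huge_class_size)
                return write_incompressible_page(zram, page, index);

        /* otherwise allocate a zsmalloc handle and store the compressed data */

Incompressible pages now leave the compress-retry loop entirely, which is why the stream re-acquisition described in the removed comment here is no longer needed.]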
- */ - zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP]); + goto compress_again; } - alloced_pages = zs_get_total_pages(zram->mem_pool); - update_used_max(zram, alloced_pages); - - if (zram->limit_pages && alloced_pages > zram->limit_pages) { + if (!zram_can_store_page(zram)) { zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); zs_free(zram->mem_pool, handle); return -ENOMEM; @@ -1726,41 +1787,19 @@ compress_again: dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO); - src = zstrm->buffer; - if (comp_len == PAGE_SIZE) - src = kmap_local_page(page); - memcpy(dst, src, comp_len); - if (comp_len == PAGE_SIZE) - kunmap_local(src); - + memcpy(dst, zstrm->buffer, comp_len); zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); zs_unmap_object(zram->mem_pool, handle); - atomic64_add(comp_len, &zram->stats.compr_data_size); -out: - /* - * Free memory associated with this sector - * before overwriting unused sectors. - */ + zram_slot_lock(zram, index); - zram_free_page(zram, index); - - if (comp_len == PAGE_SIZE) { - zram_set_flag(zram, index, ZRAM_HUGE); - atomic64_inc(&zram->stats.huge_pages); - atomic64_inc(&zram->stats.huge_pages_since); - } - - if (flags) { - zram_set_flag(zram, index, flags); - zram_set_element(zram, index, element); - } else { - zram_set_handle(zram, index, handle); - zram_set_obj_size(zram, index, comp_len); - } + zram_set_handle(zram, index, handle); + zram_set_obj_size(zram, index, comp_len); zram_slot_unlock(zram, index); /* Update stats */ atomic64_inc(&zram->stats.pages_stored); + atomic64_add(comp_len, &zram->stats.compr_data_size); + return ret; } diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 134be414e210..db78d7c01b9a 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -62,10 +62,7 @@ enum zram_pageflags { /* Allocated for each disk page */ struct zram_table_entry { - union { - unsigned long handle; - unsigned long element; - }; + unsigned long handle; unsigned int flags; spinlock_t lock; #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME diff --git a/drivers/clk/ti/clk.c b/drivers/clk/ti/clk.c index f2117fef7c7d..9c75dcc9a534 100644 --- a/drivers/clk/ti/clk.c +++ b/drivers/clk/ti/clk.c @@ -449,10 +449,7 @@ void __init omap2_clk_legacy_provider_init(int index, void __iomem *mem) { struct clk_iomap *io; - io = memblock_alloc(sizeof(*io), SMP_CACHE_BYTES); - if (!io) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*io)); + io = memblock_alloc_or_panic(sizeof(*io), SMP_CACHE_BYTES); io->mem = mem; diff --git a/drivers/macintosh/smu.c b/drivers/macintosh/smu.c index a01bc5090cdf..a1534cc6c641 100644 --- a/drivers/macintosh/smu.c +++ b/drivers/macintosh/smu.c @@ -492,11 +492,7 @@ int __init smu_init (void) goto fail_np; } - smu = memblock_alloc(sizeof(struct smu_device), SMP_CACHE_BYTES); - if (!smu) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(struct smu_device)); - + smu = memblock_alloc_or_panic(sizeof(struct smu_device), SMP_CACHE_BYTES); spin_lock_init(&smu->lock); INIT_LIST_HEAD(&smu->cmd_list); INIT_LIST_HEAD(&smu->cmd_i2c_list); diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 0121100372b4..2eb718fbeffd 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -1126,13 +1126,7 @@ void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size) static void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align) { - void *ptr = memblock_alloc(size, align); - - if (!ptr) - panic("%s: Failed to allocate %llu bytes align=0x%llx\n", - __func__, size, align); - - 
return ptr; + return memblock_alloc_or_panic(size, align); } bool __init early_init_dt_verify(void *dt_virt, phys_addr_t dt_phys) diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c index 0fa0c0fd9a6a..c48c630074eb 100644 --- a/drivers/of/unittest.c +++ b/drivers/of/unittest.c @@ -3680,13 +3680,7 @@ static struct device_node *overlay_base_root; static void * __init dt_alloc_memory(u64 size, u64 align) { - void *ptr = memblock_alloc(size, align); - - if (!ptr) - panic("%s: Failed to allocate %llu bytes align=0x%llx\n", - __func__, size, align); - - return ptr; + return memblock_alloc_or_panic(size, align); } /* diff --git a/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c b/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c index 07ed33464d71..224ca8d42721 100644 --- a/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c +++ b/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c @@ -624,10 +624,10 @@ static int alloc_private_pages(struct hmm_buffer_object *bo) const gfp_t gfp = __GFP_NOWARN | __GFP_RECLAIM | __GFP_FS; int ret; - ret = alloc_pages_bulk_array(gfp, bo->pgnr, bo->pages); + ret = alloc_pages_bulk(gfp, bo->pgnr, bo->pages); if (ret != bo->pgnr) { free_pages_bulk_array(ret, bo->pages); - dev_err(atomisp_dev, "alloc_pages_bulk_array() failed\n"); + dev_err(atomisp_dev, "alloc_pages_bulk() failed\n"); return -ENOMEM; } diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index eb7387ee6ebd..11eda6b207f1 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -408,7 +408,7 @@ void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf) buf->dma_dir, 0); } - /* Undo alloc_pages_bulk_array() */ + /* Undo alloc_pages_bulk() */ for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0) __free_page(sg_page_iter_page(&sg_iter)); sg_free_append_table(&buf->table); @@ -431,8 +431,8 @@ static int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, return -ENOMEM; do { - filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill, - page_list); + filled = alloc_pages_bulk(GFP_KERNEL_ACCOUNT, to_fill, + page_list); if (!filled) { ret = -ENOMEM; goto err; @@ -1342,7 +1342,7 @@ static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf) { int i; - /* Undo alloc_pages_bulk_array() */ + /* Undo alloc_pages_bulk() */ for (i = 0; i < recv_buf->npages; i++) __free_page(recv_buf->page_list[i]); @@ -1361,9 +1361,9 @@ static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf, return -ENOMEM; for (;;) { - filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, - npages - done, - recv_buf->page_list + done); + filled = alloc_pages_bulk(GFP_KERNEL_ACCOUNT, + npages - done, + recv_buf->page_list + done); if (!filled) goto err; diff --git a/drivers/vfio/pci/virtio/migrate.c b/drivers/vfio/pci/virtio/migrate.c index ee54f4c17857..ba92bb4e9af9 100644 --- a/drivers/vfio/pci/virtio/migrate.c +++ b/drivers/vfio/pci/virtio/migrate.c @@ -77,8 +77,8 @@ static int virtiovf_add_migration_pages(struct virtiovf_data_buffer *buf, return -ENOMEM; do { - filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill, - page_list); + filled = alloc_pages_bulk(GFP_KERNEL_ACCOUNT, to_fill, + page_list); if (!filled) { ret = -ENOMEM; goto err; @@ -112,7 +112,7 @@ static void virtiovf_free_data_buffer(struct virtiovf_data_buffer *buf) { struct sg_page_iter sg_iter; - /* Undo alloc_pages_bulk_array() */ + /* Undo alloc_pages_bulk() */ for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0) __free_page(sg_page_iter_page(&sg_iter)); sg_free_append_table(&buf->table); diff --git 
a/fs/bcachefs/util.h b/fs/bcachefs/util.h index fb02c1c36004..415a5803b8f4 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -586,7 +586,7 @@ do { \ #define per_cpu_sum(_p) \ ({ \ - typeof(*_p) _ret = 0; \ + TYPEOF_UNQUAL(*_p) _ret = 0; \ \ int cpu; \ for_each_possible_cpu(cpu) \ diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b923d0cec61c..d70e9461fea8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -632,7 +632,7 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, for (allocated = 0; allocated < nr_pages;) { unsigned int last = allocated; - allocated = alloc_pages_bulk_array(gfp, nr_pages, page_array); + allocated = alloc_pages_bulk(gfp, nr_pages, page_array); if (unlikely(allocated == last)) { /* No progress, fail and do cleanup. */ for (int i = 0; i < allocated; i++) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 27b2fe7f735d..3b99b1e19371 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10110,7 +10110,6 @@ out_unlock_mmap: *span = bsi.highest_ppage - bsi.lowest_ppage + 1; sis->max = bsi.nr_pages; sis->pages = bsi.nr_pages - 1; - sis->highest_bit = bsi.nr_pages - 1; return bsi.nr_extents; } #else diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c index 0dd65cefce33..9c5aa9d53682 100644 --- a/fs/erofs/zutil.c +++ b/fs/erofs/zutil.c @@ -87,8 +87,8 @@ int z_erofs_gbuf_growsize(unsigned int nrpages) tmp_pages[j] = gbuf->pages[j]; do { last = j; - j = alloc_pages_bulk_array(GFP_KERNEL, nrpages, - tmp_pages); + j = alloc_pages_bulk(GFP_KERNEL, nrpages, + tmp_pages); if (last == j) goto out; } while (j != nrpages); diff --git a/fs/exec.c b/fs/exec.c index 98cb7ba9983c..1e1f79c514de 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -205,18 +205,10 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, /* * Avoid relying on expanding the stack down in GUP (which * does not work for STACK_GROWSUP anyway), and just do it - * by hand ahead of time. + * ahead of time. */ - if (write && pos < vma->vm_start) { - mmap_write_lock(mm); - ret = expand_downwards(vma, pos); - if (unlikely(ret < 0)) { - mmap_write_unlock(mm); - return NULL; - } - mmap_write_downgrade(mm); - } else - mmap_read_lock(mm); + if (!mmap_read_lock_maybe_expand(mm, vma, pos, write)) + return NULL; /* * We are doing an exec(). 
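[Note: the removed expand-then-downgrade sequence in get_arg_page() above moves behind mmap_read_lock_maybe_expand(). The helper is added in mm outside this excerpt; the body below is an assumed sketch of the semantics the caller now relies on, not the in-tree definition:

        bool mmap_read_lock_maybe_expand(struct mm_struct *mm,
                                         struct vm_area_struct *vma,
                                         unsigned long addr, bool write)
        {
                if (!write || addr >= vma->vm_start) {
                        mmap_read_lock(mm);
                        return true;
                }
                mmap_write_lock(mm);
                if (expand_downwards(vma, addr)) {
                        mmap_write_unlock(mm);
                        return false;
                }
                mmap_write_downgrade(mm);
                return true;
        }
]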
'current' is the process diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a2478c2afb3a..a9eddd782dbc 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -4043,7 +4043,6 @@ retry: cur_lblock = 1; /* force Empty message */ sis->max = cur_lblock; sis->pages = cur_lblock - 1; - sis->highest_bit = cur_lblock - 1; out: if (not_aligned) f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(%lu * N)", diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3cd99e2dc6ac..5980ac24c7a4 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2681,6 +2681,9 @@ static void wait_sb_inodes(struct super_block *sb) if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) continue; + if (mapping_writeback_indeterminate(mapping)) + continue; + spin_unlock_irq(&sb->s_inode_wblist_lock); spin_lock(&inode->i_lock); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 7d92a5479998..082ee374f694 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -415,89 +415,11 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) struct fuse_writepage_args { struct fuse_io_args ia; - struct rb_node writepages_entry; struct list_head queue_entry; - struct fuse_writepage_args *next; struct inode *inode; struct fuse_sync_bucket *bucket; }; -static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, - pgoff_t idx_from, pgoff_t idx_to) -{ - struct rb_node *n; - - n = fi->writepages.rb_node; - - while (n) { - struct fuse_writepage_args *wpa; - pgoff_t curr_index; - - wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry); - WARN_ON(get_fuse_inode(wpa->inode) != fi); - curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT; - if (idx_from >= curr_index + wpa->ia.ap.num_folios) - n = n->rb_right; - else if (idx_to < curr_index) - n = n->rb_left; - else - return wpa; - } - return NULL; -} - -/* - * Check if any page in a range is under writeback - */ -static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, - pgoff_t idx_to) -{ - struct fuse_inode *fi = get_fuse_inode(inode); - bool found; - - if (RB_EMPTY_ROOT(&fi->writepages)) - return false; - - spin_lock(&fi->lock); - found = fuse_find_writeback(fi, idx_from, idx_to); - spin_unlock(&fi->lock); - - return found; -} - -static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) -{ - return fuse_range_is_writeback(inode, index, index); -} - -/* - * Wait for page writeback to be completed. - * - * Since fuse doesn't rely on the VM writeback tracking, this has to - * use some other means. - */ -static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) -{ - struct fuse_inode *fi = get_fuse_inode(inode); - - wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); -} - -static inline bool fuse_folio_is_writeback(struct inode *inode, - struct folio *folio) -{ - pgoff_t last = folio_next_index(folio) - 1; - return fuse_range_is_writeback(inode, folio_index(folio), last); -} - -static void fuse_wait_on_folio_writeback(struct inode *inode, - struct folio *folio) -{ - struct fuse_inode *fi = get_fuse_inode(inode); - - wait_event(fi->page_waitq, !fuse_folio_is_writeback(inode, folio)); -} - /* * Wait for all pending writepages on the inode to finish. * @@ -886,13 +808,6 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio) ssize_t res; u64 attr_ver; - /* - * With the temporary pages that are used to complete writeback, we can - * have writeback that extends beyond the lifetime of the folio. 
So - * make sure we read a properly synced folio. - */ - fuse_wait_on_folio_writeback(inode, folio); - attr_ver = fuse_get_attr_version(fm->fc); /* Don't overflow end offset */ @@ -1003,17 +918,12 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) static void fuse_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; - struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); unsigned int max_pages, nr_pages; - pgoff_t first = readahead_index(rac); - pgoff_t last = first + readahead_count(rac) - 1; if (fuse_is_bad(inode)) return; - wait_event(fi->page_waitq, !fuse_range_is_writeback(inode, first, last)); - max_pages = min_t(unsigned int, fc->max_pages, fc->max_read / PAGE_SIZE); @@ -1172,7 +1082,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, int err; for (i = 0; i < ap->num_folios; i++) - fuse_wait_on_folio_writeback(inode, ap->folios[i]); + folio_wait_writeback(ap->folios[i]); fuse_write_args_fill(ia, ff, pos, count); ia->write.in.flags = fuse_write_flags(iocb); @@ -1629,7 +1539,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, return res; } } - if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { + if (!cuse && filemap_range_has_writeback(mapping, pos, (pos + count - 1))) { if (!write) inode_lock(inode); fuse_sync_writes(inode); @@ -1826,38 +1736,34 @@ static ssize_t fuse_splice_write(struct pipe_inode_info *pipe, struct file *out, static void fuse_writepage_free(struct fuse_writepage_args *wpa) { struct fuse_args_pages *ap = &wpa->ia.ap; - int i; if (wpa->bucket) fuse_sync_bucket_dec(wpa->bucket); - for (i = 0; i < ap->num_folios; i++) - folio_put(ap->folios[i]); - fuse_file_put(wpa->ia.ff, false); kfree(ap->folios); kfree(wpa); } -static void fuse_writepage_finish_stat(struct inode *inode, struct folio *folio) -{ - struct backing_dev_info *bdi = inode_to_bdi(inode); - - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - node_stat_sub_folio(folio, NR_WRITEBACK_TEMP); - wb_writeout_inc(&bdi->wb); -} - static void fuse_writepage_finish(struct fuse_writepage_args *wpa) { struct fuse_args_pages *ap = &wpa->ia.ap; struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); + struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - for (i = 0; i < ap->num_folios; i++) - fuse_writepage_finish_stat(inode, ap->folios[i]); + for (i = 0; i < ap->num_folios; i++) { + /* + * Benchmarks showed that ending writeback within the + * scope of the fi->lock alleviates xarray lock + * contention and noticeably improves performance. 
+ */ + folio_end_writeback(ap->folios[i]); + dec_wb_stat(&bdi->wb, WB_WRITEBACK); + wb_writeout_inc(&bdi->wb); + } wake_up(&fi->page_waitq); } @@ -1868,7 +1774,6 @@ static void fuse_send_writepage(struct fuse_mount *fm, __releases(fi->lock) __acquires(fi->lock) { - struct fuse_writepage_args *aux, *next; struct fuse_inode *fi = get_fuse_inode(wpa->inode); struct fuse_write_in *inarg = &wpa->ia.write.in; struct fuse_args *args = &wpa->ia.ap.args; @@ -1905,19 +1810,8 @@ __acquires(fi->lock) out_free: fi->writectr--; - rb_erase(&wpa->writepages_entry, &fi->writepages); fuse_writepage_finish(wpa); spin_unlock(&fi->lock); - - /* After rb_erase() aux request list is private */ - for (aux = wpa->next; aux; aux = next) { - next = aux->next; - aux->next = NULL; - fuse_writepage_finish_stat(aux->inode, - aux->ia.ap.folios[0]); - fuse_writepage_free(aux); - } - fuse_writepage_free(wpa); spin_lock(&fi->lock); } @@ -1945,43 +1839,6 @@ __acquires(fi->lock) } } -static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root, - struct fuse_writepage_args *wpa) -{ - pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT; - pgoff_t idx_to = idx_from + wpa->ia.ap.num_folios - 1; - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - - WARN_ON(!wpa->ia.ap.num_folios); - while (*p) { - struct fuse_writepage_args *curr; - pgoff_t curr_index; - - parent = *p; - curr = rb_entry(parent, struct fuse_writepage_args, - writepages_entry); - WARN_ON(curr->inode != wpa->inode); - curr_index = curr->ia.write.in.offset >> PAGE_SHIFT; - - if (idx_from >= curr_index + curr->ia.ap.num_folios) - p = &(*p)->rb_right; - else if (idx_to < curr_index) - p = &(*p)->rb_left; - else - return curr; - } - - rb_link_node(&wpa->writepages_entry, parent, p); - rb_insert_color(&wpa->writepages_entry, root); - return NULL; -} - -static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa) -{ - WARN_ON(fuse_insert_writeback(root, wpa)); -} - static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, int error) { @@ -2001,41 +1858,6 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, if (!fc->writeback_cache) fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY); spin_lock(&fi->lock); - rb_erase(&wpa->writepages_entry, &fi->writepages); - while (wpa->next) { - struct fuse_mount *fm = get_fuse_mount(inode); - struct fuse_write_in *inarg = &wpa->ia.write.in; - struct fuse_writepage_args *next = wpa->next; - - wpa->next = next->next; - next->next = NULL; - tree_insert(&fi->writepages, next); - - /* - * Skip fuse_flush_writepages() to make it easy to crop requests - * based on primary request size. - * - * 1st case (trivial): there are no concurrent activities using - * fuse_set/release_nowrite. Then we're on safe side because - * fuse_flush_writepages() would call fuse_send_writepage() - * anyway. - * - * 2nd case: someone called fuse_set_nowrite and it is waiting - * now for completion of all in-flight requests. This happens - * rarely and no more than once per page, so this should be - * okay. - * - * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle - * of fuse_set_nowrite..fuse_release_nowrite section. The fact - * that fuse_set_nowrite returned implies that all in-flight - * requests were completed along with all of their secondary - * requests. Further primary requests are blocked by negative - * writectr. 
Hence there cannot be any in-flight requests and - * no invocations of fuse_writepage_end() while we're in - * fuse_set_nowrite..fuse_release_nowrite section. - */ - fuse_send_writepage(fm, next, inarg->offset + inarg->size); - } fi->writectr--; fuse_writepage_finish(wpa); spin_unlock(&fi->lock); @@ -2122,19 +1944,16 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, } static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio, - struct folio *tmp_folio, uint32_t folio_index) + uint32_t folio_index) { struct inode *inode = folio->mapping->host; struct fuse_args_pages *ap = &wpa->ia.ap; - folio_copy(tmp_folio, folio); - - ap->folios[folio_index] = tmp_folio; + ap->folios[folio_index] = folio; ap->descs[folio_index].offset = 0; ap->descs[folio_index].length = PAGE_SIZE; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP); } static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio, @@ -2169,18 +1988,12 @@ static int fuse_writepage_locked(struct folio *folio) struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_writepage_args *wpa; struct fuse_args_pages *ap; - struct folio *tmp_folio; struct fuse_file *ff; - int error = -ENOMEM; + int error = -EIO; - tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0); - if (!tmp_folio) - goto err; - - error = -EIO; ff = fuse_write_file_get(fi); if (!ff) - goto err_nofile; + goto err; wpa = fuse_writepage_args_setup(folio, ff); error = -ENOMEM; @@ -2191,22 +2004,17 @@ static int fuse_writepage_locked(struct folio *folio) ap->num_folios = 1; folio_start_writeback(folio); - fuse_writepage_args_page_fill(wpa, folio, tmp_folio, 0); + fuse_writepage_args_page_fill(wpa, folio, 0); spin_lock(&fi->lock); - tree_insert(&fi->writepages, wpa); list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); - folio_end_writeback(folio); - return 0; err_writepage_args: fuse_file_put(ff, false); -err_nofile: - folio_put(tmp_folio); err: mapping_set_error(folio->mapping, error); return error; @@ -2216,7 +2024,6 @@ struct fuse_fill_wb_data { struct fuse_writepage_args *wpa; struct fuse_file *ff; struct inode *inode; - struct folio **orig_folios; unsigned int max_folios; }; @@ -2251,69 +2058,11 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) struct fuse_writepage_args *wpa = data->wpa; struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); - int num_folios = wpa->ia.ap.num_folios; - int i; spin_lock(&fi->lock); list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); - - for (i = 0; i < num_folios; i++) - folio_end_writeback(data->orig_folios[i]); -} - -/* - * Check under fi->lock if the page is under writeback, and insert it onto the - * rb_tree if not. Otherwise iterate auxiliary write requests, to see if there's - * one already added for a page at this offset. If there's none, then insert - * this new request onto the auxiliary list, otherwise reuse the existing one by - * swapping the new temp page with the old one. 
- */ -static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, - struct folio *folio) -{ - struct fuse_inode *fi = get_fuse_inode(new_wpa->inode); - struct fuse_writepage_args *tmp; - struct fuse_writepage_args *old_wpa; - struct fuse_args_pages *new_ap = &new_wpa->ia.ap; - - WARN_ON(new_ap->num_folios != 0); - new_ap->num_folios = 1; - - spin_lock(&fi->lock); - old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa); - if (!old_wpa) { - spin_unlock(&fi->lock); - return true; - } - - for (tmp = old_wpa->next; tmp; tmp = tmp->next) { - pgoff_t curr_index; - - WARN_ON(tmp->inode != new_wpa->inode); - curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT; - if (curr_index == folio->index) { - WARN_ON(tmp->ia.ap.num_folios != 1); - swap(tmp->ia.ap.folios[0], new_ap->folios[0]); - break; - } - } - - if (!tmp) { - new_wpa->next = old_wpa->next; - old_wpa->next = new_wpa; - } - - spin_unlock(&fi->lock); - - if (tmp) { - fuse_writepage_finish_stat(new_wpa->inode, - folio); - fuse_writepage_free(new_wpa); - } - - return false; } static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio, @@ -2322,15 +2071,6 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio, { WARN_ON(!ap->num_folios); - /* - * Being under writeback is unlikely but possible. For example direct - * read to an mmaped fuse file will set the page dirty twice; once when - * the pages are faulted with get_user_pages(), and then after the read - * completed. - */ - if (fuse_folio_is_writeback(data->inode, folio)) - return true; - /* Reached max pages */ if (ap->num_folios == fc->max_pages) return true; @@ -2340,7 +2080,7 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio, return true; /* Discontinuity */ - if (data->orig_folios[ap->num_folios - 1]->index + 1 != folio_index(folio)) + if (ap->folios[ap->num_folios - 1]->index + 1 != folio_index(folio)) return true; /* Need to grow the pages array? If so, did the expansion fail? */ @@ -2359,7 +2099,6 @@ static int fuse_writepages_fill(struct folio *folio, struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); - struct folio *tmp_folio; int err; if (!data->ff) { @@ -2374,54 +2113,23 @@ static int fuse_writepages_fill(struct folio *folio, data->wpa = NULL; } - err = -ENOMEM; - tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0); - if (!tmp_folio) - goto out_unlock; - - /* - * The page must not be redirtied until the writeout is completed - * (i.e. userspace has sent a reply to the write request). Otherwise - * there could be more than one temporary page instance for each real - * page. - * - * This is ensured by holding the page lock in page_mkwrite() while - * checking fuse_page_is_writeback(). We already hold the page lock - * since clear_page_dirty_for_io() and keep it held until we add the - * request to the fi->writepages list and increment ap->num_folios. - * After this fuse_page_is_writeback() will indicate that the page is - * under writeback, so we can release the page lock. 
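[Note: with the temporary-page machinery removed, the fill path attaches the real folio to the write request, and the generic per-folio writeback flag replaces the rb-tree tracking that the deleted comments here describe. Condensed from the surrounding hunks, as a sketch:

        /* new fill path: no temp folio, no fi->lock around the counter */
        folio_start_writeback(folio);
        ap->folios[ap->num_folios] = folio;
        ap->descs[ap->num_folios].offset = 0;
        ap->descs[ap->num_folios].length = PAGE_SIZE;
        ap->num_folios++;
]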
- */ if (data->wpa == NULL) { err = -ENOMEM; wpa = fuse_writepage_args_setup(folio, data->ff); - if (!wpa) { - folio_put(tmp_folio); + if (!wpa) goto out_unlock; - } fuse_file_get(wpa->ia.ff); data->max_folios = 1; ap = &wpa->ia.ap; } folio_start_writeback(folio); - fuse_writepage_args_page_fill(wpa, folio, tmp_folio, ap->num_folios); - data->orig_folios[ap->num_folios] = folio; + fuse_writepage_args_page_fill(wpa, folio, ap->num_folios); err = 0; - if (data->wpa) { - /* - * Protected by fi->lock against concurrent access by - * fuse_page_is_writeback(). - */ - spin_lock(&fi->lock); - ap->num_folios++; - spin_unlock(&fi->lock); - } else if (fuse_writepage_add(wpa, folio)) { + ap->num_folios++; + if (!data->wpa) data->wpa = wpa; - } else { - folio_end_writeback(folio); - } out_unlock: folio_unlock(folio); @@ -2448,13 +2156,6 @@ static int fuse_writepages(struct address_space *mapping, data.wpa = NULL; data.ff = NULL; - err = -ENOMEM; - data.orig_folios = kcalloc(fc->max_pages, - sizeof(struct folio *), - GFP_NOFS); - if (!data.orig_folios) - goto out; - err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data); if (data.wpa) { WARN_ON(!data.wpa->ia.ap.num_folios); @@ -2463,7 +2164,6 @@ static int fuse_writepages(struct address_space *mapping, if (data.ff) fuse_file_put(data.ff, false); - kfree(data.orig_folios); out: return err; } @@ -2488,8 +2188,6 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping, if (IS_ERR(folio)) goto error; - fuse_wait_on_page_writeback(mapping->host, folio->index); - if (folio_test_uptodate(folio) || len >= folio_size(folio)) goto success; /* @@ -2552,13 +2250,9 @@ static int fuse_launder_folio(struct folio *folio) { int err = 0; if (folio_clear_dirty_for_io(folio)) { - struct inode *inode = folio->mapping->host; - - /* Serialize with pending writeback for the same page */ - fuse_wait_on_page_writeback(inode, folio->index); err = fuse_writepage_locked(folio); if (!err) - fuse_wait_on_page_writeback(inode, folio->index); + folio_wait_writeback(folio); } return err; } @@ -2602,7 +2296,7 @@ static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf) return VM_FAULT_NOPAGE; } - fuse_wait_on_folio_writeback(inode, folio); + folio_wait_writeback(folio); return VM_FAULT_LOCKED; } @@ -3420,9 +3114,12 @@ static const struct address_space_operations fuse_file_aops = { void fuse_init_file_inode(struct inode *inode, unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); inode->i_fop = &fuse_file_operations; inode->i_data.a_ops = &fuse_file_aops; + if (fc->writeback_cache) + mapping_set_writeback_indeterminate(&inode->i_data); INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); @@ -3430,7 +3127,6 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) fi->iocachectr = 0; init_waitqueue_head(&fi->page_waitq); init_waitqueue_head(&fi->direct_io_waitq); - fi->writepages = RB_ROOT; if (IS_ENABLED(CONFIG_FUSE_DAX)) fuse_dax_inode_init(inode, flags); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 74744c6f2860..23736c5c64c1 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -141,9 +141,6 @@ struct fuse_inode { /* waitq for direct-io completion */ wait_queue_head_t direct_io_waitq; - - /* List of writepage requestst (pending or sent) */ - struct rb_root writepages; }; /* readdir cache (directory only) */ diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index fc1ae5132127..0fc179a59830 100644 --- a/fs/hugetlbfs/inode.c +++ 
b/fs/hugetlbfs/inode.c @@ -99,7 +99,6 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = { static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); - struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); loff_t len, vma_len; int ret; struct hstate *h = hstate_file(file); @@ -116,10 +115,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND); vma->vm_ops = &hugetlb_vm_ops; - ret = seal_check_write(info->seals, vma); - if (ret) - return ret; - /* * page based offset in vm_pgoff could be sufficiently large to * overflow a loff_t when converted to byte offset. This can @@ -819,7 +814,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * folios in these areas, we need to consume the reserves * to keep reservation accounting consistent. */ - folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0); + folio = alloc_hugetlb_folio(&pseudo_vma, addr, false); if (IS_ERR(folio)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); error = PTR_ERR(folio); diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c index 5fc0ac36dee3..b90d0eda9e51 100644 --- a/fs/iomap/swapfile.c +++ b/fs/iomap/swapfile.c @@ -189,7 +189,6 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, *pagespan = 1 + isi.highest_ppage - isi.lowest_ppage; sis->max = isi.nr_pages; sis->pages = isi.nr_pages - 1; - sis->highest_bit = isi.nr_pages - 1; return isi.nr_extents; } EXPORT_SYMBOL_GPL(iomap_swapfile_activate); diff --git a/fs/open.c b/fs/open.c index e6911101fe71..0f75e220b700 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1504,7 +1504,7 @@ static int filp_flush(struct file *filp, fl_owner_t id) { int retval = 0; - if (CHECK_DATA_CORRUPTION(file_count(filp) == 0, + if (CHECK_DATA_CORRUPTION(file_count(filp) == 0, filp, "VFS: Close: file count is 0 (f_op=%ps)", filp->f_op)) { return 0; diff --git a/fs/proc/base.c b/fs/proc/base.c index 0edf14a9840e..a50b222a5917 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3269,6 +3269,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { struct mm_struct *mm; + int ret = 0; mm = get_task_mm(task); if (mm) { @@ -3276,6 +3277,16 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "ksm_zero_pages %ld\n", mm_ksm_zero_pages(mm)); seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages); seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm)); + seq_printf(m, "ksm_merge_any: %s\n", + test_bit(MMF_VM_MERGE_ANY, &mm->flags) ? "yes" : "no"); + ret = mmap_read_lock_killable(mm); + if (ret) { + mmput(mm); + return ret; + } + seq_printf(m, "ksm_mergeable: %s\n", + ksm_process_mergeable(mm) ? 
"yes" : "no"); + mmap_read_unlock(mm); mmput(mm); } diff --git a/fs/splice.c b/fs/splice.c index 2898fa1e9e63..28cfa63aa236 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -342,7 +342,7 @@ ssize_t copy_splice_read(struct file *in, loff_t *ppos, return -ENOMEM; pages = (struct page **)(bv + npages); - npages = alloc_pages_bulk_array(GFP_USER, npages, pages); + npages = alloc_pages_bulk(GFP_USER, npages, pages); if (!npages) { kfree(bv); return -ENOMEM; diff --git a/fs/super.c b/fs/super.c index c9c7223bc2a2..5a7db4a556e3 100644 --- a/fs/super.c +++ b/fs/super.c @@ -647,7 +647,7 @@ void generic_shutdown_super(struct super_block *sb) */ fscrypt_destroy_keyring(sb); - if (CHECK_DATA_CORRUPTION(!list_empty(&sb->s_inodes), + if (CHECK_DATA_CORRUPTION(!list_empty(&sb->s_inodes), NULL, "VFS: Busy inodes after unmount of %s (%s)", sb->s_id, sb->s_type->name)) { /* diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index aa63b8efd782..82db3ab0e8b4 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -395,8 +395,8 @@ xfs_buf_alloc_pages( for (;;) { long last = filled; - filled = alloc_pages_bulk_array(gfp_mask, bp->b_page_count, - bp->b_pages); + filled = alloc_pages_bulk(gfp_mask, bp->b_page_count, + bp->b_pages); if (filled == bp->b_page_count) { XFS_STATS_INC(bp->b_mount, xb_page_found); break; diff --git a/include/acpi/acpi_numa.h b/include/acpi/acpi_numa.h index b5f594754a9e..99b960bd473c 100644 --- a/include/acpi/acpi_numa.h +++ b/include/acpi/acpi_numa.h @@ -17,11 +17,16 @@ extern int node_to_pxm(int); extern int acpi_map_pxm_to_node(int); extern unsigned char acpi_srat_revision; extern void disable_srat(void); +extern int fix_pxm_node_maps(int max_nid); extern void bad_srat(void); extern int srat_disabled(void); #else /* CONFIG_ACPI_NUMA */ +static inline int fix_pxm_node_maps(int max_nid) +{ + return 0; +} static inline void disable_srat(void) { } diff --git a/include/asm-generic/early_ioremap.h b/include/asm-generic/early_ioremap.h index 9d0479f50f97..5db59a1efb65 100644 --- a/include/asm-generic/early_ioremap.h +++ b/include/asm-generic/early_ioremap.h @@ -35,7 +35,7 @@ extern void early_ioremap_reset(void); /* * Early copy from unmapped memory to kernel mapped memory. */ -extern void copy_from_early_mem(void *dest, phys_addr_t src, +extern int copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size); #else diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index 94cbd50cc870..02aeca21479a 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -6,6 +6,19 @@ #include #include +/* + * __percpu_qual is the qualifier for the percpu named address space. + * + * Most arches use generic named address space for percpu variables but + * some arches define percpu variables in different named address space + * (on the x86 arch, percpu variable may be declared as being relative + * to the %fs or %gs segments using __seg_fs or __seg_gs named address + * space qualifier). 
+ */ +#ifndef __percpu_qual +# define __percpu_qual +#endif + #ifdef CONFIG_SMP /* @@ -74,7 +87,7 @@ do { \ #define raw_cpu_generic_add_return(pcp, val) \ ({ \ - typeof(pcp) *__p = raw_cpu_ptr(&(pcp)); \ + TYPEOF_UNQUAL(pcp) *__p = raw_cpu_ptr(&(pcp)); \ \ *__p += val; \ *__p; \ @@ -82,8 +95,8 @@ do { \ #define raw_cpu_generic_xchg(pcp, nval) \ ({ \ - typeof(pcp) *__p = raw_cpu_ptr(&(pcp)); \ - typeof(pcp) __ret; \ + TYPEOF_UNQUAL(pcp) *__p = raw_cpu_ptr(&(pcp)); \ + TYPEOF_UNQUAL(pcp) __ret; \ __ret = *__p; \ *__p = nval; \ __ret; \ @@ -91,7 +104,7 @@ do { \ #define __cpu_fallback_try_cmpxchg(pcp, ovalp, nval, _cmpxchg) \ ({ \ - typeof(pcp) __val, __old = *(ovalp); \ + TYPEOF_UNQUAL(pcp) __val, __old = *(ovalp); \ __val = _cmpxchg(pcp, __old, nval); \ if (__val != __old) \ *(ovalp) = __val; \ @@ -100,8 +113,8 @@ do { \ #define raw_cpu_generic_try_cmpxchg(pcp, ovalp, nval) \ ({ \ - typeof(pcp) *__p = raw_cpu_ptr(&(pcp)); \ - typeof(pcp) __val = *__p, ___old = *(ovalp); \ + TYPEOF_UNQUAL(pcp) *__p = raw_cpu_ptr(&(pcp)); \ + TYPEOF_UNQUAL(pcp) __val = *__p, ___old = *(ovalp); \ bool __ret; \ if (__val == ___old) { \ *__p = nval; \ @@ -115,14 +128,14 @@ do { \ #define raw_cpu_generic_cmpxchg(pcp, oval, nval) \ ({ \ - typeof(pcp) __old = (oval); \ + TYPEOF_UNQUAL(pcp) __old = (oval); \ raw_cpu_generic_try_cmpxchg(pcp, &__old, nval); \ __old; \ }) #define __this_cpu_generic_read_nopreempt(pcp) \ ({ \ - typeof(pcp) ___ret; \ + TYPEOF_UNQUAL(pcp) ___ret; \ preempt_disable_notrace(); \ ___ret = READ_ONCE(*raw_cpu_ptr(&(pcp))); \ preempt_enable_notrace(); \ @@ -131,7 +144,7 @@ do { \ #define __this_cpu_generic_read_noirq(pcp) \ ({ \ - typeof(pcp) ___ret; \ + TYPEOF_UNQUAL(pcp) ___ret; \ unsigned long ___flags; \ raw_local_irq_save(___flags); \ ___ret = raw_cpu_generic_read(pcp); \ @@ -141,7 +154,7 @@ do { \ #define this_cpu_generic_read(pcp) \ ({ \ - typeof(pcp) __ret; \ + TYPEOF_UNQUAL(pcp) __ret; \ if (__native_word(pcp)) \ __ret = __this_cpu_generic_read_nopreempt(pcp); \ else \ @@ -160,7 +173,7 @@ do { \ #define this_cpu_generic_add_return(pcp, val) \ ({ \ - typeof(pcp) __ret; \ + TYPEOF_UNQUAL(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ __ret = raw_cpu_generic_add_return(pcp, val); \ @@ -170,7 +183,7 @@ do { \ #define this_cpu_generic_xchg(pcp, nval) \ ({ \ - typeof(pcp) __ret; \ + TYPEOF_UNQUAL(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ __ret = raw_cpu_generic_xchg(pcp, nval); \ @@ -190,7 +203,7 @@ do { \ #define this_cpu_generic_cmpxchg(pcp, oval, nval) \ ({ \ - typeof(pcp) __ret; \ + TYPEOF_UNQUAL(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ __ret = raw_cpu_generic_cmpxchg(pcp, oval, nval); \ diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 7c48f5fbf8aa..892ece4558a2 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -109,8 +109,7 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte_page) { struct ptdesc *ptdesc = page_ptdesc(pte_page); - pagetable_pte_dtor(ptdesc); - pagetable_free(ptdesc); + pagetable_dtor_free(ptdesc); } @@ -153,8 +152,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) struct ptdesc *ptdesc = virt_to_ptdesc(pmd); BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); - pagetable_pmd_dtor(ptdesc); - pagetable_free(ptdesc); + pagetable_dtor_free(ptdesc); } #endif @@ -202,8 +200,7 @@ static inline void __pud_free(struct mm_struct *mm, pud_t *pud) struct ptdesc *ptdesc = virt_to_ptdesc(pud); BUG_ON((unsigned 
long)pud & (PAGE_SIZE-1)); - pagetable_pud_dtor(ptdesc); - pagetable_free(ptdesc); + pagetable_dtor_free(ptdesc); } #ifndef __HAVE_ARCH_PUD_FREE @@ -215,10 +212,82 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) #endif /* CONFIG_PGTABLE_LEVELS > 3 */ +#if CONFIG_PGTABLE_LEVELS > 4 + +static inline p4d_t *__p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) +{ + gfp_t gfp = GFP_PGTABLE_USER; + struct ptdesc *ptdesc; + + if (mm == &init_mm) + gfp = GFP_PGTABLE_KERNEL; + gfp &= ~__GFP_HIGHMEM; + + ptdesc = pagetable_alloc_noprof(gfp, 0); + if (!ptdesc) + return NULL; + + pagetable_p4d_ctor(ptdesc); + return ptdesc_address(ptdesc); +} +#define __p4d_alloc_one(...) alloc_hooks(__p4d_alloc_one_noprof(__VA_ARGS__)) + +#ifndef __HAVE_ARCH_P4D_ALLOC_ONE +static inline p4d_t *p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) +{ + return __p4d_alloc_one_noprof(mm, addr); +} +#define p4d_alloc_one(...) alloc_hooks(p4d_alloc_one_noprof(__VA_ARGS__)) +#endif + +static inline void __p4d_free(struct mm_struct *mm, p4d_t *p4d) +{ + struct ptdesc *ptdesc = virt_to_ptdesc(p4d); + + BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); + pagetable_dtor_free(ptdesc); +} + +#ifndef __HAVE_ARCH_P4D_FREE +static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) +{ + if (!mm_p4d_folded(mm)) + __p4d_free(mm, p4d); +} +#endif + +#endif /* CONFIG_PGTABLE_LEVELS > 4 */ + +static inline pgd_t *__pgd_alloc_noprof(struct mm_struct *mm, unsigned int order) +{ + gfp_t gfp = GFP_PGTABLE_USER; + struct ptdesc *ptdesc; + + if (mm == &init_mm) + gfp = GFP_PGTABLE_KERNEL; + gfp &= ~__GFP_HIGHMEM; + + ptdesc = pagetable_alloc_noprof(gfp, order); + if (!ptdesc) + return NULL; + + pagetable_pgd_ctor(ptdesc); + return ptdesc_address(ptdesc); +} +#define __pgd_alloc(...) alloc_hooks(__pgd_alloc_noprof(__VA_ARGS__)) + +static inline void __pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + struct ptdesc *ptdesc = virt_to_ptdesc(pgd); + + BUG_ON((unsigned long)pgd & (PAGE_SIZE-1)); + pagetable_dtor_free(ptdesc); +} + #ifndef __HAVE_ARCH_PGD_FREE static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - pagetable_free(virt_to_ptdesc(pgd)); + __pgd_free(mm, pgd); } #endif diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 709830274b75..e402aef79c93 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -153,8 +153,9 @@ * * Useful if your architecture has non-page page directories. * - * When used, an architecture is expected to provide __tlb_remove_table() - * which does the actual freeing of these pages. + * When used, an architecture is expected to provide __tlb_remove_table() or + * use the generic __tlb_remove_table(), which does the actual freeing of these + * pages. * * MMU_GATHER_RCU_TABLE_FREE * @@ -207,16 +208,31 @@ struct mmu_table_batch { #define MAX_TABLE_BATCH \ ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *)) +#ifndef __HAVE_ARCH_TLB_REMOVE_TABLE +static inline void __tlb_remove_table(void *table) +{ + struct ptdesc *ptdesc = (struct ptdesc *)table; + + pagetable_dtor_free(ptdesc); +} +#endif + extern void tlb_remove_table(struct mmu_gather *tlb, void *table); -#else /* !CONFIG_MMU_GATHER_HAVE_TABLE_FREE */ +#else /* !CONFIG_MMU_GATHER_TABLE_FREE */ +static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page); /* * Without MMU_GATHER_TABLE_FREE the architecture is assumed to have page based * page directories and we can use the normal page batching to free them. 
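[Note: the repeated pagetable_*_dtor() plus pagetable_free() pairs above collapse into one helper, defined outside this excerpt. A sketch of its presumed shape, consistent with the call sites in these hunks:

        static inline void pagetable_dtor_free(struct ptdesc *ptdesc)
        {
                pagetable_dtor(ptdesc);
                pagetable_free(ptdesc);
        }

The generic __tlb_remove_table() above then performs the same teardown, only deferred through the mmu_gather batching.]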
*/ -#define tlb_remove_table(tlb, page) tlb_remove_page((tlb), (page)) +static inline void tlb_remove_table(struct mmu_gather *tlb, void *table) +{ + struct page *page = (struct page *)table; + pagetable_dtor(page_ptdesc(page)); + tlb_remove_page(tlb, page); +} #endif /* CONFIG_MMU_GATHER_TABLE_FREE */ #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 0bbbe537c5f9..a946e0203e6d 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -224,9 +224,14 @@ static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {} #define alloc_hooks_tag(_tag, _do_alloc) \ ({ \ - struct alloc_tag * __maybe_unused _old = alloc_tag_save(_tag); \ - typeof(_do_alloc) _res = _do_alloc; \ - alloc_tag_restore(_tag, _old); \ + typeof(_do_alloc) _res; \ + if (mem_alloc_profiling_enabled()) { \ + struct alloc_tag * __maybe_unused _old; \ + _old = alloc_tag_save(_tag); \ + _res = _do_alloc; \ + alloc_tag_restore(_tag, _old); \ + } else \ + _res = _do_alloc; \ _res; \ }) diff --git a/include/linux/bug.h b/include/linux/bug.h index 348acf2558f3..a9948a9f1093 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -73,15 +73,23 @@ static inline void generic_bug_clear_once(void) {} #endif /* CONFIG_GENERIC_BUG */ +#ifdef CONFIG_PRINTK +void mem_dump_obj(void *object); +#else +static inline void mem_dump_obj(void *object) {} +#endif + /* * Since detected data corruption should stop operation on the affected * structures. Return value must be checked and sanely acted on by caller. */ static inline __must_check bool check_data_corruption(bool v) { return v; } -#define CHECK_DATA_CORRUPTION(condition, fmt, ...) \ +#define CHECK_DATA_CORRUPTION(condition, addr, fmt, ...) \ check_data_corruption(({ \ bool corruption = unlikely(condition); \ if (corruption) { \ + if (addr) \ + mem_dump_obj(addr); \ if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) { \ pr_err(fmt, ##__VA_ARGS__); \ BUG(); \ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 240c632c5b95..567a7af257d1 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -336,6 +336,19 @@ static inline void *offset_to_ptr(const int *off) */ #define prevent_tail_call_optimization() mb() +/* + * Define TYPEOF_UNQUAL() to use __typeof_unqual__() as typeof + * operator when available, to return unqualified type of the exp. + * + * XXX: Remove test for __CHECKER__ once + * sparse learns about __typeof_unqual__. + */ +#if defined(CONFIG_CC_HAS_TYPEOF_UNQUAL) && !defined(__CHECKER__) +# define TYPEOF_UNQUAL(exp) __typeof_unqual__(exp) +#else +# define TYPEOF_UNQUAL(exp) __typeof__(exp) +#endif + #include #endif /* __LINUX_COMPILER_H */ diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 981cc3d7e3aa..5d6544545658 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -57,7 +57,7 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } # define __user BTF_TYPE_TAG(user) # endif # define __iomem -# define __percpu BTF_TYPE_TAG(percpu) +# define __percpu __percpu_qual BTF_TYPE_TAG(percpu) # define __rcu BTF_TYPE_TAG(rcu) # define __chk_user_ptr(x) (void)0 diff --git a/include/linux/damon.h b/include/linux/damon.h index a67f2c4940e9..af525252b853 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -193,11 +193,16 @@ struct damos_quota_goal { * size quota is set, DAMON tries to apply the action only up to &sz bytes * within &reset_interval. 
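[Note: the conversion described in the reworked comment below boils down to taking the minimum of the per-type quotas. A pseudo-logic sketch of the idea, with illustrative names, not the in-tree implementation:

        /* effective quota = min(size quota, time quota as size, goal quota) */
        effective_sz = quota->sz ?: ULONG_MAX;
        if (quota->ms)
                effective_sz = min(effective_sz,
                                   estimated_throughput() * quota->ms);
        if (!list_empty(&quota->goals))
                effective_sz = min(effective_sz, goal_feedback_size_quota());
]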
* - * Internally, the time quota is transformed to a size quota using estimated - * throughput of the scheme's action. DAMON then compares it against &sz and - * uses smaller one as the effective quota. + * To reconcile the different types of quotas and goals, DAMON internally + * converts those into one single size quota called the "effective quota". + * DAMON internally uses it as the only real quota. The conversion is made as + * follows. * - * If @goals is not empt, DAMON calculates yet another size quota based on the + * The time quota is transformed to a size quota using estimated throughput of + * the scheme's action. DAMON then compares it against &sz and uses the + * smaller one as the effective quota. + * + * If @goals is not empty, DAMON calculates yet another size quota based on the * goals using its internal feedback loop algorithm, for every @reset_interval. * Then, if the new size quota is smaller than the effective quota, it uses the * new size quota as the effective quota. @@ -286,13 +291,33 @@ struct damos_watermarks { * @sz_tried: Total size of regions that the scheme is tried to be applied. * @nr_applied: Total number of regions that the scheme is applied. * @sz_applied: Total size of regions that the scheme is applied. + * @sz_ops_filter_passed: + * Total bytes that passed ops layer-handled DAMOS filters. * @qt_exceeds: Total number of times the quota of the scheme has exceeded. + * + * "Tried an action to a region" in this context means the DAMOS core logic + * determined the region as eligible to apply the action. The access pattern + * (&struct damos_access_pattern), quotas (&struct damos_quota), watermarks + * (&struct damos_watermarks) and filters (&struct damos_filter) that are + * handled by the core logic can affect this. The core logic asks the + * operation set (&struct damon_operations) to apply the action to the region. + * + * "Applied an action to a region" in this context means the operation set + * (&struct damon_operations) successfully applied the action to the region, at + * least to a part of the region. The filters (&struct damos_filter) that are + * handled at the operation set layer, the type of the action, and the pages of + * the region can affect this. For example, if a filter is set to exclude + * anonymous pages and the region has only anonymous pages, applying the + * action to the region will fail. If the action is &DAMOS_PAGEOUT and all + * pages of the region are already paged out, applying the action to the + * region will also fail. */ struct damos_stat { unsigned long nr_tried; unsigned long sz_tried; unsigned long nr_applied; unsigned long sz_applied; + unsigned long sz_ops_filter_passed; unsigned long qt_exceeds; }; @@ -327,8 +352,9 @@ enum damos_filter_type { /** * struct damos_filter - DAMOS action target memory filter. - * @type: Type of the page. - * @matching: If the matching page should filtered out or in. + * @type: Type of the target memory. + * @matching: Whether this is for @type-matching memory. + * @allow: Whether to include or exclude the @matching memory. * @memcg_id: Memcg id of the question if @type is DAMOS_FILTER_MEMCG. * @addr_range: Address range if @type is DAMOS_FILTER_TYPE_ADDR. * @target_idx: Index of the &struct damon_target of @@ -337,13 +363,15 @@ enum damos_filter_type { * @list: List head for siblings. * * Before applying the &damos->action to a memory region, DAMOS checks if each - * page of the region matches to this and avoid applying the action if so.
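[Note: with @allow, a filter states include/exclude directly instead of the old filter-out-only semantics. A usage sketch against the updated damos_new_filter() prototype (changed further below); DAMOS_FILTER_TYPE_ANON is assumed from the enum elided above, and error handling is omitted:

        /* skip anonymous pages: match anon memory, do not allow the action */
        struct damos_filter *f = damos_new_filter(DAMOS_FILTER_TYPE_ANON,
                                                  true /* matching */,
                                                  false /* allow */);
        if (f)
                damos_add_filter(scheme, f);
]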
+struct damon_ctx; +struct damos; + +/** + * struct damos_walk_control - Control damos_walk(). + * + * @walk_fn: Function to be called back for each region. + * @data: Data that will be passed to walk functions. + * + * Control damos_walk(), which requests a specific kdamond to invoke the given + * function on each region that is eligible for the actions of the kdamond's + * schemes. Refer to damos_walk() for more details. + */ +struct damos_walk_control { + void (*walk_fn)(void *data, struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *s, unsigned long sz_filter_passed); + void *data; +/* private: internal use only */ + /* informs if the kdamond finished handling of the walk request */ + struct completion completion; + /* informs if the walk is canceled. */ + bool canceled; +}; + /** * struct damos_access_pattern - Target access pattern of the given scheme. * @min_sz_region: Minimum size of target regions. @@ -415,6 +468,8 @@ struct damos { * @action */ unsigned long next_apply_sis; + /* informs if ongoing DAMOS walk for this scheme is finished */ + bool walk_completed; /* public: */ struct damos_quota quota; struct damos_watermarks wmarks; @@ -442,8 +497,6 @@ enum damon_ops_id { NR_DAMON_OPS, }; -struct damon_ctx; - /** * struct damon_operations - Monitoring operations for given use cases. * @@ -487,7 +540,8 @@ struct damon_ctx; * @apply_scheme is called from @kdamond when a region for user provided * DAMON-based operation scheme is found. It should apply the scheme's action * to the region and return bytes of the region that the action is successfully - * applied. + * applied. It should also report how many bytes of the region have passed + * the filters (&struct damos_filter) that it handles by itself. * @target_valid should check whether the target is still valid for the * monitoring. * @cleanup is called from @kdamond just before its termination. @@ -504,7 +558,7 @@ struct damon_operations { struct damos *scheme); unsigned long (*apply_scheme)(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, - struct damos *scheme); + struct damos *scheme, unsigned long *sz_filter_passed); bool (*target_valid)(struct damon_target *t); void (*cleanup)(struct damon_ctx *context); }; @@ -552,6 +606,27 @@ struct damon_callback { void (*before_terminate)(struct damon_ctx *context); }; +/* + * struct damon_call_control - Control damon_call(). + * + * @fn: Function to be called back. + * @data: Data that will be passed to @fn. + * @return_code: Return code from @fn invocation. + * + * Control damon_call(), which requests a specific kdamond to invoke a given + * function. Refer to damon_call() for more details. + */ +struct damon_call_control { + int (*fn)(void *data); + void *data; + int return_code; +/* private: internal use only */ + /* informs if the kdamond finished handling of the request */ + struct completion completion; + /* informs if the kdamond canceled @fn invocation */ + bool canceled; +};
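The intended damon_call() pattern can be sketched as follows: fill a struct damon_call_control with a callback and its argument, pass it to damon_call(), and read back @return_code afterwards. Judging by the completion member, the call is assumed to wait until the kdamond has run the callback; the callback body below is illustrative only, not part of the patch.

/* Illustrative callback; runs from the target kdamond's context. */
static int example_fn(void *data)
{
	int *counter = data;

	(*counter)++;
	return 0;
}

static int example_call(struct damon_ctx *ctx)
{
	int counter = 0;
	struct damon_call_control control = {
		.fn = example_fn,
		.data = &counter,
	};
	int err;

	err = damon_call(ctx, &control);
	if (err)
		return err;
	/* @return_code carries example_fn()'s return value. */
	return control.return_code;
}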
+ /** * struct damon_attrs - Monitoring attributes for accuracy/overhead control. * @@ -632,6 +707,12 @@ struct damon_ctx { /* for scheme quotas prioritization */ unsigned long *regions_score_histogram; + struct damon_call_control *call_control; + struct mutex call_control_lock; + + struct damos_walk_control *walk_control; + struct mutex walk_control_lock; + /* public: */ struct task_struct *kdamond; struct mutex kdamond_lock; @@ -725,7 +806,7 @@ void damon_update_region_access_rate(struct damon_region *r, bool accessed, struct damon_attrs *attrs); struct damos_filter *damos_new_filter(enum damos_filter_type type, - bool matching); + bool matching, bool allow); void damos_add_filter(struct damos *s, struct damos_filter *f); void damos_destroy_filter(struct damos_filter *f); @@ -779,6 +860,9 @@ static inline unsigned int damon_max_nr_accesses(const struct damon_attrs *attrs int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); +int damon_call(struct damon_ctx *ctx, struct damon_call_control *control); +int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control); + int damon_set_region_biggest_system_ram_default(struct damon_target *t, unsigned long *start, unsigned long *end);
diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e29433c5ecc..58a618853574 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -322,6 +322,7 @@ struct readahead_control; #define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_APPEND (__force int) RWF_APPEND #define IOCB_ATOMIC (__force int) RWF_ATOMIC +#define IOCB_DONTCACHE (__force int) RWF_DONTCACHE /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) @@ -356,7 +357,8 @@ struct readahead_control; { IOCB_SYNC, "SYNC" }, \ { IOCB_NOWAIT, "NOWAIT" }, \ { IOCB_APPEND, "APPEND" }, \ - { IOCB_ATOMIC, "ATOMIC"}, \ + { IOCB_ATOMIC, "ATOMIC" }, \ + { IOCB_DONTCACHE, "DONTCACHE" }, \ { IOCB_EVENTFD, "EVENTFD"}, \ { IOCB_DIRECT, "DIRECT" }, \ { IOCB_WRITE, "WRITE" }, \ @@ -2127,6 +2129,8 @@ struct file_operations { #define FOP_UNSIGNED_OFFSET ((__force fop_flags_t)(1 << 5)) /* Supports asynchronous lock callbacks */ #define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6)) +/* File system supports uncached read/write buffered IO */ +#define FOP_DONTCACHE ((__force fop_flags_t)(1 << 7)) /* Wrap a directory iterator that needs exclusive inode access */ int wrap_directory_iterator(struct file *, struct dir_context *, @@ -2874,6 +2878,8 @@ extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart, extern int __must_check file_check_and_advance_wb_err(struct file *file); extern int __must_check file_write_and_wait_range(struct file *file, loff_t start, loff_t end); +int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, + loff_t end); static inline int file_write_and_wait(struct file *file) { @@ -2906,6 +2912,11 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) (iocb->ki_flags & IOCB_SYNC) ?
0 : 1); if (ret) return ret; + } else if (iocb->ki_flags & IOCB_DONTCACHE) { + struct address_space *mapping = iocb->ki_filp->f_mapping; + + filemap_fdatawrite_range_kick(mapping, iocb->ki_pos, + iocb->ki_pos + count); } return count; @@ -3614,6 +3625,14 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE)) return -EOPNOTSUPP; } + if (flags & RWF_DONTCACHE) { + /* file system must support it */ + if (!(ki->ki_filp->f_op->fop_flags & FOP_DONTCACHE)) + return -EOPNOTSUPP; + /* DAX mappings not supported */ + if (IS_DAX(ki->ki_filp->f_mapping->host)) + return -EOPNOTSUPP; + } kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; diff --git a/include/linux/gfp.h b/include/linux/gfp.h index b0fe9f62d15b..6bb1a5a7a4ae 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -212,35 +212,31 @@ struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, - struct list_head *page_list, struct page **page_array); #define __alloc_pages_bulk(...) alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__)) -unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, +unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp, unsigned long nr_pages, struct page **page_array); -#define alloc_pages_bulk_array_mempolicy(...) \ - alloc_hooks(alloc_pages_bulk_array_mempolicy_noprof(__VA_ARGS__)) +#define alloc_pages_bulk_mempolicy(...) \ + alloc_hooks(alloc_pages_bulk_mempolicy_noprof(__VA_ARGS__)) /* Bulk allocate order-0 pages */ -#define alloc_pages_bulk_list(_gfp, _nr_pages, _list) \ - __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _list, NULL) - -#define alloc_pages_bulk_array(_gfp, _nr_pages, _page_array) \ - __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, NULL, _page_array) +#define alloc_pages_bulk(_gfp, _nr_pages, _page_array) \ + __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _page_array) static inline unsigned long -alloc_pages_bulk_array_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, +alloc_pages_bulk_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array) { if (nid == NUMA_NO_NODE) nid = numa_mem_id(); - return alloc_pages_bulk_noprof(gfp, nid, NULL, nr_pages, NULL, page_array); + return alloc_pages_bulk_noprof(gfp, nid, NULL, nr_pages, page_array); } -#define alloc_pages_bulk_array_node(...) \ - alloc_hooks(alloc_pages_bulk_array_node_noprof(__VA_ARGS__)) +#define alloc_pages_bulk_node(...) 
\ + alloc_hooks(alloc_pages_bulk_node_noprof(__VA_ARGS__)) static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask) { @@ -300,8 +296,6 @@ static inline struct page *alloc_pages_node_noprof(int nid, gfp_t gfp_mask, #ifdef CONFIG_NUMA struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order); -struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, - struct mempolicy *mpol, pgoff_t ilx, int nid); struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order); struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *mpol, pgoff_t ilx, int nid); @@ -312,11 +306,6 @@ static inline struct page *alloc_pages_noprof(gfp_t gfp_mask, unsigned int order { return alloc_pages_node_noprof(numa_node_id(), gfp_mask, order); } -static inline struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, - struct mempolicy *mpol, pgoff_t ilx, int nid) -{ - return alloc_pages_noprof(gfp, order); -} static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) { return __folio_alloc_node_noprof(gfp, order, numa_node_id()); @@ -331,7 +320,6 @@ static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int orde #endif #define alloc_pages(...) alloc_hooks(alloc_pages_noprof(__VA_ARGS__)) -#define alloc_pages_mpol(...) alloc_hooks(alloc_pages_mpol_noprof(__VA_ARGS__)) #define folio_alloc(...) alloc_hooks(folio_alloc_noprof(__VA_ARGS__)) #define folio_alloc_mpol(...) alloc_hooks(folio_alloc_mpol_noprof(__VA_ARGS__)) #define vma_alloc_folio(...) alloc_hooks(vma_alloc_folio_noprof(__VA_ARGS__)) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index b94c2e8ee918..93e509b6c00e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -121,6 +121,8 @@ enum mthp_stat_item { MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, MTHP_STAT_ZSWPOUT, MTHP_STAT_SWPIN, + MTHP_STAT_SWPIN_FALLBACK, + MTHP_STAT_SWPIN_FALLBACK_CHARGE, MTHP_STAT_SWPOUT, MTHP_STAT_SWPOUT_FALLBACK, MTHP_STAT_SHMEM_ALLOC, diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ae4fe8615bb6..ec8c0ccc8f95 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -153,11 +153,11 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); -bool isolate_hugetlb(struct folio *folio, struct list_head *list); +bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); -void folio_putback_active_hugetlb(struct folio *folio); +void folio_putback_hugetlb(struct folio *folio); void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; @@ -414,7 +414,7 @@ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, return NULL; } -static inline bool isolate_hugetlb(struct folio *folio, struct list_head *list) +static inline bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list) { return false; } @@ -430,7 +430,7 @@ static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags, return 0; } -static inline void folio_putback_active_hugetlb(struct folio *folio) +static inline void folio_putback_hugetlb(struct folio *folio) { } @@ -681,8 +681,9 @@ struct 
huge_bootmem_page { }; int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); +int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn); struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve); + unsigned long addr, bool cow_from_owner); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback); @@ -1059,9 +1060,15 @@ static inline int isolate_or_dissolve_huge_page(struct page *page, return -ENOMEM; } +static inline int replace_free_hugepage_folios(unsigned long start_pfn, + unsigned long end_pfn) +{ + return 0; +} + static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, - int avoid_reserve) + bool cow_from_owner) { return NULL; } diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 56465af31044..890011071f2b 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -491,7 +491,6 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); void kasan_record_aux_stack(void *ptr); -void kasan_record_aux_stack_noalloc(void *ptr); #else /* CONFIG_KASAN_GENERIC */ @@ -509,7 +508,6 @@ static inline void kasan_cache_create(struct kmem_cache *cache, static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} static inline void kasan_record_aux_stack(void *ptr) {} -static inline void kasan_record_aux_stack_noalloc(void *ptr) {} #endif /* CONFIG_KASAN_GENERIC */ diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 6a53ac4885bb..d73095b5cd96 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -93,6 +93,7 @@ void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); void collect_procs_ksm(const struct folio *folio, const struct page *page, struct list_head *to_kill, int force_early); long ksm_process_profit(struct mm_struct *); +bool ksm_process_mergeable(struct mm_struct *mm); #else /* !CONFIG_KSM */ diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 05c166811f6b..fe739d35a864 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -91,13 +91,24 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren * @memcg: the cgroup of the sublist to add the item to. * * If the element is already part of a list, this function returns doing - * nothing. Therefore the caller does not need to keep state about whether or - * not the element already belongs in the list and is allowed to lazy update - * it. Note however that this is valid for *a* list, not *this* list. If - * the caller organize itself in a way that elements can be in more than - * one type of list, it is up to the caller to fully remove the item from - * the previous list (with list_lru_del() for instance) before moving it - * to @lru. + * nothing. This means that it is not necessary to keep state about whether or + * not the element already belongs in the list. That said, this logic only + * works if the item is in *this* list. If the item might be in some other + * list, then you cannot rely on this check and you must remove it from the + * other list before trying to insert it. + * + * The lru list consists of many sublists internally; the @nid and @memcg + * parameters are used to determine which sublist to insert the item into. 
+ * It's important to use the right value of @nid and @memcg when deleting the + * item, since it might otherwise get deleted from the wrong sublist. + * + * This also applies when attempting to insert the item multiple times - if + * the item is currently in one sublist and you call list_lru_add() again, you + * must pass the right @nid and @memcg parameters so that the same sublist is + * used. + * + * You must ensure that the memcg is not freed during this call (e.g., with + * rcu or by taking a css refcnt). * * Return: true if the list was updated, false otherwise */ @@ -113,7 +124,7 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, * memcg of the sublist is determined by @item list_head. This assumption is * valid for slab objects LRU such as dentries, inodes, etc. * - * Return value: true if the list was updated, false otherwise + * Return: true if the list was updated, false otherwise */ bool list_lru_add_obj(struct list_lru *lru, struct list_head *item); @@ -125,8 +136,19 @@ bool list_lru_add_obj(struct list_lru *lru, struct list_head *item); * @memcg: the cgroup of the sublist to delete the item from. * * This function works analogously as list_lru_add() in terms of list - * manipulation. The comments about an element already pertaining to - * a list are also valid for list_lru_del(). + * manipulation. + * + * The comments in list_lru_add() about an element already being in a list are + * also valid for list_lru_del(), that is, you can delete an item that has + * already been removed or never been added. However, if the item is in a + * list, it must be in *this* list, and you must pass the right value of @nid + * and @memcg so that the right sublist is used. + * + * You must ensure that the memcg is not freed during this call (e.g., with + * rcu or by taking a css refcnt). When a memcg is deleted, list_lru entries + * are automatically moved to the parent memcg. This is done in a race-free + * way, so during deletion of a memcg both the old and new memcg will resolve + * to the same sublist internally. * * Return: true if the list was updated, false otherwise */ @@ -142,7 +164,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, * memcg of the sublist is determined by @item list_head. This assumption is * valid for slab objects LRU such as dentries, inodes, etc. * - * Return value: true if the list was updated, false otherwise. + * Return: true if the list was updated, false otherwise. */ bool list_lru_del_obj(struct list_lru *lru, struct list_head *item);
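A minimal sketch of the contract spelled out above: as long as the same @nid and @memcg are passed to both calls, an item may be deleted (whether or not it is currently on the list, since deleting an absent item is documented as a no-op) and re-inserted, and the same internal sublist is used throughout. The caller is assumed to keep the memcg alive, e.g. under RCU or a css refcount.

/* Sketch: remove an item if present and re-insert it into the same sublist. */
static void example_requeue(struct list_lru *lru, struct list_head *item,
			    int nid, struct mem_cgroup *memcg)
{
	/* Caller must prevent @memcg from being freed across these calls. */
	list_lru_del(lru, item, nid, memcg);
	list_lru_add(lru, item, nid, memcg);
}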
diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 673d5cae7c81..e79eb6ac516f 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -378,6 +378,10 @@ static inline int memblock_get_region_node(const struct memblock_region *r) /* Flags for memblock allocation APIs */ #define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0) #define MEMBLOCK_ALLOC_ACCESSIBLE 0 +/* + * MEMBLOCK_ALLOC_NOLEAKTRACE avoids kmemleak tracing. It implies + * MEMBLOCK_ALLOC_ACCESSIBLE. + */ #define MEMBLOCK_ALLOC_NOLEAKTRACE 1 /* We are using top down, so it is safe to use 0 here */ @@ -417,6 +421,12 @@ static __always_inline void *memblock_alloc(phys_addr_t size, phys_addr_t align) MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE); } +void *__memblock_alloc_or_panic(phys_addr_t size, phys_addr_t align, + const char *func); + +#define memblock_alloc_or_panic(size, align) \ + __memblock_alloc_or_panic(size, align, __func__) + static inline void *memblock_alloc_raw(phys_addr_t size, phys_addr_t align) {
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5502aa8e138e..6e74b8254d9b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -620,8 +620,6 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target, page_counter_read(&memcg->memory); } -void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg); - int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp); /** @@ -646,8 +644,7 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, return __mem_cgroup_charge(folio, mm, gfp); } -int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, - long nr_pages); +int mem_cgroup_charge_hugetlb(struct folio *folio, gfp_t gfp); int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); @@ -677,7 +674,6 @@ static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) __mem_cgroup_uncharge_folios(folios); } -void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages); void mem_cgroup_replace_folio(struct folio *old, struct folio *new); void mem_cgroup_migrate(struct folio *old, struct folio *new); @@ -1046,6 +1042,23 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, void split_page_memcg(struct page *head, int old_order, int new_order); +static inline u64 cgroup_id_from_mm(struct mm_struct *mm) +{ + struct mem_cgroup *memcg; + u64 id; + + if (mem_cgroup_disabled()) + return 0; + + rcu_read_lock(); + memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (!memcg) + memcg = root_mem_cgroup; + id = cgroup_id(memcg->css.cgroup); + rcu_read_unlock(); + return id; +} + #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 @@ -1135,21 +1148,15 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target, return false; } -static inline void mem_cgroup_commit_charge(struct folio *folio, - struct mem_cgroup *memcg) -{ -} - static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) { return 0; } -static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, - gfp_t gfp, long nr_pages) +static inline int mem_cgroup_charge_hugetlb(struct folio *folio, gfp_t gfp) { - return 0; + return 0; } static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, @@ -1170,11 +1177,6 @@ static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) { } -static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, - unsigned int nr_pages) -{ -} - static inline void mem_cgroup_replace_folio(struct folio *old, struct folio *new) { @@ -1466,6 +1468,11 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) static inline void split_page_memcg(struct page *head, int old_order, int new_order) { } + +static inline u64 cgroup_id_from_mm(struct mm_struct *mm) +{ + return 0; +} #endif /* CONFIG_MEMCG */ /*
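As a usage sketch, the new cgroup_id_from_mm() helper lets a caller tag an mm with its owner's cgroup ID without any CONFIG_MEMCG ifdefs of its own, since the stub above returns 0 when memcg is disabled or not built in. The debug-print user below is hypothetical, not part of the patch.

/* Sketch: report which cgroup an mm's owner belongs to. */
static void example_report_owner(struct mm_struct *mm)
{
	u64 id = cgroup_id_from_mm(mm);	/* 0 if memcg is disabled */

	pr_debug("mm owner cgroup id: %llu\n", id);
}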
diff --git a/include/linux/memfd.h b/include/linux/memfd.h index d437e3070850..246daadbfde8 100644 --- a/include/linux/memfd.h +++ b/include/linux/memfd.h @@ -7,7 +7,14 @@ #ifdef CONFIG_MEMFD_CREATE extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg); struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx); -unsigned int *memfd_file_seals_ptr(struct file *file); +/* + * Check for any existing seals on mmap, return an error if access is denied due + * to sealing, or 0 otherwise. + * + * We also update VMA flags if appropriate by manipulating the VMA flags pointed + * to by vm_flags_ptr. + */ +int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr); #else static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a) { @@ -17,19 +24,11 @@ static inline struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) { return ERR_PTR(-EINVAL); } - -static inline unsigned int *memfd_file_seals_ptr(struct file *file) +static inline int memfd_check_seals_mmap(struct file *file, + unsigned long *vm_flags_ptr) { - return NULL; + return 0; } #endif -/* Retrieve memfd seals associated with the file, if any. */ -static inline unsigned int memfd_file_seals(struct file *file) -{ - unsigned int *sealsp = memfd_file_seals_ptr(file); - - return sealsp ? *sealsp : 0; -} - #endif /* __LINUX_MEMFD_H */
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index b27ddce5d324..eaac5ae8c05c 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -144,8 +144,6 @@ extern u64 max_mem_size; extern int mhp_online_type_from_str(const char *str); -/* Default online_type (MMOP_*) when new memory blocks are added. */ -extern int mhp_default_online_type; /* If movable_node boot option specified */ extern bool movable_node_enabled; static inline bool movable_node_is_enabled(void) @@ -303,6 +301,9 @@ static inline void __remove_memory(u64 start, u64 size) {} #endif /* CONFIG_MEMORY_HOTREMOVE */ #ifdef CONFIG_MEMORY_HOTPLUG +/* Default online_type (MMOP_*) when new memory blocks are added.
*/ +extern int mhp_get_default_online_type(void); +extern void mhp_set_default_online_type(int online_type); extern void __ref free_area_init_core_hotplug(struct pglist_data *pgdat); extern int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); extern int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 002e49b2ebd9..29919faea2f1 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -144,16 +144,14 @@ const struct movable_operations *page_movable_ops(struct page *page) #ifdef CONFIG_NUMA_BALANCING int migrate_misplaced_folio_prepare(struct folio *folio, struct vm_area_struct *vma, int node); -int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, - int node); +int migrate_misplaced_folio(struct folio *folio, int node); #else static inline int migrate_misplaced_folio_prepare(struct folio *folio, struct vm_area_struct *vma, int node) { return -EAGAIN; /* can't migrate now */ } -static inline int migrate_misplaced_folio(struct folio *folio, - struct vm_area_struct *vma, int node) +static inline int migrate_misplaced_folio(struct folio *folio, int node) { return -EAGAIN; /* can't migrate now */ } diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 98008dd92153..eaaf5c008e4d 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -8,13 +8,10 @@ #include /* - * min()/max()/clamp() macros must accomplish three things: + * min()/max()/clamp() macros must accomplish several things: * * - Avoid multiple evaluations of the arguments (so side-effects like * "x++" happen only once) when non-constant. - * - Retain result as a constant expressions when called with only - * constant expressions (to avoid tripping VLA warnings in stack - * allocation usage). * - Perform signed v unsigned type-checking (to generate compile * errors instead of nasty runtime surprises). * - Unsigned char/short are always promoted to signed int and can be @@ -31,58 +28,54 @@ * bit #0 set if ok for unsigned comparisons * bit #1 set if ok for signed comparisons * - * In particular, statically non-negative signed integer - * expressions are ok for both. + * In particular, statically non-negative signed integer expressions + * are ok for both. * - * NOTE! Unsigned types smaller than 'int' are implicitly - * converted to 'int' in expressions, and are accepted for - * signed conversions for now. This is debatable. + * NOTE! Unsigned types smaller than 'int' are implicitly converted to 'int' + * in expressions, and are accepted for signed conversions for now. + * This is debatable. * - * Note that 'x' is the original expression, and 'ux' is - * the unique variable that contains the value. + * Note that 'x' is the original expression, and 'ux' is the unique variable + * that contains the value. * - * We use 'ux' for pure type checking, and 'x' for when - * we need to look at the value (but without evaluating - * it for side effects! Careful to only ever evaluate it - * with sizeof() or __builtin_constant_p() etc). + * We use 'ux' for pure type checking, and 'x' for when we need to look at the + * value (but without evaluating it for side effects! + * Careful to only ever evaluate it with sizeof() or __builtin_constant_p() etc). * - * Pointers end up being checked by the normal C type - * rules at the actual comparison, and these expressions - * only need to be careful to not cause warnings for - * pointer use. 
+ * Pointers end up being checked by the normal C type rules at the actual + * comparison, and these expressions only need to be careful to not cause + * warnings for pointer use. */ -#define __signed_type_use(x,ux) (2+__is_nonneg(x,ux)) -#define __unsigned_type_use(x,ux) (1+2*(sizeof(ux)<4)) -#define __sign_use(x,ux) (is_signed_type(typeof(ux))? \ - __signed_type_use(x,ux):__unsigned_type_use(x,ux)) +#define __sign_use(ux) (is_signed_type(typeof(ux)) ? \ + (2 + __is_nonneg(ux)) : (1 + 2 * (sizeof(ux) < 4))) /* - * To avoid warnings about casting pointers to integers - * of different sizes, we need that special sign type. + * Check whether a signed value is always non-negative. * - * On 64-bit we can just always use 'long', since any - * integer or pointer type can just be cast to that. + * A cast is needed to avoid any warnings from values that aren't signed + * integer types (in which case the result doesn't matter). * - * This does not work for 128-bit signed integers since - * the cast would truncate them, but we do not use s128 - * types in the kernel (we do use 'u128', but they will - * be handled by the !is_signed_type() case). + * On 64-bit any integer or pointer type can safely be cast to 'long long'. + * But on 32-bit we need to avoid warnings about casting pointers to integers + * of different sizes without truncating 64-bit values so 'long' or 'long long' + * must be used depending on the size of the value. * - * NOTE! The cast is there only to avoid any warnings - * from when values that aren't signed integer types. + * This does not work for 128-bit signed integers since the cast would truncate + * them, but we do not use s128 types in the kernel (we do use 'u128', + * but they are handled by the !is_signed_type() case). */ -#ifdef CONFIG_64BIT - #define __signed_type(ux) long +#if __SIZEOF_POINTER__ == __SIZEOF_LONG_LONG__ +#define __is_nonneg(ux) statically_true((long long)(ux) >= 0) #else - #define __signed_type(ux) typeof(__builtin_choose_expr(sizeof(ux)>4,1LL,1L)) +#define __is_nonneg(ux) statically_true( \ + (typeof(__builtin_choose_expr(sizeof(ux) > 4, 1LL, 1L)))(ux) >= 0) #endif -#define __is_nonneg(x,ux) statically_true((__signed_type(ux))(x)>=0) -#define __types_ok(x,y,ux,uy) \ - (__sign_use(x,ux) & __sign_use(y,uy)) +#define __types_ok(ux, uy) \ + (__sign_use(ux) & __sign_use(uy)) -#define __types_ok3(x,y,z,ux,uy,uz) \ - (__sign_use(x,ux) & __sign_use(y,uy) & __sign_use(z,uz)) +#define __types_ok3(ux, uy, uz) \ + (__sign_use(ux) & __sign_use(uy) & __sign_use(uz)) #define __cmp_op_min < #define __cmp_op_max > @@ -97,30 +90,13 @@ #define __careful_cmp_once(op, x, y, ux, uy) ({ \ __auto_type ux = (x); __auto_type uy = (y); \ - BUILD_BUG_ON_MSG(!__types_ok(x,y,ux,uy), \ + BUILD_BUG_ON_MSG(!__types_ok(ux, uy), \ #op"("#x", "#y") signedness error"); \ __cmp(op, ux, uy); }) #define __careful_cmp(op, x, y) \ __careful_cmp_once(op, x, y, __UNIQUE_ID(x_), __UNIQUE_ID(y_)) -#define __clamp(val, lo, hi) \ - ((val) >= (hi) ? (hi) : ((val) <= (lo) ? 
(lo) : (val))) - -#define __clamp_once(val, lo, hi, uval, ulo, uhi) ({ \ - __auto_type uval = (val); \ - __auto_type ulo = (lo); \ - __auto_type uhi = (hi); \ - static_assert(__builtin_choose_expr(__is_constexpr((lo) > (hi)), \ - (lo) <= (hi), true), \ - "clamp() low limit " #lo " greater than high limit " #hi); \ - BUILD_BUG_ON_MSG(!__types_ok3(val,lo,hi,uval,ulo,uhi), \ - "clamp("#val", "#lo", "#hi") signedness error"); \ - __clamp(uval, ulo, uhi); }) - -#define __careful_clamp(val, lo, hi) \ - __clamp_once(val, lo, hi, __UNIQUE_ID(v_), __UNIQUE_ID(l_), __UNIQUE_ID(h_)) - /** * min - return minimum of two values of the same or compatible types * @x: first value @@ -154,7 +130,7 @@ #define __careful_op3(op, x, y, z, ux, uy, uz) ({ \ __auto_type ux = (x); __auto_type uy = (y);__auto_type uz = (z);\ - BUILD_BUG_ON_MSG(!__types_ok3(x,y,z,ux,uy,uz), \ + BUILD_BUG_ON_MSG(!__types_ok3(ux, uy, uz), \ #op"3("#x", "#y", "#z") signedness error"); \ __cmp(op, ux, __cmp(op, uy, uz)); }) @@ -176,34 +152,6 @@ #define max3(x, y, z) \ __careful_op3(max, x, y, z, __UNIQUE_ID(x_), __UNIQUE_ID(y_), __UNIQUE_ID(z_)) -/** - * min_not_zero - return the minimum that is _not_ zero, unless both are zero - * @x: value1 - * @y: value2 - */ -#define min_not_zero(x, y) ({ \ - typeof(x) __x = (x); \ - typeof(y) __y = (y); \ - __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) - -/** - * clamp - return a value clamped to a given range with strict typechecking - * @val: current value - * @lo: lowest allowable value - * @hi: highest allowable value - * - * This macro does strict typechecking of @lo/@hi to make sure they are of the - * same type as @val. See the unnecessary pointer comparisons. - */ -#define clamp(val, lo, hi) __careful_clamp(val, lo, hi) - -/* - * ..and if you can't take the strict - * types, you can specify one yourself. - * - * Or not use min/max/clamp at all, of course. - */ - /** * min_t - return minimum of two values, using the specified type * @type: data type to use @@ -220,6 +168,68 @@ */ #define max_t(type, x, y) __cmp_once(max, type, x, y) +/** + * min_not_zero - return the minimum that is _not_ zero, unless both are zero + * @x: value1 + * @y: value2 + */ +#define min_not_zero(x, y) ({ \ + typeof(x) __x = (x); \ + typeof(y) __y = (y); \ + __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) + +#define __clamp(val, lo, hi) \ + ((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val))) + +#define __clamp_once(type, val, lo, hi, uval, ulo, uhi) ({ \ + type uval = (val); \ + type ulo = (lo); \ + type uhi = (hi); \ + BUILD_BUG_ON_MSG(statically_true(ulo > uhi), \ + "clamp() low limit " #lo " greater than high limit " #hi); \ + BUILD_BUG_ON_MSG(!__types_ok3(uval, ulo, uhi), \ + "clamp("#val", "#lo", "#hi") signedness error"); \ + __clamp(uval, ulo, uhi); }) + +#define __careful_clamp(type, val, lo, hi) \ + __clamp_once(type, val, lo, hi, __UNIQUE_ID(v_), __UNIQUE_ID(l_), __UNIQUE_ID(h_)) + +/** + * clamp - return a value clamped to a given range with typechecking + * @val: current value + * @lo: lowest allowable value + * @hi: highest allowable value + * + * This macro checks @val/@lo/@hi to make sure they have compatible + * signedness. 
+ */ +#define clamp(val, lo, hi) __careful_clamp(__auto_type, val, lo, hi) + +/** + * clamp_t - return a value clamped to a given range using a given type + * @type: the type of variable to use + * @val: current value + * @lo: minimum allowable value + * @hi: maximum allowable value + * + * This macro does no typechecking and uses temporary variables of type + * @type to make all the comparisons. + */ +#define clamp_t(type, val, lo, hi) __careful_clamp(type, val, lo, hi) + +/** + * clamp_val - return a value clamped to a given range using val's type + * @val: current value + * @lo: minimum allowable value + * @hi: maximum allowable value + * + * This macro does no typechecking and uses temporary variables of whatever + * type the input argument @val is. This is useful when @val is an unsigned + * type and @lo and @hi are literals that will otherwise be assigned a signed + * integer type. + */ +#define clamp_val(val, lo, hi) __careful_clamp(typeof(val), val, lo, hi) + /* * Do not check the array parameter using __must_be_array(). * In the following legit use-case where the "array" passed is a simple pointer, @@ -263,31 +273,6 @@ */ #define max_array(array, len) __minmax_array(max, array, len) -/** - * clamp_t - return a value clamped to a given range using a given type - * @type: the type of variable to use - * @val: current value - * @lo: minimum allowable value - * @hi: maximum allowable value - * - * This macro does no typechecking and uses temporary variables of type - * @type to make all the comparisons. - */ -#define clamp_t(type, val, lo, hi) __careful_clamp((type)(val), (type)(lo), (type)(hi)) - -/** - * clamp_val - return a value clamped to a given range using val's type - * @val: current value - * @lo: minimum allowable value - * @hi: maximum allowable value - * - * This macro does no typechecking and uses temporary variables of whatever - * type the input argument @val is. This is useful when @val is an unsigned - * type and @lo and @hi are literals that will otherwise be assigned a signed - * integer type. - */ -#define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi) - static inline bool in_range64(u64 val, u64 start, u64 len) { return (val - start) < len; @@ -326,9 +311,9 @@ static inline bool in_range32(u32 val, u32 start, u32 len) * Use these carefully: no type checking, and uses the arguments * multiple times. Use for obvious constants only. 
*/ -#define MIN(a,b) __cmp(min,a,b) -#define MAX(a,b) __cmp(max,a,b) -#define MIN_T(type,a,b) __cmp(min,(type)(a),(type)(b)) -#define MAX_T(type,a,b) __cmp(max,(type)(a),(type)(b)) +#define MIN(a, b) __cmp(min, a, b) +#define MAX(a, b) __cmp(max, a, b) +#define MIN_T(type, a, b) __cmp(min, (type)(a), (type)(b)) +#define MAX_T(type, a, b) __cmp(max, (type)(a), (type)(b)) #endif /* _LINUX_MINMAX_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index b1c3db9cf355..ac78425e9838 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -32,6 +32,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -257,8 +258,6 @@ void setup_initial_init_mm(void *start_code, void *end_code, struct vm_area_struct *vm_area_alloc(struct mm_struct *); struct vm_area_struct *vm_area_dup(struct vm_area_struct *); void vm_area_free(struct vm_area_struct *); -/* Use only if VMA has no other users */ -void __vm_area_free(struct vm_area_struct *vma); #ifndef CONFIG_MMU extern struct rb_root nommu_region_tree; @@ -697,13 +696,54 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {} #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_PER_VMA_LOCK +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + static struct lock_class_key lockdep_key; + + lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0); +#endif + if (reset_refcnt) + refcount_set(&vma->vm_refcnt, 0); + vma->vm_lock_seq = UINT_MAX; +} + +static inline bool is_vma_writer_only(int refcnt) +{ + /* + * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma + * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on + * a detached vma happens only in vma_mark_detached() and is a rare + * case, therefore most of the time there will be no unnecessary wakeup. + */ + return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1; +} + +static inline void vma_refcount_put(struct vm_area_struct *vma) +{ + /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ + struct mm_struct *mm = vma->vm_mm; + int oldcnt; + + rwsem_release(&vma->vmlock_dep_map, _RET_IP_); + if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) { + + if (is_vma_writer_only(oldcnt - 1)) + rcuwait_wake_up(&mm->vma_writer_wait); + } +} + /* * Try to read-lock a vma. The function is allowed to occasionally yield false * locked result to avoid performance overhead, in which case we fall back to * using mmap_lock. The function should never yield false unlocked result. + * False locked result is possible if mm_lock_seq overflows or if vma gets + * reused and attached to a different mm before we lock it. */ -static inline bool vma_start_read(struct vm_area_struct *vma) +static inline bool vma_start_read(struct mm_struct *mm, struct vm_area_struct *vma) { + int oldcnt; + /* * Check before locking. A race might cause false locked result. * We can use READ_ONCE() for the mm_lock_seq here, and don't need @@ -711,16 +751,22 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * we don't rely on for anything - the mm_lock_seq read against which we * need ordering is below. */ - if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq)) - return false; - - if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) return false; /* - * Overflow might produce false locked result. 
+ * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited() will fail + * because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET. + */ + if (unlikely(!__refcount_inc_not_zero_limited(&vma->vm_refcnt, &oldcnt, + VMA_REF_LIMIT))) + return false; + + rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); + /* + * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. * False unlocked result is impossible because we modify and check - * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq + * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq * modification invalidates all existing locks. * * We must use ACQUIRE semantics for the mm_lock_seq so that if we are @@ -728,22 +774,51 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * after it has been unlocked. * This pairs with RELEASE semantics in vma_end_write_all(). */ - if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) { - up_read(&vma->vm_lock->lock); + if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) { + vma_refcount_put(vma); return false; } + return true; } +/* + * Use only while holding mmap read lock which guarantees that locking will not + * fail (nobody can concurrently write-lock the vma). vma_start_read() should + * not be used in such cases because it might fail due to mm_lock_seq overflow. + * This functionality is used to obtain vma read lock and drop the mmap read lock. + */ +static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) +{ + int oldcnt; + + mmap_assert_locked(vma->vm_mm); + if (unlikely(!__refcount_inc_not_zero_limited(&vma->vm_refcnt, &oldcnt, + VMA_REF_LIMIT))) + return false; + + rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); + return true; +} + +/* + * Use only while holding mmap read lock which guarantees that locking will not + * fail (nobody can concurrently write-lock the vma). vma_start_read() should + * not be used in such cases because it might fail due to mm_lock_seq overflow. + * This functionality is used to obtain vma read lock and drop the mmap read lock. + */ +static inline bool vma_start_read_locked(struct vm_area_struct *vma) +{ + return vma_start_read_locked_nested(vma, 0); +} + static inline void vma_end_read(struct vm_area_struct *vma) { - rcu_read_lock(); /* keeps vma alive till the end of up_read */ - up_read(&vma->vm_lock->lock); - rcu_read_unlock(); + vma_refcount_put(vma); } /* WARNING! Can only be used if mmap_lock is expected to be write-locked */ -static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) +static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) { mmap_assert_write_locked(vma->vm_mm); @@ -751,10 +826,12 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) * current task is holding mmap_write_lock, both vma->vm_lock_seq and * mm->mm_lock_seq can't be concurrently modified. */ - *mm_lock_seq = vma->vm_mm->mm_lock_seq; + *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence; return (vma->vm_lock_seq == *mm_lock_seq); } +void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq); + /* * Begin writing to a VMA. 
* Exclude concurrent readers under the per-VMA lock until the currently @@ -762,43 +839,51 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) */ static inline void vma_start_write(struct vm_area_struct *vma) { - int mm_lock_seq; + unsigned int mm_lock_seq; if (__is_vma_write_locked(vma, &mm_lock_seq)) return; - down_write(&vma->vm_lock->lock); - /* - * We should use WRITE_ONCE() here because we can have concurrent reads - * from the early lockless pessimistic check in vma_start_read(). - * We don't really care about the correctness of that early check, but - * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. - */ - WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); - up_write(&vma->vm_lock->lock); + __vma_start_write(vma, mm_lock_seq); } static inline void vma_assert_write_locked(struct vm_area_struct *vma) { - int mm_lock_seq; + unsigned int mm_lock_seq; VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); } static inline void vma_assert_locked(struct vm_area_struct *vma) { - if (!rwsem_is_locked(&vma->vm_lock->lock)) + if (refcount_read(&vma->vm_refcnt) <= 1) vma_assert_write_locked(vma); } -static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) +/* + * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these + * assertions should be made either under mmap_write_lock or when the object + * has been isolated under mmap_write_lock, ensuring no competing writers. + */ +static inline void vma_assert_attached(struct vm_area_struct *vma) { - /* When detaching vma should be write-locked */ - if (detached) - vma_assert_write_locked(vma); - vma->detached = detached; + VM_BUG_ON_VMA(!refcount_read(&vma->vm_refcnt), vma); } +static inline void vma_assert_detached(struct vm_area_struct *vma) +{ + VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt), vma); +} + +static inline void vma_mark_attached(struct vm_area_struct *vma) +{ + vma_assert_write_locked(vma); + vma_assert_detached(vma); + refcount_set(&vma->vm_refcnt, 1); +} + +void vma_mark_detached(struct vm_area_struct *vma); + static inline void release_fault_lock(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) @@ -820,14 +905,17 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, #else /* CONFIG_PER_VMA_LOCK */ -static inline bool vma_start_read(struct vm_area_struct *vma) +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} +static inline bool vma_start_read(struct mm_struct *mm, struct vm_area_struct *vma) { return false; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} static inline void vma_assert_write_locked(struct vm_area_struct *vma) { mmap_assert_write_locked(vma->vm_mm); } -static inline void vma_mark_detached(struct vm_area_struct *vma, - bool detached) {} +static inline void vma_assert_attached(struct vm_area_struct *vma) {} +static inline void vma_assert_detached(struct vm_area_struct *vma) {} +static inline void vma_mark_attached(struct vm_area_struct *vma) {} +static inline void vma_mark_detached(struct vm_area_struct *vma) {} static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address) @@ -854,18 +942,13 @@ static inline void assert_fault_locked(struct vm_fault *vmf) extern const struct vm_operations_struct vma_dummy_vm_ops; -/* - * WARNING: vma_init does not initialize vma->vm_lock. - * Use vm_area_alloc()/vm_area_free() if vma needs locking. 
- */ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); - vma_mark_detached(vma, false); - vma_numab_state_init(vma); + vma_lock_init(vma, false); } /* Use when VMA is not part of the VMA tree and needs no locking */ @@ -1058,6 +1141,7 @@ static inline int vma_iter_bulk_store(struct vma_iterator *vmi, if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; + vma_mark_attached(vma); return 0; } @@ -2320,6 +2404,7 @@ extern void pagefault_out_of_memory(void); struct zap_details { struct folio *single_folio; /* Locked folio to be unmapped */ bool even_cows; /* Zap COWed private pages too? */ + bool reclaim_pt; /* Need reclaim page tables? */ zap_flags_t zap_flags; /* Extra flags for zapping */ }; @@ -2991,18 +3076,15 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} #endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ -static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) +static inline void __pagetable_ctor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); - if (!ptlock_init(ptdesc)) - return false; __folio_set_pgtable(folio); lruvec_stat_add_folio(folio, NR_PAGETABLE); - return true; } -static inline void pagetable_pte_dtor(struct ptdesc *ptdesc) +static inline void pagetable_dtor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); @@ -3011,6 +3093,20 @@ static inline void pagetable_pte_dtor(struct ptdesc *ptdesc) lruvec_stat_sub_folio(folio, NR_PAGETABLE); } +static inline void pagetable_dtor_free(struct ptdesc *ptdesc) +{ + pagetable_dtor(ptdesc); + pagetable_free(ptdesc); +} + +static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) +{ + if (!ptlock_init(ptdesc)) + return false; + __pagetable_ctor(ptdesc); + return true; +} + pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); static inline pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) @@ -3087,14 +3183,6 @@ static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) return ptlock_init(ptdesc); } -static inline void pmd_ptlock_free(struct ptdesc *ptdesc) -{ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - VM_BUG_ON_PAGE(ptdesc->pmd_huge_pte, ptdesc_page(ptdesc)); -#endif - ptlock_free(ptdesc); -} - #define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte) #else @@ -3105,7 +3193,6 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) } static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; } -static inline void pmd_ptlock_free(struct ptdesc *ptdesc) {} #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) @@ -3120,25 +3207,13 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc) { - struct folio *folio = ptdesc_folio(ptdesc); - if (!pmd_ptlock_init(ptdesc)) return false; - __folio_set_pgtable(folio); ptdesc_pmd_pts_init(ptdesc); - lruvec_stat_add_folio(folio, NR_PAGETABLE); + __pagetable_ctor(ptdesc); return true; } -static inline void pagetable_pmd_dtor(struct ptdesc *ptdesc) -{ - struct folio *folio = ptdesc_folio(ptdesc); - - pmd_ptlock_free(ptdesc); - __folio_clear_pgtable(folio); - lruvec_stat_sub_folio(folio, NR_PAGETABLE); -} - /* * No scalability reason to split PUD locks yet, but follow the same pattern * as the PMD locks to make it easier if we decide to. 
The VM should not be @@ -3160,18 +3235,17 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) static inline void pagetable_pud_ctor(struct ptdesc *ptdesc) { - struct folio *folio = ptdesc_folio(ptdesc); - - __folio_set_pgtable(folio); - lruvec_stat_add_folio(folio, NR_PAGETABLE); + __pagetable_ctor(ptdesc); } -static inline void pagetable_pud_dtor(struct ptdesc *ptdesc) +static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc) { - struct folio *folio = ptdesc_folio(ptdesc); + __pagetable_ctor(ptdesc); +} - __folio_clear_pgtable(folio); - lruvec_stat_sub_folio(folio, NR_PAGETABLE); +static inline void pagetable_pgd_ctor(struct ptdesc *ptdesc) +{ + __pagetable_ctor(ptdesc); } extern void __init pagecache_init(void); @@ -3324,6 +3398,8 @@ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admi extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void exit_mmap(struct mm_struct *); int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); +bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, bool write); static inline int check_data_rlimit(unsigned long rlim, unsigned long new, @@ -3371,9 +3447,6 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, return __get_unmapped_area(file, addr, len, pgoff, flags, 0); } -extern unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf); extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, @@ -3437,9 +3510,6 @@ extern unsigned long stack_guard_gap; int expand_stack_locked(struct vm_area_struct *vma, unsigned long address); struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr); -/* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */ -int expand_downwards(struct vm_area_struct *vma, unsigned long address); - /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, @@ -4096,67 +4166,6 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping, extern int sysctl_nr_trim_pages; -#ifdef CONFIG_PRINTK -void mem_dump_obj(void *object); -#else -static inline void mem_dump_obj(void *object) {} -#endif - -static inline bool is_write_sealed(int seals) -{ - return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE); -} - -/** - * is_readonly_sealed - Checks whether write-sealed but mapped read-only, - * in which case writes should be disallowing moving - * forwards. - * @seals: the seals to check - * @vm_flags: the VMA flags to check - * - * Returns whether readonly sealed, in which case writess should be disallowed - * going forward. - */ -static inline bool is_readonly_sealed(int seals, vm_flags_t vm_flags) -{ - /* - * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as - * MAP_SHARED and read-only, take care to not allow mprotect to - * revert protections on such mappings. Do this only for shared - * mappings. For private mappings, don't need to mask - * VM_MAYWRITE as we still want them to be COW-writable. 
- */ - if (is_write_sealed(seals) && - ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_SHARED)) - return true; - - return false; -} - -/** - * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and - * handle them. - * @seals: the seals to check - * @vma: the vma to operate on - * - * Check whether F_SEAL_WRITE or F_SEAL_FUTURE_WRITE are set; if so, do proper - * check/handling on the vma flags. Return 0 if check pass, or <0 for errors. - */ -static inline int seal_check_write(int seals, struct vm_area_struct *vma) -{ - if (!is_write_sealed(seals)) - return 0; - - /* - * New PROT_WRITE and MAP_SHARED mmaps are not allowed when - * write seals are active. - */ - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) - return -EPERM; - - return 0; -} - #ifdef CONFIG_ANON_VMA_NAME int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 1b6a917fffa4..f9157a0c42a5 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -133,31 +133,25 @@ static inline int lru_hist_from_seq(unsigned long seq) return seq % NR_HIST_GENS; } -static inline int lru_tier_from_refs(int refs) +static inline int lru_tier_from_refs(int refs, bool workingset) { VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH)); - /* see the comment in folio_lru_refs() */ - return order_base_2(refs + 1); + /* see the comment on MAX_NR_TIERS */ + return workingset ? MAX_NR_TIERS - 1 : order_base_2(refs); } static inline int folio_lru_refs(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); - bool workingset = flags & BIT(PG_workingset); + if (!(flags & BIT(PG_referenced))) + return 0; /* - * Return the number of accesses beyond PG_referenced, i.e., N-1 if the - * total number of accesses is N>1, since N=0,1 both map to the first - * tier. lru_tier_from_refs() will account for this off-by-one. Also see - * the comment on MAX_NR_TIERS. + * Return the total number of accesses including PG_referenced. Also see + * the comment on LRU_REFS_FLAGS. 
*/ - return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset; -} - -static inline void folio_clear_lru_refs(struct folio *folio) -{ - set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); + return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1; } static inline int folio_lru_gen(struct folio *folio) @@ -223,11 +217,43 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); } +static inline unsigned long lru_gen_folio_seq(struct lruvec *lruvec, struct folio *folio, + bool reclaiming) +{ + int gen; + int type = folio_is_file_lru(folio); + struct lru_gen_folio *lrugen = &lruvec->lrugen; + + /* + * +-----------------------------------+-----------------------------------+ + * | Accessed through page tables and | Accessed through file descriptors | + * | promoted by folio_update_gen() | and protected by folio_inc_gen() | + * +-----------------------------------+-----------------------------------+ + * | PG_active (set while isolated) | | + * +-----------------+-----------------+-----------------+-----------------+ + * | PG_workingset | PG_referenced | PG_workingset | LRU_REFS_FLAGS | + * +-----------------------------------+-----------------------------------+ + * |<---------- MIN_NR_GENS ---------->| | + * |<---------------------------- MAX_NR_GENS ---------------------------->| + */ + if (folio_test_active(folio)) + gen = MIN_NR_GENS - folio_test_workingset(folio); + else if (reclaiming) + gen = MAX_NR_GENS; + else if ((!folio_is_file_lru(folio) && !folio_test_swapcache(folio)) || + (folio_test_reclaim(folio) && + (folio_test_dirty(folio) || folio_test_writeback(folio)))) + gen = MIN_NR_GENS; + else + gen = MAX_NR_GENS - folio_test_workingset(folio); + + return max(READ_ONCE(lrugen->max_seq) - gen + 1, READ_ONCE(lrugen->min_seq[type])); +} + static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { unsigned long seq; unsigned long flags; - unsigned long mask; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); @@ -237,40 +263,12 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, if (folio_test_unevictable(folio) || !lrugen->enabled) return false; - /* - * There are four common cases for this page: - * 1. If it's hot, i.e., freshly faulted in, add it to the youngest - * generation, and it's protected over the rest below. - * 2. If it can't be evicted immediately, i.e., a dirty page pending - * writeback, add it to the second youngest generation. - * 3. If it should be evicted first, e.g., cold and clean from - * folio_rotate_reclaimable(), add it to the oldest generation. - * 4. Everything else falls between 2 & 3 above and is added to the - * second oldest generation if it's considered inactive, or the - * oldest generation otherwise. See lru_gen_is_active(). 
- */ - if (folio_test_active(folio)) - seq = lrugen->max_seq; - else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) || - (folio_test_reclaim(folio) && - (folio_test_dirty(folio) || folio_test_writeback(folio)))) - seq = lrugen->max_seq - 1; - else if (reclaiming || lrugen->min_seq[type] + MIN_NR_GENS >= lrugen->max_seq) - seq = lrugen->min_seq[type]; - else - seq = lrugen->min_seq[type] + 1; + seq = lru_gen_folio_seq(lruvec, folio, reclaiming); gen = lru_gen_from_seq(seq); flags = (gen + 1UL) << LRU_GEN_PGOFF; /* see the comment on MIN_NR_GENS about PG_active */ - mask = LRU_GEN_MASK; - /* - * Don't clear PG_workingset here because it can affect PSI accounting - * if the activation is due to workingset refault. - */ - if (folio_test_active(folio)) - mask |= LRU_REFS_MASK | BIT(PG_referenced) | BIT(PG_active); - set_mask_bits(&folio->flags, mask, flags); + set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags); lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ @@ -564,9 +562,9 @@ static inline pte_marker copy_pte_marker( * Must be called with pgtable lock held so that no thread will see the none * pte, and if they see it, they'll fault and serialize at the pgtable lock. * - * This function is a no-op if PTE_MARKER_UFFD_WP is not enabled. + * Returns true if an uffd-wp pte was installed, false otherwise. */ -static inline void +static inline bool pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, pte_t pteval) { @@ -583,7 +581,7 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, * with a swap pte. There's no way of leaking the bit. */ if (vma_is_anonymous(vma) || !userfaultfd_wp(vma)) - return; + return false; /* A uffd-wp wr-protected normal pte */ if (unlikely(pte_present(pteval) && pte_uffd_wp(pteval))) @@ -596,10 +594,13 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, if (unlikely(pte_swp_uffd_wp_any(pteval))) arm_uffd_pte = true; - if (unlikely(arm_uffd_pte)) + if (unlikely(arm_uffd_pte)) { set_pte_at(vma->vm_mm, addr, pte, make_pte_marker(PTE_MARKER_UFFD_WP)); + return true; + } #endif + return false; } static inline bool vma_has_recency(struct vm_area_struct *vma) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 332cee285662..d366ec6302e6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -438,7 +439,9 @@ FOLIO_MATCH(compound_head, _head_2a); * struct ptdesc - Memory descriptor for page tables. * @__page_flags: Same as page flags. Powerpc only. * @pt_rcu_head: For freeing page table pages. - * @pt_list: List of used page tables. Used for s390 and x86. + * @pt_list: List of used page tables. Used for s390 gmap shadow pages + * (which are not linked into the user page tables) and x86 + * pgds. * @_pt_pad_1: Padding that aliases with page's compound head. * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs. * @__page_mapping: Aliases with page->mapping. Unused for page tables. @@ -571,6 +574,12 @@ static inline void *folio_get_private(struct folio *folio) typedef unsigned long vm_flags_t; +/* + * freeptr_t represents a SLUB freelist pointer, which might be encoded + * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. + */ +typedef struct { unsigned long v; } freeptr_t; + /* * A region containing a mapping of a non-memory backed file under NOMMU * conditions. 
These are held in a global tree and are pinned by the VMAs that @@ -627,9 +636,8 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name) } #endif -struct vma_lock { - struct rw_semaphore lock; -}; +#define VMA_LOCK_OFFSET 0x40000000 +#define VMA_REF_LIMIT (VMA_LOCK_OFFSET - 1) struct vma_numab_state { /* @@ -675,6 +683,9 @@ struct vma_numab_state { * * Only explicitly marked struct members may be accessed by RCU readers before * getting a stable reference. + * + * WARNING: when adding new members, please update vm_area_init_from() to copy + * them during vm_area_struct content duplication. */ struct vm_area_struct { /* The first cache line has the info for VMA tree walking. */ @@ -685,9 +696,7 @@ struct vm_area_struct { unsigned long vm_start; unsigned long vm_end; }; -#ifdef CONFIG_PER_VMA_LOCK - struct rcu_head vm_rcu; /* Used for deferred freeing. */ -#endif + freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */ }; /* @@ -707,19 +716,13 @@ struct vm_area_struct { }; #ifdef CONFIG_PER_VMA_LOCK - /* - * Flag to indicate areas detached from the mm->mm_mt tree. - * Unstable RCU readers are allowed to read this. - */ - bool detached; - /* * Can only be written (using WRITE_ONCE()) while holding both: * - mmap_lock (in write mode) - * - vm_lock->lock (in write mode) + * - vm_refcnt bit at VMA_LOCK_OFFSET is set * Can be read reliably while holding one of: * - mmap_lock (in read or write mode) - * - vm_lock->lock (in read or write mode) + * - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout * while holding nothing (except RCU to keep the VMA struct allocated). * @@ -727,21 +730,8 @@ struct vm_area_struct { * counter reuse can only lead to occasional unnecessary use of the * slowpath. */ - int vm_lock_seq; - /* Unstable RCU readers are allowed to read this. */ - struct vma_lock *vm_lock; + unsigned int vm_lock_seq; #endif - - /* - * For areas with an address space and backing store, - * linkage into the address_space->i_mmap interval tree. - * - */ - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } shared; - /* * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma * list, after a COW of one of the file pages. A MAP_SHARED vma @@ -761,14 +751,6 @@ struct vm_area_struct { struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ -#ifdef CONFIG_ANON_VMA_NAME - /* - * For private and shared anonymous mappings, a pointer to a null - * terminated string containing the name given to the vma, or NULL if - * unnamed. Serialized by mmap_lock. Use anon_vma_name to access. - */ - struct anon_vma_name *anon_name; -#endif #ifdef CONFIG_SWAP atomic_long_t swap_readahead_info; #endif @@ -780,6 +762,30 @@ struct vm_area_struct { #endif #ifdef CONFIG_NUMA_BALANCING struct vma_numab_state *numab_state; /* NUMA Balancing state */ +#endif +#ifdef CONFIG_PER_VMA_LOCK + /* Unstable RCU readers are allowed to read this. */ + refcount_t vm_refcnt ____cacheline_aligned_in_smp; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map vmlock_dep_map; +#endif +#endif + /* + * For areas with an address space and backing store, + * linkage into the address_space->i_mmap interval tree. 
+ * + */ + struct { + struct rb_node rb; + unsigned long rb_subtree_last; + } shared; +#ifdef CONFIG_ANON_VMA_NAME + /* + * For private and shared anonymous mappings, a pointer to a null + * terminated string containing the name given to the vma, or NULL if + * unnamed. Serialized by mmap_lock. Use anon_vma_name to access. + */ + struct anon_vma_name *anon_name; #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; } __randomize_layout; @@ -915,12 +921,16 @@ struct mm_struct { * by mmlist_lock */ #ifdef CONFIG_PER_VMA_LOCK + struct rcuwait vma_writer_wait; /* * This field has lock-like semantics, meaning it is sometimes * accessed with ACQUIRE/RELEASE semantics. * Roughly speaking, incrementing the sequence number is * equivalent to releasing locks on VMAs; reading the sequence * number can be part of taking a read lock on a VMA. + * Incremented every time mmap_lock is write-locked/unlocked. + * Initialized to 0, therefore odd values indicate mmap_lock + * is write-locked and even values that it's released. * * Can be modified under write mmap_lock using RELEASE * semantics. @@ -929,7 +939,7 @@ struct mm_struct { * Can be read with ACQUIRE semantics if not holding write * mmap_lock. */ - int mm_lock_seq; + seqcount_t mm_lock_seq; #endif diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index de9dc20b01ba..4706c6769902 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -71,6 +71,85 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm) } #ifdef CONFIG_PER_VMA_LOCK + +static inline void mm_lock_seqcount_init(struct mm_struct *mm) +{ + seqcount_init(&mm->mm_lock_seq); +} + +static inline void mm_lock_seqcount_begin(struct mm_struct *mm) +{ + do_raw_write_seqcount_begin(&mm->mm_lock_seq); +} + +static inline void mm_lock_seqcount_end(struct mm_struct *mm) +{ + ASSERT_EXCLUSIVE_WRITER(mm->mm_lock_seq); + do_raw_write_seqcount_end(&mm->mm_lock_seq); +} + +static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) +{ + /* + * Since mmap_lock is a sleeping lock, and waiting for it to become + * unlocked is more or less equivalent with taking it ourselves, don't + * bother with the speculative path if mmap_lock is already write-locked + * and take the slow path, which takes the lock. 
+ */ + return raw_seqcount_try_begin(&mm->mm_lock_seq, *seq); +} + +static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) +{ + return read_seqcount_retry(&mm->mm_lock_seq, seq); +} + +#else /* CONFIG_PER_VMA_LOCK */ + +static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} +static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {} +static inline void mm_lock_seqcount_end(struct mm_struct *mm) {} + +static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) +{ + return false; +} + +static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) +{ + return true; +} + +#endif /* CONFIG_PER_VMA_LOCK */ + +static inline void mmap_write_lock(struct mm_struct *mm) +{ + __mmap_lock_trace_start_locking(mm, true); + down_write(&mm->mmap_lock); + mm_lock_seqcount_begin(mm); + __mmap_lock_trace_acquire_returned(mm, true, true); +} + +static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) +{ + __mmap_lock_trace_start_locking(mm, true); + down_write_nested(&mm->mmap_lock, subclass); + mm_lock_seqcount_begin(mm); + __mmap_lock_trace_acquire_returned(mm, true, true); +} + +static inline int mmap_write_lock_killable(struct mm_struct *mm) +{ + int ret; + + __mmap_lock_trace_start_locking(mm, true); + ret = down_write_killable(&mm->mmap_lock); + if (!ret) + mm_lock_seqcount_begin(mm); + __mmap_lock_trace_acquire_returned(mm, true, ret == 0); + return ret; +} + /* * Drop all currently-held per-VMA locks. * This is called from the mmap_lock implementation directly before releasing @@ -82,46 +161,7 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm) static inline void vma_end_write_all(struct mm_struct *mm) { mmap_assert_write_locked(mm); - /* - * Nobody can concurrently modify mm->mm_lock_seq due to exclusive - * mmap_lock being held. - * We need RELEASE semantics here to ensure that preceding stores into - * the VMA take effect before we unlock it with this store. - * Pairs with ACQUIRE semantics in vma_start_read(). 
- */ - smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1); -} -#else -static inline void vma_end_write_all(struct mm_struct *mm) {} -#endif - -static inline void mmap_init_lock(struct mm_struct *mm) -{ - init_rwsem(&mm->mmap_lock); -} - -static inline void mmap_write_lock(struct mm_struct *mm) -{ - __mmap_lock_trace_start_locking(mm, true); - down_write(&mm->mmap_lock); - __mmap_lock_trace_acquire_returned(mm, true, true); -} - -static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) -{ - __mmap_lock_trace_start_locking(mm, true); - down_write_nested(&mm->mmap_lock, subclass); - __mmap_lock_trace_acquire_returned(mm, true, true); -} - -static inline int mmap_write_lock_killable(struct mm_struct *mm) -{ - int ret; - - __mmap_lock_trace_start_locking(mm, true); - ret = down_write_killable(&mm->mmap_lock); - __mmap_lock_trace_acquire_returned(mm, true, ret == 0); - return ret; + mm_lock_seqcount_end(mm); } static inline void mmap_write_unlock(struct mm_struct *mm) diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index d7cb1e5ecbda..a0a3894900ed 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -9,10 +9,12 @@ struct page; struct vm_area_struct; struct mm_struct; struct vma_iterator; +struct vma_merge_struct; void dump_page(const struct page *page, const char *reason); void dump_vma(const struct vm_area_struct *vma); void dump_mm(const struct mm_struct *mm); +void dump_vmg(const struct vma_merge_struct *vmg, const char *reason); void vma_iter_dump_tree(const struct vma_iterator *vmi); #ifdef CONFIG_DEBUG_VM @@ -87,6 +89,15 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi); } \ unlikely(__ret_warn_once); \ }) +#define VM_WARN_ON_VMG(cond, vmg) ({ \ + int __ret_warn = !!(cond); \ + \ + if (unlikely(__ret_warn)) { \ + dump_vmg(vmg, "VM_WARN_ON_VMG(" __stringify(cond)")"); \ + WARN_ON(1); \ + } \ + unlikely(__ret_warn); \ +}) #define VM_WARN_ON(cond) (void)WARN_ON(cond) #define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond) @@ -104,9 +115,10 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi); #define VM_WARN_ON_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE_MM(cond, mm) BUILD_BUG_ON_INVALID(cond) +#define VM_WARN_ON_VMG(cond, vmg) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) #define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond) -#endif +#endif /* CONFIG_DEBUG_VM */ #ifdef CONFIG_DEBUG_VM_IRQSOFF #define VM_WARN_ON_IRQS_ENABLED() WARN_ON_ONCE(!irqs_disabled()) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b36124145a16..9540b41894da 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -332,66 +332,88 @@ enum lruvec_flags { #endif /* !__GENERATING_BOUNDS_H */ /* - * Evictable pages are divided into multiple generations. The youngest and the + * Evictable folios are divided into multiple generations. The youngest and the * oldest generation numbers, max_seq and min_seq, are monotonically increasing. * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the * corresponding generation. The gen counter in folio->flags stores gen+1 while - * a page is on one of lrugen->folios[]. Otherwise it stores 0. + * a folio is on one of lrugen->folios[]. Otherwise it stores 0. * - * A page is added to the youngest generation on faulting. 
The aging needs to - * check the accessed bit at least twice before handing this page over to the - * eviction. The first check takes care of the accessed bit set on the initial - * fault; the second check makes sure this page hasn't been used since then. - * This process, AKA second chance, requires a minimum of two generations, - * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive - * LRU, e.g., /proc/vmstat, these two generations are considered active; the - * rest of generations, if they exist, are considered inactive. See - * lru_gen_is_active(). + * After a folio is faulted in, the aging needs to check the accessed bit at + * least twice before handing this folio over to the eviction. The first check + * clears the accessed bit from the initial fault; the second check makes sure + * this folio hasn't been used since then. This process, AKA second chance, + * requires a minimum of two generations, hence MIN_NR_GENS. And to maintain ABI + * compatibility with the active/inactive LRU, e.g., /proc/vmstat, these two + * generations are considered active; the rest of generations, if they exist, + * are considered inactive. See lru_gen_is_active(). * - * PG_active is always cleared while a page is on one of lrugen->folios[] so - * that the aging needs not to worry about it. And it's set again when a page - * considered active is isolated for non-reclaiming purposes, e.g., migration. - * See lru_gen_add_folio() and lru_gen_del_folio(). + * PG_active is always cleared while a folio is on one of lrugen->folios[] so + * that the sliding window needs not to worry about it. And it's set again when + * a folio considered active is isolated for non-reclaiming purposes, e.g., + * migration. See lru_gen_add_folio() and lru_gen_del_folio(). * * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the * number of categories of the active/inactive LRU when keeping track of * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits - * in folio->flags. + * in folio->flags, masked by LRU_GEN_MASK. */ #define MIN_NR_GENS 2U #define MAX_NR_GENS 4U /* - * Each generation is divided into multiple tiers. A page accessed N times - * through file descriptors is in tier order_base_2(N). A page in the first tier - * (N=0,1) is marked by PG_referenced unless it was faulted in through page - * tables or read ahead. A page in any other tier (N>1) is marked by - * PG_referenced and PG_workingset. This implies a minimum of two tiers is - * supported without using additional bits in folio->flags. + * Each generation is divided into multiple tiers. A folio accessed N times + * through file descriptors is in tier order_base_2(N). A folio in the first + * tier (N=0,1) is marked by PG_referenced unless it was faulted in through page + * tables or read ahead. A folio in the last tier (MAX_NR_TIERS-1) is marked by + * PG_workingset. A folio in any other tier (1<N<5) is marked by PG_referenced + * and additional bits of LRU_REFS_WIDTH in folio->flags. * * In contrast to moving across generations which requires the LRU lock, moving * across tiers only involves atomic operations on folio->flags and therefore * has a negligible cost in the buffered access path. In the eviction path, - * comparisons of refaulted/(evicted+protected) from the first tier and the - * rest infer whether pages accessed multiple times through file descriptors - * are statistically hot and thus worth protecting.
+ * comparisons of refaulted/(evicted+protected) from the first tier and the rest + * infer whether folios accessed multiple times through file descriptors are + * statistically hot and thus worth protecting. * * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the * number of categories of the active/inactive LRU when keeping track of * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in - * folio->flags. + * folio->flags, masked by LRU_REFS_MASK. */ #define MAX_NR_TIERS 4U #ifndef __GENERATING_BOUNDS_H -struct lruvec; -struct page_vma_mapped_walk; - #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) +/* + * For folios accessed multiple times through file descriptors, + * lru_gen_inc_refs() sets additional bits of LRU_REFS_WIDTH in folio->flags + * after PG_referenced, then PG_workingset after LRU_REFS_WIDTH. After all its + * bits are set, i.e., LRU_REFS_FLAGS|BIT(PG_workingset), a folio is lazily + * promoted into the second oldest generation in the eviction path. And when + * folio_inc_gen() does that, it clears LRU_REFS_FLAGS so that + * lru_gen_inc_refs() can start over. Note that for this case, LRU_REFS_MASK is + * only valid when PG_referenced is set. + * + * For folios accessed multiple times through page tables, folio_update_gen() + * from a page table walk or lru_gen_set_refs() from a rmap walk sets + * PG_referenced after the accessed bit is cleared for the first time. + * Thereafter, those two paths set PG_workingset and promote folios to the + * youngest generation. Like folio_inc_gen(), folio_update_gen() also clears + * PG_referenced. Note that for this case, LRU_REFS_MASK is not used. + * + * For both cases above, after PG_workingset is set on a folio, it remains until + * this folio is either reclaimed, or "deactivated" by lru_gen_clear_refs(). It + * can be set again if lru_gen_test_recent() returns true upon a refault. + */ +#define LRU_REFS_FLAGS (LRU_REFS_MASK | BIT(PG_referenced)) + +struct lruvec; +struct page_vma_mapped_walk; + #ifdef CONFIG_LRU_GEN enum { @@ -406,8 +428,6 @@ enum { NR_LRU_GEN_CAPS }; -#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) - #define MIN_LRU_BATCH BITS_PER_LONG #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) @@ -421,12 +441,11 @@ enum { /* * The youngest generation number is stored in max_seq for both anon and file * types as they are aged on an equal footing. The oldest generation numbers are - * stored in min_seq[] separately for anon and file types as clean file pages - * can be evicted regardless of swap constraints. - * - * Normally anon and file min_seq are in sync. But if swapping is constrained, - * e.g., out of swap space, file min_seq is allowed to advance and leave anon - * min_seq behind. + * stored in min_seq[] separately for anon and file types so that they can be + * incremented independently. Ideally min_seq[] are kept in sync when both anon + * and file types are evictable. However, to adapt to situations like extreme + * swappiness, they are allowed to be out of sync by at most + * MAX_NR_GENS-MIN_NR_GENS-1. * * The number of pages in each generation is eventually consistent and therefore * can be transiently negative when reset_batch_size() is pending. 
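[Editorial aside, not part of the patch: to make the tier arithmetic in the comment above concrete, here is a minimal user-space C sketch of the reworked refs-to-tier mapping. It assumes MAX_NR_TIERS = 4 as defined above; order_base_2() is reimplemented as ceil(log2(n)) to stand in for the kernel helper, and tier_from_refs() is a hypothetical mirror of the new lru_tier_from_refs(), not the kernel function itself.]

    #include <stdio.h>

    #define MAX_NR_TIERS 4U

    /* stand-in for the kernel's order_base_2(): ceil(log2(n)), 0 for n <= 1 */
    static unsigned int order_base_2(unsigned long n)
    {
            unsigned int order = 0;

            while ((1UL << order) < n)
                    order++;
            return order;
    }

    /*
     * Hypothetical mirror of the reworked lru_tier_from_refs(): a folio with
     * PG_workingset set goes straight to the last tier; otherwise the tier
     * follows logarithmically from the access count kept in LRU_REFS_MASK.
     */
    static unsigned int tier_from_refs(unsigned int refs, int workingset)
    {
            return workingset ? MAX_NR_TIERS - 1 : order_base_2(refs);
    }

    int main(void)
    {
            unsigned int refs;

            /* N=0,1 land in tier 0; 1<N<5 in tiers 1-2; PG_workingset in tier 3 */
            for (refs = 0; refs <= 5; refs++)
                    printf("refs=%u -> tier %u\n", refs, tier_from_refs(refs, 0));
            printf("workingset -> tier %u\n", tier_from_refs(1, 1));
            return 0;
    }

[The second-chance design is visible in the output: tiers grow only logarithmically with file-descriptor accesses, while PG_workingset short-circuits a known-hot folio to the top tier.]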
@@ -446,8 +465,8 @@ struct lru_gen_folio { unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; /* the exponential moving average of evicted+protected */ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; - /* the first tier doesn't need protection, hence the minus one */ - unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1]; + /* can only be modified under the LRU lock */ + unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; /* can be modified without holding the LRU lock */ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; @@ -498,7 +517,7 @@ struct lru_gen_mm_walk { int mm_stats[NR_MM_STATS]; /* total batched items */ int batched; - bool can_swap; + int swappiness; bool force_scan; }; diff --git a/include/linux/numa_memblks.h b/include/linux/numa_memblks.h index cfad6ce7e1bd..dd85613cdd86 100644 --- a/include/linux/numa_memblks.h +++ b/include/linux/numa_memblks.h @@ -29,7 +29,10 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi); int __init numa_memblks_init(int (*init_func)(void), bool memblock_force_top_down); +extern int numa_distance_cnt; + #ifdef CONFIG_NUMA_EMU +extern int emu_nid_to_phys[MAX_NUMNODES]; int numa_emu_cmdline(char *str); void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys, unsigned int nr_emu_nids); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 16fa8f0cea02..3f6a64ff968a 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -110,6 +110,7 @@ enum pageflags { PG_reclaim, /* To be reclaimed asap */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ + PG_dropbehind, /* drop pages on IO completion */ #ifdef CONFIG_MMU PG_mlocked, /* Page is vma mlocked */ #endif @@ -599,6 +600,10 @@ PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL) FOLIO_FLAG(readahead, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(readahead, FOLIO_HEAD_PAGE) +FOLIO_FLAG(dropbehind, FOLIO_HEAD_PAGE) + FOLIO_TEST_CLEAR_FLAG(dropbehind, FOLIO_HEAD_PAGE) + __FOLIO_SET_FLAG(dropbehind, FOLIO_HEAD_PAGE) + #ifdef CONFIG_HIGHMEM /* * Must use a macro here due to header dependency issues. page_zone() is not @@ -931,21 +936,9 @@ static inline int PageTransCompound(const struct page *page) { return PageCompound(page); } - -/* - * PageTransTail returns true for both transparent huge pages - * and hugetlbfs pages, so it should only be called when it's known - * that hugetlbfs pages aren't involved. - */ -static inline int PageTransTail(const struct page *page) -{ - return PageTail(page); -} #else TESTPAGEFLAG_FALSE(TransHuge, transhuge) TESTPAGEFLAG_FALSE(TransCompound, transcompound) -TESTPAGEFLAG_FALSE(TransCompoundMap, transcompoundmap) -TESTPAGEFLAG_FALSE(TransTail, transtail) #endif #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE) @@ -955,11 +948,9 @@ TESTPAGEFLAG_FALSE(TransTail, transtail) * * This flag is set by hwpoison handler. Cleared by THP split or free page. 
*/ -PAGEFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND) - TESTSCFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND) +FOLIO_FLAG(has_hwpoisoned, FOLIO_SECOND_PAGE) #else -PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) - TESTSCFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) +FOLIO_FLAG_FALSE(has_hwpoisoned) #endif /* diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 73dc2c1841ec..898bb788243b 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -31,7 +31,7 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page, int migratetype); int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, - int migratetype, int flags, gfp_t gfp_flags); + int migratetype, int flags); void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, int migratetype); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index bcf0865a38ae..d0be5f36082a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -210,6 +210,7 @@ enum mapping_flags { AS_STABLE_WRITES = 7, /* must wait for writeback before modifying folio contents */ AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */ + AS_WRITEBACK_INDETERMINATE = 9, /* Use caution when waiting on writeback */ /* Bits 16-25 are used for FOLIO_ORDER */ AS_FOLIO_ORDER_BITS = 5, AS_FOLIO_ORDER_MIN = 16, @@ -335,6 +336,16 @@ static inline bool mapping_inaccessible(struct address_space *mapping) return test_bit(AS_INACCESSIBLE, &mapping->flags); } +static inline void mapping_set_writeback_indeterminate(struct address_space *mapping) +{ + set_bit(AS_WRITEBACK_INDETERMINATE, &mapping->flags); +} + +static inline bool mapping_writeback_indeterminate(struct address_space *mapping) +{ + return test_bit(AS_WRITEBACK_INDETERMINATE, &mapping->flags); +} + static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { return mapping->gfp_mask; @@ -710,6 +721,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, * * %FGP_NOFS - __GFP_FS will get cleared in gfp. * * %FGP_NOWAIT - Don't block on the folio lock. * * %FGP_STABLE - Wait for the folio to be stable (finished writeback) + * * %FGP_DONTCACHE - Uncached buffered IO * * %FGP_WRITEBEGIN - The flags to use in a filesystem write_begin() * implementation. */ @@ -723,10 +735,21 @@ typedef unsigned int __bitwise fgf_t; #define FGP_NOWAIT ((__force fgf_t)0x00000020) #define FGP_FOR_MMAP ((__force fgf_t)0x00000040) #define FGP_STABLE ((__force fgf_t)0x00000080) +#define FGP_DONTCACHE ((__force fgf_t)0x00000100) #define FGF_GET_ORDER(fgf) (((__force unsigned)fgf) >> 26) /* top 6 bits */ #define FGP_WRITEBEGIN (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE) +static inline unsigned int filemap_get_order(size_t size) +{ + unsigned int shift = ilog2(size); + + if (shift <= PAGE_SHIFT) + return 0; + + return shift - PAGE_SHIFT; +} + /** * fgf_set_order - Encode a length in the fgf_t flags. * @size: The suggested size of the folio to create. 
@@ -740,11 +763,11 @@ typedef unsigned int __bitwise fgf_t; */ static inline fgf_t fgf_set_order(size_t size) { - unsigned int shift = ilog2(size); + unsigned int order = filemap_get_order(size); - if (shift <= PAGE_SHIFT) + if (!order) return 0; - return (__force fgf_t)((shift - PAGE_SHIFT) << 26); + return (__force fgf_t)(order << 26); } void *filemap_get_entry(struct address_space *mapping, pgoff_t index); @@ -1270,11 +1293,6 @@ void folio_end_private_2(struct folio *folio); void folio_wait_private_2(struct folio *folio); int folio_wait_private_2_killable(struct folio *folio); -/* - * Add an arbitrary waiter to a page's wait queue - */ -void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter); - /* * Fault in userspace address range. */ @@ -1353,6 +1371,7 @@ struct readahead_control { pgoff_t _index; unsigned int _nr_pages; unsigned int _batch_count; + bool dropbehind; bool _workingset; unsigned long _pflags; }; diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index ac8c44dd8237..c5e9cac0575e 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -33,7 +33,7 @@ struct disk_stats { #define part_stat_read(part, field) \ ({ \ - typeof((part)->bd_stats->field) res = 0; \ + TYPEOF_UNQUAL((part)->bd_stats->field) res = 0; \ unsigned int _cpu; \ for_each_possible_cpu(_cpu) \ res += per_cpu_ptr((part)->bd_stats, _cpu)->field; \ diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 5b520fe86b60..b4859e87846c 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -221,7 +221,7 @@ do { \ } while (0) #define PERCPU_PTR(__p) \ - (typeof(*(__p)) __force __kernel *)((__force unsigned long)(__p)) + ((TYPEOF_UNQUAL(*(__p)) __force __kernel *)(__force unsigned long)(__p)) #ifdef CONFIG_SMP @@ -317,7 +317,7 @@ static __always_inline void __this_cpu_preempt_check(const char *op) { } #define __pcpu_size_call_return(stem, variable) \ ({ \ - typeof(variable) pscr_ret__; \ + TYPEOF_UNQUAL(variable) pscr_ret__; \ __verify_pcpu_ptr(&(variable)); \ switch(sizeof(variable)) { \ case 1: pscr_ret__ = stem##1(variable); break; \ @@ -332,7 +332,7 @@ static __always_inline void __this_cpu_preempt_check(const char *op) { } #define __pcpu_size_call_return2(stem, variable, ...) \ ({ \ - typeof(variable) pscr2_ret__; \ + TYPEOF_UNQUAL(variable) pscr2_ret__; \ __verify_pcpu_ptr(&(variable)); \ switch(sizeof(variable)) { \ case 1: pscr2_ret__ = stem##1(variable, __VA_ARGS__); break; \ diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index adef9d6e9b1b..94d267d02372 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -533,7 +533,14 @@ static inline void clear_young_dirty_ptes(struct vm_area_struct *vma, static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - ptep_get_and_clear(mm, addr, ptep); + pte_t pte = ptep_get(ptep); + + pte_clear(mm, addr, ptep); + /* + * No need for ptep_get_and_clear(): page table check doesn't care about + * any bits that could have been set by HW concurrently. + */ + page_table_check_pte_clear(mm, pte); } #ifdef CONFIG_GUP_GET_PXX_LOW_HIGH diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h index 27343424225c..9ad134a04b41 100644 --- a/include/linux/rcuwait.h +++ b/include/linux/rcuwait.h @@ -4,18 +4,7 @@ #include <linux/rcupdate.h> #include <linux/sched/signal.h> - -/* - * rcuwait provides a way of blocking and waking up a single - * task in an rcu-safe manner.
- * - * The only time @task is non-nil is when a user is blocked (or - * checking if it needs to) on a condition, and reset as soon as we - * know that the condition has succeeded and are awoken. - */ -struct rcuwait { - struct task_struct __rcu *task; -}; +#include <linux/types.h> #define __RCUWAIT_INITIALIZER(name) \ { .task = NULL, } diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 35f039ecb272..5072ba99f05e 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -137,13 +137,23 @@ static inline unsigned int refcount_read(const refcount_t *r) } static inline __must_check __signed_wrap -bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp) +bool __refcount_add_not_zero_limited(int i, refcount_t *r, int *oldp, + int limit) { int old = refcount_read(r); do { if (!old) break; + + if (statically_true(limit == INT_MAX)) + continue; + + if (i > limit - old) { + if (oldp) + *oldp = old; + return false; + } } while (!atomic_try_cmpxchg_relaxed(&r->refs, &old, old + i)); if (oldp) @@ -155,6 +165,12 @@ bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp) return old; } +static inline __must_check __signed_wrap +bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp) +{ + return __refcount_add_not_zero_limited(i, r, oldp, INT_MAX); +} + /** * refcount_add_not_zero - add a value to a refcount unless it is 0 * @i: the value to add to the refcount @@ -213,6 +229,12 @@ static inline void refcount_add(int i, refcount_t *r) __refcount_add(i, r, NULL); } +static inline __must_check bool __refcount_inc_not_zero_limited(refcount_t *r, + int *oldp, int limit) +{ + return __refcount_add_not_zero_limited(1, r, oldp, limit); +} + static inline __must_check bool __refcount_inc_not_zero(refcount_t *r, int *oldp) { return __refcount_add_not_zero(1, r, oldp); diff --git a/include/linux/sched/hotplug.h b/include/linux/sched/hotplug.h index 412cdaba33eb..17e04859b9a4 100644 --- a/include/linux/sched/hotplug.h +++ b/include/linux/sched/hotplug.h @@ -18,10 +18,6 @@ extern int sched_cpu_dying(unsigned int cpu); # define sched_cpu_dying NULL #endif -#ifdef CONFIG_HOTPLUG_CPU -extern void idle_task_exit(void); -#else static inline void idle_task_exit(void) {} -#endif #endif /* _LINUX_SCHED_HOTPLUG_H */ diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 5298765d6ca4..22c2c48b4265 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -318,6 +318,28 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) __seq; \ }) +/** + * raw_seqcount_try_begin() - begin a seqcount_t read critical section + * w/o lockdep and w/o counter stabilization + * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants + * + * Similar to raw_seqcount_begin(), except it enables eliding the critical + * section entirely if odd, instead of doing the speculation knowing it will + * fail. + * + * Useful when counter stabilization is more or less equivalent to taking + * the lock and there is a slowpath that does that. + * + * If true, start will be set to the (even) sequence count read. + * + * Return: true when a read critical section is started.
+ */ +#define raw_seqcount_try_begin(s, start) \ +({ \ + start = raw_read_seqcount(s); \ + !(start & 1); \ +}) + /** * raw_seqcount_begin() - begin a seqcount_t read critical section w/o * lockdep and w/o counter stabilization diff --git a/include/linux/slab.h b/include/linux/slab.h index 10a971c2bde3..681b685b6c4e 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -234,12 +234,6 @@ enum _slab_flag_bits { #define SLAB_NO_OBJ_EXT __SLAB_FLAG_UNUSED #endif -/* - * freeptr_t represents a SLUB freelist pointer, which might be encoded - * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. - */ -typedef struct { unsigned long v; } freeptr_t; - /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. * diff --git a/include/linux/swap.h b/include/linux/swap.h index 187715eec3cb..b13b72645db3 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -219,7 +219,6 @@ enum { SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ /* add others here before... */ - SWP_SCANNING = (1 << 14), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL @@ -258,18 +257,27 @@ struct swap_cluster_info { u8 order; struct list_head list; }; -#define CLUSTER_FLAG_FREE 1 /* This cluster is free */ -#define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */ -#define CLUSTER_FLAG_FRAG 4 /* This cluster is on nonfull list */ -#define CLUSTER_FLAG_FULL 8 /* This cluster is on full list */ + +/* All on-list cluster must have a non-zero flag. */ +enum swap_cluster_flags { + CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */ + CLUSTER_FLAG_FREE, + CLUSTER_FLAG_NONFULL, + CLUSTER_FLAG_FRAG, + /* Clusters with flags above are allocatable */ + CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG, + CLUSTER_FLAG_FULL, + CLUSTER_FLAG_DISCARD, + CLUSTER_FLAG_MAX, +}; /* * The first page in the swap file is the swap header, which is always marked * bad to prevent it from being allocated as an entry. This also prevents the * cluster to which it belongs being marked free. Therefore 0 is safe to use as - * a sentinel to indicate next is not valid in percpu_cluster. + * a sentinel to indicate an entry is not valid. */ -#define SWAP_NEXT_INVALID 0 +#define SWAP_ENTRY_INVALID 0 #ifdef CONFIG_THP_SWAP #define SWAP_NR_ORDERS (PMD_ORDER + 1) @@ -283,6 +291,7 @@ struct swap_cluster_info { * throughput. 
*/ struct percpu_cluster { + local_lock_t lock; /* Protect the percpu_cluster above */ unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ }; @@ -305,15 +314,12 @@ struct swap_info_struct { /* list of cluster that contains at least one free slot */ struct list_head frag_clusters[SWAP_NR_ORDERS]; /* list of cluster that are fragmented or contented */ - unsigned int frag_cluster_nr[SWAP_NR_ORDERS]; - unsigned int lowest_bit; /* index of first free in swap_map */ - unsigned int highest_bit; /* index of last free in swap_map */ + atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS]; unsigned int pages; /* total of usable pages of swap */ - unsigned int inuse_pages; /* number of those currently in use */ - unsigned int cluster_next; /* likely index for next allocation */ - unsigned int cluster_nr; /* countdown to next cluster search */ - unsigned int __percpu *cluster_next_cpu; /*percpu index for next allocation */ + atomic_long_t inuse_pages; /* number of those currently in use */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ + struct percpu_cluster *global_cluster; /* Use one global cluster for rotating device */ + spinlock_t global_cluster_lock; /* Serialize usage of global cluster */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ struct file *swap_file; /* seldom referenced */ diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h index ae73a87775b3..b5ec038069da 100644 --- a/include/linux/swap_cgroup.h +++ b/include/linux/swap_cgroup.h @@ -6,10 +6,8 @@ #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) -extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, - unsigned short old, unsigned short new); -extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, - unsigned int nr_ents); +extern void swap_cgroup_record(struct folio *folio, swp_entry_t ent); +extern unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents); extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent); extern int swap_cgroup_swapon(int type, unsigned long max_pages); extern void swap_cgroup_swapoff(int type); @@ -17,8 +15,12 @@ extern void swap_cgroup_swapoff(int type); #else static inline -unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, - unsigned int nr_ents) +void swap_cgroup_record(struct folio *folio, swp_entry_t ent) +{ +} + +static inline +unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents) { return 0; } diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h index 15adfb8c813a..840aec3523b2 100644 --- a/include/linux/swap_slots.h +++ b/include/linux/swap_slots.h @@ -16,15 +16,12 @@ struct swap_slots_cache { swp_entry_t *slots; int nr; int cur; - spinlock_t free_lock; /* protects slots_ret, n_ret */ - swp_entry_t *slots_ret; int n_ret; }; void disable_swap_slots_cache_lock(void); void reenable_swap_slots_cache_unlock(void); void enable_swap_slots_cache(void); -void free_swap_slot(swp_entry_t entry); extern bool swap_slot_cache_enabled; diff --git a/include/linux/task_work.h b/include/linux/task_work.h index 2964171856e0..0646804860ff 100644 --- a/include/linux/task_work.h +++ b/include/linux/task_work.h @@ -19,9 +19,6 @@ enum task_work_notify_mode { TWA_SIGNAL, TWA_SIGNAL_NO_IPI, TWA_NMI_CURRENT, - - TWA_FLAGS = 0xff00, - TWAF_NO_ALLOC = 0x0100, }; static inline bool task_work_pending(struct task_struct *task) diff --git a/include/linux/types.h b/include/linux/types.h 
index 1c509ce8f7f6..a3d2182c2686 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -248,5 +248,17 @@ typedef void (*swap_func_t)(void *a, void *b, int size); typedef int (*cmp_r_func_t)(const void *a, const void *b, const void *priv); typedef int (*cmp_func_t)(const void *a, const void *b); +/* + * rcuwait provides a way of blocking and waking up a single + * task in an rcu-safe manner. + * + * The only time @task is non-nil is when a user is blocked (or + * checking if it needs to) on a condition, and reset as soon as we + * know that the condition has succeeded and are awoken. + */ +struct rcuwait { + struct task_struct __rcu *task; +}; + #endif /* __ASSEMBLY__ */ #endif /* _LINUX_TYPES_H */ diff --git a/include/net/snmp.h b/include/net/snmp.h index 468a67836e2f..4cb4326dfebe 100644 --- a/include/net/snmp.h +++ b/include/net/snmp.h @@ -159,7 +159,7 @@ struct linux_tls_mib { #define __SNMP_ADD_STATS64(mib, field, addend) \ do { \ - __typeof__(*mib) *ptr = raw_cpu_ptr(mib); \ + TYPEOF_UNQUAL(*mib) *ptr = raw_cpu_ptr(mib); \ u64_stats_update_begin(&ptr->syncp); \ ptr->mibs[field] += addend; \ u64_stats_update_end(&ptr->syncp); \ @@ -176,8 +176,7 @@ struct linux_tls_mib { #define SNMP_INC_STATS64(mib, field) SNMP_ADD_STATS64(mib, field, 1) #define __SNMP_UPD_PO_STATS64(mib, basefield, addend) \ do { \ - __typeof__(*mib) *ptr; \ - ptr = raw_cpu_ptr((mib)); \ + TYPEOF_UNQUAL(*mib) *ptr = raw_cpu_ptr(mib); \ u64_stats_update_begin(&ptr->syncp); \ ptr->mibs[basefield##PKTS]++; \ ptr->mibs[basefield##OCTETS] += addend; \ diff --git a/include/trace/events/mmap_lock.h b/include/trace/events/mmap_lock.h index bc2e3ad787b3..cf9f9faf8914 100644 --- a/include/trace/events/mmap_lock.h +++ b/include/trace/events/mmap_lock.h @@ -5,6 +5,7 @@ #if !defined(_TRACE_MMAP_LOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_MMAP_LOCK_H +#include #include #include @@ -12,64 +13,61 @@ struct mm_struct; DECLARE_EVENT_CLASS(mmap_lock, - TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write), + TP_PROTO(struct mm_struct *mm, bool write), - TP_ARGS(mm, memcg_path, write), + TP_ARGS(mm, write), TP_STRUCT__entry( __field(struct mm_struct *, mm) - __string(memcg_path, memcg_path) + __field(u64, memcg_id) __field(bool, write) ), TP_fast_assign( __entry->mm = mm; - __assign_str(memcg_path); + __entry->memcg_id = cgroup_id_from_mm(mm); __entry->write = write; ), TP_printk( - "mm=%p memcg_path=%s write=%s", - __entry->mm, - __get_str(memcg_path), + "mm=%p memcg_id=%llu write=%s", + __entry->mm, __entry->memcg_id, __entry->write ? 
"true" : "false" ) ); #define DEFINE_MMAP_LOCK_EVENT(name) \ DEFINE_EVENT(mmap_lock, name, \ - TP_PROTO(struct mm_struct *mm, const char *memcg_path, \ - bool write), \ - TP_ARGS(mm, memcg_path, write)) + TP_PROTO(struct mm_struct *mm, bool write), \ + TP_ARGS(mm, write)) DEFINE_MMAP_LOCK_EVENT(mmap_lock_start_locking); DEFINE_MMAP_LOCK_EVENT(mmap_lock_released); TRACE_EVENT(mmap_lock_acquire_returned, - TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write, - bool success), + TP_PROTO(struct mm_struct *mm, bool write, bool success), - TP_ARGS(mm, memcg_path, write, success), + TP_ARGS(mm, write, success), TP_STRUCT__entry( __field(struct mm_struct *, mm) - __string(memcg_path, memcg_path) + __field(u64, memcg_id) __field(bool, write) __field(bool, success) ), TP_fast_assign( __entry->mm = mm; - __assign_str(memcg_path); + __entry->memcg_id = cgroup_id_from_mm(mm); __entry->write = write; __entry->success = success; ), TP_printk( - "mm=%p memcg_path=%s write=%s success=%s", + "mm=%p memcg_id=%llu write=%s success=%s", __entry->mm, - __get_str(memcg_path), + __entry->memcg_id, __entry->write ? "true" : "false", __entry->success ? "true" : "false" ) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index bb8a59c6caa2..3bc8656c8359 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -116,7 +116,8 @@ DEF_PAGEFLAG_NAME(head), \ DEF_PAGEFLAG_NAME(reclaim), \ DEF_PAGEFLAG_NAME(swapbacked), \ - DEF_PAGEFLAG_NAME(unevictable) \ + DEF_PAGEFLAG_NAME(unevictable), \ + DEF_PAGEFLAG_NAME(dropbehind) \ IF_HAVE_PG_MLOCK(mlocked) \ IF_HAVE_PG_HWPOISON(hwpoison) \ IF_HAVE_PG_IDLE(idle) \ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 753971770733..56a4f93a08f4 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -332,9 +332,13 @@ typedef int __bitwise __kernel_rwf_t; /* Atomic Write */ #define RWF_ATOMIC ((__force __kernel_rwf_t)0x00000040) +/* buffered IO that drops the cache after reading or writing data */ +#define RWF_DONTCACHE ((__force __kernel_rwf_t)0x00000080) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC) + RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC |\ + RWF_DONTCACHE) #define PROCFS_IOCTL_MAGIC 'f' diff --git a/init/Kconfig b/init/Kconfig index a20e6efd3f0f..c1f9eb3d5f2e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -894,6 +894,9 @@ config ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH config CC_HAS_INT128 def_bool !$(cc-option,$(m64-flag) -D__SIZEOF_INT128__=0) && 64BIT +config CC_HAS_TYPEOF_UNQUAL + def_bool $(success,echo 'int foo (int a) { __typeof_unqual__(a) b = a; return b; }' | $(CC) -x c - -S -o /dev/null) + config CC_IMPLICIT_FALLTHROUGH string default "-Wimplicit-fallthrough=5" if CC_IS_GCC && $(cc-option,-Wimplicit-fallthrough=5) diff --git a/init/main.c b/init/main.c index 00fac1170294..4bae539ebc05 100644 --- a/init/main.c +++ b/init/main.c @@ -640,15 +640,11 @@ static void __init setup_command_line(char *command_line) len = xlen + strlen(boot_command_line) + ilen + 1; - saved_command_line = memblock_alloc(len, SMP_CACHE_BYTES); - if (!saved_command_line) - panic("%s: Failed to allocate %zu bytes\n", __func__, len); + saved_command_line = memblock_alloc_or_panic(len, SMP_CACHE_BYTES); len = xlen + strlen(command_line) + 1; - static_command_line = memblock_alloc(len, SMP_CACHE_BYTES); - if (!static_command_line) - panic("%s: Failed to allocate %zu bytes\n", __func__, len); + 
static_command_line = memblock_alloc_or_panic(len, SMP_CACHE_BYTES); if (xlen) { /* @@ -1145,16 +1141,10 @@ static int __init initcall_blacklist(char *str) str_entry = strsep(&str, ","); if (str_entry) { pr_debug("blacklisting initcall %s\n", str_entry); - entry = memblock_alloc(sizeof(*entry), + entry = memblock_alloc_or_panic(sizeof(*entry), SMP_CACHE_BYTES); - if (!entry) - panic("%s: Failed to allocate %zu bytes\n", - __func__, sizeof(*entry)); - entry->buf = memblock_alloc(strlen(str_entry) + 1, + entry->buf = memblock_alloc_or_panic(strlen(str_entry) + 1, SMP_CACHE_BYTES); - if (!entry->buf) - panic("%s: Failed to allocate %zu bytes\n", - __func__, strlen(str_entry) + 1); strcpy(entry->buf, str_entry); list_add(&entry->next, &blacklisted_initcalls); } diff --git a/ipc/util.c b/ipc/util.c index 05cb9de66735..cae60f11d9c2 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -615,12 +615,11 @@ void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out) } /** - * ipc_obtain_object_idr + * ipc_obtain_object_idr - Look for an id in the ipc ids idr and + * return associated ipc object. * @ids: ipc identifier set * @id: ipc id to look for * - * Look for an id in the ipc ids idr and return associated ipc object. - * * Call inside the RCU critical section. * The ipc object is *not* locked on exit. */ @@ -637,13 +636,11 @@ struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id) } /** - * ipc_obtain_object_check + * ipc_obtain_object_check - Similar to ipc_obtain_object_idr() but + * also checks the ipc object sequence number. * @ids: ipc identifier set * @id: ipc id to look for * - * Similar to ipc_obtain_object_idr() but also checks the ipc object - * sequence number. - * * Call inside the RCU critical section. * The ipc object is *not* locked on exit. */ diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 945a5680f6a5..9927cd4c9e0e 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -443,7 +443,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt return 0; } - /* zeroing is needed, since alloc_pages_bulk_array() only fills in non-zero entries */ + /* zeroing is needed, since alloc_pages_bulk() only fills in non-zero entries */ pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL); if (!pages) return 0; diff --git a/kernel/cpu.c b/kernel/cpu.c index b605334f8ee6..7f3bf759cbdf 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -905,12 +905,13 @@ static int finish_cpu(unsigned int cpu) struct mm_struct *mm = idle->active_mm; /* - * idle_task_exit() will have switched to &init_mm, now - * clean up any remaining active_mm state. + * sched_force_init_mm() ensured the use of &init_mm, + * drop that refcount now that the CPU has stopped. 
*/ - if (mm != &init_mm) - idle->active_mm = &init_mm; + WARN_ON(mm != &init_mm); + idle->active_mm = NULL; mmdrop_lazy_tlb(mm); + return 0; } diff --git a/kernel/fork.c b/kernel/fork.c index 19d7fe31869b..5a3dab459d8a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -436,35 +436,6 @@ static struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -#ifdef CONFIG_PER_VMA_LOCK - -/* SLAB cache for vm_area_struct.lock */ -static struct kmem_cache *vma_lock_cachep; - -static bool vma_lock_alloc(struct vm_area_struct *vma) -{ - vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL); - if (!vma->vm_lock) - return false; - - init_rwsem(&vma->vm_lock->lock); - vma->vm_lock_seq = -1; - - return true; -} - -static inline void vma_lock_free(struct vm_area_struct *vma) -{ - kmem_cache_free(vma_lock_cachep, vma->vm_lock); -} - -#else /* CONFIG_PER_VMA_LOCK */ - -static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; } -static inline void vma_lock_free(struct vm_area_struct *vma) {} - -#endif /* CONFIG_PER_VMA_LOCK */ - struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) { struct vm_area_struct *vma; @@ -474,14 +445,46 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) return NULL; vma_init(vma, mm); - if (!vma_lock_alloc(vma)) { - kmem_cache_free(vm_area_cachep, vma); - return NULL; - } return vma; } +static void vm_area_init_from(const struct vm_area_struct *src, + struct vm_area_struct *dest) +{ + dest->vm_mm = src->vm_mm; + dest->vm_ops = src->vm_ops; + dest->vm_start = src->vm_start; + dest->vm_end = src->vm_end; + dest->anon_vma = src->anon_vma; + dest->vm_pgoff = src->vm_pgoff; + dest->vm_file = src->vm_file; + dest->vm_private_data = src->vm_private_data; + vm_flags_init(dest, src->vm_flags); + memcpy(&dest->vm_page_prot, &src->vm_page_prot, + sizeof(dest->vm_page_prot)); + /* + * src->shared.rb may be modified concurrently when called from + * dup_mmap(), but the clone will reinitialize it. + */ + data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared))); + memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx, + sizeof(dest->vm_userfaultfd_ctx)); +#ifdef CONFIG_ANON_VMA_NAME + dest->anon_name = src->anon_name; +#endif +#ifdef CONFIG_SWAP + memcpy(&dest->swap_readahead_info, &src->swap_readahead_info, + sizeof(dest->swap_readahead_info)); +#endif +#ifndef CONFIG_MMU + dest->vm_region = src->vm_region; +#endif +#ifdef CONFIG_NUMA + dest->vm_policy = src->vm_policy; +#endif +} + struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) { struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); @@ -491,15 +494,8 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); ASSERT_EXCLUSIVE_WRITER(orig->vm_file); - /* - * orig->shared.rb may be modified concurrently, but the clone - * will be reinitialized. 
- */ - data_race(memcpy(new, orig, sizeof(*new))); - if (!vma_lock_alloc(new)) { - kmem_cache_free(vm_area_cachep, new); - return NULL; - } + vm_area_init_from(orig, new); + vma_lock_init(new, true); INIT_LIST_HEAD(&new->anon_vma_chain); vma_numab_state_init(new); dup_anon_vma_name(orig, new); @@ -507,33 +503,13 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) return new; } -void __vm_area_free(struct vm_area_struct *vma) -{ - vma_numab_state_free(vma); - free_anon_vma_name(vma); - vma_lock_free(vma); - kmem_cache_free(vm_area_cachep, vma); -} - -#ifdef CONFIG_PER_VMA_LOCK -static void vm_area_free_rcu_cb(struct rcu_head *head) -{ - struct vm_area_struct *vma = container_of(head, struct vm_area_struct, - vm_rcu); - - /* The vma should not be locked while being destroyed. */ - VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma); - __vm_area_free(vma); -} -#endif - void vm_area_free(struct vm_area_struct *vma) { -#ifdef CONFIG_PER_VMA_LOCK - call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb); -#else - __vm_area_free(vma); -#endif + /* The vma should be detached while being destroyed. */ + vma_assert_detached(vma); + vma_numab_state_free(vma); + free_anon_vma_name(vma); + kmem_cache_free(vm_area_cachep, vma); } static void account_kernel_stack(struct task_struct *tsk, int account) @@ -1252,6 +1228,15 @@ static void mm_init_uprobes_state(struct mm_struct *mm) #endif } +static inline void mmap_init_lock(struct mm_struct *mm) +{ + init_rwsem(&mm->mmap_lock); + mm_lock_seqcount_init(mm); +#ifdef CONFIG_PER_VMA_LOCK + rcuwait_init(&mm->vma_writer_wait); +#endif +} + static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { @@ -1262,9 +1247,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, seqcount_init(&mm->write_protect_seq); mmap_init_lock(mm); INIT_LIST_HEAD(&mm->mmlist); -#ifdef CONFIG_PER_VMA_LOCK - mm->mm_lock_seq = 0; -#endif mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; @@ -3176,6 +3158,11 @@ void __init mm_cache_init(void) void __init proc_caches_init(void) { + struct kmem_cache_args args = { + .use_freeptr_offset = true, + .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr), + }; + sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| @@ -3192,11 +3179,10 @@ void __init proc_caches_init(void) sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); - - vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); -#ifdef CONFIG_PER_VMA_LOCK - vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT); -#endif + vm_area_cachep = kmem_cache_create("vm_area_struct", + sizeof(struct vm_area_struct), &args, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| + SLAB_ACCOUNT); mmap_init(); nsproxy_cache_init(); } diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c index 7670a811a565..8b888a6193cc 100644 --- a/kernel/gcov/clang.c +++ b/kernel/gcov/clang.c @@ -264,10 +264,10 @@ int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2) /** * gcov_info_add - add up profiling data - * @dest: profiling data set to which data is added - * @source: profiling data set which is added + * @dst: profiling data set to which data is added + * @src: profiling data set which is added * - * Adds profiling counts of @source to @dest. + * Adds profiling counts of @src to @dst. 
*/ void gcov_info_add(struct gcov_info *dst, struct gcov_info *src) { diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 2f4fb336dda1..73f7e1fd4ab4 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -147,7 +147,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) if (!irq_work_claim(work)) return false; - kasan_record_aux_stack_noalloc(work); + kasan_record_aux_stack(work); preempt_disable(); if (cpu != smp_processor_id()) { diff --git a/kernel/kthread.c b/kernel/kthread.c index 2fd0daa6b3b6..f847be88bf07 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -900,7 +900,7 @@ fail_task: } /** - * kthread_create_worker - create a kthread worker + * kthread_create_worker_on_node - create a kthread worker * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the kthread worker (task). * diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 7a75eab9c179..77ee3ea8a573 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -158,9 +158,9 @@ account_global_scheduler_latency(struct task_struct *tsk, /** * __account_scheduler_latency - record an occurred latency - * @tsk - the task struct of the task hitting the latency - * @usecs - the duration of the latency in microseconds - * @inter - 1 if the sleep was interruptible, 0 if uninterruptible + * @tsk: the task struct of the task hitting the latency + * @usecs: the duration of the latency in microseconds + * @inter: 1 if the sleep was interruptible, 0 if uninterruptible * * This function is the main entry point for recording latency entries * as called by the scheduler. diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 6083883c4fe0..d6964fc29f51 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -184,7 +184,7 @@ EXPORT_SYMBOL_GPL(__percpu_down_read); #define per_cpu_sum(var) \ ({ \ - typeof(var) __sum = 0; \ + TYPEOF_UNQUAL(var) __sum = 0; \ int cpu; \ compiletime_assert_atomic_type(__sum); \ for_each_possible_cpu(cpu) \ diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 30894d8f0a78..c9fb559a6399 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1011,11 +1011,8 @@ void __init register_nosave_region(unsigned long start_pfn, unsigned long end_pf } } /* This allocation cannot fail */ - region = memblock_alloc(sizeof(struct nosave_region), + region = memblock_alloc_or_panic(sizeof(struct nosave_region), SMP_CACHE_BYTES); - if (!region) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(struct nosave_region)); region->start_pfn = start_pfn; region->end_pfn = end_pfn; list_add_tail(&region->list, &nosave_regions); diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index b3b3ce34df63..4b3f31911465 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -250,7 +250,7 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); void kvfree_call_rcu(struct rcu_head *head, void *ptr) { if (head) - kasan_record_aux_stack_noalloc(ptr); + kasan_record_aux_stack(ptr); __kvfree_call_rcu(head, ptr); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ff98233d4aa5..3885aae5f9cb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3083,7 +3083,7 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) } head->func = func; head->next = NULL; - kasan_record_aux_stack_noalloc(head); + kasan_record_aux_stack(head); local_irq_save(flags); rdp = this_cpu_ptr(&rcu_data); lazy = lazy_in && !rcu_async_should_hurry(); @@ -3817,7 +3817,7 @@ void kvfree_call_rcu(struct
rcu_head *head, void *ptr) return; } - kasan_record_aux_stack_noalloc(ptr); + kasan_record_aux_stack(ptr); success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head); if (!success) { run_page_cache_worker(krcp); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 109b5df48e8b..aeeeb68e6809 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7930,19 +7930,26 @@ void sched_setnuma(struct task_struct *p, int nid) #ifdef CONFIG_HOTPLUG_CPU /* - * Ensure that the idle task is using init_mm right before its CPU goes - * offline. + * Invoked on the outgoing CPU in context of the CPU hotplug thread + * after ensuring that there are no user space tasks left on the CPU. + * + * If there is a lazy mm in use on the hotplug thread, drop it and + * switch to init_mm. + * + * The reference count on init_mm is dropped in finish_cpu(). */ -void idle_task_exit(void) +static void sched_force_init_mm(void) { struct mm_struct *mm = current->active_mm; - BUG_ON(cpu_online(smp_processor_id())); - BUG_ON(current != this_rq()->idle); - if (mm != &init_mm) { - switch_mm(mm, &init_mm, current); + mmgrab_lazy_tlb(&init_mm); + local_irq_disable(); + current->active_mm = &init_mm; + switch_mm_irqs_off(mm, &init_mm, current); + local_irq_enable(); finish_arch_post_lock_switch(); + mmdrop_lazy_tlb(mm); } /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ @@ -8344,6 +8351,7 @@ int sched_cpu_starting(unsigned int cpu) int sched_cpu_wait_empty(unsigned int cpu) { balance_hotplug_wait(); + sched_force_init_mm(); return 0; } @@ -10590,7 +10598,7 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) return; /* No page allocation under rq lock */ - task_work_add(curr, work, TWA_RESUME | TWAF_NO_ALLOC); + task_work_add(curr, work, TWA_RESUME); } void sched_mm_cid_exit_signals(struct task_struct *t) diff --git a/kernel/task_work.c b/kernel/task_work.c index c969f1f26be5..d1efec571a4a 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -55,26 +55,14 @@ int task_work_add(struct task_struct *task, struct callback_head *work, enum task_work_notify_mode notify) { struct callback_head *head; - int flags = notify & TWA_FLAGS; - notify &= ~TWA_FLAGS; if (notify == TWA_NMI_CURRENT) { if (WARN_ON_ONCE(task != current)) return -EINVAL; if (!IS_ENABLED(CONFIG_IRQ_WORK)) return -EINVAL; } else { - /* - * Record the work call stack in order to print it in KASAN - * reports. - * - * Note that stack allocation can fail if TWAF_NO_ALLOC flag - * is set and new page is needed to expand the stack buffer. 
- */ - if (flags & TWAF_NO_ALLOC) - kasan_record_aux_stack_noalloc(work); - else - kasan_record_aux_stack(work); + kasan_record_aux_stack(work); } head = READ_ONCE(task->task_works); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9362484a653c..56bd89a7fde4 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2180,7 +2180,7 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, debug_work_activate(work); /* record the work call stack in order to print it in KASAN reports */ - kasan_record_aux_stack_noalloc(work); + kasan_record_aux_stack(work); /* we own @work, set data and link */ set_work_pwq(work, pwq, extra_flags); diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 65e706e1bc19..19b45617bdcf 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -29,6 +29,8 @@ EXPORT_SYMBOL(_shared_alloc_tag); DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, mem_alloc_profiling_key); +EXPORT_SYMBOL(mem_alloc_profiling_key); + DEFINE_STATIC_KEY_FALSE(mem_profiling_compressed); struct alloc_tag_kernel_section kernel_tags = { NULL, 0 }; @@ -423,8 +425,8 @@ static int vm_module_tags_populate(void) unsigned long nr; more_pages = ALIGN(new_end - phys_end, PAGE_SIZE) >> PAGE_SHIFT; - nr = alloc_pages_bulk_array_node(GFP_KERNEL | __GFP_NOWARN, - NUMA_NO_NODE, more_pages, next_page); + nr = alloc_pages_bulk_node(GFP_KERNEL | __GFP_NOWARN, + NUMA_NO_NODE, more_pages, next_page); if (nr < more_pages || vmap_pages_range(phys_end, phys_end + (nr << PAGE_SHIFT), PAGE_KERNEL, next_page, PAGE_SHIFT) < 0) { diff --git a/lib/cpumask.c b/lib/cpumask.c index e77ee9d46f71..57274ba8b6d9 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -83,10 +83,7 @@ EXPORT_SYMBOL(alloc_cpumask_var_node); */ void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask) { - *mask = memblock_alloc(cpumask_size(), SMP_CACHE_BYTES); - if (!*mask) - panic("%s: Failed to allocate %u bytes\n", __func__, - cpumask_size()); + *mask = memblock_alloc_or_panic(cpumask_size(), SMP_CACHE_BYTES); } /** diff --git a/lib/kunit_iov_iter.c b/lib/kunit_iov_iter.c index 497d86e039f6..27d3c7f00ede 100644 --- a/lib/kunit_iov_iter.c +++ b/lib/kunit_iov_iter.c @@ -57,7 +57,7 @@ static void *__init iov_kunit_create_buffer(struct kunit *test, KUNIT_ASSERT_NOT_ERR_OR_NULL(test, pages); *ppages = pages; - got = alloc_pages_bulk_array(GFP_KERNEL, npages, pages); + got = alloc_pages_bulk(GFP_KERNEL, npages, pages); if (got != npages) { release_pages(pages, got); KUNIT_ASSERT_EQ(test, got, npages); diff --git a/lib/list_debug.c b/lib/list_debug.c index db602417febf..ee7eeeb8f92c 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -22,17 +22,17 @@ __list_valid_slowpath bool __list_add_valid_or_report(struct list_head *new, struct list_head *prev, struct list_head *next) { - if (CHECK_DATA_CORRUPTION(prev == NULL, + if (CHECK_DATA_CORRUPTION(prev == NULL, NULL, "list_add corruption. prev is NULL.\n") || - CHECK_DATA_CORRUPTION(next == NULL, + CHECK_DATA_CORRUPTION(next == NULL, NULL, "list_add corruption. next is NULL.\n") || - CHECK_DATA_CORRUPTION(next->prev != prev, + CHECK_DATA_CORRUPTION(next->prev != prev, next, "list_add corruption. next->prev should be prev (%px), but was %px. (next=%px).\n", prev, next->prev, next) || - CHECK_DATA_CORRUPTION(prev->next != next, + CHECK_DATA_CORRUPTION(prev->next != next, prev, "list_add corruption. prev->next should be next (%px), but was %px. 
(prev=%px).\n", next, prev->next, prev) || - CHECK_DATA_CORRUPTION(new == prev || new == next, + CHECK_DATA_CORRUPTION(new == prev || new == next, NULL, "list_add double add: new=%px, prev=%px, next=%px.\n", new, prev, next)) return false; @@ -49,20 +49,20 @@ bool __list_del_entry_valid_or_report(struct list_head *entry) prev = entry->prev; next = entry->next; - if (CHECK_DATA_CORRUPTION(next == NULL, + if (CHECK_DATA_CORRUPTION(next == NULL, NULL, "list_del corruption, %px->next is NULL\n", entry) || - CHECK_DATA_CORRUPTION(prev == NULL, + CHECK_DATA_CORRUPTION(prev == NULL, NULL, "list_del corruption, %px->prev is NULL\n", entry) || - CHECK_DATA_CORRUPTION(next == LIST_POISON1, + CHECK_DATA_CORRUPTION(next == LIST_POISON1, next, "list_del corruption, %px->next is LIST_POISON1 (%px)\n", entry, LIST_POISON1) || - CHECK_DATA_CORRUPTION(prev == LIST_POISON2, + CHECK_DATA_CORRUPTION(prev == LIST_POISON2, prev, "list_del corruption, %px->prev is LIST_POISON2 (%px)\n", entry, LIST_POISON2) || - CHECK_DATA_CORRUPTION(prev->next != entry, + CHECK_DATA_CORRUPTION(prev->next != entry, prev, "list_del corruption. prev->next should be %px, but was %px. (prev=%px)\n", entry, prev->next, prev) || - CHECK_DATA_CORRUPTION(next->prev != entry, + CHECK_DATA_CORRUPTION(next->prev != entry, next, "list_del corruption. next->prev should be %px, but was %px. (next=%px)\n", entry, next->prev, next)) return false; diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 047397136f15..f7153ade1be5 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1863,11 +1863,11 @@ static inline int mab_no_null_split(struct maple_big_node *b_node, * Return: The first split location. The middle split is set in @mid_split. */ static inline int mab_calc_split(struct ma_state *mas, - struct maple_big_node *bn, unsigned char *mid_split, unsigned long min) + struct maple_big_node *bn, unsigned char *mid_split) { unsigned char b_end = bn->b_end; int split = b_end / 2; /* Assume equal split. */ - unsigned char slot_min, slot_count = mt_slots[bn->type]; + unsigned char slot_count = mt_slots[bn->type]; /* * To support gap tracking, all NULL entries are kept together and a node cannot @@ -1900,18 +1900,7 @@ static inline int mab_calc_split(struct ma_state *mas, split = b_end / 3; *mid_split = split * 2; } else { - slot_min = mt_min_slots[bn->type]; - *mid_split = 0; - /* - * Avoid having a range less than the slot count unless it - * causes one node to be deficient. - * NOTE: mt_min_slots is 1 based, b_end and split are zero. 
- */ - while ((split < slot_count - 1) && - ((bn->pivot[split] - min) < slot_count - 1) && - (b_end - split > slot_min)) - split++; } /* Avoid ending a node on a NULL entry */ @@ -2377,7 +2366,7 @@ static inline struct maple_enode static inline unsigned char mas_mab_to_node(struct ma_state *mas, struct maple_big_node *b_node, struct maple_enode **left, struct maple_enode **right, struct maple_enode **middle, - unsigned char *mid_split, unsigned long min) + unsigned char *mid_split) { unsigned char split = 0; unsigned char slot_count = mt_slots[b_node->type]; @@ -2390,7 +2379,7 @@ static inline unsigned char mas_mab_to_node(struct ma_state *mas, if (b_node->b_end < slot_count) { split = b_node->b_end; } else { - split = mab_calc_split(mas, b_node, mid_split, min); + split = mab_calc_split(mas, b_node, mid_split); *right = mas_new_ma_node(mas, b_node); } @@ -2877,7 +2866,7 @@ static void mas_spanning_rebalance(struct ma_state *mas, mast->bn->b_end--; mast->bn->type = mte_node_type(mast->orig_l->node); split = mas_mab_to_node(mas, mast->bn, &left, &right, &middle, - &mid_split, mast->orig_l->min); + &mid_split); mast_set_split_parents(mast, left, middle, right, split, mid_split); mast_cp_to_nodes(mast, left, middle, right, split, mid_split); @@ -3365,7 +3354,7 @@ static void mas_split(struct ma_state *mas, struct maple_big_node *b_node) if (mas_push_data(mas, height, &mast, false)) break; - split = mab_calc_split(mas, b_node, &mid_split, prev_l_mas.min); + split = mab_calc_split(mas, b_node, &mid_split); mast_split_data(&mast, mas, split); /* * Usually correct, mab_mas_cp in the above call overwrites @@ -4745,29 +4734,6 @@ again: return entry; } -/* - * mas_next_entry() - Internal function to get the next entry. - * @mas: The maple state - * @limit: The maximum range start. - * - * Set the @mas->node to the next entry and the range_start to - * the beginning value for the entry. Does not check beyond @limit. - * Sets @mas->index and @mas->last to the range, Does not update @mas->index and - * @mas->last on overflow. - * Restarts on dead nodes. - * - * Return: the next entry or %NULL. - */ -static inline void *mas_next_entry(struct ma_state *mas, unsigned long limit) -{ - if (mas->last >= limit) { - mas->status = ma_overflow; - return NULL; - } - - return mas_next_slot(mas, limit, false); -} - /* * mas_rev_awalk() - Internal function. Reverse allocation walk. Find the * highest gap address of a given size in a given node and descend. @@ -4903,15 +4869,14 @@ static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size) if (gap >= size) { if (ma_is_leaf(type)) { found = true; - goto done; - } - if (mas->index <= pivot) { - mas->node = mas_slot(mas, slots, offset); - mas->min = min; - mas->max = pivot; - offset = 0; break; } + + mas->node = mas_slot(mas, slots, offset); + mas->min = min; + mas->max = pivot; + offset = 0; + break; } next_slot: min = pivot + 1; @@ -4921,9 +4886,6 @@ next_slot: } } - if (mte_is_root(mas->node)) - found = true; -done: mas->offset = offset; return found; } @@ -5027,8 +4989,8 @@ static inline void mas_awalk(struct ma_state *mas, unsigned long size) * There are 4 options: * go to child (descend) * go back to parent (ascend) - * no gap found. (return, slot == MAPLE_NODE_SLOTS) - * found the gap. (return, slot != MAPLE_NODE_SLOTS) + * no gap found. (return, error == -EBUSY) + * found the gap. 
(return) */ while (!mas_is_err(mas) && !mas_anode_descend(mas, size)) { if (last == mas->node) @@ -5113,9 +5075,6 @@ int mas_empty_area(struct ma_state *mas, unsigned long min, return xa_err(mas->node); offset = mas->offset; - if (unlikely(offset == MAPLE_NODE_SLOTS)) - return -EBUSY; - node = mas_mn(mas); mt = mte_node_type(mas->node); pivots = ma_pivots(node, mt); @@ -6938,7 +6897,7 @@ retry: goto unlock; while (mas_is_active(&mas) && (mas.last < max)) { - entry = mas_next_entry(&mas, max); + entry = mas_next_slot(&mas, max, false); if (likely(entry && !xa_is_zero(entry))) break; } @@ -7597,7 +7556,7 @@ void mt_validate(struct maple_tree *mt) MAS_WARN_ON(&mas, mte_dead_node(mas.node)); end = mas_data_end(&mas); if (MAS_WARN_ON(&mas, (end < mt_min_slot_count(mas.node)) && - (mas.max != ULONG_MAX))) { + (!mte_is_root(mas.node)))) { pr_err("Invalid size %u of " PTR_FMT "\n", end, mas_mn(&mas)); } diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 704cb1093ae8..13e2a10d7554 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -1563,6 +1563,30 @@ static noinline void __init check_root_expand(struct maple_tree *mt) mas_unlock(&mas); } +static noinline void __init check_deficient_node(struct maple_tree *mt) +{ + MA_STATE(mas, mt, 0, 0); + int count; + + mas_lock(&mas); + for (count = 0; count < 10; count++) { + mas_set(&mas, count); + mas_store_gfp(&mas, xa_mk_value(count), GFP_KERNEL); + } + + for (count = 20; count < 39; count++) { + mas_set(&mas, count); + mas_store_gfp(&mas, xa_mk_value(count), GFP_KERNEL); + } + + for (count = 10; count < 12; count++) { + mas_set(&mas, count); + mas_store_gfp(&mas, xa_mk_value(count), GFP_KERNEL); + } + mas_unlock(&mas); + mt_validate(mt); +} + static noinline void __init check_gap_combining(struct maple_tree *mt) { struct maple_enode *mn1, *mn2; @@ -3714,6 +3738,34 @@ static noinline void __init alloc_cyclic_testing(struct maple_tree *mt) } mtree_destroy(mt); + + /* + * Issue with reverse search was discovered + * https://lore.kernel.org/all/20241216060600.287B4C4CED0@smtp.kernel.org/ + * Exhausting the allocation area and forcing the search to wrap needs a + * mas_reset() in mas_alloc_cyclic(). 
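+ *
+ * The test below exhausts the allocation range, erases an index below the
+ * last allocation, and checks that the following cyclic allocations wrap
+ * back to the freed indexes (123, then 100).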
+ */ + next = 0; + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + for (int i = 0; i < 1023; i++) { + mtree_alloc_cyclic(mt, &location, mt, 2, 1024, &next, GFP_KERNEL); + MT_BUG_ON(mt, i != location - 2); + MT_BUG_ON(mt, i != next - 3); + MT_BUG_ON(mt, mtree_load(mt, location) != mt); + } + mtree_erase(mt, 123); + MT_BUG_ON(mt, mtree_load(mt, 123) != NULL); + mtree_alloc_cyclic(mt, &location, mt, 2, 1024, &next, GFP_KERNEL); + MT_BUG_ON(mt, 123 != location); + MT_BUG_ON(mt, 124 != next); + MT_BUG_ON(mt, mtree_load(mt, location) != mt); + mtree_erase(mt, 100); + mtree_alloc_cyclic(mt, &location, mt, 2, 1024, &next, GFP_KERNEL); + MT_BUG_ON(mt, 100 != location); + MT_BUG_ON(mt, 101 != next); + MT_BUG_ON(mt, mtree_load(mt, location) != mt); + mtree_destroy(mt); + /* Overflow test */ next = ULONG_MAX - 1; ret = mtree_alloc_cyclic(mt, &location, mt, 2, ULONG_MAX, &next, GFP_KERNEL); @@ -3796,6 +3848,10 @@ static int __init maple_tree_seed(void) goto skip; #endif + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_deficient_node(&tree); + mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_store_null(&tree); mtree_destroy(&tree); diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index 4ddf769861ff..f585949ff696 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -373,7 +373,7 @@ vm_map_ram_test(void) if (!pages) return -1; - nr_allocated = alloc_pages_bulk_array(GFP_KERNEL, map_nr_pages, pages); + nr_allocated = alloc_pages_bulk(GFP_KERNEL, map_nr_pages, pages); if (nr_allocated != map_nr_pages) goto cleanup; diff --git a/lib/test_xarray.c b/lib/test_xarray.c index b6cac747ec46..eab5971d0a48 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -1511,6 +1511,41 @@ static noinline void check_pause(struct kunit *test) XA_BUG_ON(xa, count != order_limit); xa_destroy(xa); + + index = 0; + for (order = XA_CHUNK_SHIFT; order > 0; order--) { + XA_BUG_ON(xa, xa_store_order(xa, index, order, + xa_mk_index(index), GFP_KERNEL)); + index += 1UL << order; + } + + index = 0; + count = 0; + xas_set(&xas, 0); + rcu_read_lock(); + xas_for_each(&xas, entry, ULONG_MAX) { + XA_BUG_ON(xa, entry != xa_mk_index(index)); + index += 1UL << (XA_CHUNK_SHIFT - count); + count++; + } + rcu_read_unlock(); + XA_BUG_ON(xa, count != XA_CHUNK_SHIFT); + + index = 0; + count = 0; + xas_set(&xas, XA_CHUNK_SIZE / 2 + 1); + rcu_read_lock(); + xas_for_each(&xas, entry, ULONG_MAX) { + XA_BUG_ON(xa, entry != xa_mk_index(index)); + index += 1UL << (XA_CHUNK_SHIFT - count); + count++; + xas_pause(&xas); + } + rcu_read_unlock(); + XA_BUG_ON(xa, count != XA_CHUNK_SHIFT); + + xa_destroy(xa); + } static noinline void check_move_tiny(struct kunit *test) diff --git a/lib/xarray.c b/lib/xarray.c index 5da8d18899a1..116e9286c64e 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -125,19 +125,20 @@ static inline void node_mark_all(struct xa_node *node, xa_mark_t mark) */ static void xas_squash_marks(const struct xa_state *xas) { - unsigned int mark = 0; + xa_mark_t mark = 0; unsigned int limit = xas->xa_offset + xas->xa_sibs + 1; - if (!xas->xa_sibs) - return; + for (;;) { + unsigned long *marks = node_marks(xas->xa_node, mark); - do { - unsigned long *marks = xas->xa_node->marks[mark]; - if (find_next_bit(marks, limit, xas->xa_offset + 1) == limit) - continue; - __set_bit(xas->xa_offset, marks); - bitmap_clear(marks, xas->xa_offset + 1, xas->xa_sibs); - } while (mark++ != (__force unsigned)XA_MARK_MAX); + if (find_next_bit(marks, limit, xas->xa_offset + 1) != limit) { + __set_bit(xas->xa_offset, marks); + 
bitmap_clear(marks, xas->xa_offset + 1, xas->xa_sibs); + } + if (mark == XA_MARK_MAX) + break; + mark_inc(mark); + } } /* extracts the offset within this node from the index */ @@ -1027,7 +1028,7 @@ void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order, unsigned int mask = xas->xa_sibs; /* XXX: no support for splitting really large entries yet */ - if (WARN_ON(xas->xa_shift + 2 * XA_CHUNK_SHIFT < order)) + if (WARN_ON(xas->xa_shift + 2 * XA_CHUNK_SHIFT <= order)) goto nomem; if (xas->xa_shift + XA_CHUNK_SHIFT > order) return; @@ -1152,6 +1153,7 @@ void xas_pause(struct xa_state *xas) if (!xa_is_sibling(xa_entry(xas->xa, node, offset))) break; } + xas->xa_index &= ~0UL << node->shift; xas->xa_index += (offset - xas->xa_offset) << node->shift; if (xas->xa_index == 0) xas->xa_node = XAS_BOUNDS; @@ -1387,6 +1389,8 @@ void *xas_find_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark) entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); if (!entry && !(xa_track_free(xas->xa) && mark == XA_FREE_MARK)) continue; + if (xa_is_sibling(entry)) + continue; if (!xa_is_node(entry)) return entry; xas->xa_node = xa_to_node(entry); diff --git a/mm/Kconfig b/mm/Kconfig index 84000b016808..1b501db06417 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -550,20 +550,63 @@ menuconfig MEMORY_HOTPLUG if MEMORY_HOTPLUG -config MEMORY_HOTPLUG_DEFAULT_ONLINE - bool "Online the newly added memory blocks by default" - depends on MEMORY_HOTPLUG +choice + prompt "Memory Hotplug Default Online Type" + default MHP_DEFAULT_ONLINE_TYPE_OFFLINE help + Default online type for hotplugged memory. + This option sets the default policy for memory hotplug onlining (/sys/devices/system/memory/auto_online_blocks), which determines what happens to newly added memory regions. The policy can always be changed at runtime. + + The default is 'offline'. + + Select offline to defer onlining to drivers and user policy. + Select auto to let the kernel choose what zones to utilize. + Select online_kernel to generally allow kernel usage of this memory. + Select online_movable to generally disallow kernel usage of this memory. + + Example kernel usage would be page structs and page tables. + See Documentation/admin-guide/mm/memory-hotplug.rst for more information. - Say Y here if you want all hot-plugged memory blocks to appear in - 'online' state by default. - Say N here if you want the default policy to keep all hot-plugged - memory blocks in 'offline' state. +config MHP_DEFAULT_ONLINE_TYPE_OFFLINE + bool "offline" + help + Hotplugged memory will not be onlined by default. + Choose this for systems with drivers and user policy that + handle the onlining of hotplugged memory. + +config MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO + bool "auto" + help + Select this if you want the kernel to automatically online + hotplugged memory into the zone it thinks is reasonable. + This memory may be utilized for kernel data. + +config MHP_DEFAULT_ONLINE_TYPE_ONLINE_KERNEL + bool "kernel" + help + Select this if you want the kernel to automatically online + hotplugged memory into a zone capable of being used for kernel + data. This typically means ZONE_NORMAL. + +config MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE + bool "movable" + help + Select this if you want the kernel to automatically online + hotplugged memory into ZONE_MOVABLE. This memory will generally + not be utilized for kernel data. 
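+
+	  As a rough, illustrative example: with 4 KiB base pages and
+	  64-byte page structs (typical on x86-64), onlining 1 TiB of
+	  memory as ZONE_MOVABLE needs about 16 GiB of kernel-capable
+	  memory just for the page structs that describe it.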
+ + This should only be used when the admin knows sufficient + ZONE_NORMAL memory is available to describe hotplug memory, + otherwise hotplug memory may fail to online. For example, + sufficient kernel-capable memory (ZONE_NORMAL) must be + available to allocate page structs to describe ZONE_MOVABLE. + +endchoice config MEMORY_HOTREMOVE bool "Allow for memory hot remove" @@ -1301,6 +1344,21 @@ config ARCH_HAS_USER_SHADOW_STACK The architecture has hardware support for userspace shadow call stacks (eg, x86 CET, arm64 GCS or RISC-V Zicfiss). +config ARCH_SUPPORTS_PT_RECLAIM + def_bool n + +config PT_RECLAIM + bool "reclaim empty user page table pages" + default y + depends on ARCH_SUPPORTS_PT_RECLAIM && MMU && SMP + select MMU_GATHER_RCU_TABLE_FREE + help + Try to reclaim empty user page table pages in paths other than the + munmap and exit_mmap paths. + + Note: for now, only empty user PTE page table pages will be reclaimed. + + source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index dba52bb0da8a..850386a67b3e 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -146,3 +146,4 @@ obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o obj-$(CONFIG_EXECMEM) += execmem.o obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o +obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o diff --git a/mm/cma.h b/mm/cma.h index ad61cc6dd439..8485ef893e99 100644 --- a/mm/cma.h +++ b/mm/cma.h @@ -36,7 +36,7 @@ struct cma { }; extern struct cma cma_areas[MAX_CMA_AREAS]; -extern unsigned cma_area_count; +extern unsigned int cma_area_count; static inline unsigned long cma_bitmap_maxno(struct cma *cma) { diff --git a/mm/compaction.c b/mm/compaction.c index a2b16b08cbbf..a9f1261972c8 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -83,6 +83,7 @@ static inline bool is_via_compact_memory(int order) { return false; } static struct page *mark_allocated_noprof(struct page *page, unsigned int order, gfp_t gfp_flags) { post_alloc_hook(page, order, __GFP_MOVABLE); + set_page_refcounted(page); return page; } #define mark_allocated(...) alloc_hooks(mark_allocated_noprof(__VA_ARGS__)) @@ -1868,6 +1869,7 @@ again: dst = (struct folio *)freepage; post_alloc_hook(&dst->page, order, __GFP_MOVABLE); + set_page_refcounted(&dst->page); if (order) prep_compound_page(&dst->page, order); cc->nr_freepages -= 1 << order; @@ -2381,7 +2383,27 @@ static bool __compaction_suitable(struct zone *zone, int order, int highest_zoneidx, unsigned long wmark_target) { + pg_data_t __maybe_unused *pgdat = zone->zone_pgdat; + unsigned long sum, nr_pinned; unsigned long watermark; + + sum = node_page_state(pgdat, NR_INACTIVE_FILE) + + node_page_state(pgdat, NR_INACTIVE_ANON) + + node_page_state(pgdat, NR_ACTIVE_FILE) + + node_page_state(pgdat, NR_ACTIVE_ANON) + + node_page_state(pgdat, NR_UNEVICTABLE); + + nr_pinned = node_page_state(pgdat, NR_FOLL_PIN_ACQUIRED) - + node_page_state(pgdat, NR_FOLL_PIN_RELEASED); + + /* + * Gup-pinned pages are non-migratable. After subtracting these pages, + * we need to check if the remaining pages are sufficient for memory + * compaction. + */ + if ((sum - nr_pinned) < (1 << order)) + return false; + /* * Watermarks for order-0 must be met for compaction to be able to * isolate free pages for migration targets. This means that the diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index d0357f3e9372..c213cf8b5638 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -71,36 +71,6 @@ config DAMON_SYSFS_KUNIT_TEST If unsure, say N. 
-config DAMON_DBGFS_DEPRECATED - bool "DAMON debugfs interface (DEPRECATED!)" - depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS - help - This builds the debugfs interface for DAMON. The user space admins - can use the interface for arbitrary data access monitoring. - - If unsure, say N. - - This is deprecated, so users should move to the sysfs interface - (DAMON_SYSFS). If you depend on this and cannot move, please report - your usecase to damon@lists.linux.dev and linux-mm@kvack.org. - -config DAMON_DBGFS - bool - default y - depends on DAMON_DBGFS_DEPRECATED - -config DAMON_DBGFS_KUNIT_TEST - bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS - depends on DAMON_DBGFS && KUNIT=y - default KUNIT_ALL_TESTS - help - This builds the DAMON debugfs interface Kunit test suite. - - For more information on KUnit and unit tests in general, please refer - to the KUnit documentation. - - If unsure, say N. - config DAMON_RECLAIM bool "Build DAMON-based reclaim (DAMON_RECLAIM)" depends on DAMON_PADDR diff --git a/mm/damon/Makefile b/mm/damon/Makefile index f7add3f4aa79..8b49012ba8c3 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -4,6 +4,5 @@ obj-y := core.o obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs-schemes.o sysfs.o -obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o obj-$(CONFIG_DAMON_RECLAIM) += modules-common.o reclaim.o obj-$(CONFIG_DAMON_LRU_SORT) += modules-common.o lru_sort.o diff --git a/mm/damon/core.c b/mm/damon/core.c index 0776452a1abb..55a435bdd89d 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -266,7 +266,7 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, } struct damos_filter *damos_new_filter(enum damos_filter_type type, - bool matching) + bool matching, bool allow) { struct damos_filter *filter; @@ -275,6 +275,7 @@ struct damos_filter *damos_new_filter(enum damos_filter_type type, return NULL; filter->type = type; filter->matching = matching; + filter->allow = allow; INIT_LIST_HEAD(&filter->list); return filter; } @@ -504,6 +505,8 @@ struct damon_ctx *damon_new_ctx(void) ctx->next_ops_update_sis = 0; mutex_init(&ctx->kdamond_lock); + mutex_init(&ctx->call_control_lock); + mutex_init(&ctx->walk_control_lock); ctx->attrs.min_nr_regions = 10; ctx->attrs.max_nr_regions = 1000; @@ -803,7 +806,8 @@ static int damos_commit_filters(struct damos *dst, struct damos *src) continue; new_filter = damos_new_filter( - src_filter->type, src_filter->matching); + src_filter->type, src_filter->matching, + src_filter->allow); if (!new_filter) return -ENOMEM; damos_commit_filter_arg(new_filter, src_filter); @@ -1162,6 +1166,94 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs) return err; } +static bool damon_is_running(struct damon_ctx *ctx) +{ + bool running; + + mutex_lock(&ctx->kdamond_lock); + running = ctx->kdamond != NULL; + mutex_unlock(&ctx->kdamond_lock); + return running; +} + +/** + * damon_call() - Invoke a given function on DAMON worker thread (kdamond). + * @ctx: DAMON context to call the function for. + * @control: Control variable of the call request. + * + * Ask DAMON worker thread (kdamond) of @ctx to call a function with an + * argument, which are respectively passed via &damon_call_control->fn and + * &damon_call_control->data of @control, and wait until the kdamond finishes + * handling the request. 
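+ *
+ * An illustrative usage sketch follows.  The callback name and its body are
+ * hypothetical, made up for this example; only damon_call() and the &struct
+ * damon_call_control fields come from this code::
+ *
+ *	static int check_has_schemes(void *data)
+ *	{
+ *		struct damon_ctx *c = data;
+ *
+ *		return list_empty(&c->schemes) ? -EINVAL : 0;
+ *	}
+ *
+ *	struct damon_call_control control = {
+ *		.fn = check_has_schemes,
+ *		.data = ctx,
+ *	};
+ *	int err = damon_call(ctx, &control);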
+ * + * The kdamond executes the function with the argument in the main loop, just + * after the sampling of an iteration is finished. The function can hence + * safely access the internal data of the &struct damon_ctx without additional + * synchronization. The return value of the function will be saved in + * &damon_call_control->return_code. + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_call(struct damon_ctx *ctx, struct damon_call_control *control) +{ + init_completion(&control->completion); + control->canceled = false; + + mutex_lock(&ctx->call_control_lock); + if (ctx->call_control) { + mutex_unlock(&ctx->call_control_lock); + return -EBUSY; + } + ctx->call_control = control; + mutex_unlock(&ctx->call_control_lock); + if (!damon_is_running(ctx)) + return -EINVAL; + wait_for_completion(&control->completion); + if (control->canceled) + return -ECANCELED; + return 0; +} + +/** + * damos_walk() - Invoke a given function while DAMOS walks regions. + * @ctx: DAMON context to call the function for. + * @control: Control variable of the walk request. + * + * Ask DAMON worker thread (kdamond) of @ctx to call a function for each region + * that the kdamond will apply a DAMOS action to, and wait until the kdamond + * finishes handling the request. + * + * The kdamond executes the given function in the main loop, for each region + * just after it has applied any DAMOS actions of @ctx to it. For each scheme, + * the invocation is made only within one &damos->apply_interval_us since the + * damos_walk() invocation. The given callback function can hence safely + * access the internal data of &struct damon_ctx and the &struct damon_region + * that each scheme will apply its action to in the next interval, without + * additional synchronization against the kdamond. Once every scheme of @ctx + * has passed at least one &damos->apply_interval_us, the kdamond marks the + * request as completed so that damos_walk() can wake up and return. + * + * Return: 0 on success, negative error code otherwise. + */ +int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control) +{ + init_completion(&control->completion); + control->canceled = false; + mutex_lock(&ctx->walk_control_lock); + if (ctx->walk_control) { + mutex_unlock(&ctx->walk_control_lock); + return -EBUSY; + } + ctx->walk_control = control; + mutex_unlock(&ctx->walk_control_lock); + if (!damon_is_running(ctx)) + return -EINVAL; + wait_for_completion(&control->completion); + if (control->canceled) + return -ECANCELED; + return 0; +} + /* * Reset the aggregated monitoring results ('nr_accesses' of each region). 
*/ @@ -1272,16 +1364,18 @@ static bool damos_skip_charged_region(struct damon_target *t, } static void damos_update_stat(struct damos *s, - unsigned long sz_tried, unsigned long sz_applied) + unsigned long sz_tried, unsigned long sz_applied, + unsigned long sz_ops_filter_passed) { s->stat.nr_tried++; s->stat.sz_tried += sz_tried; if (sz_applied) s->stat.nr_applied++; s->stat.sz_applied += sz_applied; + s->stat.sz_ops_filter_passed += sz_ops_filter_passed; } -static bool __damos_filter_out(struct damon_ctx *ctx, struct damon_target *t, +static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos_filter *filter) { bool matched = false; @@ -1335,12 +1429,98 @@ static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t, struct damos_filter *filter; damos_for_each_filter(filter, s) { - if (__damos_filter_out(ctx, t, r, filter)) - return true; + if (damos_filter_match(ctx, t, r, filter)) + return !filter->allow; } return false; } +/* + * damos_walk_call_walk() - Call &damos_walk_control->walk_fn. + * @ctx: The context of &damon_ctx->walk_control. + * @t: The monitoring target of @r that @s will be applied to. + * @r: The region of @t that @s will be applied to. + * @s: The scheme of @ctx that will be applied to @r. + * + * This function is called from kdamond whenever it has asked the operation set + * to apply a DAMOS scheme action to a region. If a DAMOS walk request is + * installed by damos_walk() and not yet uninstalled, invoke it. + */ +static void damos_walk_call_walk(struct damon_ctx *ctx, struct damon_target *t, + struct damon_region *r, struct damos *s, + unsigned long sz_filter_passed) +{ + struct damos_walk_control *control; + + mutex_lock(&ctx->walk_control_lock); + control = ctx->walk_control; + mutex_unlock(&ctx->walk_control_lock); + if (!control) + return; + control->walk_fn(control->data, ctx, t, r, s, sz_filter_passed); +} + +/* + * damos_walk_complete() - Complete DAMOS walk request if all walks are done. + * @ctx: The context of &damon_ctx->walk_control. + * @s: A scheme of @ctx for which all walks are now done. + * + * This function is called when the kdamond has finished applying the action of + * a DAMOS scheme to all regions that are eligible for the given + * &damos->apply_interval_us. If every scheme of @ctx, including @s, has now + * finished walking for at least one &damos->apply_interval_us, this function + * marks the handling of the given DAMOS walk request as done, so that + * damos_walk() can wake up and return. + */ +static void damos_walk_complete(struct damon_ctx *ctx, struct damos *s) +{ + struct damos *siter; + struct damos_walk_control *control; + + mutex_lock(&ctx->walk_control_lock); + control = ctx->walk_control; + mutex_unlock(&ctx->walk_control_lock); + if (!control) + return; + + s->walk_completed = true; + /* if all schemes completed, signal completion to walker */ + damon_for_each_scheme(siter, ctx) { + if (!siter->walk_completed) + return; + } + complete(&control->completion); + mutex_lock(&ctx->walk_control_lock); + ctx->walk_control = NULL; + mutex_unlock(&ctx->walk_control_lock); +} + +/* + * damos_walk_cancel() - Cancel the current DAMOS walk request. + * @ctx: The context of &damon_ctx->walk_control. + * + * This function is called when @ctx is deactivated by DAMOS watermarks, DAMOS + * walk is requested but there is no DAMOS scheme to walk for, or the kdamond + * is already out of the main loop and therefore going to be terminated, and + * hence cannot continue the walks. 
This function therefore marks the walk request + * as canceled, so that damos_walk() can wake up and return. + */ +static void damos_walk_cancel(struct damon_ctx *ctx) +{ + struct damos_walk_control *control; + + mutex_lock(&ctx->walk_control_lock); + control = ctx->walk_control; + mutex_unlock(&ctx->walk_control_lock); + + if (!control) + return; + control->canceled = true; + complete(&control->completion); + mutex_lock(&ctx->walk_control_lock); + ctx->walk_control = NULL; + mutex_unlock(&ctx->walk_control_lock); +} + static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, struct damon_region *r, struct damos *s) { @@ -1348,6 +1528,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, unsigned long sz = damon_sz_region(r); struct timespec64 begin, end; unsigned long sz_applied = 0; + unsigned long sz_ops_filter_passed = 0; int err = 0; /* * We plan to support multiple context per kdamond, as DAMON sysfs @@ -1393,8 +1574,10 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, if (!err) { trace_damos_before_apply(cidx, sidx, tidx, r, damon_nr_regions(t), do_trace); - sz_applied = c->ops.apply_scheme(c, t, r, s); + sz_applied = c->ops.apply_scheme(c, t, r, s, + &sz_ops_filter_passed); } + damos_walk_call_walk(c, t, r, s, sz_ops_filter_passed); ktime_get_coarse_ts64(&end); quota->total_charged_ns += timespec64_to_ns(&end) - timespec64_to_ns(&begin); @@ -1408,7 +1591,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, r->age = 0; update_stat: - damos_update_stat(s, sz, sz_applied); + damos_update_stat(s, sz, sz_applied, sz_ops_filter_passed); } static void damon_do_apply_schemes(struct damon_ctx *c, @@ -1550,7 +1733,7 @@ static unsigned long damos_quota_score(struct damos_quota *quota) static void damos_set_effective_quota(struct damos_quota *quota) { unsigned long throughput; - unsigned long esz; + unsigned long esz = ULONG_MAX; if (!quota->ms && list_empty(&quota->goals)) { quota->esz = quota->sz; @@ -1572,10 +1755,7 @@ static void damos_set_effective_quota(struct damos_quota *quota) quota->total_charged_ns; else throughput = PAGE_SIZE * 1024; - if (!list_empty(&quota->goals)) - esz = min(throughput * quota->ms, esz); - else - esz = throughput * quota->ms; + esz = min(throughput * quota->ms, esz); } if (quota->sz && quota->sz < esz) @@ -1666,6 +1846,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c) damon_for_each_scheme(s, c) { if (c->passed_sample_intervals < s->next_apply_sis) continue; + damos_walk_complete(c, s); s->next_apply_sis = c->passed_sample_intervals + (s->apply_interval_us ? s->apply_interval_us : c->attrs.aggr_interval) / sample_interval; @@ -1920,6 +2101,39 @@ static void kdamond_usleep(unsigned long usecs) usleep_range_idle(usecs, usecs + 1); } +/* + * kdamond_call() - handle damon_call_control. + * @ctx: The &struct damon_ctx of the kdamond. + * @cancel: Whether to cancel the invocation of the function. + * + * If there is a &struct damon_call_control request that was registered via + * &damon_call() on @ctx, do or cancel the invocation of the function depending + * on @cancel. @cancel is set when the kdamond is deactivated by DAMOS + * watermarks, or the kdamond is already out of the main loop and therefore + * will be terminated. 
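+ * In either case, the completion of the request is signaled, so that the
+ * damon_call() caller waiting on it can wake up and return.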
+ */ +static void kdamond_call(struct damon_ctx *ctx, bool cancel) +{ + struct damon_call_control *control; + int ret = 0; + + mutex_lock(&ctx->call_control_lock); + control = ctx->call_control; + mutex_unlock(&ctx->call_control_lock); + if (!control) + return; + if (cancel) { + control->canceled = true; + } else { + ret = control->fn(control->data); + control->return_code = ret; + } + complete(&control->completion); + mutex_lock(&ctx->call_control_lock); + ctx->call_control = NULL; + mutex_unlock(&ctx->call_control_lock); +} + /* Returns negative error code if it's not activated but should return */ static int kdamond_wait_activation(struct damon_ctx *ctx) { @@ -1944,6 +2158,8 @@ static int kdamond_wait_activation(struct damon_ctx *ctx) if (ctx->callback.after_wmarks_check && ctx->callback.after_wmarks_check(ctx)) break; + kdamond_call(ctx, true); + damos_walk_cancel(ctx); } return -EBUSY; } @@ -2014,6 +2230,7 @@ static int kdamond_fn(void *data) if (ctx->callback.after_sampling && ctx->callback.after_sampling(ctx)) break; + kdamond_call(ctx, false); kdamond_usleep(sample_interval); ctx->passed_sample_intervals++; @@ -2036,6 +2253,8 @@ static int kdamond_fn(void *data) */ if (!list_empty(&ctx->schemes)) kdamond_apply_schemes(ctx); + else + damos_walk_cancel(ctx); sample_interval = ctx->attrs.sample_interval ? ctx->attrs.sample_interval : 1; @@ -2075,6 +2294,9 @@ done: ctx->kdamond = NULL; mutex_unlock(&ctx->kdamond_lock); + kdamond_call(ctx, true); + damos_walk_cancel(ctx); + mutex_lock(&damon_lock); nr_running_ctxs--; if (!nr_running_ctxs && running_exclusive_ctxs) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c deleted file mode 100644 index b4213bc47e44..000000000000 --- a/mm/damon/dbgfs.c +++ /dev/null @@ -1,1148 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * DAMON Debugfs Interface - * - * Author: SeongJae Park - */ - -#define pr_fmt(fmt) "damon-dbgfs: " fmt - -#include -#include -#include -#include -#include -#include -#include - -#define DAMON_DBGFS_DEPRECATION_NOTICE \ - "DAMON debugfs interface is deprecated, so users should move " \ - "to DAMON_SYSFS. If you cannot, please report your usecase to " \ - "damon@lists.linux.dev and linux-mm@kvack.org.\n" - -static struct damon_ctx **dbgfs_ctxs; -static int dbgfs_nr_ctxs; -static struct dentry **dbgfs_dirs; -static DEFINE_MUTEX(damon_dbgfs_lock); - -static void damon_dbgfs_warn_deprecation(void) -{ - pr_warn_once(DAMON_DBGFS_DEPRECATION_NOTICE); -} - -/* - * Returns non-empty string on success, negative error code otherwise. 
- */ -static char *user_input_str(const char __user *buf, size_t count, loff_t *ppos) -{ - char *kbuf; - ssize_t ret; - - /* We do not accept continuous write */ - if (*ppos) - return ERR_PTR(-EINVAL); - - kbuf = kmalloc(count + 1, GFP_KERNEL | __GFP_NOWARN); - if (!kbuf) - return ERR_PTR(-ENOMEM); - - ret = simple_write_to_buffer(kbuf, count + 1, ppos, buf, count); - if (ret != count) { - kfree(kbuf); - return ERR_PTR(-EIO); - } - kbuf[ret] = '\0'; - - return kbuf; -} - -static ssize_t dbgfs_attrs_read(struct file *file, - char __user *buf, size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - char kbuf[128]; - int ret; - - mutex_lock(&ctx->kdamond_lock); - ret = scnprintf(kbuf, ARRAY_SIZE(kbuf), "%lu %lu %lu %lu %lu\n", - ctx->attrs.sample_interval, ctx->attrs.aggr_interval, - ctx->attrs.ops_update_interval, - ctx->attrs.min_nr_regions, ctx->attrs.max_nr_regions); - mutex_unlock(&ctx->kdamond_lock); - - return simple_read_from_buffer(buf, count, ppos, kbuf, ret); -} - -static ssize_t dbgfs_attrs_write(struct file *file, - const char __user *buf, size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - struct damon_attrs attrs; - char *kbuf; - ssize_t ret; - - kbuf = user_input_str(buf, count, ppos); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - - if (sscanf(kbuf, "%lu %lu %lu %lu %lu", - &attrs.sample_interval, &attrs.aggr_interval, - &attrs.ops_update_interval, - &attrs.min_nr_regions, - &attrs.max_nr_regions) != 5) { - ret = -EINVAL; - goto out; - } - - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) { - ret = -EBUSY; - goto unlock_out; - } - - ret = damon_set_attrs(ctx, &attrs); - if (!ret) - ret = count; -unlock_out: - mutex_unlock(&ctx->kdamond_lock); -out: - kfree(kbuf); - return ret; -} - -/* - * Return corresponding dbgfs' scheme action value (int) for the given - * damos_action if the given damos_action value is valid and supported by - * dbgfs, negative error code otherwise. 
- */ -static int damos_action_to_dbgfs_scheme_action(enum damos_action action) -{ - switch (action) { - case DAMOS_WILLNEED: - return 0; - case DAMOS_COLD: - return 1; - case DAMOS_PAGEOUT: - return 2; - case DAMOS_HUGEPAGE: - return 3; - case DAMOS_NOHUGEPAGE: - return 4; - case DAMOS_STAT: - return 5; - default: - return -EINVAL; - } -} - -static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) -{ - struct damos *s; - int written = 0; - int rc; - - damon_for_each_scheme(s, c) { - rc = scnprintf(&buf[written], len - written, - "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", - s->pattern.min_sz_region, - s->pattern.max_sz_region, - s->pattern.min_nr_accesses, - s->pattern.max_nr_accesses, - s->pattern.min_age_region, - s->pattern.max_age_region, - damos_action_to_dbgfs_scheme_action(s->action), - s->quota.ms, s->quota.sz, - s->quota.reset_interval, - s->quota.weight_sz, - s->quota.weight_nr_accesses, - s->quota.weight_age, - s->wmarks.metric, s->wmarks.interval, - s->wmarks.high, s->wmarks.mid, s->wmarks.low, - s->stat.nr_tried, s->stat.sz_tried, - s->stat.nr_applied, s->stat.sz_applied, - s->stat.qt_exceeds); - if (!rc) - return -ENOMEM; - - written += rc; - } - return written; -} - -static ssize_t dbgfs_schemes_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - char *kbuf; - ssize_t len; - - kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN); - if (!kbuf) - return -ENOMEM; - - mutex_lock(&ctx->kdamond_lock); - len = sprint_schemes(ctx, kbuf, count); - mutex_unlock(&ctx->kdamond_lock); - if (len < 0) - goto out; - len = simple_read_from_buffer(buf, count, ppos, kbuf, len); - -out: - kfree(kbuf); - return len; -} - -static void free_schemes_arr(struct damos **schemes, ssize_t nr_schemes) -{ - ssize_t i; - - for (i = 0; i < nr_schemes; i++) - kfree(schemes[i]); - kfree(schemes); -} - -/* - * Return corresponding damos_action for the given dbgfs input for a scheme - * action if the input is valid, negative error code otherwise. - */ -static enum damos_action dbgfs_scheme_action_to_damos_action(int dbgfs_action) -{ - switch (dbgfs_action) { - case 0: - return DAMOS_WILLNEED; - case 1: - return DAMOS_COLD; - case 2: - return DAMOS_PAGEOUT; - case 3: - return DAMOS_HUGEPAGE; - case 4: - return DAMOS_NOHUGEPAGE; - case 5: - return DAMOS_STAT; - default: - return -EINVAL; - } -} - -/* - * Converts a string into an array of struct damos pointers - * - * Returns an array of struct damos pointers that converted if the conversion - * success, or NULL otherwise. 
- */ -static struct damos **str_to_schemes(const char *str, ssize_t len, - ssize_t *nr_schemes) -{ - struct damos *scheme, **schemes; - const int max_nr_schemes = 256; - int pos = 0, parsed, ret; - unsigned int action_input; - enum damos_action action; - - schemes = kmalloc_array(max_nr_schemes, sizeof(scheme), - GFP_KERNEL); - if (!schemes) - return NULL; - - *nr_schemes = 0; - while (pos < len && *nr_schemes < max_nr_schemes) { - struct damos_access_pattern pattern = {}; - struct damos_quota quota = {}; - struct damos_watermarks wmarks; - - ret = sscanf(&str[pos], - "%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n", - &pattern.min_sz_region, &pattern.max_sz_region, - &pattern.min_nr_accesses, - &pattern.max_nr_accesses, - &pattern.min_age_region, - &pattern.max_age_region, - &action_input, &quota.ms, - &quota.sz, &quota.reset_interval, - &quota.weight_sz, &quota.weight_nr_accesses, - &quota.weight_age, &wmarks.metric, - &wmarks.interval, &wmarks.high, &wmarks.mid, - &wmarks.low, &parsed); - if (ret != 18) - break; - action = dbgfs_scheme_action_to_damos_action(action_input); - if ((int)action < 0) - goto fail; - - if (pattern.min_sz_region > pattern.max_sz_region || - pattern.min_nr_accesses > pattern.max_nr_accesses || - pattern.min_age_region > pattern.max_age_region) - goto fail; - - if (wmarks.high < wmarks.mid || wmarks.high < wmarks.low || - wmarks.mid < wmarks.low) - goto fail; - - pos += parsed; - scheme = damon_new_scheme(&pattern, action, 0, &quota, - &wmarks, NUMA_NO_NODE); - if (!scheme) - goto fail; - - schemes[*nr_schemes] = scheme; - *nr_schemes += 1; - } - return schemes; -fail: - free_schemes_arr(schemes, *nr_schemes); - return NULL; -} - -static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - char *kbuf; - struct damos **schemes; - ssize_t nr_schemes = 0, ret; - - kbuf = user_input_str(buf, count, ppos); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - - schemes = str_to_schemes(kbuf, count, &nr_schemes); - if (!schemes) { - ret = -EINVAL; - goto out; - } - - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) { - ret = -EBUSY; - goto unlock_out; - } - - damon_set_schemes(ctx, schemes, nr_schemes); - ret = count; - nr_schemes = 0; - -unlock_out: - mutex_unlock(&ctx->kdamond_lock); - free_schemes_arr(schemes, nr_schemes); -out: - kfree(kbuf); - return ret; -} - -static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len) -{ - struct damon_target *t; - int id; - int written = 0; - int rc; - - damon_for_each_target(t, ctx) { - if (damon_target_has_pid(ctx)) - /* Show pid numbers to debugfs users */ - id = pid_vnr(t->pid); - else - /* Show 42 for physical address space, just for fun */ - id = 42; - - rc = scnprintf(&buf[written], len - written, "%d ", id); - if (!rc) - return -ENOMEM; - written += rc; - } - if (written) - written -= 1; - written += scnprintf(&buf[written], len - written, "\n"); - return written; -} - -static ssize_t dbgfs_target_ids_read(struct file *file, - char __user *buf, size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - ssize_t len; - char ids_buf[320]; - - mutex_lock(&ctx->kdamond_lock); - len = sprint_target_ids(ctx, ids_buf, 320); - mutex_unlock(&ctx->kdamond_lock); - if (len < 0) - return len; - - return simple_read_from_buffer(buf, count, ppos, ids_buf, len); -} - -/* - * Converts a string into an integers array - * - * Returns an array of integers array if the conversion success, or NULL - * otherwise. 
- */ -static int *str_to_ints(const char *str, ssize_t len, ssize_t *nr_ints) -{ - int *array; - const int max_nr_ints = 32; - int nr; - int pos = 0, parsed, ret; - - *nr_ints = 0; - array = kmalloc_array(max_nr_ints, sizeof(*array), GFP_KERNEL); - if (!array) - return NULL; - while (*nr_ints < max_nr_ints && pos < len) { - ret = sscanf(&str[pos], "%d%n", &nr, &parsed); - pos += parsed; - if (ret != 1) - break; - array[*nr_ints] = nr; - *nr_ints += 1; - } - - return array; -} - -static void dbgfs_put_pids(struct pid **pids, int nr_pids) -{ - int i; - - for (i = 0; i < nr_pids; i++) - put_pid(pids[i]); -} - -/* - * Converts a string into an struct pid pointers array - * - * Returns an array of struct pid pointers if the conversion success, or NULL - * otherwise. - */ -static struct pid **str_to_pids(const char *str, ssize_t len, ssize_t *nr_pids) -{ - int *ints; - ssize_t nr_ints; - struct pid **pids; - - *nr_pids = 0; - - ints = str_to_ints(str, len, &nr_ints); - if (!ints) - return NULL; - - pids = kmalloc_array(nr_ints, sizeof(*pids), GFP_KERNEL); - if (!pids) - goto out; - - for (; *nr_pids < nr_ints; (*nr_pids)++) { - pids[*nr_pids] = find_get_pid(ints[*nr_pids]); - if (!pids[*nr_pids]) { - dbgfs_put_pids(pids, *nr_pids); - kfree(ints); - kfree(pids); - return NULL; - } - } - -out: - kfree(ints); - return pids; -} - -/* - * dbgfs_set_targets() - Set monitoring targets. - * @ctx: monitoring context - * @nr_targets: number of targets - * @pids: array of target pids (size is same to @nr_targets) - * - * This function should not be called while the kdamond is running. @pids is - * ignored if the context is not configured to have pid in each target. On - * failure, reference counts of all pids in @pids are decremented. - * - * Return: 0 on success, negative error code otherwise. 
- */ -static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets, - struct pid **pids) -{ - ssize_t i; - struct damon_target *t, *next; - - damon_for_each_target_safe(t, next, ctx) { - if (damon_target_has_pid(ctx)) - put_pid(t->pid); - damon_destroy_target(t); - } - - for (i = 0; i < nr_targets; i++) { - t = damon_new_target(); - if (!t) { - damon_for_each_target_safe(t, next, ctx) - damon_destroy_target(t); - if (damon_target_has_pid(ctx)) - dbgfs_put_pids(pids, nr_targets); - return -ENOMEM; - } - if (damon_target_has_pid(ctx)) - t->pid = pids[i]; - damon_add_target(ctx, t); - } - - return 0; -} - -static ssize_t dbgfs_target_ids_write(struct file *file, - const char __user *buf, size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - bool id_is_pid = true; - char *kbuf; - struct pid **target_pids = NULL; - ssize_t nr_targets; - ssize_t ret; - - kbuf = user_input_str(buf, count, ppos); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - - if (!strncmp(kbuf, "paddr\n", count)) { - id_is_pid = false; - nr_targets = 1; - } - - if (id_is_pid) { - target_pids = str_to_pids(kbuf, count, &nr_targets); - if (!target_pids) { - ret = -ENOMEM; - goto out; - } - } - - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) { - if (id_is_pid) - dbgfs_put_pids(target_pids, nr_targets); - ret = -EBUSY; - goto unlock_out; - } - - /* remove previously set targets */ - dbgfs_set_targets(ctx, 0, NULL); - if (!nr_targets) { - ret = count; - goto unlock_out; - } - - /* Configure the context for the address space type */ - if (id_is_pid) - ret = damon_select_ops(ctx, DAMON_OPS_VADDR); - else - ret = damon_select_ops(ctx, DAMON_OPS_PADDR); - if (ret) - goto unlock_out; - - ret = dbgfs_set_targets(ctx, nr_targets, target_pids); - if (!ret) - ret = count; - -unlock_out: - mutex_unlock(&ctx->kdamond_lock); - kfree(target_pids); -out: - kfree(kbuf); - return ret; -} - -static ssize_t sprint_init_regions(struct damon_ctx *c, char *buf, ssize_t len) -{ - struct damon_target *t; - struct damon_region *r; - int target_idx = 0; - int written = 0; - int rc; - - damon_for_each_target(t, c) { - damon_for_each_region(r, t) { - rc = scnprintf(&buf[written], len - written, - "%d %lu %lu\n", - target_idx, r->ar.start, r->ar.end); - if (!rc) - return -ENOMEM; - written += rc; - } - target_idx++; - } - return written; -} - -static ssize_t dbgfs_init_regions_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - char *kbuf; - ssize_t len; - - kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN); - if (!kbuf) - return -ENOMEM; - - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) { - mutex_unlock(&ctx->kdamond_lock); - len = -EBUSY; - goto out; - } - - len = sprint_init_regions(ctx, kbuf, count); - mutex_unlock(&ctx->kdamond_lock); - if (len < 0) - goto out; - len = simple_read_from_buffer(buf, count, ppos, kbuf, len); - -out: - kfree(kbuf); - return len; -} - -static int add_init_region(struct damon_ctx *c, int target_idx, - struct damon_addr_range *ar) -{ - struct damon_target *t; - struct damon_region *r, *prev; - unsigned long idx = 0; - int rc = -EINVAL; - - if (ar->start >= ar->end) - return -EINVAL; - - damon_for_each_target(t, c) { - if (idx++ == target_idx) { - r = damon_new_region(ar->start, ar->end); - if (!r) - return -ENOMEM; - damon_add_region(r, t); - if (damon_nr_regions(t) > 1) { - prev = damon_prev_region(r); - if (prev->ar.end > r->ar.start) { - damon_destroy_region(r, t); - return -EINVAL; - } - } - rc = 0; - } - } - 
return rc; -} - -static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len) -{ - struct damon_target *t; - struct damon_region *r, *next; - int pos = 0, parsed, ret; - int target_idx; - struct damon_addr_range ar; - int err; - - damon_for_each_target(t, c) { - damon_for_each_region_safe(r, next, t) - damon_destroy_region(r, t); - } - - while (pos < len) { - ret = sscanf(&str[pos], "%d %lu %lu%n", - &target_idx, &ar.start, &ar.end, &parsed); - if (ret != 3) - break; - err = add_init_region(c, target_idx, &ar); - if (err) - goto fail; - pos += parsed; - } - - return 0; - -fail: - damon_for_each_target(t, c) { - damon_for_each_region_safe(r, next, t) - damon_destroy_region(r, t); - } - return err; -} - -static ssize_t dbgfs_init_regions_write(struct file *file, - const char __user *buf, size_t count, - loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - char *kbuf; - ssize_t ret = count; - int err; - - kbuf = user_input_str(buf, count, ppos); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) { - ret = -EBUSY; - goto unlock_out; - } - - err = set_init_regions(ctx, kbuf, ret); - if (err) - ret = err; - -unlock_out: - mutex_unlock(&ctx->kdamond_lock); - kfree(kbuf); - return ret; -} - -static ssize_t dbgfs_kdamond_pid_read(struct file *file, - char __user *buf, size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - char *kbuf; - ssize_t len; - - kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN); - if (!kbuf) - return -ENOMEM; - - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) - len = scnprintf(kbuf, count, "%d\n", ctx->kdamond->pid); - else - len = scnprintf(kbuf, count, "none\n"); - mutex_unlock(&ctx->kdamond_lock); - if (!len) - goto out; - len = simple_read_from_buffer(buf, count, ppos, kbuf, len); - -out: - kfree(kbuf); - return len; -} - -static int damon_dbgfs_open(struct inode *inode, struct file *file) -{ - damon_dbgfs_warn_deprecation(); - - file->private_data = inode->i_private; - - return nonseekable_open(inode, file); -} - -static const struct file_operations attrs_fops = { - .open = damon_dbgfs_open, - .read = dbgfs_attrs_read, - .write = dbgfs_attrs_write, -}; - -static const struct file_operations schemes_fops = { - .open = damon_dbgfs_open, - .read = dbgfs_schemes_read, - .write = dbgfs_schemes_write, -}; - -static const struct file_operations target_ids_fops = { - .open = damon_dbgfs_open, - .read = dbgfs_target_ids_read, - .write = dbgfs_target_ids_write, -}; - -static const struct file_operations init_regions_fops = { - .open = damon_dbgfs_open, - .read = dbgfs_init_regions_read, - .write = dbgfs_init_regions_write, -}; - -static const struct file_operations kdamond_pid_fops = { - .open = damon_dbgfs_open, - .read = dbgfs_kdamond_pid_read, -}; - -static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx) -{ - const char * const file_names[] = {"attrs", "schemes", "target_ids", - "init_regions", "kdamond_pid"}; - const struct file_operations *fops[] = {&attrs_fops, &schemes_fops, - &target_ids_fops, &init_regions_fops, &kdamond_pid_fops}; - int i; - - for (i = 0; i < ARRAY_SIZE(file_names); i++) - debugfs_create_file(file_names[i], 0600, dir, ctx, fops[i]); -} - -static void dbgfs_before_terminate(struct damon_ctx *ctx) -{ - struct damon_target *t, *next; - - if (!damon_target_has_pid(ctx)) - return; - - mutex_lock(&ctx->kdamond_lock); - damon_for_each_target_safe(t, next, ctx) { - put_pid(t->pid); - damon_destroy_target(t); - } - 
mutex_unlock(&ctx->kdamond_lock); -} - -static struct damon_ctx *dbgfs_new_ctx(void) -{ - struct damon_ctx *ctx; - - ctx = damon_new_ctx(); - if (!ctx) - return NULL; - - if (damon_select_ops(ctx, DAMON_OPS_VADDR) && - damon_select_ops(ctx, DAMON_OPS_PADDR)) { - damon_destroy_ctx(ctx); - return NULL; - } - ctx->callback.before_terminate = dbgfs_before_terminate; - return ctx; -} - -static void dbgfs_destroy_ctx(struct damon_ctx *ctx) -{ - damon_destroy_ctx(ctx); -} - -static ssize_t damon_dbgfs_deprecated_read(struct file *file, - char __user *buf, size_t count, loff_t *ppos) -{ - static const char kbuf[512] = DAMON_DBGFS_DEPRECATION_NOTICE; - - return simple_read_from_buffer(buf, count, ppos, kbuf, strlen(kbuf)); -} - -/* - * Make a context of @name and create a debugfs directory for it. - * - * This function should be called while holding damon_dbgfs_lock. - * - * Returns 0 on success, negative error code otherwise. - */ -static int dbgfs_mk_context(char *name) -{ - struct dentry *root, **new_dirs, *new_dir; - struct damon_ctx **new_ctxs, *new_ctx; - - if (damon_nr_running_ctxs()) - return -EBUSY; - - new_ctxs = krealloc(dbgfs_ctxs, sizeof(*dbgfs_ctxs) * - (dbgfs_nr_ctxs + 1), GFP_KERNEL); - if (!new_ctxs) - return -ENOMEM; - dbgfs_ctxs = new_ctxs; - - new_dirs = krealloc(dbgfs_dirs, sizeof(*dbgfs_dirs) * - (dbgfs_nr_ctxs + 1), GFP_KERNEL); - if (!new_dirs) - return -ENOMEM; - dbgfs_dirs = new_dirs; - - root = dbgfs_dirs[0]; - if (!root) - return -ENOENT; - - new_dir = debugfs_create_dir(name, root); - /* Below check is required for a potential duplicated name case */ - if (IS_ERR(new_dir)) - return PTR_ERR(new_dir); - dbgfs_dirs[dbgfs_nr_ctxs] = new_dir; - - new_ctx = dbgfs_new_ctx(); - if (!new_ctx) { - debugfs_remove(new_dir); - dbgfs_dirs[dbgfs_nr_ctxs] = NULL; - return -ENOMEM; - } - - dbgfs_ctxs[dbgfs_nr_ctxs] = new_ctx; - dbgfs_fill_ctx_dir(dbgfs_dirs[dbgfs_nr_ctxs], - dbgfs_ctxs[dbgfs_nr_ctxs]); - dbgfs_nr_ctxs++; - - return 0; -} - -static ssize_t dbgfs_mk_context_write(struct file *file, - const char __user *buf, size_t count, loff_t *ppos) -{ - char *kbuf; - char *ctx_name; - ssize_t ret; - - kbuf = user_input_str(buf, count, ppos); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - ctx_name = kmalloc(count + 1, GFP_KERNEL); - if (!ctx_name) { - kfree(kbuf); - return -ENOMEM; - } - - /* Trim white space */ - if (sscanf(kbuf, "%s", ctx_name) != 1) { - ret = -EINVAL; - goto out; - } - - mutex_lock(&damon_dbgfs_lock); - ret = dbgfs_mk_context(ctx_name); - if (!ret) - ret = count; - mutex_unlock(&damon_dbgfs_lock); - -out: - kfree(kbuf); - kfree(ctx_name); - return ret; -} - -/* - * Remove a context of @name and its debugfs directory. - * - * This function should be called while holding damon_dbgfs_lock. - * - * Return 0 on success, negative error code otherwise. 
- */ -static int dbgfs_rm_context(char *name) -{ - struct dentry *root, *dir, **new_dirs; - struct inode *inode; - struct damon_ctx **new_ctxs; - int i, j; - int ret = 0; - - if (damon_nr_running_ctxs()) - return -EBUSY; - - root = dbgfs_dirs[0]; - if (!root) - return -ENOENT; - - dir = debugfs_lookup(name, root); - if (!dir) - return -ENOENT; - - inode = d_inode(dir); - if (!S_ISDIR(inode->i_mode)) { - ret = -EINVAL; - goto out_dput; - } - - new_dirs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_dirs), - GFP_KERNEL); - if (!new_dirs) { - ret = -ENOMEM; - goto out_dput; - } - - new_ctxs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_ctxs), - GFP_KERNEL); - if (!new_ctxs) { - ret = -ENOMEM; - goto out_new_dirs; - } - - for (i = 0, j = 0; i < dbgfs_nr_ctxs; i++) { - if (dbgfs_dirs[i] == dir) { - debugfs_remove(dbgfs_dirs[i]); - dbgfs_destroy_ctx(dbgfs_ctxs[i]); - continue; - } - new_dirs[j] = dbgfs_dirs[i]; - new_ctxs[j++] = dbgfs_ctxs[i]; - } - - kfree(dbgfs_dirs); - kfree(dbgfs_ctxs); - - dbgfs_dirs = new_dirs; - dbgfs_ctxs = new_ctxs; - dbgfs_nr_ctxs--; - - goto out_dput; - -out_new_dirs: - kfree(new_dirs); -out_dput: - dput(dir); - return ret; -} - -static ssize_t dbgfs_rm_context_write(struct file *file, - const char __user *buf, size_t count, loff_t *ppos) -{ - char *kbuf; - ssize_t ret; - char *ctx_name; - - kbuf = user_input_str(buf, count, ppos); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - ctx_name = kmalloc(count + 1, GFP_KERNEL); - if (!ctx_name) { - kfree(kbuf); - return -ENOMEM; - } - - /* Trim white space */ - if (sscanf(kbuf, "%s", ctx_name) != 1) { - ret = -EINVAL; - goto out; - } - - mutex_lock(&damon_dbgfs_lock); - ret = dbgfs_rm_context(ctx_name); - if (!ret) - ret = count; - mutex_unlock(&damon_dbgfs_lock); - -out: - kfree(kbuf); - kfree(ctx_name); - return ret; -} - -static ssize_t dbgfs_monitor_on_read(struct file *file, - char __user *buf, size_t count, loff_t *ppos) -{ - char monitor_on_buf[5]; - bool monitor_on = damon_nr_running_ctxs() != 0; - int len; - - len = scnprintf(monitor_on_buf, 5, monitor_on ? 
"on\n" : "off\n"); - - return simple_read_from_buffer(buf, count, ppos, monitor_on_buf, len); -} - -static ssize_t dbgfs_monitor_on_write(struct file *file, - const char __user *buf, size_t count, loff_t *ppos) -{ - ssize_t ret; - char *kbuf; - - kbuf = user_input_str(buf, count, ppos); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - - /* Remove white space */ - if (sscanf(kbuf, "%s", kbuf) != 1) { - kfree(kbuf); - return -EINVAL; - } - - mutex_lock(&damon_dbgfs_lock); - if (!strncmp(kbuf, "on", count)) { - int i; - - for (i = 0; i < dbgfs_nr_ctxs; i++) { - if (damon_targets_empty(dbgfs_ctxs[i])) { - kfree(kbuf); - mutex_unlock(&damon_dbgfs_lock); - return -EINVAL; - } - } - ret = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs, true); - } else if (!strncmp(kbuf, "off", count)) { - ret = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs); - } else { - ret = -EINVAL; - } - mutex_unlock(&damon_dbgfs_lock); - - if (!ret) - ret = count; - kfree(kbuf); - return ret; -} - -static int damon_dbgfs_static_file_open(struct inode *inode, struct file *file) -{ - damon_dbgfs_warn_deprecation(); - return nonseekable_open(inode, file); -} - -static const struct file_operations deprecated_fops = { - .read = damon_dbgfs_deprecated_read, -}; - -static const struct file_operations mk_contexts_fops = { - .open = damon_dbgfs_static_file_open, - .write = dbgfs_mk_context_write, -}; - -static const struct file_operations rm_contexts_fops = { - .open = damon_dbgfs_static_file_open, - .write = dbgfs_rm_context_write, -}; - -static const struct file_operations monitor_on_fops = { - .open = damon_dbgfs_static_file_open, - .read = dbgfs_monitor_on_read, - .write = dbgfs_monitor_on_write, -}; - -static int __init __damon_dbgfs_init(void) -{ - struct dentry *dbgfs_root; - const char * const file_names[] = {"mk_contexts", "rm_contexts", - "monitor_on_DEPRECATED", "DEPRECATED"}; - const struct file_operations *fops[] = {&mk_contexts_fops, - &rm_contexts_fops, &monitor_on_fops, &deprecated_fops}; - int i; - - dbgfs_root = debugfs_create_dir("damon", NULL); - - for (i = 0; i < ARRAY_SIZE(file_names); i++) - debugfs_create_file(file_names[i], 0600, dbgfs_root, NULL, - fops[i]); - dbgfs_fill_ctx_dir(dbgfs_root, dbgfs_ctxs[0]); - - dbgfs_dirs = kmalloc(sizeof(dbgfs_root), GFP_KERNEL); - if (!dbgfs_dirs) { - debugfs_remove(dbgfs_root); - return -ENOMEM; - } - dbgfs_dirs[0] = dbgfs_root; - - return 0; -} - -/* - * Functions for the initialization - */ - -static int __init damon_dbgfs_init(void) -{ - int rc = -ENOMEM; - - mutex_lock(&damon_dbgfs_lock); - dbgfs_ctxs = kmalloc(sizeof(*dbgfs_ctxs), GFP_KERNEL); - if (!dbgfs_ctxs) - goto out; - dbgfs_ctxs[0] = dbgfs_new_ctx(); - if (!dbgfs_ctxs[0]) { - kfree(dbgfs_ctxs); - goto out; - } - dbgfs_nr_ctxs = 1; - - rc = __damon_dbgfs_init(); - if (rc) { - kfree(dbgfs_ctxs[0]); - kfree(dbgfs_ctxs); - pr_err("%s: dbgfs init failed\n", __func__); - } - -out: - mutex_unlock(&damon_dbgfs_lock); - return rc; -} - -module_init(damon_dbgfs_init); - -#include "tests/dbgfs-kunit.h" diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index a9ff35341d65..6b4397de4199 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -198,7 +198,7 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) return max_nr_accesses; } -static bool __damos_pa_filter_out(struct damos_filter *filter, +static bool damos_pa_filter_match(struct damos_filter *filter, struct folio *folio) { bool matched = false; @@ -237,13 +237,14 @@ static bool damos_pa_filter_out(struct damos *scheme, struct folio *folio) struct damos_filter *filter; 
damos_for_each_filter(filter, scheme) { - if (__damos_pa_filter_out(filter, folio)) - return true; + if (damos_pa_filter_match(filter, folio)) + return !filter->allow; } return false; } -static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s) +static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s, + unsigned long *sz_filter_passed) { unsigned long addr, applied; LIST_HEAD(folio_list); @@ -258,7 +259,8 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s) } } if (install_young_filter) { - filter = damos_new_filter(DAMOS_FILTER_TYPE_YOUNG, true); + filter = damos_new_filter( + DAMOS_FILTER_TYPE_YOUNG, true, false); if (!filter) return 0; damos_add_filter(s, filter); @@ -272,6 +274,8 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s) if (damos_pa_filter_out(s, folio)) goto put_folio; + else + *sz_filter_passed += folio_size(folio); folio_clear_referenced(folio); folio_test_clear_young(folio); @@ -292,7 +296,8 @@ put_folio: } static inline unsigned long damon_pa_mark_accessed_or_deactivate( - struct damon_region *r, struct damos *s, bool mark_accessed) + struct damon_region *r, struct damos *s, bool mark_accessed, + unsigned long *sz_filter_passed) { unsigned long addr, applied = 0; @@ -304,6 +309,8 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate( if (damos_pa_filter_out(s, folio)) goto put_folio; + else + *sz_filter_passed += folio_size(folio); if (mark_accessed) folio_mark_accessed(folio); @@ -317,15 +324,17 @@ put_folio: } static unsigned long damon_pa_mark_accessed(struct damon_region *r, - struct damos *s) + struct damos *s, unsigned long *sz_filter_passed) { - return damon_pa_mark_accessed_or_deactivate(r, s, true); + return damon_pa_mark_accessed_or_deactivate(r, s, true, + sz_filter_passed); } static unsigned long damon_pa_deactivate_pages(struct damon_region *r, - struct damos *s) + struct damos *s, unsigned long *sz_filter_passed) { - return damon_pa_mark_accessed_or_deactivate(r, s, false); + return damon_pa_mark_accessed_or_deactivate(r, s, false, + sz_filter_passed); } static unsigned int __damon_pa_migrate_folio_list( @@ -449,7 +458,8 @@ static unsigned long damon_pa_migrate_pages(struct list_head *folio_list, return nr_migrated; } -static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s) +static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s, + unsigned long *sz_filter_passed) { unsigned long addr, applied; LIST_HEAD(folio_list); @@ -462,6 +472,8 @@ static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s) if (damos_pa_filter_out(s, folio)) goto put_folio; + else + *sz_filter_passed += folio_size(folio); if (!folio_isolate_lru(folio)) goto put_folio; @@ -474,23 +486,56 @@ put_folio: return applied * PAGE_SIZE; } +static bool damon_pa_scheme_has_filter(struct damos *s) +{ + struct damos_filter *f; + + damos_for_each_filter(f, s) + return true; + return false; +} + +static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s, + unsigned long *sz_filter_passed) +{ + unsigned long addr; + LIST_HEAD(folio_list); + + if (!damon_pa_scheme_has_filter(s)) + return 0; + + for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { + struct folio *folio = damon_get_folio(PHYS_PFN(addr)); + + if (!folio) + continue; + + if (damos_pa_filter_out(s, folio)) + goto put_folio; + else + *sz_filter_passed += folio_size(folio); +put_folio: + folio_put(folio); + } + return 0; +} static 
unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, - struct damos *scheme) + struct damos *scheme, unsigned long *sz_filter_passed) { switch (scheme->action) { case DAMOS_PAGEOUT: - return damon_pa_pageout(r, scheme); + return damon_pa_pageout(r, scheme, sz_filter_passed); case DAMOS_LRU_PRIO: - return damon_pa_mark_accessed(r, scheme); + return damon_pa_mark_accessed(r, scheme, sz_filter_passed); case DAMOS_LRU_DEPRIO: - return damon_pa_deactivate_pages(r, scheme); + return damon_pa_deactivate_pages(r, scheme, sz_filter_passed); case DAMOS_MIGRATE_HOT: case DAMOS_MIGRATE_COLD: - return damon_pa_migrate(r, scheme); + return damon_pa_migrate(r, scheme, sz_filter_passed); case DAMOS_STAT: - break; + return damon_pa_stat(r, scheme, sz_filter_passed); default: /* DAMOS actions that not yet supported by 'paddr'. */ break; diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 9e0077a9404e..a675150965e0 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -221,7 +221,7 @@ static int damon_reclaim_apply_parameters(void) } if (skip_anon) { - filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true); + filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, false); if (!filter) goto out; damos_add_filter(scheme, filter); diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 9a18f3c535d3..70d84bdc9f5f 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -45,19 +45,13 @@ void damon_sysfs_schemes_update_stats( struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx); -int damon_sysfs_schemes_update_regions_start( - struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx, bool total_bytes_only); - -void damos_sysfs_mark_finished_regions_updates(struct damon_ctx *ctx); - -bool damos_sysfs_regions_upd_done(void); - -int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx); +void damos_sysfs_populate_region_dir(struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx, struct damon_target *t, + struct damon_region *r, struct damos *s, + bool total_bytes_only, unsigned long sz_filter_passed); int damon_sysfs_schemes_clear_regions( - struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx); + struct damon_sysfs_schemes *sysfs_schemes); int damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index b095457380b5..98f93ae9f59e 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -19,6 +19,7 @@ struct damon_sysfs_scheme_region { struct damon_addr_range ar; unsigned int nr_accesses; unsigned int age; + unsigned long sz_filter_passed; struct list_head list; }; @@ -74,6 +75,15 @@ static ssize_t age_show(struct kobject *kobj, struct kobj_attribute *attr, return sysfs_emit(buf, "%u\n", region->age); } +static ssize_t sz_filter_passed_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%lu\n", region->sz_filter_passed); +} + static void damon_sysfs_scheme_region_release(struct kobject *kobj) { struct damon_sysfs_scheme_region *region = container_of(kobj, @@ -95,11 +105,15 @@ static struct kobj_attribute damon_sysfs_scheme_region_nr_accesses_attr = static struct kobj_attribute damon_sysfs_scheme_region_age_attr = __ATTR_RO_MODE(age, 0400); +static struct kobj_attribute 
damon_sysfs_scheme_region_sz_filter_passed_attr = + __ATTR_RO_MODE(sz_filter_passed, 0400); + static struct attribute *damon_sysfs_scheme_region_attrs[] = { &damon_sysfs_scheme_region_start_attr.attr, &damon_sysfs_scheme_region_end_attr.attr, &damon_sysfs_scheme_region_nr_accesses_attr.attr, &damon_sysfs_scheme_region_age_attr.attr, + &damon_sysfs_scheme_region_sz_filter_passed_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_scheme_region); @@ -114,55 +128,11 @@ static const struct kobj_type damon_sysfs_scheme_region_ktype = { * scheme regions directory */ -/* - * enum damos_sysfs_regions_upd_status - Represent DAMOS tried regions update - * status - * @DAMOS_TRIED_REGIONS_UPD_IDLE: Waiting for next request. - * @DAMOS_TRIED_REGIONS_UPD_STARTED: Update started. - * @DAMOS_TRIED_REGIONS_UPD_FINISHED: Update finished. - * - * Each DAMON-based operation scheme (&struct damos) has its own apply - * interval, and we need to expose the scheme tried regions based on only - * single snapshot. For this, we keep the tried regions update status for each - * scheme. The status becomes 'idle' at the beginning. - * - * Once the tried regions update request is received, the request handling - * start function (damon_sysfs_scheme_update_regions_start()) sets the status - * of all schemes as 'idle' again, and register ->before_damos_apply() - * callback. - * - * Then, the first followup ->before_damos_apply() callback - * (damon_sysfs_before_damos_apply()) sets the status 'started'. The first - * ->after_sampling() or ->after_aggregation() callback - * (damon_sysfs_cmd_request_callback()) after the call is called only after - * the scheme is completely applied to the given snapshot. Hence the callback - * knows the situation by showing 'started' status, and sets the status as - * 'finished'. Then, damon_sysfs_before_damos_apply() understands the - * situation by showing the 'finished' status and do nothing. - * - * If DAMOS is not applied to any region due to any reasons including the - * access pattern, the watermarks, the quotas, and the filters, - * ->before_damos_apply() will not be called back. Until the situation is - * changed, the update will not be finished. To avoid this, - * damon_sysfs_after_sampling() set the status as 'finished' if more than two - * apply intervals of the scheme is passed while the state is 'idle'. - * - * Finally, the tried regions request handling finisher function - * (damon_sysfs_schemes_update_regions_stop()) unregisters the callbacks. 
- */ -enum damos_sysfs_regions_upd_status { - DAMOS_TRIED_REGIONS_UPD_IDLE, - DAMOS_TRIED_REGIONS_UPD_STARTED, - DAMOS_TRIED_REGIONS_UPD_FINISHED, -}; - struct damon_sysfs_scheme_regions { struct kobject kobj; struct list_head regions_list; int nr_regions; unsigned long total_bytes; - enum damos_sysfs_regions_upd_status upd_status; - unsigned long upd_timeout_jiffies; }; static struct damon_sysfs_scheme_regions * @@ -178,7 +148,6 @@ damon_sysfs_scheme_regions_alloc(void) INIT_LIST_HEAD(®ions->regions_list); regions->nr_regions = 0; regions->total_bytes = 0; - regions->upd_status = DAMOS_TRIED_REGIONS_UPD_IDLE; return regions; } @@ -233,6 +202,7 @@ struct damon_sysfs_stats { unsigned long sz_tried; unsigned long nr_applied; unsigned long sz_applied; + unsigned long sz_ops_filter_passed; unsigned long qt_exceeds; }; @@ -277,6 +247,15 @@ static ssize_t sz_applied_show(struct kobject *kobj, return sysfs_emit(buf, "%lu\n", stats->sz_applied); } +static ssize_t sz_ops_filter_passed_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->sz_ops_filter_passed); +} + static ssize_t qt_exceeds_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -303,6 +282,9 @@ static struct kobj_attribute damon_sysfs_stats_nr_applied_attr = static struct kobj_attribute damon_sysfs_stats_sz_applied_attr = __ATTR_RO_MODE(sz_applied, 0400); +static struct kobj_attribute damon_sysfs_stats_sz_ops_filter_passed_attr = + __ATTR_RO_MODE(sz_ops_filter_passed, 0400); + static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr = __ATTR_RO_MODE(qt_exceeds, 0400); @@ -311,6 +293,7 @@ static struct attribute *damon_sysfs_stats_attrs[] = { &damon_sysfs_stats_sz_tried_attr.attr, &damon_sysfs_stats_nr_applied_attr.attr, &damon_sysfs_stats_sz_applied_attr.attr, + &damon_sysfs_stats_sz_ops_filter_passed_attr.attr, &damon_sysfs_stats_qt_exceeds_attr.attr, NULL, }; @@ -330,6 +313,7 @@ struct damon_sysfs_scheme_filter { struct kobject kobj; enum damos_filter_type type; bool matching; + bool allow; char *memcg_path; struct damon_addr_range addr_range; int target_idx; @@ -402,6 +386,30 @@ static ssize_t matching_store(struct kobject *kobj, return count; } +static ssize_t allow_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + + return sysfs_emit(buf, "%c\n", filter->allow ? 
'Y' : 'N'); +} + +static ssize_t allow_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + bool allow; + int err = kstrtobool(buf, &allow); + + if (err) + return err; + + filter->allow = allow; + return count; +} + static ssize_t memcg_path_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -499,6 +507,9 @@ static struct kobj_attribute damon_sysfs_scheme_filter_type_attr = static struct kobj_attribute damon_sysfs_scheme_filter_matching_attr = __ATTR_RW_MODE(matching, 0600); +static struct kobj_attribute damon_sysfs_scheme_filter_allow_attr = + __ATTR_RW_MODE(allow, 0600); + static struct kobj_attribute damon_sysfs_scheme_filter_memcg_path_attr = __ATTR_RW_MODE(memcg_path, 0600); @@ -514,6 +525,7 @@ static struct kobj_attribute damon_sysfs_scheme_filter_damon_target_idx_attr = static struct attribute *damon_sysfs_scheme_filter_attrs[] = { &damon_sysfs_scheme_filter_type_attr.attr, &damon_sysfs_scheme_filter_matching_attr.attr, + &damon_sysfs_scheme_filter_allow_attr.attr, &damon_sysfs_scheme_filter_memcg_path_attr.attr, &damon_sysfs_scheme_filter_addr_start_attr.attr, &damon_sysfs_scheme_filter_addr_end_attr.attr, @@ -1918,7 +1930,8 @@ static int damon_sysfs_add_scheme_filters(struct damos *scheme, sysfs_filters->filters_arr[i]; struct damos_filter *filter = damos_new_filter(sysfs_filter->type, - sysfs_filter->matching); + sysfs_filter->matching, + sysfs_filter->allow); int err; if (!filter) @@ -2122,32 +2135,32 @@ void damon_sysfs_schemes_update_stats( sysfs_stats->sz_tried = scheme->stat.sz_tried; sysfs_stats->nr_applied = scheme->stat.nr_applied; sysfs_stats->sz_applied = scheme->stat.sz_applied; + sysfs_stats->sz_ops_filter_passed = + scheme->stat.sz_ops_filter_passed; sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; } } -/* - * damon_sysfs_schemes that need to update its schemes regions dir. Protected - * by damon_sysfs_lock +/** + * damos_sysfs_populate_region_dir() - Populate a schemes tried region dir. + * @sysfs_schemes: Schemes directory to populate the regions directories of. + * @ctx: Corresponding DAMON context. + * @t: DAMON target of @r. + * @r: DAMON region to populate the directory for. + * @s: Corresponding scheme. + * @total_bytes_only: Whether the request is for bytes update only. + * @sz_filter_passed: Bytes of @r that passed filters of @s. + * + * Called from DAMOS walk callback while holding damon_sysfs_lock. */ -static struct damon_sysfs_schemes *damon_sysfs_schemes_for_damos_callback; -static int damon_sysfs_schemes_region_idx; -static bool damos_regions_upd_total_bytes_only; - -/* - * DAMON callback that called before damos apply. While this callback is - * registered, damon_sysfs_lock should be held to ensure the regions - * directories exist.
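The new 'allow' bit changes how the filter list of a scheme is read: as damos_pa_filter_out() in the mm/damon/paddr.c hunks above shows, the first filter that matches a folio now decides the outcome, with an allow filter passing it to the action and a reject filter dropping it, while memory that matches no filter still passes. A compressed model of that decision, using a trimmed stand-in struct rather than the kernel's struct damos_filter, could read::

    #include <stdbool.h>
    #include <stddef.h>

    /* stand-in for struct damos_filter; only the two fields the logic needs */
    struct toy_filter {
            bool allow;     /* pass matching memory to the action? */
            bool matches;   /* what damos_pa_filter_match() would return */
    };

    /* mirrors damos_pa_filter_out(): true means the action is NOT applied */
    static bool toy_filter_out(const struct toy_filter *filters, size_t nr)
    {
            size_t i;

            for (i = 0; i < nr; i++) {
                    /* the first match decides: reject drops, allow passes */
                    if (filters[i].matches)
                            return !filters[i].allow;
            }
            return false;   /* nothing matched: the folio passes */
    }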
- */ -static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, - struct damon_target *t, struct damon_region *r, - struct damos *s) +void damos_sysfs_populate_region_dir(struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx, struct damon_target *t, + struct damon_region *r, struct damos *s, bool total_bytes_only, + unsigned long sz_filter_passed) { struct damos *scheme; struct damon_sysfs_scheme_regions *sysfs_regions; struct damon_sysfs_scheme_region *region; - struct damon_sysfs_schemes *sysfs_schemes = - damon_sysfs_schemes_for_damos_callback; int schemes_idx = 0; damon_for_each_scheme(scheme, ctx) { @@ -2158,152 +2171,40 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, /* user could have removed the scheme sysfs dir */ if (schemes_idx >= sysfs_schemes->nr) - return 0; + return; sysfs_regions = sysfs_schemes->schemes_arr[schemes_idx]->tried_regions; - if (sysfs_regions->upd_status == DAMOS_TRIED_REGIONS_UPD_FINISHED) - return 0; - if (sysfs_regions->upd_status == DAMOS_TRIED_REGIONS_UPD_IDLE) - sysfs_regions->upd_status = DAMOS_TRIED_REGIONS_UPD_STARTED; sysfs_regions->total_bytes += r->ar.end - r->ar.start; - if (damos_regions_upd_total_bytes_only) - return 0; + if (total_bytes_only) + return; region = damon_sysfs_scheme_region_alloc(r); if (!region) - return 0; + return; + region->sz_filter_passed = sz_filter_passed; list_add_tail(&region->list, &sysfs_regions->regions_list); sysfs_regions->nr_regions++; if (kobject_init_and_add(&region->kobj, &damon_sysfs_scheme_region_ktype, &sysfs_regions->kobj, "%d", - damon_sysfs_schemes_region_idx++)) { + sysfs_regions->nr_regions - 1)) { kobject_put(&region->kobj); } - return 0; -} - -/* - * DAMON callback that called after each accesses sampling. While this - * callback is registered, damon_sysfs_lock should be held to ensure the - * regions directories exist.
- */ -void damos_sysfs_mark_finished_regions_updates(struct damon_ctx *ctx) -{ - struct damon_sysfs_schemes *sysfs_schemes = - damon_sysfs_schemes_for_damos_callback; - struct damon_sysfs_scheme_regions *sysfs_regions; - int i; - - for (i = 0; i < sysfs_schemes->nr; i++) { - sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions; - if (sysfs_regions->upd_status == - DAMOS_TRIED_REGIONS_UPD_STARTED || - time_after(jiffies, - sysfs_regions->upd_timeout_jiffies)) - sysfs_regions->upd_status = - DAMOS_TRIED_REGIONS_UPD_FINISHED; - } } /* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ int damon_sysfs_schemes_clear_regions( - struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx) + struct damon_sysfs_schemes *sysfs_schemes) { - struct damos *scheme; - int schemes_idx = 0; + int i; - damon_for_each_scheme(scheme, ctx) { + for (i = 0; i < sysfs_schemes->nr; i++) { struct damon_sysfs_scheme *sysfs_scheme; - /* user could have removed the scheme sysfs dir */ - if (schemes_idx >= sysfs_schemes->nr) - break; - - sysfs_scheme = sysfs_schemes->schemes_arr[schemes_idx++]; + sysfs_scheme = sysfs_schemes->schemes_arr[i]; damon_sysfs_scheme_regions_rm_dirs( sysfs_scheme->tried_regions); sysfs_scheme->tried_regions->total_bytes = 0; } return 0; } - -static struct damos *damos_sysfs_nth_scheme(int n, struct damon_ctx *ctx) -{ - struct damos *scheme; - int i = 0; - - damon_for_each_scheme(scheme, ctx) { - if (i == n) - return scheme; - i++; - } - return NULL; -} - -static void damos_tried_regions_init_upd_status( - struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx) -{ - int i; - struct damos *scheme; - struct damon_sysfs_scheme_regions *sysfs_regions; - - for (i = 0; i < sysfs_schemes->nr; i++) { - sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions; - scheme = damos_sysfs_nth_scheme(i, ctx); - if (!scheme) { - sysfs_regions->upd_status = - DAMOS_TRIED_REGIONS_UPD_FINISHED; - continue; - } - sysfs_regions->upd_status = DAMOS_TRIED_REGIONS_UPD_IDLE; - sysfs_regions->upd_timeout_jiffies = jiffies + - 2 * usecs_to_jiffies(scheme->apply_interval_us ? - scheme->apply_interval_us : - ctx->attrs.aggr_interval); - } -} - -/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ -int damon_sysfs_schemes_update_regions_start( - struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx, bool total_bytes_only) -{ - damon_sysfs_schemes_clear_regions(sysfs_schemes, ctx); - damon_sysfs_schemes_for_damos_callback = sysfs_schemes; - damos_tried_regions_init_upd_status(sysfs_schemes, ctx); - damos_regions_upd_total_bytes_only = total_bytes_only; - ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply; - return 0; -} - -bool damos_sysfs_regions_upd_done(void) -{ - struct damon_sysfs_schemes *sysfs_schemes = - damon_sysfs_schemes_for_damos_callback; - struct damon_sysfs_scheme_regions *sysfs_regions; - int i; - - for (i = 0; i < sysfs_schemes->nr; i++) { - sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions; - if (sysfs_regions->upd_status != - DAMOS_TRIED_REGIONS_UPD_FINISHED) - return false; - } - return true; -} - -/* - * Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock. 
Caller - * should unlock damon_sysfs_lock which held before - * damon_sysfs_schemes_update_regions_start() - */ -int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx) -{ - damon_sysfs_schemes_for_damos_callback = NULL; - ctx->callback.before_damos_apply = NULL; - damon_sysfs_schemes_region_idx = 0; - return 0; -} diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 58145d59881d..deeab04d3b46 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1181,25 +1181,9 @@ static int damon_sysfs_add_targets(struct damon_ctx *ctx, return 0; } -static bool damon_sysfs_schemes_regions_updating; - static void damon_sysfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; - struct damon_sysfs_kdamond *kdamond; - enum damon_sysfs_cmd cmd; - - /* damon_sysfs_schemes_update_regions_stop() might not yet called */ - kdamond = damon_sysfs_cmd_request.kdamond; - cmd = damon_sysfs_cmd_request.cmd; - if (kdamond && ctx == kdamond->damon_ctx && - (cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS || - cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES) && - damon_sysfs_schemes_regions_updating) { - damon_sysfs_schemes_update_regions_stop(ctx); - damon_sysfs_schemes_regions_updating = false; - mutex_unlock(&damon_sysfs_lock); - } if (!damon_target_has_pid(ctx)) return; @@ -1214,57 +1198,24 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx) /* * damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files. - * @kdamond: The kobject wrapper that associated to the kdamond thread. + * @data: The kobject wrapper that is associated with the kdamond thread. * * This function reads the schemes stats of specific kdamond and update the * related values for sysfs files. This function should be called from DAMON - * callbacks while holding ``damon_syfs_lock``, to safely access the DAMON - * contexts-internal data and DAMON sysfs variables. + * worker thread, to safely access the DAMON contexts-internal data. Caller + * should also ensure holding ``damon_sysfs_lock``, and ->damon_ctx of @data is + * not NULL but a valid pointer, to safely access DAMON sysfs variables.
 */ -static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) +static int damon_sysfs_upd_schemes_stats(void *data) { + struct damon_sysfs_kdamond *kdamond = data; struct damon_ctx *ctx = kdamond->damon_ctx; - if (!ctx) - return -EINVAL; damon_sysfs_schemes_update_stats( kdamond->contexts->contexts_arr[0]->schemes, ctx); return 0; } -static int damon_sysfs_upd_schemes_regions_start( - struct damon_sysfs_kdamond *kdamond, bool total_bytes_only) -{ - struct damon_ctx *ctx = kdamond->damon_ctx; - - if (!ctx) - return -EINVAL; - return damon_sysfs_schemes_update_regions_start( - kdamond->contexts->contexts_arr[0]->schemes, ctx, - total_bytes_only); -} - -static int damon_sysfs_upd_schemes_regions_stop( - struct damon_sysfs_kdamond *kdamond) -{ - struct damon_ctx *ctx = kdamond->damon_ctx; - - if (!ctx) - return -EINVAL; - return damon_sysfs_schemes_update_regions_stop(ctx); -} - -static int damon_sysfs_clear_schemes_regions( - struct damon_sysfs_kdamond *kdamond) -{ - struct damon_ctx *ctx = kdamond->damon_ctx; - - if (!ctx) - return -EINVAL; - return damon_sysfs_schemes_clear_regions( - kdamond->contexts->contexts_arr[0]->schemes, ctx); -} - static inline bool damon_sysfs_kdamond_running( struct damon_sysfs_kdamond *kdamond) { @@ -1318,9 +1269,9 @@ static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) return err; } -static int damon_sysfs_commit_schemes_quota_goals( - struct damon_sysfs_kdamond *sysfs_kdamond) +static int damon_sysfs_commit_schemes_quota_goals(void *data) { + struct damon_sysfs_kdamond *sysfs_kdamond = data; struct damon_ctx *ctx; struct damon_sysfs_context *sysfs_ctx; @@ -1338,20 +1289,18 @@ static int damon_sysfs_commit_schemes_quota_goals( /* * damon_sysfs_upd_schemes_effective_quotas() - Update schemes effective quotas * sysfs files. - * @kdamond: The kobject wrapper that associated to the kdamond thread. + * @data: The kobject wrapper that is associated with the kdamond thread. * * This function reads the schemes' effective quotas of specific kdamond and * update the related values for sysfs files. This function should be called * from DAMON callbacks while holding ``damon_syfs_lock``, to safely access the * DAMON contexts-internal data and DAMON sysfs variables.
*/ -static int damon_sysfs_upd_schemes_effective_quotas( - struct damon_sysfs_kdamond *kdamond) +static int damon_sysfs_upd_schemes_effective_quotas(void *data) { + struct damon_sysfs_kdamond *kdamond = data; struct damon_ctx *ctx = kdamond->damon_ctx; - if (!ctx) - return -EINVAL; damos_sysfs_update_effective_quotas( kdamond->contexts->contexts_arr[0]->schemes, ctx); return 0; @@ -1371,67 +1320,27 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active, bool after_aggregation) { struct damon_sysfs_kdamond *kdamond; - bool total_bytes_only = false; int err = 0; /* avoid deadlock due to concurrent state_store('off') */ - if (!damon_sysfs_schemes_regions_updating && - !mutex_trylock(&damon_sysfs_lock)) + if (!mutex_trylock(&damon_sysfs_lock)) return 0; kdamond = damon_sysfs_cmd_request.kdamond; if (!kdamond || kdamond->damon_ctx != c) goto out; switch (damon_sysfs_cmd_request.cmd) { - case DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS: - err = damon_sysfs_upd_schemes_stats(kdamond); - break; case DAMON_SYSFS_CMD_COMMIT: if (!after_aggregation) goto out; err = damon_sysfs_commit_input(kdamond); break; - case DAMON_SYSFS_CMD_COMMIT_SCHEMES_QUOTA_GOALS: - err = damon_sysfs_commit_schemes_quota_goals(kdamond); - break; - case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES: - total_bytes_only = true; - fallthrough; - case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: - if (!damon_sysfs_schemes_regions_updating) { - err = damon_sysfs_upd_schemes_regions_start(kdamond, - total_bytes_only); - if (!err) { - damon_sysfs_schemes_regions_updating = true; - goto keep_lock_out; - } - } else { - damos_sysfs_mark_finished_regions_updates(c); - /* - * Continue regions updating if DAMON is till - * active and the update for all schemes is not - * finished. - */ - if (active && !damos_sysfs_regions_upd_done()) - goto keep_lock_out; - err = damon_sysfs_upd_schemes_regions_stop(kdamond); - damon_sysfs_schemes_regions_updating = false; - } - break; - case DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS: - err = damon_sysfs_clear_schemes_regions(kdamond); - break; - case DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS: - err = damon_sysfs_upd_schemes_effective_quotas(kdamond); - break; default: break; } /* Mark the request as invalid now. 
*/ damon_sysfs_cmd_request.kdamond = NULL; out: - if (!damon_sysfs_schemes_regions_updating) - mutex_unlock(&damon_sysfs_lock); -keep_lock_out: + mutex_unlock(&damon_sysfs_lock); return err; } @@ -1525,6 +1434,58 @@ static int damon_sysfs_turn_damon_off(struct damon_sysfs_kdamond *kdamond) */ } +static int damon_sysfs_damon_call(int (*fn)(void *data), + struct damon_sysfs_kdamond *kdamond) +{ + struct damon_call_control call_control = {}; + + if (!kdamond->damon_ctx) + return -EINVAL; + call_control.fn = fn; + call_control.data = kdamond; + return damon_call(kdamond->damon_ctx, &call_control); +} + +struct damon_sysfs_schemes_walk_data { + struct damon_sysfs_kdamond *sysfs_kdamond; + bool total_bytes_only; +}; + +/* populate the region directory */ +static void damon_sysfs_schemes_tried_regions_upd_one(void *data, struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *s, unsigned long sz_filter_passed) +{ + struct damon_sysfs_schemes_walk_data *walk_data = data; + struct damon_sysfs_kdamond *sysfs_kdamond = walk_data->sysfs_kdamond; + + damos_sysfs_populate_region_dir( + sysfs_kdamond->contexts->contexts_arr[0]->schemes, + ctx, t, r, s, walk_data->total_bytes_only, + sz_filter_passed); +} + +static int damon_sysfs_update_schemes_tried_regions( + struct damon_sysfs_kdamond *sysfs_kdamond, bool total_bytes_only) +{ + struct damon_sysfs_schemes_walk_data walk_data = { + .sysfs_kdamond = sysfs_kdamond, + .total_bytes_only = total_bytes_only, + }; + struct damos_walk_control control = { + .walk_fn = damon_sysfs_schemes_tried_regions_upd_one, + .data = &walk_data, + }; + struct damon_ctx *ctx = sysfs_kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + + damon_sysfs_schemes_clear_regions( + sysfs_kdamond->contexts->contexts_arr[0]->schemes); + return damos_walk(ctx, &control); +} + /* * damon_sysfs_handle_cmd() - Handle a command for a specific kdamond. * @cmd: The command to handle. 
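With damon_sysfs_damon_call() in place, a sysfs command that needs kdamond-context data is just a function passed through the void pointer; none of the request/status machinery the earlier hunks delete is required. A hypothetical extra handler in the same shape, sketched here for illustration (the handler and its nr_schemes field are invented; the helpers are the ones defined above), would be::

    /* Hypothetical: count the schemes of the running context. */
    static int damon_sysfs_upd_nr_schemes(void *data)
    {
            struct damon_sysfs_kdamond *kdamond = data;
            struct damon_ctx *ctx = kdamond->damon_ctx;
            struct damos *s;
            int nr = 0;

            /* runs from the kdamond worker, so ctx internals are stable */
            damon_for_each_scheme(s, ctx)
                    nr++;
            kdamond->nr_schemes = nr;       /* invented field */
            return 0;
    }

Dispatching it would then be one more damon_sysfs_damon_call(damon_sysfs_upd_nr_schemes, kdamond) arm in the damon_sysfs_handle_cmd() switch that follows.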
@@ -1543,12 +1504,29 @@ static int damon_sysfs_handle_cmd(enum damon_sysfs_cmd cmd, { bool need_wait = true; - /* Handle commands that doesn't access DAMON context-internal data */ switch (cmd) { case DAMON_SYSFS_CMD_ON: return damon_sysfs_turn_damon_on(kdamond); case DAMON_SYSFS_CMD_OFF: return damon_sysfs_turn_damon_off(kdamond); + case DAMON_SYSFS_CMD_COMMIT_SCHEMES_QUOTA_GOALS: + return damon_sysfs_damon_call( + damon_sysfs_commit_schemes_quota_goals, + kdamond); + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS: + return damon_sysfs_damon_call( + damon_sysfs_upd_schemes_stats, kdamond); + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES: + return damon_sysfs_update_schemes_tried_regions(kdamond, true); + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: + return damon_sysfs_update_schemes_tried_regions(kdamond, false); + case DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS: + return damon_sysfs_schemes_clear_regions( + kdamond->contexts->contexts_arr[0]->schemes); + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS: + return damon_sysfs_damon_call( + damon_sysfs_upd_schemes_effective_quotas, + kdamond); default: break; } diff --git a/mm/damon/tests/.kunitconfig b/mm/damon/tests/.kunitconfig index a73be044fc9b..36a450f57b58 100644 --- a/mm/damon/tests/.kunitconfig +++ b/mm/damon/tests/.kunitconfig @@ -13,10 +13,3 @@ CONFIG_DAMON_VADDR_KUNIT_TEST=y CONFIG_SYSFS=y CONFIG_DAMON_SYSFS=y CONFIG_DAMON_SYSFS_KUNIT_TEST=y - -# for DAMON debugfs interface -CONFIG_DEBUG_FS=y -CONFIG_DAMON_PADDR=y -CONFIG_DAMON_DBGFS_DEPRECATED=y -CONFIG_DAMON_DBGFS=y -CONFIG_DAMON_DBGFS_KUNIT_TEST=y diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index cf22e09a3507..532c6a6f21f9 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -411,7 +411,7 @@ static void damos_test_new_filter(struct kunit *test) { struct damos_filter *filter; - filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true); + filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, false); KUNIT_EXPECT_EQ(test, filter->type, DAMOS_FILTER_TYPE_ANON); KUNIT_EXPECT_EQ(test, filter->matching, true); KUNIT_EXPECT_PTR_EQ(test, filter->list.prev, &filter->list); @@ -425,7 +425,7 @@ static void damos_test_filter_out(struct kunit *test) struct damon_region *r, *r2; struct damos_filter *f; - f = damos_new_filter(DAMOS_FILTER_TYPE_ADDR, true); + f = damos_new_filter(DAMOS_FILTER_TYPE_ADDR, true, false); f->addr_range = (struct damon_addr_range){ .start = DAMON_MIN_REGION * 2, .end = DAMON_MIN_REGION * 6}; @@ -434,25 +434,25 @@ static void damos_test_filter_out(struct kunit *test) damon_add_region(r, t); /* region in the range */ - KUNIT_EXPECT_TRUE(test, __damos_filter_out(NULL, t, r, f)); + KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region before the range */ r->ar.start = DAMON_MIN_REGION * 1; r->ar.end = DAMON_MIN_REGION * 2; - KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f)); + KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region after the range */ r->ar.start = DAMON_MIN_REGION * 6; r->ar.end = DAMON_MIN_REGION * 8; - KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f)); + KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region started before the range */ r->ar.start = DAMON_MIN_REGION * 1; r->ar.end = DAMON_MIN_REGION * 4; - KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f)); + 
KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f)); /* filter should have split the region */ KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 1); KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 2); @@ -465,7 +465,7 @@ static void damos_test_filter_out(struct kunit *test) /* region started in the range */ r->ar.start = DAMON_MIN_REGION * 2; r->ar.end = DAMON_MIN_REGION * 8; - KUNIT_EXPECT_TRUE(test, __damos_filter_out(NULL, t, r, f)); + KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f)); /* filter should have split the region */ KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 2); KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 6); diff --git a/mm/damon/tests/dbgfs-kunit.h b/mm/damon/tests/dbgfs-kunit.h deleted file mode 100644 index 087e53f641a8..000000000000 --- a/mm/damon/tests/dbgfs-kunit.h +++ /dev/null @@ -1,173 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * DAMON Debugfs Interface Unit Tests - * - * Author: SeongJae Park - */ - -#ifdef CONFIG_DAMON_DBGFS_KUNIT_TEST - -#ifndef _DAMON_DBGFS_TEST_H -#define _DAMON_DBGFS_TEST_H - -#include - -static void damon_dbgfs_test_str_to_ints(struct kunit *test) -{ - char *question; - int *answers; - int expected[] = {12, 35, 46}; - ssize_t nr_integers = 0, i; - - question = "123"; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); - KUNIT_EXPECT_EQ(test, 123, answers[0]); - kfree(answers); - - question = "123abc"; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); - KUNIT_EXPECT_EQ(test, 123, answers[0]); - kfree(answers); - - question = "a123"; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); - kfree(answers); - - question = "12 35"; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); - for (i = 0; i < nr_integers; i++) - KUNIT_EXPECT_EQ(test, expected[i], answers[i]); - kfree(answers); - - question = "12 35 46"; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)3, nr_integers); - for (i = 0; i < nr_integers; i++) - KUNIT_EXPECT_EQ(test, expected[i], answers[i]); - kfree(answers); - - question = "12 35 abc 46"; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); - for (i = 0; i < 2; i++) - KUNIT_EXPECT_EQ(test, expected[i], answers[i]); - kfree(answers); - - question = ""; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); - kfree(answers); - - question = "\n"; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); - kfree(answers); -} - -static void damon_dbgfs_test_set_targets(struct kunit *test) -{ - struct damon_ctx *ctx = dbgfs_new_ctx(); - char buf[64]; - - if (!damon_is_registered_ops(DAMON_OPS_PADDR)) { - dbgfs_destroy_ctx(ctx); - kunit_skip(test, "PADDR not registered"); - } - - /* Make DAMON consider target has no pid */ - damon_select_ops(ctx, DAMON_OPS_PADDR); - - dbgfs_set_targets(ctx, 0, NULL); - sprint_target_ids(ctx, buf, 64); - KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); - - dbgfs_set_targets(ctx, 1, NULL); - sprint_target_ids(ctx, buf, 64); - KUNIT_EXPECT_STREQ(test, (char *)buf, "42\n"); - - dbgfs_set_targets(ctx, 0, NULL); - sprint_target_ids(ctx, buf, 64); - 
KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); - - dbgfs_destroy_ctx(ctx); -} - -static void damon_dbgfs_test_set_init_regions(struct kunit *test) -{ - struct damon_ctx *ctx = damon_new_ctx(); - /* Each line represents one region in `` `` */ - char * const valid_inputs[] = {"1 10 20\n 1 20 30\n1 35 45", - "1 10 20\n", - "1 10 20\n0 39 59\n0 70 134\n 1 20 25\n", - ""}; - /* Reading the file again will show sorted, clean output */ - char * const valid_expects[] = {"1 10 20\n1 20 30\n1 35 45\n", - "1 10 20\n", - "0 39 59\n0 70 134\n1 10 20\n1 20 25\n", - ""}; - char * const invalid_inputs[] = {"3 10 20\n", /* target not exists */ - "1 10 20\n 1 14 26\n", /* regions overlap */ - "0 10 20\n1 30 40\n 0 5 8"}; /* not sorted by address */ - char *input, *expect; - int i, rc; - char buf[256]; - - if (!damon_is_registered_ops(DAMON_OPS_PADDR)) { - damon_destroy_ctx(ctx); - kunit_skip(test, "PADDR not registered"); - } - - damon_select_ops(ctx, DAMON_OPS_PADDR); - - dbgfs_set_targets(ctx, 3, NULL); - - /* Put valid inputs and check the results */ - for (i = 0; i < ARRAY_SIZE(valid_inputs); i++) { - input = valid_inputs[i]; - expect = valid_expects[i]; - - rc = set_init_regions(ctx, input, strnlen(input, 256)); - KUNIT_EXPECT_EQ(test, rc, 0); - - memset(buf, 0, 256); - sprint_init_regions(ctx, buf, 256); - - KUNIT_EXPECT_STREQ(test, (char *)buf, expect); - } - /* Put invalid inputs and check the return error code */ - for (i = 0; i < ARRAY_SIZE(invalid_inputs); i++) { - input = invalid_inputs[i]; - pr_info("input: %s\n", input); - rc = set_init_regions(ctx, input, strnlen(input, 256)); - KUNIT_EXPECT_EQ(test, rc, -EINVAL); - - memset(buf, 0, 256); - sprint_init_regions(ctx, buf, 256); - - KUNIT_EXPECT_STREQ(test, (char *)buf, ""); - } - - dbgfs_set_targets(ctx, 0, NULL); - damon_destroy_ctx(ctx); -} - -static struct kunit_case damon_test_cases[] = { - KUNIT_CASE(damon_dbgfs_test_str_to_ints), - KUNIT_CASE(damon_dbgfs_test_set_targets), - KUNIT_CASE(damon_dbgfs_test_set_init_regions), - {}, -}; - -static struct kunit_suite damon_test_suite = { - .name = "damon-dbgfs", - .test_cases = damon_test_cases, -}; -kunit_test_suite(damon_test_suite); - -#endif /* _DAMON_DBGFS_TEST_H */ - -#endif /* CONFIG_DAMON_KUNIT_TEST */ diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h index b9fe3bc8472b..7cd944266a92 100644 --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -68,7 +68,7 @@ static void damon_test_three_regions_in_vmas(struct kunit *test) static struct mm_struct mm; struct damon_addr_range regions[3] = {0}; /* 10-20-25, 200-210-220, 300-305, 307-330 */ - struct vm_area_struct vmas[] = { + static struct vm_area_struct vmas[] = { (struct vm_area_struct) {.vm_start = 10, .vm_end = 20}, (struct vm_area_struct) {.vm_start = 20, .vm_end = 25}, (struct vm_area_struct) {.vm_start = 200, .vm_end = 210}, diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index b9eaa20b73b9..a6174f725bd7 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -655,7 +655,7 @@ static unsigned long damos_madvise(struct damon_target *target, static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, - struct damos *scheme) + struct damos *scheme, unsigned long *sz_filter_passed) { int madv_action; diff --git a/mm/debug.c b/mm/debug.c index 95b6ab809c0e..325d7bf22038 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -178,6 +178,17 @@ EXPORT_SYMBOL(dump_page); void dump_vma(const struct vm_area_struct *vma) { +#ifdef CONFIG_PER_VMA_LOCK + pr_emerg("vma 
%px start %px end %px mm %px\n" + "prot %lx anon_vma %px vm_ops %px\n" + "pgoff %lx file %px private_data %px\n" + "flags: %#lx(%pGv) refcnt %x\n", + vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_mm, + (unsigned long)pgprot_val(vma->vm_page_prot), + vma->anon_vma, vma->vm_ops, vma->vm_pgoff, + vma->vm_file, vma->vm_private_data, + vma->vm_flags, &vma->vm_flags, refcount_read(&vma->vm_refcnt)); +#else pr_emerg("vma %px start %px end %px mm %px\n" "prot %lx anon_vma %px vm_ops %px\n" "pgoff %lx file %px private_data %px\n" @@ -187,6 +198,7 @@ void dump_vma(const struct vm_area_struct *vma) vma->anon_vma, vma->vm_ops, vma->vm_pgoff, vma->vm_file, vma->vm_private_data, vma->vm_flags, &vma->vm_flags); +#endif } EXPORT_SYMBOL(dump_vma); @@ -249,6 +261,77 @@ void dump_mm(const struct mm_struct *mm) } EXPORT_SYMBOL(dump_mm); +void dump_vmg(const struct vma_merge_struct *vmg, const char *reason) +{ + if (reason) + pr_warn("vmg %px dumped because: %s\n", vmg, reason); + + if (!vmg) { + pr_warn("vmg %px state: (NULL)\n", vmg); + return; + } + + pr_warn("vmg %px state: mm %px pgoff %lx\n" + "vmi %px [%lx,%lx)\n" + "prev %px next %px vma %px\n" + "start %lx end %lx flags %lx\n" + "file %px anon_vma %px policy %px\n" + "uffd_ctx %px\n" + "anon_name %px\n" + "merge_flags %x state %x\n", + vmg, vmg->mm, vmg->pgoff, + vmg->vmi, vmg->vmi ? vma_iter_addr(vmg->vmi) : 0, + vmg->vmi ? vma_iter_end(vmg->vmi) : 0, + vmg->prev, vmg->next, vmg->vma, + vmg->start, vmg->end, vmg->flags, + vmg->file, vmg->anon_vma, vmg->policy, +#ifdef CONFIG_USERFAULTFD + vmg->uffd_ctx.ctx, +#else + (void *)0, +#endif + vmg->anon_name, + (int)vmg->merge_flags, (int)vmg->state); + + if (vmg->mm) { + pr_warn("vmg %px mm:\n", vmg); + dump_mm(vmg->mm); + } else { + pr_warn("vmg %px mm: (NULL)\n", vmg); + } + + if (vmg->vma) { + pr_warn("vmg %px vma:\n", vmg); + dump_vma(vmg->vma); + } else { + pr_warn("vmg %px vma: (NULL)\n", vmg); + } + + if (vmg->prev) { + pr_warn("vmg %px prev:\n", vmg); + dump_vma(vmg->prev); + } else { + pr_warn("vmg %px prev: (NULL)\n", vmg); + } + + if (vmg->next) { + pr_warn("vmg %px next:\n", vmg); + dump_vma(vmg->next); + } else { + pr_warn("vmg %px next: (NULL)\n", vmg); + } + +#ifdef CONFIG_DEBUG_VM_MAPLE_TREE + if (vmg->vmi) { + pr_warn("vmg %px vmi:\n", vmg); + vma_iter_dump_tree(vmg->vmi); + } else { + pr_warn("vmg %px vmi: (NULL)\n", vmg); + } +#endif +} +EXPORT_SYMBOL(dump_vmg); + static bool page_init_poisoning __read_mostly = true; static int __init setup_vm_debug(char *str) diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index ce06b2884789..ff35b84a7b50 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -245,7 +245,10 @@ early_memremap_prot(resource_size_t phys_addr, unsigned long size, #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) -void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size) +/* + * If no empty slot, handle that and return -ENOMEM. 
+ */ +int __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size) { unsigned long slop, clen; char *p; @@ -256,12 +259,15 @@ void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size) if (clen > MAX_MAP_CHUNK - slop) clen = MAX_MAP_CHUNK - slop; p = early_memremap(src & PAGE_MASK, clen + slop); + if (!p) + return -ENOMEM; memcpy(dest, p + slop, clen); early_memunmap(p, clen + slop); dest += clen; src += clen; size -= clen; } + return 0; } #else /* CONFIG_MMU */ diff --git a/mm/filemap.c b/mm/filemap.c index 4f476411a9a2..279959cf9300 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -440,6 +440,24 @@ int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, } EXPORT_SYMBOL(filemap_fdatawrite_range); +/** + * filemap_fdatawrite_range_kick - start writeback on a range + * @mapping: target address_space + * @start: index to start writeback on + * @end: last (non-inclusive) index for writeback + * + * This is a non-integrity writeback helper, to start writing back folios + * for the indicated range. + * + * Return: %0 on success, negative error code otherwise. + */ +int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, + loff_t end) +{ + return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_NONE); +} +EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick); + /** * filemap_flush - mostly a non-blocking flush * @mapping: target address_space @@ -1463,25 +1481,6 @@ static int folio_put_wait_locked(struct folio *folio, int state) return folio_wait_bit_common(folio, PG_locked, state, DROP); } -/** - * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue - * @folio: Folio defining the wait queue of interest - * @waiter: Waiter to add to the queue - * - * Add an arbitrary @waiter to the wait queue for the nominated @folio. - */ -void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter) -{ - wait_queue_head_t *q = folio_waitqueue(folio); - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue_entry_tail(q, waiter); - folio_set_waiters(folio); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL_GPL(folio_add_wait_queue); - /** * folio_unlock - Unlock a locked folio. * @folio: The folio. @@ -1590,6 +1589,27 @@ int folio_wait_private_2_killable(struct folio *folio) } EXPORT_SYMBOL(folio_wait_private_2_killable); +/* + * If folio was marked as dropbehind, then pages should be dropped when writeback + * completes. Do that now. If we fail, it's likely because of a big folio - + * just reset dropbehind for that case and later completions should invalidate. + */ +static void folio_end_dropbehind_write(struct folio *folio) +{ + /* + * Hitting !in_task() should not happen off RWF_DONTCACHE writeback, + * but can happen if normal writeback just happens to find dirty folios + * that were created as part of uncached writeback, and that writeback + * would otherwise not need non-IRQ handling. Just skip the + * invalidation in that case. + */ + if (in_task() && folio_trylock(folio)) { + if (folio->mapping) + folio_unmap_invalidate(folio->mapping, folio, 0); + folio_unlock(folio); + } +} + /** * folio_end_writeback - End writeback against a folio. * @folio: The folio.
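The dropbehind plumbing above (folio_end_dropbehind_write() here, filemap_end_dropbehind_read() further down) is what backs uncached buffered I/O: folios touched under the new flag are invalidated once the read or writeback completes instead of lingering in the page cache. From user space the feature is reached through the RWF_DONTCACHE flag of preadv2()/pwritev2(); a minimal sketch, assuming a kernel and uapi headers that provide the flag (the fallback value below is an assumption, not taken from this patch), could be::

    #define _GNU_SOURCE
    #include <sys/uio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <stdio.h>

    #ifndef RWF_DONTCACHE
    #define RWF_DONTCACHE 0x00000080        /* assumed value; prefer uapi headers */
    #endif

    int main(void)
    {
            char buf[4096];
            struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
            int fd = open("testfile", O_RDONLY);
            ssize_t ret;

            if (fd < 0)
                    return 1;
            /* buffered read whose folios are dropped after completion */
            ret = preadv2(fd, &iov, 1, 0, RWF_DONTCACHE);
            if (ret < 0)
                    perror("preadv2");      /* EOPNOTSUPP without kernel support */
            close(fd);
            return 0;
    }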
@@ -1600,6 +1620,8 @@ EXPORT_SYMBOL(folio_wait_private_2_killable); */ void folio_end_writeback(struct folio *folio) { + bool folio_dropbehind = false; + VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio); /* @@ -1621,9 +1643,14 @@ void folio_end_writeback(struct folio *folio) * reused before the folio_wake_bit(). */ folio_get(folio); + if (!folio_test_dirty(folio)) + folio_dropbehind = folio_test_clear_dropbehind(folio); if (__folio_end_writeback(folio)) folio_wake_bit(folio, PG_writeback); acct_reclaim_writeback(folio); + + if (folio_dropbehind) + folio_end_dropbehind_write(folio); folio_put(folio); } EXPORT_SYMBOL(folio_end_writeback); @@ -1946,6 +1973,8 @@ no_page: /* Init accessed so avoid atomic mark_page_accessed later */ if (fgp_flags & FGP_ACCESSED) __folio_set_referenced(folio); + if (fgp_flags & FGP_DONTCACHE) + __folio_set_dropbehind(folio); err = filemap_add_folio(mapping, folio, index, gfp); if (!err) @@ -1968,6 +1997,9 @@ no_page: if (!folio) return ERR_PTR(-ENOENT); + /* not an uncached lookup, clear uncached if set */ + if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE)) + folio_clear_dropbehind(folio); return folio; } EXPORT_SYMBOL(__filemap_get_folio); @@ -2450,18 +2482,22 @@ unlock_mapping: return error; } -static int filemap_create_folio(struct file *file, - struct address_space *mapping, loff_t pos, - struct folio_batch *fbatch) +static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch) { + struct address_space *mapping = iocb->ki_filp->f_mapping; struct folio *folio; int error; unsigned int min_order = mapping_min_folio_order(mapping); pgoff_t index; + if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) + return -EAGAIN; + folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order); if (!folio) return -ENOMEM; + if (iocb->ki_flags & IOCB_DONTCACHE) + __folio_set_dropbehind(folio); /* * Protect against truncate / hole punch. Grabbing invalidate_lock @@ -2477,7 +2513,7 @@ static int filemap_create_folio(struct file *file, * well to keep locking rules simple. 
*/ filemap_invalidate_lock_shared(mapping); - index = (pos >> (PAGE_SHIFT + min_order)) << min_order; + index = (iocb->ki_pos >> (PAGE_SHIFT + min_order)) << min_order; error = filemap_add_folio(mapping, folio, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error == -EEXIST) @@ -2485,7 +2521,8 @@ static int filemap_create_folio(struct file *file, if (error) goto error; - error = filemap_read_folio(file, mapping->a_ops->read_folio, folio); + error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio, + folio); if (error) goto error; @@ -2506,6 +2543,8 @@ static int filemap_readahead(struct kiocb *iocb, struct file *file, if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; + if (iocb->ki_flags & IOCB_DONTCACHE) + ractl.dropbehind = 1; page_cache_async_ra(&ractl, folio, last_index - folio->index); return 0; } @@ -2515,7 +2554,6 @@ static int filemap_get_pages(struct kiocb *iocb, size_t count, { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; - struct file_ra_state *ra = &filp->f_ra; pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; pgoff_t last_index; struct folio *folio; @@ -2530,20 +2568,21 @@ retry: filemap_get_read_batch(mapping, index, last_index - 1, fbatch); if (!folio_batch_count(fbatch)) { + DEFINE_READAHEAD(ractl, filp, &filp->f_ra, mapping, index); + if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; if (iocb->ki_flags & IOCB_NOWAIT) flags = memalloc_noio_save(); - page_cache_sync_readahead(mapping, ra, filp, index, - last_index - index); + if (iocb->ki_flags & IOCB_DONTCACHE) + ractl.dropbehind = 1; + page_cache_sync_ra(&ractl, last_index - index); if (iocb->ki_flags & IOCB_NOWAIT) memalloc_noio_restore(flags); filemap_get_read_batch(mapping, index, last_index - 1, fbatch); } if (!folio_batch_count(fbatch)) { - if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) - return -EAGAIN; - err = filemap_create_folio(filp, mapping, iocb->ki_pos, fbatch); + err = filemap_create_folio(iocb, fbatch); if (err == AOP_TRUNCATED_PAGE) goto retry; return err; @@ -2584,6 +2623,20 @@ static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio) return (pos1 >> shift == pos2 >> shift); } +static void filemap_end_dropbehind_read(struct address_space *mapping, + struct folio *folio) +{ + if (!folio_test_dropbehind(folio)) + return; + if (folio_test_writeback(folio) || folio_test_dirty(folio)) + return; + if (folio_trylock(folio)) { + if (folio_test_clear_dropbehind(folio)) + folio_unmap_invalidate(mapping, folio, 0); + folio_unlock(folio); + } +} + /** * filemap_read - Read data from the page cache. * @iocb: The iocb to read. @@ -2697,8 +2750,12 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, } } put_folios: - for (i = 0; i < folio_batch_count(&fbatch); i++) - folio_put(fbatch.folios[i]); + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + + filemap_end_dropbehind_read(mapping, folio); + folio_put(folio); + } folio_batch_init(&fbatch); } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); diff --git a/mm/gup.c b/mm/gup.c index 3b75e631f369..2cc3a9d28e70 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -596,6 +596,33 @@ static struct folio *try_grab_folio_fast(struct page *page, int refs, } #endif /* CONFIG_HAVE_GUP_FAST */ +/* Common code for can_follow_write_* */ +static inline bool can_follow_write_common(struct page *page, + struct vm_area_struct *vma, unsigned int flags) +{ + /* Maybe FOLL_FORCE is set to override it? 
*/ + if (!(flags & FOLL_FORCE)) + return false; + + /* But FOLL_FORCE has no effect on shared mappings */ + if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) + return false; + + /* ... or read-only private ones */ + if (!(vma->vm_flags & VM_MAYWRITE)) + return false; + + /* ... or already writable ones that just need to take a write fault */ + if (vma->vm_flags & VM_WRITE) + return false; + + /* + * See can_change_pte_writable(): we broke COW and could map the page + * writable if we have an exclusive anonymous page ... + */ + return page && PageAnon(page) && PageAnonExclusive(page); +} + static struct page *no_page_table(struct vm_area_struct *vma, unsigned int flags, unsigned long address) { @@ -622,6 +649,18 @@ static struct page *no_page_table(struct vm_area_struct *vma, } #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES +/* FOLL_FORCE can write to even unwritable PUDs in COW mappings. */ +static inline bool can_follow_write_pud(pud_t pud, struct page *page, + struct vm_area_struct *vma, + unsigned int flags) +{ + /* If the pud is writable, we can write to the page. */ + if (pud_write(pud)) + return true; + + return can_follow_write_common(page, vma, flags); +} + static struct page *follow_huge_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp, int flags, struct follow_page_context *ctx) @@ -634,10 +673,11 @@ static struct page *follow_huge_pud(struct vm_area_struct *vma, assert_spin_locked(pud_lockptr(mm, pudp)); - if ((flags & FOLL_WRITE) && !pud_write(pud)) + if (!pud_present(pud)) return NULL; - if (!pud_present(pud)) + if ((flags & FOLL_WRITE) && + !can_follow_write_pud(pud, pfn_to_page(pfn), vma, flags)) return NULL; pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; @@ -686,27 +726,7 @@ static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page, if (pmd_write(pmd)) return true; - /* Maybe FOLL_FORCE is set to override it? */ - if (!(flags & FOLL_FORCE)) - return false; - - /* But FOLL_FORCE has no effect on shared mappings */ - if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) - return false; - - /* ... or read-only private ones */ - if (!(vma->vm_flags & VM_MAYWRITE)) - return false; - - /* ... or already writable ones that just need to take a write fault */ - if (vma->vm_flags & VM_WRITE) - return false; - - /* - * See can_change_pte_writable(): we broke COW and could map the page - * writable if we have an exclusive anonymous page ... - */ - if (!page || !PageAnon(page) || !PageAnonExclusive(page)) + if (!can_follow_write_common(page, vma, flags)) return false; /* ... and a write-fault isn't required for other reasons. */ @@ -807,27 +827,7 @@ static inline bool can_follow_write_pte(pte_t pte, struct page *page, if (pte_write(pte)) return true; - /* Maybe FOLL_FORCE is set to override it? */ - if (!(flags & FOLL_FORCE)) - return false; - - /* But FOLL_FORCE has no effect on shared mappings */ - if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) - return false; - - /* ... or read-only private ones */ - if (!(vma->vm_flags & VM_MAYWRITE)) - return false; - - /* ... or already writable ones that just need to take a write fault */ - if (vma->vm_flags & VM_WRITE) - return false; - - /* - * See can_change_pte_writable(): we broke COW and could map the page - * writable if we have an exclusive anonymous page ... - */ - if (!page || !PageAnon(page) || !PageAnonExclusive(page)) + if (!can_follow_write_common(page, vma, flags)) return false; /* ... and a write-fault isn't required for other reasons. 
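Factoring the checks into can_follow_write_common() means the FOLL_FORCE override now behaves identically for PTE-, PMD- and PUD-mapped pages. A hedged sketch of the kind of caller these rules serve (the classic ptrace-style forced write; the GUP API is the existing one, but the snippet itself is illustrative, not part of this patch)::

    /* Illustrative only: forced write into a read-only private mapping,
     * with mmap_read_lock(mm) held by the caller. This succeeds only if
     * can_follow_write_common() allows it: private VMA, VM_MAYWRITE set,
     * and an exclusive anonymous page once COW has been broken. */
    struct page *page;
    long nr = get_user_pages_remote(mm, addr, 1, FOLL_WRITE | FOLL_FORCE,
                                    &page, NULL);
    if (nr == 1) {
            /* ... modify the page ... */
            put_page(page);
    }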
*/ @@ -1294,9 +1294,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; - /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */ - if (is_vm_hugetlb_page(vma)) - return -EFAULT; /* * We used to let the write,force case do COW in a * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could @@ -2347,7 +2344,7 @@ static unsigned long collect_longterm_unpinnable_folios( continue; if (folio_test_hugetlb(folio)) { - isolate_hugetlb(folio, movable_folio_list); + folio_isolate_hugetlb(folio, movable_folio_list); continue; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index db64116a4f84..3d3ebdc002d5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -617,6 +617,8 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT); DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN); +DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK); +DEFINE_MTHP_STAT_ATTR(swpin_fallback_charge, MTHP_STAT_SWPIN_FALLBACK_CHARGE); DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT); DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); #ifdef CONFIG_SHMEM @@ -637,6 +639,8 @@ static struct attribute *anon_stats_attrs[] = { #ifndef CONFIG_SHMEM &zswpout_attr.attr, &swpin_attr.attr, + &swpin_fallback_attr.attr, + &swpin_fallback_charge_attr.attr, &swpout_attr.attr, &swpout_fallback_attr.attr, #endif @@ -669,6 +673,8 @@ static struct attribute *any_stats_attrs[] = { #ifdef CONFIG_SHMEM &zswpout_attr.attr, &swpin_attr.attr, + &swpin_fallback_attr.attr, + &swpin_fallback_charge_attr.attr, &swpout_attr.attr, &swpout_fallback_attr.attr, #endif @@ -2003,7 +2009,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) spin_unlock(vmf->ptl); writable = false; - if (!migrate_misplaced_folio(folio, vma, target_nid)) { + if (!migrate_misplaced_folio(folio, target_nid)) { flags |= TNF_MIGRATED; nid = target_nid; task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags); @@ -3284,7 +3290,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ lruvec = folio_lruvec_lock(folio); - ClearPageHasHWPoisoned(head); + folio_clear_has_hwpoisoned(folio); for (i = nr - new_nr; i >= new_nr; i -= new_nr) { struct folio *tail; @@ -4175,20 +4181,21 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, if (input_buf[0] == '/') { char *tok; - char *buf = input_buf; + char *tok_buf = input_buf; char file_path[MAX_INPUT_BUF_SZ]; pgoff_t off_start = 0, off_end = 0; size_t input_len = strlen(input_buf); - tok = strsep(&buf, ","); - if (tok && buf) { + tok = strsep(&tok_buf, ","); + if (tok && tok_buf) { strscpy(file_path, tok); } else { ret = -EINVAL; goto out; } - ret = sscanf(buf, "0x%lx,0x%lx,%d", &off_start, &off_end, &new_order); + ret = sscanf(tok_buf, "0x%lx,0x%lx,%d", &off_start, + &off_end, &new_order); if (ret != 2 && ret != 3) { ret = -EINVAL; goto out; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index eaaec19caa7c..6a0ea28f5bac 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -48,6 +48,7 @@ #include #include "internal.h" #include "hugetlb_vmemmap.h" +#include int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; @@ -1246,69 +1247,6 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma) 
hugetlb_dup_vma_private(vma); } -/* Returns true if the VMA has associated reserve pages */ -static bool vma_has_reserves(struct vm_area_struct *vma, long chg) -{ - if (vma->vm_flags & VM_NORESERVE) { - /* - * This address is already reserved by other process(chg == 0), - * so, we should decrement reserved count. Without decrementing, - * reserve count remains after releasing inode, because this - * allocated page will go into page cache and is regarded as - * coming from reserved pool in releasing step. Currently, we - * don't have any other solution to deal with this situation - * properly, so add work-around here. - */ - if (vma->vm_flags & VM_MAYSHARE && chg == 0) - return true; - else - return false; - } - - /* Shared mappings always use reserves */ - if (vma->vm_flags & VM_MAYSHARE) { - /* - * We know VM_NORESERVE is not set. Therefore, there SHOULD - * be a region map for all pages. The only situation where - * there is no region map is if a hole was punched via - * fallocate. In this case, there really are no reserves to - * use. This situation is indicated if chg != 0. - */ - if (chg) - return false; - else - return true; - } - - /* - * Only the process that called mmap() has reserves for - * private mappings. - */ - if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { - /* - * Like the shared case above, a hole punch or truncate - * could have been performed on the private mapping. - * Examine the value of chg to determine if reserves - * actually exist or were previously consumed. - * Very Subtle - The value of chg comes from a previous - * call to vma_needs_reserves(). The reserve map for - * private mappings has different (opposite) semantics - * than that of shared mappings. vma_needs_reserves() - * has already taken this difference in semantics into - * account. Therefore, the meaning of chg is the same - * as in the shared case above. Code could easily be - * combined, but keeping it separate draws attention to - * subtle differences. - */ - if (chg) - return false; - else - return true; - } - - return false; -} - static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio) { int nid = folio_nid(folio); @@ -1336,6 +1274,9 @@ static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h, if (folio_test_hwpoison(folio)) continue; + if (is_migrate_isolate_page(&folio->page)) + continue; + list_move(&folio->lru, &h->hugepage_activelist); folio_ref_unfreeze(folio, 1); folio_clear_hugetlb_freed(folio); @@ -1394,8 +1335,7 @@ static unsigned long available_huge_pages(struct hstate *h) static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, - unsigned long address, int avoid_reserve, - long chg) + unsigned long address, long gbl_chg) { struct folio *folio = NULL; struct mempolicy *mpol; @@ -1404,15 +1344,10 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, int nid; /* - * A child process with MAP_PRIVATE mappings created by their parent - * have no page reserves. This check ensures that reservations are - * not "stolen". The child may still get SIGKILLed + * gbl_chg==1 means the allocation requires a new page that was not + * reserved before. Making sure there's at least one free page. 
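With vma_has_reserves() gone, gbl_chg is the single signal here: 0 means an existing reservation is being consumed, 1 means a free huge page must be available right now. The usual way to hit the gbl_chg == 1 path is a MAP_NORESERVE mapping (userspace illustration, 2 MiB huge pages assumed; not part of the patch)::

    #include <sys/mman.h>

    /* No reservation is taken at mmap() time, so each fault must find a
     * free huge page; the dequeue path here then insists on
     * available_huge_pages() before handing one out. */
    void *p = mmap(NULL, 2 * 1024 * 1024, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_NORESERVE,
                   -1, 0);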
*/ - if (!vma_has_reserves(vma, chg) && !available_huge_pages(h)) - goto err; - - /* If reserves cannot be used, ensure enough pages are in the pool */ - if (avoid_reserve && !available_huge_pages(h)) + if (gbl_chg && !available_huge_pages(h)) goto err; gfp_mask = htlb_alloc_mask(h); @@ -1430,11 +1365,6 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, nid, nodemask); - if (folio && !avoid_reserve && vma_has_reserves(vma, chg)) { - folio_set_hugetlb_restore_reserve(folio); - h->resv_huge_pages--; - } - mpol_cond_put(mpol); return folio; @@ -2463,7 +2393,13 @@ static int gather_surplus_pages(struct hstate *h, long delta) long needed, allocated; bool alloc_ok = true; int node; - nodemask_t *mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h)); + nodemask_t *mbind_nodemask, alloc_nodemask; + + mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h)); + if (mbind_nodemask) + nodes_and(alloc_nodemask, *mbind_nodemask, cpuset_current_mems_allowed); + else + alloc_nodemask = cpuset_current_mems_allowed; lockdep_assert_held(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - h->free_huge_pages; @@ -2479,8 +2415,16 @@ retry: spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { folio = NULL; - for_each_node_mask(node, cpuset_current_mems_allowed) { - if (!mbind_nodemask || node_isset(node, *mbind_nodemask)) { + + /* Prioritize current node */ + if (node_isset(numa_mem_id(), alloc_nodemask)) + folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), + numa_mem_id(), NULL); + + if (!folio) { + for_each_node_mask(node, alloc_nodemask) { + if (node == numa_mem_id()) + continue; folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), node, NULL); if (folio) @@ -2868,7 +2812,7 @@ retry: * Fail with -EBUSY if not possible. */ spin_unlock_irq(&hugetlb_lock); - isolated = isolate_hugetlb(old_folio, list); + isolated = folio_isolate_hugetlb(old_folio, list); ret = isolated ? 0 : -EBUSY; spin_lock_irq(&hugetlb_lock); goto free_new; @@ -2953,7 +2897,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) if (hstate_is_gigantic(h)) return -ENOMEM; - if (folio_ref_count(folio) && isolate_hugetlb(folio, list)) + if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list)) ret = 0; else if (!folio_ref_count(folio)) ret = alloc_and_dissolve_hugetlb_folio(h, folio, list); @@ -2961,69 +2905,129 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) return ret; } +/* + * replace_free_hugepage_folios - Replace free hugepage folios in a given pfn + * range with new folios. + * @start_pfn: start pfn of the given pfn range + * @end_pfn: end pfn of the given pfn range + * Returns 0 on success, otherwise negated error. + */ +int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn) +{ + struct hstate *h; + struct folio *folio; + int ret = 0; + + LIST_HEAD(isolate_list); + + while (start_pfn < end_pfn) { + folio = pfn_folio(start_pfn); + if (folio_test_hugetlb(folio)) { + h = folio_hstate(folio); + } else { + start_pfn++; + continue; + } + + if (!folio_ref_count(folio)) { + ret = alloc_and_dissolve_hugetlb_folio(h, folio, + &isolate_list); + if (ret) + break; + + putback_movable_pages(&isolate_list); + } + start_pfn++; + } + + return ret; +} + +typedef enum { + /* + * For either 0/1: we checked the per-vma resv map, and one resv + * count either can be reused (0), or an extra needed (1). 
+ */
+	MAP_CHG_REUSE = 0,
+	MAP_CHG_NEEDED = 1,
+	/*
+	 * The per-vma resv count cannot be used, hence a new resv
+	 * count is enforced.
+	 *
+	 * NOTE: This is mostly identical to MAP_CHG_NEEDED, except
+	 * that currently vma_needs_reservation() has an unwanted side
+	 * effect to either use end() or commit() to complete the
+	 * transaction. Hence it needs to differentiate from NEEDED.
+	 */
+	MAP_CHG_ENFORCED = 2,
+} map_chg_state;
+
+/*
+ * NOTE! "cow_from_owner" represents a very hacky usage only used in CoW
+ * faults of hugetlb private mappings on top of a non-page-cache folio (in
+ * which case even if there's a private vma resv map it won't cover such
+ * allocation). New call sites should (probably) never set it to true!!
+ * When it's set, the allocation will bypass all vma level reservations.
+ */
 struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
-				  unsigned long addr, int avoid_reserve)
+				  unsigned long addr, bool cow_from_owner)
 {
 	struct hugepage_subpool *spool = subpool_vma(vma);
 	struct hstate *h = hstate_vma(vma);
 	struct folio *folio;
-	long map_chg, map_commit, nr_pages = pages_per_huge_page(h);
-	long gbl_chg;
-	int memcg_charge_ret, ret, idx;
+	long retval, gbl_chg;
+	map_chg_state map_chg;
+	int ret, idx;
 	struct hugetlb_cgroup *h_cg = NULL;
-	struct mem_cgroup *memcg;
-	bool deferred_reserve;
 	gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL;
 
-	memcg = get_mem_cgroup_from_current();
-	memcg_charge_ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages);
-	if (memcg_charge_ret == -ENOMEM) {
-		mem_cgroup_put(memcg);
-		return ERR_PTR(-ENOMEM);
-	}
-
 	idx = hstate_index(h);
-	/*
-	 * Examine the region/reserve map to determine if the process
-	 * has a reservation for the page to be allocated. A return
-	 * code of zero indicates a reservation exists (no change).
-	 */
-	map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
-	if (map_chg < 0) {
-		if (!memcg_charge_ret)
-			mem_cgroup_cancel_charge(memcg, nr_pages);
-		mem_cgroup_put(memcg);
-		return ERR_PTR(-ENOMEM);
+
+	/* Whether we need a separate per-vma reservation? */
+	if (cow_from_owner) {
+		/*
+		 * Special case! Since it's a CoW on top of a reserved
+		 * page, the private resv map doesn't count. So it cannot
+		 * consume the per-vma resv map even if it's reserved.
+		 */
+		map_chg = MAP_CHG_ENFORCED;
+	} else {
+		/*
+		 * Examine the region/reserve map to determine if the process
+		 * has a reservation for the page to be allocated. A return
+		 * code of zero indicates a reservation exists (no change).
+		 */
+		retval = vma_needs_reservation(h, vma, addr);
+		if (retval < 0)
+			return ERR_PTR(-ENOMEM);
+		map_chg = retval ? MAP_CHG_NEEDED : MAP_CHG_REUSE;
 	}
 
 	/*
+	 * Whether we need a separate global reservation?
+	 *
 	 * Processes that did not create the mapping will have no
 	 * reserves as indicated by the region/reserve map. Check
 	 * that the allocation will not exceed the subpool limit.
-	 * Allocations for MAP_NORESERVE mappings also need to be
-	 * checked against any subpool limit.
+	 * Or if it can get one from the pool reservation directly.
 	 */
-	if (map_chg || avoid_reserve) {
+	if (map_chg) {
 		gbl_chg = hugepage_subpool_get_pages(spool, 1);
 		if (gbl_chg < 0)
 			goto out_end_reservation;
-
+	} else {
 		/*
-		 * Even though there was no reservation in the region/reserve
-		 * map, there could be reservations associated with the
-		 * subpool that can be used. This would be indicated if the
-		 * return value of hugepage_subpool_get_pages() is zero.
-		 * However, if avoid_reserve is specified we still avoid even
-		 * the subpool reservations.
+ * If we have the vma reservation ready, no need for extra + * global reservation. */ - if (avoid_reserve) - gbl_chg = 1; + gbl_chg = 0; } - /* If this allocation is not consuming a reservation, charge it now. + /* + * If this allocation is not consuming a per-vma reservation, + * charge the hugetlb cgroup now. */ - deferred_reserve = map_chg || avoid_reserve; - if (deferred_reserve) { + if (map_chg) { ret = hugetlb_cgroup_charge_cgroup_rsvd( idx, pages_per_huge_page(h), &h_cg); if (ret) @@ -3040,27 +3044,32 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, * from the global free pool (global change). gbl_chg == 0 indicates * a reservation exists for the allocation. */ - folio = dequeue_hugetlb_folio_vma(h, vma, addr, avoid_reserve, gbl_chg); + folio = dequeue_hugetlb_folio_vma(h, vma, addr, gbl_chg); if (!folio) { spin_unlock_irq(&hugetlb_lock); folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr); if (!folio) goto out_uncharge_cgroup; spin_lock_irq(&hugetlb_lock); - if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { - folio_set_hugetlb_restore_reserve(folio); - h->resv_huge_pages--; - } list_add(&folio->lru, &h->hugepage_activelist); folio_ref_unfreeze(folio, 1); /* Fall through */ } + /* + * Either dequeued or buddy-allocated folio needs to add special + * mark to the folio when it consumes a global reservation. + */ + if (!gbl_chg) { + folio_set_hugetlb_restore_reserve(folio); + h->resv_huge_pages--; + } + hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio); /* If allocation is not consuming a reservation, also store the * hugetlb_cgroup pointer on the page. */ - if (deferred_reserve) { + if (map_chg) { hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), h_cg, folio); } @@ -3069,50 +3078,61 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, hugetlb_set_folio_subpool(folio, spool); - map_commit = vma_commit_reservation(h, vma, addr); - if (unlikely(map_chg > map_commit)) { + if (map_chg != MAP_CHG_ENFORCED) { + /* commit() is only needed if the map_chg is not enforced */ + retval = vma_commit_reservation(h, vma, addr); /* + * Check for possible race conditions. When it happens.. * The page was added to the reservation map between * vma_needs_reservation and vma_commit_reservation. * This indicates a race with hugetlb_reserve_pages. * Adjust for the subpool count incremented above AND - * in hugetlb_reserve_pages for the same page. Also, + * in hugetlb_reserve_pages for the same page. Also, * the reservation count added in hugetlb_reserve_pages * no longer applies. */ - long rsv_adjust; + if (unlikely(map_chg == MAP_CHG_NEEDED && retval == 0)) { + long rsv_adjust; - rsv_adjust = hugepage_subpool_put_pages(spool, 1); - hugetlb_acct_memory(h, -rsv_adjust); - if (deferred_reserve) { - spin_lock_irq(&hugetlb_lock); - hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), - pages_per_huge_page(h), folio); - spin_unlock_irq(&hugetlb_lock); + rsv_adjust = hugepage_subpool_put_pages(spool, 1); + hugetlb_acct_memory(h, -rsv_adjust); + if (map_chg) { + spin_lock_irq(&hugetlb_lock); + hugetlb_cgroup_uncharge_folio_rsvd( + hstate_index(h), pages_per_huge_page(h), + folio); + spin_unlock_irq(&hugetlb_lock); + } } } - if (!memcg_charge_ret) - mem_cgroup_commit_charge(folio, memcg); + ret = mem_cgroup_charge_hugetlb(folio, gfp); + /* + * Unconditionally increment NR_HUGETLB here. If it turns out that + * mem_cgroup_charge_hugetlb failed, then immediately free the page and + * decrement NR_HUGETLB. 
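Condensed, the derivation of the three states and what each one implies for the accounting below can be summarised as (a paraphrase of the code above, not a separate implementation)::

    if (cow_from_owner)                 /* CoW stealing from the resv owner */
            map_chg = MAP_CHG_ENFORCED; /* always take a fresh count */
    else if (vma_needs_reservation(h, vma, addr))
            map_chg = MAP_CHG_NEEDED;   /* no vma resv: charge the subpool */
    else
            map_chg = MAP_CHG_REUSE;    /* vma resv exists: gbl_chg stays 0 */

    /* map_chg != MAP_CHG_REUSE    -> hugepage_subpool_get_pages() plus the
     *                                rsvd hugetlb-cgroup charge
     * map_chg != MAP_CHG_ENFORCED -> the vma reservation transaction must
     *                                still be completed via commit()/end() */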
+ */
 	lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h));
-	mem_cgroup_put(memcg);
+
+	if (ret == -ENOMEM) {
+		free_huge_folio(folio);
+		return ERR_PTR(-ENOMEM);
+	}
 
 	return folio;
 
 out_uncharge_cgroup:
 	hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
 out_uncharge_cgroup_reservation:
-	if (deferred_reserve)
+	if (map_chg)
 		hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
 						    h_cg);
 out_subpool_put:
-	if (map_chg || avoid_reserve)
+	if (map_chg)
 		hugepage_subpool_put_pages(spool, 1);
 out_end_reservation:
-	vma_end_reservation(h, vma, addr);
-	if (!memcg_charge_ret)
-		mem_cgroup_cancel_charge(memcg, nr_pages);
-	mem_cgroup_put(memcg);
+	if (map_chg != MAP_CHG_ENFORCED)
+		vma_end_reservation(h, vma, addr);
 	return ERR_PTR(-ENOSPC);
 }
 
@@ -3806,13 +3826,15 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst,
 
 	for (i = 0; i < pages_per_huge_page(src); i += pages_per_huge_page(dst)) {
 		struct page *page = folio_page(folio, i);
+		struct folio *new_folio;
 
 		page->mapping = NULL;
 		clear_compound_head(page);
 		prep_compound_page(page, dst->order);
+		new_folio = page_folio(page);
 
-		init_new_hugetlb_folio(dst, page_folio(page));
-		list_add(&page->lru, &dst_list);
+		init_new_hugetlb_folio(dst, new_folio);
+		list_add(&new_folio->lru, &dst_list);
 	}
 }
 
@@ -5141,12 +5163,12 @@ const struct vm_operations_struct hugetlb_vm_ops = {
 };
 
 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
-		int writable)
+		bool try_mkwrite)
 {
 	pte_t entry;
 	unsigned int shift = huge_page_shift(hstate_vma(vma));
 
-	if (writable) {
+	if (try_mkwrite && (vma->vm_flags & VM_WRITE)) {
 		entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
					 vma->vm_page_prot)));
 	} else {
@@ -5169,6 +5191,13 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
 		update_mmu_cache(vma, address, ptep);
 }
 
+static void set_huge_ptep_maybe_writable(struct vm_area_struct *vma,
+					 unsigned long address, pte_t *ptep)
+{
+	if (vma->vm_flags & VM_WRITE)
+		set_huge_ptep_writable(vma, address, ptep);
+}
+
 bool is_hugetlb_entry_migration(pte_t pte)
 {
 	swp_entry_t swp;
@@ -5199,7 +5228,7 @@ static void
 hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
 		      struct folio *new_folio, pte_t old, unsigned long sz)
 {
-	pte_t newpte = make_huge_pte(vma, &new_folio->page, 1);
+	pte_t newpte = make_huge_pte(vma, &new_folio->page, true);
 
 	__folio_mark_uptodate(new_folio);
 	hugetlb_add_new_anon_rmap(new_folio, vma, addr);
@@ -5333,7 +5362,7 @@ again:
 				spin_unlock(src_ptl);
 				spin_unlock(dst_ptl);
 				/* Do not use reserve as it's private owned */
-				new_folio = alloc_hugetlb_folio(dst_vma, addr, 1);
+				new_folio = alloc_hugetlb_folio(dst_vma, addr, false);
 				if (IS_ERR(new_folio)) {
 					folio_put(pte_folio);
 					ret = PTR_ERR(new_folio);
@@ -5799,7 +5828,7 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
 	struct hstate *h = hstate_vma(vma);
 	struct folio *old_folio;
 	struct folio *new_folio;
-	int outside_reserve = 0;
+	bool cow_from_owner = false;
 	vm_fault_t ret = 0;
 	struct mmu_notifier_range range;
 
@@ -5814,13 +5843,6 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
 	if (!unshare && huge_pte_uffd_wp(pte))
 		return 0;
 
-	/*
-	 * hugetlb does not support FOLL_FORCE-style write faults that keep the
-	 * PTE mapped R/O such as maybe_mkwrite() would do.
-	 */
-	if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE)))
-		return VM_FAULT_SIGSEGV;
-
 	/* Let's take out MAP_SHARED mappings first.
*/ if (vma->vm_flags & VM_MAYSHARE) { set_huge_ptep_writable(vma, vmf->address, vmf->pte); @@ -5849,7 +5871,8 @@ retry_avoidcopy: SetPageAnonExclusive(&old_folio->page); } if (likely(!unshare)) - set_huge_ptep_writable(vma, vmf->address, vmf->pte); + set_huge_ptep_maybe_writable(vma, vmf->address, + vmf->pte); delayacct_wpcopy_end(); return 0; @@ -5868,7 +5891,7 @@ retry_avoidcopy: */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && old_folio != pagecache_folio) - outside_reserve = 1; + cow_from_owner = true; folio_get(old_folio); @@ -5877,7 +5900,7 @@ retry_avoidcopy: * be acquired again before returning to the caller, as expected. */ spin_unlock(vmf->ptl); - new_folio = alloc_hugetlb_folio(vma, vmf->address, outside_reserve); + new_folio = alloc_hugetlb_folio(vma, vmf->address, cow_from_owner); if (IS_ERR(new_folio)) { /* @@ -5887,7 +5910,7 @@ retry_avoidcopy: * reliability, unmap the page from child processes. The child * may get SIGKILLed if it later faults. */ - if (outside_reserve) { + if (cow_from_owner) { struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx; u32 hash; @@ -6138,7 +6161,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, goto out; } - folio = alloc_hugetlb_folio(vma, vmf->address, 0); + folio = alloc_hugetlb_folio(vma, vmf->address, false); if (IS_ERR(folio)) { /* * Returning error will result in faulting task being @@ -6235,8 +6258,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, hugetlb_add_new_anon_rmap(folio, vma, vmf->address); else hugetlb_add_file_rmap(folio); - new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE) - && (vma->vm_flags & VM_SHARED))); + new_pte = make_huge_pte(vma, &folio->page, vma->vm_flags & VM_SHARED); /* * If this pte was previously wr-protected, keep it wr-protected even * if populated. @@ -6568,7 +6590,6 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, spinlock_t *ptl; int ret = -ENOMEM; struct folio *folio; - int writable; bool folio_in_pagecache = false; if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { @@ -6606,7 +6627,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, goto out; } - folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); + folio = alloc_hugetlb_folio(dst_vma, dst_addr, false); if (IS_ERR(folio)) { ret = -ENOMEM; goto out; @@ -6648,7 +6669,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, goto out; } - folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); + folio = alloc_hugetlb_folio(dst_vma, dst_addr, false); if (IS_ERR(folio)) { folio_put(*foliop); ret = -ENOMEM; @@ -6722,12 +6743,8 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY * with wp flag set, don't set pte write bit. 
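For the userfaultfd path below, the boolean folded into the make_huge_pte() call preserves the deleted writable computation exactly; spelled out (names are from the surrounding hugetlb_mfill_atomic_pte() scope)::

    /* Equivalent to the removed 'writable' logic: */
    bool try_mkwrite = !wp_enabled && !(is_continue && !vm_shared);

    /* make_huge_pte() now additionally masks with the VMA permissions,
     * so the PTE only becomes writable if VM_WRITE is set as well. */
    _dst_pte = make_huge_pte(dst_vma, &folio->page, try_mkwrite);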
 */
-	if (wp_enabled || (is_continue && !vm_shared))
-		writable = 0;
-	else
-		writable = dst_vma->vm_flags & VM_WRITE;
-
-	_dst_pte = make_huge_pte(dst_vma, &folio->page, writable);
+	_dst_pte = make_huge_pte(dst_vma, &folio->page,
+				 !wp_enabled && !(is_continue && !vm_shared));
 	/*
 	 * Always mark UFFDIO_COPY page dirty; note that this may not be
 	 * extremely important for hugetlbfs for now since swapping is not
@@ -7406,7 +7423,24 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h)
 
 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
 
-bool isolate_hugetlb(struct folio *folio, struct list_head *list)
+/**
+ * folio_isolate_hugetlb - try to isolate an allocated hugetlb folio
+ * @folio: the folio to isolate
+ * @list: the list to add the folio to on success
+ *
+ * Isolate an allocated (refcount > 0) hugetlb folio, marking it as
+ * isolated/non-migratable, and moving it from the active list to the
+ * given list.
+ *
+ * Isolation will fail if @folio is not an allocated hugetlb folio, or if
+ * it is already isolated/non-migratable.
+ *
+ * On success, an additional folio reference is taken that must be dropped
+ * using folio_putback_hugetlb() to undo the isolation.
+ *
+ * Return: True if isolation worked, otherwise False.
+ */
+bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list)
 {
 	bool ret = true;
 
@@ -7454,7 +7488,18 @@ int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
 	return ret;
 }
 
-void folio_putback_active_hugetlb(struct folio *folio)
+/**
+ * folio_putback_hugetlb - unisolate a hugetlb folio
+ * @folio: the isolated hugetlb folio
+ *
+ * Putback/un-isolate the hugetlb folio that was previously isolated using
+ * folio_isolate_hugetlb(): marking it non-isolated/migratable and putting it
+ * back onto the active list.
+ *
+ * Will drop the additional folio reference obtained through
+ * folio_isolate_hugetlb().
+ */
+void folio_putback_hugetlb(struct folio *folio)
 {
 	spin_lock_irq(&hugetlb_lock);
 	folio_set_hugetlb_migratable(folio);
@@ -7501,6 +7546,11 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
 		}
 		spin_unlock_irq(&hugetlb_lock);
 	}
+	/*
+	 * Our old folio is isolated and has "migratable" cleared until it
+	 * is put back. As migration succeeded, set the new folio "migratable".
+	 */
+	folio_set_hugetlb_migratable(new_folio);
 }
 
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index e716c4671a15..bb9578bd99f9 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -195,24 +195,23 @@ static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
  * cannot fail.
  */
 static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
-				       struct page *page)
+				       struct folio *folio)
 {
 	unsigned int nr_pages;
 	struct page_counter *counter;
-	struct hugetlb_cgroup *page_hcg;
+	struct hugetlb_cgroup *hcg;
 	struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
-	struct folio *folio = page_folio(page);
 
-	page_hcg = hugetlb_cgroup_from_folio(folio);
+	hcg = hugetlb_cgroup_from_folio(folio);
 	/*
 	 * We can have pages in active list without any cgroup
 	 * ie, hugepage with less than 3 pages. We can safely
 	 * ignore those pages.
 	 */
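As the new kernel-doc spells out, the renamed pair keeps its take-a-reference contract; a typical (hypothetical) caller pairs the two like so::

    LIST_HEAD(pagelist);

    if (folio_isolate_hugetlb(folio, &pagelist)) {
            /* folio is off the active list and holds an extra reference */
            int err = do_migrate(&pagelist);    /* placeholder operation */

            if (err)    /* undo the isolation; drops the extra reference */
                    folio_putback_hugetlb(folio);
    }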
*/ - if (!page_hcg || page_hcg != h_cg) + if (!hcg || hcg != h_cg) goto out; - nr_pages = compound_nr(page); + nr_pages = folio_nr_pages(folio); if (!parent) { parent = root_h_cgroup; /* root has no limit */ @@ -235,13 +234,13 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) { struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); struct hstate *h; - struct page *page; + struct folio *folio; do { for_each_hstate(h) { spin_lock_irq(&hugetlb_lock); - list_for_each_entry(page, &h->hugepage_activelist, lru) - hugetlb_cgroup_move_parent(hstate_index(h), h_cg, page); + list_for_each_entry(folio, &h->hugepage_activelist, lru) + hugetlb_cgroup_move_parent(hstate_index(h), h_cg, folio); spin_unlock_irq(&hugetlb_lock); } @@ -917,7 +916,6 @@ void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio) set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd); list_move(&new_folio->lru, &h->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); - return; } static struct cftype hugetlb_files[] = { diff --git a/mm/init-mm.c b/mm/init-mm.c index 24c809379274..4600e7605cab 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -40,7 +40,8 @@ struct mm_struct init_mm = { .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), #ifdef CONFIG_PER_VMA_LOCK - .mm_lock_seq = 0, + .vma_writer_wait = __RCUWAIT_INITIALIZER(init_mm.vma_writer_wait), + .mm_lock_seq = SEQCNT_ZERO(init_mm.mm_lock_seq), #endif .user_ns = &init_user_ns, .cpu_bitmap = CPU_BITS_NONE, diff --git a/mm/internal.h b/mm/internal.h index 9826f7dce607..109ef30fee11 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -392,6 +392,8 @@ void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details); +int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, + gfp_t gfp); void page_cache_ra_order(struct readahead_control *, struct file_ra_state *, unsigned int order); @@ -735,15 +737,30 @@ static inline void prep_compound_tail(struct page *head, int tail_idx) extern void prep_compound_page(struct page *page, unsigned int order); -extern void post_alloc_hook(struct page *page, unsigned int order, - gfp_t gfp_flags); +void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern bool free_pages_prepare(struct page *page, unsigned int order); extern int user_min_free_kbytes; -void free_unref_page(struct page *page, unsigned int order); +struct page *__alloc_frozen_pages_noprof(gfp_t, unsigned int order, int nid, + nodemask_t *); +#define __alloc_frozen_pages(...) \ + alloc_hooks(__alloc_frozen_pages_noprof(__VA_ARGS__)) +void free_frozen_pages(struct page *page, unsigned int order); void free_unref_folios(struct folio_batch *fbatch); +#ifdef CONFIG_NUMA +struct page *alloc_frozen_pages_noprof(gfp_t, unsigned int order); +#else +static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order) +{ + return __alloc_frozen_pages_noprof(gfp, order, numa_node_id(), NULL); +} +#endif + +#define alloc_frozen_pages(...) 
\ + alloc_hooks(alloc_frozen_pages_noprof(__VA_ARGS__)) + extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); extern void zone_pcp_enable(struct zone *zone); @@ -824,10 +841,6 @@ int isolate_migratepages_range(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn); -int __alloc_contig_migrate_range(struct compact_control *cc, - unsigned long start, unsigned long end, - int migratetype); - /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ void init_cma_reserved_pageblock(struct page *page); @@ -1440,22 +1453,6 @@ void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority); -#ifdef CONFIG_64BIT -static inline int can_do_mseal(unsigned long flags) -{ - if (flags) - return -EINVAL; - - return 0; -} - -#else -static inline int can_do_mseal(unsigned long flags) -{ - return -EPERM; -} -#endif - #ifdef CONFIG_SHRINKER_DEBUG static inline __printf(2, 0) int shrinker_debugfs_name_alloc( struct shrinker *shrinker, const char *fmt, va_list ap) @@ -1530,4 +1527,23 @@ int walk_page_range_mm(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private); +/* pt_reclaim.c */ +bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval); +void free_pte(struct mm_struct *mm, unsigned long addr, struct mmu_gather *tlb, + pmd_t pmdval); +void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, + struct mmu_gather *tlb); + +#ifdef CONFIG_PT_RECLAIM +bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, + struct zap_details *details); +#else +static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, + struct zap_details *details) +{ + return false; +} +#endif /* CONFIG_PT_RECLAIM */ + + #endif /* __MM_INTERNAL_H */ diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 8b9e348113b1..d54e89f8c3e7 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -524,7 +524,11 @@ size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object) sizeof(struct kasan_free_meta) : 0); } -static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags) +/* + * This function avoids dynamic memory allocations and thus can be called from + * contexts that do not allow allocating memory. 
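With the _noalloc variant folded in below, every caller now gets the non-allocating behaviour unconditionally; a sketch of the usual call-site shape (my_free_cb is a hypothetical callback)::

    static void queue_for_deferred_free(struct rcu_head *head)
    {
            /* Record who queued the object; it shows up as an auxiliary
             * stack in later KASAN reports. Safe even in contexts that
             * must not allocate, since only already-allocated stack
             * depot space is reused. */
            kasan_record_aux_stack(head);
            call_rcu(head, my_free_cb);
    }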
+ */ +void kasan_record_aux_stack(void *addr) { struct slab *slab = kasan_addr_to_slab(addr); struct kmem_cache *cache; @@ -541,17 +545,7 @@ static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags) return; alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0]; - alloc_meta->aux_stack[0] = kasan_save_stack(0, depot_flags); -} - -void kasan_record_aux_stack(void *addr) -{ - return __kasan_record_aux_stack(addr, STACK_DEPOT_FLAG_CAN_ALLOC); -} - -void kasan_record_aux_stack_noalloc(void *addr) -{ - return __kasan_record_aux_stack(addr, 0); + alloc_meta->aux_stack[0] = kasan_save_stack(0, 0); } void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index b7e4b81421b3..129178be5e64 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -501,18 +501,18 @@ static inline bool kasan_byte_accessible(const void *addr) /** * kasan_poison - mark the memory range as inaccessible - * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE - * @size - range size, must be aligned to KASAN_GRANULE_SIZE - * @value - value that's written to metadata for the range - * @init - whether to initialize the memory range (only for hardware tag-based) + * @addr: range start address, must be aligned to KASAN_GRANULE_SIZE + * @size: range size, must be aligned to KASAN_GRANULE_SIZE + * @value: value that's written to metadata for the range + * @init: whether to initialize the memory range (only for hardware tag-based) */ void kasan_poison(const void *addr, size_t size, u8 value, bool init); /** * kasan_unpoison - mark the memory range as accessible - * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE - * @size - range size, can be unaligned - * @init - whether to initialize the memory range (only for hardware tag-based) + * @addr: range start address, must be aligned to KASAN_GRANULE_SIZE + * @size: range size, can be unaligned + * @init: whether to initialize the memory range (only for hardware tag-based) * * For the tag-based modes, the @size gets aligned to KASAN_GRANULE_SIZE before * marking the range. @@ -530,8 +530,8 @@ bool kasan_byte_accessible(const void *addr); /** * kasan_poison_last_granule - mark the last granule of the memory range as * inaccessible - * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE - * @size - range size + * @address: range start address, must be aligned to KASAN_GRANULE_SIZE + * @size: range size * * This function is only available for the generic mode, as it's the only mode * that has partially poisoned memory granules. diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c index 99d4ff0ed57a..59d673400085 100644 --- a/mm/kasan/kasan_test_c.c +++ b/mm/kasan/kasan_test_c.c @@ -47,8 +47,8 @@ static struct { * Some tests use these global variables to store return values from function * calls that could otherwise be eliminated by the compiler as dead code. */ -void *kasan_ptr_result; -int kasan_int_result; +static volatile void *kasan_ptr_result; +static volatile int kasan_int_result; /* Probe for console output: obtains test_status lines of interest. 
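Making the result sinks static volatile keeps the probed accesses alive without exporting symbols; the tests consume them in this pattern (sketch, where array and size stand in for a real test body)::

    static volatile int kasan_int_result;

    /* The store through the volatile global keeps the out-of-bounds
     * load from being optimised away as dead code. */
    KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = array[size]);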
*/ static void probe_console(void *ignore, const char *buf, size_t len) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index bad1e130eda8..5f0be134141e 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -948,17 +948,10 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, return SCAN_SUCCEED; } -static int find_pmd_or_thp_or_none(struct mm_struct *mm, - unsigned long address, - pmd_t **pmd) +static inline int check_pmd_state(pmd_t *pmd) { - pmd_t pmde; + pmd_t pmde = pmdp_get_lockless(pmd); - *pmd = mm_find_pmd(mm, address); - if (!*pmd) - return SCAN_PMD_NULL; - - pmde = pmdp_get_lockless(*pmd); if (pmd_none(pmde)) return SCAN_PMD_NONE; if (!pmd_present(pmde)) @@ -972,6 +965,17 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm, return SCAN_SUCCEED; } +static int find_pmd_or_thp_or_none(struct mm_struct *mm, + unsigned long address, + pmd_t **pmd) +{ + *pmd = mm_find_pmd(mm, address); + if (!*pmd) + return SCAN_PMD_NULL; + + return check_pmd_state(*pmd); +} + static int check_pmd_still_valid(struct mm_struct *mm, unsigned long address, pmd_t *pmd) @@ -1721,7 +1725,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) pmd_t *pmd, pgt_pmd; spinlock_t *pml; spinlock_t *ptl; - bool skipped_uffd = false; + bool success = false; /* * Check vma->anon_vma to exclude MAP_PRIVATE mappings that @@ -1758,6 +1762,19 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) mmu_notifier_invalidate_range_start(&range); pml = pmd_lock(mm, pmd); + /* + * The lock of new_folio is still held, we will be blocked in + * the page fault path, which prevents the pte entries from + * being set again. So even though the old empty PTE page may be + * concurrently freed and a new PTE page is filled into the pmd + * entry, it is still empty and can be removed. + * + * So here we only need to recheck if the state of pmd entry + * still meets our requirements, rather than checking pmd_same() + * like elsewhere. + */ + if (check_pmd_state(pmd) != SCAN_SUCCEED) + goto drop_pml; ptl = pte_lockptr(mm, pmd); if (ptl != pml) spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); @@ -1771,20 +1788,20 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * repeating the anon_vma check protects from one category, * and repeating the userfaultfd_wp() check from another. 
 	 */
-		if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) {
-			skipped_uffd = true;
-		} else {
+		if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) {
 			pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
 			pmdp_get_lockless_sync();
+			success = true;
 		}
 
 		if (ptl != pml)
 			spin_unlock(ptl);
+drop_pml:
 		spin_unlock(pml);
 
 		mmu_notifier_invalidate_range_end(&range);
 
-		if (!skipped_uffd) {
+		if (success) {
 			mm_dec_nr_ptes(mm);
 			page_table_check_pte_clear_range(mm, addr, pgt_pmd);
 			pte_free_defer(mm, pmd_pgtable(pgt_pmd));
diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c
index 9c58f081d84f..1bb505a08415 100644
--- a/mm/kmsan/shadow.c
+++ b/mm/kmsan/shadow.c
@@ -280,12 +280,8 @@ void __init kmsan_init_alloc_meta_for_range(void *start, void *end)
 	start = (void *)PAGE_ALIGN_DOWN((u64)start);
 	size = PAGE_ALIGN((u64)end - (u64)start);
 
-	shadow = memblock_alloc(size, PAGE_SIZE);
-	origin = memblock_alloc(size, PAGE_SIZE);
-
-	if (!shadow || !origin)
-		panic("%s: Failed to allocate metadata memory for early boot range of size %llu",
-		      __func__, size);
+	shadow = memblock_alloc_or_panic(size, PAGE_SIZE);
+	origin = memblock_alloc_or_panic(size, PAGE_SIZE);
 
 	for (u64 addr = 0; addr < size; addr += PAGE_SIZE) {
 		page = virt_to_page_or_null((char *)start + addr);
diff --git a/mm/ksm.c b/mm/ksm.c
index 31a9bc365437..8be2b144fefd 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -3262,6 +3262,25 @@ static void wait_while_offlining(void)
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
 #ifdef CONFIG_PROC_FS
+/*
+ * A process is mergeable only if at least one of its VMAs is currently
+ * applicable to KSM.
+ *
+ * The mmap lock must be held in read mode.
+ */
+bool ksm_process_mergeable(struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+
+	mmap_assert_locked(mm);
+	VMA_ITERATOR(vmi, mm, 0);
+	for_each_vma(vmi, vma)
+		if (vma->vm_flags & VM_MERGEABLE)
+			return true;
+
+	return false;
+}
+
 long ksm_process_profit(struct mm_struct *mm)
 {
 	return (long)(mm->ksm_merging_pages + mm_ksm_zero_pages(mm)) * PAGE_SIZE -
diff --git a/mm/madvise.c b/mm/madvise.c
index 0ceae57da7da..49f3a75046f6 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -851,7 +851,12 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
 static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
 					unsigned long start, unsigned long end)
 {
-	zap_page_range_single(vma, start, end - start, NULL);
+	struct zap_details details = {
+		.reclaim_pt = true,
+		.even_cows = true,
+	};
+
+	zap_page_range_single(vma, start, end - start, &details);
 	return 0;
 }
 
diff --git a/mm/memblock.c b/mm/memblock.c
index 095c18b5c430..95af35fd1389 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1691,6 +1691,26 @@ void * __init memblock_alloc_try_nid(
 	return ptr;
 }
 
+/**
+ * __memblock_alloc_or_panic - Try to allocate memory and panic on failure
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @func: caller func name
+ *
+ * This function attempts to allocate memory using memblock_alloc(), and in
+ * case of failure it calls panic() with a formatted message. It should not
+ * be used directly; use the macro memblock_alloc_or_panic() instead.
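The kmsan hunk above shows the intended conversion; in general, early-boot call sites collapse from the open-coded check to a single call (sketch, with SMP_CACHE_BYTES as an example alignment)::

    /* Before: every caller open-codes the failure check. */
    ptr = memblock_alloc(size, SMP_CACHE_BYTES);
    if (!ptr)
            panic("%s: Failed to allocate %zu bytes\n", __func__, size);

    /* After: the helper panics on failure, reporting the caller's name. */
    ptr = memblock_alloc_or_panic(size, SMP_CACHE_BYTES);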
+ */ +void *__init __memblock_alloc_or_panic(phys_addr_t size, phys_addr_t align, + const char *func) +{ + void *addr = memblock_alloc(size, align); + + if (unlikely(!addr)) + panic("%s: Failed to allocate %pap bytes\n", func, &size); + return addr; +} + /** * memblock_free_late - free pages directly to buddy allocator * @base: phys starting address of the boot memory block diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index a071fa43d479..2be6b9112808 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -899,7 +899,7 @@ static void memcg_event_remove(struct work_struct *work) * * Called with wqh->lock held and interrupts disabled. */ -static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, +static int memcg_event_wake(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { struct mem_cgroup_event *event = @@ -1040,13 +1040,13 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, } else if (!strcmp(name, "memory.oom_control")) { pr_warn_once("oom_control is deprecated and will be removed. " "Please report your usecase to linux-mm-@kvack.org" - " if you depend on this functionality. \n"); + " if you depend on this functionality.\n"); event->register_event = mem_cgroup_oom_register_event; event->unregister_event = mem_cgroup_oom_unregister_event; } else if (!strcmp(name, "memory.pressure_level")) { pr_warn_once("pressure_level is deprecated and will be removed. " "Please report your usecase to linux-mm-@kvack.org " - "if you depend on this functionality. \n"); + "if you depend on this functionality.\n"); event->register_event = vmpressure_register_event; event->unregister_event = vmpressure_unregister_event; } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { @@ -1134,8 +1134,8 @@ static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) failed = iter; mem_cgroup_iter_break(memcg, iter); break; - } else - iter->oom_lock = true; + } + iter->oom_lock = true; } if (failed) { @@ -1202,7 +1202,7 @@ struct oom_wait_info { }; static int memcg_oom_wake_function(wait_queue_entry_t *wait, - unsigned mode, int sync, void *arg) + unsigned int mode, int sync, void *arg) { struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; struct mem_cgroup *oom_wait_memcg; @@ -1644,7 +1644,7 @@ static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, unsigned long nr = 0; enum lru_list lru; - VM_BUG_ON((unsigned)nid >= nr_node_ids); + VM_BUG_ON((unsigned int)nid >= nr_node_ids); for_each_lru(lru) { if (!(BIT(lru) & lru_mask)) @@ -1881,7 +1881,7 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, pr_warn_once("oom_control is deprecated and will be removed. " "Please report your usecase to linux-mm-@kvack.org if you " - "depend on this functionality. 
\n"); + "depend on this functionality.\n"); /* cannot set to root cgroup and only 0 and 1 are allowed */ if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1))) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7b3503d12aaf..46f8b372d212 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1161,6 +1161,7 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, { struct mem_cgroup *iter; int ret = 0; + int i = 0; BUG_ON(mem_cgroup_is_root(memcg)); @@ -1169,8 +1170,12 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, struct task_struct *task; css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it); - while (!ret && (task = css_task_iter_next(&it))) + while (!ret && (task = css_task_iter_next(&it))) { + /* Avoid potential softlockup warning */ + if ((++i & 1023) == 0) + cond_resched(); ret = fn(task, arg); + } css_task_iter_end(&it); if (ret) { mem_cgroup_iter_break(memcg, iter); @@ -1448,6 +1453,18 @@ unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item) memcg_page_state_output_unit(item); } +#ifdef CONFIG_HUGETLB_PAGE +static bool memcg_accounts_hugetlb(void) +{ + return cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; +} +#else /* CONFIG_HUGETLB_PAGE */ +static bool memcg_accounts_hugetlb(void) +{ + return false; +} +#endif /* CONFIG_HUGETLB_PAGE */ + static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) { int i; @@ -1469,7 +1486,7 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) #ifdef CONFIG_HUGETLB_PAGE if (unlikely(memory_stats[i].idx == NR_HUGETLB) && - !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)) + !memcg_accounts_hugetlb()) continue; #endif size = memcg_page_state_output(memcg, memory_stats[i].idx); @@ -2371,21 +2388,6 @@ done_restock: return 0; } -/** - * mem_cgroup_cancel_charge() - cancel an uncommitted try_charge() call. - * @memcg: memcg previously charged. - * @nr_pages: number of pages previously charged. - */ -void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) -{ - if (mem_cgroup_is_root(memcg)) - return; - - page_counter_uncharge(&memcg->memory, nr_pages); - if (do_memsw_account()) - page_counter_uncharge(&memcg->memsw, nr_pages); -} - static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) { VM_BUG_ON_FOLIO(folio_memcg_charged(folio), folio); @@ -2399,18 +2401,6 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) folio->memcg_data = (unsigned long)memcg; } -/** - * mem_cgroup_commit_charge - commit a previously successful try_charge(). - * @folio: folio to commit the charge to. - * @memcg: memcg previously charged. - */ -void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg) -{ - css_get(&memcg->css); - commit_charge(folio, memcg); - memcg1_commit_charge(folio, memcg); -} - static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr) @@ -4498,7 +4488,9 @@ static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg, if (ret) goto out; - mem_cgroup_commit_charge(folio, memcg); + css_get(&memcg->css); + commit_charge(folio, memcg); + memcg1_commit_charge(folio, memcg); out: return ret; } @@ -4516,38 +4508,37 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) } /** - * mem_cgroup_hugetlb_try_charge - try to charge the memcg for a hugetlb folio - * @memcg: memcg to charge. - * @gfp: reclaim mode. - * @nr_pages: number of pages to charge. 
+ * mem_cgroup_charge_hugetlb - charge the memcg for a hugetlb folio
+ * @folio: folio being charged
+ * @gfp: reclaim mode
  *
- * This function is called when allocating a huge page folio to determine if
- * the memcg has the capacity for it. It does not commit the charge yet,
- * as the hugetlb folio itself has not been obtained from the hugetlb pool.
+ * This function is called when allocating a huge page folio, after the page
+ * has already been obtained and charged to the appropriate hugetlb cgroup
+ * controller (if it is enabled).
  *
- * Once we have obtained the hugetlb folio, we can call
- * mem_cgroup_commit_charge() to commit the charge. If we fail to obtain the
- * folio, we should instead call mem_cgroup_cancel_charge() to undo the effect
- * of try_charge().
- *
- * Returns 0 on success. Otherwise, an error code is returned.
+ * Returns -ENOMEM if the memcg is already full.
+ * Returns 0 if the charge was successful, or if charging was skipped.
  */
-int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
-			long nr_pages)
+int mem_cgroup_charge_hugetlb(struct folio *folio, gfp_t gfp)
 {
+	struct mem_cgroup *memcg = get_mem_cgroup_from_current();
+	int ret = 0;
+
 	/*
-	 * If hugetlb memcg charging is not enabled, do not fail hugetlb allocation,
-	 * but do not attempt to commit charge later (or cancel on error) either.
+	 * Even if memcg does not account for hugetlb, we still want to update
+	 * system-level stats via lruvec_stat_mod_folio. Return 0, and skip
+	 * charging the memcg.
 	 */
-	if (mem_cgroup_disabled() || !memcg ||
-		!cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
-		!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING))
-		return -EOPNOTSUPP;
+	if (mem_cgroup_disabled() || !memcg_accounts_hugetlb() ||
+	    !memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+		goto out;
 
-	if (try_charge(memcg, gfp, nr_pages))
-		return -ENOMEM;
+	if (charge_memcg(folio, memcg, gfp))
+		ret = -ENOMEM;
 
-	return 0;
+out:
+	mem_cgroup_put(memcg);
+	return ret;
 }
 
 /**
@@ -4609,7 +4600,7 @@ void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
 	 * correspond 1:1 to page and swap slot lifetimes: we charge the
 	 * page to memory here, and uncharge swap when the slot is freed.
 	 */
-	if (!mem_cgroup_disabled() && do_memsw_account()) {
+	if (do_memsw_account()) {
 		/*
 		 * The swap entry might not get freed for a long time,
 		 * let's not wait for it. The page already received a
@@ -4973,7 +4964,6 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
 {
 	struct mem_cgroup *memcg, *swap_memcg;
 	unsigned int nr_entries;
-	unsigned short oldid;
 
 	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
 	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
@@ -5000,11 +4990,10 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
 	/* Get references for the tail pages, too */
 	if (nr_entries > 1)
 		mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
-	oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
-				   nr_entries);
-	VM_BUG_ON_FOLIO(oldid, folio);
 	mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
 
+	swap_cgroup_record(folio, entry);
+
 	folio_unqueue_deferred_split(folio);
 	folio->memcg_data = 0;
 
@@ -5035,7 +5024,6 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
 	unsigned int nr_pages = folio_nr_pages(folio);
 	struct page_counter *counter;
 	struct mem_cgroup *memcg;
-	unsigned short oldid;
 
 	if (do_memsw_account())
 		return 0;
@@ -5064,10 +5052,10 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
 	/* Get references for the tail pages, too */
 	if (nr_pages > 1)
 		mem_cgroup_id_get_many(memcg, nr_pages - 1);
-	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
-	VM_BUG_ON_FOLIO(oldid, folio);
 	mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
 
+	swap_cgroup_record(folio, entry);
+
 	return 0;
 }
 
@@ -5081,7 +5069,7 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
 	struct mem_cgroup *memcg;
 	unsigned short id;
 
-	id = swap_cgroup_record(entry, 0, nr_pages);
+	id = swap_cgroup_clear(entry, nr_pages);
 	rcu_read_lock();
 	memcg = mem_cgroup_from_id(id);
 	if (memcg) {
diff --git a/mm/memfd.c b/mm/memfd.c
index 35a370d75c9a..37f7be57c2f5 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -170,7 +170,7 @@ static int memfd_wait_for_pins(struct address_space *mapping)
 	return error;
 }
 
-unsigned int *memfd_file_seals_ptr(struct file *file)
+static unsigned int *memfd_file_seals_ptr(struct file *file)
 {
 	if (shmem_file(file))
 		return &SHMEM_I(file_inode(file))->seals;
@@ -327,15 +327,51 @@ static int check_sysctl_memfd_noexec(unsigned int *flags)
 	return 0;
 }
 
-SYSCALL_DEFINE2(memfd_create,
-		const char __user *, uname,
-		unsigned int, flags)
+static inline bool is_write_sealed(unsigned int seals)
 {
-	unsigned int *file_seals;
-	struct file *file;
-	int fd, error;
-	char *name;
-	long len;
+	return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE);
+}
+
+static int check_write_seal(unsigned long *vm_flags_ptr)
+{
+	unsigned long vm_flags = *vm_flags_ptr;
+	unsigned long mask = vm_flags & (VM_SHARED | VM_WRITE);
+
+	/* If a private mapping then writability is irrelevant. */
+	if (!(mask & VM_SHARED))
+		return 0;
+
+	/*
+	 * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
+	 * write seals are active.
+	 */
+	if (mask & VM_WRITE)
+		return -EPERM;
+
+	/*
+	 * This is a read-only mapping, disallow mprotect() from making a
+	 * write-sealed mapping writable in future.
+	 */
+	*vm_flags_ptr &= ~VM_MAYWRITE;
+
+	return 0;
+}
+
+int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr)
+{
+	int err = 0;
+	unsigned int *seals_ptr = memfd_file_seals_ptr(file);
+	unsigned int seals = seals_ptr ?
*seals_ptr : 0; + + if (is_write_sealed(seals)) + err = check_write_seal(vm_flags_ptr); + + return err; +} + +static int sanitize_flags(unsigned int *flags_ptr) +{ + unsigned int flags = *flags_ptr; if (!(flags & MFD_HUGETLB)) { if (flags & ~(unsigned int)MFD_ALL_FLAGS) @@ -351,50 +387,52 @@ SYSCALL_DEFINE2(memfd_create, if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL)) return -EINVAL; - error = check_sysctl_memfd_noexec(&flags); - if (error < 0) - return error; + return check_sysctl_memfd_noexec(flags_ptr); +} - /* length includes terminating zero */ - len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); - if (len <= 0) - return -EFAULT; - if (len > MFD_NAME_MAX_LEN + 1) - return -EINVAL; +static char *alloc_name(const char __user *uname) +{ + int error; + char *name; + long len; - name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL); + name = kmalloc(NAME_MAX + 1, GFP_KERNEL); if (!name) - return -ENOMEM; + return ERR_PTR(-ENOMEM); strcpy(name, MFD_NAME_PREFIX); - if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { + /* returned length does not include terminating zero */ + len = strncpy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, MFD_NAME_MAX_LEN + 1); + if (len < 0) { error = -EFAULT; goto err_name; + } else if (len > MFD_NAME_MAX_LEN) { + error = -EINVAL; + goto err_name; } - /* terminating-zero may have changed after strnlen_user() returned */ - if (name[len + MFD_NAME_PREFIX_LEN - 1]) { - error = -EFAULT; - goto err_name; - } + return name; - fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); - if (fd < 0) { - error = fd; - goto err_name; - } +err_name: + kfree(name); + return ERR_PTR(error); +} + +static struct file *alloc_file(const char *name, unsigned int flags) +{ + unsigned int *file_seals; + struct file *file; if (flags & MFD_HUGETLB) { file = hugetlb_file_setup(name, 0, VM_NORESERVE, HUGETLB_ANONHUGE_INODE, (flags >> MFD_HUGE_SHIFT) & MFD_HUGE_MASK); - } else + } else { file = shmem_file_setup(name, 0, VM_NORESERVE); - if (IS_ERR(file)) { - error = PTR_ERR(file); - goto err_fd; } + if (IS_ERR(file)) + return file; file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; file->f_flags |= O_LARGEFILE; @@ -414,6 +452,37 @@ SYSCALL_DEFINE2(memfd_create, *file_seals &= ~F_SEAL_SEAL; } + return file; +} + +SYSCALL_DEFINE2(memfd_create, + const char __user *, uname, + unsigned int, flags) +{ + struct file *file; + int fd, error; + char *name; + + error = sanitize_flags(&flags); + if (error < 0) + return error; + + name = alloc_name(uname); + if (IS_ERR(name)) + return PTR_ERR(name); + + fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); + if (fd < 0) { + error = fd; + goto err_name; + } + + file = alloc_file(name, flags); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_fd; + } + fd_install(fd, file); kfree(name); return fd; diff --git a/mm/memory.c b/mm/memory.c index 398c031be9ba..67cfcebb0f94 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1436,7 +1436,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) static inline bool should_zap_cows(struct zap_details *details) { /* By default, zap all pages */ - if (!details) + if (!details || details->reclaim_pt) return true; /* Or, we zap COWed pages only if the caller wants to */ @@ -1466,34 +1466,42 @@ static inline bool zap_drop_markers(struct zap_details *details) /* * This function makes sure that we'll replace the none pte with an uffd-wp * swap special pte marker when necessary. Must be with the pgtable lock held. 
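Returning to the memfd changes above: the effect of routing mmap() through memfd_check_seals_mmap() is easiest to see from userspace (illustrative; error handling omitted)::

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int fd = memfd_create("demo", MFD_CLOEXEC | MFD_ALLOW_SEALING);
    ftruncate(fd, 4096);
    fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE);

    /* Rejected with EPERM: shared writable mapping vs. a write seal. */
    void *w = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

    /* Allowed, but check_write_seal() clears VM_MAYWRITE, so a later
     * mprotect(PROT_WRITE) fails as well. */
    void *r = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);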
+ * + * Returns true if any uffd-wp PTEs were installed, false otherwise. */ -static inline void +static inline bool zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, int nr, struct zap_details *details, pte_t pteval) { + bool was_installed = false; + +#ifdef CONFIG_PTE_MARKER_UFFD_WP /* Zap on anonymous always means dropping everything */ if (vma_is_anonymous(vma)) - return; + return false; if (zap_drop_markers(details)) - return; + return false; for (;;) { /* the PFN in the PTE is irrelevant. */ - pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); + if (pte_install_uffd_wp_if_needed(vma, addr, pte, pteval)) + was_installed = true; if (--nr == 0) break; pte++; addr += PAGE_SIZE; } +#endif + return was_installed; } static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, struct folio *folio, struct page *page, pte_t *pte, pte_t ptent, unsigned int nr, unsigned long addr, struct zap_details *details, int *rss, - bool *force_flush, bool *force_break) + bool *force_flush, bool *force_break, bool *any_skipped) { struct mm_struct *mm = tlb->mm; bool delay_rmap = false; @@ -1519,8 +1527,8 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entries(tlb, pte, nr, addr); if (unlikely(userfaultfd_pte_wp(vma, ptent))) - zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, - ptent); + *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, + nr, details, ptent); if (!delay_rmap) { folio_remove_rmap_ptes(folio, page, nr, vma); @@ -1544,7 +1552,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, pte_t *pte, pte_t ptent, unsigned int max_nr, unsigned long addr, struct zap_details *details, int *rss, bool *force_flush, - bool *force_break) + bool *force_break, bool *any_skipped) { const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; struct mm_struct *mm = tlb->mm; @@ -1559,15 +1567,17 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entry(tlb, pte, addr); if (userfaultfd_pte_wp(vma, ptent)) - zap_install_uffd_wp_if_needed(vma, addr, pte, 1, - details, ptent); + *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, + pte, 1, details, ptent); ksm_might_unmap_zero_page(mm, ptent); return 1; } folio = page_folio(page); - if (unlikely(!should_zap_folio(details, folio))) + if (unlikely(!should_zap_folio(details, folio))) { + *any_skipped = true; return 1; + } /* * Make sure that the common "small folio" case is as fast as possible @@ -1579,14 +1589,121 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, addr, details, rss, force_flush, - force_break); + force_break, any_skipped); return nr; } zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr, - details, rss, force_flush, force_break); + details, rss, force_flush, force_break, any_skipped); return 1; } +static inline int zap_nonpresent_ptes(struct mmu_gather *tlb, + struct vm_area_struct *vma, pte_t *pte, pte_t ptent, + unsigned int max_nr, unsigned long addr, + struct zap_details *details, int *rss, bool *any_skipped) +{ + swp_entry_t entry; + int nr = 1; + + *any_skipped = true; + entry = pte_to_swp_entry(ptent); + if (is_device_private_entry(entry) || + is_device_exclusive_entry(entry)) { + struct page *page = pfn_swap_entry_to_page(entry); + struct folio *folio = page_folio(page); + + if
(unlikely(!should_zap_folio(details, folio))) + return 1; + /* + * Both device private/exclusive mappings should only + * work with anonymous pages so far, so we don't need to + * consider the uffd-wp bit when zapping. For more information, + * see zap_install_uffd_wp_if_needed(). + */ + WARN_ON_ONCE(!vma_is_anonymous(vma)); + rss[mm_counter(folio)]--; + if (is_device_private_entry(entry)) + folio_remove_rmap_pte(folio, page, vma); + folio_put(folio); + } else if (!non_swap_entry(entry)) { + /* Genuine swap entries, hence private anon pages */ + if (!should_zap_cows(details)) + return 1; + + nr = swap_pte_batch(pte, max_nr, ptent); + rss[MM_SWAPENTS] -= nr; + free_swap_and_cache_nr(entry, nr); + } else if (is_migration_entry(entry)) { + struct folio *folio = pfn_swap_entry_folio(entry); + + if (!should_zap_folio(details, folio)) + return 1; + rss[mm_counter(folio)]--; + } else if (pte_marker_entry_uffd_wp(entry)) { + /* + * For anon: always drop the marker; for file: only + * drop the marker if explicitly requested. + */ + if (!vma_is_anonymous(vma) && !zap_drop_markers(details)) + return 1; + } else if (is_guard_swp_entry(entry)) { + /* + * Ordinary zapping should not remove guard PTE + * markers. Only do so if we should remove PTE markers + * in general. + */ + if (!zap_drop_markers(details)) + return 1; + } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) { + if (!should_zap_cows(details)) + return 1; + } else { + /* We should have covered all the swap entry types */ + pr_alert("unrecognized swap entry 0x%lx\n", entry.val); + WARN_ON_ONCE(1); + } + clear_not_present_full_ptes(vma->vm_mm, addr, pte, nr, tlb->fullmm); + *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent); + + return nr; +} + +static inline int do_zap_pte_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pte_t *pte, + unsigned long addr, unsigned long end, + struct zap_details *details, int *rss, + bool *force_flush, bool *force_break, + bool *any_skipped) +{ + pte_t ptent = ptep_get(pte); + int max_nr = (end - addr) / PAGE_SIZE; + int nr = 0; + + /* Skip all consecutive none ptes */ + if (pte_none(ptent)) { + for (nr = 1; nr < max_nr; nr++) { + ptent = ptep_get(pte + nr); + if (!pte_none(ptent)) + break; + } + max_nr -= nr; + if (!max_nr) + return nr; + pte += nr; + addr += nr * PAGE_SIZE; + } + + if (pte_present(ptent)) + nr += zap_present_ptes(tlb, vma, pte, ptent, max_nr, addr, + details, rss, force_flush, force_break, + any_skipped); + else + nr += zap_nonpresent_ptes(tlb, vma, pte, ptent, max_nr, addr, + details, rss, any_skipped); + + return nr; +} + static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, @@ -1598,9 +1715,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, spinlock_t *ptl; pte_t *start_pte; pte_t *pte; - swp_entry_t entry; + pmd_t pmdval; + unsigned long start = addr; + bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details); + bool direct_reclaim = false; int nr; +retry: tlb_change_page_size(tlb, PAGE_SIZE); init_rss_vec(rss); start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); @@ -1610,90 +1731,24 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); do { - pte_t ptent = ptep_get(pte); - struct folio *folio; - struct page *page; - int max_nr; - - nr = 1; - if (pte_none(ptent)) - continue; + bool any_skipped = false; if (need_resched()) break; - if (pte_present(ptent)) { -
max_nr = (end - addr) / PAGE_SIZE; - nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr, - addr, details, rss, &force_flush, - &force_break); - if (unlikely(force_break)) { - addr += nr * PAGE_SIZE; - break; - } - continue; + nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, rss, + &force_flush, &force_break, &any_skipped); + if (any_skipped) + can_reclaim_pt = false; + if (unlikely(force_break)) { + addr += nr * PAGE_SIZE; + break; } - - entry = pte_to_swp_entry(ptent); - if (is_device_private_entry(entry) || - is_device_exclusive_entry(entry)) { - page = pfn_swap_entry_to_page(entry); - folio = page_folio(page); - if (unlikely(!should_zap_folio(details, folio))) - continue; - /* - * Both device private/exclusive mappings should only - * work with anonymous page so far, so we don't need to - * consider uffd-wp bit when zap. For more information, - * see zap_install_uffd_wp_if_needed(). - */ - WARN_ON_ONCE(!vma_is_anonymous(vma)); - rss[mm_counter(folio)]--; - if (is_device_private_entry(entry)) - folio_remove_rmap_pte(folio, page, vma); - folio_put(folio); - } else if (!non_swap_entry(entry)) { - max_nr = (end - addr) / PAGE_SIZE; - nr = swap_pte_batch(pte, max_nr, ptent); - /* Genuine swap entries, hence a private anon pages */ - if (!should_zap_cows(details)) - continue; - rss[MM_SWAPENTS] -= nr; - free_swap_and_cache_nr(entry, nr); - } else if (is_migration_entry(entry)) { - folio = pfn_swap_entry_folio(entry); - if (!should_zap_folio(details, folio)) - continue; - rss[mm_counter(folio)]--; - } else if (pte_marker_entry_uffd_wp(entry)) { - /* - * For anon: always drop the marker; for file: only - * drop the marker if explicitly requested. - */ - if (!vma_is_anonymous(vma) && - !zap_drop_markers(details)) - continue; - } else if (is_guard_swp_entry(entry)) { - /* - * Ordinary zapping should not remove guard PTE - * markers. Only do so if we should remove PTE markers - * in general. 
- */ - if (!zap_drop_markers(details)) - continue; - } else if (is_hwpoison_entry(entry) || - is_poisoned_swp_entry(entry)) { - if (!should_zap_cows(details)) - continue; - } else { - /* We should have covered all the swap entry types */ - pr_alert("unrecognized swap entry 0x%lx\n", entry.val); - WARN_ON_ONCE(1); - } - clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); - zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent); } while (pte += nr, addr += PAGE_SIZE * nr, addr != end); + if (can_reclaim_pt && addr == end) + direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval); + add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); @@ -1713,6 +1768,20 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (force_flush) tlb_flush_mmu(tlb); + if (addr != end) { + cond_resched(); + force_flush = false; + force_break = false; + goto retry; + } + + if (can_reclaim_pt) { + if (direct_reclaim) + free_pte(mm, start, tlb, pmdval); + else + try_to_free_pte(mm, pmd, start, tlb); + } + return addr; } @@ -1935,7 +2004,6 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, struct mmu_notifier_range range; struct mmu_gather tlb; - lru_add_drain(); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, end); hugetlb_zap_begin(vma, &range.start, &range.end); @@ -3013,7 +3081,6 @@ int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr, { return __apply_to_page_range(mm, addr, size, fn, data, false); } -EXPORT_SYMBOL_GPL(apply_to_existing_page_range); /* * handle_pte_fault chooses page fault handler according to an entry which was @@ -4189,8 +4256,10 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry)) return folio; + count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE); folio_put(folio); } + count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK); order = next_order(&orders, order); } @@ -5625,7 +5694,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) ignore_writable = true; /* Migrate to the requested node */ - if (!migrate_misplaced_folio(folio, vma, target_nid)) { + if (!migrate_misplaced_folio(folio, target_nid)) { nid = target_nid; flags |= TNF_MIGRATED; task_numa_fault(last_cpupid, nid, nr_pages, flags); @@ -6069,7 +6138,8 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma, } /* - * By the time we get here, we already hold the mm semaphore + * By the time we get here, we already hold either the VMA lock or the + * mmap_lock (FAULT_FLAG_VMA_LOCK tells you which). * * The mmap_lock may have been released depending on flags and our * return value. See filemap_fault() and __folio_lock_or_retry(). @@ -6185,7 +6255,7 @@ static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_r /* * Helper for page fault handling. * - * This is kind of equivalend to "mmap_read_lock()" followed + * This is kind of equivalent to "mmap_read_lock()" followed * by "find_extend_vma()", except it's a lot more careful about * the locking (and will drop the lock on failure). * @@ -6258,6 +6328,88 @@ fail: #endif #ifdef CONFIG_PER_VMA_LOCK +static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching) +{ + unsigned int tgt_refcnt = VMA_LOCK_OFFSET; + + /* Additional refcnt if the vma is attached. */ + if (!detaching) + tgt_refcnt++; + + /* + * If vma is detached then only vma_mark_attached() can raise the + * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached(). 
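
For illustration, a reader-side trylock under this scheme could look roughly like the following (a simplified sketch; the real vma_start_read() fast path is more careful and also rechecks vm_lock_seq):

	static bool sketch_vma_read_trylock(struct vm_area_struct *vma)
	{
		/* A refcnt of 0 means the vma is detached: nothing to lock. */
		if (!refcount_add_not_zero(1, &vma->vm_refcnt))
			return false;

		/* At or above VMA_LOCK_OFFSET means a writer holds the lock. */
		if (refcount_read(&vma->vm_refcnt) >= VMA_LOCK_OFFSET) {
			refcount_dec(&vma->vm_refcnt);	/* back off */
			return false;
		}
		return true;
	}
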
+ */ + if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt)) + return false; + + rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_); + rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, + refcount_read(&vma->vm_refcnt) == tgt_refcnt, + TASK_UNINTERRUPTIBLE); + lock_acquired(&vma->vmlock_dep_map, _RET_IP_); + + return true; +} + +static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) +{ + *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt); + rwsem_release(&vma->vmlock_dep_map, _RET_IP_); +} + +void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq) +{ + bool locked; + + /* + * __vma_enter_locked() returns false immediately if the vma is not + * attached, otherwise it waits until the refcnt indicates that the vma + * is attached with no readers. + */ + locked = __vma_enter_locked(vma, false); + + /* + * We should use WRITE_ONCE() here because we can have concurrent reads + * from the early lockless pessimistic check in vma_start_read(). + * We don't really care about the correctness of that early check, but + * WRITE_ONCE() is cleaner and keeps KCSAN happy. + */ + WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); + + if (locked) { + bool detached; + + __vma_exit_locked(vma, &detached); + VM_BUG_ON_VMA(detached, vma); /* vma should remain attached */ + } +} +EXPORT_SYMBOL_GPL(__vma_start_write); + +void vma_mark_detached(struct vm_area_struct *vma) +{ + vma_assert_write_locked(vma); + vma_assert_attached(vma); + + /* + * We are the only writer, so no need to use vma_refcount_put(). + * The condition below is unlikely because the vma has already been + * write-locked and readers can increment vm_refcnt only temporarily + * before they check vm_lock_seq, realize the vma is locked and drop + * back the vm_refcnt. That is a narrow window for observing a raised + * vm_refcnt. + */ + if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { + /* Wait until vma is detached with no readers. */ + if (__vma_enter_locked(vma, true)) { + bool detached; + + __vma_exit_locked(vma, &detached); + VM_BUG_ON_VMA(!detached, vma); + } + } +} + /* * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be * stable and not isolated. If the VMA is not found or is being modified the @@ -6270,21 +6422,13 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, struct vm_area_struct *vma; rcu_read_lock(); -retry: vma = mas_walk(&mas); if (!vma) goto inval; - if (!vma_start_read(vma)) + if (!vma_start_read(mm, vma)) goto inval; - /* Check if the VMA got isolated after we found it */ - if (vma->detached) { - vma_end_read(vma); - count_vm_vma_lock_event(VMA_LOCK_MISS); - /* The area was replaced with another one */ - goto retry; - } /* * At this point, we have a stable reference to a VMA: The VMA is * locked and we know it hasn't already been isolated. @@ -6292,8 +6436,9 @@ retry: * fields are accessible for RCU readers. */ - /* Check since vm_start/vm_end might change before we lock the VMA */ - if (unlikely(address < vma->vm_start || address >= vma->vm_end)) + /* Check if the vma we locked is the right one.
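
A typical consumer is the architecture page-fault path; sketched with a hypothetical fall-back label, it pairs the lookup with vma_end_read():

	vma = lock_vma_under_rcu(mm, address);
	if (!vma)
		goto lock_mmap;	/* fall back to the mmap_lock path */

	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		vma_end_read(vma);
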
*/ + if (unlikely(vma->vm_mm != mm || + address < vma->vm_start || address >= vma->vm_end)) goto inval_end_read; rcu_read_unlock(); @@ -6961,7 +7106,8 @@ bool ptlock_alloc(struct ptdesc *ptdesc) void ptlock_free(struct ptdesc *ptdesc) { - kmem_cache_free(page_ptl_cachep, ptdesc->ptl); + if (ptdesc->ptl) + kmem_cache_free(page_ptl_cachep, ptdesc->ptl); } #endif diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c43b4e7fb298..e3655f07dd6e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -219,11 +219,30 @@ void put_online_mems(void) bool movable_node_enabled = false; -#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE -int mhp_default_online_type = MMOP_OFFLINE; -#else -int mhp_default_online_type = MMOP_ONLINE; -#endif +static int mhp_default_online_type = -1; +int mhp_get_default_online_type(void) +{ + if (mhp_default_online_type >= 0) + return mhp_default_online_type; + + if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_OFFLINE)) + mhp_default_online_type = MMOP_OFFLINE; + else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO)) + mhp_default_online_type = MMOP_ONLINE; + else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_KERNEL)) + mhp_default_online_type = MMOP_ONLINE_KERNEL; + else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE)) + mhp_default_online_type = MMOP_ONLINE_MOVABLE; + else + mhp_default_online_type = MMOP_OFFLINE; + + return mhp_default_online_type; +} + +void mhp_set_default_online_type(int online_type) +{ + mhp_default_online_type = online_type; +} static int __init setup_memhp_default_state(char *str) { @@ -650,6 +669,7 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) * this and the first chunk to online will be pageblock_nr_pages. */ for (pfn = start_pfn; pfn < end_pfn;) { + struct page *page = pfn_to_page(pfn); int order; /* @@ -664,7 +684,14 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) else order = MAX_PAGE_ORDER; - (*online_page_callback)(pfn_to_page(pfn), order); + /* + * Exposing the page to the buddy by freeing can cause + * issues with debug_pagealloc enabled: some archs don't + * like double-unmappings. So treat them like any pages that + * were allocated from the buddy. 
+ */ + debug_pagealloc_map_pages(page, 1 << order); + (*online_page_callback)(page, order); pfn += (1UL << order); } @@ -1320,7 +1347,7 @@ static int check_hotplug_memory_range(u64 start, u64 size) static int online_memory_block(struct memory_block *mem, void *arg) { - mem->online_type = mhp_default_online_type; + mem->online_type = mhp_get_default_online_type(); return device_online(&mem->dev); } @@ -1567,7 +1594,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) merge_system_ram_resource(res); /* online pages if requested */ - if (mhp_default_online_type != MMOP_OFFLINE) + if (mhp_get_default_online_type() != MMOP_OFFLINE) walk_memory_blocks(start, size, NULL, online_memory_block); return ret; @@ -1830,7 +1857,7 @@ put_folio: nodemask_t nmask = node_states[N_MEMORY]; struct migration_target_control mtc = { .nmask = &nmask, - .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + .gfp_mask = GFP_KERNEL | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, .reason = MR_MEMORY_HOTPLUG, }; int ret; @@ -1992,8 +2019,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages, /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, - MEMORY_OFFLINE | REPORT_FAILURE, - GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL); + MEMORY_OFFLINE | REPORT_FAILURE); if (ret) { reason = "failure to isolate range"; goto failed_removal_pcplists_disabled; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 162407fbf2bc..bbaadbeeb291 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -647,7 +647,7 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, */ if ((flags & MPOL_MF_MOVE_ALL) || (!folio_likely_mapped_shared(folio) && !hugetlb_pmd_shared(pte))) - if (!isolate_hugetlb(folio, qp->pagelist)) + if (!folio_isolate_hugetlb(folio, qp->pagelist)) qp->nr_failed++; unlock: spin_unlock(ptl); @@ -2205,9 +2205,9 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, */ preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - page = __alloc_pages_noprof(preferred_gfp, order, nid, nodemask); + page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask); if (!page) - page = __alloc_pages_noprof(gfp, order, nid, NULL); + page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL); return page; } @@ -2222,7 +2222,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, * * Return: The page on success or NULL if allocation fails. */ -struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, +static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, struct mempolicy *pol, pgoff_t ilx, int nid) { nodemask_t *nodemask; @@ -2253,8 +2253,9 @@ struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, * First, try to allocate THP only on local node, but * don't reclaim unnecessarily, just compact. 
*/ - page = __alloc_pages_node_noprof(nid, - gfp | __GFP_THISNODE | __GFP_NORETRY, order); + page = __alloc_frozen_pages_noprof( + gfp | __GFP_THISNODE | __GFP_NORETRY, order, + nid, NULL); if (page || !(gfp & __GFP_DIRECT_RECLAIM)) return page; /* @@ -2266,7 +2267,7 @@ struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, } } - page = __alloc_pages_noprof(gfp, order, nid, nodemask); + page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask); if (unlikely(pol->mode == MPOL_INTERLEAVE || pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) { @@ -2285,8 +2286,13 @@ struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *pol, pgoff_t ilx, int nid) { - return page_rmappable_folio(alloc_pages_mpol_noprof(gfp | __GFP_COMP, - order, pol, ilx, nid)); + struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol, + ilx, nid); + if (!page) + return NULL; + + set_page_refcounted(page); + return page_rmappable_folio(page); } /** @@ -2300,7 +2306,7 @@ struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the * VMA to prevent it from going away. Should be used for all allocations * for folios that will be mapped into user space, excepting hugetlbfs, and - * excepting where direct use of alloc_pages_mpol() is more appropriate. + * excepting where direct use of folio_alloc_mpol() is more appropriate. * * Return: The folio on success or NULL if allocation fails. */ @@ -2321,6 +2327,21 @@ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct } EXPORT_SYMBOL(vma_alloc_folio_noprof); +struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order) +{ + struct mempolicy *pol = &default_policy; + + /* + * No reference counting needed for current->mempolicy + * nor system default_policy + */ + if (!in_interrupt() && !(gfp & __GFP_THISNODE)) + pol = get_task_policy(current); + + return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX, + numa_node_id()); +} + /** * alloc_pages - Allocate pages. * @gfp: GFP flags. 
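
To illustrate the split introduced here (a sketch assuming the alloc_frozen_pages() wrapper around alloc_frozen_pages_noprof()): callers that manage their own reference counting allocate "frozen" pages, and a conventional refcounted page is just a frozen page plus an explicit refcount initialisation:

	static struct page *sketch_alloc_refcounted_page(gfp_t gfp)
	{
		struct page *page = alloc_frozen_pages(gfp, 0);	/* refcount == 0 */

		if (page)
			set_page_refcounted(page);	/* refcount == 1, as with alloc_pages() */
		return page;
	}
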
@@ -2337,17 +2358,11 @@ EXPORT_SYMBOL(vma_alloc_folio_noprof); */ struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order) { - struct mempolicy *pol = &default_policy; + struct page *page = alloc_frozen_pages_noprof(gfp, order); - /* - * No reference counting needed for current->mempolicy - * nor system default_policy - */ - if (!in_interrupt() && !(gfp & __GFP_THISNODE)) - pol = get_task_policy(current); - - return alloc_pages_mpol_noprof(gfp, order, pol, NO_INTERLEAVE_INDEX, - numa_node_id()); + if (page) + set_page_refcounted(page); + return page; } EXPORT_SYMBOL(alloc_pages_noprof); @@ -2357,7 +2372,7 @@ struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) } EXPORT_SYMBOL(folio_alloc_noprof); -static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, +static unsigned long alloc_pages_bulk_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { @@ -2376,13 +2391,13 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, if (delta) { nr_allocated = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol), NULL, - nr_pages_per_node + 1, NULL, + nr_pages_per_node + 1, page_array); delta--; } else { nr_allocated = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol), NULL, - nr_pages_per_node, NULL, page_array); + nr_pages_per_node, page_array); } page_array += nr_allocated; @@ -2392,7 +2407,7 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, return total_allocated; } -static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, +static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { @@ -2431,7 +2446,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, if (weight && node_isset(node, nodes)) { node_pages = min(rem_pages, weight); nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, - NULL, page_array); + page_array); page_array += nr_allocated; total_allocated += nr_allocated; /* if that's all the pages, no need to interleave */ @@ -2494,7 +2509,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, if (!node_pages) break; nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, - NULL, page_array); + page_array); page_array += nr_allocated; total_allocated += nr_allocated; if (total_allocated == nr_pages) @@ -2507,7 +2522,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, return total_allocated; } -static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, +static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { @@ -2518,11 +2533,11 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes, - nr_pages, NULL, page_array); + nr_pages, page_array); if (nr_allocated < nr_pages) nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL, - nr_pages - nr_allocated, NULL, + nr_pages - nr_allocated, page_array + nr_allocated); return nr_allocated; } @@ -2533,7 +2548,7 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, * It can accelerate memory allocation especially interleaving * allocate memory. 
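
For example (a sketch using the renamed wrapper), a caller wanting up to 16 pages placed according to the current task's mempolicy might do:

	struct page *pages[16];
	unsigned long allocated;

	/* Honours MPOL_INTERLEAVE, MPOL_PREFERRED_MANY etc., if set. */
	allocated = alloc_pages_bulk_mempolicy(GFP_KERNEL, ARRAY_SIZE(pages),
					       pages);
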
*/ -unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, +unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp, unsigned long nr_pages, struct page **page_array) { struct mempolicy *pol = &default_policy; @@ -2544,21 +2559,21 @@ unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, pol = get_task_policy(current); if (pol->mode == MPOL_INTERLEAVE) - return alloc_pages_bulk_array_interleave(gfp, pol, + return alloc_pages_bulk_interleave(gfp, pol, nr_pages, page_array); if (pol->mode == MPOL_WEIGHTED_INTERLEAVE) - return alloc_pages_bulk_array_weighted_interleave( + return alloc_pages_bulk_weighted_interleave( gfp, pol, nr_pages, page_array); if (pol->mode == MPOL_PREFERRED_MANY) - return alloc_pages_bulk_array_preferred_many(gfp, + return alloc_pages_bulk_preferred_many(gfp, numa_node_id(), pol, nr_pages, page_array); nid = numa_node_id(); nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); return alloc_pages_bulk_noprof(gfp, nid, nodemask, - nr_pages, NULL, page_array); + nr_pages, page_array); } int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) diff --git a/mm/migrate.c b/mm/migrate.c index cc68583c86f9..be9e3b48cd62 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -68,10 +68,6 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode) if (!folio) goto out; - if (unlikely(folio_test_slab(folio))) - goto out_putfolio; - /* Pairs with smp_wmb() in slab freeing, e.g. SLUB's __free_slab() */ - smp_rmb(); /* * Check movable flag before taking the page lock because * we use non-atomic bitops on newly allocated page flags so @@ -79,10 +75,6 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode) */ if (unlikely(!__folio_test_movable(folio))) goto out_putfolio; - /* Pairs with smp_wmb() in slab allocation, e.g. SLUB's alloc_slab_page() */ - smp_rmb(); - if (unlikely(folio_test_slab(folio))) - goto out_putfolio; /* * As movable pages are not isolated from LRU lists, concurrent @@ -136,7 +128,7 @@ static void putback_movable_folio(struct folio *folio) * * This function shall be used whenever the isolated pageset has been * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range() - * and isolate_hugetlb(). + * and folio_isolate_hugetlb(). */ void putback_movable_pages(struct list_head *l) { @@ -145,7 +137,7 @@ void putback_movable_pages(struct list_head *l) list_for_each_entry_safe(folio, folio2, l, lru) { if (unlikely(folio_test_hugetlb(folio))) { - folio_putback_active_hugetlb(folio); + folio_putback_hugetlb(folio); continue; } list_del(&folio->lru); @@ -177,7 +169,7 @@ bool isolate_folio_to_list(struct folio *folio, struct list_head *list) bool isolated, lru; if (folio_test_hugetlb(folio)) - return isolate_hugetlb(folio, list); + return folio_isolate_hugetlb(folio, list); lru = !__folio_test_movable(folio); if (lru) @@ -1262,7 +1254,10 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, */ switch (mode) { case MIGRATE_SYNC: - break; + if (!src->mapping || + !mapping_writeback_indeterminate(src->mapping)) + break; + fallthrough; default: rc = -EBUSY; goto out; @@ -1459,7 +1454,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, if (folio_ref_count(src) == 1) { /* page was freed from under us. So we are done. 
*/ - folio_putback_active_hugetlb(src); + folio_putback_hugetlb(src); return MIGRATEPAGE_SUCCESS; } @@ -1542,19 +1537,19 @@ out_unlock: folio_unlock(src); out: if (rc == MIGRATEPAGE_SUCCESS) - folio_putback_active_hugetlb(src); + folio_putback_hugetlb(src); else if (rc != -EAGAIN) list_move_tail(&src->lru, ret); /* - * If migration was not successful and there's a freeing callback, use - * it. Otherwise, put_page() will drop the reference grabbed during - * isolation. + * If migration was not successful and there's a freeing callback, + * return the folio to that special allocator. Otherwise, simply drop + * our additional reference. */ if (put_new_folio) put_new_folio(dst, private); else - folio_putback_active_hugetlb(dst); + folio_put(dst); return rc; } @@ -2208,7 +2203,7 @@ static int __add_folio_for_migration(struct folio *folio, int node, return -EACCES; if (folio_test_hugetlb(folio)) { - if (isolate_hugetlb(folio, pagelist)) + if (folio_isolate_hugetlb(folio, pagelist)) return 1; } else if (folio_isolate_lru(folio)) { list_add_tail(&folio->lru, pagelist); @@ -2683,8 +2678,7 @@ int migrate_misplaced_folio_prepare(struct folio *folio, * elevated reference count on the folio. This function will un-isolate the * folio, dereferencing the folio before returning. */ -int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, - int node) +int migrate_misplaced_folio(struct folio *folio, int node) { pg_data_t *pgdat = NODE_DATA(node); int nr_remaining; diff --git a/mm/mm_init.c b/mm/mm_init.c index 24b68b425afb..2630cc30147e 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1585,13 +1585,17 @@ void __init *memmap_alloc(phys_addr_t size, phys_addr_t align, { void *ptr; + /* + * Kmemleak will explicitly scan mem_map by traversing all valid + * `struct page` objects, so memblock does not need to be added to the scan list. + */ if (exact_nid) ptr = memblock_alloc_exact_nid_raw(size, align, min_addr, - MEMBLOCK_ALLOC_ACCESSIBLE, + MEMBLOCK_ALLOC_NOLEAKTRACE, nid); else ptr = memblock_alloc_try_nid_raw(size, align, min_addr, - MEMBLOCK_ALLOC_ACCESSIBLE, + MEMBLOCK_ALLOC_NOLEAKTRACE, nid); if (ptr && size > 0) diff --git a/mm/mmap.c b/mm/mmap.c index aec208f90337..7aa36216ecc0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -111,8 +111,7 @@ static int check_brk_limits(unsigned long addr, unsigned long len) return mlock_future_ok(current->mm, current->mm->def_flags, len) ? 0 : -EAGAIN; } -static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, - unsigned long addr, unsigned long request, unsigned long flags); + SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long newbrk, oldbrk, origbrk; @@ -278,8 +277,62 @@ static inline bool file_mmap_ok(struct file *file, struct inode *inode, return true; } -/* +/** + * do_mmap() - Perform a userland memory mapping into the current process + * address space of length @len with protection bits @prot, mmap flags @flags + * (from which VMA flags will be inferred), and any additional VMA flags to + * apply, @vm_flags. If this is a file-backed mapping then the file is specified + * in @file and page offset into the file via @pgoff. + * + * This function does not perform security checks on the file and assumes, if + * @uf is non-NULL, the caller has provided a list head to track unmap events + * for userfaultfd @uf. + * + * It also simply indicates whether memory population is required by setting + * @populate, which must be non-NULL, expecting the caller to actually perform + * this task itself if appropriate.
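
The canonical caller pattern, modelled on vm_mmap_pgoff() (a sketch with most error handling elided), looks like:

	unsigned long populate = 0;
	unsigned long addr;
	LIST_HEAD(uf);

	if (mmap_write_lock_killable(mm))
		return -EINTR;
	addr = do_mmap(file, hint, len, prot, flags, 0, pgoff, &populate, &uf);
	mmap_write_unlock(mm);

	userfaultfd_unmap_complete(mm, &uf);
	if (!IS_ERR_VALUE(addr) && populate)
		mm_populate(addr, populate);
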
+ * + * This function will invoke architecture-specific (and if provided and + * relevant, file system-specific) logic to determine the most appropriate + * unmapped area in which to place the mapping if not MAP_FIXED. + * + * Callers which require userland mmap() behaviour should invoke vm_mmap(), + * which is also exported for module use. + * + * Those which require this behaviour, minus the security checks, userfaultfd + * handling and populate behaviour, and which handle the mmap write lock + * themselves, should call this function. + * + * Note that the returned address may reside within a merged VMA if an + * appropriate merge were to take place, so it doesn't necessarily specify the + * start of a VMA, rather only the start of a valid mapped range of length + * @len bytes, rounded down to the nearest page size. + * * The caller must write-lock current->mm->mmap_lock. + * + * @file: An optional struct file pointer describing the file which is to be + * mapped, if a file-backed mapping. + * @addr: If non-zero, hints at (or if @flags has MAP_FIXED set, specifies) the + * address at which to perform this mapping. See mmap (2) for details. Must be + * page-aligned. + * @len: The length of the mapping. Will be page-aligned and must be at least 1 + * page in size. + * @prot: Protection bits describing access required to the mapping. See mmap + * (2) for details. + * @flags: Flags specifying how the mapping should be performed, see mmap (2) + * for details. + * @vm_flags: VMA flags which should be set by default, or 0 otherwise. + * @pgoff: Page offset into the @file if file-backed, should be 0 otherwise. + * @populate: A pointer to a value which will be set to 0 if no population of + * the range is required, or the number of bytes to populate if it is. Must be + * non-NULL. See mmap (2) for details as to under what circumstances population + * of the range occurs. + * @uf: An optional pointer to a list head to track userfaultfd unmap events + * should unmapping events arise. If provided, it is up to the caller to manage + * this. + * + * Returns: Either an error, or the address at which the requested mapping has + * been performed. */ unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, @@ -292,6 +345,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr, *populate = 0; + mmap_assert_write_locked(mm); + if (!len) return -EINVAL; @@ -369,8 +424,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (file) { struct inode *inode = file_inode(file); - unsigned int seals = memfd_file_seals(file); unsigned long flags_mask; + int err; if (!file_mmap_ok(file, inode, pgoff, len)) return -EOVERFLOW; @@ -410,8 +465,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags |= VM_SHARED | VM_MAYSHARE; if (!(file->f_mode & FMODE_WRITE)) vm_flags &= ~(VM_MAYWRITE | VM_SHARED); - else if (is_readonly_sealed(seals, vm_flags)) - vm_flags &= ~VM_MAYWRITE; fallthrough; case MAP_PRIVATE: if (!(file->f_mode & FMODE_READ)) @@ -431,6 +484,14 @@ unsigned long do_mmap(struct file *file, unsigned long addr, default: return -EINVAL; } + + /* + * Check to see if we are violating any seals and update VMA + * flags if necessary to avoid future seal violations.
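
From userspace, the behaviour these hooks implement can be sketched as follows (hypothetical demo program, error handling omitted): a write-sealed memfd may still be mapped shared read-only, but the mapping can never be made writable afterwards:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = memfd_create("demo", MFD_CLOEXEC | MFD_ALLOW_SEALING);
		void *p;

		ftruncate(fd, 4096);
		fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE);

		/* A shared read-only mapping of the write-sealed memfd succeeds... */
		p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);

		/* ...but upgrading it to writable fails, since VM_MAYWRITE is clear. */
		return mprotect(p, 4096, PROT_READ | PROT_WRITE);	/* -1, EACCES */
	}
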
+ */ + err = memfd_check_seals_mmap(file, &vm_flags); + if (err) + return (unsigned long)err; } else { switch (flags & MAP_TYPE) { case MAP_SHARED: @@ -581,115 +642,6 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) } #endif /* __ARCH_WANT_SYS_OLD_MMAP */ -/** - * unmapped_area() - Find an area between the low_limit and the high_limit with - * the correct alignment and offset, all from @info. Note: current->mm is used - * for the search. - * - * @info: The unmapped area information including the range [low_limit - - * high_limit), the alignment offset and mask. - * - * Return: A memory address or -ENOMEM. - */ -static unsigned long unmapped_area(struct vm_unmapped_area_info *info) -{ - unsigned long length, gap; - unsigned long low_limit, high_limit; - struct vm_area_struct *tmp; - VMA_ITERATOR(vmi, current->mm, 0); - - /* Adjust search length to account for worst case alignment overhead */ - length = info->length + info->align_mask + info->start_gap; - if (length < info->length) - return -ENOMEM; - - low_limit = info->low_limit; - if (low_limit < mmap_min_addr) - low_limit = mmap_min_addr; - high_limit = info->high_limit; -retry: - if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length)) - return -ENOMEM; - - /* - * Adjust for the gap first so it doesn't interfere with the - * later alignment. The first step is the minimum needed to - * fulill the start gap, the next steps is the minimum to align - * that. It is the minimum needed to fulill both. - */ - gap = vma_iter_addr(&vmi) + info->start_gap; - gap += (info->align_offset - gap) & info->align_mask; - tmp = vma_next(&vmi); - if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ - if (vm_start_gap(tmp) < gap + length - 1) { - low_limit = tmp->vm_end; - vma_iter_reset(&vmi); - goto retry; - } - } else { - tmp = vma_prev(&vmi); - if (tmp && vm_end_gap(tmp) > gap) { - low_limit = vm_end_gap(tmp); - vma_iter_reset(&vmi); - goto retry; - } - } - - return gap; -} - -/** - * unmapped_area_topdown() - Find an area between the low_limit and the - * high_limit with the correct alignment and offset at the highest available - * address, all from @info. Note: current->mm is used for the search. - * - * @info: The unmapped area information including the range [low_limit - - * high_limit), the alignment offset and mask. - * - * Return: A memory address or -ENOMEM. 
- */ -static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) -{ - unsigned long length, gap, gap_end; - unsigned long low_limit, high_limit; - struct vm_area_struct *tmp; - VMA_ITERATOR(vmi, current->mm, 0); - - /* Adjust search length to account for worst case alignment overhead */ - length = info->length + info->align_mask + info->start_gap; - if (length < info->length) - return -ENOMEM; - - low_limit = info->low_limit; - if (low_limit < mmap_min_addr) - low_limit = mmap_min_addr; - high_limit = info->high_limit; -retry: - if (vma_iter_area_highest(&vmi, low_limit, high_limit, length)) - return -ENOMEM; - - gap = vma_iter_end(&vmi) - info->length; - gap -= (gap - info->align_offset) & info->align_mask; - gap_end = vma_iter_end(&vmi); - tmp = vma_next(&vmi); - if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ - if (vm_start_gap(tmp) < gap_end) { - high_limit = vm_start_gap(tmp); - vma_iter_reset(&vmi); - goto retry; - } - } else { - tmp = vma_prev(&vmi); - if (tmp && vm_end_gap(tmp) > gap) { - high_limit = tmp->vm_start; - vma_iter_reset(&vmi); - goto retry; - } - } - - return gap; -} - /* * Determine if the allocation needs to ensure that there is no * existing mapping within it's guard gaps, for use as start_gap. @@ -989,211 +941,6 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr, return vma; } -/* - * Verify that the stack growth is acceptable and - * update accounting. This is shared with both the - * grow-up and grow-down cases. - */ -static int acct_stack_growth(struct vm_area_struct *vma, - unsigned long size, unsigned long grow) -{ - struct mm_struct *mm = vma->vm_mm; - unsigned long new_start; - - /* address space limit tests */ - if (!may_expand_vm(mm, vma->vm_flags, grow)) - return -ENOMEM; - - /* Stack limit test */ - if (size > rlimit(RLIMIT_STACK)) - return -ENOMEM; - - /* mlock limit tests */ - if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT)) - return -ENOMEM; - - /* Check to ensure the stack will not grow into a hugetlb-only region */ - new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : - vma->vm_end - size; - if (is_hugepage_only_range(vma->vm_mm, new_start, size)) - return -EFAULT; - - /* - * Overcommit.. This must be the final test, as it will - * update security statistics. - */ - if (security_vm_enough_memory_mm(mm, grow)) - return -ENOMEM; - - return 0; -} - -#if defined(CONFIG_STACK_GROWSUP) -/* - * PA-RISC uses this for its stack. - * vma is the last one with address > vma->vm_end. Have to extend vma. - */ -static int expand_upwards(struct vm_area_struct *vma, unsigned long address) -{ - struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *next; - unsigned long gap_addr; - int error = 0; - VMA_ITERATOR(vmi, mm, vma->vm_start); - - if (!(vma->vm_flags & VM_GROWSUP)) - return -EFAULT; - - mmap_assert_write_locked(mm); - - /* Guard against exceeding limits of the address space. */ - address &= PAGE_MASK; - if (address >= (TASK_SIZE & PAGE_MASK)) - return -ENOMEM; - address += PAGE_SIZE; - - /* Enforce stack_guard_gap */ - gap_addr = address + stack_guard_gap; - - /* Guard against overflow */ - if (gap_addr < address || gap_addr > TASK_SIZE) - gap_addr = TASK_SIZE; - - next = find_vma_intersection(mm, vma->vm_end, gap_addr); - if (next && vma_is_accessible(next)) { - if (!(next->vm_flags & VM_GROWSUP)) - return -ENOMEM; - /* Check that both stack segments have the same anon_vma? 
*/ - } - - if (next) - vma_iter_prev_range_limit(&vmi, address); - - vma_iter_config(&vmi, vma->vm_start, address); - if (vma_iter_prealloc(&vmi, vma)) - return -ENOMEM; - - /* We must make sure the anon_vma is allocated. */ - if (unlikely(anon_vma_prepare(vma))) { - vma_iter_free(&vmi); - return -ENOMEM; - } - - /* Lock the VMA before expanding to prevent concurrent page faults */ - vma_start_write(vma); - /* We update the anon VMA tree. */ - anon_vma_lock_write(vma->anon_vma); - - /* Somebody else might have raced and expanded it already */ - if (address > vma->vm_end) { - unsigned long size, grow; - - size = address - vma->vm_start; - grow = (address - vma->vm_end) >> PAGE_SHIFT; - - error = -ENOMEM; - if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { - error = acct_stack_growth(vma, size, grow); - if (!error) { - if (vma->vm_flags & VM_LOCKED) - mm->locked_vm += grow; - vm_stat_account(mm, vma->vm_flags, grow); - anon_vma_interval_tree_pre_update_vma(vma); - vma->vm_end = address; - /* Overwrite old entry in mtree. */ - vma_iter_store(&vmi, vma); - anon_vma_interval_tree_post_update_vma(vma); - - perf_event_mmap(vma); - } - } - } - anon_vma_unlock_write(vma->anon_vma); - vma_iter_free(&vmi); - validate_mm(mm); - return error; -} -#endif /* CONFIG_STACK_GROWSUP */ - -/* - * vma is the first one with address < vma->vm_start. Have to extend vma. - * mmap_lock held for writing. - */ -int expand_downwards(struct vm_area_struct *vma, unsigned long address) -{ - struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *prev; - int error = 0; - VMA_ITERATOR(vmi, mm, vma->vm_start); - - if (!(vma->vm_flags & VM_GROWSDOWN)) - return -EFAULT; - - mmap_assert_write_locked(mm); - - address &= PAGE_MASK; - if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) - return -EPERM; - - /* Enforce stack_guard_gap */ - prev = vma_prev(&vmi); - /* Check that both stack segments have the same anon_vma? */ - if (prev) { - if (!(prev->vm_flags & VM_GROWSDOWN) && - vma_is_accessible(prev) && - (address - prev->vm_end < stack_guard_gap)) - return -ENOMEM; - } - - if (prev) - vma_iter_next_range_limit(&vmi, vma->vm_start); - - vma_iter_config(&vmi, address, vma->vm_end); - if (vma_iter_prealloc(&vmi, vma)) - return -ENOMEM; - - /* We must make sure the anon_vma is allocated. */ - if (unlikely(anon_vma_prepare(vma))) { - vma_iter_free(&vmi); - return -ENOMEM; - } - - /* Lock the VMA before expanding to prevent concurrent page faults */ - vma_start_write(vma); - /* We update the anon VMA tree. */ - anon_vma_lock_write(vma->anon_vma); - - /* Somebody else might have raced and expanded it already */ - if (address < vma->vm_start) { - unsigned long size, grow; - - size = vma->vm_end - address; - grow = (vma->vm_start - address) >> PAGE_SHIFT; - - error = -ENOMEM; - if (grow <= vma->vm_pgoff) { - error = acct_stack_growth(vma, size, grow); - if (!error) { - if (vma->vm_flags & VM_LOCKED) - mm->locked_vm += grow; - vm_stat_account(mm, vma->vm_flags, grow); - anon_vma_interval_tree_pre_update_vma(vma); - vma->vm_start = address; - vma->vm_pgoff -= grow; - /* Overwrite old entry in mtree. */ - vma_iter_store(&vmi, vma); - anon_vma_interval_tree_post_update_vma(vma); - - perf_event_mmap(vma); - } - } - } - anon_vma_unlock_write(vma->anon_vma); - vma_iter_free(&vmi); - validate_mm(mm); - return error; -} - /* enforced gap between the expanding stack and other mappings. 
*/ unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT; [...] - error = mapping_map_writable(file->f_mapping); - - if (error) - return error; - writable_file_mapping = true; - } - - ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf); - - /* Clear our write mapping regardless of error. */ - if (writable_file_mapping) - mapping_unmap_writable(file->f_mapping); - - validate_mm(current->mm); - return ret; -} - -static int __vm_munmap(unsigned long start, size_t len, bool unlock) -{ - int ret; - struct mm_struct *mm = current->mm; - LIST_HEAD(uf); - VMA_ITERATOR(vmi, mm, start); - - if (mmap_write_lock_killable(mm)) - return -EINTR; - - ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock); - if (ret || !unlock) - mmap_write_unlock(mm); - - userfaultfd_unmap_complete(mm, &uf); - return ret; -} - int vm_munmap(unsigned long start, size_t len) { return __vm_munmap(start, len, false); @@ -1512,88 +1207,6 @@ out: return ret; } -/* - * do_brk_flags() - Increase the brk vma if the flags match. - * @vmi: The vma iterator - * @addr: The start address - * @len: The length of the increase - * @vma: The vma, - * @flags: The VMA Flags - * - * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags - * do not match then create a new anonymous VMA. Eventually we may be able to - * do some brk-specific accounting here. - */ -static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long addr, unsigned long len, unsigned long flags) -{ - struct mm_struct *mm = current->mm; - - /* - * Check against address space limits by the changed size - * Note: This happens *after* clearing old mappings in some code paths. - */ - flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) - return -ENOMEM; - - if (mm->map_count > sysctl_max_map_count) - return -ENOMEM; - - if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) - return -ENOMEM; - - /* - * Expand the existing vma if possible; Note that singular lists do not - * occur after forking, so the expand will only happen on new VMAs. - */ - if (vma && vma->vm_end == addr) { - VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr)); - - vmg.prev = vma; - /* vmi is positioned at prev, which this mode expects. */ - vmg.merge_flags = VMG_FLAG_JUST_EXPAND; - - if (vma_merge_new_range(&vmg)) - goto out; - else if (vmg_nomem(&vmg)) - goto unacct_fail; - } - - if (vma) - vma_iter_next_range(vmi); - /* create a vma struct for an anonymous mapping */ - vma = vm_area_alloc(mm); - if (!vma) - goto unacct_fail; - - vma_set_anonymous(vma); - vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT); - vm_flags_init(vma, flags); - vma->vm_page_prot = vm_get_page_prot(flags); - vma_start_write(vma); - if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) - goto mas_store_fail; - - mm->map_count++; - validate_mm(mm); - ksm_add_vma(vma); -out: - perf_event_mmap(vma); - mm->total_vm += len >> PAGE_SHIFT; - mm->data_vm += len >> PAGE_SHIFT; - if (flags & VM_LOCKED) - mm->locked_vm += (len >> PAGE_SHIFT); - vm_flags_set(vma, VM_SOFTDIRTY); - return 0; - -mas_store_fail: - vm_area_free(vma); -unacct_fail: - vm_unacct_memory(len >> PAGE_SHIFT); - return -ENOMEM; -} - int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) { struct mm_struct *mm = current->mm; @@ -1664,7 +1277,6 @@ void exit_mmap(struct mm_struct *mm) goto destroy; } - lru_add_drain(); flush_cache_mm(mm); tlb_gather_mmu_fullmm(&tlb, mm); /* update_hiwater_rss(mm) here?
but nobody should be looking */ @@ -1693,7 +1305,8 @@ void exit_mmap(struct mm_struct *mm) do { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); - remove_vma(vma, /* unreachable = */ true); + vma_mark_detached(vma); + remove_vma(vma); count++; cond_resched(); vma = vma_next(&vmi); @@ -2107,7 +1720,6 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) vma, new_start, length, false, true)) return -ENOMEM; - lru_add_drain(); tlb_gather_mmu(&tlb, mm); next = vma_next(&vmi); if (new_end > old_start) { @@ -2132,3 +1744,55 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) /* Shrink the vma to just the new range */ return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff); } + +#ifdef CONFIG_MMU +/* + * Obtain a read lock on mm->mmap_lock, if the specified address is below the + * start of the VMA, the intent is to perform a write, and it is a + * downward-growing stack, then attempt to expand the stack to contain it. + * + * This function is intended only for obtaining an argument page from an ELF + * image, and is almost certainly NOT what you want to use for any other + * purpose. + * + * IMPORTANT - VMA fields are accessed without an mmap lock being held, so the + * VMA referenced must not be linked in any user-visible tree, i.e. it must be a + * new VMA being mapped. + * + * The function assumes that addr is either contained within the VMA or below + * it, and makes no attempt to validate this value beyond that. + * + * Returns true if the read lock was obtained and a stack was perhaps expanded, + * false if the stack expansion failed. + * + * On stack expansion the function temporarily acquires an mmap write lock + * before downgrading it. + */ +bool mmap_read_lock_maybe_expand(struct mm_struct *mm, + struct vm_area_struct *new_vma, + unsigned long addr, bool write) +{ + if (!write || addr >= new_vma->vm_start) { + mmap_read_lock(mm); + return true; + } + + if (!(new_vma->vm_flags & VM_GROWSDOWN)) + return false; + + mmap_write_lock(mm); + if (expand_downwards(new_vma, addr)) { + mmap_write_unlock(mm); + return false; + } + + mmap_write_downgrade(mm); + return true; +} +#else +bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, bool write) +{ + return false; +} +#endif diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index f186d57df2c6..e7dbaf96aa17 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -17,51 +17,7 @@ EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking); EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned); EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released); -#ifdef CONFIG_MEMCG - -/* - * Size of the buffer for memcg path names. Ignoring stack trace support, - * trace_events_hist.c uses MAX_FILTER_STR_VAL for this, so we also use it. - */ -#define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL - -#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ - do { \ - if (trace_mmap_lock_##type##_enabled()) { \ - char buf[MEMCG_PATH_BUF_SIZE]; \ - get_mm_memcg_path(mm, buf, sizeof(buf)); \ - trace_mmap_lock_##type(mm, buf, ##__VA_ARGS__); \ - } \ - } while (0) - -#else /* !CONFIG_MEMCG */ - -#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ - trace_mmap_lock_##type(mm, "", ##__VA_ARGS__) - -#endif /* CONFIG_MEMCG */ - #ifdef CONFIG_TRACING -#ifdef CONFIG_MEMCG -/* - * Write the given mm_struct's memcg path to a buffer. If the path cannot be - * determined, empty string is written. 
- */ -static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen) -{ - struct mem_cgroup *memcg; - - buf[0] = '\0'; - memcg = get_mem_cgroup_from_mm(mm); - if (memcg == NULL) - return; - if (memcg->css.cgroup) - cgroup_path(memcg->css.cgroup, buf, buflen); - css_put(&memcg->css); -} - -#endif /* CONFIG_MEMCG */ - /* * Trace calls must be in a separate file, as otherwise there's a circular * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h. @@ -69,20 +25,20 @@ static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen) void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write) { - TRACE_MMAP_LOCK_EVENT(start_locking, mm, write); + trace_mmap_lock_start_locking(mm, write); } EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking); void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write, bool success) { - TRACE_MMAP_LOCK_EVENT(acquire_returned, mm, write, success); + trace_mmap_lock_acquire_returned(mm, write, success); } EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned); void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write) { - TRACE_MMAP_LOCK_EVENT(released, mm, write); + trace_mmap_lock_released(mm, write); } EXPORT_SYMBOL(__mmap_lock_do_trace_released); #endif /* CONFIG_TRACING */ diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 99b3e9408aa0..7aa6f18c500b 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -311,11 +311,34 @@ static inline void tlb_table_invalidate(struct mmu_gather *tlb) } } -static void tlb_remove_table_one(void *table) +#ifdef CONFIG_PT_RECLAIM +static inline void __tlb_remove_table_one_rcu(struct rcu_head *head) +{ + struct ptdesc *ptdesc; + + ptdesc = container_of(head, struct ptdesc, pt_rcu_head); + __tlb_remove_table(ptdesc); +} + +static inline void __tlb_remove_table_one(void *table) +{ + struct ptdesc *ptdesc; + + ptdesc = table; + call_rcu(&ptdesc->pt_rcu_head, __tlb_remove_table_one_rcu); +} +#else +static inline void __tlb_remove_table_one(void *table) { tlb_remove_table_sync_one(); __tlb_remove_table(table); } +#endif /* CONFIG_PT_RECLAIM */ + +static void tlb_remove_table_one(void *table) +{ + __tlb_remove_table_one(table); +} static void tlb_table_flush(struct mmu_gather *tlb) { diff --git a/mm/mseal.c b/mm/mseal.c index 81d6e980e8a9..c27197ac04e8 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -217,9 +217,9 @@ int do_mseal(unsigned long start, size_t len_in, unsigned long flags) unsigned long end; struct mm_struct *mm = current->mm; - ret = can_do_mseal(flags); - if (ret) - return ret; + /* Verify flags not set. 
*/ + if (flags) + return -EINVAL; start = untagged_addr(start); if (!PAGE_ALIGNED(start)) diff --git a/mm/numa.c b/mm/numa.c index e2eec07707d1..f1787d7713a6 100644 --- a/mm/numa.c +++ b/mm/numa.c @@ -37,13 +37,7 @@ void __init alloc_node_data(int nid) void __init alloc_offline_node_data(int nid) { pg_data_t *pgdat; - - pgdat = memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); - if (!pgdat) - panic("Cannot allocate %zuB for node %d.\n", - sizeof(*pgdat), nid); - - node_data[nid] = pgdat; + node_data[nid] = memblock_alloc_or_panic(sizeof(*pgdat), SMP_CACHE_BYTES); } /* Stub functions: */ diff --git a/mm/numa_emulation.c b/mm/numa_emulation.c index 031fb9961bf7..9d55679d99ce 100644 --- a/mm/numa_emulation.c +++ b/mm/numa_emulation.c @@ -8,11 +8,12 @@ #include #include #include +#include #define FAKE_NODE_MIN_SIZE ((u64)32 << 20) #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) -static int emu_nid_to_phys[MAX_NUMNODES]; +int emu_nid_to_phys[MAX_NUMNODES]; static char *emu_cmdline __initdata; int __init numa_emu_cmdline(char *str) @@ -379,6 +380,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); int max_emu_nid, dfl_phys_nid; int i, j, ret; + nodemask_t physnode_mask = numa_nodes_parsed; if (!emu_cmdline) goto no_emu; @@ -395,7 +397,6 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) * split the system RAM into N fake nodes. */ if (strchr(emu_cmdline, 'U')) { - nodemask_t physnode_mask = numa_nodes_parsed; unsigned long n; int nid = 0; @@ -465,9 +466,6 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) */ max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid); - /* commit */ - *numa_meminfo = ei; - /* Make sure numa_nodes_parsed only contains emulated nodes */ nodes_clear(numa_nodes_parsed); for (i = 0; i < ARRAY_SIZE(ei.blk); i++) @@ -475,10 +473,21 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) ei.blk[i].nid != NUMA_NO_NODE) node_set(ei.blk[i].nid, numa_nodes_parsed); - numa_emu_update_cpu_to_node(emu_nid_to_phys, ARRAY_SIZE(emu_nid_to_phys)); + /* fix pxm_to_node_map[] and node_to_pxm_map[] to avoid collision + * with faked numa nodes, particularly during later memory hotplug + * handling, and also update numa_nodes_parsed accordingly. 
+ */ + ret = fix_pxm_node_maps(max_emu_nid); + if (ret < 0) + goto no_emu; + + /* commit */ + *numa_meminfo = ei; + + numa_emu_update_cpu_to_node(emu_nid_to_phys, max_emu_nid + 1); /* make sure all emulated nodes are mapped to a physical node */ - for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) + for (i = 0; i < max_emu_nid + 1; i++) if (emu_nid_to_phys[i] == NUMA_NO_NODE) emu_nid_to_phys[i] = dfl_phys_nid; @@ -501,12 +510,34 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) numa_set_distance(i, j, dist); } } + for (i = 0; i < numa_distance_cnt; i++) { + for (j = 0; j < numa_distance_cnt; j++) { + int physi, physj; + u8 dist; + + /* distance between fake nodes is already ok */ + if (emu_nid_to_phys[i] != NUMA_NO_NODE && + emu_nid_to_phys[j] != NUMA_NO_NODE) + continue; + if (emu_nid_to_phys[i] != NUMA_NO_NODE) + physi = emu_nid_to_phys[i]; + else + physi = i - max_emu_nid; + if (emu_nid_to_phys[j] != NUMA_NO_NODE) + physj = emu_nid_to_phys[j]; + else + physj = j - max_emu_nid; + dist = phys_dist[physi * numa_dist_cnt + physj]; + numa_set_distance(i, j, dist); + } + } /* free the copied physical distance table */ memblock_free(phys_dist, phys_size); return; no_emu: + numa_nodes_parsed = physnode_mask; /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) emu_nid_to_phys[i] = i; diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c index a3877e9bc878..ff4054f4334d 100644 --- a/mm/numa_memblks.c +++ b/mm/numa_memblks.c @@ -7,7 +7,7 @@ #include #include -static int numa_distance_cnt; +int numa_distance_cnt; static u8 *numa_distance; nodemask_t numa_nodes_parsed __initdata; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1c485beb0b93..044ebab2c941 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include "internal.h" @@ -430,10 +431,15 @@ static void dump_tasks(struct oom_control *oc) mem_cgroup_scan_tasks(oc->memcg, dump_task, oc); else { struct task_struct *p; + int i = 0; rcu_read_lock(); - for_each_process(p) + for_each_process(p) { + /* Avoid potential softlockup warning */ + if ((++i & 1023) == 0) + touch_softlockup_watchdog(); dump_task(p, oc); + } rcu_read_unlock(); } } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d9861e42b2bd..4f5970723cf2 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -942,26 +942,25 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc, wb_min_max_ratio(wb, &wb_min_ratio, &wb_max_ratio); wb_thresh += (thresh * wb_min_ratio) / (100 * BDI_RATIO_SCALE); + + /* + * It's very possible that wb_thresh is close to 0 not because the + * device is slow, but because it has remained inactive for a long + * time. Honour such devices with a reasonably good (hopefully + * IO-efficient) threshold, so that occasional writes won't be blocked + * and active writes can ramp up the threshold quickly. + */ + if (thresh > dtc->dirty) { + if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) + wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 100); + else + wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 8); + } + wb_max_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE); if (wb_thresh > wb_max_thresh) wb_thresh = wb_max_thresh; - /* - * With strictlimit flag, the wb_thresh is treated as - * a hard limit in balance_dirty_pages() and wb_position_ratio(). 
- * It's possible that wb_thresh is close to zero, not because - * the device is slow, but because it has been inactive. - * To prevent occasional writes from being blocked, we raise wb_thresh. - */ - if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { - unsigned long limit = hard_dirty_limit(dom, dtc->thresh); - u64 wb_scale_thresh = 0; - - if (limit > dtc->dirty) - wb_scale_thresh = (limit - dtc->dirty) / 100; - wb_thresh = max(wb_thresh, min(wb_scale_thresh, wb_max_thresh / 4)); - } - return wb_thresh; } @@ -969,6 +968,7 @@ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh) { struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; + domain_dirty_avail(&gdtc, true); return __wb_calc_thresh(&gdtc, thresh); } @@ -1145,12 +1145,6 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc) if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { long long wb_pos_ratio; - if (dtc->wb_dirty < 8) { - dtc->pos_ratio = min_t(long long, pos_ratio * 2, - 2 << RATELIMIT_CALC_SHIFT); - return; - } - if (dtc->wb_dirty >= wb_thresh) return; @@ -1221,14 +1215,6 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc) */ if (unlikely(wb_thresh > dtc->thresh)) wb_thresh = dtc->thresh; - /* - * It's very possible that wb_thresh is close to 0 not because the - * device is slow, but that it has remained inactive for long time. - * Honour such devices a reasonable good (hopefully IO efficient) - * threshold, so that the occasional writes won't be blocked and active - * writes can rampup the threshold quickly. - */ - wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8); /* * scale global setpoint to wb's: * wb_setpoint = setpoint * wb_thresh / thresh @@ -1484,17 +1470,10 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate). * Hence, to calculate "step" properly, we have to use wb_dirty as * "dirty" and wb_setpoint as "setpoint". - * - * We rampup dirty_ratelimit forcibly if wb_dirty is low because - * it's possible that wb_thresh is close to zero due to inactivity - * of backing device. */ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { dirty = dtc->wb_dirty; - if (dtc->wb_dirty < 8) - setpoint = dtc->wb_dirty + 1; - else - setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2; + setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2; } if (dirty < setpoint) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 01eab25edf89..c9d5f2450a08 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1295,12 +1295,6 @@ void __meminit __free_pages_core(struct page *page, unsigned int order, set_page_count(p, 0); } - /* - * Freeing the page with debug_pagealloc enabled will try to - * unmap it; some archs don't like double-unmappings, so - * map it first. - */ - debug_pagealloc_map_pages(page, nr_pages); adjust_managed_page_count(page, nr_pages); } else { for (loop = 0; loop < nr_pages; loop++, p++) { @@ -1508,7 +1502,6 @@ inline void post_alloc_hook(struct page *page, unsigned int order, int i; set_page_private(page, 0); - set_page_refcounted(page); arch_alloc_page(page, order); debug_pagealloc_map_pages(page, 1 << order); @@ -1856,6 +1849,14 @@ static bool can_steal_fallback(unsigned int order, int start_mt) if (order >= pageblock_order) return true; + /* + * Movable pages won't cause permanent fragmentation, so when you alloc + * small pages, you just need to temporarily steal unmovable or + * reclaimable pages that are closest to the request size. 
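+ * (Stealing means taking free pages from a pageblock of another + * migratetype, possibly converting the whole pageblock.)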
After a + * while, memory compaction may occur to form large contiguous pages, + * and the next movable allocation may not need to steal. Unmovable and + * reclaimable allocations need to actually steal pages. + */ if (order >= pageblock_order / 2 || start_mt == MIGRATE_RECLAIMABLE || start_mt == MIGRATE_UNMOVABLE || @@ -2592,9 +2593,9 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, return high; } -static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, - struct page *page, int migratetype, - unsigned int order) +static void free_frozen_page_commit(struct zone *zone, + struct per_cpu_pages *pcp, struct page *page, int migratetype, + unsigned int order) { int high, batch; int pindex; @@ -2643,7 +2644,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, /* * Free a pcp page */ -void free_unref_page(struct page *page, unsigned int order) +void free_frozen_pages(struct page *page, unsigned int order) { unsigned long __maybe_unused UP_flags; struct per_cpu_pages *pcp; @@ -2666,20 +2667,20 @@ void free_unref_page(struct page *page, unsigned int order) * get those areas back if necessary. Otherwise, we may have to free * excessively into the page allocator */ + zone = page_zone(page); migratetype = get_pfnblock_migratetype(page, pfn); if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(page_zone(page), page, pfn, order, FPI_NONE); + free_one_page(zone, page, pfn, order, FPI_NONE); return; } migratetype = MIGRATE_MOVABLE; } - zone = page_zone(page); pcp_trylock_prepare(UP_flags); pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (pcp) { - free_unref_page_commit(zone, pcp, page, migratetype, order); + free_frozen_page_commit(zone, pcp, page, migratetype, order); pcp_spin_unlock(pcp); } else { free_one_page(zone, page, pfn, order, FPI_NONE); @@ -2743,7 +2744,7 @@ void free_unref_folios(struct folio_batch *folios) /* * Free isolated pages directly to the - * allocator, see comment in free_unref_page. + * allocator, see comment in free_frozen_pages. */ if (is_migrate_isolate(migratetype)) { free_one_page(zone, &folio->page, pfn, @@ -2774,7 +2775,7 @@ void free_unref_folios(struct folio_batch *folios) migratetype = MIGRATE_MOVABLE; trace_mm_page_free_batched(&folio->page); - free_unref_page_commit(zone, pcp, &folio->page, migratetype, + free_frozen_page_commit(zone, pcp, &folio->page, migratetype, order); } @@ -3567,7 +3568,6 @@ __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, if (!page) page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); - return page; } @@ -4531,28 +4531,23 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, } /* - * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array + * __alloc_pages_bulk - Allocate a number of order-0 pages to an array * @gfp: GFP flags for the allocation * @preferred_nid: The preferred NUMA node ID to allocate from * @nodemask: Set of nodes to allocate from, may be NULL - * @nr_pages: The number of pages desired on the list or array - * @page_list: Optional list to store the allocated pages - * @page_array: Optional array to store the pages + * @nr_pages: The number of pages desired in the array + * @page_array: Array to store the pages * * This is a batched version of the page allocator that attempts to - * allocate nr_pages quickly. 
Pages are added to page_list if page_list - * is not NULL, otherwise it is assumed that the page_array is valid. + * allocate nr_pages quickly. Pages are added to the page_array. * - * For lists, nr_pages is the number of pages that should be allocated. - * - * For arrays, only NULL elements are populated with pages and nr_pages + * Note that only NULL elements are populated with pages and nr_pages * is the maximum number of pages that will be stored in the array. * - * Returns the number of pages on the list or array. + * Returns the number of pages in the array. */ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, - struct list_head *page_list, struct page **page_array) { struct page *page; @@ -4570,7 +4565,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, * Skip populated array elements to determine if any pages need * to be allocated before disabling IRQs. */ - while (page_array && nr_populated < nr_pages && page_array[nr_populated]) + while (nr_populated < nr_pages && page_array[nr_populated]) nr_populated++; /* No pages requested? */ @@ -4578,7 +4573,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, goto out; /* Already populated array? */ - if (unlikely(page_array && nr_pages - nr_populated == 0)) + if (unlikely(nr_pages - nr_populated == 0)) goto out; /* Bulk allocator does not support memcg accounting. */ @@ -4660,7 +4655,7 @@ retry_this_zone: while (nr_populated < nr_pages) { /* Skip existing pages */ - if (page_array && page_array[nr_populated]) { + if (page_array[nr_populated]) { nr_populated++; continue; } @@ -4678,11 +4673,8 @@ retry_this_zone: nr_account++; prep_new_page(page, 0, gfp, 0); - if (page_list) - list_add(&page->lru, page_list); - else - page_array[nr_populated] = page; - nr_populated++; + set_page_refcounted(page); + page_array[nr_populated++] = page; } pcp_spin_unlock(pcp); @@ -4699,14 +4691,8 @@ failed_irq: failed: page = __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask); - if (page) { - if (page_list) - list_add(&page->lru, page_list); - else - page_array[nr_populated] = page; - nr_populated++; - } - + if (page) + page_array[nr_populated++] = page; goto out; } EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof); @@ -4714,8 +4700,8 @@ EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof); /* * This is the 'heart' of the zoned buddy allocator. 
*/ -struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, - int preferred_nid, nodemask_t *nodemask) +struct page *__alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order, + int preferred_nid, nodemask_t *nodemask) { struct page *page; unsigned int alloc_flags = ALLOC_WMARK_LOW; @@ -4768,7 +4754,7 @@ struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, out: if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page && unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) { - __free_pages(page, order); + free_frozen_pages(page, order); page = NULL; } @@ -4777,6 +4763,18 @@ out: return page; } +EXPORT_SYMBOL(__alloc_frozen_pages_noprof); + +struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, + int preferred_nid, nodemask_t *nodemask) +{ + struct page *page; + + page = __alloc_frozen_pages_noprof(gfp, order, preferred_nid, nodemask); + if (page) + set_page_refcounted(page); + return page; +} EXPORT_SYMBOL(__alloc_pages_noprof); struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid, @@ -4837,11 +4835,11 @@ void __free_pages(struct page *page, unsigned int order) struct alloc_tag *tag = pgalloc_tag_get(page); if (put_page_testzero(page)) - free_unref_page(page, order); + free_frozen_pages(page, order); else if (!head) { pgalloc_tag_sub_pages(tag, (1 << order) - 1); while (order-- > 0) - free_unref_page(page + (1 << order), order); + free_frozen_pages(page + (1 << order), order); } } EXPORT_SYMBOL(__free_pages); @@ -6270,9 +6268,8 @@ static void alloc_contig_dump_pages(struct list_head *page_list) * @migratetype: using migratetype to filter the type of migration in * trace_mm_alloc_contig_migrate_range_info. */ -int __alloc_contig_migrate_range(struct compact_control *cc, - unsigned long start, unsigned long end, - int migratetype) +static int __alloc_contig_migrate_range(struct compact_control *cc, + unsigned long start, unsigned long end, int migratetype) { /* This function is based on compact_zone() from compaction.c. */ unsigned int nr_reclaimed; @@ -6281,7 +6278,7 @@ int __alloc_contig_migrate_range(struct compact_control *cc, int ret = 0; struct migration_target_control mtc = { .nid = zone_to_nid(cc->zone), - .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + .gfp_mask = cc->gfp_mask, .reason = MR_CONTIG_RANGE, }; struct page *page; @@ -6351,7 +6348,7 @@ int __alloc_contig_migrate_range(struct compact_control *cc, return (ret < 0) ? ret : 0; } -static void split_free_pages(struct list_head *list) +static void split_free_pages(struct list_head *list, gfp_t gfp_mask) { int order; @@ -6362,7 +6359,8 @@ static void split_free_pages(struct list_head *list) list_for_each_entry_safe(page, next, &list[order], lru) { int i; - post_alloc_hook(page, order, __GFP_MOVABLE); + post_alloc_hook(page, order, gfp_mask); + set_page_refcounted(page); if (!order) continue; @@ -6376,6 +6374,40 @@ static void split_free_pages(struct list_head *list) } } +static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask) +{ + const gfp_t reclaim_mask = __GFP_IO | __GFP_FS | __GFP_RECLAIM; + const gfp_t action_mask = __GFP_COMP | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | + __GFP_ZERO | __GFP_ZEROTAGS | __GFP_SKIP_ZERO; + const gfp_t cc_action_mask = __GFP_RETRY_MAYFAIL | __GFP_NOWARN; + + /* + * We are given the range to allocate; node, mobility and placement + * hints are irrelevant at this point. We'll simply ignore them. 
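+ * The caller has already fixed the exact PFN range, so there is + * nothing left for such hints to steer.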
+ */ + gfp_mask &= ~(GFP_ZONEMASK | __GFP_RECLAIMABLE | __GFP_WRITE | + __GFP_HARDWALL | __GFP_THISNODE | __GFP_MOVABLE); + + /* + * We only support most reclaim flags (but not NOFAIL/NORETRY), and + * selected action flags. + */ + if (gfp_mask & ~(reclaim_mask | action_mask)) + return -EINVAL; + + /* + * Flags to control page compaction/migration/reclaim, to free up our + * page range. Migratable pages are movable, __GFP_MOVABLE is implied + * for them. + * + * Traditionally we always had __GFP_RETRY_MAYFAIL set, keep doing that + * to not degrade callers. + */ + *gfp_cc_mask = (gfp_mask & (reclaim_mask | cc_action_mask)) | + __GFP_MOVABLE | __GFP_RETRY_MAYFAIL; + return 0; +} + /** * alloc_contig_range() -- tries to allocate given range of pages * @start: start PFN to allocate @@ -6384,7 +6416,9 @@ static void split_free_pages(struct list_head *list) * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks * in range must have the same migratetype and it must * be either of the two. - * @gfp_mask: GFP mask to use during compaction + * @gfp_mask: GFP mask. Node/zone/placement hints are ignored; only some + * action and reclaim modifiers are supported. Reclaim modifiers + * control allocation behavior during compaction/migration/reclaim. * * The PFN range does not have to be pageblock aligned. The PFN range must * belong to a single zone. @@ -6410,11 +6444,14 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, .mode = MIGRATE_SYNC, .ignore_skip_hint = true, .no_set_skip_hint = true, - .gfp_mask = current_gfp_context(gfp_mask), .alloc_contig = true, }; INIT_LIST_HEAD(&cc.migratepages); + gfp_mask = current_gfp_context(gfp_mask); + if (__alloc_contig_verify_gfp_mask(gfp_mask, (gfp_t *)&cc.gfp_mask)) + return -EINVAL; + /* * What we do here is we mark all pageblocks in range as * MIGRATE_ISOLATE. Because pageblock and max order pages may @@ -6436,7 +6473,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, * put back to page allocator so that buddy can use them. */ - ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask); + ret = start_isolate_page_range(start, end, migratetype, 0); if (ret) goto done; @@ -6455,7 +6492,17 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, ret = __alloc_contig_migrate_range(&cc, start, end, migratetype); if (ret && ret != -EBUSY) goto done; - ret = 0; + + /* + * When in-use hugetlb pages are migrated, they may simply be released + * back into the free hugepage pool instead of being returned to the + * buddy system. After the migration of in-use huge pages is completed, + * we will invoke replace_free_hugepage_folios() to ensure that these + * hugepages are properly released to the buddy system. 
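+ * This also keeps free hugetlb folios from pinning PFNs inside the + * requested range.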
+ */ + ret = replace_free_hugepage_folios(start, end); + if (ret) + goto done; /* * Pages from [start, end) are within a pageblock_nr_pages @@ -6489,7 +6536,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, } if (!(gfp_mask & __GFP_COMP)) { - split_free_pages(cc.freepages); + split_free_pages(cc.freepages, gfp_mask); /* Free head and tail (if any) */ if (start != outer_start) @@ -6502,6 +6549,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, check_new_pages(head, order); prep_new_page(head, order, gfp_mask, 0); + set_page_refcounted(head); } else { ret = -EINVAL; WARN(true, "PFN range: requested [%lu, %lu), allocated [%lu, %lu)\n", @@ -6556,7 +6604,9 @@ static bool zone_spans_last_pfn(const struct zone *zone, /** * alloc_contig_pages() -- tries to find and allocate contiguous range of pages * @nr_pages: Number of contiguous pages to allocate - * @gfp_mask: GFP mask to limit search and used during compaction + * @gfp_mask: GFP mask. Node/zone/placement hints limit the search; only some + * action and reclaim modifiers are supported. Reclaim modifiers + * control allocation behavior during compaction/migration/reclaim. * @nid: Target node * @nodemask: Mask for other possible nodes * diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c index 3f7a203d35c6..d2423f30577e 100644 --- a/mm/page_frag_cache.c +++ b/mm/page_frag_cache.c @@ -86,7 +86,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count) VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); if (page_ref_sub_and_test(page, count)) - free_unref_page(page, compound_order(page)); + free_frozen_pages(page, compound_order(page)); } EXPORT_SYMBOL(__page_frag_cache_drain); @@ -138,7 +138,7 @@ refill: goto refill; if (unlikely(encoded_page_decode_pfmemalloc(encoded_page))) { - free_unref_page(page, + free_frozen_pages(page, encoded_page_decode_order(encoded_page)); goto refill; } @@ -166,6 +166,6 @@ void page_frag_free(void *addr) struct page *page = virt_to_head_page(addr); if (unlikely(put_page_testzero(page))) - free_unref_page(page, compound_order(page)); + free_frozen_pages(page, compound_order(page)); } EXPORT_SYMBOL(page_frag_free); diff --git a/mm/page_idle.c b/mm/page_idle.c index 41ea77f22011..947c7c7a3728 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -112,7 +112,7 @@ static void page_idle_clear_pte_refs(struct folio *folio) } static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, char *buf, + const struct bin_attribute *attr, char *buf, loff_t pos, size_t count) { u64 *out = (u64 *)buf; @@ -157,7 +157,7 @@ static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj, } static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, char *buf, + const struct bin_attribute *attr, char *buf, loff_t pos, size_t count) { const u64 *in = (u64 *)buf; @@ -193,17 +193,17 @@ static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj, return (char *)in - buf; } -static struct bin_attribute page_idle_bitmap_attr = +static const struct bin_attribute page_idle_bitmap_attr = __BIN_ATTR(bitmap, 0600, page_idle_bitmap_read, page_idle_bitmap_write, 0); -static struct bin_attribute *page_idle_bin_attrs[] = { +static const struct bin_attribute *const page_idle_bin_attrs[] = { &page_idle_bitmap_attr, NULL, }; static const struct attribute_group page_idle_attr_group = { - .bin_attrs = page_idle_bin_attrs, + .bin_attrs_new = page_idle_bin_attrs, .name = 
"page_idle", }; diff --git a/mm/page_io.c b/mm/page_io.c index 4b4ea8e49cf6..9b983de351f9 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -163,7 +163,6 @@ reprobe: page_no = 1; /* force Empty message */ sis->max = page_no; sis->pages = page_no - 1; - sis->highest_bit = page_no - 1; out: return ret; bad_bmap: diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 7e04047977cf..c608e9d72865 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -286,7 +286,6 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * within a free or in-use page. * @boundary_pfn: pageblock-aligned pfn that a page might cross * @flags: isolation flags - * @gfp_flags: GFP flags used for migrating pages * @isolate_before: isolate the pageblock before the boundary_pfn * @skip_isolation: the flag to skip the pageblock isolation in second * isolate_single_pageblock() @@ -306,8 +305,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * the in-use page then splitting the free page. */ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, - gfp_t gfp_flags, bool isolate_before, bool skip_isolation, - int migratetype) + bool isolate_before, bool skip_isolation, int migratetype) { unsigned long start_pfn; unsigned long isolate_pageblock; @@ -444,8 +442,6 @@ failed: * and PageOffline() pages. * REPORT_FAILURE - report details about the failure to * isolate the range - * @gfp_flags: GFP flags used for migrating pages that sit across the - * range boundaries. * * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in * the range will never be allocated. Any free pages and pages freed in the @@ -478,7 +474,7 @@ failed: * Return: 0 on success and -EBUSY if any part of range cannot be isolated. */ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, - int migratetype, int flags, gfp_t gfp_flags) + int migratetype, int flags) { unsigned long pfn; struct page *page; @@ -489,7 +485,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, bool skip_isolation = false; /* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */ - ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false, + ret = isolate_single_pageblock(isolate_start, flags, false, skip_isolation, migratetype); if (ret) return ret; @@ -498,7 +494,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, skip_isolation = true; /* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */ - ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true, + ret = isolate_single_pageblock(isolate_end, flags, true, skip_isolation, migratetype); if (ret) { unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype); diff --git a/mm/percpu.c b/mm/percpu.c index d8dd31a2e407..ac61e3fc5f15 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1359,10 +1359,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, /* allocate chunk */ alloc_size = struct_size(chunk, populated, BITS_TO_LONGS(region_size >> PAGE_SHIFT)); - chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!chunk) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + chunk = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); INIT_LIST_HEAD(&chunk->list); @@ -1374,24 +1371,14 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, region_bits = pcpu_chunk_map_bits(chunk); alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]); - 
chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!chunk->alloc_map) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + chunk->alloc_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]); - chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!chunk->bound_map) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + chunk->bound_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]); - chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!chunk->md_blocks) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); - + chunk->md_blocks = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); #ifdef NEED_PCPUOBJ_EXT /* first chunk is free to use */ chunk->obj_exts = NULL; @@ -2595,28 +2582,16 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* process group information and build config tables accordingly */ alloc_size = ai->nr_groups * sizeof(group_offsets[0]); - group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!group_offsets) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + group_offsets = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = ai->nr_groups * sizeof(group_sizes[0]); - group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!group_sizes) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + group_sizes = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = nr_cpu_ids * sizeof(unit_map[0]); - unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!unit_map) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + unit_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = nr_cpu_ids * sizeof(unit_off[0]); - unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!unit_off) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + unit_off = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = UINT_MAX; @@ -2685,12 +2660,9 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_free_slot = pcpu_sidelined_slot + 1; pcpu_to_depopulate_slot = pcpu_free_slot + 1; pcpu_nr_slots = pcpu_to_depopulate_slot + 1; - pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots * + pcpu_chunk_lists = memblock_alloc_or_panic(pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]), SMP_CACHE_BYTES); - if (!pcpu_chunk_lists) - panic("%s: Failed to allocate %zu bytes\n", __func__, - pcpu_nr_slots * sizeof(pcpu_chunk_lists[0])); for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_chunk_lists[i]); @@ -3155,25 +3127,19 @@ void __init __weak pcpu_populate_pte(unsigned long addr) pmd_t *pmd; if (pgd_none(*pgd)) { - p4d = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE); - if (!p4d) - goto err_alloc; + p4d = memblock_alloc_or_panic(P4D_TABLE_SIZE, P4D_TABLE_SIZE); pgd_populate(&init_mm, pgd, p4d); } p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) { - pud = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE); - if (!pud) - goto err_alloc; + pud = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE); p4d_populate(&init_mm, p4d, pud); } pud = pud_offset(p4d, addr); if (pud_none(*pud)) { - pmd = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE); - if (!pmd) - goto err_alloc; + pmd = 
memblock_alloc_or_panic(PMD_TABLE_SIZE, PMD_TABLE_SIZE); pud_populate(&init_mm, pud, pmd); } @@ -3181,16 +3147,11 @@ void __init __weak pcpu_populate_pte(unsigned long addr) if (!pmd_present(*pmd)) { pte_t *new; - new = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE); - if (!new) - goto err_alloc; + new = memblock_alloc_or_panic(PTE_TABLE_SIZE, PTE_TABLE_SIZE); pmd_populate_kernel(&init_mm, pmd, new); } return; - -err_alloc: - panic("%s: Failed to allocate memory\n", __func__); } /** @@ -3237,10 +3198,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t /* unaligned allocations can't be freed, round up to page size */ pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * sizeof(pages[0])); - pages = memblock_alloc(pages_size, SMP_CACHE_BYTES); - if (!pages) - panic("%s: Failed to allocate %zu bytes\n", __func__, - pages_size); + pages = memblock_alloc_or_panic(pages_size, SMP_CACHE_BYTES); /* allocate pages */ j = 0; diff --git a/mm/pt_reclaim.c b/mm/pt_reclaim.c new file mode 100644 index 000000000000..7e9455a18aae --- /dev/null +++ b/mm/pt_reclaim.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#include "internal.h" + +bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, + struct zap_details *details) +{ + return details && details->reclaim_pt && (end - start >= PMD_SIZE); +} + +bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval) +{ + spinlock_t *pml = pmd_lockptr(mm, pmd); + + if (!spin_trylock(pml)) + return false; + + *pmdval = pmdp_get_lockless(pmd); + pmd_clear(pmd); + spin_unlock(pml); + + return true; +} + +void free_pte(struct mm_struct *mm, unsigned long addr, struct mmu_gather *tlb, + pmd_t pmdval) +{ + pte_free_tlb(tlb, pmd_pgtable(pmdval), addr); + mm_dec_nr_ptes(mm); +} + +void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, + struct mmu_gather *tlb) +{ + pmd_t pmdval; + spinlock_t *pml, *ptl = NULL; + pte_t *start_pte, *pte; + int i; + + pml = pmd_lock(mm, pmd); + start_pte = pte_offset_map_rw_nolock(mm, pmd, addr, &pmdval, &ptl); + if (!start_pte) + goto out_ptl; + if (ptl != pml) + spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); + + /* Check if it is empty PTE page */ + for (i = 0, pte = start_pte; i < PTRS_PER_PTE; i++, pte++) { + if (!pte_none(ptep_get(pte))) + goto out_ptl; + } + pte_unmap(start_pte); + + pmd_clear(pmd); + + if (ptl != pml) + spin_unlock(ptl); + spin_unlock(pml); + + free_pte(mm, addr, tlb, pmdval); + + return; +out_ptl: + if (start_pte) + pte_unmap_unlock(start_pte, ptl); + if (ptl != pml) + spin_unlock(pml); +} diff --git a/mm/readahead.c b/mm/readahead.c index e151f4b13ca4..6a4e96b69702 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -158,20 +158,10 @@ static void read_pages(struct readahead_control *rac) if (aops->readahead) { aops->readahead(rac); - /* - * Clean up the remaining folios. The sizes in ->ra - * may be used to size the next readahead, so make sure - * they accurately reflect what happened. - */ + /* Clean up the remaining folios. 
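Folios that ->readahead() did not consume are dropped from the page cache and unlocked here.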
*/ while ((folio = readahead_folio(rac)) != NULL) { - unsigned long nr = folio_nr_pages(folio); - folio_get(folio); - rac->ra->size -= nr; - if (rac->ra->async_size >= nr) { - rac->ra->async_size -= nr; - filemap_remove_folio(folio); - } + filemap_remove_folio(folio); folio_unlock(folio); folio_put(folio); } @@ -188,6 +178,18 @@ static void read_pages(struct readahead_control *rac) BUG_ON(readahead_count(rac)); } +static struct folio *ractl_alloc_folio(struct readahead_control *ractl, + gfp_t gfp_mask, unsigned int order) +{ + struct folio *folio; + + folio = filemap_alloc_folio(gfp_mask, order); + if (folio && ractl->dropbehind) + __folio_set_dropbehind(folio); + + return folio; +} + /** * page_cache_ra_unbounded - Start unchecked readahead. * @ractl: Readahead control. @@ -265,8 +267,8 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, continue; } - folio = filemap_alloc_folio(gfp_mask, - mapping_min_folio_order(mapping)); + folio = ractl_alloc_folio(ractl, gfp_mask, + mapping_min_folio_order(mapping)); if (!folio) break; @@ -436,7 +438,7 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, pgoff_t mark, unsigned int order, gfp_t gfp) { int err; - struct folio *folio = filemap_alloc_folio(gfp, order); + struct folio *folio = ractl_alloc_folio(ractl, gfp, order); if (!folio) return -ENOMEM; @@ -458,7 +460,8 @@ void page_cache_ra_order(struct readahead_control *ractl, struct file_ra_state *ra, unsigned int new_order) { struct address_space *mapping = ractl->mapping; - pgoff_t index = readahead_index(ractl); + pgoff_t start = readahead_index(ractl); + pgoff_t index = start; unsigned int min_order = mapping_min_folio_order(mapping); pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT; pgoff_t mark = index + ra->size - ra->async_size; @@ -516,12 +519,18 @@ void page_cache_ra_order(struct readahead_control *ractl, /* * If there were already pages in the page cache, then we may have * left some gaps. Let the regular readahead code take care of this - * situation. + * situation below. */ if (!err) return; fallback: - do_page_cache_ra(ractl, ra->size, ra->async_size); + /* + * ->readahead() may have updated readahead window size so we have to + * check there's still something to read. 
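+ * (index - start) pages have already been submitted in the loop + * above, so only the remainder needs to be read.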
+ */ + if (ra->size > index - start) + do_page_cache_ra(ractl, ra->size - (index - start), + ra->async_size); } static unsigned long ractl_max_pages(struct readahead_control *ractl, @@ -754,7 +763,7 @@ void readahead_expand(struct readahead_control *ractl, if (folio && !xa_is_value(folio)) return; /* Folio apparently present */ - folio = filemap_alloc_folio(gfp_mask, min_order); + folio = ractl_alloc_folio(ractl, gfp_mask, min_order); if (!folio) return; @@ -783,7 +792,7 @@ void readahead_expand(struct readahead_control *ractl, if (folio && !xa_is_value(folio)) return; /* Folio apparently present */ - folio = filemap_alloc_folio(gfp_mask, min_order); + folio = ractl_alloc_folio(ractl, gfp_mask, min_order); if (!folio) return; diff --git a/mm/rodata_test.c b/mm/rodata_test.c index 6d783436951f..e7173fcd210c 100644 --- a/mm/rodata_test.c +++ b/mm/rodata_test.c @@ -12,7 +12,8 @@ #include #include -static const int rodata_test_data = 0xC3; +#define TEST_VALUE 0xC3 +static const int rodata_test_data = TEST_VALUE; void rodata_test(void) { @@ -20,7 +21,7 @@ void rodata_test(void) /* test 1: read the value */ /* If this test fails, some previous testrun has clobbered the state */ - if (!rodata_test_data) { + if (unlikely(READ_ONCE(rodata_test_data) != TEST_VALUE)) { pr_err("test 1 fails (start data)\n"); return; } @@ -33,7 +34,7 @@ void rodata_test(void) } /* test 3: check the value hasn't changed */ - if (rodata_test_data == zero) { + if (unlikely(READ_ONCE(rodata_test_data) != TEST_VALUE)) { pr_err("test data was changed\n"); return; } diff --git a/mm/shmem.c b/mm/shmem.c index fdb5afa1cfe9..44379bee5b96 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -553,38 +553,105 @@ static bool shmem_confirm_swap(struct address_space *mapping, /* ifdef here to avoid bloating shmem.o when not necessary */ static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; +static int tmpfs_huge __read_mostly = SHMEM_HUGE_NEVER; -static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, - loff_t write_end, bool shmem_huge_force, - unsigned long vm_flags) +/** + * shmem_mapping_size_orders - Get allowable folio orders for the given file size. + * @mapping: Target address_space. + * @index: The page index. + * @write_end: end of a write, could extend inode size. + * + * This returns huge orders for folios (when supported) based on the file size + * which the mapping currently allows at the given index. The index is relevant + * due to alignment considerations the mapping might have. The returned order + * may be less than the size passed. + * + * Return: The orders. + */ +static inline unsigned int +shmem_mapping_size_orders(struct address_space *mapping, pgoff_t index, loff_t write_end) { + unsigned int order; + size_t size; + + if (!mapping_large_folio_support(mapping) || !write_end) + return 0; + + /* Calculate the write size based on the write_end */ + size = write_end - (index << PAGE_SHIFT); + order = filemap_get_order(size); + if (!order) + return 0; + + /* If we're not aligned, allocate a smaller folio */ + if (index & ((1UL << order) - 1)) + order = __ffs(index); + + order = min_t(size_t, order, MAX_PAGECACHE_ORDER); + return order > 0 ? BIT(order + 1) - 1 : 0; +} + +static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, + loff_t write_end, bool shmem_huge_force, + struct vm_area_struct *vma, + unsigned long vm_flags) +{ + unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ? 
+ 0 : BIT(HPAGE_PMD_ORDER); + unsigned long within_size_orders; + unsigned int order; + pgoff_t aligned_index; loff_t i_size; - if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER) - return false; if (!S_ISREG(inode->i_mode)) - return false; + return 0; if (shmem_huge == SHMEM_HUGE_DENY) - return false; + return 0; if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE) - return true; + return maybe_pmd_order; + /* + * The huge order allocation for anon shmem is controlled through + * the mTHP interface, so we still use PMD-sized huge order to + * check whether global control is enabled. + * + * For tmpfs mmap()'s huge order, we still use PMD-sized order to + * allocate huge pages due to lack of a write size hint. + * + * Otherwise, tmpfs will get a highest order hint based on the size + * of the write and fallocate paths, and then try each allowable + * huge order. + */ switch (SHMEM_SB(inode->i_sb)->huge) { case SHMEM_HUGE_ALWAYS: - return true; + if (vma) + return maybe_pmd_order; + + return shmem_mapping_size_orders(inode->i_mapping, index, write_end); case SHMEM_HUGE_WITHIN_SIZE: - index = round_up(index + 1, HPAGE_PMD_NR); - i_size = max(write_end, i_size_read(inode)); - i_size = round_up(i_size, PAGE_SIZE); - if (i_size >> PAGE_SHIFT >= index) - return true; + if (vma) + within_size_orders = maybe_pmd_order; + else + within_size_orders = shmem_mapping_size_orders(inode->i_mapping, + index, write_end); + + order = highest_order(within_size_orders); + while (within_size_orders) { + aligned_index = round_up(index + 1, 1 << order); + i_size = max(write_end, i_size_read(inode)); + i_size = round_up(i_size, PAGE_SIZE); + if (i_size >> PAGE_SHIFT >= aligned_index) + return within_size_orders; + + order = next_order(&within_size_orders, order); + } fallthrough; case SHMEM_HUGE_ADVISE: if (vm_flags & VM_HUGEPAGE) - return true; + return maybe_pmd_order; fallthrough; default: - return false; + return 0; } } @@ -779,11 +846,12 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, return 0; } -static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, - loff_t write_end, bool shmem_huge_force, - unsigned long vm_flags) +static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, - loff_t write_end, bool shmem_huge_force, + loff_t write_end, bool shmem_huge_force, + struct vm_area_struct *vma, + unsigned long vm_flags) { - return false; + return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -1180,7 +1248,7 @@ static int shmem_getattr(struct mnt_idmap *idmap, STATX_ATTR_NODUMP); generic_fillattr(idmap, request_mask, inode, stat); - if (shmem_huge_global_enabled(inode, 0, 0, false, 0)) + if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0)) stat->blksize = HPAGE_PMD_SIZE; if (request_mask & STATX_BTIME) { @@ -1690,22 +1758,18 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); unsigned long vm_flags = vma ? vma->vm_flags : 0; pgoff_t aligned_index; - bool global_huge; + unsigned int global_orders; loff_t i_size; int order; if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags))) return 0; - global_huge = shmem_huge_global_enabled(inode, index, write_end, - shmem_huge_force, vm_flags); - if (!vma || !vma_is_anon_shmem(vma)) { - /* - * For tmpfs, we now only support PMD sized THP if huge page - * is enabled, otherwise fallback to order 0. - */ - return global_huge ? 
BIT(HPAGE_PMD_ORDER) : 0; - } + global_orders = shmem_huge_global_enabled(inode, index, write_end, + shmem_huge_force, vma, vm_flags); + /* Tmpfs huge pages allocation */ + if (!vma || !vma_is_anon_shmem(vma)) + return global_orders; /* * Following the 'deny' semantics of the top level, force the huge @@ -1737,7 +1801,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, if (vm_flags & VM_HUGEPAGE) mask |= READ_ONCE(huge_shmem_orders_madvise); - if (global_huge) + if (global_orders > 0) mask |= READ_ONCE(huge_shmem_orders_inherit); return THP_ORDERS_ALL_FILE_DEFAULT & mask; @@ -1903,6 +1967,65 @@ unlock: return ERR_PTR(error); } +static struct folio *shmem_swap_alloc_folio(struct inode *inode, + struct vm_area_struct *vma, pgoff_t index, + swp_entry_t entry, int order, gfp_t gfp) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + struct folio *new; + void *shadow; + int nr_pages; + + /* + * We have arrived here because our zones are constrained, so don't + * limit chance of success with further cpuset and node constraints. + */ + gfp &= ~GFP_CONSTRAINT_MASK; + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && order > 0) { + gfp_t huge_gfp = vma_thp_gfp_mask(vma); + + gfp = limit_gfp_mask(huge_gfp, gfp); + } + + new = shmem_alloc_folio(gfp, order, info, index); + if (!new) + return ERR_PTR(-ENOMEM); + + nr_pages = folio_nr_pages(new); + if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL, + gfp, entry)) { + folio_put(new); + return ERR_PTR(-ENOMEM); + } + + /* + * Prevent parallel swapin from proceeding with the swap cache flag. + * + * Of course there is another possible concurrent scenario as well, + * that is to say, the swap cache flag of a large folio has already + * been set by swapcache_prepare(), while another thread may have + * already split the large swap entry stored in the shmem mapping. + * In this case, shmem_add_to_page_cache() will help identify the + * concurrent swapin and return -EEXIST. + */ + if (swapcache_prepare(entry, nr_pages)) { + folio_put(new); + return ERR_PTR(-EEXIST); + } + + __folio_set_locked(new); + __folio_set_swapbacked(new); + new->swap = entry; + + mem_cgroup_swapin_uncharge_swap(entry, nr_pages); + shadow = get_shadow_from_swap_cache(entry); + if (shadow) + workingset_refault(new, shadow); + folio_add_lru(new); + swap_read_folio(new, NULL); + return new; +} + /* * When a page is moved from swapcache to shmem filecache (either by the * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of @@ -2006,7 +2129,8 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, } static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, - struct folio *folio, swp_entry_t swap) + struct folio *folio, swp_entry_t swap, + bool skip_swapcache) { struct address_space *mapping = inode->i_mapping; swp_entry_t swapin_error; @@ -2022,7 +2146,8 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, nr_pages = folio_nr_pages(folio); folio_wait_writeback(folio); - delete_from_swap_cache(folio); + if (!skip_swapcache) + delete_from_swap_cache(folio); /* * Don't treat swapin error folio as alloced. 
Otherwise inode->i_blocks * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks) @@ -2126,6 +2251,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct shmem_inode_info *info = SHMEM_I(inode); struct swap_info_struct *si; struct folio *folio = NULL; + bool skip_swapcache = false; swp_entry_t swap; int error, nr_pages; @@ -2147,6 +2273,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, /* Look it up and read it in.. */ folio = swap_cache_get_folio(swap, NULL, 0); if (!folio) { + int order = xa_get_order(&mapping->i_pages, index); + bool fallback_order0 = false; int split_order; /* Or update major stats only when swapin succeeds?? */ @@ -2156,6 +2284,33 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, count_memcg_event_mm(fault_mm, PGMAJFAULT); } + /* + * If uffd is active for the vma, we need per-page fault + * fidelity to maintain the uffd semantics, then fallback + * to swapin order-0 folio, as well as for zswap case. + */ + if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) || + !zswap_never_enabled())) + fallback_order0 = true; + + /* Skip swapcache for synchronous device. */ + if (!fallback_order0 && data_race(si->flags & SWP_SYNCHRONOUS_IO)) { + folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp); + if (!IS_ERR(folio)) { + skip_swapcache = true; + goto alloced; + } + + /* + * Fallback to swapin order-0 folio unless the swap entry + * already exists. + */ + error = PTR_ERR(folio); + folio = NULL; + if (error == -EEXIST) + goto failed; + } + /* * Now swap device can only swap in order 0 folio, then we * should split the large swap entry stored in the pagecache @@ -2186,9 +2341,10 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, } } +alloced: /* We have to do this with folio locked to prevent races */ folio_lock(folio); - if (!folio_test_swapcache(folio) || + if ((!skip_swapcache && !folio_test_swapcache(folio)) || folio->swap.val != swap.val || !shmem_confirm_swap(mapping, index, swap)) { error = -EEXIST; @@ -2224,7 +2380,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, if (sgp == SGP_WRITE) folio_mark_accessed(folio); - delete_from_swap_cache(folio); + if (skip_swapcache) { + folio->swap.val = 0; + swapcache_clear(si, swap, nr_pages); + } else { + delete_from_swap_cache(folio); + } folio_mark_dirty(folio); swap_free_nr(swap, nr_pages); put_swap_device(si); @@ -2235,8 +2396,11 @@ failed: if (!shmem_confirm_swap(mapping, index, swap)) error = -EEXIST; if (error == -EIO) - shmem_set_folio_swapin_error(inode, index, folio, swap); + shmem_set_folio_swapin_error(inode, index, folio, swap, + skip_swapcache); unlock: + if (skip_swapcache) + swapcache_clear(si, swap, folio_nr_pages(folio)); if (folio) { folio_unlock(folio); folio_put(folio); @@ -2752,12 +2916,6 @@ out_nomem: static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); - struct shmem_inode_info *info = SHMEM_I(inode); - int ret; - - ret = seal_check_write(info->seals, vma); - if (ret) - return ret; file_accessed(file); /* This is anonymous shared memory if it is unlinked at the time of mmap */ @@ -4583,48 +4741,37 @@ bad_value: return invalfc(fc, "Bad value for '%s'", param->key); } -static int shmem_parse_options(struct fs_context *fc, void *data) +static char *shmem_next_opt(char **s) { - char *options = data; + char *sbegin = *s; + char *p; - if (options) { - int err = security_sb_eat_lsm_opts(options, &fc->security); - if (err) - 
return err; - } + if (sbegin == NULL) + return NULL; - while (options != NULL) { - char *this_char = options; - for (;;) { - /* - * NUL-terminate this option: unfortunately, - * mount options form a comma-separated list, - * but mpol's nodelist may also contain commas. - */ - options = strchr(options, ','); - if (options == NULL) - break; - options++; - if (!isdigit(*options)) { - options[-1] = '\0'; - break; - } - } - if (*this_char) { - char *value = strchr(this_char, '='); - size_t len = 0; - int err; - - if (value) { - *value++ = '\0'; - len = strlen(value); - } - err = vfs_parse_fs_string(fc, this_char, value, len); - if (err < 0) - return err; + /* + * NUL-terminate this option: unfortunately, + * mount options form a comma-separated list, + * but mpol's nodelist may also contain commas. + */ + for (;;) { + p = strchr(*s, ','); + if (p == NULL) + break; + *s = p + 1; + if (!isdigit(*(p+1))) { + *p = '\0'; + return sbegin; } } - return 0; + + *s = NULL; + return sbegin; +} + +static int shmem_parse_monolithic(struct fs_context *fc, void *data) +{ + return vfs_parse_monolithic_sep(fc, data, shmem_next_opt); } /* @@ -4891,7 +5038,12 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) sbinfo->gid = ctx->gid; sbinfo->full_inums = ctx->full_inums; sbinfo->mode = ctx->mode; - sbinfo->huge = ctx->huge; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (ctx->seen & SHMEM_SEEN_HUGE) + sbinfo->huge = ctx->huge; + else + sbinfo->huge = tmpfs_huge; +#endif sbinfo->mpol = ctx->mpol; ctx->mpol = NULL; @@ -4969,7 +5121,7 @@ static const struct fs_context_operations shmem_fs_context_ops = { .free = shmem_free_fc, .get_tree = shmem_get_tree, #ifdef CONFIG_TMPFS - .parse_monolithic = shmem_parse_options, + .parse_monolithic = shmem_parse_monolithic, .parse_param = shmem_parse_one, .reconfigure = shmem_reconfigure, #endif @@ -5442,6 +5594,21 @@ static int __init setup_transparent_hugepage_shmem(char *str) } __setup("transparent_hugepage_shmem=", setup_transparent_hugepage_shmem); +static int __init setup_transparent_hugepage_tmpfs(char *str) +{ + int huge; + + huge = shmem_parse_huge(str); + if (huge < 0) { + pr_warn("transparent_hugepage_tmpfs= cannot parse, ignored\n"); + return huge; + } + + tmpfs_huge = huge; + return 1; +} +__setup("transparent_hugepage_tmpfs=", setup_transparent_hugepage_tmpfs); + static char str_dup[PAGE_SIZE] __initdata; static int __init setup_thp_shmem(char *str) { diff --git a/mm/slub.c b/mm/slub.c index c2151c9fee22..996691c137eb 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2311,7 +2311,7 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init, * We have to do this manually because the rcu_head is * not located inside the object. 
*/ - kasan_record_aux_stack_noalloc(x); + kasan_record_aux_stack(x); delayed_free->object = x; call_rcu(&delayed_free->head, slab_free_after_rcu_debug); @@ -2420,17 +2420,15 @@ static inline struct slab *alloc_slab_page(gfp_t flags, int node, unsigned int order = oo_order(oo); if (node == NUMA_NO_NODE) - folio = (struct folio *)alloc_pages(flags, order); + folio = (struct folio *)alloc_frozen_pages(flags, order); else - folio = (struct folio *)__alloc_pages_node(node, flags, order); + folio = (struct folio *)__alloc_frozen_pages(flags, order, node, NULL); if (!folio) return NULL; slab = folio_slab(folio); __folio_set_slab(folio); - /* Make the flag visible before any changes to folio->mapping */ - smp_wmb(); if (folio_is_pfmemalloc(folio)) slab_set_pfmemalloc(slab); @@ -2651,12 +2649,10 @@ static void __free_slab(struct kmem_cache *s, struct slab *slab) __slab_clear_pfmemalloc(slab); folio->mapping = NULL; - /* Make the mapping reset visible before clearing the flag */ - smp_wmb(); __folio_clear_slab(folio); mm_account_reclaimed_pages(pages); unaccount_slab(slab, order, s); - __free_pages(&folio->page, order); + free_frozen_pages(&folio->page, order); } static void rcu_free_slab(struct rcu_head *h) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index cec67c5f37d8..3287ebadd167 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -31,6 +31,8 @@ #include #include +#include "internal.h" + /* * Allocate a block of memory to be used to back the virtual memory map * or to back the page tables that are used to create the mapping. @@ -42,8 +44,7 @@ static void * __ref __earlyonly_bootmem_alloc(int node, unsigned long align, unsigned long goal) { - return memblock_alloc_try_nid_raw(size, align, goal, - MEMBLOCK_ALLOC_ACCESSIBLE, node); + return memmap_alloc(size, align, goal, node, false); } void * __meminit vmemmap_alloc_block(unsigned long size, int node) diff --git a/mm/sparse.c b/mm/sparse.c index 13b6624d3562..133b033d0cba 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -257,10 +257,7 @@ static void __init memblocks_present(void) size = sizeof(struct mem_section *) * NR_SECTION_ROOTS; align = 1 << (INTERNODE_CACHE_SHIFT); - mem_section = memblock_alloc(size, align); - if (!mem_section) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, size, align); + mem_section = memblock_alloc_or_panic(size, align); } #endif diff --git a/mm/swap.c b/mm/swap.c index 10decd9dffa1..fc8281ef4241 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -109,7 +109,7 @@ void __folio_put(struct folio *folio) page_cache_release(folio); folio_unqueue_deferred_split(folio); mem_cgroup_uncharge(folio); - free_unref_page(&folio->page, folio_order(folio)); + free_frozen_pages(&folio->page, folio_order(folio)); } EXPORT_SYMBOL(__folio_put); @@ -379,37 +379,58 @@ static void __lru_cache_activate_folio(struct folio *folio) } #ifdef CONFIG_LRU_GEN -static void folio_inc_refs(struct folio *folio) + +static void lru_gen_inc_refs(struct folio *folio) { unsigned long new_flags, old_flags = READ_ONCE(folio->flags); if (folio_test_unevictable(folio)) return; + /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) { - folio_set_referenced(folio); + set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); return; } - if (!folio_test_workingset(folio)) { - folio_set_workingset(folio); - return; - } - - /* see the comment on MAX_NR_TIERS */ do { - new_flags = old_flags & LRU_REFS_MASK; - if (new_flags == LRU_REFS_MASK) - break; + if ((old_flags & LRU_REFS_MASK) == LRU_REFS_MASK) { + if 
(!folio_test_workingset(folio)) + folio_set_workingset(folio); + return; + } - new_flags += BIT(LRU_REFS_PGOFF); - new_flags |= old_flags & ~LRU_REFS_MASK; + new_flags = old_flags + BIT(LRU_REFS_PGOFF); } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); } -#else -static void folio_inc_refs(struct folio *folio) + +static bool lru_gen_clear_refs(struct folio *folio) +{ + struct lru_gen_folio *lrugen; + int gen = folio_lru_gen(folio); + int type = folio_is_file_lru(folio); + + if (gen < 0) + return true; + + set_mask_bits(&folio->flags, LRU_REFS_FLAGS | BIT(PG_workingset), 0); + + lrugen = &folio_lruvec(folio)->lrugen; + /* whether can do without shuffling under the LRU lock */ + return gen == lru_gen_from_seq(READ_ONCE(lrugen->min_seq[type])); +} + +#else /* !CONFIG_LRU_GEN */ + +static void lru_gen_inc_refs(struct folio *folio) { } + +static bool lru_gen_clear_refs(struct folio *folio) +{ + return false; +} + #endif /* CONFIG_LRU_GEN */ /** @@ -427,8 +448,10 @@ static void folio_inc_refs(struct folio *folio) */ void folio_mark_accessed(struct folio *folio) { + if (folio_test_dropbehind(folio)) + return; if (lru_gen_enabled()) { - folio_inc_refs(folio); + lru_gen_inc_refs(folio); return; } @@ -474,7 +497,7 @@ void folio_add_lru(struct folio *folio) folio_test_unevictable(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); - /* see the comment in lru_gen_add_folio() */ + /* see the comment in lru_gen_folio_seq() */ if (lru_gen_enabled() && !folio_test_unevictable(folio) && lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) folio_set_active(folio); @@ -524,7 +547,7 @@ void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma) */ static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio) { - bool active = folio_test_active(folio); + bool active = folio_test_active(folio) || lru_gen_enabled(); long nr_pages = folio_nr_pages(folio); if (folio_test_unevictable(folio)) @@ -589,7 +612,10 @@ static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio) lruvec_del_folio(lruvec, folio); folio_clear_active(folio); - folio_clear_referenced(folio); + if (lru_gen_enabled()) + lru_gen_clear_refs(folio); + else + folio_clear_referenced(folio); /* * Lazyfree folios are clean anonymous folios. They have * the swapbacked flag cleared, to distinguish them from normal @@ -657,6 +683,9 @@ void deactivate_file_folio(struct folio *folio) if (folio_test_unevictable(folio)) return; + if (lru_gen_enabled() && lru_gen_clear_refs(folio)) + return; + folio_batch_add_and_move(folio, lru_deactivate_file, true); } @@ -670,7 +699,10 @@ void deactivate_file_folio(struct folio *folio) */ void folio_deactivate(struct folio *folio) { - if (folio_test_unevictable(folio) || !(folio_test_active(folio) || lru_gen_enabled())) + if (folio_test_unevictable(folio)) + return; + + if (lru_gen_enabled() ? 
lru_gen_clear_refs(folio) : !folio_test_active(folio)) return; folio_batch_add_and_move(folio, lru_deactivate, true); diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index da1278f0563b..be39078f255b 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -6,149 +6,106 @@ #include /* depends on mm.h include */ static DEFINE_MUTEX(swap_cgroup_mutex); + +/* Pack two cgroup id (short) of two entries in one swap_cgroup (atomic_t) */ +#define ID_PER_SC (sizeof(struct swap_cgroup) / sizeof(unsigned short)) +#define ID_SHIFT (BITS_PER_TYPE(unsigned short)) +#define ID_MASK (BIT(ID_SHIFT) - 1) +struct swap_cgroup { + atomic_t ids; +}; + struct swap_cgroup_ctrl { - struct page **map; - unsigned long length; - spinlock_t lock; + struct swap_cgroup *map; }; static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; -struct swap_cgroup { - unsigned short id; -}; -#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) - -/* - * SwapCgroup implements "lookup" and "exchange" operations. - * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge - * against SwapCache. At swap_free(), this is accessed directly from swap. - * - * This means, - * - we have no race in "exchange" when we're accessed via SwapCache because - * SwapCache(and its swp_entry) is under lock. - * - When called via swap_free(), there is no user of this entry and no race. - * Then, we don't need lock around "exchange". - * - * TODO: we can push these buffers out to HIGHMEM. - */ - -/* - * allocate buffer for swap_cgroup. - */ -static int swap_cgroup_prepare(int type) +static unsigned short __swap_cgroup_id_lookup(struct swap_cgroup *map, + pgoff_t offset) { - struct page *page; - struct swap_cgroup_ctrl *ctrl; - unsigned long idx, max; + unsigned int shift = (offset % ID_PER_SC) * ID_SHIFT; + unsigned int old_ids = atomic_read(&map[offset / ID_PER_SC].ids); - ctrl = &swap_cgroup_ctrl[type]; + BUILD_BUG_ON(!is_power_of_2(ID_PER_SC)); + BUILD_BUG_ON(sizeof(struct swap_cgroup) != sizeof(atomic_t)); - for (idx = 0; idx < ctrl->length; idx++) { - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) - goto not_enough_page; - ctrl->map[idx] = page; - - if (!(idx % SWAP_CLUSTER_MAX)) - cond_resched(); - } - return 0; -not_enough_page: - max = idx; - for (idx = 0; idx < max; idx++) - __free_page(ctrl->map[idx]); - - return -ENOMEM; + return (old_ids >> shift) & ID_MASK; } -static struct swap_cgroup *__lookup_swap_cgroup(struct swap_cgroup_ctrl *ctrl, - pgoff_t offset) +static unsigned short __swap_cgroup_id_xchg(struct swap_cgroup *map, + pgoff_t offset, + unsigned short new_id) { - struct page *mappage; - struct swap_cgroup *sc; + unsigned short old_id; + struct swap_cgroup *sc = &map[offset / ID_PER_SC]; + unsigned int shift = (offset % ID_PER_SC) * ID_SHIFT; + unsigned int new_ids, old_ids = atomic_read(&sc->ids); - mappage = ctrl->map[offset / SC_PER_PAGE]; - sc = page_address(mappage); - return sc + offset % SC_PER_PAGE; -} + do { + old_id = (old_ids >> shift) & ID_MASK; + new_ids = (old_ids & ~(ID_MASK << shift)); + new_ids |= ((unsigned int)new_id) << shift; + } while (!atomic_try_cmpxchg(&sc->ids, &old_ids, new_ids)); -static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, - struct swap_cgroup_ctrl **ctrlp) -{ - pgoff_t offset = swp_offset(ent); - struct swap_cgroup_ctrl *ctrl; - - ctrl = &swap_cgroup_ctrl[swp_type(ent)]; - if (ctrlp) - *ctrlp = ctrl; - return __lookup_swap_cgroup(ctrl, offset); + return old_id; } /** - * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. 
- * @ent: swap entry to be cmpxchged - * @old: old id - * @new: new id + * swap_cgroup_record - record mem_cgroup for a set of swap entries. + * These entries must belong to one single folio, and that folio + * must be being charged for swap space (swap out), and these + * entries must not have been charged * - * Returns old id at success, 0 at failure. - * (There is no mem_cgroup using 0 as its id) + * @folio: the folio that the swap entry belongs to + * @ent: the first swap entry to be recorded */ -unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, - unsigned short old, unsigned short new) +void swap_cgroup_record(struct folio *folio, swp_entry_t ent) { - struct swap_cgroup_ctrl *ctrl; - struct swap_cgroup *sc; - unsigned long flags; - unsigned short retval; + unsigned int nr_ents = folio_nr_pages(folio); + struct swap_cgroup *map; + pgoff_t offset, end; + unsigned short old; - sc = lookup_swap_cgroup(ent, &ctrl); + offset = swp_offset(ent); + end = offset + nr_ents; + map = swap_cgroup_ctrl[swp_type(ent)].map; - spin_lock_irqsave(&ctrl->lock, flags); - retval = sc->id; - if (retval == old) - sc->id = new; - else - retval = 0; - spin_unlock_irqrestore(&ctrl->lock, flags); - return retval; + do { + old = __swap_cgroup_id_xchg(map, offset, + mem_cgroup_id(folio_memcg(folio))); + VM_BUG_ON(old); + } while (++offset != end); } /** - * swap_cgroup_record - record mem_cgroup for a set of swap entries + * swap_cgroup_clear - clear mem_cgroup for a set of swap entries. + * These entries must be being uncharged from swap. They either + * belong to one single folio in the swap cache (swap in for + * cgroup v1), or no longer have any users (slot freeing). + * * @ent: the first swap entry to be recorded into - * @id: mem_cgroup to be recorded * @nr_ents: number of swap entries to be recorded * - * Returns old value at success, 0 at failure. - * (Of course, old value can be 0.) + * Returns the existing old value.
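+ *
+ * Editor's aside: a minimal sketch of the id-packing exchange these
+ * helpers rely on, assuming two 16-bit ids per 32-bit atomic word
+ * (hypothetical name, illustration only):
+ *
+ *	static unsigned short id_xchg(atomic_t *word, unsigned int slot,
+ *				      unsigned short new_id)
+ *	{
+ *		unsigned int shift = (slot & 1) * BITS_PER_TYPE(unsigned short);
+ *		unsigned int old = atomic_read(word), val;
+ *
+ *		do {
+ *			val = old & ~(0xffffU << shift);
+ *			val |= (unsigned int)new_id << shift;
+ *		} while (!atomic_try_cmpxchg(word, &old, val));
+ *		return (old >> shift) & 0xffff;
+ *	}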
*/ -unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, - unsigned int nr_ents) +unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents) { - struct swap_cgroup_ctrl *ctrl; - struct swap_cgroup *sc; - unsigned short old; - unsigned long flags; pgoff_t offset = swp_offset(ent); pgoff_t end = offset + nr_ents; + struct swap_cgroup *map; + unsigned short old, iter = 0; - sc = lookup_swap_cgroup(ent, &ctrl); + offset = swp_offset(ent); + end = offset + nr_ents; + map = swap_cgroup_ctrl[swp_type(ent)].map; - spin_lock_irqsave(&ctrl->lock, flags); - old = sc->id; - for (;;) { - VM_BUG_ON(sc->id != old); - sc->id = id; - offset++; - if (offset == end) - break; - if (offset % SC_PER_PAGE) - sc++; - else - sc = __lookup_swap_cgroup(ctrl, offset); - } - spin_unlock_irqrestore(&ctrl->lock, flags); + do { + old = __swap_cgroup_id_xchg(map, offset, 0); + if (!iter) + iter = old; + VM_BUG_ON(iter != old); + } while (++offset != end); return old; } @@ -161,39 +118,33 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, */ unsigned short lookup_swap_cgroup_id(swp_entry_t ent) { - if (mem_cgroup_disabled()) - return 0; - return lookup_swap_cgroup(ent, NULL)->id; -} - -int swap_cgroup_swapon(int type, unsigned long max_pages) -{ - void *array; - unsigned long length; struct swap_cgroup_ctrl *ctrl; if (mem_cgroup_disabled()) return 0; - length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); + ctrl = &swap_cgroup_ctrl[swp_type(ent)]; + return __swap_cgroup_id_lookup(ctrl->map, swp_offset(ent)); +} - array = vcalloc(length, sizeof(void *)); - if (!array) +int swap_cgroup_swapon(int type, unsigned long max_pages) +{ + struct swap_cgroup *map; + struct swap_cgroup_ctrl *ctrl; + + if (mem_cgroup_disabled()) + return 0; + + BUILD_BUG_ON(sizeof(unsigned short) * ID_PER_SC != + sizeof(struct swap_cgroup)); + map = vzalloc(DIV_ROUND_UP(max_pages, ID_PER_SC) * + sizeof(struct swap_cgroup)); + if (!map) goto nomem; ctrl = &swap_cgroup_ctrl[type]; mutex_lock(&swap_cgroup_mutex); - ctrl->length = length; - ctrl->map = array; - spin_lock_init(&ctrl->lock); - if (swap_cgroup_prepare(type)) { - /* memory shortage */ - ctrl->map = NULL; - ctrl->length = 0; - mutex_unlock(&swap_cgroup_mutex); - vfree(array); - goto nomem; - } + ctrl->map = map; mutex_unlock(&swap_cgroup_mutex); return 0; @@ -205,8 +156,7 @@ nomem: void swap_cgroup_swapoff(int type) { - struct page **map; - unsigned long i, length; + struct swap_cgroup *map; struct swap_cgroup_ctrl *ctrl; if (mem_cgroup_disabled()) @@ -215,19 +165,8 @@ void swap_cgroup_swapoff(int type) mutex_lock(&swap_cgroup_mutex); ctrl = &swap_cgroup_ctrl[type]; map = ctrl->map; - length = ctrl->length; ctrl->map = NULL; - ctrl->length = 0; mutex_unlock(&swap_cgroup_mutex); - if (map) { - for (i = 0; i < length; i++) { - struct page *page = map[i]; - if (page) - __free_page(page); - if (!(i % SWAP_CLUSTER_MAX)) - cond_resched(); - } - vfree(map); - } + vfree(map); } diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 13ab3b771409..9c7c171df7ba 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -43,17 +43,15 @@ static DEFINE_MUTEX(swap_slots_cache_mutex); /* Serialize swap slots cache enable/disable operations */ static DEFINE_MUTEX(swap_slots_cache_enable_mutex); -static void __drain_swap_slots_cache(unsigned int type); +static void __drain_swap_slots_cache(void); #define use_swap_slot_cache (swap_slot_cache_active && swap_slot_cache_enabled) -#define SLOTS_CACHE 0x1 -#define SLOTS_CACHE_RET 0x2 static void deactivate_swap_slots_cache(void) 
{ mutex_lock(&swap_slots_cache_mutex); swap_slot_cache_active = false; - __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET); + __drain_swap_slots_cache(); mutex_unlock(&swap_slots_cache_mutex); } @@ -72,7 +70,7 @@ void disable_swap_slots_cache_lock(void) if (swap_slot_cache_initialized) { /* serialize with cpu hotplug operations */ cpus_read_lock(); - __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET); + __drain_swap_slots_cache(); cpus_read_unlock(); } } @@ -113,7 +111,7 @@ out: static int alloc_swap_slot_cache(unsigned int cpu) { struct swap_slots_cache *cache; - swp_entry_t *slots, *slots_ret; + swp_entry_t *slots; /* * Do allocation outside swap_slots_cache_mutex @@ -125,28 +123,19 @@ static int alloc_swap_slot_cache(unsigned int cpu) if (!slots) return -ENOMEM; - slots_ret = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t), - GFP_KERNEL); - if (!slots_ret) { - kvfree(slots); - return -ENOMEM; - } - mutex_lock(&swap_slots_cache_mutex); cache = &per_cpu(swp_slots, cpu); - if (cache->slots || cache->slots_ret) { + if (cache->slots) { /* cache already allocated */ mutex_unlock(&swap_slots_cache_mutex); kvfree(slots); - kvfree(slots_ret); return 0; } if (!cache->lock_initialized) { mutex_init(&cache->alloc_lock); - spin_lock_init(&cache->free_lock); cache->lock_initialized = true; } cache->nr = 0; @@ -160,19 +149,16 @@ static int alloc_swap_slot_cache(unsigned int cpu) */ mb(); cache->slots = slots; - cache->slots_ret = slots_ret; mutex_unlock(&swap_slots_cache_mutex); return 0; } -static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, - bool free_slots) +static void drain_slots_cache_cpu(unsigned int cpu, bool free_slots) { struct swap_slots_cache *cache; - swp_entry_t *slots = NULL; cache = &per_cpu(swp_slots, cpu); - if ((type & SLOTS_CACHE) && cache->slots) { + if (cache->slots) { mutex_lock(&cache->alloc_lock); swapcache_free_entries(cache->slots + cache->cur, cache->nr); cache->cur = 0; @@ -183,20 +169,9 @@ static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, } mutex_unlock(&cache->alloc_lock); } - if ((type & SLOTS_CACHE_RET) && cache->slots_ret) { - spin_lock_irq(&cache->free_lock); - swapcache_free_entries(cache->slots_ret, cache->n_ret); - cache->n_ret = 0; - if (free_slots && cache->slots_ret) { - slots = cache->slots_ret; - cache->slots_ret = NULL; - } - spin_unlock_irq(&cache->free_lock); - kvfree(slots); - } } -static void __drain_swap_slots_cache(unsigned int type) +static void __drain_swap_slots_cache(void) { unsigned int cpu; @@ -224,13 +199,13 @@ static void __drain_swap_slots_cache(unsigned int type) * There are no slots on such cpu that need to be drained. */ for_each_online_cpu(cpu) - drain_slots_cache_cpu(cpu, type, false); + drain_slots_cache_cpu(cpu, false); } static int free_slot_cache(unsigned int cpu) { mutex_lock(&swap_slots_cache_mutex); - drain_slots_cache_cpu(cpu, SLOTS_CACHE | SLOTS_CACHE_RET, true); + drain_slots_cache_cpu(cpu, true); mutex_unlock(&swap_slots_cache_mutex); return 0; } @@ -269,39 +244,6 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache) return cache->nr; } -void free_swap_slot(swp_entry_t entry) -{ - struct swap_slots_cache *cache; - - /* Large folio swap slot is not covered. 
*/ - zswap_invalidate(entry); - - cache = raw_cpu_ptr(&swp_slots); - if (likely(use_swap_slot_cache && cache->slots_ret)) { - spin_lock_irq(&cache->free_lock); - /* Swap slots cache may be deactivated before acquiring lock */ - if (!use_swap_slot_cache || !cache->slots_ret) { - spin_unlock_irq(&cache->free_lock); - goto direct_free; - } - if (cache->n_ret >= SWAP_SLOTS_CACHE_SIZE) { - /* - * Return slots to global pool. - * The current swap_map value is SWAP_HAS_CACHE. - * Set it to 0 to indicate it is available for - * allocation in global pool - */ - swapcache_free_entries(cache->slots_ret, cache->n_ret); - cache->n_ret = 0; - } - cache->slots_ret[cache->n_ret++] = entry; - spin_unlock_irq(&cache->free_lock); - } else { -direct_free: - swapcache_free_entries(&entry, 1); - } -} - swp_entry_t folio_alloc_swap(struct folio *folio) { swp_entry_t entry; diff --git a/mm/swap_state.c b/mm/swap_state.c index e0c0321b8ff7..ca42b2be64d9 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -317,7 +317,6 @@ void free_pages_and_swap_cache(struct encoded_page **pages, int nr) struct folio_batch folios; unsigned int refs[PAGEVEC_SIZE]; - lru_add_drain(); folio_batch_init(&folios); for (int i = 0; i < nr; i++) { struct folio *folio = page_folio(encoded_page_ptr(pages[i])); diff --git a/mm/swapfile.c b/mm/swapfile.c index b0a9071cfe1d..d623f5b6dc4c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -53,15 +53,15 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); -static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, - unsigned int nr_pages); -static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, +static void swap_entry_range_free(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry, unsigned int nr_pages); +static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); static bool folio_swapcache_freeable(struct folio *folio); -static struct swap_cluster_info *lock_cluster_or_swap_info( - struct swap_info_struct *si, unsigned long offset); -static void unlock_cluster_or_swap_info(struct swap_info_struct *si, - struct swap_cluster_info *ci); +static struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, + unsigned long offset); +static inline void unlock_cluster(struct swap_cluster_info *ci); static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; @@ -129,6 +129,26 @@ static inline unsigned char swap_count(unsigned char ent) { return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ } +/* + * Use the second highest bit of inuse_pages counter as the indicator + * of whether one swap device is on the available plist, so the atomic can + * still be updated arithmetically while having special data embedded. + * + * inuse_pages counter is the only thing indicating if a device should + * be on avail_lists or not (except swapon / swapoff). By embedding the + * on-list bit in the atomic counter, updates no longer need any lock + * to check the list status. + * + * This bit will be set if the device is not on the plist and not + * usable, and will be cleared if the device is on the plist.
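+ *
+ * Editor's aside: reading both pieces back out is just two mask
+ * operations (sketch, using the macros defined right below):
+ *
+ *	long val = atomic_long_read(&si->inuse_pages);
+ *	bool off_list = val & SWAP_USAGE_OFFLIST_BIT;
+ *	long in_use = val & SWAP_USAGE_COUNTER_MASK;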
+ */ +#define SWAP_USAGE_OFFLIST_BIT (1UL << (BITS_PER_TYPE(atomic_t) - 2)) +#define SWAP_USAGE_COUNTER_MASK (~SWAP_USAGE_OFFLIST_BIT) +static long swap_usage_in_pages(struct swap_info_struct *si) +{ + return atomic_long_read(&si->inuse_pages) & SWAP_USAGE_COUNTER_MASK; +} + /* Reclaim the swap entry anyway if possible */ #define TTRS_ANYWAY 0x1 /* @@ -222,9 +242,9 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, * swap_map is HAS_CACHE only, which means the slots have no page table * reference or pending writeback, and can't be allocated to others. */ - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); need_reclaim = swap_is_has_cache(si, offset, nr_pages); - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); if (!need_reclaim) goto out_unlock; @@ -242,12 +262,9 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, folio_ref_sub(folio, nr_pages); folio_set_dirty(folio); - spin_lock(&si->lock); - /* Only sinple page folio can be backed by zswap */ - if (nr_pages == 1) - zswap_invalidate(entry); - swap_entry_range_free(si, entry, nr_pages); - spin_unlock(&si->lock); + ci = lock_cluster(si, offset); + swap_entry_range_free(si, ci, entry, nr_pages); + unlock_cluster(ci); ret = nr_pages; out_unlock: folio_unlock(folio); @@ -384,7 +401,21 @@ static void discard_swap_cluster(struct swap_info_struct *si, static inline bool cluster_is_free(struct swap_cluster_info *info) { - return info->flags & CLUSTER_FLAG_FREE; + return info->count == 0; +} + +static inline bool cluster_is_discard(struct swap_cluster_info *info) +{ + return info->flags == CLUSTER_FLAG_DISCARD; +} + +static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order) +{ + if (unlikely(ci->flags > CLUSTER_FLAG_USABLE)) + return false; + if (!order) + return true; + return cluster_is_free(ci) || order == ci->order; } static inline unsigned int cluster_index(struct swap_info_struct *si, @@ -393,6 +424,12 @@ static inline unsigned int cluster_index(struct swap_info_struct *si, return ci - si->cluster_info; } +static inline struct swap_cluster_info *offset_to_cluster(struct swap_info_struct *si, + unsigned long offset) +{ + return &si->cluster_info[offset / SWAPFILE_CLUSTER]; +} + static inline unsigned int cluster_offset(struct swap_info_struct *si, struct swap_cluster_info *ci) { @@ -404,45 +441,37 @@ static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si { struct swap_cluster_info *ci; - ci = si->cluster_info; - if (ci) { - ci += offset / SWAPFILE_CLUSTER; - spin_lock(&ci->lock); - } + ci = offset_to_cluster(si, offset); + spin_lock(&ci->lock); + return ci; } static inline void unlock_cluster(struct swap_cluster_info *ci) { - if (ci) - spin_unlock(&ci->lock); + spin_unlock(&ci->lock); } -/* - * Determine the locking method in use for this device. Return - * swap_cluster_info if SSD-style cluster-based locking is in place. 
- */ -static inline struct swap_cluster_info *lock_cluster_or_swap_info( - struct swap_info_struct *si, unsigned long offset) +static void cluster_move(struct swap_info_struct *si, + struct swap_cluster_info *ci, struct list_head *list, + enum swap_cluster_flags new_flags) { - struct swap_cluster_info *ci; + VM_WARN_ON(ci->flags == new_flags); + BUILD_BUG_ON(1 << sizeof(ci->flags) * BITS_PER_BYTE < CLUSTER_FLAG_MAX); + lockdep_assert_held(&ci->lock); - /* Try to use fine-grained SSD-style locking if available: */ - ci = lock_cluster(si, offset); - /* Otherwise, fall back to traditional, coarse locking: */ - if (!ci) - spin_lock(&si->lock); - - return ci; -} - -static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si, - struct swap_cluster_info *ci) -{ - if (ci) - unlock_cluster(ci); + spin_lock(&si->lock); + if (ci->flags == CLUSTER_FLAG_NONE) + list_add_tail(&ci->list, list); else - spin_unlock(&si->lock); + list_move_tail(&ci->list, list); + spin_unlock(&si->lock); + + if (ci->flags == CLUSTER_FLAG_FRAG) + atomic_long_dec(&si->frag_cluster_nr[ci->order]); + else if (new_flags == CLUSTER_FLAG_FRAG) + atomic_long_inc(&si->frag_cluster_nr[ci->order]); + ci->flags = new_flags; } /* Add a cluster to discard list and schedule it to do discard */ @@ -458,51 +487,97 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, */ memset(si->swap_map + idx * SWAPFILE_CLUSTER, SWAP_MAP_BAD, SWAPFILE_CLUSTER); - - VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); - list_move_tail(&ci->list, &si->discard_clusters); - ci->flags = 0; + VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE); + cluster_move(si, ci, &si->discard_clusters, CLUSTER_FLAG_DISCARD); schedule_work(&si->discard_work); } static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { - lockdep_assert_held(&si->lock); lockdep_assert_held(&ci->lock); - - if (ci->flags) - list_move_tail(&ci->list, &si->free_clusters); - else - list_add_tail(&ci->list, &si->free_clusters); - ci->flags = CLUSTER_FLAG_FREE; + cluster_move(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE); ci->order = 0; } +/* + * Isolate and lock the first cluster that is not contended on a list, + * clearing its flag before it is taken off-list. Cluster flag must be in sync + * with list status, so cluster updaters can always know the cluster + * list status without touching si lock. + * + * Note it's possible that all clusters on a list are contended, so + * this returns NULL for a non-empty list. + */ +static struct swap_cluster_info *cluster_isolate_lock( + struct swap_info_struct *si, struct list_head *list) +{ + struct swap_cluster_info *ci, *ret = NULL; + + spin_lock(&si->lock); + + if (unlikely(!(si->flags & SWP_WRITEOK))) + goto out; + + list_for_each_entry(ci, list, list) { + if (!spin_trylock(&ci->lock)) + continue; + + /* We may only isolate and clear flags of following lists */ + VM_BUG_ON(!ci->flags); + VM_BUG_ON(ci->flags > CLUSTER_FLAG_USABLE && + ci->flags != CLUSTER_FLAG_FULL); + + list_del(&ci->list); + ci->flags = CLUSTER_FLAG_NONE; + ret = ci; + break; + } +out: + spin_unlock(&si->lock); + + return ret; +} + /* * Doing discard actually. After a cluster discard is finished, the cluster - * will be added to free cluster list. caller should hold si->lock. -*/ -static void swap_do_scheduled_discard(struct swap_info_struct *si) + * will be added to free cluster list.
Discard clusters are a bit special as + they don't participate in allocation or reclaim, so clusters marked as + CLUSTER_FLAG_DISCARD must remain off-list or on discard list. + */ +static bool swap_do_scheduled_discard(struct swap_info_struct *si) { struct swap_cluster_info *ci; + bool ret = false; unsigned int idx; + spin_lock(&si->lock); while (!list_empty(&si->discard_clusters)) { ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list); + /* + * Delete the cluster from list but don't clear its flags until + * discard is done, so isolation and relocation will skip it. + */ list_del(&ci->list); idx = cluster_index(si, ci); spin_unlock(&si->lock); - discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, SWAPFILE_CLUSTER); - spin_lock(&si->lock); spin_lock(&ci->lock); - __free_cluster(si, ci); + /* + * Discard is done, clear its flags as it's now off-list, + * then return the cluster to allocation list. + */ + ci->flags = CLUSTER_FLAG_NONE; memset(si->swap_map + idx * SWAPFILE_CLUSTER, 0, SWAPFILE_CLUSTER); + __free_cluster(si, ci); spin_unlock(&ci->lock); + ret = true; + spin_lock(&si->lock); } + spin_unlock(&si->lock); + return ret; } static void swap_discard_work(struct work_struct *work) @@ -511,9 +586,7 @@ static void swap_discard_work(struct work_struct *work) si = container_of(work, struct swap_info_struct, discard_work); - spin_lock(&si->lock); swap_do_scheduled_discard(si); - spin_unlock(&si->lock); } static void swap_users_ref_free(struct percpu_ref *ref) @@ -524,15 +597,16 @@ static void swap_users_ref_free(struct percpu_ref *ref) complete(&si->comp); } +/* + * Must be called after freeing if ci->count == 0, moves the cluster to free + * or discard list. + */ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { VM_BUG_ON(ci->count != 0); - lockdep_assert_held(&si->lock); + VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE); lockdep_assert_held(&ci->lock); - if (ci->flags & CLUSTER_FLAG_FRAG) - si->frag_cluster_nr[ci->order]--; - /* * If the swap is discardable, prepare discard the cluster * instead of free it immediately. The cluster will be freed @@ -547,6 +621,48 @@ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info * __free_cluster(si, ci); } +/* + * Must be called after freeing if ci->count != 0, moves the cluster to + * nonfull list. + */ +static void partial_free_cluster(struct swap_info_struct *si, + struct swap_cluster_info *ci) +{ + VM_BUG_ON(!ci->count || ci->count == SWAPFILE_CLUSTER); + lockdep_assert_held(&ci->lock); + + if (ci->flags != CLUSTER_FLAG_NONFULL) + cluster_move(si, ci, &si->nonfull_clusters[ci->order], + CLUSTER_FLAG_NONFULL); +} + +/* + * Must be called after allocation, moves the cluster to full or frag list. + * Note: allocation doesn't acquire si lock, and may drop the ci lock for + * reclaim, so the cluster could be anywhere when called. + */ +static void relocate_cluster(struct swap_info_struct *si, + struct swap_cluster_info *ci) +{ + lockdep_assert_held(&ci->lock); + + /* Discard cluster must remain off-list or on discard list */ + if (cluster_is_discard(ci)) + return; + + if (!ci->count) { + free_cluster(si, ci); + } else if (ci->count != SWAPFILE_CLUSTER) { + if (ci->flags != CLUSTER_FLAG_FRAG) + cluster_move(si, ci, &si->frag_clusters[ci->order], + CLUSTER_FLAG_FRAG); + } else { + if (ci->flags != CLUSTER_FLAG_FULL) + cluster_move(si, ci, &si->full_clusters, + CLUSTER_FLAG_FULL); + } +} + /* * The cluster corresponding to page_nr will be used.
The cluster will not be * added to free cluster list and its usage counter will be increased by 1. @@ -558,9 +674,6 @@ static void inc_cluster_info_page(struct swap_info_struct *si, unsigned long idx = page_nr / SWAPFILE_CLUSTER; struct swap_cluster_info *ci; - if (!cluster_info) - return; - ci = cluster_info + idx; ci->count++; @@ -568,63 +681,33 @@ static void inc_cluster_info_page(struct swap_info_struct *si, VM_BUG_ON(ci->flags); } -/* - * The cluster ci decreases @nr_pages usage. If the usage counter becomes 0, - * which means no page in the cluster is in use, we can optionally discard - * the cluster and add it to free cluster list. - */ -static void dec_cluster_info_page(struct swap_info_struct *si, - struct swap_cluster_info *ci, int nr_pages) -{ - if (!si->cluster_info) - return; - - VM_BUG_ON(ci->count < nr_pages); - VM_BUG_ON(cluster_is_free(ci)); - lockdep_assert_held(&si->lock); - lockdep_assert_held(&ci->lock); - ci->count -= nr_pages; - - if (!ci->count) { - free_cluster(si, ci); - return; - } - - if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { - VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); - if (ci->flags & CLUSTER_FLAG_FRAG) - si->frag_cluster_nr[ci->order]--; - list_move_tail(&ci->list, &si->nonfull_clusters[ci->order]); - ci->flags = CLUSTER_FLAG_NONFULL; - } -} - static bool cluster_reclaim_range(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned long start, unsigned long end) { unsigned char *map = si->swap_map; - unsigned long offset; + unsigned long offset = start; + int nr_reclaim; spin_unlock(&ci->lock); - spin_unlock(&si->lock); - - for (offset = start; offset < end; offset++) { + do { switch (READ_ONCE(map[offset])) { case 0: - continue; + offset++; + break; case SWAP_HAS_CACHE: - if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0) - continue; - goto out; + nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); + if (nr_reclaim > 0) + offset += nr_reclaim; + else + goto out; + break; default: goto out; } - } + } while (offset < end); out: - spin_lock(&si->lock); spin_lock(&ci->lock); - /* * Recheck the range no matter reclaim succeeded or not, the slot * could have been freed while we are not holding the lock.
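/*
 * Editor's aside: cluster_reclaim_range() above is the usual
 * drop-lock-and-revalidate idiom. A condensed sketch of its shape,
 * for illustration only:
 *
 *	spin_unlock(&ci->lock);			// drop before blocking reclaim
 *	... try to reclaim slots in [start, end) ...
 *	spin_lock(&ci->lock);			// retake the cluster lock
 *	for (offset = start; offset < end; offset++)
 *		if (READ_ONCE(map[offset]))	// slot changed while unlocked
 *			return false;		// caller must rescan
 *	return true;
 */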
@@ -638,11 +721,11 @@ out: static bool cluster_scan_range(struct swap_info_struct *si, struct swap_cluster_info *ci, - unsigned long start, unsigned int nr_pages) + unsigned long start, unsigned int nr_pages, + bool *need_reclaim) { unsigned long offset, end = start + nr_pages; unsigned char *map = si->swap_map; - bool need_reclaim = false; for (offset = start; offset < end; offset++) { switch (READ_ONCE(map[offset])) { @@ -651,16 +734,13 @@ static bool cluster_scan_range(struct swap_info_struct *si, case SWAP_HAS_CACHE: if (!vm_swap_full()) return false; - need_reclaim = true; + *need_reclaim = true; continue; default: return false; } } - if (need_reclaim) - return cluster_reclaim_range(si, ci, start, end); - return true; } @@ -670,73 +750,75 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster { unsigned int nr_pages = 1 << order; + lockdep_assert_held(&ci->lock); + if (!(si->flags & SWP_WRITEOK)) return false; - if (cluster_is_free(ci)) { - if (nr_pages < SWAPFILE_CLUSTER) { - list_move_tail(&ci->list, &si->nonfull_clusters[order]); - ci->flags = CLUSTER_FLAG_NONFULL; - } + if (cluster_is_free(ci)) ci->order = order; - } memset(si->swap_map + start, usage, nr_pages); - swap_range_alloc(si, start, nr_pages); + swap_range_alloc(si, nr_pages); ci->count += nr_pages; - if (ci->count == SWAPFILE_CLUSTER) { - VM_BUG_ON(!(ci->flags & - (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG))); - if (ci->flags & CLUSTER_FLAG_FRAG) - si->frag_cluster_nr[ci->order]--; - list_move_tail(&ci->list, &si->full_clusters); - ci->flags = CLUSTER_FLAG_FULL; - } - return true; } -static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset, - unsigned int *foundp, unsigned int order, +/* Try use a new cluster for current CPU and allocate from it. */ +static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned long offset, + unsigned int order, unsigned char usage) { - unsigned long start = offset & ~(SWAPFILE_CLUSTER - 1); + unsigned int next = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID; + unsigned long start = ALIGN_DOWN(offset, SWAPFILE_CLUSTER); unsigned long end = min(start + SWAPFILE_CLUSTER, si->max); unsigned int nr_pages = 1 << order; - struct swap_cluster_info *ci; + bool need_reclaim, ret; - if (end < nr_pages) - return SWAP_NEXT_INVALID; - end -= nr_pages; + lockdep_assert_held(&ci->lock); - ci = lock_cluster(si, offset); - if (ci->count + nr_pages > SWAPFILE_CLUSTER) { - offset = SWAP_NEXT_INVALID; - goto done; - } + if (end < nr_pages || ci->count + nr_pages > SWAPFILE_CLUSTER) + goto out; - while (offset <= end) { - if (cluster_scan_range(si, ci, offset, nr_pages)) { - if (!cluster_alloc_range(si, ci, offset, usage, order)) { - offset = SWAP_NEXT_INVALID; - goto done; - } - *foundp = offset; - if (ci->count == SWAPFILE_CLUSTER) { - offset = SWAP_NEXT_INVALID; - goto done; - } - offset += nr_pages; - break; + for (end -= nr_pages; offset <= end; offset += nr_pages) { + need_reclaim = false; + if (!cluster_scan_range(si, ci, offset, nr_pages, &need_reclaim)) + continue; + if (need_reclaim) { + ret = cluster_reclaim_range(si, ci, start, end); + /* + * Reclaim drops ci->lock and cluster could be used + * by another order. Not checking flag as off-list + * cluster has no flag set, and change of list + * won't cause fragmentation. 
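+ *
+ * Editor's aside: "usable" above condenses to roughly the following,
+ * see cluster_is_usable():
+ *
+ *	ci->flags <= CLUSTER_FLAG_USABLE &&
+ *	(!order || cluster_is_free(ci) || ci->order == order)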
+ */ + if (!cluster_is_usable(ci, order)) + goto out; + if (cluster_is_free(ci)) + offset = start; + /* Reclaim failed but cluster is usable, try next */ + if (!ret) + continue; } + if (!cluster_alloc_range(si, ci, offset, usage, order)) + break; + found = offset; offset += nr_pages; + if (ci->count < SWAPFILE_CLUSTER && offset <= end) + next = offset; + break; } - if (offset > end) - offset = SWAP_NEXT_INVALID; -done: +out: + relocate_cluster(si, ci); unlock_cluster(ci); - return offset; + if (si->flags & SWP_SOLIDSTATE) + __this_cpu_write(si->percpu_cluster->next[order], next); + else + si->global_cluster->next[order] = next; + return found; } /* Return true if reclaimed a whole cluster */ @@ -749,20 +831,19 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) int nr_reclaim; if (force) - to_scan = si->inuse_pages / SWAPFILE_CLUSTER; + to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER; - while (!list_empty(&si->full_clusters)) { - ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list); - list_move_tail(&ci->list, &si->full_clusters); + while ((ci = cluster_isolate_lock(si, &si->full_clusters))) { offset = cluster_offset(si, ci); end = min(si->max, offset + SWAPFILE_CLUSTER); to_scan--; - spin_unlock(&si->lock); while (offset < end) { if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) { + spin_unlock(&ci->lock); nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); + spin_lock(&ci->lock); if (nr_reclaim) { offset += abs(nr_reclaim); continue; @@ -770,8 +851,8 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) } offset++; } - spin_lock(&si->lock); + unlock_cluster(ci); if (to_scan <= 0) break; } @@ -783,9 +864,7 @@ static void swap_reclaim_work(struct work_struct *work) si = container_of(work, struct swap_info_struct, reclaim_work); - spin_lock(&si->lock); swap_reclaim_full_clusters(si, true); - spin_unlock(&si->lock); } /* @@ -796,29 +875,41 @@ static void swap_reclaim_work(struct work_struct *work) static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order, unsigned char usage) { - struct percpu_cluster *cluster; struct swap_cluster_info *ci; unsigned int offset, found = 0; -new_cluster: - lockdep_assert_held(&si->lock); - cluster = this_cpu_ptr(si->percpu_cluster); - offset = cluster->next[order]; + if (si->flags & SWP_SOLIDSTATE) { + /* Fast path using per CPU cluster */ + local_lock(&si->percpu_cluster->lock); + offset = __this_cpu_read(si->percpu_cluster->next[order]); + } else { + /* Serialize HDD SWAP allocation for each device. */ + spin_lock(&si->global_cluster_lock); + offset = si->global_cluster->next[order]; + } + if (offset) { - offset = alloc_swap_scan_cluster(si, offset, &found, order, usage); + ci = lock_cluster(si, offset); + /* Cluster could have been used by another order */ + if (cluster_is_usable(ci, order)) { + if (cluster_is_free(ci)) + offset = cluster_offset(si, ci); + found = alloc_swap_scan_cluster(si, ci, offset, + order, usage); + } else { + unlock_cluster(ci); + } if (found) goto done; } - if (!list_empty(&si->free_clusters)) { - ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage); - /* - * Either we didn't touch the cluster due to swapoff, - * or the allocation must success. 
- */ - VM_BUG_ON((si->flags & SWP_WRITEOK) && !found); - goto done; +new_cluster: + ci = cluster_isolate_lock(si, &si->free_clusters); + if (ci) { + found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), + order, usage); + if (found) + goto done; } /* Try reclaim from full clusters if free clusters list is drained */ @@ -826,56 +917,45 @@ new_cluster: swap_reclaim_full_clusters(si, false); if (order < PMD_ORDER) { - unsigned int frags = 0; + unsigned int frags = 0, frags_existing; - while (!list_empty(&si->nonfull_clusters[order])) { - ci = list_first_entry(&si->nonfull_clusters[order], - struct swap_cluster_info, list); - list_move_tail(&ci->list, &si->frag_clusters[order]); - ci->flags = CLUSTER_FLAG_FRAG; - si->frag_cluster_nr[order]++; - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), - &found, order, usage); - frags++; - if (found) - break; - } - - if (!found) { + while ((ci = cluster_isolate_lock(si, &si->nonfull_clusters[order]))) { + found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), + order, usage); /* - * Nonfull clusters are moved to frag tail if we reached - * here, count them too, don't over scan the frag list. + * With `fragmenting` set to true, it will surely take + * the cluster off nonfull list */ - while (frags < si->frag_cluster_nr[order]) { - ci = list_first_entry(&si->frag_clusters[order], - struct swap_cluster_info, list); - /* - * Rotate the frag list to iterate, they were all failing - * high order allocation or moved here due to per-CPU usage, - * this help keeping usable cluster ahead. - */ - list_move_tail(&ci->list, &si->frag_clusters[order]); - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), - &found, order, usage); - frags++; - if (found) - break; - } + if (found) + goto done; + frags++; + } + + frags_existing = atomic_long_read(&si->frag_cluster_nr[order]); + while (frags < frags_existing && + (ci = cluster_isolate_lock(si, &si->frag_clusters[order]))) { + atomic_long_dec(&si->frag_cluster_nr[order]); + /* + * Rotate the frag list to iterate, they were all + * failing high order allocation or moved here due to + * per-CPU usage, but they could contain newly released + * reclaimable (eg. lazy-freed swap cache) slots. + */ + found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), + order, usage); + if (found) + goto done; + frags++; } } - if (found) - goto done; - - if (!list_empty(&si->discard_clusters)) { - /* - * we don't have free cluster but have some clusters in - * discarding, do discard now and reclaim them, then - * reread cluster_next_cpu since we dropped si->lock - */ - swap_do_scheduled_discard(si); + /* + * We don't have free cluster but have some clusters in + * discarding, do discard now and reclaim them, then + * reread cluster_next_cpu since we dropped si->lock + */ + if ((si->flags & SWP_PAGE_DISCARD) && swap_do_scheduled_discard(si)) goto new_cluster; - } if (order) goto done; @@ -886,76 +966,153 @@ new_cluster: * Clusters here have at least one usable slots and can't fail order 0 * allocation, but reclaim may drop si->lock and race with another user. 
*/ - while (!list_empty(&si->frag_clusters[o])) { - ci = list_first_entry(&si->frag_clusters[o], - struct swap_cluster_info, list); - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), - &found, 0, usage); + while ((ci = cluster_isolate_lock(si, &si->frag_clusters[o]))) { + atomic_long_dec(&si->frag_cluster_nr[o]); + found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), + 0, usage); if (found) goto done; } - while (!list_empty(&si->nonfull_clusters[o])) { - ci = list_first_entry(&si->nonfull_clusters[o], - struct swap_cluster_info, list); - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), - &found, 0, usage); + while ((ci = cluster_isolate_lock(si, &si->nonfull_clusters[o]))) { + found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), + 0, usage); if (found) goto done; } } - done: - cluster->next[order] = offset; + if (si->flags & SWP_SOLIDSTATE) + local_unlock(&si->percpu_cluster->lock); + else + spin_unlock(&si->global_cluster_lock); return found; } -static void __del_from_avail_list(struct swap_info_struct *si) +/* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */ +static void del_from_avail_list(struct swap_info_struct *si, bool swapoff) { int nid; + unsigned long pages; + + spin_lock(&swap_avail_lock); + + if (swapoff) { + /* + * Forcefully remove it. Clear the SWP_WRITEOK flag for + * swapoff here so it's synchronized by both si->lock and + * swap_avail_lock, to ensure the result can be seen by + * add_to_avail_list. + */ + lockdep_assert_held(&si->lock); + si->flags &= ~SWP_WRITEOK; + atomic_long_or(SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages); + } else { + /* + * If not called by swapoff, take it off-list only if it's + * full and SWAP_USAGE_OFFLIST_BIT is not set (strictly + * si->inuse_pages == pages); any concurrent slot freeing, + * or device already removed from the plist by someone else, + * will make this return false. + */ + pages = si->pages; + if (!atomic_long_try_cmpxchg(&si->inuse_pages, &pages, + pages | SWAP_USAGE_OFFLIST_BIT)) + goto skip; + } - assert_spin_locked(&si->lock); for_each_node(nid) plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]); -} -static void del_from_avail_list(struct swap_info_struct *si) -{ - spin_lock(&swap_avail_lock); - __del_from_avail_list(si); +skip: spin_unlock(&swap_avail_lock); } -static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, +/* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */ +static void add_to_avail_list(struct swap_info_struct *si, bool swapon) +{ + int nid; + long val; + unsigned long pages; + + spin_lock(&swap_avail_lock); + + /* Corresponding to SWP_WRITEOK clearing in del_from_avail_list */ + if (swapon) { + lockdep_assert_held(&si->lock); + si->flags |= SWP_WRITEOK; + } else { + if (!(READ_ONCE(si->flags) & SWP_WRITEOK)) + goto skip; + } + + if (!(atomic_long_read(&si->inuse_pages) & SWAP_USAGE_OFFLIST_BIT)) + goto skip; + + val = atomic_long_fetch_and_relaxed(~SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages); + + /* + * When device is full and device is on the plist, only one updater will + * see (inuse_pages == si->pages) and will call del_from_avail_list. If + * that updater happens to be here, just skip adding.
+ */ + pages = si->pages; + if (val == pages) { + /* Just like the cmpxchg in del_from_avail_list */ + if (atomic_long_try_cmpxchg(&si->inuse_pages, &pages, + pages | SWAP_USAGE_OFFLIST_BIT)) + goto skip; + } + + for_each_node(nid) + plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]); + +skip: + spin_unlock(&swap_avail_lock); +} + +/* + * swap_usage_add / swap_usage_sub of each slot are serialized by ci->lock + * within each cluster, so the total contribution to the global counter should + * always be positive and cannot exceed the total number of usable slots. + */ +static bool swap_usage_add(struct swap_info_struct *si, unsigned int nr_entries) +{ + long val = atomic_long_add_return_relaxed(nr_entries, &si->inuse_pages); + + /* + * If device is full, and SWAP_USAGE_OFFLIST_BIT is not set, + * remove it from the plist. + */ + if (unlikely(val == si->pages)) { + del_from_avail_list(si, false); + return true; + } + + return false; +} + +static void swap_usage_sub(struct swap_info_struct *si, unsigned int nr_entries) +{ + long val = atomic_long_sub_return_relaxed(nr_entries, &si->inuse_pages); + + /* + * If device is not full, and SWAP_USAGE_OFFLIST_BIT is set, + * add it back to the plist. + */ + if (unlikely(val & SWAP_USAGE_OFFLIST_BIT)) + add_to_avail_list(si, false); +} + +static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries) { - unsigned int end = offset + nr_entries - 1; - - if (offset == si->lowest_bit) - si->lowest_bit += nr_entries; - if (end == si->highest_bit) - WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries); - WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries); - if (si->inuse_pages == si->pages) { - si->lowest_bit = si->max; - si->highest_bit = 0; - del_from_avail_list(si); - + if (swap_usage_add(si, nr_entries)) { if (si->cluster_info && vm_swap_full()) schedule_work(&si->reclaim_work); } } -static void add_to_avail_list(struct swap_info_struct *si) -{ - int nid; - - spin_lock(&swap_avail_lock); - for_each_node(nid) - plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]); - spin_unlock(&swap_avail_lock); -} - static void swap_range_free(struct swap_info_struct *si, unsigned long offset, unsigned int nr_entries) { @@ -968,18 +1125,11 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, * Use atomic clear_bit operations only on zeromap instead of non-atomic * bitmap_clear to prevent adjacent bits corruption due to simultaneous writes.
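 *
 * Editor's aside: non-atomic bitmap_clear() does word-granular
 * read-modify-write, so two CPUs clearing adjacent bits of the same
 * word could lose one of the updates; the per-bit atomic RMW cannot:
 *
 *	clear_bit(offset + i, si->zeromap);	// atomic per-bit RMW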
*/ - for (i = 0; i < nr_entries; i++) + for (i = 0; i < nr_entries; i++) { clear_bit(offset + i, si->zeromap); - - if (offset < si->lowest_bit) - si->lowest_bit = offset; - if (end > si->highest_bit) { - bool was_full = !si->highest_bit; - - WRITE_ONCE(si->highest_bit, end); - if (was_full && (si->flags & SWP_WRITEOK)) - add_to_avail_list(si); + zswap_invalidate(swp_entry(si->type, offset + i)); } + if (si->flags & SWP_BLKDEV) swap_slot_free_notify = si->bdev->bd_disk->fops->swap_slot_free_notify; @@ -999,50 +1149,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, */ smp_wmb(); atomic_long_add(nr_entries, &nr_swap_pages); - WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); -} - -static void set_cluster_next(struct swap_info_struct *si, unsigned long next) -{ - unsigned long prev; - - if (!(si->flags & SWP_SOLIDSTATE)) { - si->cluster_next = next; - return; - } - - prev = this_cpu_read(*si->cluster_next_cpu); - /* - * Cross the swap address space size aligned trunk, choose - * another trunk randomly to avoid lock contention on swap - * address space if possible. - */ - if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) != - (next >> SWAP_ADDRESS_SPACE_SHIFT)) { - /* No free swap slots available */ - if (si->highest_bit <= si->lowest_bit) - return; - next = get_random_u32_inclusive(si->lowest_bit, si->highest_bit); - next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES); - next = max_t(unsigned int, next, si->lowest_bit); - } - this_cpu_write(*si->cluster_next_cpu, next); -} - -static bool swap_offset_available_and_locked(struct swap_info_struct *si, - unsigned long offset) -{ - if (data_race(!si->swap_map[offset])) { - spin_lock(&si->lock); - return true; - } - - if (vm_swap_full() && READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { - spin_lock(&si->lock); - return true; - } - - return false; + swap_usage_sub(si, nr_entries); } static int cluster_alloc_swap(struct swap_info_struct *si, @@ -1051,10 +1158,6 @@ static int cluster_alloc_swap(struct swap_info_struct *si, { int n_ret = 0; - VM_BUG_ON(!si->cluster_info); - - si->flags += SWP_SCANNING; - while (n_ret < nr) { unsigned long offset = cluster_alloc_swap_entry(si, order, usage); @@ -1063,8 +1166,6 @@ static int cluster_alloc_swap(struct swap_info_struct *si, slots[n_ret++] = swp_entry(si->type, offset); } - si->flags -= SWP_SCANNING; - return n_ret; } @@ -1072,13 +1173,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[], int order) { - unsigned long offset; - unsigned long scan_base; - unsigned long last_in_cluster = 0; - int latency_ration = LATENCY_LIMIT; unsigned int nr_pages = 1 << order; - int n_ret = 0; - bool scanned_many = false; /* * We try to cluster swap pages by allocating them sequentially @@ -1090,7 +1185,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, * But we do now try to find an empty cluster. -Andrea * And we let swap pages go all over an SSD partition. Hugh */ - if (order > 0) { /* * Should not even be attempting large allocations when huge @@ -1103,165 +1197,30 @@ static int scan_swap_map_slots(struct swap_info_struct *si, } /* - * Swapfile is not block device or not using clusters so unable + * Swapfile is not block device so unable * to allocate large entries. 
*/ - if (!(si->flags & SWP_BLKDEV) || !si->cluster_info) + if (!(si->flags & SWP_BLKDEV)) return 0; } - if (si->cluster_info) - return cluster_alloc_swap(si, usage, nr, slots, order); - - si->flags += SWP_SCANNING; - - /* For HDD, sequential access is more important. */ - scan_base = si->cluster_next; - offset = scan_base; - - if (unlikely(!si->cluster_nr--)) { - if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { - si->cluster_nr = SWAPFILE_CLUSTER - 1; - goto checks; - } - - spin_unlock(&si->lock); - - /* - * If seek is expensive, start searching for new cluster from - * start of partition, to minimize the span of allocated swap. - */ - scan_base = offset = si->lowest_bit; - last_in_cluster = offset + SWAPFILE_CLUSTER - 1; - - /* Locate the first empty (unaligned) cluster */ - for (; last_in_cluster <= READ_ONCE(si->highest_bit); offset++) { - if (si->swap_map[offset]) - last_in_cluster = offset + SWAPFILE_CLUSTER; - else if (offset == last_in_cluster) { - spin_lock(&si->lock); - offset -= SWAPFILE_CLUSTER - 1; - si->cluster_next = offset; - si->cluster_nr = SWAPFILE_CLUSTER - 1; - goto checks; - } - if (unlikely(--latency_ration < 0)) { - cond_resched(); - latency_ration = LATENCY_LIMIT; - } - } - - offset = scan_base; - spin_lock(&si->lock); - si->cluster_nr = SWAPFILE_CLUSTER - 1; - } - -checks: - if (!(si->flags & SWP_WRITEOK)) - goto no_page; - if (!si->highest_bit) - goto no_page; - if (offset > si->highest_bit) - scan_base = offset = si->lowest_bit; - - /* reuse swap entry of cache-only swap if not busy. */ - if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { - int swap_was_freed; - spin_unlock(&si->lock); - swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); - spin_lock(&si->lock); - /* entry was freed successfully, try to use this again */ - if (swap_was_freed > 0) - goto checks; - goto scan; /* check next one */ - } - - if (si->swap_map[offset]) { - if (!n_ret) - goto scan; - else - goto done; - } - memset(si->swap_map + offset, usage, nr_pages); - - swap_range_alloc(si, offset, nr_pages); - slots[n_ret++] = swp_entry(si->type, offset); - - /* got enough slots or reach max slots? */ - if ((n_ret == nr) || (offset >= si->highest_bit)) - goto done; - - /* search for next available slot */ - - /* time to take a break? */ - if (unlikely(--latency_ration < 0)) { - if (n_ret) - goto done; - spin_unlock(&si->lock); - cond_resched(); - spin_lock(&si->lock); - latency_ration = LATENCY_LIMIT; - } - - if (si->cluster_nr && !si->swap_map[++offset]) { - /* non-ssd case, still more slots in cluster? */ - --si->cluster_nr; - goto checks; - } + return cluster_alloc_swap(si, usage, nr, slots, order); +} +static bool get_swap_device_info(struct swap_info_struct *si) +{ + if (!percpu_ref_tryget_live(&si->users)) + return false; /* - * Even if there's no free clusters available (fragmented), - * try to scan a little more quickly with lock held unless we - * have scanned too many slots already. + * Guarantee the si->users are checked before accessing other + * fields of swap_info_struct, and si->flags (SWP_WRITEOK) is + * up to dated. + * + * Paired with the spin_unlock() after setup_swap_info() in + * enable_swap_info(), and smp_wmb() in swapoff. 
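+ *
+ * Editor's aside: the pairing, schematically (illustration only):
+ *
+ *	swapon/swapoff side			reader side (here)
+ *	setup/clear si fields			percpu_ref_tryget_live(&si->users)
+ *	smp_wmb() or spin_unlock() [release]	smp_rmb() [acquire]
+ *	publish si->users as live		read si->flags, si->max, ...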
*/ - if (!scanned_many) { - unsigned long scan_limit; - - if (offset < scan_base) - scan_limit = scan_base; - else - scan_limit = si->highest_bit; - for (; offset <= scan_limit && --latency_ration > 0; - offset++) { - if (!si->swap_map[offset]) - goto checks; - } - } - -done: - if (order == 0) - set_cluster_next(si, offset + 1); - si->flags -= SWP_SCANNING; - return n_ret; - -scan: - VM_WARN_ON(order > 0); - spin_unlock(&si->lock); - while (++offset <= READ_ONCE(si->highest_bit)) { - if (unlikely(--latency_ration < 0)) { - cond_resched(); - latency_ration = LATENCY_LIMIT; - scanned_many = true; - } - if (swap_offset_available_and_locked(si, offset)) - goto checks; - } - offset = si->lowest_bit; - while (offset < scan_base) { - if (unlikely(--latency_ration < 0)) { - cond_resched(); - latency_ration = LATENCY_LIMIT; - scanned_many = true; - } - if (swap_offset_available_and_locked(si, offset)) - goto checks; - offset++; - } - spin_lock(&si->lock); - -no_page: - si->flags -= SWP_SCANNING; - return n_ret; + smp_rmb(); + return true; } int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order) @@ -1291,32 +1250,15 @@ start_over: /* requeue si to after same-priority siblings */ plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); spin_unlock(&swap_avail_lock); - spin_lock(&si->lock); - if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { - spin_lock(&swap_avail_lock); - if (plist_node_empty(&si->avail_lists[node])) { - spin_unlock(&si->lock); - goto nextsi; - } - WARN(!si->highest_bit, - "swap_info %d in list but !highest_bit\n", - si->type); - WARN(!(si->flags & SWP_WRITEOK), - "swap_info %d in list but !SWP_WRITEOK\n", - si->type); - __del_from_avail_list(si); - spin_unlock(&si->lock); - goto nextsi; + if (get_swap_device_info(si)) { + n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, + n_goal, swp_entries, order); + put_swap_device(si); + if (n_ret || size > 1) + goto check_out; } - n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, - n_goal, swp_entries, order); - spin_unlock(&si->lock); - if (n_ret || size > 1) - goto check_out; - cond_resched(); spin_lock(&swap_avail_lock); -nextsi: /* * if we got here, it's likely that si was almost full before, * and since scan_swap_map_slots() can drop the si->lock, @@ -1376,22 +1318,6 @@ out: return NULL; } -static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, - struct swap_info_struct *q) -{ - struct swap_info_struct *p; - - p = _swap_info_get(entry); - - if (p != q) { - if (q != NULL) - spin_unlock(&q->lock); - if (p != NULL) - spin_lock(&p->lock); - } - return p; -} - static unsigned char __swap_entry_free_locked(struct swap_info_struct *si, unsigned long offset, unsigned char usage) @@ -1481,16 +1407,8 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry) si = swp_swap_info(entry); if (!si) goto bad_nofile; - if (!percpu_ref_tryget_live(&si->users)) + if (!get_swap_device_info(si)) goto out; - /* - * Guarantee the si->users are checked before accessing other - * fields of swap_info_struct. - * - * Paired with the spin_unlock() after setup_swap_info() in - * enable_swap_info(). 
- */ - smp_rmb(); offset = swp_offset(entry); if (offset >= si->max) goto put_out; @@ -1513,11 +1431,11 @@ static unsigned char __swap_entry_free(struct swap_info_struct *si, unsigned long offset = swp_offset(entry); unsigned char usage; - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); usage = __swap_entry_free_locked(si, offset, 1); - unlock_cluster_or_swap_info(si, ci); if (!usage) - free_swap_slot(entry); + swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); + unlock_cluster(ci); return usage; } @@ -1538,22 +1456,17 @@ static bool __swap_entries_free(struct swap_info_struct *si, if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER) goto fallback; - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); if (!swap_is_last_map(si, offset, nr, &has_cache)) { - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); goto fallback; } for (i = 0; i < nr; i++) WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); - unlock_cluster_or_swap_info(si, ci); + if (!has_cache) + swap_entry_range_free(si, ci, entry, nr); + unlock_cluster(ci); - if (!has_cache) { - for (i = 0; i < nr; i++) - zswap_invalidate(swp_entry(si->type, offset + i)); - spin_lock(&si->lock); - swap_entry_range_free(si, entry, nr); - spin_unlock(&si->lock); - } return has_cache; fallback: @@ -1573,24 +1486,32 @@ fallback: * Drop the last HAS_CACHE flag of swap entries, caller have to * ensure all entries belong to the same cgroup. */ -static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, - unsigned int nr_pages) +static void swap_entry_range_free(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry, unsigned int nr_pages) { unsigned long offset = swp_offset(entry); unsigned char *map = si->swap_map + offset; unsigned char *map_end = map + nr_pages; - struct swap_cluster_info *ci; - ci = lock_cluster(si, offset); + /* It should never free entries across different clusters */ + VM_BUG_ON(ci != offset_to_cluster(si, offset + nr_pages - 1)); + VM_BUG_ON(cluster_is_free(ci)); + VM_BUG_ON(ci->count < nr_pages); + + ci->count -= nr_pages; do { VM_BUG_ON(*map != SWAP_HAS_CACHE); *map = 0; } while (++map < map_end); - dec_cluster_info_page(si, ci, nr_pages); - unlock_cluster(ci); mem_cgroup_uncharge_swap(entry, nr_pages); swap_range_free(si, offset, nr_pages); + + if (!ci->count) + free_cluster(si, ci); + else + partial_free_cluster(si, ci); } static void cluster_swap_free_nr(struct swap_info_struct *si, @@ -1598,29 +1519,14 @@ static void cluster_swap_free_nr(struct swap_info_struct *si, unsigned char usage) { struct swap_cluster_info *ci; - DECLARE_BITMAP(to_free, BITS_PER_LONG) = { 0 }; - int i, nr; + unsigned long end = offset + nr_pages; - ci = lock_cluster_or_swap_info(si, offset); - while (nr_pages) { - nr = min(BITS_PER_LONG, nr_pages); - for (i = 0; i < nr; i++) { - if (!__swap_entry_free_locked(si, offset + i, usage)) - bitmap_set(to_free, i, 1); - } - if (!bitmap_empty(to_free, BITS_PER_LONG)) { - unlock_cluster_or_swap_info(si, ci); - for_each_set_bit(i, to_free, BITS_PER_LONG) - free_swap_slot(swp_entry(si->type, offset + i)); - if (nr == nr_pages) - return; - bitmap_clear(to_free, 0, BITS_PER_LONG); - ci = lock_cluster_or_swap_info(si, offset); - } - offset += nr; - nr_pages -= nr; - } - unlock_cluster_or_swap_info(si, ci); + ci = lock_cluster(si, offset); + do { + if (!__swap_entry_free_locked(si, offset, usage)) + swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); + } while (++offset < end); 
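+	/*
+	 * Editor's aside: the loop above is now the whole batched-free
+	 * path, roughly:
+	 *
+	 *	ci = lock_cluster(si, offset);		// once per batch
+	 *	while (offset < end)
+	 *		if (!__swap_entry_free_locked(si, offset++, usage))
+	 *			swap_entry_range_free(si, ci, entry, 1);
+	 *	unlock_cluster(ci);			// once per batch
+	 */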
+ unlock_cluster(ci); } /* @@ -1659,59 +1565,35 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) if (!si) return; - ci = lock_cluster_or_swap_info(si, offset); - if (size > 1 && swap_is_has_cache(si, offset, size)) { - unlock_cluster_or_swap_info(si, ci); - spin_lock(&si->lock); - swap_entry_range_free(si, entry, size); - spin_unlock(&si->lock); - return; - } - for (int i = 0; i < size; i++, entry.val++) { - if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { - unlock_cluster_or_swap_info(si, ci); - free_swap_slot(entry); - if (i == size - 1) - return; - lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); + if (swap_is_has_cache(si, offset, size)) + swap_entry_range_free(si, ci, entry, size); + else { + for (int i = 0; i < size; i++, entry.val++) { + if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) + swap_entry_range_free(si, ci, entry, 1); } } - unlock_cluster_or_swap_info(si, ci); -} - -static int swp_entry_cmp(const void *ent1, const void *ent2) -{ - const swp_entry_t *e1 = ent1, *e2 = ent2; - - return (int)swp_type(*e1) - (int)swp_type(*e2); + unlock_cluster(ci); } void swapcache_free_entries(swp_entry_t *entries, int n) { - struct swap_info_struct *p, *prev; int i; + struct swap_cluster_info *ci; + struct swap_info_struct *si = NULL; if (n <= 0) return; - prev = NULL; - p = NULL; - - /* - * Sort swap entries by swap device, so each lock is only taken once. - * nr_swapfiles isn't absolutely correct, but the overhead of sort() is - * so low that it isn't necessary to optimize further. - */ - if (nr_swapfiles > 1) - sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL); for (i = 0; i < n; ++i) { - p = swap_info_get_cont(entries[i], prev); - if (p) - swap_entry_range_free(p, entries[i], 1); - prev = p; + si = _swap_info_get(entries[i]); + if (si) { + ci = lock_cluster(si, swp_offset(entries[i])); + swap_entry_range_free(si, ci, entries[i], 1); + unlock_cluster(ci); + } } - if (p) - spin_unlock(&p->lock); } int __swap_count(swp_entry_t entry) @@ -1733,9 +1615,9 @@ int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) struct swap_cluster_info *ci; int count; - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); count = swap_count(si->swap_map[offset]); - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); return count; } @@ -1758,7 +1640,7 @@ int swp_swapcount(swp_entry_t entry) offset = swp_offset(entry); - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); count = swap_count(si->swap_map[offset]); if (!(count & COUNT_CONTINUED)) @@ -1781,7 +1663,7 @@ int swp_swapcount(swp_entry_t entry) n *= (SWAP_CONT_MAX + 1); } while (tmp_count & COUNT_CONTINUED); out: - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); return count; } @@ -1796,8 +1678,8 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, int i; bool ret = false; - ci = lock_cluster_or_swap_info(si, offset); - if (!ci || nr_pages == 1) { + ci = lock_cluster(si, offset); + if (nr_pages == 1) { if (swap_count(map[roffset])) ret = true; goto unlock_out; @@ -1809,7 +1691,7 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, } } unlock_out: - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); return ret; } @@ -1963,10 +1845,8 @@ swp_entry_t get_swap_page_of_type(int type) goto fail; /* This is called for allocating swap entry, not cache */ - spin_lock(&si->lock); if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0)) 
atomic_long_dec(&nr_swap_pages); - spin_unlock(&si->lock); fail: return entry; } @@ -2057,7 +1937,7 @@ unsigned int count_swap_pages(int type, int free) if (sis->flags & SWP_WRITEOK) { n = sis->pages; if (free) - n -= sis->inuse_pages; + n -= swap_usage_in_pages(sis); } spin_unlock(&sis->lock); } @@ -2392,7 +2272,7 @@ static int try_to_unuse(unsigned int type) swp_entry_t entry; unsigned int i; - if (!READ_ONCE(si->inuse_pages)) + if (!swap_usage_in_pages(si)) goto success; retry: @@ -2405,7 +2285,7 @@ retry: spin_lock(&mmlist_lock); p = &init_mm.mmlist; - while (READ_ONCE(si->inuse_pages) && + while (swap_usage_in_pages(si) && !signal_pending(current) && (p = p->next) != &init_mm.mmlist) { @@ -2433,7 +2313,7 @@ retry: mmput(prev_mm); i = 0; - while (READ_ONCE(si->inuse_pages) && + while (swap_usage_in_pages(si) && !signal_pending(current) && (i = find_next_to_unuse(si, i)) != 0) { @@ -2468,7 +2348,7 @@ retry: * folio_alloc_swap(), temporarily hiding that swap. It's easy * and robust (though cpu-intensive) just to keep retrying. */ - if (READ_ONCE(si->inuse_pages)) { + if (swap_usage_in_pages(si)) { if (!signal_pending(current)) goto retry; return -EINTR; @@ -2477,7 +2357,7 @@ retry: success: /* * Make sure that further cleanups after try_to_unuse() returns happen - * after swap_range_free() reduces si->inuse_pages to 0. + * after swap_range_free() reduces inuse_pages to 0. */ smp_mb(); return 0; @@ -2495,7 +2375,7 @@ static void drain_mmlist(void) unsigned int type; for (type = 0; type < nr_swapfiles; type++) - if (swap_info[type]->inuse_pages) + if (swap_usage_in_pages(swap_info[type])) return; spin_lock(&mmlist_lock); list_for_each_safe(p, next, &init_mm.mmlist) @@ -2674,7 +2554,6 @@ static void setup_swap_info(struct swap_info_struct *si, int prio, static void _enable_swap_info(struct swap_info_struct *si) { - si->flags |= SWP_WRITEOK; atomic_long_add(si->pages, &nr_swap_pages); total_swap_pages += si->pages; @@ -2691,9 +2570,8 @@ static void _enable_swap_info(struct swap_info_struct *si) */ plist_add(&si->list, &swap_active_head); - /* add to available list iff swap device is not full */ - if (si->highest_bit) - add_to_avail_list(si); + /* Add back to available list */ + add_to_avail_list(si, true); } static void enable_swap_info(struct swap_info_struct *si, int prio, @@ -2742,6 +2620,25 @@ bool has_usable_swap(void) return ret; } +/* + * Called after clearing SWP_WRITEOK, ensures cluster_alloc_range + * sees the updated flags, so there will be no more allocations.
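+ * + * Note: an allocator racing with swapoff still holds its cluster lock, so + * cycling through every cluster lock below is enough to make the cleared + * SWP_WRITEOK visible to all of them before swapoff proceeds.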
+ */ +static void wait_for_allocation(struct swap_info_struct *si) +{ + unsigned long offset; + unsigned long end = ALIGN(si->max, SWAPFILE_CLUSTER); + struct swap_cluster_info *ci; + + BUG_ON(si->flags & SWP_WRITEOK); + + for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) { + ci = lock_cluster(si, offset); + unlock_cluster(ci); + } +} + SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; @@ -2791,7 +2688,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) goto out_dput; } spin_lock(&p->lock); - del_from_avail_list(p); + del_from_avail_list(p, true); if (p->prio < 0) { struct swap_info_struct *si = p; int nid; @@ -2809,10 +2706,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) plist_del(&p->list, &swap_active_head); atomic_long_sub(p->pages, &nr_swap_pages); total_swap_pages -= p->pages; - p->flags &= ~SWP_WRITEOK; spin_unlock(&p->lock); spin_unlock(&swap_lock); + wait_for_allocation(p); + disable_swap_slots_cache_lock(); set_current_oom_origin(); @@ -2855,16 +2753,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_lock(&p->lock); drain_mmlist(); - /* wait for anyone still in scan_swap_map_slots */ - p->highest_bit = 0; /* cuts scans short */ - while (p->flags >= SWP_SCANNING) { - spin_unlock(&p->lock); - spin_unlock(&swap_lock); - schedule_timeout_uninterruptible(1); - spin_lock(&swap_lock); - spin_lock(&p->lock); - } - swap_file = p->swap_file; p->swap_file = NULL; p->max = 0; @@ -2881,8 +2769,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mutex_unlock(&swapon_mutex); free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; - free_percpu(p->cluster_next_cpu); - p->cluster_next_cpu = NULL; + kfree(p->global_cluster); + p->global_cluster = NULL; vfree(swap_map); kvfree(zeromap); kvfree(cluster_info); @@ -2992,7 +2880,7 @@ static int swap_show(struct seq_file *swap, void *v) } bytes = K(si->pages); - inuse = K(READ_ONCE(si->inuse_pages)); + inuse = K(swap_usage_in_pages(si)); file = si->swap_file; len = seq_file_path(swap, file, " \t\n\\"); @@ -3109,6 +2997,7 @@ static struct swap_info_struct *alloc_swap_info(void) } spin_lock_init(&p->lock); spin_lock_init(&p->cont_lock); + atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT); init_completion(&p->comp); return p; @@ -3193,10 +3082,6 @@ static unsigned long read_swap_header(struct swap_info_struct *si, return 0; } - si->lowest_bit = 1; - si->cluster_next = 1; - si->cluster_nr = 0; - maxpages = swapfile_maximum_size; last_page = swap_header->info.last_page; if (!last_page) { @@ -3213,7 +3098,6 @@ static unsigned long read_swap_header(struct swap_info_struct *si, if ((unsigned int)maxpages == 0) maxpages = UINT_MAX; } - si->highest_bit = maxpages - 1; if (!maxpages) return 0; @@ -3281,7 +3165,6 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, unsigned long maxpages) { unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); - unsigned long col = si->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS; struct swap_cluster_info *cluster_info; unsigned long i, j, k, idx; int cpu, err = -ENOMEM; @@ -3293,25 +3176,24 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, for (i = 0; i < nr_clusters; i++) spin_lock_init(&cluster_info[i].lock); - si->cluster_next_cpu = alloc_percpu(unsigned int); - if (!si->cluster_next_cpu) - goto err_free; + if (si->flags & SWP_SOLIDSTATE) { + si->percpu_cluster = alloc_percpu(struct percpu_cluster);
+ if (!si->percpu_cluster) + goto err_free; - /* Random start position to help with wear leveling */ - for_each_possible_cpu(cpu) - per_cpu(*si->cluster_next_cpu, cpu) = - get_random_u32_inclusive(1, si->highest_bit); + for_each_possible_cpu(cpu) { + struct percpu_cluster *cluster; - si->percpu_cluster = alloc_percpu(struct percpu_cluster); - if (!si->percpu_cluster) - goto err_free; - - for_each_possible_cpu(cpu) { - struct percpu_cluster *cluster; - - cluster = per_cpu_ptr(si->percpu_cluster, cpu); + cluster = per_cpu_ptr(si->percpu_cluster, cpu); + for (i = 0; i < SWAP_NR_ORDERS; i++) + cluster->next[i] = SWAP_ENTRY_INVALID; + local_lock_init(&cluster->lock); + } + } else { + si->global_cluster = kmalloc(sizeof(*si->global_cluster), GFP_KERNEL); + if (!si->global_cluster) + goto err_free; for (i = 0; i < SWAP_NR_ORDERS; i++) - cluster->next[i] = SWAP_NEXT_INVALID; + si->global_cluster->next[i] = SWAP_ENTRY_INVALID; + spin_lock_init(&si->global_cluster_lock); } /* @@ -3335,7 +3217,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, for (i = 0; i < SWAP_NR_ORDERS; i++) { INIT_LIST_HEAD(&si->nonfull_clusters[i]); INIT_LIST_HEAD(&si->frag_clusters[i]); - si->frag_cluster_nr[i] = 0; + atomic_long_set(&si->frag_cluster_nr[i], 0); } /* @@ -3343,7 +3225,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, * sharing same address space. */ for (k = 0; k < SWAP_CLUSTER_COLS; k++) { - j = (k + col) % SWAP_CLUSTER_COLS; + j = k % SWAP_CLUSTER_COLS; for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { struct swap_cluster_info *ci; idx = i * SWAP_CLUSTER_COLS + j; @@ -3493,18 +3375,18 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (si->bdev && bdev_nonrot(si->bdev)) { si->flags |= SWP_SOLIDSTATE; - - cluster_info = setup_clusters(si, swap_header, maxpages); - if (IS_ERR(cluster_info)) { - error = PTR_ERR(cluster_info); - cluster_info = NULL; - goto bad_swap_unlock_inode; - } } else { atomic_inc(&nr_rotate_swap); inced_nr_rotate_swap = true; } + cluster_info = setup_clusters(si, swap_header, maxpages); + if (IS_ERR(cluster_info)) { + error = PTR_ERR(cluster_info); + cluster_info = NULL; + goto bad_swap_unlock_inode; + } + if ((swap_flags & SWAP_FLAG_DISCARD) && si->bdev && bdev_max_discard_sectors(si->bdev)) { /* @@ -3585,8 +3467,8 @@ bad_swap_unlock_inode: bad_swap: free_percpu(si->percpu_cluster); si->percpu_cluster = NULL; - free_percpu(si->cluster_next_cpu); - si->cluster_next_cpu = NULL; + kfree(si->global_cluster); + si->global_cluster = NULL; inode = NULL; destroy_swap_extents(si); swap_cgroup_swapoff(si->type); @@ -3623,7 +3505,7 @@ void si_swapinfo(struct sysinfo *val) struct swap_info_struct *si = swap_info[type]; if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) - nr_to_be_unused += READ_ONCE(si->inuse_pages); + nr_to_be_unused += swap_usage_in_pages(si); } val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; val->totalswap = total_swap_pages + nr_to_be_unused; @@ -3655,7 +3537,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) offset = swp_offset(entry); VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); VM_WARN_ON(usage == 1 && nr > 1); - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); err = 0; for (i = 0; i < nr; i++) { @@ -3710,7 +3592,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) } unlock_out: - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); return err; } @@ -3819,7 +3701,6 @@ int
add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) */ goto outer; } - spin_lock(&si->lock); offset = swp_offset(entry); @@ -3884,7 +3765,6 @@ out_unlock_cont: spin_unlock(&si->cont_lock); out: unlock_cluster(ci); - spin_unlock(&si->lock); put_swap_device(si); outer: if (page) diff --git a/mm/truncate.c b/mm/truncate.c index 7c304d2f0052..e2e115adfbc5 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -525,6 +525,15 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, } EXPORT_SYMBOL(invalidate_mapping_pages); +static int folio_launder(struct address_space *mapping, struct folio *folio) +{ + if (!folio_test_dirty(folio)) + return 0; + if (folio->mapping != mapping || mapping->a_ops->launder_folio == NULL) + return 0; + return mapping->a_ops->launder_folio(folio); +} + /* * This is like mapping_evict_folio(), except it ignores the folio's * refcount. We do this because invalidate_inode_pages2() needs stronger @@ -532,14 +541,26 @@ EXPORT_SYMBOL(invalidate_mapping_pages); * shrink_folio_list() has a temp ref on them, or because they're transiently * sitting in the folio_add_lru() caches. */ -static int invalidate_complete_folio2(struct address_space *mapping, - struct folio *folio) +int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, + gfp_t gfp) { - if (folio->mapping != mapping) - return 0; + int ret; - if (!filemap_release_folio(folio, GFP_KERNEL)) + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + + if (folio_test_dirty(folio)) return 0; + if (folio_mapped(folio)) + unmap_mapping_folio(folio); + BUG_ON(folio_mapped(folio)); + + ret = folio_launder(mapping, folio); + if (ret) + return ret; + if (folio->mapping != mapping) + return -EBUSY; + if (!filemap_release_folio(folio, gfp)) + return -EBUSY; spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); @@ -558,16 +579,7 @@ static int invalidate_complete_folio2(struct address_space *mapping, failed: xa_unlock_irq(&mapping->i_pages); spin_unlock(&mapping->host->i_lock); - return 0; -} - -static int folio_launder(struct address_space *mapping, struct folio *folio) -{ - if (!folio_test_dirty(folio)) - return 0; - if (folio->mapping != mapping || mapping->a_ops->launder_folio == NULL) - return 0; - return mapping->a_ops->launder_folio(folio); + return -EBUSY; } /** @@ -631,16 +643,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, } VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio); folio_wait_writeback(folio); - - if (folio_mapped(folio)) - unmap_mapping_folio(folio); - BUG_ON(folio_mapped(folio)); - - ret2 = folio_launder(mapping, folio); - if (ret2 == 0) { - if (!invalidate_complete_folio2(mapping, folio)) - ret2 = -EBUSY; - } + ret2 = folio_unmap_invalidate(mapping, folio, GFP_KERNEL); if (ret2 < 0) ret = ret2; folio_unlock(folio); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 60a0be33766f..411a663932c4 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -84,16 +84,9 @@ static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm, mmap_read_lock(mm); vma = find_vma_and_prepare_anon(mm, address); - if (!IS_ERR(vma)) { - /* - * We cannot use vma_start_read() as it may fail due to - * false locked (see comment in vma_start_read()). We - * can avoid that by directly locking vm_lock under - * mmap_lock, which guarantees that nobody can lock the - * vma for write (vma_start_write()) under us. 
- */ - down_read(&vma->vm_lock->lock); - } + if (!IS_ERR(vma)) + if (!vma_start_read_locked(vma)) + vma = ERR_PTR(-EAGAIN); mmap_read_unlock(mm); return vma; @@ -1020,6 +1013,14 @@ void double_pt_unlock(spinlock_t *ptl1, __release(ptl2); } +static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte, + pte_t orig_dst_pte, pte_t orig_src_pte, + pmd_t *dst_pmd, pmd_t dst_pmdval) +{ + return pte_same(ptep_get(src_pte), orig_src_pte) && + pte_same(ptep_get(dst_pte), orig_dst_pte) && + pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd)); +} static int move_present_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, @@ -1027,6 +1028,7 @@ static int move_present_pte(struct mm_struct *mm, unsigned long dst_addr, unsigned long src_addr, pte_t *dst_pte, pte_t *src_pte, pte_t orig_dst_pte, pte_t orig_src_pte, + pmd_t *dst_pmd, pmd_t dst_pmdval, spinlock_t *dst_ptl, spinlock_t *src_ptl, struct folio *src_folio) { @@ -1034,8 +1036,8 @@ static int move_present_pte(struct mm_struct *mm, double_pt_lock(dst_ptl, src_ptl); - if (!pte_same(ptep_get(src_pte), orig_src_pte) || - !pte_same(ptep_get(dst_pte), orig_dst_pte)) { + if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, + dst_pmd, dst_pmdval)) { err = -EAGAIN; goto out; } @@ -1071,6 +1073,7 @@ static int move_swap_pte(struct mm_struct *mm, unsigned long dst_addr, unsigned long src_addr, pte_t *dst_pte, pte_t *src_pte, pte_t orig_dst_pte, pte_t orig_src_pte, + pmd_t *dst_pmd, pmd_t dst_pmdval, spinlock_t *dst_ptl, spinlock_t *src_ptl) { if (!pte_swp_exclusive(orig_src_pte)) @@ -1078,8 +1081,8 @@ static int move_swap_pte(struct mm_struct *mm, double_pt_lock(dst_ptl, src_ptl); - if (!pte_same(ptep_get(src_pte), orig_src_pte) || - !pte_same(ptep_get(dst_pte), orig_dst_pte)) { + if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, + dst_pmd, dst_pmdval)) { double_pt_unlock(dst_ptl, src_ptl); return -EAGAIN; } @@ -1097,13 +1100,14 @@ static int move_zeropage_pte(struct mm_struct *mm, unsigned long dst_addr, unsigned long src_addr, pte_t *dst_pte, pte_t *src_pte, pte_t orig_dst_pte, pte_t orig_src_pte, + pmd_t *dst_pmd, pmd_t dst_pmdval, spinlock_t *dst_ptl, spinlock_t *src_ptl) { pte_t zero_pte; double_pt_lock(dst_ptl, src_ptl); - if (!pte_same(ptep_get(src_pte), orig_src_pte) || - !pte_same(ptep_get(dst_pte), orig_dst_pte)) { + if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, + dst_pmd, dst_pmdval)) { double_pt_unlock(dst_ptl, src_ptl); return -EAGAIN; } @@ -1136,6 +1140,7 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pte_t *src_pte = NULL; pte_t *dst_pte = NULL; pmd_t dummy_pmdval; + pmd_t dst_pmdval; struct folio *src_folio = NULL; struct anon_vma *src_anon_vma = NULL; struct mmu_notifier_range range; @@ -1148,11 +1153,11 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, retry: /* * Use the maywrite version to indicate that dst_pte will be modified, - * but since we will use pte_same() to detect the change of the pte - * entry, there is no need to get pmdval, so just pass a dummy variable - * to it. + * since dst_pte needs to be none, the subsequent pte_same() check + * cannot prevent the dst_pte page from being freed concurrently, so we + * also need to obtain dst_pmdval and recheck pmd_same() later.
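+ * (For example, with the destination PTE none, a concurrent retraction of + * the page table, say by khugepaged, could free the dst_pte page; pte_same() + * would still pass on the reused none entry, and only the pmd_same() recheck + * catches it.)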
*/ - dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dummy_pmdval, + dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dst_pmdval, &dst_ptl); /* Retry if a huge pmd materialized from under us */ @@ -1161,7 +1166,11 @@ retry: goto out; } - /* same as dst_pte */ + /* + * Unlike dst_pte, the subsequent pte_same() check can ensure the + * stability of the src_pte page, so there is no need to get pmdval, + * just pass a dummy variable to it. + */ src_pte = pte_offset_map_rw_nolock(mm, src_pmd, src_addr, &dummy_pmdval, &src_ptl); @@ -1177,8 +1186,8 @@ retry: } /* Sanity checks before the operation */ - if (WARN_ON_ONCE(pmd_none(*dst_pmd)) || WARN_ON_ONCE(pmd_none(*src_pmd)) || - WARN_ON_ONCE(pmd_trans_huge(*dst_pmd)) || WARN_ON_ONCE(pmd_trans_huge(*src_pmd))) { + if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) || + pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) { err = -EINVAL; goto out; } @@ -1213,7 +1222,7 @@ retry: err = move_zeropage_pte(mm, dst_vma, src_vma, dst_addr, src_addr, dst_pte, src_pte, orig_dst_pte, orig_src_pte, - dst_ptl, src_ptl); + dst_pmd, dst_pmdval, dst_ptl, src_ptl); goto out; } @@ -1303,8 +1312,8 @@ retry: err = move_present_pte(mm, dst_vma, src_vma, dst_addr, src_addr, dst_pte, src_pte, - orig_dst_pte, orig_src_pte, - dst_ptl, src_ptl, src_folio); + orig_dst_pte, orig_src_pte, dst_pmd, + dst_pmdval, dst_ptl, src_ptl, src_folio); } else { entry = pte_to_swp_entry(orig_src_pte); if (non_swap_entry(entry)) { @@ -1319,10 +1328,9 @@ retry: goto out; } - err = move_swap_pte(mm, dst_addr, src_addr, - dst_pte, src_pte, - orig_dst_pte, orig_src_pte, - dst_ptl, src_ptl); + err = move_swap_pte(mm, dst_addr, src_addr, dst_pte, src_pte, + orig_dst_pte, orig_src_pte, dst_pmd, + dst_pmdval, dst_ptl, src_ptl); } out: @@ -1476,14 +1484,17 @@ static int uffd_move_lock(struct mm_struct *mm, mmap_read_lock(mm); err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); if (!err) { - /* - * See comment in uffd_lock_vma() as to why not using - * vma_start_read() here. - */ - down_read(&(*dst_vmap)->vm_lock->lock); - if (*dst_vmap != *src_vmap) - down_read_nested(&(*src_vmap)->vm_lock->lock, - SINGLE_DEPTH_NESTING); + if (vma_start_read_locked(*dst_vmap)) { + if (*dst_vmap != *src_vmap) { + if (!vma_start_read_locked_nested(*src_vmap, + SINGLE_DEPTH_NESTING)) { + vma_end_read(*dst_vmap); + err = -EAGAIN; + } + } + } else { + err = -EAGAIN; + } } mmap_read_unlock(mm); return err; diff --git a/mm/util.c b/mm/util.c index 60aa40f612b8..b6b9684a1438 100644 --- a/mm/util.c +++ b/mm/util.c @@ -582,6 +582,23 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, return ret; } +/* + * Perform a userland memory mapping into the current process address space. See + * the comment for do_mmap() for more details on this operation in general. + * + * This differs from do_mmap() in that: + * + * a. An offset parameter is provided rather than pgoff, which is both checked + * for overflow and page alignment. + * b. mmap locking is performed on the caller's behalf. + * c. Userfaultfd unmap events and memory population are handled. + * + * This means that this function performs essentially the same work as if + * userland were invoking mmap (2). + * + * Returns either an error, or the address at which the requested mapping has + * been performed. 
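+ * + * A minimal illustrative call (not taken from this patch), mapping the first + * len bytes of a file read-only and checking the result: + * + * unsigned long addr = vm_mmap(file, 0, len, PROT_READ, MAP_SHARED, 0); + * if (IS_ERR_VALUE(addr)) + * return addr;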
+ */ unsigned long vm_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long offset) diff --git a/mm/vma.c b/mm/vma.c index bb2119e5a0d0..0a5158d611e3 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -202,6 +202,38 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, flush_dcache_mmap_unlock(mapping); } +/* + * vma has some anon_vma assigned, and is already inserted on that + * anon_vma's interval trees. + * + * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the + * vma must be removed from the anon_vma's interval trees using + * anon_vma_interval_tree_pre_update_vma(). + * + * After the update, the vma will be reinserted using + * anon_vma_interval_tree_post_update_vma(). + * + * The entire update must be protected by exclusive mmap_lock and by + * the root anon_vma's mutex. + */ +static void +anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc; + + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); +} + +static void +anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc; + + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); +} + /* * vma_prepare() - Helper function for handling locking VMAs prior to altering * @vp: The initialized vma_prepare struct @@ -295,7 +327,7 @@ static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi, if (vp->remove) { again: - vma_mark_detached(vp->remove, true); + vma_mark_detached(vp->remove); if (vp->file) { uprobe_munmap(vp->remove, vp->remove->vm_start, vp->remove->vm_end); @@ -374,17 +406,14 @@ static bool can_vma_merge_right(struct vma_merge_struct *vmg, /* * Close a vm structure and free it. */ -void remove_vma(struct vm_area_struct *vma, bool unreachable) +void remove_vma(struct vm_area_struct *vma) { might_sleep(); vma_close(vma); if (vma->vm_file) fput(vma->vm_file); mpol_put(vma_policy(vma)); - if (unreachable) - __vm_area_free(vma); - else - vm_area_free(vma); + vm_area_free(vma); } /* @@ -398,7 +427,6 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; - lru_add_drain(); tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end, @@ -415,8 +443,9 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, * has already been checked or doesn't make sense to fail. * VMA Iterator will point to the original VMA. */ -static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long addr, int new_below) +static __must_check int +__split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long addr, int new_below) { struct vma_prepare vp; struct vm_area_struct *new; @@ -510,38 +539,6 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, return __split_vma(vmi, vma, addr, new_below); } -/* - * vma has some anon_vma assigned, and is already inserted on that - * anon_vma's interval trees. - * - * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the - * vma must be removed from the anon_vma's interval trees using - * anon_vma_interval_tree_pre_update_vma(). - * - * After the update, the vma will be reinserted using - * anon_vma_interval_tree_post_update_vma(). 
- * - * The entire update must be protected by exclusive mmap_lock and by - * the root anon_vma's mutex. - */ -void -anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) -{ - struct anon_vma_chain *avc; - - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) - anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); -} - -void -anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) -{ - struct anon_vma_chain *avc; - - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) - anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); -} - /* * dup_anon_vma() - Helper function to duplicate anon_vma * @dst: The destination VMA @@ -660,14 +657,14 @@ static int commit_merge(struct vma_merge_struct *vmg, vma_set_range(vmg->vma, vmg->start, vmg->end, vmg->pgoff); if (expanded) - vma_iter_store(vmg->vmi, vmg->vma); + vma_iter_store_attached(vmg->vmi, vmg->vma); if (adj_start) { adjust->vm_start += adj_start; adjust->vm_pgoff += PHYS_PFN(adj_start); if (adj_start < 0) { WARN_ON(expanded); - vma_iter_store(vmg->vmi, adjust); + vma_iter_store_attached(vmg->vmi, adjust); } } @@ -710,7 +707,8 @@ static bool can_merge_remove_vma(struct vm_area_struct *vma) * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. * - vmi must be positioned within [@vmg->vma->vm_start, @vmg->vma->vm_end). */ -static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct *vmg) +static __must_check struct vm_area_struct *vma_merge_existing_range( + struct vma_merge_struct *vmg) { struct vm_area_struct *vma = vmg->vma; struct vm_area_struct *prev = vmg->prev; @@ -728,19 +726,20 @@ static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct * bool expanded; mmap_assert_write_locked(vmg->mm); - VM_WARN_ON(!vma); /* We are modifying a VMA, so caller must specify. */ - VM_WARN_ON(vmg->next); /* We set this. */ - VM_WARN_ON(prev && start <= prev->vm_start); - VM_WARN_ON(start >= end); + VM_WARN_ON_VMG(!vma, vmg); /* We are modifying a VMA, so caller must specify. */ + VM_WARN_ON_VMG(vmg->next, vmg); /* We set this. */ + VM_WARN_ON_VMG(prev && start <= prev->vm_start, vmg); + VM_WARN_ON_VMG(start >= end, vmg); + /* * If vma == prev, then we are offset into a VMA. Otherwise, if we are * not, we must span a portion of the VMA. */ - VM_WARN_ON(vma && ((vma != prev && vmg->start != vma->vm_start) || - vmg->end > vma->vm_end)); + VM_WARN_ON_VMG(vma && ((vma != prev && vmg->start != vma->vm_start) || + vmg->end > vma->vm_end), vmg); /* The vmi must be positioned within vmg->vma. */ - VM_WARN_ON(vma && !(vma_iter_addr(vmg->vmi) >= vma->vm_start && - vma_iter_addr(vmg->vmi) < vma->vm_end)); + VM_WARN_ON_VMG(vma && !(vma_iter_addr(vmg->vmi) >= vma->vm_start && + vma_iter_addr(vmg->vmi) < vma->vm_end), vmg); vmg->state = VMA_MERGE_NOMERGE; @@ -857,9 +856,9 @@ static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct * pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); - VM_WARN_ON(!merge_right); + VM_WARN_ON_VMG(!merge_right, vmg); /* If we are offset into a VMA, then prev must be vma. 
*/ - VM_WARN_ON(vmg->start > vma->vm_start && prev && vma != prev); + VM_WARN_ON_VMG(vmg->start > vma->vm_start && prev && vma != prev, vmg); if (merge_will_delete_vma) { vmg->vma = next; @@ -971,9 +970,9 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) bool just_expand = vmg->merge_flags & VMG_FLAG_JUST_EXPAND; mmap_assert_write_locked(vmg->mm); - VM_WARN_ON(vmg->vma); + VM_WARN_ON_VMG(vmg->vma, vmg); /* vmi must point at or before the gap. */ - VM_WARN_ON(vma_iter_addr(vmg->vmi) > end); + VM_WARN_ON_VMG(vma_iter_addr(vmg->vmi) > end, vmg); vmg->state = VMA_MERGE_NOMERGE; @@ -1055,7 +1054,7 @@ int vma_expand(struct vma_merge_struct *vmg) remove_next = true; /* This should already have been checked by this point. */ - VM_WARN_ON(!can_merge_remove_vma(next)); + VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg); vma_start_write(next); ret = dup_anon_vma(vma, next, &anon_dup); if (ret) @@ -1063,10 +1062,10 @@ int vma_expand(struct vma_merge_struct *vmg) } /* Not merging but overwriting any part of next is not handled. */ - VM_WARN_ON(next && !remove_next && - next != vma && vmg->end > next->vm_start); + VM_WARN_ON_VMG(next && !remove_next && + next != vma && vmg->end > next->vm_start, vmg); /* Only handles expanding */ - VM_WARN_ON(vma->vm_start < vmg->start || vma->vm_end > vmg->end); + VM_WARN_ON_VMG(vma->vm_start < vmg->start || vma->vm_end > vmg->end, vmg); if (commit_merge(vmg, NULL, remove_next ? next : NULL, NULL, 0, true)) goto nomem; @@ -1130,7 +1129,6 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms, * were isolated before we downgraded mmap_lock. */ mas_set(mas_detach, 1); - lru_add_drain(); tlb_gather_mmu(&tlb, vms->vma->vm_mm); update_hiwater_rss(vms->vma->vm_mm); unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end, @@ -1198,7 +1196,7 @@ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, /* Remove and clean up vmas */ mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) - remove_vma(vma, /* unreachable = */ false); + remove_vma(vma); vm_unacct_memory(vms->nr_accounted); validate_mm(mm); @@ -1220,7 +1218,7 @@ static void reattach_vmas(struct ma_state *mas_detach) mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) - vma_mark_detached(vma, false); + vma_mark_attached(vma); __mt_destroy(mas_detach->tree); } @@ -1295,7 +1293,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, if (error) goto munmap_gather_failed; - vma_mark_detached(next, true); + vma_mark_detached(next); nrpages = vma_pages(next); vms->nr_pages += nrpages; @@ -2430,7 +2428,7 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) vma_set_page_prot(vma); } -unsigned long __mmap_region(struct file *file, unsigned long addr, +static unsigned long __mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) { @@ -2481,3 +2479,476 @@ abort_munmap: vms_abort_munmap_vmas(&map.vms, &map.mas_detach); return error; } + +/** + * mmap_region() - Actually perform the userland mapping of a VMA into + * current->mm with known, aligned and overflow-checked @addr and @len, and + * correctly determined VMA flags @vm_flags and page offset @pgoff. + * + * This is an internal memory management function, and should not be used + * directly. + * + * The caller must write-lock current->mm->mmap_lock. + * + * @file: If a file-backed mapping, a pointer to the struct file describing the + * file to be mapped, otherwise NULL. 
+ * @addr: The page-aligned address at which to perform the mapping. + * @len: The page-aligned, non-zero, length of the mapping. + * @vm_flags: The VMA flags which should be applied to the mapping. + * @pgoff: If @file is specified, the page offset into the file, if not then + * the virtual page offset in memory of the anonymous mapping. + * @uf: Optionally, a pointer to a list head used for tracking userfaultfd unmap + * events. + * + * Returns: Either an error, or the address at which the requested mapping has + * been performed. + */ +unsigned long mmap_region(struct file *file, unsigned long addr, + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, + struct list_head *uf) +{ + unsigned long ret; + bool writable_file_mapping = false; + + mmap_assert_write_locked(current->mm); + + /* Check to see if MDWE is applicable. */ + if (map_deny_write_exec(vm_flags, vm_flags)) + return -EACCES; + + /* Allow architectures to sanity-check the vm_flags. */ + if (!arch_validate_flags(vm_flags)) + return -EINVAL; + + /* Map writable and ensure this isn't a sealed memfd. */ + if (file && is_shared_maywrite(vm_flags)) { + int error = mapping_map_writable(file->f_mapping); + + if (error) + return error; + writable_file_mapping = true; + } + + ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf); + + /* Clear our write mapping regardless of error. */ + if (writable_file_mapping) + mapping_unmap_writable(file->f_mapping); + + validate_mm(current->mm); + return ret; +} + +/* + * do_brk_flags() - Increase the brk vma if the flags match. + * @vmi: The vma iterator + * @addr: The start address + * @len: The length of the increase + * @vma: The vma, + * @flags: The VMA Flags + * + * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags + * do not match then create a new anonymous VMA. Eventually we may be able to + * do some brk-specific accounting here. + */ +int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long addr, unsigned long len, unsigned long flags) +{ + struct mm_struct *mm = current->mm; + + /* + * Check against address space limits by the changed size + * Note: This happens *after* clearing old mappings in some code paths. + */ + flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) + return -ENOMEM; + + if (mm->map_count > sysctl_max_map_count) + return -ENOMEM; + + if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) + return -ENOMEM; + + /* + * Expand the existing vma if possible; Note that singular lists do not + * occur after forking, so the expand will only happen on new VMAs. + */ + if (vma && vma->vm_end == addr) { + VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr)); + + vmg.prev = vma; + /* vmi is positioned at prev, which this mode expects. 
*/ + vmg.merge_flags = VMG_FLAG_JUST_EXPAND; + + if (vma_merge_new_range(&vmg)) + goto out; + else if (vmg_nomem(&vmg)) + goto unacct_fail; + } + + if (vma) + vma_iter_next_range(vmi); + /* create a vma struct for an anonymous mapping */ + vma = vm_area_alloc(mm); + if (!vma) + goto unacct_fail; + + vma_set_anonymous(vma); + vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT); + vm_flags_init(vma, flags); + vma->vm_page_prot = vm_get_page_prot(flags); + vma_start_write(vma); + if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) + goto mas_store_fail; + + mm->map_count++; + validate_mm(mm); + ksm_add_vma(vma); +out: + perf_event_mmap(vma); + mm->total_vm += len >> PAGE_SHIFT; + mm->data_vm += len >> PAGE_SHIFT; + if (flags & VM_LOCKED) + mm->locked_vm += (len >> PAGE_SHIFT); + vm_flags_set(vma, VM_SOFTDIRTY); + return 0; + +mas_store_fail: + vm_area_free(vma); +unacct_fail: + vm_unacct_memory(len >> PAGE_SHIFT); + return -ENOMEM; +} + +/** + * unmapped_area() - Find an area between the low_limit and the high_limit with + * the correct alignment and offset, all from @info. Note: current->mm is used + * for the search. + * + * @info: The unmapped area information including the range [low_limit - + * high_limit), the alignment offset and mask. + * + * Return: A memory address or -ENOMEM. + */ +unsigned long unmapped_area(struct vm_unmapped_area_info *info) +{ + unsigned long length, gap; + unsigned long low_limit, high_limit; + struct vm_area_struct *tmp; + VMA_ITERATOR(vmi, current->mm, 0); + + /* Adjust search length to account for worst case alignment overhead */ + length = info->length + info->align_mask + info->start_gap; + if (length < info->length) + return -ENOMEM; + + low_limit = info->low_limit; + if (low_limit < mmap_min_addr) + low_limit = mmap_min_addr; + high_limit = info->high_limit; +retry: + if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length)) + return -ENOMEM; + + /* + * Adjust for the gap first so it doesn't interfere with the + * later alignment. The first step is the minimum needed to + * fulfill the start gap, the next step is the minimum to align + * that. It is the minimum needed to fulfill both. + */ + gap = vma_iter_addr(&vmi) + info->start_gap; + gap += (info->align_offset - gap) & info->align_mask; + tmp = vma_next(&vmi); + if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ + if (vm_start_gap(tmp) < gap + length - 1) { + low_limit = tmp->vm_end; + vma_iter_reset(&vmi); + goto retry; + } + } else { + tmp = vma_prev(&vmi); + if (tmp && vm_end_gap(tmp) > gap) { + low_limit = vm_end_gap(tmp); + vma_iter_reset(&vmi); + goto retry; + } + } + + return gap; +} + +/** + * unmapped_area_topdown() - Find an area between the low_limit and the + * high_limit with the correct alignment and offset at the highest available + * address, all from @info. Note: current->mm is used for the search. + * + * @info: The unmapped area information including the range [low_limit - + * high_limit), the alignment offset and mask. + * + * Return: A memory address or -ENOMEM.
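+ * + * Works like unmapped_area(), except that the gap is aligned downward from + * the end of the candidate range, and the search retries with tightened + * limits when the chosen gap would collide with a stack guard region + * (VM_STARTGAP_FLAGS).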
+ */ +unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) +{ + unsigned long length, gap, gap_end; + unsigned long low_limit, high_limit; + struct vm_area_struct *tmp; + VMA_ITERATOR(vmi, current->mm, 0); + + /* Adjust search length to account for worst case alignment overhead */ + length = info->length + info->align_mask + info->start_gap; + if (length < info->length) + return -ENOMEM; + + low_limit = info->low_limit; + if (low_limit < mmap_min_addr) + low_limit = mmap_min_addr; + high_limit = info->high_limit; +retry: + if (vma_iter_area_highest(&vmi, low_limit, high_limit, length)) + return -ENOMEM; + + gap = vma_iter_end(&vmi) - info->length; + gap -= (gap - info->align_offset) & info->align_mask; + gap_end = vma_iter_end(&vmi); + tmp = vma_next(&vmi); + if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ + if (vm_start_gap(tmp) < gap_end) { + high_limit = vm_start_gap(tmp); + vma_iter_reset(&vmi); + goto retry; + } + } else { + tmp = vma_prev(&vmi); + if (tmp && vm_end_gap(tmp) > gap) { + high_limit = tmp->vm_start; + vma_iter_reset(&vmi); + goto retry; + } + } + + return gap; +} + +/* + * Verify that the stack growth is acceptable and + * update accounting. This is shared with both the + * grow-up and grow-down cases. + */ +static int acct_stack_growth(struct vm_area_struct *vma, + unsigned long size, unsigned long grow) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long new_start; + + /* address space limit tests */ + if (!may_expand_vm(mm, vma->vm_flags, grow)) + return -ENOMEM; + + /* Stack limit test */ + if (size > rlimit(RLIMIT_STACK)) + return -ENOMEM; + + /* mlock limit tests */ + if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT)) + return -ENOMEM; + + /* Check to ensure the stack will not grow into a hugetlb-only region */ + new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : + vma->vm_end - size; + if (is_hugepage_only_range(vma->vm_mm, new_start, size)) + return -EFAULT; + + /* + * Overcommit.. This must be the final test, as it will + * update security statistics. + */ + if (security_vm_enough_memory_mm(mm, grow)) + return -ENOMEM; + + return 0; +} + +#if defined(CONFIG_STACK_GROWSUP) +/* + * PA-RISC uses this for its stack. + * vma is the last one with address > vma->vm_end. Have to extend vma. + */ +int expand_upwards(struct vm_area_struct *vma, unsigned long address) +{ + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *next; + unsigned long gap_addr; + int error = 0; + VMA_ITERATOR(vmi, mm, vma->vm_start); + + if (!(vma->vm_flags & VM_GROWSUP)) + return -EFAULT; + + mmap_assert_write_locked(mm); + + /* Guard against exceeding limits of the address space. */ + address &= PAGE_MASK; + if (address >= (TASK_SIZE & PAGE_MASK)) + return -ENOMEM; + address += PAGE_SIZE; + + /* Enforce stack_guard_gap */ + gap_addr = address + stack_guard_gap; + + /* Guard against overflow */ + if (gap_addr < address || gap_addr > TASK_SIZE) + gap_addr = TASK_SIZE; + + next = find_vma_intersection(mm, vma->vm_end, gap_addr); + if (next && vma_is_accessible(next)) { + if (!(next->vm_flags & VM_GROWSUP)) + return -ENOMEM; + /* Check that both stack segments have the same anon_vma? */ + } + + if (next) + vma_iter_prev_range_limit(&vmi, address); + + vma_iter_config(&vmi, vma->vm_start, address); + if (vma_iter_prealloc(&vmi, vma)) + return -ENOMEM; + + /* We must make sure the anon_vma is allocated. 
*/ + if (unlikely(anon_vma_prepare(vma))) { + vma_iter_free(&vmi); + return -ENOMEM; + } + + /* Lock the VMA before expanding to prevent concurrent page faults */ + vma_start_write(vma); + /* We update the anon VMA tree. */ + anon_vma_lock_write(vma->anon_vma); + + /* Somebody else might have raced and expanded it already */ + if (address > vma->vm_end) { + unsigned long size, grow; + + size = address - vma->vm_start; + grow = (address - vma->vm_end) >> PAGE_SHIFT; + + error = -ENOMEM; + if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { + error = acct_stack_growth(vma, size, grow); + if (!error) { + if (vma->vm_flags & VM_LOCKED) + mm->locked_vm += grow; + vm_stat_account(mm, vma->vm_flags, grow); + anon_vma_interval_tree_pre_update_vma(vma); + vma->vm_end = address; + /* Overwrite old entry in mtree. */ + vma_iter_store_attached(&vmi, vma); + anon_vma_interval_tree_post_update_vma(vma); + + perf_event_mmap(vma); + } + } + } + anon_vma_unlock_write(vma->anon_vma); + vma_iter_free(&vmi); + validate_mm(mm); + return error; +} +#endif /* CONFIG_STACK_GROWSUP */ + +/* + * vma is the first one with address < vma->vm_start. Have to extend vma. + * mmap_lock held for writing. + */ +int expand_downwards(struct vm_area_struct *vma, unsigned long address) +{ + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *prev; + int error = 0; + VMA_ITERATOR(vmi, mm, vma->vm_start); + + if (!(vma->vm_flags & VM_GROWSDOWN)) + return -EFAULT; + + mmap_assert_write_locked(mm); + + address &= PAGE_MASK; + if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) + return -EPERM; + + /* Enforce stack_guard_gap */ + prev = vma_prev(&vmi); + /* Check that both stack segments have the same anon_vma? */ + if (prev) { + if (!(prev->vm_flags & VM_GROWSDOWN) && + vma_is_accessible(prev) && + (address - prev->vm_end < stack_guard_gap)) + return -ENOMEM; + } + + if (prev) + vma_iter_next_range_limit(&vmi, vma->vm_start); + + vma_iter_config(&vmi, address, vma->vm_end); + if (vma_iter_prealloc(&vmi, vma)) + return -ENOMEM; + + /* We must make sure the anon_vma is allocated. */ + if (unlikely(anon_vma_prepare(vma))) { + vma_iter_free(&vmi); + return -ENOMEM; + } + + /* Lock the VMA before expanding to prevent concurrent page faults */ + vma_start_write(vma); + /* We update the anon VMA tree. */ + anon_vma_lock_write(vma->anon_vma); + + /* Somebody else might have raced and expanded it already */ + if (address < vma->vm_start) { + unsigned long size, grow; + + size = vma->vm_end - address; + grow = (vma->vm_start - address) >> PAGE_SHIFT; + + error = -ENOMEM; + if (grow <= vma->vm_pgoff) { + error = acct_stack_growth(vma, size, grow); + if (!error) { + if (vma->vm_flags & VM_LOCKED) + mm->locked_vm += grow; + vm_stat_account(mm, vma->vm_flags, grow); + anon_vma_interval_tree_pre_update_vma(vma); + vma->vm_start = address; + vma->vm_pgoff -= grow; + /* Overwrite old entry in mtree. 
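+ * (vma_iter_store_attached() is used here, and for the mirror case in + * expand_upwards(), because the vma is already in the tree and marked + * attached; the helper asserts that rather than re-marking it.)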
*/ + vma_iter_store_attached(&vmi, vma); + anon_vma_interval_tree_post_update_vma(vma); + + perf_event_mmap(vma); + } + } + } + anon_vma_unlock_write(vma->anon_vma); + vma_iter_free(&vmi); + validate_mm(mm); + return error; +} + +int __vm_munmap(unsigned long start, size_t len, bool unlock) +{ + int ret; + struct mm_struct *mm = current->mm; + LIST_HEAD(uf); + VMA_ITERATOR(vmi, mm, start); + + if (mmap_write_lock_killable(mm)) + return -EINTR; + + ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock); + if (ret || !unlock) + mmap_write_unlock(mm); + + userfaultfd_unmap_complete(mm, &uf); + return ret; +} diff --git a/mm/vma.h b/mm/vma.h index 388d34748674..f51005b95b39 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -139,15 +139,10 @@ void validate_mm(struct mm_struct *mm); #define validate_mm(mm) do { } while (0) #endif -/* Required for expand_downwards(). */ -void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma); - -/* Required for expand_downwards(). */ -void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma); - -int vma_expand(struct vma_merge_struct *vmg); -int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff); +__must_check int vma_expand(struct vma_merge_struct *vmg); +__must_check int vma_shrink(struct vma_iterator *vmi, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff); static inline int vma_iter_store_gfp(struct vma_iterator *vmi, struct vm_area_struct *vma, gfp_t gfp) @@ -162,6 +157,7 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi, if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; + vma_mark_attached(vma); return 0; } @@ -174,19 +170,20 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock); -void remove_vma(struct vm_area_struct *vma, bool unreachable); +void remove_vma(struct vm_area_struct *vma); void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next); /* We are about to modify the VMA's flags. */ -struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, +__must_check struct vm_area_struct +*vma_modify_flags(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags); /* We are about to modify the VMA's flags and/or anon_name. */ -struct vm_area_struct +__must_check struct vm_area_struct *vma_modify_flags_name(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, @@ -196,7 +193,7 @@ struct vm_area_struct struct anon_vma_name *new_name); /* We are about to modify the VMA's memory policy. */ -struct vm_area_struct +__must_check struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, @@ -204,7 +201,7 @@ struct vm_area_struct struct mempolicy *new_pol); /* We are about to modify the VMA's flags and/or uffd context. 
*/ -struct vm_area_struct +__must_check struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, @@ -212,11 +209,13 @@ struct vm_area_struct unsigned long new_flags, struct vm_userfaultfd_ctx new_ctx); -struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg); +__must_check struct vm_area_struct +*vma_merge_new_range(struct vma_merge_struct *vmg); -struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, - struct vm_area_struct *vma, - unsigned long delta); +__must_check struct vm_area_struct +*vma_merge_extend(struct vma_iterator *vmi, + struct vm_area_struct *vma, + unsigned long delta); void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb); @@ -243,10 +242,16 @@ bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); int mm_take_all_locks(struct mm_struct *mm); void mm_drop_all_locks(struct mm_struct *mm); -unsigned long __mmap_region(struct file *file, unsigned long addr, +unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf); +int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, + unsigned long addr, unsigned long request, unsigned long flags); + +unsigned long unmapped_area(struct vm_unmapped_area_info *info); +unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info); + static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) { /* @@ -360,9 +365,10 @@ static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi) } /* Store a VMA with preallocated memory */ -static inline void vma_iter_store(struct vma_iterator *vmi, - struct vm_area_struct *vma) +static inline void vma_iter_store_attached(struct vma_iterator *vmi, + struct vm_area_struct *vma) { + vma_assert_attached(vma); #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && @@ -387,6 +393,13 @@ static inline void vma_iter_store(struct vma_iterator *vmi, mas_store_prealloc(&vmi->mas, vma); } +static inline void vma_iter_store(struct vma_iterator *vmi, + struct vm_area_struct *vma) +{ + vma_mark_attached(vma); + vma_iter_store_attached(vmi, vma); +} + static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) { return vmi->mas.index; @@ -472,4 +485,12 @@ static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior) #endif +#if defined(CONFIG_STACK_GROWSUP) +int expand_upwards(struct vm_area_struct *vma, unsigned long address); +#endif + +int expand_downwards(struct vm_area_struct *vma, unsigned long address); + +int __vm_munmap(unsigned long start, size_t len, bool unlock); + #endif /* __MM_VMA_H */ diff --git a/mm/vma_internal.h b/mm/vma_internal.h index fc5f172a36bd..2f05735ff190 100644 --- a/mm/vma_internal.h +++ b/mm/vma_internal.h @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5c88d0e90c20..a6e7acebe9ad 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3562,11 +3562,11 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * but mempolicy wants to alloc memory by interleaving. 
*/ if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) - nr = alloc_pages_bulk_array_mempolicy_noprof(gfp, + nr = alloc_pages_bulk_mempolicy_noprof(gfp, nr_pages_request, pages + nr_allocated); else - nr = alloc_pages_bulk_array_node_noprof(gfp, nid, + nr = alloc_pages_bulk_node_noprof(gfp, nid, nr_pages_request, pages + nr_allocated); diff --git a/mm/vmscan.c b/mm/vmscan.c index 49f801b00d5d..01dce6f26ed3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -862,6 +862,31 @@ enum folio_references { FOLIOREF_ACTIVATE, }; +#ifdef CONFIG_LRU_GEN +/* + * Only used on a mapped folio in the eviction (rmap walk) path, where promotion + * needs to be done by taking the folio off the LRU list and then adding it back + * with PG_active set. In contrast, the aging (page table walk) path uses + * folio_update_gen(). + */ +static bool lru_gen_set_refs(struct folio *folio) +{ + /* see the comment on LRU_REFS_FLAGS */ + if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { + set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); + return false; + } + + set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_workingset)); + return true; +} +#else +static bool lru_gen_set_refs(struct folio *folio) +{ + return false; +} +#endif /* CONFIG_LRU_GEN */ + static enum folio_references folio_check_references(struct folio *folio, struct scan_control *sc) { @@ -870,7 +895,6 @@ static enum folio_references folio_check_references(struct folio *folio, referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup, &vm_flags); - referenced_folio = folio_test_clear_referenced(folio); /* * The supposedly reclaimable folio was found to be in a VM_LOCKED vma. @@ -888,6 +912,15 @@ static enum folio_references folio_check_references(struct folio *folio, if (referenced_ptes == -1) return FOLIOREF_KEEP; + if (lru_gen_enabled()) { + if (!referenced_ptes) + return FOLIOREF_RECLAIM; + + return lru_gen_set_refs(folio) ? FOLIOREF_ACTIVATE : FOLIOREF_KEEP; + } + + referenced_folio = folio_test_clear_referenced(folio); + if (referenced_ptes) { /* * All mapped folios start out with page table @@ -1092,11 +1125,6 @@ retry: if (!sc->may_unmap && folio_mapped(folio)) goto keep_locked; - /* folio_update_gen() tried to promote this page? */ - if (lru_gen_enabled() && !ignore_references && - folio_mapped(folio) && folio_test_referenced(folio)) - goto keep_locked; - /* * The number of dirty pages determines if a node is marked * reclaim_congested. kswapd will stall and start writing @@ -1137,8 +1165,9 @@ retry: * 2) Global or new memcg reclaim encounters a folio that is * not marked for immediate reclaim, or the caller does not * have __GFP_FS (or __GFP_IO if it's simply going to swap, - * not to fs). In this case mark the folio for immediate - * reclaim and continue scanning. + * not to fs), or the writeback may take an indeterminate + * amount of time to complete. In this case mark the folio + * for immediate reclaim and continue scanning. * * Require may_enter_fs() because we would wait on fs, which * may not have submitted I/O yet. And the loop driver might @@ -1163,6 +1192,8 @@ retry: * takes to write them to disk. 
*/ if (folio_test_writeback(folio)) { + mapping = folio_mapping(folio); + /* Case 1 above */ if (current_is_kswapd() && folio_test_reclaim(folio) && @@ -1173,7 +1204,8 @@ retry: /* Case 2 above */ } else if (writeback_throttling_sane(sc) || !folio_test_reclaim(folio) || - !may_enter_fs(folio, sc->gfp_mask)) { + !may_enter_fs(folio, sc->gfp_mask) || + (mapping && mapping_writeback_indeterminate(mapping))) { /* * This is slightly racy - * folio_end_writeback() might have @@ -2624,11 +2656,17 @@ static bool should_clear_pmd_young(void) READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \ } +#define evictable_min_seq(min_seq, swappiness) \ + min((min_seq)[!(swappiness)], (min_seq)[(swappiness) != MAX_SWAPPINESS]) + #define for_each_gen_type_zone(gen, type, zone) \ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) +#define for_each_evictable_type(type, swappiness) \ + for ((type) = !(swappiness); (type) <= ((swappiness) != MAX_SWAPPINESS); (type)++) + #define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS) #define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS) @@ -2674,10 +2712,16 @@ static int get_nr_gens(struct lruvec *lruvec, int type) static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) { - /* see the comment on lru_gen_folio */ - return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS && - get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) && - get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; + int type; + + for (type = 0; type < ANON_AND_FILE; type++) { + int n = get_nr_gens(lruvec, type); + + if (n < MIN_NR_GENS || n > MAX_NR_GENS) + return false; + } + + return true; } /****************************************************************************** @@ -3078,16 +3122,20 @@ struct ctrl_pos { static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, struct ctrl_pos *pos) { + int i; struct lru_gen_folio *lrugen = &lruvec->lrugen; int hist = lru_hist_from_seq(lrugen->min_seq[type]); - pos->refaulted = lrugen->avg_refaulted[type][tier] + - atomic_long_read(&lrugen->refaulted[hist][type][tier]); - pos->total = lrugen->avg_total[type][tier] + - atomic_long_read(&lrugen->evicted[hist][type][tier]); - if (tier) - pos->total += lrugen->protected[hist][type][tier - 1]; pos->gain = gain; + pos->refaulted = pos->total = 0; + + for (i = tier % MAX_NR_TIERS; i <= min(tier, MAX_NR_TIERS - 1); i++) { + pos->refaulted += lrugen->avg_refaulted[type][i] + + atomic_long_read(&lrugen->refaulted[hist][type][i]); + pos->total += lrugen->avg_total[type][i] + + lrugen->protected[hist][type][i] + + atomic_long_read(&lrugen->evicted[hist][type][i]); + } } static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) @@ -3113,17 +3161,15 @@ static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); sum = lrugen->avg_total[type][tier] + + lrugen->protected[hist][type][tier] + atomic_long_read(&lrugen->evicted[hist][type][tier]); - if (tier) - sum += lrugen->protected[hist][type][tier - 1]; WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); } if (clear) { atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); atomic_long_set(&lrugen->evicted[hist][type][tier], 0); - if (tier) - WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); + WRITE_ONCE(lrugen->protected[hist][type][tier], 0); } } } @@ -3150,16 +3196,19 @@ static int folio_update_gen(struct folio *folio, int gen) 
VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); + /* see the comment on LRU_REFS_FLAGS */ + if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { + set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); + return -1; + } + do { /* lru_gen_del_folio() has isolated this page? */ - if (!(old_flags & LRU_GEN_MASK)) { - /* for shrink_folio_list() */ - new_flags = old_flags | BIT(PG_referenced); - continue; - } + if (!(old_flags & LRU_GEN_MASK)) + return -1; - new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); - new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS); + new_flags |= ((gen + 1UL) << LRU_GEN_PGOFF) | BIT(PG_workingset); } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; @@ -3183,7 +3232,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai new_gen = (old_gen + 1) % MAX_NR_GENS; - new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS); new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; /* for folio_end_writeback() */ if (reclaiming) @@ -3258,7 +3307,7 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal return true; if (vma_is_anonymous(vma)) - return !walk->can_swap; + return !walk->swappiness; if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) return true; @@ -3268,7 +3317,10 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal return true; if (shmem_mapping(mapping)) - return !walk->can_swap; + return !walk->swappiness; + + if (walk->swappiness == MAX_SWAPPINESS) + return true; /* to exclude special mappings like dax, etc. 
*/ return !mapping->a_ops->read_folio; @@ -3356,21 +3408,19 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned } static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, - struct pglist_data *pgdat, bool can_swap) + struct pglist_data *pgdat) { - struct folio *folio; + struct folio *folio = pfn_folio(pfn); + + if (folio_lru_gen(folio) < 0) + return NULL; - folio = pfn_folio(pfn); if (folio_nid(folio) != pgdat->node_id) return NULL; if (folio_memcg(folio) != memcg) return NULL; - /* file VMAs can contain anon pages from COW */ - if (!folio_is_file_lru(folio) && !can_swap) - return NULL; - return folio; } @@ -3382,29 +3432,55 @@ static bool suitable_to_scan(int total, int young) return young * n >= total; } +static void walk_update_folio(struct lru_gen_mm_walk *walk, struct folio *folio, + int new_gen, bool dirty) +{ + int old_gen; + + if (!folio) + return; + + if (dirty && !folio_test_dirty(folio) && + !(folio_test_anon(folio) && folio_test_swapbacked(folio) && + !folio_test_swapcache(folio))) + folio_mark_dirty(folio); + + if (walk) { + old_gen = folio_update_gen(folio, new_gen); + if (old_gen >= 0 && old_gen != new_gen) + update_batch_size(walk, folio, old_gen, new_gen); + } else if (lru_gen_set_refs(folio)) { + old_gen = folio_lru_gen(folio); + if (old_gen >= 0 && old_gen != new_gen) + folio_activate(folio); + } +} + static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk *args) { int i; + bool dirty; pte_t *pte; spinlock_t *ptl; unsigned long addr; int total = 0; int young = 0; + struct folio *last = NULL; struct lru_gen_mm_walk *walk = args->private; struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); DEFINE_MAX_SEQ(walk->lruvec); - int old_gen, new_gen = lru_gen_from_seq(max_seq); + int gen = lru_gen_from_seq(max_seq); pmd_t pmdval; - pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, - &ptl); + pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, &ptl); if (!pte) return false; + if (!spin_trylock(ptl)) { pte_unmap(pte); - return false; + return true; } if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) { @@ -3426,26 +3502,30 @@ restart: if (pfn == -1) continue; - folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); + folio = get_pfn_folio(pfn, memcg, pgdat); if (!folio) continue; if (!ptep_clear_young_notify(args->vma, addr, pte + i)) continue; + if (last != folio) { + walk_update_folio(walk, last, gen, dirty); + + last = folio; + dirty = false; + } + + if (pte_dirty(ptent)) + dirty = true; + young++; walk->mm_stats[MM_LEAF_YOUNG]++; - - if (pte_dirty(ptent) && !folio_test_dirty(folio) && - !(folio_test_anon(folio) && folio_test_swapbacked(folio) && - !folio_test_swapcache(folio))) - folio_mark_dirty(folio); - - old_gen = folio_update_gen(folio, new_gen); - if (old_gen >= 0 && old_gen != new_gen) - update_batch_size(walk, folio, old_gen, new_gen); } + walk_update_folio(walk, last, gen, dirty); + last = NULL; + if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end)) goto restart; @@ -3459,13 +3539,15 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area struct mm_walk *args, unsigned long *bitmap, unsigned long *first) { int i; + bool dirty; pmd_t *pmd; spinlock_t *ptl; + struct folio *last = NULL; struct lru_gen_mm_walk *walk = args->private; struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); struct pglist_data *pgdat = 
lruvec_pgdat(walk->lruvec); DEFINE_MAX_SEQ(walk->lruvec); - int old_gen, new_gen = lru_gen_from_seq(max_seq); + int gen = lru_gen_from_seq(max_seq); VM_WARN_ON_ONCE(pud_leaf(*pud)); @@ -3511,27 +3593,30 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area if (pfn == -1) goto next; - folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); + folio = get_pfn_folio(pfn, memcg, pgdat); if (!folio) goto next; if (!pmdp_clear_young_notify(vma, addr, pmd + i)) goto next; + if (last != folio) { + walk_update_folio(walk, last, gen, dirty); + + last = folio; + dirty = false; + } + + if (pmd_dirty(pmd[i])) + dirty = true; + walk->mm_stats[MM_LEAF_YOUNG]++; - - if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) && - !(folio_test_anon(folio) && folio_test_swapbacked(folio) && - !folio_test_swapcache(folio))) - folio_mark_dirty(folio); - - old_gen = folio_update_gen(folio, new_gen); - if (old_gen >= 0 && old_gen != new_gen) - update_batch_size(walk, folio, old_gen, new_gen); next: i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1; } while (i <= MIN_LRU_BATCH); + walk_update_folio(walk, last, gen, dirty); + arch_leave_lazy_mmu_mode(); spin_unlock(ptl); done: @@ -3723,22 +3808,25 @@ static void clear_mm_walk(void) kfree(walk); } -static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) +static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness) { int zone; int remaining = MAX_LRU_BATCH; struct lru_gen_folio *lrugen = &lruvec->lrugen; + int hist = lru_hist_from_seq(lrugen->min_seq[type]); int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); - if (type == LRU_GEN_ANON && !can_swap) + if (type ? swappiness == MAX_SWAPPINESS : !swappiness) goto done; - /* prevent cold/hot inversion if force_scan is true */ + /* prevent cold/hot inversion if the type is evictable */ for (zone = 0; zone < MAX_NR_ZONES; zone++) { struct list_head *head = &lrugen->folios[old_gen][type][zone]; while (!list_empty(head)) { struct folio *folio = lru_to_folio(head); + int refs = folio_lru_refs(folio); + bool workingset = folio_test_workingset(folio); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); @@ -3748,6 +3836,15 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) new_gen = folio_inc_gen(lruvec, folio, false); list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]); + /* don't count the workingset being lazily promoted */ + if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) { + int tier = lru_tier_from_refs(refs, workingset); + int delta = folio_nr_pages(folio); + + WRITE_ONCE(lrugen->protected[hist][type][tier], + lrugen->protected[hist][type][tier] + delta); + } + if (!--remaining) return false; } @@ -3759,7 +3856,7 @@ done: return true; } -static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) +static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness) { int gen, type, zone; bool success = false; @@ -3769,7 +3866,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); /* find the oldest populated generation */ - for (type = !can_swap; type < ANON_AND_FILE; type++) { + for_each_evictable_type(type, swappiness) { while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) { gen = lru_gen_from_seq(min_seq[type]); @@ -3785,13 +3882,17 @@ next: } /* see the comment on lru_gen_folio */ - if (can_swap) { - min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], 
min_seq[LRU_GEN_FILE]); - min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); + if (swappiness && swappiness != MAX_SWAPPINESS) { + unsigned long seq = lrugen->max_seq - MIN_NR_GENS; + + if (min_seq[LRU_GEN_ANON] > seq && min_seq[LRU_GEN_FILE] < seq) + min_seq[LRU_GEN_ANON] = seq; + else if (min_seq[LRU_GEN_FILE] > seq && min_seq[LRU_GEN_ANON] < seq) + min_seq[LRU_GEN_FILE] = seq; } - for (type = !can_swap; type < ANON_AND_FILE; type++) { - if (min_seq[type] == lrugen->min_seq[type]) + for_each_evictable_type(type, swappiness) { + if (min_seq[type] <= lrugen->min_seq[type]) continue; reset_ctrl_pos(lruvec, type, true); @@ -3802,8 +3903,7 @@ next: return success; } -static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, - bool can_swap, bool force_scan) +static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, int swappiness) { bool success; int prev, next; @@ -3821,13 +3921,11 @@ restart: if (!success) goto unlock; - for (type = ANON_AND_FILE - 1; type >= 0; type--) { + for (type = 0; type < ANON_AND_FILE; type++) { if (get_nr_gens(lruvec, type) != MAX_NR_GENS) continue; - VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap)); - - if (inc_min_seq(lruvec, type, can_swap)) + if (inc_min_seq(lruvec, type, swappiness)) continue; spin_unlock_irq(&lruvec->lru_lock); @@ -3871,7 +3969,7 @@ unlock: } static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, - bool can_swap, bool force_scan) + int swappiness, bool force_scan) { bool success; struct lru_gen_mm_walk *walk; @@ -3882,7 +3980,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, VM_WARN_ON_ONCE(seq > READ_ONCE(lrugen->max_seq)); if (!mm_state) - return inc_max_seq(lruvec, seq, can_swap, force_scan); + return inc_max_seq(lruvec, seq, swappiness); /* see the comment in iterate_mm_list() */ if (seq <= READ_ONCE(mm_state->seq)) @@ -3907,7 +4005,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, walk->lruvec = lruvec; walk->seq = seq; - walk->can_swap = can_swap; + walk->swappiness = swappiness; walk->force_scan = force_scan; do { @@ -3917,7 +4015,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, } while (mm); done: if (success) { - success = inc_max_seq(lruvec, seq, can_swap, force_scan); + success = inc_max_seq(lruvec, seq, swappiness); WARN_ON_ONCE(!success); } @@ -3958,13 +4056,13 @@ static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) { int gen, type, zone; unsigned long total = 0; - bool can_swap = get_swappiness(lruvec, sc); + int swappiness = get_swappiness(lruvec, sc); struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); DEFINE_MIN_SEQ(lruvec); - for (type = !can_swap; type < ANON_AND_FILE; type++) { + for_each_evictable_type(type, swappiness) { unsigned long seq; for (seq = min_seq[type]; seq <= max_seq; seq++) { @@ -3984,6 +4082,7 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc { int gen; unsigned long birth; + int swappiness = get_swappiness(lruvec, sc); struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MIN_SEQ(lruvec); @@ -3993,8 +4092,7 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc if (!lruvec_is_sizable(lruvec, sc)) return false; - /* see the comment on lru_gen_folio */ - gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); + gen = lru_gen_from_seq(evictable_min_seq(min_seq, swappiness)); birth = 
READ_ONCE(lruvec->lrugen.timestamps[gen]); return time_is_before_jiffies(birth + min_ttl); @@ -4053,21 +4151,22 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { int i; + bool dirty; unsigned long start; unsigned long end; struct lru_gen_mm_walk *walk; + struct folio *last = NULL; int young = 1; pte_t *pte = pvmw->pte; unsigned long addr = pvmw->address; struct vm_area_struct *vma = pvmw->vma; struct folio *folio = pfn_folio(pvmw->pfn); - bool can_swap = !folio_is_file_lru(folio); struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); DEFINE_MAX_SEQ(lruvec); - int old_gen, new_gen = lru_gen_from_seq(max_seq); + int gen = lru_gen_from_seq(max_seq); lockdep_assert_held(pvmw->ptl); VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); @@ -4114,37 +4213,28 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) if (pfn == -1) continue; - folio = get_pfn_folio(pfn, memcg, pgdat, can_swap); + folio = get_pfn_folio(pfn, memcg, pgdat); if (!folio) continue; if (!ptep_clear_young_notify(vma, addr, pte + i)) continue; + if (last != folio) { + walk_update_folio(walk, last, gen, dirty); + + last = folio; + dirty = false; + } + + if (pte_dirty(ptent)) + dirty = true; + young++; - - if (pte_dirty(ptent) && !folio_test_dirty(folio) && - !(folio_test_anon(folio) && folio_test_swapbacked(folio) && - !folio_test_swapcache(folio))) - folio_mark_dirty(folio); - - if (walk) { - old_gen = folio_update_gen(folio, new_gen); - if (old_gen >= 0 && old_gen != new_gen) - update_batch_size(walk, folio, old_gen, new_gen); - - continue; - } - - old_gen = folio_lru_gen(folio); - if (old_gen < 0) - folio_set_referenced(folio); - else if (old_gen != new_gen) { - folio_clear_lru_refs(folio); - folio_activate(folio); - } } + walk_update_folio(walk, last, gen, dirty); + arch_leave_lazy_mmu_mode(); /* feedback from rmap walkers to page table walkers */ @@ -4302,7 +4392,8 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); int refs = folio_lru_refs(folio); - int tier = lru_tier_from_refs(refs); + bool workingset = folio_test_workingset(folio); + int tier = lru_tier_from_refs(refs, workingset); struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio); @@ -4324,14 +4415,17 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c } /* protected */ - if (tier > tier_idx || refs == BIT(LRU_REFS_WIDTH)) { - int hist = lru_hist_from_seq(lrugen->min_seq[type]); - + if (tier > tier_idx || refs + workingset == BIT(LRU_REFS_WIDTH) + 1) { gen = folio_inc_gen(lruvec, folio, false); - list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); + list_move(&folio->lru, &lrugen->folios[gen][type][zone]); - WRITE_ONCE(lrugen->protected[hist][type][tier - 1], - lrugen->protected[hist][type][tier - 1] + delta); + /* don't count the workingset being lazily promoted */ + if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) { + int hist = lru_hist_from_seq(lrugen->min_seq[type]); + + WRITE_ONCE(lrugen->protected[hist][type][tier], + lrugen->protected[hist][type][tier] + delta); + } return true; } @@ -4351,8 +4445,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c } /* waiting for writeback */ - if 
(folio_test_locked(folio) || writeback || - (type == LRU_GEN_FILE && dirty)) { + if (writeback || (type == LRU_GEN_FILE && dirty)) { gen = folio_inc_gen(lruvec, folio, true); list_move(&folio->lru, &lrugen->folios[gen][type][zone]); return true; @@ -4381,13 +4474,12 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca return false; } - /* see the comment on MAX_NR_TIERS */ + /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) - folio_clear_lru_refs(folio); + set_mask_bits(&folio->flags, LRU_REFS_MASK, 0); /* for shrink_folio_list() */ folio_clear_reclaim(folio); - folio_clear_referenced(folio); success = lru_gen_del_folio(lruvec, folio, true); VM_WARN_ON_ONCE_FOLIO(!success, folio); @@ -4483,13 +4575,13 @@ static int get_tier_idx(struct lruvec *lruvec, int type) struct ctrl_pos sp, pv; /* - * To leave a margin for fluctuations, use a larger gain factor (1:2). + * To leave a margin for fluctuations, use a larger gain factor (2:3). * This value is chosen because any other tier would have at least twice * as many refaults as the first tier. */ - read_ctrl_pos(lruvec, type, 0, 1, &sp); + read_ctrl_pos(lruvec, type, 0, 2, &sp); for (tier = 1; tier < MAX_NR_TIERS; tier++) { - read_ctrl_pos(lruvec, type, tier, 2, &pv); + read_ctrl_pos(lruvec, type, tier, 3, &pv); if (!positive_ctrl_err(&sp, &pv)) break; } @@ -4497,79 +4589,45 @@ static int get_tier_idx(struct lruvec *lruvec, int type) return tier - 1; } -static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) +static int get_type_to_scan(struct lruvec *lruvec, int swappiness) { - int type, tier; struct ctrl_pos sp, pv; - int gain[ANON_AND_FILE] = { swappiness, MAX_SWAPPINESS - swappiness }; + if (!swappiness) + return LRU_GEN_FILE; + + if (swappiness == MAX_SWAPPINESS) + return LRU_GEN_ANON; /* - * Compare the first tier of anon with that of file to determine which - * type to scan. Also need to compare other tiers of the selected type - * with the first tier of the other type to determine the last tier (of - * the selected type) to evict. + * Compare the sum of all tiers of anon with that of file to determine + * which type to scan. */ - read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp); - read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv); - type = positive_ctrl_err(&sp, &pv); + read_ctrl_pos(lruvec, LRU_GEN_ANON, MAX_NR_TIERS, swappiness, &sp); + read_ctrl_pos(lruvec, LRU_GEN_FILE, MAX_NR_TIERS, MAX_SWAPPINESS - swappiness, &pv); - read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); - for (tier = 1; tier < MAX_NR_TIERS; tier++) { - read_ctrl_pos(lruvec, type, tier, gain[type], &pv); - if (!positive_ctrl_err(&sp, &pv)) - break; - } - - *tier_idx = tier - 1; - - return type; + return positive_ctrl_err(&sp, &pv); } static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, int *type_scanned, struct list_head *list) { int i; - int type; - int scanned; - int tier = -1; - DEFINE_MIN_SEQ(lruvec); + int type = get_type_to_scan(lruvec, swappiness); - /* - * Try to make the obvious choice first, and if anon and file are both - * available from the same generation, - * 1. Interpret swappiness 1 as file first and MAX_SWAPPINESS as anon - * first. - * 2. If !__GFP_IO, file first since clean pagecache is more likely to - * exist than clean swapcache. 
- */ - if (!swappiness) - type = LRU_GEN_FILE; - else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) - type = LRU_GEN_ANON; - else if (swappiness == 1) - type = LRU_GEN_FILE; - else if (swappiness == MAX_SWAPPINESS) - type = LRU_GEN_ANON; - else if (!(sc->gfp_mask & __GFP_IO)) - type = LRU_GEN_FILE; - else - type = get_type_to_scan(lruvec, swappiness, &tier); + for_each_evictable_type(i, swappiness) { + int scanned; + int tier = get_tier_idx(lruvec, type); - for (i = !swappiness; i < ANON_AND_FILE; i++) { - if (tier < 0) - tier = get_tier_idx(lruvec, type); + *type_scanned = type; scanned = scan_folios(lruvec, sc, type, tier, list); if (scanned) - break; + return scanned; type = !type; - tier = -1; } - *type_scanned = type; - - return scanned; + return 0; } static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness) @@ -4585,6 +4643,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap struct reclaim_stat stat; struct lru_gen_mm_walk *walk; bool skip_retry = false; + struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); @@ -4594,7 +4653,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap scanned += try_to_inc_min_seq(lruvec, swappiness); - if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS) + if (evictable_min_seq(lrugen->min_seq, swappiness) + MIN_NR_GENS > lrugen->max_seq) scanned = 0; spin_unlock_irq(&lruvec->lru_lock); @@ -4610,31 +4669,24 @@ retry: type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); list_for_each_entry_safe_reverse(folio, next, &list, lru) { + DEFINE_MIN_SEQ(lruvec); + if (!folio_evictable(folio)) { list_del(&folio->lru); folio_putback_lru(folio); continue; } - if (folio_test_reclaim(folio) && - (folio_test_dirty(folio) || folio_test_writeback(folio))) { - /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ - if (folio_test_workingset(folio)) - folio_set_referenced(folio); - continue; - } - - if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) || - folio_mapped(folio) || folio_test_locked(folio) || - folio_test_dirty(folio) || folio_test_writeback(folio)) { - /* don't add rejected folios to the oldest generation */ - set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, - BIT(PG_active)); - continue; - } - /* retry folios that may have missed folio_rotate_reclaimable() */ - list_move(&folio->lru, &clean); + if (!skip_retry && !folio_test_active(folio) && !folio_mapped(folio) && + !folio_test_dirty(folio) && !folio_test_writeback(folio)) { + list_move(&folio->lru, &clean); + continue; + } + + /* don't add rejected folios to the oldest generation */ + if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type]) + set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_active)); } spin_lock_irq(&lruvec->lru_lock); @@ -4671,63 +4723,32 @@ retry: } static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, - bool can_swap, unsigned long *nr_to_scan) + int swappiness, unsigned long *nr_to_scan) { int gen, type, zone; - unsigned long old = 0; - unsigned long young = 0; - unsigned long total = 0; + unsigned long size = 0; struct lru_gen_folio *lrugen = &lruvec->lrugen; DEFINE_MIN_SEQ(lruvec); - /* whether this lruvec is completely out of cold folios */ - if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) { - *nr_to_scan = 0; + *nr_to_scan = 0; + /* have to run aging, since eviction is not possible anymore */ + if (evictable_min_seq(min_seq, 
swappiness) + MIN_NR_GENS > max_seq) return true; - } - for (type = !can_swap; type < ANON_AND_FILE; type++) { + for_each_evictable_type(type, swappiness) { unsigned long seq; for (seq = min_seq[type]; seq <= max_seq; seq++) { - unsigned long size = 0; - gen = lru_gen_from_seq(seq); for (zone = 0; zone < MAX_NR_ZONES; zone++) size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); - - total += size; - if (seq == max_seq) - young += size; - else if (seq + MIN_NR_GENS == max_seq) - old += size; } } - *nr_to_scan = total; - - /* - * The aging tries to be lazy to reduce the overhead, while the eviction - * stalls when the number of generations reaches MIN_NR_GENS. Hence, the - * ideal number of generations is MIN_NR_GENS+1. - */ - if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) - return false; - - /* - * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) - * of the total number of pages for each generation. A reasonable range - * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The - * aging cares about the upper bound of hot pages, while the eviction - * cares about the lower bound of cold pages. - */ - if (young * MIN_NR_GENS > total) - return true; - if (old * (MIN_NR_GENS + 2) < total) - return true; - - return false; + *nr_to_scan = size; + /* better to run aging even though eviction is still possible */ + return evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS == max_seq; } /* @@ -4735,7 +4756,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg * reclaim. */ -static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap) +static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness) { bool success; unsigned long nr_to_scan; @@ -4745,7 +4766,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) return -1; - success = should_run_aging(lruvec, max_seq, can_swap, &nr_to_scan); + success = should_run_aging(lruvec, max_seq, swappiness, &nr_to_scan); /* try to scrape all its memory if this memcg was deleted */ if (nr_to_scan && !mem_cgroup_online(memcg)) @@ -4756,7 +4777,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool return nr_to_scan >> sc->priority; /* stop scanning this lruvec as it's low on cold folios */ - return try_to_inc_max_seq(lruvec, max_seq, can_swap, false) ? -1 : 0; + return try_to_inc_max_seq(lruvec, max_seq, swappiness, false) ? 
-1 : 0; } static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc) @@ -5300,8 +5321,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, s = "rep"; n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); - if (tier) - n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]); + n[2] = READ_ONCE(lrugen->protected[hist][type][tier]); } for (i = 0; i < 3; i++) @@ -5356,7 +5376,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v) seq_printf(m, " node %5d\n", nid); if (!full) - seq = min_seq[LRU_GEN_ANON]; + seq = evictable_min_seq(min_seq, MAX_SWAPPINESS / 2); else if (max_seq >= MAX_NR_GENS) seq = max_seq - MAX_NR_GENS + 1; else @@ -5396,23 +5416,14 @@ static const struct seq_operations lru_gen_seq_ops = { }; static int run_aging(struct lruvec *lruvec, unsigned long seq, - bool can_swap, bool force_scan) + int swappiness, bool force_scan) { DEFINE_MAX_SEQ(lruvec); - DEFINE_MIN_SEQ(lruvec); - - if (seq < max_seq) - return 0; if (seq > max_seq) return -EINVAL; - if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq) - return -ERANGE; - - try_to_inc_max_seq(lruvec, max_seq, can_swap, force_scan); - - return 0; + return try_to_inc_max_seq(lruvec, max_seq, swappiness, force_scan) ? 0 : -EEXIST; } static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, @@ -5428,7 +5439,7 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co while (!signal_pending(current)) { DEFINE_MIN_SEQ(lruvec); - if (seq < min_seq[!swappiness]) + if (seq < evictable_min_seq(min_seq, swappiness)) return 0; if (sc->nr_reclaimed >= nr_to_reclaim) diff --git a/mm/workingset.c b/mm/workingset.c index a4705e196545..4841ae8af411 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -239,7 +239,8 @@ static void *lru_gen_eviction(struct folio *folio) int type = folio_is_file_lru(folio); int delta = folio_nr_pages(folio); int refs = folio_lru_refs(folio); - int tier = lru_tier_from_refs(refs); + bool workingset = folio_test_workingset(folio); + int tier = lru_tier_from_refs(refs, workingset); struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); @@ -253,18 +254,18 @@ static void *lru_gen_eviction(struct folio *folio) hist = lru_hist_from_seq(min_seq); atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); - return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs); + return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset); } /* * Tests if the shadow entry is for a folio that was recently evicted. * Fills in @lruvec, @token, @workingset with the values unpacked from shadow. 
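* For instance (an editor's illustration with hypothetical numbers, not part
* of the patch): with MAX_NR_GENS == 4, the new check
* abs_diff(max_seq, token_seq) < MAX_NR_GENS treats a shadow token as recent
* when its generation counter is within three increments of the current
* max_seq; anything older has aged past the maximum number of generations
* and refaults as cold.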
*/ -static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec, +static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, unsigned long *token, bool *workingset) { int memcg_id; - unsigned long min_seq; + unsigned long max_seq; struct mem_cgroup *memcg; struct pglist_data *pgdat; @@ -273,8 +274,10 @@ static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec, memcg = mem_cgroup_from_id(memcg_id); *lruvec = mem_cgroup_lruvec(memcg, pgdat); - min_seq = READ_ONCE((*lruvec)->lrugen.min_seq[file]); - return (*token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)); + max_seq = READ_ONCE((*lruvec)->lrugen.max_seq); + max_seq &= EVICTION_MASK >> LRU_REFS_WIDTH; + + return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS; } static void lru_gen_refault(struct folio *folio, void *shadow) @@ -290,7 +293,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow) rcu_read_lock(); - recent = lru_gen_test_recent(shadow, type, &lruvec, &token, &workingset); + recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset); if (lruvec != folio_lruvec(folio)) goto unlock; @@ -302,24 +305,20 @@ static void lru_gen_refault(struct folio *folio, void *shadow) lrugen = &lruvec->lrugen; hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type])); - /* see the comment in folio_lru_refs() */ - refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset; - tier = lru_tier_from_refs(refs); + refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + 1; + tier = lru_tier_from_refs(refs, workingset); atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); - mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); - /* - * Count the following two cases as stalls: - * 1. For pages accessed through page tables, hotter pages pushed out - * hot pages which refaulted immediately. - * 2. For pages accessed multiple times through file descriptors, - * they would have been protected by sort_folio(). - */ - if (lru_gen_in_fault() || refs >= BIT(LRU_REFS_WIDTH) - 1) { - set_mask_bits(&folio->flags, 0, LRU_REFS_MASK | BIT(PG_workingset)); + /* see folio_add_lru() where folio_set_active() will be called */ + if (lru_gen_in_fault()) + mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); + + if (workingset) { + folio_set_workingset(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); - } + } else + set_mask_bits(&folio->flags, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF); unlock: rcu_read_unlock(); } @@ -331,7 +330,7 @@ static void *lru_gen_eviction(struct folio *folio) return NULL; } -static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec, +static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, unsigned long *token, bool *workingset) { return false; @@ -428,17 +427,16 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset, struct pglist_data *pgdat; unsigned long eviction; - rcu_read_lock(); - if (lru_gen_enabled()) { - bool recent = lru_gen_test_recent(shadow, file, - &eviction_lruvec, &eviction, workingset); + bool recent; + rcu_read_lock(); + recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction, workingset); rcu_read_unlock(); return recent; } - + rcu_read_lock(); unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset); eviction <<= bucket_order; @@ -459,14 +457,12 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset, * configurations instead. 
*/ eviction_memcg = mem_cgroup_from_id(memcgid); - if (!mem_cgroup_disabled() && - (!eviction_memcg || !mem_cgroup_tryget(eviction_memcg))) { - rcu_read_unlock(); - return false; - } - + if (!mem_cgroup_tryget(eviction_memcg)) + eviction_memcg = NULL; rcu_read_unlock(); + if (!mem_cgroup_disabled() && !eviction_memcg) + return false; /* * Flush stats (and potentially sleep) outside the RCU read section. * @@ -544,6 +540,8 @@ void workingset_refault(struct folio *folio, void *shadow) bool workingset; long nr; + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + if (lru_gen_enabled()) { lru_gen_refault(folio, shadow); return; @@ -558,7 +556,6 @@ void workingset_refault(struct folio *folio, void *shadow) * is actually experiencing the refault event. Make sure the folio is * locked to guarantee folio_memcg() stability throughout. */ - VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); nr = folio_nr_pages(folio); memcg = folio_memcg(folio); pgdat = folio_pgdat(folio); diff --git a/mm/zpdesc.h b/mm/zpdesc.h new file mode 100644 index 000000000000..fa47fece2237 --- /dev/null +++ b/mm/zpdesc.h @@ -0,0 +1,182 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* zpdesc.h: zswap.zpool memory descriptor + * + * Written by Alex Shi + * Hyeonggon Yoo <42.hyeyoo@gmail.com> + */ +#ifndef __MM_ZPDESC_H__ +#define __MM_ZPDESC_H__ + +/* + * struct zpdesc - Memory descriptor for zpool memory. + * @flags: Page flags, mostly unused by zsmalloc. + * @lru: Indirectly used by page migration. + * @movable_ops: Used by page migration. + * @next: Next zpdesc in a zspage in zsmalloc zpool. + * @handle: For huge zspage in zsmalloc zpool. + * @zspage: Points to the zspage this zpdesc is a part of. + * @first_obj_offset: First object offset in zsmalloc zpool. + * @_refcount: The number of references to this zpdesc. + * + * This struct overlays struct page for now. Do not modify without a good + * understanding of the issues. In particular, do not expand into the overlap + * with memcg_data. + * + * Page flags used: + * * PG_private identifies the first component page. + * * PG_locked is used by page migration code. + */ +struct zpdesc { + unsigned long flags; + struct list_head lru; + unsigned long movable_ops; + union { + struct zpdesc *next; + unsigned long handle; + }; + struct zspage *zspage; + /* + * Only the lower 24 bits are available for offset, limiting a page + * to 16 MiB. The upper 8 bits are reserved for PGTY_zsmalloc. + * + * Do not access this field directly. + * Instead, use {get,set}_first_obj_offset() helpers. + */ + unsigned int first_obj_offset; + atomic_t _refcount; +}; +#define ZPDESC_MATCH(pg, zp) \ + static_assert(offsetof(struct page, pg) == offsetof(struct zpdesc, zp)) + +ZPDESC_MATCH(flags, flags); +ZPDESC_MATCH(lru, lru); +ZPDESC_MATCH(mapping, movable_ops); +ZPDESC_MATCH(index, next); +ZPDESC_MATCH(index, handle); +ZPDESC_MATCH(private, zspage); +ZPDESC_MATCH(page_type, first_obj_offset); +ZPDESC_MATCH(_refcount, _refcount); +#undef ZPDESC_MATCH +static_assert(sizeof(struct zpdesc) <= sizeof(struct page)); + +/* + * zpdesc_page - The first struct page allocated for a zpdesc + * @zp: The zpdesc. + * + * A convenience wrapper for converting zpdesc to the first struct page of the + * underlying folio, to communicate with code not yet converted to folio or + * struct zpdesc. + * + */ +#define zpdesc_page(zp) (_Generic((zp), \ + const struct zpdesc *: (const struct page *)(zp), \ + struct zpdesc *: (struct page *)(zp))) + +/** + * zpdesc_folio - The folio allocated for a zpdesc + * @zp: The zpdesc. 
+ * + * Zpdescs are descriptors for zpool memory. The zpool memory itself is + * allocated as folios that contain the zpool objects, and zpdesc uses specific + * fields in the first struct page of the folio - those fields are now accessed + * by struct zpdesc. + * + * It is occasionally necessary to convert back to a folio in order to + * communicate with the rest of the mm. Please use this helper function + * instead of casting yourself, as the implementation may change in the future. + */ +#define zpdesc_folio(zp) (_Generic((zp), \ + const struct zpdesc *: (const struct folio *)(zp), \ + struct zpdesc *: (struct folio *)(zp))) +/** + * page_zpdesc - Converts from first struct page to zpdesc. + * @p: The first (either head of compound or single) page of zpdesc. + * + * A temporary wrapper to convert struct page to struct zpdesc in situations + * where we know the page is the compound head, or single order-0 page. + * + * Long-term ideally everything would work with struct zpdesc directly or go + * through folio to struct zpdesc. + * + * Return: The zpdesc which contains this page + */ +#define page_zpdesc(p) (_Generic((p), \ + const struct page *: (const struct zpdesc *)(p), \ + struct page *: (struct zpdesc *)(p))) + +static inline void zpdesc_lock(struct zpdesc *zpdesc) +{ + folio_lock(zpdesc_folio(zpdesc)); +} + +static inline bool zpdesc_trylock(struct zpdesc *zpdesc) +{ + return folio_trylock(zpdesc_folio(zpdesc)); +} + +static inline void zpdesc_unlock(struct zpdesc *zpdesc) +{ + folio_unlock(zpdesc_folio(zpdesc)); +} + +static inline void zpdesc_wait_locked(struct zpdesc *zpdesc) +{ + folio_wait_locked(zpdesc_folio(zpdesc)); +} + +static inline void zpdesc_get(struct zpdesc *zpdesc) +{ + folio_get(zpdesc_folio(zpdesc)); +} + +static inline void zpdesc_put(struct zpdesc *zpdesc) +{ + folio_put(zpdesc_folio(zpdesc)); +} + +static inline void *kmap_local_zpdesc(struct zpdesc *zpdesc) +{ + return kmap_local_page(zpdesc_page(zpdesc)); +} + +static inline unsigned long zpdesc_pfn(struct zpdesc *zpdesc) +{ + return page_to_pfn(zpdesc_page(zpdesc)); +} + +static inline struct zpdesc *pfn_zpdesc(unsigned long pfn) +{ + return page_zpdesc(pfn_to_page(pfn)); +} + +static inline void __zpdesc_set_movable(struct zpdesc *zpdesc, + const struct movable_operations *mops) +{ + __SetPageMovable(zpdesc_page(zpdesc), mops); +} + +static inline void __zpdesc_set_zsmalloc(struct zpdesc *zpdesc) +{ + __SetPageZsmalloc(zpdesc_page(zpdesc)); +} + +static inline void __zpdesc_clear_zsmalloc(struct zpdesc *zpdesc) +{ + __ClearPageZsmalloc(zpdesc_page(zpdesc)); +} + +static inline bool zpdesc_is_isolated(struct zpdesc *zpdesc) +{ + return PageIsolated(zpdesc_page(zpdesc)); +} + +static inline struct zone *zpdesc_zone(struct zpdesc *zpdesc) +{ + return page_zone(zpdesc_page(zpdesc)); +} + +static inline bool zpdesc_is_locked(struct zpdesc *zpdesc) +{ + return folio_test_locked(zpdesc_folio(zpdesc)); +} +#endif diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 64b66a4d3e6e..dae32e051779 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -13,24 +13,6 @@ * Released under the terms of GNU General Public License Version 2.0 */ -/* - * Following is how we use various fields and flags of underlying - * struct page(s) to form a zspage. - * - * Usage of struct page fields: - * page->private: points to zspage - * page->index: links together all component pages of a zspage - * For the huge page, this is always 0, so we use this field - * to store handle. 
- * page->page_type: PGTY_zsmalloc, lower 24 bits locate the first object - * offset in a subpage of a zspage - * - * Usage of struct page flags: - * PG_private: identifies the first component page - * PG_owner_priv_1: identifies the huge component page - * - */ - #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt /* @@ -67,6 +49,7 @@ #include #include #include +#include "zpdesc.h" #define ZSPAGE_MAGIC 0x58 @@ -245,6 +228,35 @@ struct zs_pool { atomic_t compaction_in_progress; }; +static inline void zpdesc_set_first(struct zpdesc *zpdesc) +{ + SetPagePrivate(zpdesc_page(zpdesc)); +} + +static inline void zpdesc_inc_zone_page_state(struct zpdesc *zpdesc) +{ + inc_zone_page_state(zpdesc_page(zpdesc), NR_ZSPAGES); +} + +static inline void zpdesc_dec_zone_page_state(struct zpdesc *zpdesc) +{ + dec_zone_page_state(zpdesc_page(zpdesc), NR_ZSPAGES); +} + +static inline struct zpdesc *alloc_zpdesc(gfp_t gfp) +{ + struct page *page = alloc_page(gfp); + + return page_zpdesc(page); +} + +static inline void free_zpdesc(struct zpdesc *zpdesc) +{ + struct page *page = zpdesc_page(zpdesc); + + __free_page(page); +} + struct zspage { struct { unsigned int huge:HUGE_BITS; @@ -254,7 +266,7 @@ struct zspage { }; unsigned int inuse; unsigned int freeobj; - struct page *first_page; + struct zpdesc *first_zpdesc; struct list_head list; /* fullness list */ struct zs_pool *pool; rwlock_t lock; @@ -440,9 +452,9 @@ static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = { .lock = INIT_LOCAL_LOCK(lock), }; -static __maybe_unused int is_first_page(struct page *page) +static inline bool is_first_zpdesc(struct zpdesc *zpdesc) { - return PagePrivate(page); + return PagePrivate(zpdesc_page(zpdesc)); } /* Protected by class->lock */ @@ -451,36 +463,35 @@ static inline int get_zspage_inuse(struct zspage *zspage) return zspage->inuse; } - static inline void mod_zspage_inuse(struct zspage *zspage, int val) { zspage->inuse += val; } -static inline struct page *get_first_page(struct zspage *zspage) +static struct zpdesc *get_first_zpdesc(struct zspage *zspage) { - struct page *first_page = zspage->first_page; + struct zpdesc *first_zpdesc = zspage->first_zpdesc; - VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); - return first_page; + VM_BUG_ON_PAGE(!is_first_zpdesc(first_zpdesc), zpdesc_page(first_zpdesc)); + return first_zpdesc; } #define FIRST_OBJ_PAGE_TYPE_MASK 0xffffff -static inline unsigned int get_first_obj_offset(struct page *page) +static inline unsigned int get_first_obj_offset(struct zpdesc *zpdesc) { - VM_WARN_ON_ONCE(!PageZsmalloc(page)); - return page->page_type & FIRST_OBJ_PAGE_TYPE_MASK; + VM_WARN_ON_ONCE(!PageZsmalloc(zpdesc_page(zpdesc))); + return zpdesc->first_obj_offset & FIRST_OBJ_PAGE_TYPE_MASK; } -static inline void set_first_obj_offset(struct page *page, unsigned int offset) +static inline void set_first_obj_offset(struct zpdesc *zpdesc, unsigned int offset) { /* With 24 bits available, we can support offsets into 16 MiB pages. 
*/ BUILD_BUG_ON(PAGE_SIZE > SZ_16M); - VM_WARN_ON_ONCE(!PageZsmalloc(page)); + VM_WARN_ON_ONCE(!PageZsmalloc(zpdesc_page(zpdesc))); VM_WARN_ON_ONCE(offset & ~FIRST_OBJ_PAGE_TYPE_MASK); - page->page_type &= ~FIRST_OBJ_PAGE_TYPE_MASK; - page->page_type |= offset & FIRST_OBJ_PAGE_TYPE_MASK; + zpdesc->first_obj_offset &= ~FIRST_OBJ_PAGE_TYPE_MASK; + zpdesc->first_obj_offset |= offset & FIRST_OBJ_PAGE_TYPE_MASK; } static inline unsigned int get_freeobj(struct zspage *zspage) @@ -733,52 +744,52 @@ out: return newfg; } -static struct zspage *get_zspage(struct page *page) +static struct zspage *get_zspage(struct zpdesc *zpdesc) { - struct zspage *zspage = (struct zspage *)page_private(page); + struct zspage *zspage = zpdesc->zspage; BUG_ON(zspage->magic != ZSPAGE_MAGIC); return zspage; } -static struct page *get_next_page(struct page *page) +static struct zpdesc *get_next_zpdesc(struct zpdesc *zpdesc) { - struct zspage *zspage = get_zspage(page); + struct zspage *zspage = get_zspage(zpdesc); if (unlikely(ZsHugePage(zspage))) return NULL; - return (struct page *)page->index; + return zpdesc->next; } /** - * obj_to_location - get (, ) from encoded object value + * obj_to_location - get (, ) from encoded object value * @obj: the encoded object value - * @page: page object resides in zspage + * @zpdesc: zpdesc object resides in zspage * @obj_idx: object index */ -static void obj_to_location(unsigned long obj, struct page **page, +static void obj_to_location(unsigned long obj, struct zpdesc **zpdesc, unsigned int *obj_idx) { - *page = pfn_to_page(obj >> OBJ_INDEX_BITS); + *zpdesc = pfn_zpdesc(obj >> OBJ_INDEX_BITS); *obj_idx = (obj & OBJ_INDEX_MASK); } -static void obj_to_page(unsigned long obj, struct page **page) +static void obj_to_zpdesc(unsigned long obj, struct zpdesc **zpdesc) { - *page = pfn_to_page(obj >> OBJ_INDEX_BITS); + *zpdesc = pfn_zpdesc(obj >> OBJ_INDEX_BITS); } /** - * location_to_obj - get obj value encoded from (, ) - * @page: page object resides in zspage + * location_to_obj - get obj value encoded from (, ) + * @zpdesc: zpdesc object resides in zspage * @obj_idx: object index */ -static unsigned long location_to_obj(struct page *page, unsigned int obj_idx) +static unsigned long location_to_obj(struct zpdesc *zpdesc, unsigned int obj_idx) { unsigned long obj; - obj = page_to_pfn(page) << OBJ_INDEX_BITS; + obj = zpdesc_pfn(zpdesc) << OBJ_INDEX_BITS; obj |= obj_idx & OBJ_INDEX_MASK; return obj; @@ -789,15 +800,15 @@ static unsigned long handle_to_obj(unsigned long handle) return *(unsigned long *)handle; } -static inline bool obj_allocated(struct page *page, void *obj, +static inline bool obj_allocated(struct zpdesc *zpdesc, void *obj, unsigned long *phandle) { unsigned long handle; - struct zspage *zspage = get_zspage(page); + struct zspage *zspage = get_zspage(zpdesc); if (unlikely(ZsHugePage(zspage))) { - VM_BUG_ON_PAGE(!is_first_page(page), page); - handle = page->index; + VM_BUG_ON_PAGE(!is_first_zpdesc(zpdesc), zpdesc_page(zpdesc)); + handle = zpdesc->handle; } else handle = *(unsigned long *)obj; @@ -809,8 +820,10 @@ static inline bool obj_allocated(struct page *page, void *obj, return true; } -static void reset_page(struct page *page) +static void reset_zpdesc(struct zpdesc *zpdesc) { + struct page *page = zpdesc_page(zpdesc); + __ClearPageMovable(page); ClearPagePrivate(page); set_page_private(page, 0); @@ -820,11 +833,11 @@ static void reset_page(struct page *page) static int trylock_zspage(struct zspage *zspage) { - struct page *cursor, *fail; + struct zpdesc *cursor, 
*fail; - for (cursor = get_first_page(zspage); cursor != NULL; cursor = - get_next_page(cursor)) { - if (!trylock_page(cursor)) { + for (cursor = get_first_zpdesc(zspage); cursor != NULL; cursor = + get_next_zpdesc(cursor)) { + if (!zpdesc_trylock(cursor)) { fail = cursor; goto unlock; } @@ -832,9 +845,9 @@ static int trylock_zspage(struct zspage *zspage) return 1; unlock: - for (cursor = get_first_page(zspage); cursor != fail; cursor = - get_next_page(cursor)) - unlock_page(cursor); + for (cursor = get_first_zpdesc(zspage); cursor != fail; cursor = + get_next_zpdesc(cursor)) + zpdesc_unlock(cursor); return 0; } @@ -842,23 +855,23 @@ unlock: static void __free_zspage(struct zs_pool *pool, struct size_class *class, struct zspage *zspage) { - struct page *page, *next; + struct zpdesc *zpdesc, *next; assert_spin_locked(&class->lock); VM_BUG_ON(get_zspage_inuse(zspage)); VM_BUG_ON(zspage->fullness != ZS_INUSE_RATIO_0); - next = page = get_first_page(zspage); + next = zpdesc = get_first_zpdesc(zspage); do { - VM_BUG_ON_PAGE(!PageLocked(page), page); - next = get_next_page(page); - reset_page(page); - unlock_page(page); - dec_zone_page_state(page, NR_ZSPAGES); - put_page(page); - page = next; - } while (page != NULL); + VM_BUG_ON_PAGE(!zpdesc_is_locked(zpdesc), zpdesc_page(zpdesc)); + next = get_next_zpdesc(zpdesc); + reset_zpdesc(zpdesc); + zpdesc_unlock(zpdesc); + zpdesc_dec_zone_page_state(zpdesc); + zpdesc_put(zpdesc); + zpdesc = next; + } while (zpdesc != NULL); cache_free_zspage(pool, zspage); @@ -891,16 +904,16 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) { unsigned int freeobj = 1; unsigned long off = 0; - struct page *page = get_first_page(zspage); + struct zpdesc *zpdesc = get_first_zpdesc(zspage); - while (page) { - struct page *next_page; + while (zpdesc) { + struct zpdesc *next_zpdesc; struct link_free *link; void *vaddr; - set_first_obj_offset(page, off); + set_first_obj_offset(zpdesc, off); - vaddr = kmap_local_page(page); + vaddr = kmap_local_zpdesc(zpdesc); link = (struct link_free *)vaddr + off / sizeof(*link); while ((off += class->size) < PAGE_SIZE) { @@ -913,8 +926,8 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) * page, which must point to the first object on the next * page (if present) */ - next_page = get_next_page(page); - if (next_page) { + next_zpdesc = get_next_zpdesc(zpdesc); + if (next_zpdesc) { link->next = freeobj++ << OBJ_TAG_BITS; } else { /* @@ -924,7 +937,7 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) link->next = -1UL << OBJ_TAG_BITS; } kunmap_local(vaddr); - page = next_page; + zpdesc = next_zpdesc; off %= PAGE_SIZE; } @@ -932,35 +945,35 @@ } static void create_page_chain(struct size_class *class, struct zspage *zspage, - struct page *pages[]) + struct zpdesc *zpdescs[]) { int i; - struct page *page; - struct page *prev_page = NULL; - int nr_pages = class->pages_per_zspage; + struct zpdesc *zpdesc; + struct zpdesc *prev_zpdesc = NULL; + int nr_zpdescs = class->pages_per_zspage; /* * Allocate individual pages and link them together as: - * 1. all pages are linked together using page->index - * 2. each sub-page point to zspage using page->private + * 1. all pages are linked together using zpdesc->next + * 2. each sub-page points to the zspage using zpdesc->zspage * - * we set PG_private to identify the first page (i.e. no other sub-page + * we set PG_private to identify the first zpdesc (i.e. 
no other zpdesc * has this flag set). */ - for (i = 0; i < nr_pages; i++) { - page = pages[i]; - set_page_private(page, (unsigned long)zspage); - page->index = 0; + for (i = 0; i < nr_zpdescs; i++) { + zpdesc = zpdescs[i]; + zpdesc->zspage = zspage; + zpdesc->next = NULL; if (i == 0) { - zspage->first_page = page; - SetPagePrivate(page); + zspage->first_zpdesc = zpdesc; + zpdesc_set_first(zpdesc); if (unlikely(class->objs_per_zspage == 1 && class->pages_per_zspage == 1)) SetZsHugePage(zspage); } else { - prev_page->index = (unsigned long)page; + prev_zpdesc->next = zpdesc; } - prev_page = page; + prev_zpdesc = zpdesc; } } @@ -972,7 +985,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, gfp_t gfp) { int i; - struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE]; + struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE]; struct zspage *zspage = cache_alloc_zspage(pool, gfp); if (!zspage) @@ -982,25 +995,25 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, migrate_lock_init(zspage); for (i = 0; i < class->pages_per_zspage; i++) { - struct page *page; + struct zpdesc *zpdesc; - page = alloc_page(gfp); - if (!page) { + zpdesc = alloc_zpdesc(gfp); + if (!zpdesc) { while (--i >= 0) { - dec_zone_page_state(pages[i], NR_ZSPAGES); - __ClearPageZsmalloc(pages[i]); - __free_page(pages[i]); + zpdesc_dec_zone_page_state(zpdescs[i]); + __zpdesc_clear_zsmalloc(zpdescs[i]); + free_zpdesc(zpdescs[i]); } cache_free_zspage(pool, zspage); return NULL; } - __SetPageZsmalloc(page); + __zpdesc_set_zsmalloc(zpdesc); - inc_zone_page_state(page, NR_ZSPAGES); - pages[i] = page; + zpdesc_inc_zone_page_state(zpdesc); + zpdescs[i] = zpdesc; } - create_page_chain(class, zspage, pages); + create_page_chain(class, zspage, zpdescs); init_zspage(class, zspage); zspage->pool = pool; zspage->class = class->index; @@ -1044,7 +1057,7 @@ static inline void __zs_cpu_down(struct mapping_area *area) } static void *__zs_map_object(struct mapping_area *area, - struct page *pages[2], int off, int size) + struct zpdesc *zpdescs[2], int off, int size) { size_t sizes[2]; char *buf = area->vm_buf; @@ -1060,14 +1073,14 @@ static void *__zs_map_object(struct mapping_area *area, sizes[1] = size - sizes[0]; /* copy object to per-cpu buffer */ - memcpy_from_page(buf, pages[0], off, sizes[0]); - memcpy_from_page(buf + sizes[0], pages[1], 0, sizes[1]); + memcpy_from_page(buf, zpdesc_page(zpdescs[0]), off, sizes[0]); + memcpy_from_page(buf + sizes[0], zpdesc_page(zpdescs[1]), 0, sizes[1]); out: return area->vm_buf; } static void __zs_unmap_object(struct mapping_area *area, - struct page *pages[2], int off, int size) + struct zpdesc *zpdescs[2], int off, int size) { size_t sizes[2]; char *buf; @@ -1085,8 +1098,8 @@ static void __zs_unmap_object(struct mapping_area *area, sizes[1] = size - sizes[0]; /* copy per-cpu buffer to object */ - memcpy_to_page(pages[0], off, buf, sizes[0]); - memcpy_to_page(pages[1], 0, buf + sizes[0], sizes[1]); + memcpy_to_page(zpdesc_page(zpdescs[0]), off, buf, sizes[0]); + memcpy_to_page(zpdesc_page(zpdescs[1]), 0, buf + sizes[0], sizes[1]); out: /* enable page faults to match kunmap_local() return conditions */ @@ -1176,13 +1189,13 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, enum zs_mapmode mm) { struct zspage *zspage; - struct page *page; + struct zpdesc *zpdesc; unsigned long obj, off; unsigned int obj_idx; struct size_class *class; struct mapping_area *area; - struct page *pages[2]; + struct zpdesc *zpdescs[2]; void *ret; /* @@ -1195,8 +1208,8 @@ void *zs_map_object(struct zs_pool *pool, 
unsigned long handle, /* It guarantees it can get zspage from handle safely */ read_lock(&pool->migrate_lock); obj = handle_to_obj(handle); - obj_to_location(obj, &page, &obj_idx); - zspage = get_zspage(page); + obj_to_location(obj, &zpdesc, &obj_idx); + zspage = get_zspage(zpdesc); /* * migration cannot move any zpages in this zspage. Here, class->lock @@ -1215,17 +1228,17 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, area->vm_mm = mm; if (off + class->size <= PAGE_SIZE) { /* this object is contained entirely within a page */ - area->vm_addr = kmap_local_page(page); + area->vm_addr = kmap_local_zpdesc(zpdesc); ret = area->vm_addr + off; goto out; } /* this object spans two pages */ - pages[0] = page; - pages[1] = get_next_page(page); - BUG_ON(!pages[1]); + zpdescs[0] = zpdesc; + zpdescs[1] = get_next_zpdesc(zpdesc); + BUG_ON(!zpdescs[1]); - ret = __zs_map_object(area, pages, off, class->size); + ret = __zs_map_object(area, zpdescs, off, class->size); out: if (likely(!ZsHugePage(zspage))) ret += ZS_HANDLE_SIZE; @@ -1237,7 +1250,7 @@ EXPORT_SYMBOL_GPL(zs_map_object); void zs_unmap_object(struct zs_pool *pool, unsigned long handle) { struct zspage *zspage; - struct page *page; + struct zpdesc *zpdesc; unsigned long obj, off; unsigned int obj_idx; @@ -1245,8 +1258,8 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) struct mapping_area *area; obj = handle_to_obj(handle); - obj_to_location(obj, &page, &obj_idx); - zspage = get_zspage(page); + obj_to_location(obj, &zpdesc, &obj_idx); + zspage = get_zspage(zpdesc); class = zspage_class(pool, zspage); off = offset_in_page(class->size * obj_idx); @@ -1254,13 +1267,13 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) if (off + class->size <= PAGE_SIZE) kunmap_local(area->vm_addr); else { - struct page *pages[2]; + struct zpdesc *zpdescs[2]; - pages[0] = page; - pages[1] = get_next_page(page); - BUG_ON(!pages[1]); + zpdescs[0] = zpdesc; + zpdescs[1] = get_next_zpdesc(zpdesc); + BUG_ON(!zpdescs[1]); - __zs_unmap_object(area, pages, off, class->size); + __zs_unmap_object(area, zpdescs, off, class->size); } local_unlock(&zs_map_area.lock); @@ -1290,12 +1303,12 @@ EXPORT_SYMBOL_GPL(zs_huge_class_size); static unsigned long obj_malloc(struct zs_pool *pool, struct zspage *zspage, unsigned long handle) { - int i, nr_page, offset; + int i, nr_zpdesc, offset; unsigned long obj; struct link_free *link; struct size_class *class; - struct page *m_page; + struct zpdesc *m_zpdesc; unsigned long m_offset; void *vaddr; @@ -1303,27 +1316,26 @@ static unsigned long obj_malloc(struct zs_pool *pool, obj = get_freeobj(zspage); offset = obj * class->size; - nr_page = offset >> PAGE_SHIFT; + nr_zpdesc = offset >> PAGE_SHIFT; m_offset = offset_in_page(offset); - m_page = get_first_page(zspage); + m_zpdesc = get_first_zpdesc(zspage); - for (i = 0; i < nr_page; i++) - m_page = get_next_page(m_page); + for (i = 0; i < nr_zpdesc; i++) + m_zpdesc = get_next_zpdesc(m_zpdesc); - vaddr = kmap_local_page(m_page); + vaddr = kmap_local_zpdesc(m_zpdesc); link = (struct link_free *)vaddr + m_offset / sizeof(*link); set_freeobj(zspage, link->next >> OBJ_TAG_BITS); if (likely(!ZsHugePage(zspage))) /* record handle in the header of allocated chunk */ link->handle = handle | OBJ_ALLOCATED_TAG; else - /* record handle to page->index */ - zspage->first_page->index = handle | OBJ_ALLOCATED_TAG; + zspage->first_zpdesc->handle = handle | OBJ_ALLOCATED_TAG; kunmap_local(vaddr); mod_zspage_inuse(zspage, 1); - obj = location_to_obj(m_page, 
obj); + obj = location_to_obj(m_zpdesc, obj); record_obj(handle, obj); return obj; @@ -1402,23 +1414,24 @@ static void obj_free(int class_size, unsigned long obj) { struct link_free *link; struct zspage *zspage; - struct page *f_page; + struct zpdesc *f_zpdesc; unsigned long f_offset; unsigned int f_objidx; void *vaddr; - obj_to_location(obj, &f_page, &f_objidx); - f_offset = offset_in_page(class_size * f_objidx); - zspage = get_zspage(f_page); - vaddr = kmap_local_page(f_page); + obj_to_location(obj, &f_zpdesc, &f_objidx); + f_offset = offset_in_page(class_size * f_objidx); + zspage = get_zspage(f_zpdesc); + + vaddr = kmap_local_zpdesc(f_zpdesc); link = (struct link_free *)(vaddr + f_offset); /* Insert this object in containing zspage's freelist */ if (likely(!ZsHugePage(zspage))) link->next = get_freeobj(zspage) << OBJ_TAG_BITS; else - f_page->index = 0; + f_zpdesc->handle = 0; set_freeobj(zspage, f_objidx); kunmap_local(vaddr); @@ -1428,7 +1441,7 @@ static void obj_free(int class_size, unsigned long obj) void zs_free(struct zs_pool *pool, unsigned long handle) { struct zspage *zspage; - struct page *f_page; + struct zpdesc *f_zpdesc; unsigned long obj; struct size_class *class; int fullness; @@ -1442,8 +1455,8 @@ void zs_free(struct zs_pool *pool, unsigned long handle) */ read_lock(&pool->migrate_lock); obj = handle_to_obj(handle); - obj_to_page(obj, &f_page); - zspage = get_zspage(f_page); + obj_to_zpdesc(obj, &f_zpdesc); + zspage = get_zspage(f_zpdesc); class = zspage_class(pool, zspage); spin_lock(&class->lock); read_unlock(&pool->migrate_lock); @@ -1463,7 +1476,7 @@ EXPORT_SYMBOL_GPL(zs_free); static void zs_object_copy(struct size_class *class, unsigned long dst, unsigned long src) { - struct page *s_page, *d_page; + struct zpdesc *s_zpdesc, *d_zpdesc; unsigned int s_objidx, d_objidx; unsigned long s_off, d_off; void *s_addr, *d_addr; @@ -1472,8 +1485,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, s_size = d_size = class->size; - obj_to_location(src, &s_page, &s_objidx); - obj_to_location(dst, &d_page, &d_objidx); + obj_to_location(src, &s_zpdesc, &s_objidx); + obj_to_location(dst, &d_zpdesc, &d_objidx); s_off = offset_in_page(class->size * s_objidx); d_off = offset_in_page(class->size * d_objidx); @@ -1484,8 +1497,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, if (d_off + class->size > PAGE_SIZE) d_size = PAGE_SIZE - d_off; - s_addr = kmap_local_page(s_page); - d_addr = kmap_local_page(d_page); + s_addr = kmap_local_zpdesc(s_zpdesc); + d_addr = kmap_local_zpdesc(d_zpdesc); while (1) { size = min(s_size, d_size); @@ -1510,17 +1523,17 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, if (s_off >= PAGE_SIZE) { kunmap_local(d_addr); kunmap_local(s_addr); - s_page = get_next_page(s_page); - s_addr = kmap_local_page(s_page); - d_addr = kmap_local_page(d_page); + s_zpdesc = get_next_zpdesc(s_zpdesc); + s_addr = kmap_local_zpdesc(s_zpdesc); + d_addr = kmap_local_zpdesc(d_zpdesc); s_size = class->size - written; s_off = 0; } if (d_off >= PAGE_SIZE) { kunmap_local(d_addr); - d_page = get_next_page(d_page); - d_addr = kmap_local_page(d_page); + d_zpdesc = get_next_zpdesc(d_zpdesc); + d_addr = kmap_local_zpdesc(d_zpdesc); d_size = class->size - written; d_off = 0; } @@ -1535,18 +1548,18 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, * return handle. 
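* For example (editor's illustration with hypothetical numbers, not part of
* the patch): in a class of 1024-byte objects whose first object starts at
* in-page offset 0, *obj_idx == 2 begins probing at offset 2048 and advances
* one object size at a time until obj_allocated() reports a live handle or
* the offset reaches PAGE_SIZE.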
*/ static unsigned long find_alloced_obj(struct size_class *class, - struct page *page, int *obj_idx) + struct zpdesc *zpdesc, int *obj_idx) { unsigned int offset; int index = *obj_idx; unsigned long handle = 0; - void *addr = kmap_local_page(page); + void *addr = kmap_local_zpdesc(zpdesc); - offset = get_first_obj_offset(page); + offset = get_first_obj_offset(zpdesc); offset += class->size * index; while (offset < PAGE_SIZE) { - if (obj_allocated(page, addr + offset, &handle)) + if (obj_allocated(zpdesc, addr + offset, &handle)) break; offset += class->size; @@ -1566,14 +1579,14 @@ static void migrate_zspage(struct zs_pool *pool, struct zspage *src_zspage, unsigned long used_obj, free_obj; unsigned long handle; int obj_idx = 0; - struct page *s_page = get_first_page(src_zspage); + struct zpdesc *s_zpdesc = get_first_zpdesc(src_zspage); struct size_class *class = pool->size_class[src_zspage->class]; while (1) { - handle = find_alloced_obj(class, s_page, &obj_idx); + handle = find_alloced_obj(class, s_zpdesc, &obj_idx); if (!handle) { - s_page = get_next_page(s_page); - if (!s_page) + s_zpdesc = get_next_zpdesc(s_zpdesc); + if (!s_zpdesc) break; obj_idx = 0; continue; @@ -1653,7 +1666,7 @@ static int putback_zspage(struct size_class *class, struct zspage *zspage) */ static void lock_zspage(struct zspage *zspage) { - struct page *curr_page, *page; + struct zpdesc *curr_zpdesc, *zpdesc; /* * Pages we haven't locked yet can be migrated off the list while we're @@ -1665,24 +1678,24 @@ static void lock_zspage(struct zspage *zspage) */ while (1) { migrate_read_lock(zspage); - page = get_first_page(zspage); - if (trylock_page(page)) + zpdesc = get_first_zpdesc(zspage); + if (zpdesc_trylock(zpdesc)) break; - get_page(page); + zpdesc_get(zpdesc); migrate_read_unlock(zspage); - wait_on_page_locked(page); - put_page(page); + zpdesc_wait_locked(zpdesc); + zpdesc_put(zpdesc); } - curr_page = page; - while ((page = get_next_page(curr_page))) { - if (trylock_page(page)) { - curr_page = page; + curr_zpdesc = zpdesc; + while ((zpdesc = get_next_zpdesc(curr_zpdesc))) { + if (zpdesc_trylock(zpdesc)) { + curr_zpdesc = zpdesc; } else { - get_page(page); + zpdesc_get(zpdesc); migrate_read_unlock(zspage); - wait_on_page_locked(page); - put_page(page); + zpdesc_wait_locked(zpdesc); + zpdesc_put(zpdesc); migrate_read_lock(zspage); } } @@ -1720,26 +1733,28 @@ static void migrate_write_unlock(struct zspage *zspage) static const struct movable_operations zsmalloc_mops; static void replace_sub_page(struct size_class *class, struct zspage *zspage, - struct page *newpage, struct page *oldpage) + struct zpdesc *newzpdesc, struct zpdesc *oldzpdesc) { - struct page *page; - struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, }; + struct zpdesc *zpdesc; + struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, }; + unsigned int first_obj_offset; int idx = 0; - page = get_first_page(zspage); + zpdesc = get_first_zpdesc(zspage); do { - if (page == oldpage) - pages[idx] = newpage; + if (zpdesc == oldzpdesc) + zpdescs[idx] = newzpdesc; else - pages[idx] = page; + zpdescs[idx] = zpdesc; idx++; - } while ((page = get_next_page(page)) != NULL); + } while ((zpdesc = get_next_zpdesc(zpdesc)) != NULL); - create_page_chain(class, zspage, pages); - set_first_obj_offset(newpage, get_first_obj_offset(oldpage)); + create_page_chain(class, zspage, zpdescs); + first_obj_offset = get_first_obj_offset(oldzpdesc); + set_first_obj_offset(newzpdesc, first_obj_offset); if (unlikely(ZsHugePage(zspage))) - newpage->index = oldpage->index; - 
__SetPageMovable(newpage, &zsmalloc_mops); + newzpdesc->handle = oldzpdesc->handle; + __zpdesc_set_movable(newzpdesc, &zsmalloc_mops); } static bool zs_page_isolate(struct page *page, isolate_mode_t mode) @@ -1759,20 +1774,22 @@ static int zs_page_migrate(struct page *newpage, struct page *page, struct zs_pool *pool; struct size_class *class; struct zspage *zspage; - struct page *dummy; + struct zpdesc *dummy; + struct zpdesc *newzpdesc = page_zpdesc(newpage); + struct zpdesc *zpdesc = page_zpdesc(page); void *s_addr, *d_addr, *addr; unsigned int offset; unsigned long handle; unsigned long old_obj, new_obj; unsigned int obj_idx; - VM_BUG_ON_PAGE(!PageIsolated(page), page); + VM_BUG_ON_PAGE(!zpdesc_is_isolated(zpdesc), zpdesc_page(zpdesc)); /* We're committed, tell the world that this is a Zsmalloc page. */ - __SetPageZsmalloc(newpage); + __zpdesc_set_zsmalloc(newzpdesc); /* The page is locked, so this pointer must remain valid */ - zspage = get_zspage(page); + zspage = get_zspage(zpdesc); pool = zspage->pool; /* @@ -1789,30 +1806,29 @@ static int zs_page_migrate(struct page *newpage, struct page *page, /* the migrate_write_lock protects zpage access via zs_map_object */ migrate_write_lock(zspage); - offset = get_first_obj_offset(page); - s_addr = kmap_local_page(page); + offset = get_first_obj_offset(zpdesc); + s_addr = kmap_local_zpdesc(zpdesc); /* * Here, any user cannot access all objects in the zspage so let's move. */ - d_addr = kmap_local_page(newpage); + d_addr = kmap_local_zpdesc(newzpdesc); copy_page(d_addr, s_addr); kunmap_local(d_addr); for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE; addr += class->size) { - if (obj_allocated(page, addr, &handle)) { + if (obj_allocated(zpdesc, addr, &handle)) { old_obj = handle_to_obj(handle); obj_to_location(old_obj, &dummy, &obj_idx); - new_obj = (unsigned long)location_to_obj(newpage, - obj_idx); + new_obj = (unsigned long)location_to_obj(newzpdesc, obj_idx); record_obj(handle, new_obj); } } kunmap_local(s_addr); - replace_sub_page(class, zspage, newpage, page); + replace_sub_page(class, zspage, newzpdesc, zpdesc); /* * Since we complete the data copy and set up new zspage structure, * it's okay to release migration_lock. 
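[Reviewer note, not part of the patch: the conversion above is mechanical, struct page becomes struct zpdesc and each page helper becomes its zpdesc equivalent, so the object-access flow is unchanged. The condensed sketch below restates that flow using only helpers that appear in this diff; the function name map_object_sketch is illustrative, and the locking and ZS_HANDLE_SIZE adjustment of the real zs_map_object() are omitted for brevity.]

static void *map_object_sketch(struct mapping_area *area,
			       struct size_class *class, unsigned long handle)
{
	struct zpdesc *zpdesc, *zpdescs[2];
	unsigned int obj_idx;
	unsigned long obj, off;

	obj = handle_to_obj(handle);
	obj_to_location(obj, &zpdesc, &obj_idx);
	off = offset_in_page(class->size * obj_idx);

	if (off + class->size <= PAGE_SIZE)
		/* object sits entirely within one page: map one zpdesc */
		return kmap_local_zpdesc(zpdesc) + off;

	/* object spans a page boundary: also map the chained zpdesc */
	zpdescs[0] = zpdesc;
	zpdescs[1] = get_next_zpdesc(zpdesc);
	return __zs_map_object(area, zpdescs, off, class->size);
}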
@@ -1821,14 +1837,14 @@ static int zs_page_migrate(struct page *newpage, struct page *page, spin_unlock(&class->lock); migrate_write_unlock(zspage); - get_page(newpage); - if (page_zone(newpage) != page_zone(page)) { - dec_zone_page_state(page, NR_ZSPAGES); - inc_zone_page_state(newpage, NR_ZSPAGES); + zpdesc_get(newzpdesc); + if (zpdesc_zone(newzpdesc) != zpdesc_zone(zpdesc)) { + zpdesc_dec_zone_page_state(zpdesc); + zpdesc_inc_zone_page_state(newzpdesc); } - reset_page(page); - put_page(page); + reset_zpdesc(zpdesc); + zpdesc_put(zpdesc); return MIGRATEPAGE_SUCCESS; } @@ -1897,13 +1913,13 @@ static void init_deferred_free(struct zs_pool *pool) static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) { - struct page *page = get_first_page(zspage); + struct zpdesc *zpdesc = get_first_zpdesc(zspage); do { - WARN_ON(!trylock_page(page)); - __SetPageMovable(page, &zsmalloc_mops); - unlock_page(page); - } while ((page = get_next_page(page)) != NULL); + WARN_ON(!zpdesc_trylock(zpdesc)); + __zpdesc_set_movable(zpdesc, &zsmalloc_mops); + zpdesc_unlock(zpdesc); + } while ((zpdesc = get_next_zpdesc(zpdesc)) != NULL); } #else static inline void zs_flush_migration(struct zs_pool *pool) { } diff --git a/mm/zswap.c b/mm/zswap.c index 30f5a27a6862..167ae641379f 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1186,7 +1186,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o /* * It's safe to drop the lock here because we return either - * LRU_REMOVED_RETRY or LRU_RETRY. + * LRU_REMOVED_RETRY, LRU_RETRY or LRU_STOP. */ spin_unlock(&l->lock); diff --git a/net/core/page_pool.c b/net/core/page_pool.c index f89cf93f6eb4..8a91c1972dc5 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -532,12 +532,11 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, if (unlikely(pool->alloc.count > 0)) return pool->alloc.cache[--pool->alloc.count]; - /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */ + /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk */ memset(&pool->alloc.cache, 0, sizeof(void *) * bulk); - nr_pages = alloc_pages_bulk_array_node(gfp, - pool->p.nid, bulk, - (struct page **)pool->alloc.cache); + nr_pages = alloc_pages_bulk_node(gfp, pool->p.nid, bulk, + (struct page **)pool->alloc.cache); if (unlikely(!nr_pages)) return 0; diff --git a/net/mpls/internal.h b/net/mpls/internal.h index b9f492ddf93b..83c629529b57 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -33,7 +33,7 @@ struct mpls_dev { #define MPLS_INC_STATS_LEN(mdev, len, pkts_field, bytes_field) \ do { \ - __typeof__(*(mdev)->stats) *ptr = \ + TYPEOF_UNQUAL(*(mdev)->stats) *ptr = \ raw_cpu_ptr((mdev)->stats); \ local_bh_disable(); \ u64_stats_update_begin(&ptr->syncp); \ @@ -45,7 +45,7 @@ struct mpls_dev { #define MPLS_INC_STATS(mdev, field) \ do { \ - __typeof__(*(mdev)->stats) *ptr = \ + TYPEOF_UNQUAL(*(mdev)->stats) *ptr = \ raw_cpu_ptr((mdev)->stats); \ local_bh_disable(); \ u64_stats_update_begin(&ptr->syncp); \ diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 79879b7d39cb..e7f9c295d13c 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -651,8 +651,8 @@ svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node) if (pages > RPCSVC_MAXPAGES) pages = RPCSVC_MAXPAGES; - ret = alloc_pages_bulk_array_node(GFP_KERNEL, node, pages, - rqstp->rq_pages); + ret = alloc_pages_bulk_node(GFP_KERNEL, node, pages, + rqstp->rq_pages); return ret == pages; } diff --git a/net/sunrpc/svc_xprt.c 
b/net/sunrpc/svc_xprt.c index 43c57124de52..aebc0d8ddff5 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -671,8 +671,7 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp)
 	}
 	for (filled = 0; filled < pages; filled = ret) {
-		ret = alloc_pages_bulk_array(GFP_KERNEL, pages,
-					     rqstp->rq_pages);
+		ret = alloc_pages_bulk(GFP_KERNEL, pages, rqstp->rq_pages);
 		if (ret > filled)
 			/* Made progress, don't sleep yet */
 			continue;
diff --git a/samples/Kconfig b/samples/Kconfig
index b288d9991d27..8d5a36f0e5d6 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -293,6 +293,8 @@ config SAMPLE_CGROUP
 source "samples/rust/Kconfig"
+source "samples/damon/Kconfig"
+
 endif # SAMPLES
 config HAVE_SAMPLE_FTRACE_DIRECT
diff --git a/samples/Makefile b/samples/Makefile
index b85fa64390c5..5af6bb8afb07 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -39,3 +39,5 @@ obj-$(CONFIG_SAMPLE_KMEMLEAK) += kmemleak/
 obj-$(CONFIG_SAMPLE_CORESIGHT_SYSCFG) += coresight/
 obj-$(CONFIG_SAMPLE_FPROBE) += fprobe/
 obj-$(CONFIG_SAMPLES_RUST) += rust/
+obj-$(CONFIG_SAMPLE_DAMON_WSSE) += damon/
+obj-$(CONFIG_SAMPLE_DAMON_PRCL) += damon/
diff --git a/samples/damon/Kconfig b/samples/damon/Kconfig
new file mode 100644
index 000000000000..63f6dcd71daa
--- /dev/null
+++ b/samples/damon/Kconfig
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: GPL-2.0
+
+menu "DAMON Samples"
+
+config SAMPLE_DAMON_WSSE
+	bool "DAMON sample module for working set size estimation"
+	depends on DAMON && DAMON_VADDR
+	help
+	  This builds the DAMON sample module for working set size estimation.
+
+	  The module receives a pid, monitors accesses to the virtual address
+	  space of the process, estimates the working set size of the process,
+	  and repeatedly prints the size to the kernel log.
+
+	  If unsure, say N.
+
+config SAMPLE_DAMON_PRCL
+	bool "DAMON sample module for access-aware proactive reclamation"
+	depends on DAMON && DAMON_VADDR
+	help
+	  This builds the DAMON sample module for access-aware proactive
+	  reclamation.
+
+	  The module receives a pid, monitors accesses to the virtual address
+	  space of the process, finds memory regions that are not accessed,
+	  and proactively reclaims those regions.
+
+	  If unsure, say N.
+
+endmenu
diff --git a/samples/damon/Makefile b/samples/damon/Makefile
new file mode 100644
index 000000000000..7f155143f237
--- /dev/null
+++ b/samples/damon/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_SAMPLE_DAMON_WSSE) += wsse.o
+obj-$(CONFIG_SAMPLE_DAMON_PRCL) += prcl.o
diff --git a/samples/damon/prcl.c b/samples/damon/prcl.c
new file mode 100644
index 000000000000..c3acbdab7a62
--- /dev/null
+++ b/samples/damon/prcl.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * proactive reclamation: monitor access pattern of a given process, find
+ * regions that seem not accessed, and proactively page out the regions.
+ */
+
+#define pr_fmt(fmt) "damon_sample_prcl: " fmt
+
+#include <linux/damon.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+static int target_pid __read_mostly;
+module_param(target_pid, int, 0600);
+
+static int damon_sample_prcl_enable_store(
+		const char *val, const struct kernel_param *kp);
+
+static const struct kernel_param_ops enable_param_ops = {
+	.set = damon_sample_prcl_enable_store,
+	.get = param_get_bool,
+};
+
+static bool enable __read_mostly;
+module_param_cb(enable, &enable_param_ops, &enable, 0600);
+MODULE_PARM_DESC(enable, "Enable or disable DAMON_SAMPLE_PRCL");
+
+static struct damon_ctx *ctx;
+static struct pid *target_pidp;
+
+static int damon_sample_prcl_after_aggregate(struct damon_ctx *c)
+{
+	struct damon_target *t;
+
+	damon_for_each_target(t, c) {
+		struct damon_region *r;
+		unsigned long wss = 0;
+
+		damon_for_each_region(r, t) {
+			if (r->nr_accesses > 0)
+				wss += r->ar.end - r->ar.start;
+		}
+		pr_info("wss: %lu\n", wss);
+	}
+	return 0;
+}
+
+static int damon_sample_prcl_start(void)
+{
+	struct damon_target *target;
+	struct damos *scheme;
+
+	pr_info("start\n");
+
+	ctx = damon_new_ctx();
+	if (!ctx)
+		return -ENOMEM;
+	if (damon_select_ops(ctx, DAMON_OPS_VADDR)) {
+		damon_destroy_ctx(ctx);
+		return -EINVAL;
+	}
+
+	target = damon_new_target();
+	if (!target) {
+		damon_destroy_ctx(ctx);
+		return -ENOMEM;
+	}
+	damon_add_target(ctx, target);
+	target_pidp = find_get_pid(target_pid);
+	if (!target_pidp) {
+		damon_destroy_ctx(ctx);
+		return -EINVAL;
+	}
+	target->pid = target_pidp;
+
+	ctx->callback.after_aggregation = damon_sample_prcl_after_aggregate;
+
+	scheme = damon_new_scheme(
+			&(struct damos_access_pattern) {
+			.min_sz_region = PAGE_SIZE,
+			.max_sz_region = ULONG_MAX,
+			.min_nr_accesses = 0,
+			.max_nr_accesses = 0,
+			.min_age_region = 50,
+			.max_age_region = UINT_MAX},
+			DAMOS_PAGEOUT,
+			0,
+			&(struct damos_quota){},
+			&(struct damos_watermarks){},
+			NUMA_NO_NODE);
+	if (!scheme) {
+		damon_destroy_ctx(ctx);
+		return -ENOMEM;
+	}
+	damon_set_schemes(ctx, &scheme, 1);
+
+	return damon_start(&ctx, 1, true);
+}
+
+static void damon_sample_prcl_stop(void)
+{
+	pr_info("stop\n");
+	if (ctx) {
+		damon_stop(&ctx, 1);
+		damon_destroy_ctx(ctx);
+	}
+	if (target_pidp)
+		put_pid(target_pidp);
+}
+
+static int damon_sample_prcl_enable_store(
+		const char *val, const struct kernel_param *kp)
+{
+	bool enabled = enable;
+	int err;
+
+	err = kstrtobool(val, &enable);
+	if (err)
+		return err;
+
+	if (enable == enabled)
+		return 0;
+
+	if (enable)
+		return damon_sample_prcl_start();
+	damon_sample_prcl_stop();
+	return 0;
+}
+
+static int __init damon_sample_prcl_init(void)
+{
+	return 0;
+}
+
+module_init(damon_sample_prcl_init);
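[Reviewer note, not part of the patch: both sample modules share the same control interface, the target_pid and enable parameters declared above. The userspace sketch below shows one hypothetical way to drive the prcl module; the /sys/module/prcl/parameters/ paths assume the usual moduleparam sysfs layout for the object names in samples/damon/Makefile, and write_param() is an illustrative helper.]

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

static void write_param(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, val, strlen(val)) < 0) {
		perror(path);
		exit(1);
	}
	close(fd);
}

int main(int argc, char *argv[])
{
	if (argc != 2) {
		fprintf(stderr, "Usage: %s <pid>\n", argv[0]);
		return 1;
	}
	/* point the module at the target process, then switch it on */
	write_param("/sys/module/prcl/parameters/target_pid", argv[1]);
	write_param("/sys/module/prcl/parameters/enable", "Y");
	/* reclamation progress is reported to the kernel log via pr_info() */
	return 0;
}

diff --git a/samples/damon/wsse.c b/samples/damon/wsse.c
new file mode 100644
index 000000000000..11be25803274
--- /dev/null
+++ b/samples/damon/wsse.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * working set size estimation: monitor access pattern of given process and
+ * print estimated working set size (total size of regions showing some
+ * access).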
+ */
+
+#define pr_fmt(fmt) "damon_sample_wsse: " fmt
+
+#include <linux/damon.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+static int target_pid __read_mostly;
+module_param(target_pid, int, 0600);
+
+static int damon_sample_wsse_enable_store(
+		const char *val, const struct kernel_param *kp);
+
+static const struct kernel_param_ops enable_param_ops = {
+	.set = damon_sample_wsse_enable_store,
+	.get = param_get_bool,
+};
+
+static bool enable __read_mostly;
+module_param_cb(enable, &enable_param_ops, &enable, 0600);
+MODULE_PARM_DESC(enable, "Enable or disable DAMON_SAMPLE_WSSE");
+
+static struct damon_ctx *ctx;
+static struct pid *target_pidp;
+
+static int damon_sample_wsse_after_aggregate(struct damon_ctx *c)
+{
+	struct damon_target *t;
+
+	damon_for_each_target(t, c) {
+		struct damon_region *r;
+		unsigned long wss = 0;
+
+		damon_for_each_region(r, t) {
+			if (r->nr_accesses > 0)
+				wss += r->ar.end - r->ar.start;
+		}
+		pr_info("wss: %lu\n", wss);
+	}
+	return 0;
+}
+
+static int damon_sample_wsse_start(void)
+{
+	struct damon_target *target;
+
+	pr_info("start\n");
+
+	ctx = damon_new_ctx();
+	if (!ctx)
+		return -ENOMEM;
+	if (damon_select_ops(ctx, DAMON_OPS_VADDR)) {
+		damon_destroy_ctx(ctx);
+		return -EINVAL;
+	}
+
+	target = damon_new_target();
+	if (!target) {
+		damon_destroy_ctx(ctx);
+		return -ENOMEM;
+	}
+	damon_add_target(ctx, target);
+	target_pidp = find_get_pid(target_pid);
+	if (!target_pidp) {
+		damon_destroy_ctx(ctx);
+		return -EINVAL;
+	}
+	target->pid = target_pidp;
+
+	ctx->callback.after_aggregation = damon_sample_wsse_after_aggregate;
+	return damon_start(&ctx, 1, true);
+}
+
+static void damon_sample_wsse_stop(void)
+{
+	pr_info("stop\n");
+	if (ctx) {
+		damon_stop(&ctx, 1);
+		damon_destroy_ctx(ctx);
+	}
+	if (target_pidp)
+		put_pid(target_pidp);
+}
+
+static int damon_sample_wsse_enable_store(
+		const char *val, const struct kernel_param *kp)
+{
+	bool enabled = enable;
+	int err;
+
+	err = kstrtobool(val, &enable);
+	if (err)
+		return err;
+
+	if (enable == enabled)
+		return 0;
+
+	if (enable)
+		return damon_sample_wsse_start();
+	damon_sample_wsse_stop();
+	return 0;
+}
+
+static int __init damon_sample_wsse_init(void)
+{
+	return 0;
+}
+
+module_init(damon_sample_wsse_init);
diff --git a/tools/testing/kunit/configs/all_tests.config b/tools/testing/kunit/configs/all_tests.config
index b3b00269a52a..b0049be00c70 100644
--- a/tools/testing/kunit/configs/all_tests.config
+++ b/tools/testing/kunit/configs/all_tests.config
@@ -38,9 +38,6 @@ CONFIG_IWLWIFI=y
 CONFIG_DAMON=y
 CONFIG_DAMON_VADDR=y
 CONFIG_DAMON_PADDR=y
-CONFIG_DEBUG_FS=y
-CONFIG_DAMON_DBGFS=y
-CONFIG_DAMON_DBGFS_DEPRECATED=y
 CONFIG_REGMAP_BUILD=y
diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c
index cffaf2245d4f..eaff1b036989 100644
--- a/tools/testing/radix-tree/multiorder.c
+++ b/tools/testing/radix-tree/multiorder.c
@@ -227,6 +227,7 @@ static void *load_creator(void *ptr)
 			unsigned long index = (3 << RADIX_TREE_MAP_SHIFT) -
 						(1 << order);
 			item_insert_order(tree, index, order);
+			xa_set_mark(tree, index, XA_MARK_1);
 			item_delete_rcu(tree, index);
 		}
 	}
@@ -242,8 +243,11 @@ static void *load_worker(void *ptr)
 	rcu_register_thread();
 	while (!stop_iteration) {
+		unsigned long find_index = (2 << RADIX_TREE_MAP_SHIFT) + 1;
 		struct item *item = xa_load(ptr, index);
 		assert(!xa_is_internal(item));
+		item = xa_find(ptr, &find_index, index, XA_MARK_1);
+		assert(!xa_is_internal(item));
 	}
 	rcu_unregister_thread();
diff --git a/tools/testing/selftests/damon/.gitignore
b/tools/testing/selftests/damon/.gitignore index 2ab675fecb6b..2f0297657c81 100644 --- a/tools/testing/selftests/damon/.gitignore +++ b/tools/testing/selftests/damon/.gitignore @@ -1,6 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -huge_count_read_write -debugfs_target_ids_read_before_terminate_race -debugfs_target_ids_pid_leak access_memory access_memory_even diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 812f656260fb..ecbf07afc6dd 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -1,15 +1,11 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for damon selftests -TEST_GEN_FILES += huge_count_read_write -TEST_GEN_FILES += debugfs_target_ids_read_before_terminate_race -TEST_GEN_FILES += debugfs_target_ids_pid_leak TEST_GEN_FILES += access_memory access_memory_even -TEST_FILES = _chk_dependency.sh _debugfs_common.sh _damon_sysfs.py +TEST_FILES = _chk_dependency.sh _damon_sysfs.py # functionality tests -TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh TEST_PROGS += sysfs.sh TEST_PROGS += sysfs_update_schemes_tried_regions_wss_estimation.py TEST_PROGS += damos_quota.py damos_quota_goal.py damos_apply_interval.py @@ -17,11 +13,6 @@ TEST_PROGS += damos_tried_regions.py damon_nr_regions.py TEST_PROGS += reclaim.sh lru_sort.sh # regression tests (reproducers of previously found bugs) -TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh -TEST_PROGS += debugfs_duplicate_context_creation.sh -TEST_PROGS += debugfs_rm_non_contexts.sh -TEST_PROGS += debugfs_target_ids_read_before_terminate_race.sh -TEST_PROGS += debugfs_target_ids_pid_leak.sh TEST_PROGS += sysfs_update_removed_scheme_dir.sh TEST_PROGS += sysfs_update_schemes_tried_regions_hang.py diff --git a/tools/testing/selftests/damon/config b/tools/testing/selftests/damon/config index 0daf38974eb0..a68a9fead5dc 100644 --- a/tools/testing/selftests/damon/config +++ b/tools/testing/selftests/damon/config @@ -1,6 +1,5 @@ CONFIG_DAMON=y CONFIG_DAMON_SYSFS=y -CONFIG_DAMON_DBGFS=y CONFIG_DAMON_PADDR=y CONFIG_DAMON_VADDR=y CONFIG_DAMON_RECLAIM=y diff --git a/tools/testing/selftests/damon/debugfs_attrs.sh b/tools/testing/selftests/damon/debugfs_attrs.sh deleted file mode 100755 index 902e312bca89..000000000000 --- a/tools/testing/selftests/damon/debugfs_attrs.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -source _debugfs_common.sh - -# Test attrs file -# =============== - -file="$DBGFS/attrs" -orig_content=$(cat "$file") - -test_write_succ "$file" "1 2 3 4 5" "$orig_content" "valid input" -test_write_fail "$file" "1 2 3 4" "$orig_content" "no enough fields" -test_write_fail "$file" "1 2 3 5 4" "$orig_content" \ - "min_nr_regions > max_nr_regions" -test_content "$file" "$orig_content" "1 2 3 4 5" "successfully written" -echo "$orig_content" > "$file" diff --git a/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh b/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh deleted file mode 100755 index bd6c22d96ead..000000000000 --- a/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -source _debugfs_common.sh - -# Test duplicated context creation -# ================================ - -if ! 
echo foo > "$DBGFS/mk_contexts" -then - echo "context creation failed" - exit 1 -fi - -if echo foo > "$DBGFS/mk_contexts" 2> /dev/null -then - echo "duplicate context creation success" - exit 1 -fi - -if ! echo foo > "$DBGFS/rm_contexts" -then - echo "context deletion failed" - exit 1 -fi - -exit 0 diff --git a/tools/testing/selftests/damon/debugfs_empty_targets.sh b/tools/testing/selftests/damon/debugfs_empty_targets.sh deleted file mode 100755 index effbea33dc16..000000000000 --- a/tools/testing/selftests/damon/debugfs_empty_targets.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -source _debugfs_common.sh - -# Test empty targets case -# ======================= - -orig_target_ids=$(cat "$DBGFS/target_ids") -echo "" > "$DBGFS/target_ids" - -if [ -f "$DBGFS/monitor_on_DEPRECATED" ] -then - monitor_on_file="$DBGFS/monitor_on_DEPRECATED" -else - monitor_on_file="$DBGFS/monitor_on" -fi - -orig_monitor_on=$(cat "$monitor_on_file") -test_write_fail "$monitor_on_file" "on" "orig_monitor_on" "empty target ids" -echo "$orig_target_ids" > "$DBGFS/target_ids" diff --git a/tools/testing/selftests/damon/debugfs_huge_count_read_write.sh b/tools/testing/selftests/damon/debugfs_huge_count_read_write.sh deleted file mode 100755 index 922cadac2950..000000000000 --- a/tools/testing/selftests/damon/debugfs_huge_count_read_write.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -source _debugfs_common.sh - -# Test huge count read write -# ========================== - -dmesg -C - -for file in "$DBGFS/"* -do - ./huge_count_read_write "$file" -done - -if dmesg | grep -q WARNING -then - dmesg - exit 1 -else - exit 0 -fi diff --git a/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh b/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh deleted file mode 100755 index f3ffeb1343cf..000000000000 --- a/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -source _debugfs_common.sh - -# Test putting non-ctx files/dirs to rm_contexts file -# =================================================== - -dmesg -C - -for file in "$DBGFS/"* -do - (echo "$(basename "$f")" > "$DBGFS/rm_contexts") &> /dev/null - if dmesg | grep -q BUG - then - dmesg - exit 1 - fi -done diff --git a/tools/testing/selftests/damon/debugfs_schemes.sh b/tools/testing/selftests/damon/debugfs_schemes.sh deleted file mode 100755 index 5b39ab44731c..000000000000 --- a/tools/testing/selftests/damon/debugfs_schemes.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -source _debugfs_common.sh - -# Test schemes file -# ================= - -file="$DBGFS/schemes" -orig_content=$(cat "$file") - -test_write_succ "$file" "1 2 3 4 5 6 4 0 0 0 1 2 3 1 100 3 2 1" \ - "$orig_content" "valid input" -test_write_fail "$file" "1 2 -3 4 5 6 3 0 0 0 1 2 3 1 100 3 2 1" "$orig_content" "multi lines" -test_write_succ "$file" "" "$orig_content" "disabling" -test_write_fail "$file" "2 1 2 1 10 1 3 10 1 1 1 1 1 1 1 1 2 3" \ - "$orig_content" "wrong condition ranges" -echo "$orig_content" > "$file" diff --git a/tools/testing/selftests/damon/debugfs_target_ids.sh b/tools/testing/selftests/damon/debugfs_target_ids.sh deleted file mode 100755 index 49aeabdb0aae..000000000000 --- a/tools/testing/selftests/damon/debugfs_target_ids.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -source _debugfs_common.sh - -# Test target_ids file -# ==================== 
- -file="$DBGFS/target_ids" -orig_content=$(cat "$file") - -test_write_succ "$file" "1 2 3 4" "$orig_content" "valid input" -test_write_succ "$file" "1 2 abc 4" "$orig_content" "still valid input" -test_content "$file" "$orig_content" "1 2" "non-integer was there" -test_write_succ "$file" "abc 2 3" "$orig_content" "the file allows wrong input" -test_content "$file" "$orig_content" "" "wrong input written" -test_write_succ "$file" "" "$orig_content" "empty input" -test_content "$file" "$orig_content" "" "empty input written" -echo "$orig_content" > "$file" diff --git a/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.c b/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.c deleted file mode 100644 index 0cc2eef7d142..000000000000 --- a/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.c +++ /dev/null @@ -1,68 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Author: SeongJae Park - */ - -#define _GNU_SOURCE - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define DBGFS_TARGET_IDS "/sys/kernel/debug/damon/target_ids" - -static void write_targetid_exit(void) -{ - int target_ids_fd = open(DBGFS_TARGET_IDS, O_RDWR); - char pid_str[128]; - - snprintf(pid_str, sizeof(pid_str), "%d", getpid()); - write(target_ids_fd, pid_str, sizeof(pid_str)); - close(target_ids_fd); - exit(0); -} - -unsigned long msec_timestamp(void) -{ - struct timeval tv; - - gettimeofday(&tv, NULL); - return tv.tv_sec * 1000UL + tv.tv_usec / 1000; -} - -int main(int argc, char *argv[]) -{ - unsigned long start_ms; - int time_to_run, nr_forks = 0; - - if (argc != 2) { - fprintf(stderr, "Usage: %s \n", argv[0]); - exit(1); - } - time_to_run = atoi(argv[1]); - - start_ms = msec_timestamp(); - while (true) { - int pid = fork(); - - if (pid < 0) { - fprintf(stderr, "fork() failed\n"); - exit(1); - } - if (pid == 0) - write_targetid_exit(); - wait(NULL); - nr_forks++; - - if (msec_timestamp() - start_ms > time_to_run) - break; - } - printf("%d\n", nr_forks); - return 0; -} diff --git a/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh b/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh deleted file mode 100755 index 31fe33c2b032..000000000000 --- a/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -before=$(grep "^pid " /proc/slabinfo | awk '{print $2}') - -nr_leaks=$(./debugfs_target_ids_pid_leak 1000) -expected_after_max=$((before + nr_leaks / 2)) - -after=$(grep "^pid " /proc/slabinfo | awk '{print $2}') - -echo > /sys/kernel/debug/damon/target_ids - -echo "tried $nr_leaks pid leak" -echo "number of active pid slabs: $before -> $after" -echo "(up to $expected_after_max expected)" -if [ $after -gt $expected_after_max ] -then - echo "maybe pids are leaking" - exit 1 -else - exit 0 -fi diff --git a/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.c b/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.c deleted file mode 100644 index b06f52a8ce2d..000000000000 --- a/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.c +++ /dev/null @@ -1,80 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Author: SeongJae Park - */ -#define _GNU_SOURCE - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define DBGFS_MONITOR_ON "/sys/kernel/debug/damon/monitor_on_DEPRECATED" -#define DBGFS_TARGET_IDS "/sys/kernel/debug/damon/target_ids" - 
-static void turn_damon_on_exit(void) -{ - int target_ids_fd = open(DBGFS_TARGET_IDS, O_RDWR); - int monitor_on_fd = open(DBGFS_MONITOR_ON, O_RDWR); - char pid_str[128]; - - snprintf(pid_str, sizeof(pid_str), "%d", getpid()); - write(target_ids_fd, pid_str, sizeof(pid_str)); - write(monitor_on_fd, "on\n", 3); - close(target_ids_fd); - close(monitor_on_fd); - usleep(1000); - exit(0); -} - -static void try_race(void) -{ - int target_ids_fd = open(DBGFS_TARGET_IDS, O_RDWR); - int pid = fork(); - int buf[256]; - - if (pid < 0) { - fprintf(stderr, "fork() failed\n"); - exit(1); - } - if (pid == 0) - turn_damon_on_exit(); - while (true) { - int status; - - read(target_ids_fd, buf, sizeof(buf)); - if (waitpid(-1, &status, WNOHANG) == pid) - break; - } - close(target_ids_fd); -} - -static inline uint64_t ts_to_ms(struct timespec *ts) -{ - return (uint64_t)ts->tv_sec * 1000 + (uint64_t)ts->tv_nsec / 1000000; -} - -int main(int argc, char *argv[]) -{ - struct timespec start_time, now; - int runtime_ms; - - if (argc != 2) { - fprintf(stderr, "Usage: %s \n", argv[0]); - exit(1); - } - runtime_ms = atoi(argv[1]); - clock_gettime(CLOCK_MONOTONIC, &start_time); - while (true) { - try_race(); - clock_gettime(CLOCK_MONOTONIC, &now); - if (ts_to_ms(&now) - ts_to_ms(&start_time) > runtime_ms) - break; - } - return 0; -} diff --git a/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh b/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh deleted file mode 100755 index fc793c4c9aea..000000000000 --- a/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -dmesg -C - -./debugfs_target_ids_read_before_terminate_race 5000 - -if dmesg | grep -q dbgfs_target_ids_read -then - dmesg - exit 1 -else - exit 0 -fi diff --git a/tools/testing/selftests/damon/huge_count_read_write.c b/tools/testing/selftests/damon/huge_count_read_write.c deleted file mode 100644 index 53e69a669668..000000000000 --- a/tools/testing/selftests/damon/huge_count_read_write.c +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Author: SeongJae Park - */ - -#include -#include -#include -#include - -#pragma GCC diagnostic push -#if __GNUC__ >= 11 && __GNUC_MINOR__ >= 1 -/* Ignore read(2) overflow and write(2) overread compile warnings */ -#pragma GCC diagnostic ignored "-Wstringop-overread" -#pragma GCC diagnostic ignored "-Wstringop-overflow" -#endif - -void write_read_with_huge_count(char *file) -{ - int filedesc = open(file, O_RDWR); - char buf[256]; - int ret; - - printf("%s %s\n", __func__, file); - if (filedesc < 0) { - fprintf(stderr, "failed opening %s\n", file); - exit(1); - } - - write(filedesc, "", 0xfffffffful); - ret = read(filedesc, buf, 0xfffffffful); - close(filedesc); -} - -#pragma GCC diagnostic pop - -int main(int argc, char *argv[]) -{ - if (argc != 2) { - fprintf(stderr, "Usage: %s \n", argv[0]); - exit(1); - } - write_read_with_huge_count(argv[1]); - - return 0; -} diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index a5a72415e37b..66d444ae1676 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -174,7 +174,7 @@ static void test_name(struct __test_metadata *_metadata); \ static inline void wrapper_##test_name( \ struct __test_metadata *_metadata, \ - struct __fixture_variant_metadata *variant) \ + struct __fixture_variant_metadata 
__attribute__((unused)) *variant) \ { \ _metadata->setup_completed = true; \ if (setjmp(_metadata->env) == 0) \ @@ -756,7 +756,7 @@ /* Avoid multiple evaluation of the cases */ \ __typeof__(_expected) __exp = (_expected); \ __typeof__(_seen) __seen = (_seen); \ - if (!(__exp _t __seen)) { \ + if (!(__exp _t (__typeof__(_expected)) __seen)) { \ /* Report with actual signedness to avoid weird output. */ \ switch (is_signed_type(__exp) * 2 + is_signed_type(__seen)) { \ case 0: { \ @@ -965,7 +965,7 @@ static inline void __test_check_assert(struct __test_metadata *t) } struct __test_metadata *__active_test; -static void __timeout_handler(int sig, siginfo_t *info, void *ucontext) +static void __timeout_handler(int sig, siginfo_t *info, void __attribute__((unused)) *ucontext) { struct __test_metadata *t = __active_test; diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index 8f01f4da1c0d..121000c28c10 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -27,6 +27,7 @@ protection_keys_64 madv_populate uffd-stress uffd-unit-tests +uffd-wp-mremap mlock-intersect-test mlock-random-test virtual_address_range @@ -36,6 +37,9 @@ map_fixed_noreplace write_to_hugetlbfs hmm-tests memfd_secret +hugetlb_dio +pkey_sighandler_tests_32 +pkey_sighandler_tests_64 soft-dirty split_huge_page_test ksm_tests @@ -49,7 +53,6 @@ va_high_addr_switch hugetlb_fault_after_madv hugetlb_madv_vs_map mseal_test -seal_elf droppable hugetlb_dio pkey_sighandler_tests_32 diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 3de23ea4663f..006ed2e8df87 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -33,9 +33,17 @@ endif # LDLIBS. MAKEFLAGS += --no-builtin-rules -CFLAGS = -Wall -I $(top_srcdir) $(EXTRA_CFLAGS) $(KHDR_INCLUDES) $(TOOLS_INCLUDES) +CFLAGS = -Wall -O2 -I $(top_srcdir) $(EXTRA_CFLAGS) $(KHDR_INCLUDES) $(TOOLS_INCLUDES) +CFLAGS += -Wunreachable-code -Wunused -Wunused-parameter -Wunused-function -Wunused-variable LDLIBS = -lrt -lpthread -lm +# Some distributions (such as Ubuntu) configure GCC so that _FORTIFY_SOURCE is +# automatically enabled at -O1 or above. This triggers various unused-result +# warnings where functions such as read() or write() are called and their +# return value is not checked. Disable _FORTIFY_SOURCE to silence those +# warnings. 
+CFLAGS += -U_FORTIFY_SOURCE + KDIR ?= /lib/modules/$(shell uname -r)/build ifneq (,$(wildcard $(KDIR)/Module.symvers)) ifneq (,$(wildcard $(KDIR)/include/linux/page_frag_cache.h)) @@ -75,13 +83,13 @@ TEST_GEN_FILES += mrelease_test TEST_GEN_FILES += mremap_dontunmap TEST_GEN_FILES += mremap_test TEST_GEN_FILES += mseal_test -TEST_GEN_FILES += seal_elf TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += pagemap_ioctl TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += uffd-stress TEST_GEN_FILES += uffd-unit-tests +TEST_GEN_FILES += uffd-wp-mremap TEST_GEN_FILES += split_huge_page_test TEST_GEN_FILES += ksm_tests TEST_GEN_FILES += ksm_functional_tests @@ -152,11 +160,16 @@ $(TEST_GEN_FILES): vm_util.c thp_settings.c $(OUTPUT)/uffd-stress: uffd-common.c $(OUTPUT)/uffd-unit-tests: uffd-common.c +$(OUTPUT)/uffd-wp-mremap: uffd-common.c +$(OUTPUT)/protection_keys: pkey_util.c +$(OUTPUT)/pkey_sighandler_tests: pkey_util.c ifeq ($(ARCH),x86_64) BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64)) +$(BINARIES_32) $(BINARIES_64): pkey_util.c + define gen-target-rule-32 $(1) $(1)_32: $(OUTPUT)/$(1)_32 .PHONY: $(1) $(1)_32 diff --git a/tools/testing/selftests/mm/compaction_test.c b/tools/testing/selftests/mm/compaction_test.c index 2c3a0eb6b22d..f6f32a5732e9 100644 --- a/tools/testing/selftests/mm/compaction_test.c +++ b/tools/testing/selftests/mm/compaction_test.c @@ -134,7 +134,7 @@ int check_compaction(unsigned long mem_free, unsigned long hugepage_size, lseek(fd, 0, SEEK_SET); if (write(fd, init_nr_hugepages, strlen(init_nr_hugepages)) - != strlen(init_nr_hugepages)) { + != (signed long int)strlen(init_nr_hugepages)) { ksft_print_msg("Failed to write value to /proc/sys/vm/nr_hugepages: %s\n", strerror(errno)); goto close_fd; @@ -194,7 +194,7 @@ int set_zero_hugepages(unsigned long *initial_nr_hugepages) return ret; } -int main(int argc, char **argv) +int main(void) { struct rlimit lim; struct map_list *list = NULL, *entry; diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index 1238e1c5aae1..34fc0b65a85a 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -84,7 +84,7 @@ static void detect_huge_zeropage(void) return; ret = pread(fd, buf, sizeof(buf), 0); - if (ret > 0 && ret < sizeof(buf)) { + if (ret > 0 && (unsigned int)ret < sizeof(buf)) { buf[ret] = 0; enabled = strtoul(buf, NULL, 10); @@ -263,12 +263,14 @@ close_comm_pipes: close_comm_pipes(&comm_pipes); } -static void test_cow_in_parent(char *mem, size_t size, bool is_hugetlb) +static void test_cow_in_parent(char *mem, size_t size, + bool __attribute__((unused)) is_hugetlb) { do_test_cow_in_parent(mem, size, false, child_memcmp_fn, false); } -static void test_cow_in_parent_mprotect(char *mem, size_t size, bool is_hugetlb) +static void test_cow_in_parent_mprotect(char *mem, size_t size, + bool __attribute__((unused)) is_hugetlb) { do_test_cow_in_parent(mem, size, true, child_memcmp_fn, false); } @@ -408,10 +410,11 @@ static void do_test_iouring(char *mem, size_t size, bool use_fork) struct io_uring_cqe *cqe; struct io_uring_sqe *sqe; struct io_uring ring; - ssize_t cur, total; struct iovec iov; char *buf, *tmp; + size_t total; int ret, fd; + ssize_t cur; FILE *file; ret = setup_comm_pipes(&comm_pipes); @@ -515,7 +518,7 @@ static void do_test_iouring(char *mem, size_t size, bool use_fork) goto quit_child; } - if (cqe->res != size) { + if ((unsigned int) cqe->res != size) { 
ksft_test_result_fail("write_fixed failed\n"); goto quit_child; } @@ -529,7 +532,7 @@ static void do_test_iouring(char *mem, size_t size, bool use_fork) ksft_test_result_fail("pread() failed\n"); goto quit_child; } - total += cur; + total += (size_t)cur; } /* Finally, check if we read what we expected. */ @@ -553,12 +556,14 @@ close_comm_pipes: close_comm_pipes(&comm_pipes); } -static void test_iouring_ro(char *mem, size_t size, bool is_hugetlb) +static void test_iouring_ro(char *mem, size_t size, + bool __attribute__((unused)) is_hugetlb) { do_test_iouring(mem, size, false); } -static void test_iouring_fork(char *mem, size_t size, bool is_hugetlb) +static void test_iouring_fork(char *mem, size_t size, + bool __attribute__((unused)) is_hugetlb) { do_test_iouring(mem, size, true); } @@ -702,36 +707,38 @@ free_tmp: free(tmp); } -static void test_ro_pin_on_shared(char *mem, size_t size, bool is_hugetlb) +static void test_ro_pin_on_shared(char *mem, size_t size, + bool __attribute__((unused)) is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false); } -static void test_ro_fast_pin_on_shared(char *mem, size_t size, bool is_hugetlb) +static void test_ro_fast_pin_on_shared(char *mem, size_t size, + bool __attribute__((unused)) is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true); } static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size, - bool is_hugetlb) + bool __attribute__((unused)) is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false); } static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size, - bool is_hugetlb) + bool __attribute__((unused)) is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true); } static void test_ro_pin_on_ro_exclusive(char *mem, size_t size, - bool is_hugetlb) + bool __attribute__((unused)) is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false); } static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size, - bool is_hugetlb) + bool __attribute__((unused)) is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true); } @@ -1192,7 +1199,7 @@ static void run_anon_test_case(struct test_case const *test_case) static void run_anon_test_cases(void) { - int i; + unsigned int i; ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n"); @@ -1420,7 +1427,7 @@ static const struct test_case anon_thp_test_cases[] = { static void run_anon_thp_test_cases(void) { - int i; + unsigned int i; if (!pmdsize) return; @@ -1457,13 +1464,14 @@ static void test_cow(char *mem, const char *smem, size_t size) "Other mapping not modified\n"); free(old); } +//typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size); -static void test_ro_pin(char *mem, const char *smem, size_t size) +static void test_ro_pin(char *mem, const char __attribute__((unused)) *smem, size_t size) { do_test_ro_pin(mem, size, RO_PIN_TEST, false); } -static void test_ro_fast_pin(char *mem, const char *smem, size_t size) +static void test_ro_fast_pin(char *mem, const char __attribute__((unused)) *smem, size_t size) { do_test_ro_pin(mem, size, RO_PIN_TEST, true); } @@ -1684,7 +1692,7 @@ static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc, goto close; } smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0); - if (mem == MAP_FAILED) { + if (smem == MAP_FAILED) { ksft_test_result_fail("mmap() failed\n"); goto munmap; } @@ -1696,7 +1704,7 @@ static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc, fn(mem, smem, 
hugetlbsize); munmap: munmap(mem, hugetlbsize); - if (mem != MAP_FAILED) + if (smem != MAP_FAILED) munmap(smem, hugetlbsize); close: close(fd); @@ -1752,7 +1760,7 @@ static void run_non_anon_test_case(struct non_anon_test_case const *test_case) static void run_non_anon_test_cases(void) { - int i; + unsigned int i; ksft_print_msg("[RUN] Non-anonymous memory tests in private mappings\n"); @@ -1769,7 +1777,7 @@ static int tests_per_non_anon_test_case(void) return tests; } -int main(int argc, char **argv) +int main(void) { int err; struct thp_settings default_settings; diff --git a/tools/testing/selftests/mm/droppable.c b/tools/testing/selftests/mm/droppable.c index f3d9ecf96890..90ea6377810c 100644 --- a/tools/testing/selftests/mm/droppable.c +++ b/tools/testing/selftests/mm/droppable.c @@ -15,7 +15,7 @@ #include "../kselftest.h" -int main(int argc, char *argv[]) +int main(void) { size_t alloc_size = 134217728; size_t page_size = getpagesize(); diff --git a/tools/testing/selftests/mm/guard-pages.c b/tools/testing/selftests/mm/guard-pages.c index 7cdf815d0d63..fc1165ef2015 100644 --- a/tools/testing/selftests/mm/guard-pages.c +++ b/tools/testing/selftests/mm/guard-pages.c @@ -55,6 +55,12 @@ static int pidfd_open(pid_t pid, unsigned int flags) return syscall(SYS_pidfd_open, pid, flags); } +static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec, + size_t n, int advice, unsigned int flags) +{ + return syscall(__NR_process_madvise, pidfd, iovec, n, advice, flags); +} + /* * Enable our signal catcher and try to read/write the specified buffer. The * return value indicates whether the read/write succeeds without a fatal @@ -136,7 +142,7 @@ TEST_F(guard_pages, basic) const unsigned long NUM_PAGES = 10; const unsigned long page_size = self->page_size; char *ptr; - int i; + unsigned int i; ptr = mmap(NULL, NUM_PAGES * page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); @@ -419,7 +425,7 @@ TEST_F(guard_pages, process_madvise) ASSERT_EQ(munmap(&ptr_region[99 * page_size], page_size), 0); /* Now guard in one step. */ - count = process_madvise(pidfd, vec, 6, MADV_GUARD_INSTALL, 0); + count = sys_process_madvise(pidfd, vec, 6, MADV_GUARD_INSTALL, 0); /* OK we don't have permission to do this, skip. */ if (count == -1 && errno == EPERM) @@ -440,7 +446,7 @@ TEST_F(guard_pages, process_madvise) ASSERT_FALSE(try_read_write_buf(&ptr3[19 * page_size])); /* Now do the same with unguard... */ - count = process_madvise(pidfd, vec, 6, MADV_GUARD_REMOVE, 0); + count = sys_process_madvise(pidfd, vec, 6, MADV_GUARD_REMOVE, 0); /* ...and everything should now succeed. */ @@ -990,7 +996,7 @@ TEST_F(guard_pages, fork) MAP_ANON | MAP_PRIVATE, -1, 0); ASSERT_NE(ptr, MAP_FAILED); - /* Establish guard apges in the first 5 pages. */ + /* Establish guard pages in the first 5 pages. */ ASSERT_EQ(madvise(ptr, 5 * page_size, MADV_GUARD_INSTALL), 0); pid = fork(); @@ -1029,6 +1035,77 @@ TEST_F(guard_pages, fork) ASSERT_EQ(munmap(ptr, 10 * page_size), 0); } +/* + * Assert expected behaviour after we fork populated ranges of anonymous memory + * and then guard and unguard the range. + */ +TEST_F(guard_pages, fork_cow) +{ + const unsigned long page_size = self->page_size; + char *ptr; + pid_t pid; + unsigned int i; + + /* Map 10 pages. */ + ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* Populate range. 
*/ + for (i = 0; i < 10 * page_size; i++) { + char chr = 'a' + (i % 26); + + ptr[i] = chr; + } + + pid = fork(); + ASSERT_NE(pid, -1); + if (!pid) { + /* This is the child process now. */ + + /* Ensure the range is as expected. */ + for (i = 0; i < 10 * page_size; i++) { + char expected = 'a' + (i % 26); + char actual = ptr[i]; + + ASSERT_EQ(actual, expected); + } + + /* Establish guard pages across the whole range. */ + ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), 0); + /* Remove it. */ + ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_REMOVE), 0); + + /* + * By removing the guard pages, the page tables will be + * cleared. Assert that we are looking at the zero page now. + */ + for (i = 0; i < 10 * page_size; i++) { + char actual = ptr[i]; + + ASSERT_EQ(actual, '\0'); + } + + exit(0); + } + + /* Parent process. */ + + /* Parent simply waits on child. */ + waitpid(pid, NULL, 0); + + /* Ensure the range is unchanged in parent anon range. */ + for (i = 0; i < 10 * page_size; i++) { + char expected = 'a' + (i % 26); + char actual = ptr[i]; + + ASSERT_EQ(actual, expected); + } + + /* Cleanup. */ + ASSERT_EQ(munmap(ptr, 10 * page_size), 0); +} + /* * Assert that forking a process with VMAs that do have VM_WIPEONFORK set * behave as expected. diff --git a/tools/testing/selftests/mm/gup_longterm.c b/tools/testing/selftests/mm/gup_longterm.c index 9423ad439a61..7f1b4ad7fcae 100644 --- a/tools/testing/selftests/mm/gup_longterm.c +++ b/tools/testing/selftests/mm/gup_longterm.c @@ -444,9 +444,10 @@ static int tests_per_test_case(void) return 3 + nr_hugetlbsizes; } -int main(int argc, char **argv) +int main(void) { - int i, err; + unsigned int i; + int err; pagesize = getpagesize(); nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes, diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c index 141bf63cbe05..3b4db583bd3b 100644 --- a/tools/testing/selftests/mm/hmm-tests.c +++ b/tools/testing/selftests/mm/hmm-tests.c @@ -796,7 +796,7 @@ TEST_F(hmm, anon_write_hugetlbfs) int ret; default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:"); - if (default_hsize < 0 || default_hsize*1024 < default_hsize) + if (default_hsize*1024 < default_hsize) SKIP(return, "Huge page size could not be determined"); default_hsize = default_hsize*1024; /* KB to B */ @@ -1579,7 +1579,7 @@ TEST_F(hmm, compound) /* Skip test if we can't allocate a hugetlbfs page. 
*/ default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:"); - if (default_hsize < 0 || default_hsize*1024 < default_hsize) + if (default_hsize*1024 < default_hsize) SKIP(return, "Huge page size could not be determined"); default_hsize = default_hsize*1024; /* KB to B */ diff --git a/tools/testing/selftests/mm/hugepage-vmemmap.c b/tools/testing/selftests/mm/hugepage-vmemmap.c index df366a4d1b92..8d30ebfc9b86 100644 --- a/tools/testing/selftests/mm/hugepage-vmemmap.c +++ b/tools/testing/selftests/mm/hugepage-vmemmap.c @@ -51,7 +51,8 @@ static unsigned long virt_to_pfn(void *addr) static int check_page_flags(unsigned long pfn) { - int fd, i; + int fd; + unsigned int i; unsigned long pageflags; fd = open("/proc/kpageflags", O_RDONLY); @@ -87,7 +88,7 @@ static int check_page_flags(unsigned long pfn) return 0; } -int main(int argc, char **argv) +int main(void) { void *addr; unsigned long pfn; diff --git a/tools/testing/selftests/mm/hugetlb-madvise.c b/tools/testing/selftests/mm/hugetlb-madvise.c index e74107185324..8f527084858d 100644 --- a/tools/testing/selftests/mm/hugetlb-madvise.c +++ b/tools/testing/selftests/mm/hugetlb-madvise.c @@ -26,7 +26,7 @@ #define validate_free_pages(exp_free) \ do { \ - int fhp = get_free_hugepages(); \ + unsigned int fhp = get_free_hugepages(); \ if (fhp != (exp_free)) { \ printf("Unexpected number of free huge " \ "pages line %d\n", __LINE__); \ @@ -58,7 +58,7 @@ void read_fault_pages(void *addr, unsigned long nr_pages) } } -int main(int argc, char **argv) +int main(int __attribute__((unused)) argc, char **argv) { unsigned long free_hugepages; void *addr, *addr2; diff --git a/tools/testing/selftests/mm/hugetlb-read-hwpoison.c b/tools/testing/selftests/mm/hugetlb-read-hwpoison.c index ba6cc6f9cabc..e2a2bb1989d5 100644 --- a/tools/testing/selftests/mm/hugetlb-read-hwpoison.c +++ b/tools/testing/selftests/mm/hugetlb-read-hwpoison.c @@ -72,7 +72,7 @@ static bool seek_read_hugepage_filemap(int fd, size_t len, size_t wr_chunk_size, { char buf[MAX_WRITE_READ_CHUNK_SIZE]; ssize_t ret_count = 0; - ssize_t total_ret_count = 0; + size_t total_ret_count = 0; char val = offset / wr_chunk_size + offset % wr_chunk_size; printf(PREFIX PREFIX "init val=%u with offset=0x%lx\n", val, offset); @@ -83,7 +83,7 @@ static bool seek_read_hugepage_filemap(int fd, size_t len, size_t wr_chunk_size, return false; } - while (offset + total_ret_count < len) { + while ((unsigned long)offset + total_ret_count < len) { ret_count = read(fd, buf, wr_chunk_size); if (ret_count == 0) { printf(PREFIX PREFIX "read reach end of the file\n"); @@ -109,7 +109,7 @@ static bool read_hugepage_filemap(int fd, size_t len, { char buf[MAX_WRITE_READ_CHUNK_SIZE]; ssize_t ret_count = 0; - ssize_t total_ret_count = 0; + size_t total_ret_count = 0; char val = 0; printf(PREFIX PREFIX "expect to read 0x%lx bytes of data in total\n", diff --git a/tools/testing/selftests/mm/hugetlb-soft-offline.c b/tools/testing/selftests/mm/hugetlb-soft-offline.c index f086f0e04756..cb087303f5ed 100644 --- a/tools/testing/selftests/mm/hugetlb-soft-offline.c +++ b/tools/testing/selftests/mm/hugetlb-soft-offline.c @@ -216,7 +216,7 @@ static void test_soft_offline_common(int enable_soft_offline) enable_soft_offline); } -int main(int argc, char **argv) +int main(void) { ksft_print_header(); ksft_set_plan(2); diff --git a/tools/testing/selftests/mm/hugetlb_dio.c b/tools/testing/selftests/mm/hugetlb_dio.c index db63abe5ee5e..62f368d4c8c1 100644 --- a/tools/testing/selftests/mm/hugetlb_dio.c +++ 
b/tools/testing/selftests/mm/hugetlb_dio.c @@ -63,7 +63,7 @@ void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off) memset(buffer, 'A', writesize); /* Write the buffer to the file */ - if (write(fd, buffer, writesize) != (writesize)) { + if (write(fd, buffer, writesize) != (signed int)writesize) { munmap(orig_buffer, h_pagesize); close(fd); ksft_exit_fail_perror("Error writing to file\n"); diff --git a/tools/testing/selftests/mm/hugetlb_fault_after_madv.c b/tools/testing/selftests/mm/hugetlb_fault_after_madv.c index e2640529dbb2..2b5acb13ee0b 100644 --- a/tools/testing/selftests/mm/hugetlb_fault_after_madv.c +++ b/tools/testing/selftests/mm/hugetlb_fault_after_madv.c @@ -28,7 +28,7 @@ static void signal_handler(int signal) } /* Touch the memory while it is being madvised() */ -void *touch(void *unused) +void *touch(void __attribute__((unused)) *unused) { char *ptr = (char *)huge_ptr; @@ -41,7 +41,7 @@ void *touch(void *unused) return NULL; } -void *madv(void *unused) +void *madv(void __attribute__((unused)) *unused) { usleep(rand() % 10); @@ -88,7 +88,7 @@ int main(void) MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); - if ((unsigned long)huge_ptr == -1) + if (huge_ptr == MAP_FAILED) ksft_exit_skip("Failed to allocated huge page\n"); pthread_create(&thread1, NULL, madv, NULL); diff --git a/tools/testing/selftests/mm/hugetlb_madv_vs_map.c b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c index 8f122a0f0828..eda38b63e9e8 100644 --- a/tools/testing/selftests/mm/hugetlb_madv_vs_map.c +++ b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c @@ -33,7 +33,7 @@ size_t mmap_size; char *huge_ptr; /* Touch the memory while it is being madvised() */ -void *touch(void *unused) +void *touch(void __attribute__((unused)) *unused) { for (int i = 0; i < INLOOP_ITER; i++) huge_ptr[0] = '.'; @@ -41,7 +41,7 @@ void *touch(void *unused) return NULL; } -void *madv(void *unused) +void *madv(void __attribute__((unused)) *unused) { for (int i = 0; i < INLOOP_ITER; i++) madvise(huge_ptr, mmap_size, MADV_DONTNEED); @@ -54,7 +54,7 @@ void *madv(void *unused) * The other hugepage should be flipping from used <-> reserved, because * of madvise(DONTNEED). 
*/ -void *map_extra(void *unused) +void *map_extra(void __attribute__((unused)) *unused) { void *ptr; @@ -100,7 +100,7 @@ int main(void) MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); - if ((unsigned long)huge_ptr == -1) { + if (huge_ptr == MAP_FAILED) { ksft_test_result_fail("Failed to allocate huge page\n"); return KSFT_FAIL; } diff --git a/tools/testing/selftests/mm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c index 8a4d34cce36b..dde7c7fbbac2 100644 --- a/tools/testing/selftests/mm/khugepaged.c +++ b/tools/testing/selftests/mm/khugepaged.c @@ -140,7 +140,7 @@ static void get_finfo(const char *dir) exit(EXIT_FAILURE); } if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE, - finfo.dir) >= sizeof(finfo.path)) { + finfo.dir) >= (signed int)sizeof(finfo.path)) { printf("%s: Pathname is too long\n", __func__); exit(EXIT_FAILURE); } @@ -155,7 +155,7 @@ static void get_finfo(const char *dir) /* Find owning device's queue/read_ahead_kb control */ if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent", major(path_stat.st_dev), minor(path_stat.st_dev)) - >= sizeof(path)) { + >= (signed int)sizeof(path)) { printf("%s: Pathname is too long\n", __func__); exit(EXIT_FAILURE); } @@ -169,7 +169,7 @@ static void get_finfo(const char *dir) sizeof(finfo.dev_queue_read_ahead_path), "/sys/dev/block/%d:%d/queue/read_ahead_kb", major(path_stat.st_dev), minor(path_stat.st_dev)) - >= sizeof(finfo.dev_queue_read_ahead_path)) { + >= (signed int)sizeof(finfo.dev_queue_read_ahead_path)) { printf("%s: Pathname is too long\n", __func__); exit(EXIT_FAILURE); } @@ -197,7 +197,7 @@ static void get_finfo(const char *dir) if (snprintf(finfo.dev_queue_read_ahead_path, sizeof(finfo.dev_queue_read_ahead_path), "/sys/block/%s/queue/read_ahead_kb", - str) >= sizeof(finfo.dev_queue_read_ahead_path)) { + str) >= (signed int)sizeof(finfo.dev_queue_read_ahead_path)) { printf("%s: Pathname is too long\n", __func__); exit(EXIT_FAILURE); } @@ -271,7 +271,7 @@ static void *alloc_mapping(int nr) static void fill_memory(int *p, unsigned long start, unsigned long end) { - int i; + unsigned int i; for (i = start / page_size; i < end / page_size; i++) p[i * page_size / sizeof(*p)] = i + 0xdead0000; @@ -333,10 +333,10 @@ static void *alloc_hpage(struct mem_ops *ops) static void validate_memory(int *p, unsigned long start, unsigned long end) { - int i; + unsigned int i; for (i = start / page_size; i < end / page_size; i++) { - if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) { + if ((unsigned int)p[i * page_size / sizeof(*p)] != i + 0xdead0000) { printf("Page %d is corrupted: %#x\n", i, p[i * page_size / sizeof(*p)]); exit(EXIT_FAILURE); @@ -537,7 +537,7 @@ static void madvise_collapse(const char *msg, char *p, int nr_hpages, static bool wait_for_scan(const char *msg, char *p, int nr_hpages, struct mem_ops *ops) { - int full_scans; + unsigned int full_scans; int timeout = 6; /* 3 seconds */ /* Sanity check */ diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index 66b4e111b5a2..4f96126e4e1f 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -306,7 +306,7 @@ static void test_unmerge_zero_pages(void) /* Check if ksm_zero_pages is updated correctly after KSM merging */ pages_expected = size / pagesize; - if (pages_expected != get_my_ksm_zero_pages()) { + if ((signed long)pages_expected != get_my_ksm_zero_pages()) { ksft_test_result_fail("'ksm_zero_pages' updated after 
merging\n"); goto unmap; } @@ -319,7 +319,7 @@ static void test_unmerge_zero_pages(void) /* Check if ksm_zero_pages is updated correctly after unmerging */ pages_expected /= 2; - if (pages_expected != get_my_ksm_zero_pages()) { + if ((signed long)pages_expected != get_my_ksm_zero_pages()) { ksft_test_result_fail("'ksm_zero_pages' updated after unmerging\n"); goto unmap; } @@ -625,7 +625,7 @@ static void test_prot_none(void) { const unsigned int size = 2 * MiB; char *map; - int i; + unsigned int i; ksft_print_msg("[RUN] %s\n", __func__); diff --git a/tools/testing/selftests/mm/ksm_tests.c b/tools/testing/selftests/mm/ksm_tests.c index b748c48908d9..323cfcb14e4d 100644 --- a/tools/testing/selftests/mm/ksm_tests.c +++ b/tools/testing/selftests/mm/ksm_tests.c @@ -265,8 +265,7 @@ static int ksm_merge_pages(int merge_type, void *addr, size_t size, return 0; } -static int ksm_unmerge_pages(void *addr, size_t size, - struct timespec start_time, int timeout) +static int ksm_unmerge_pages(void *addr, size_t size) { if (madvise(addr, size, MADV_UNMERGEABLE)) { perror("madvise"); @@ -483,7 +482,7 @@ static int get_first_mem_node(void) return get_next_mem_node(numa_max_node()); } -static int check_ksm_numa_merge(int merge_type, int mapping, int prot, int timeout, +static int check_ksm_numa_merge(int merge_type, int timeout, bool merge_across_nodes, size_t page_size) { void *numa1_map_ptr, *numa2_map_ptr; @@ -547,8 +546,7 @@ err_out: return KSFT_FAIL; } -static int ksm_merge_hugepages_time(int merge_type, int mapping, int prot, - int timeout, size_t map_size) +static int ksm_merge_hugepages_time(int merge_type, int timeout, size_t map_size) { void *map_ptr, *map_ptr_orig; struct timespec start_time, end_time; @@ -678,7 +676,7 @@ static int ksm_unmerge_time(int merge_type, int mapping, int prot, int timeout, perror("clock_gettime"); goto err_out; } - if (ksm_unmerge_pages(map_ptr, map_size, start_time, timeout)) + if (ksm_unmerge_pages(map_ptr, map_size)) goto err_out; if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { perror("clock_gettime"); @@ -776,7 +774,7 @@ err_out: int main(int argc, char *argv[]) { - int ret, opt; + int ret = 0, opt; int prot = 0; int ksm_scan_limit_sec = KSM_SCAN_LIMIT_SEC_DEFAULT; int merge_type = KSM_MERGE_TYPE_DEFAULT; @@ -906,8 +904,8 @@ int main(int argc, char *argv[]) page_size); break; case CHECK_KSM_NUMA_MERGE: - ret = check_ksm_numa_merge(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, - ksm_scan_limit_sec, merge_across_nodes, page_size); + ret = check_ksm_numa_merge(merge_type, ksm_scan_limit_sec, merge_across_nodes, + page_size); break; case KSM_MERGE_TIME: if (size_MB == 0) { @@ -922,8 +920,7 @@ int main(int argc, char *argv[]) printf("Option '-s' is required.\n"); return KSFT_FAIL; } - ret = ksm_merge_hugepages_time(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, - ksm_scan_limit_sec, size_MB); + ret = ksm_merge_hugepages_time(merge_type, ksm_scan_limit_sec, size_MB); break; case KSM_UNMERGE_TIME: if (size_MB == 0) { diff --git a/tools/testing/selftests/mm/madv_populate.c b/tools/testing/selftests/mm/madv_populate.c index ef7d911da13e..c6a3ee56a54a 100644 --- a/tools/testing/selftests/mm/madv_populate.c +++ b/tools/testing/selftests/mm/madv_populate.c @@ -281,7 +281,7 @@ static int system_has_softdirty(void) #endif } -int main(int argc, char **argv) +int main(void) { int nr_tests = 16; int err; diff --git a/tools/testing/selftests/mm/map_populate.c b/tools/testing/selftests/mm/map_populate.c index 5c8a53869b1b..0dd849b4affa 100644 --- 
a/tools/testing/selftests/mm/map_populate.c +++ b/tools/testing/selftests/mm/map_populate.c @@ -74,7 +74,7 @@ static int child_f(int sock, unsigned long *smap, int fd) return ksft_cnt.ksft_pass; } -int main(int argc, char **argv) +int main(void) { int sock[2], child, ret; FILE *ftmp; diff --git a/tools/testing/selftests/mm/memfd_secret.c b/tools/testing/selftests/mm/memfd_secret.c index 74c911aa3aea..f9d728e18678 100644 --- a/tools/testing/selftests/mm/memfd_secret.c +++ b/tools/testing/selftests/mm/memfd_secret.c @@ -121,7 +121,7 @@ close_pipe: close(pipefd[1]); } -static void try_process_vm_read(int fd, int pipefd[2]) +static void try_process_vm_read(int __attribute__((unused)) fd, int pipefd[2]) { struct iovec liov, riov; char buf[64]; @@ -145,7 +145,7 @@ static void try_process_vm_read(int fd, int pipefd[2]) exit(KSFT_FAIL); } -static void try_ptrace(int fd, int pipefd[2]) +static void try_ptrace(int __attribute__((unused)) fd, int pipefd[2]) { pid_t ppid = getppid(); int status; @@ -297,7 +297,7 @@ static void prepare(void) #define NUM_TESTS 6 -int main(int argc, char *argv[]) +int main(void) { int fd; diff --git a/tools/testing/selftests/mm/migration.c b/tools/testing/selftests/mm/migration.c index 64bcbb7151cf..1e3a595fbf01 100644 --- a/tools/testing/selftests/mm/migration.c +++ b/tools/testing/selftests/mm/migration.c @@ -204,4 +204,103 @@ TEST_F_TIMEOUT(migration, private_anon_thp, 2*RUNTIME) ASSERT_EQ(pthread_cancel(self->threads[i]), 0); } +/* + * migration test with shared anon THP page + */ + +TEST_F_TIMEOUT(migration, shared_anon_thp, 2*RUNTIME) +{ + pid_t pid; + uint64_t *ptr; + int i; + + if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) + SKIP(return, "Not enough threads or NUMA nodes available"); + + ptr = mmap(NULL, 2 * TWOMEG, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + ptr = (uint64_t *) ALIGN((uintptr_t) ptr, TWOMEG); + ASSERT_EQ(madvise(ptr, TWOMEG, MADV_HUGEPAGE), 0); + + memset(ptr, 0xde, TWOMEG); + for (i = 0; i < self->nthreads - 1; i++) { + pid = fork(); + if (!pid) { + prctl(PR_SET_PDEATHSIG, SIGHUP); + /* Parent may have died before prctl so check now. 
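Being reparented to init (getppid() == 1) means the parent is already gone, so raise the SIGHUP by hand.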
*/ + if (getppid() == 1) + kill(getpid(), SIGHUP); + access_mem(ptr); + } else { + self->pids[i] = pid; + } + } + + ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); + for (i = 0; i < self->nthreads - 1; i++) + ASSERT_EQ(kill(self->pids[i], SIGTERM), 0); +} + +/* + * migration test with private anon hugetlb page + */ +TEST_F_TIMEOUT(migration, private_anon_htlb, 2*RUNTIME) +{ + uint64_t *ptr; + int i; + + if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) + SKIP(return, "Not enough threads or NUMA nodes available"); + + ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + memset(ptr, 0xde, TWOMEG); + for (i = 0; i < self->nthreads - 1; i++) + if (pthread_create(&self->threads[i], NULL, access_mem, ptr)) + perror("Couldn't create thread"); + + ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); + for (i = 0; i < self->nthreads - 1; i++) + ASSERT_EQ(pthread_cancel(self->threads[i]), 0); +} + +/* + * migration test with shared anon hugetlb page + */ +TEST_F_TIMEOUT(migration, shared_anon_htlb, 2*RUNTIME) +{ + pid_t pid; + uint64_t *ptr; + int i; + + if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) + SKIP(return, "Not enough threads or NUMA nodes available"); + + ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + memset(ptr, 0xde, TWOMEG); + for (i = 0; i < self->nthreads - 1; i++) { + pid = fork(); + if (!pid) { + prctl(PR_SET_PDEATHSIG, SIGHUP); + /* Parent may have died before prctl so check now. */ + if (getppid() == 1) + kill(getpid(), SIGHUP); + access_mem(ptr); + } else { + self->pids[i] = pid; + } + } + + ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); + for (i = 0; i < self->nthreads - 1; i++) + ASSERT_EQ(kill(self->pids[i], SIGTERM), 0); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/mlock-random-test.c b/tools/testing/selftests/mm/mlock-random-test.c index 1cd80b0f76c3..f410699458f2 100644 --- a/tools/testing/selftests/mm/mlock-random-test.c +++ b/tools/testing/selftests/mm/mlock-random-test.c @@ -138,7 +138,7 @@ static void test_mlock_within_limit(char *p, int alloc_size) int page_size = 0; getrlimit(RLIMIT_MEMLOCK, &cur); - if (cur.rlim_cur < alloc_size) + if (cur.rlim_cur < (unsigned int)alloc_size) ksft_exit_fail_msg("alloc_size[%d] < %u rlimit,lead to mlock failure\n", alloc_size, (unsigned int)cur.rlim_cur); @@ -204,7 +204,7 @@ static void test_mlock_outof_limit(char *p, int alloc_size) struct rlimit cur; getrlimit(RLIMIT_MEMLOCK, &cur); - if (cur.rlim_cur >= alloc_size) + if (cur.rlim_cur >= (unsigned int)alloc_size) ksft_exit_fail_msg("alloc_size[%d] >%u rlimit, violates test condition\n", alloc_size, (unsigned int)cur.rlim_cur); @@ -236,7 +236,7 @@ static void test_mlock_outof_limit(char *p, int alloc_size) ksft_test_result_pass("%s\n", __func__); } -int main(int argc, char **argv) +int main(void) { char *p = NULL; diff --git a/tools/testing/selftests/mm/mlock2-tests.c b/tools/testing/selftests/mm/mlock2-tests.c index 7f0d50fa361d..358711e8191f 100644 --- a/tools/testing/selftests/mm/mlock2-tests.c +++ b/tools/testing/selftests/mm/mlock2-tests.c @@ -425,7 +425,7 @@ static void test_mlockall(void) munlockall(); } -int main(int argc, char **argv) +int main(void) { int ret, size = 3 * getpagesize(); void *map; diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index 5a3a9bcba640..bb84476a177f 100644 --- a/tools/testing/selftests/mm/mremap_test.c 
+++ b/tools/testing/selftests/mm/mremap_test.c @@ -34,7 +34,7 @@ struct config { unsigned long long dest_alignment; unsigned long long region_size; int overlapping; - int dest_preamble_size; + unsigned int dest_preamble_size; }; struct test { @@ -328,7 +328,7 @@ static void mremap_move_within_range(unsigned int pattern_seed, char *rand_addr) { char *test_name = "mremap mremap move within range"; void *src, *dest; - int i, success = 1; + unsigned int i, success = 1; size_t size = SIZE_MB(20); void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, @@ -384,7 +384,7 @@ out: static long long remap_region(struct config c, unsigned int threshold_mb, char *rand_addr) { - void *addr, *src_addr, *dest_addr, *dest_preamble_addr; + void *addr, *src_addr, *dest_addr, *dest_preamble_addr = NULL; unsigned long long t, d; struct timespec t_start = {0, 0}, t_end = {0, 0}; long long start_ns, end_ns, align_mask, ret, offset; @@ -569,7 +569,7 @@ static void mremap_move_1mb_from_start(unsigned int pattern_seed, { char *test_name = "mremap move 1mb from start at 1MB+256KB aligned src"; void *src = NULL, *dest = NULL; - int i, success = 1; + unsigned int i, success = 1; /* Config to reuse get_source_mapping() to do an aligned mmap. */ struct config c = { @@ -636,7 +636,7 @@ out: static void run_mremap_test_case(struct test test_case, int *failures, unsigned int threshold_mb, - unsigned int pattern_seed, char *rand_addr) + char *rand_addr) { long long remap_time = remap_region(test_case.config, threshold_mb, rand_addr); @@ -708,7 +708,8 @@ static int parse_args(int argc, char **argv, unsigned int *threshold_mb, int main(int argc, char **argv) { int failures = 0; - int i, run_perf_tests; + unsigned int i; + int run_perf_tests; unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD; /* hard-coded test configs */ @@ -831,7 +832,7 @@ int main(int argc, char **argv) for (i = 0; i < ARRAY_SIZE(test_cases); i++) run_mremap_test_case(test_cases[i], &failures, threshold_mb, - pattern_seed, rand_addr); + rand_addr); maps_fp = fopen("/proc/self/maps", "r"); @@ -853,7 +854,7 @@ int main(int argc, char **argv) "mremap HAVE_MOVE_PMD/PUD optimization time comparison for 1GB region:"); for (i = 0; i < ARRAY_SIZE(perf_test_cases); i++) run_mremap_test_case(perf_test_cases[i], &failures, - threshold_mb, pattern_seed, + threshold_mb, rand_addr); } diff --git a/tools/testing/selftests/mm/mseal_test.c b/tools/testing/selftests/mm/mseal_test.c index 01675c412b2a..ad17005521a8 100644 --- a/tools/testing/selftests/mm/mseal_test.c +++ b/tools/testing/selftests/mm/mseal_test.c @@ -802,7 +802,7 @@ static void test_seal_mprotect_partial_mprotect_tail(bool seal) } -static void test_seal_mprotect_two_vma_with_gap(bool seal) +static void test_seal_mprotect_two_vma_with_gap(void) { void *ptr; unsigned long page_size = getpagesize(); @@ -1864,7 +1864,7 @@ static void test_seal_madvise_nodiscard(bool seal) REPORT_TEST_PASS(); } -int main(int argc, char **argv) +int main(void) { bool test_seal = seal_support(); @@ -1913,8 +1913,8 @@ int main(int argc, char **argv) test_seal_mprotect_partial_mprotect(false); test_seal_mprotect_partial_mprotect(true); - test_seal_mprotect_two_vma_with_gap(false); - test_seal_mprotect_two_vma_with_gap(true); + test_seal_mprotect_two_vma_with_gap(); + test_seal_mprotect_two_vma_with_gap(); test_seal_mprotect_merge(false); test_seal_mprotect_merge(true); diff --git a/tools/testing/selftests/mm/on-fault-limit.c b/tools/testing/selftests/mm/on-fault-limit.c index 431c1277d83a..ade160966c92 100644 --- 
a/tools/testing/selftests/mm/on-fault-limit.c +++ b/tools/testing/selftests/mm/on-fault-limit.c @@ -28,7 +28,7 @@ static void test_limit(void) munlockall(); } -int main(int argc, char **argv) +int main(void) { ksft_print_header(); ksft_set_plan(1); diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index bcc73b4e805c..57b4bba2b45f 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -34,8 +34,8 @@ #define PAGEMAP "/proc/self/pagemap" int pagemap_fd; int uffd; -int page_size; -int hpage_size; +unsigned int page_size; +unsigned int hpage_size; const char *progname; #define LEN(region) ((region.end - region.start)/page_size) @@ -235,7 +235,9 @@ int get_reads(struct page_region *vec, int vec_size) int sanity_tests_sd(void) { - int mem_size, vec_size, ret, ret2, ret3, i, num_pages = 1000, total_pages = 0; + unsigned long long mem_size, vec_size, i, total_pages = 0; + long ret, ret2, ret3; + int num_pages = 1000; int total_writes, total_reads, reads, count; struct page_region *vec, *vec2; char *mem, *m[2]; @@ -321,9 +323,9 @@ int sanity_tests_sd(void) ret = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); - ksft_test_result(ret == mem_size/(page_size * 2), + ksft_test_result((unsigned long long)ret == mem_size/(page_size * 2), "%s Repeated pattern of written and non-written pages\n", __func__); /* 4. Repeated pattern of written and non-written pages in parts */ @@ -331,21 +333,21 @@ int sanity_tests_sd(void) PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, num_pages/2 - 2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ret2 = pagemap_ioctl(mem, mem_size, vec, 2, 0, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); if (ret2 < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret2, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret2, errno, strerror(errno)); ret3 = pagemap_ioctl(mem, mem_size, vec, vec_size, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); if (ret3 < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret3, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret3, errno, strerror(errno)); ksft_test_result((ret + ret3) == num_pages/2 && ret2 == 2, - "%s Repeated pattern of written and non-written pages in parts %d %d %d\n", + "%s Repeated pattern of written and non-written pages in parts %ld %ld %ld\n", __func__, ret, ret3, ret2); /* 5. 
Repeated pattern of written and non-written pages max_pages */ @@ -357,13 +359,13 @@ int sanity_tests_sd(void) PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, num_pages/2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ret2 = pagemap_ioctl(mem, mem_size, vec, vec_size, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); if (ret2 < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret2, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret2, errno, strerror(errno)); ksft_test_result(ret == num_pages/2 && ret2 == 1, "%s Repeated pattern of written and non-written pages max_pages\n", @@ -378,12 +380,12 @@ int sanity_tests_sd(void) PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, 2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ret2 = pagemap_ioctl(mem, mem_size, vec2, vec_size, 0, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); if (ret2 < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret2, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret2, errno, strerror(errno)); ksft_test_result(ret == 1 && LEN(vec[0]) == 2 && vec[0].start == (uintptr_t)(mem + page_size) && @@ -416,7 +418,7 @@ int sanity_tests_sd(void) ret = pagemap_ioctl(m[1], mem_size, vec, 1, 0, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ksft_test_result(ret == 1 && LEN(vec[0]) == mem_size/page_size, "%s Two regions\n", __func__); @@ -448,7 +450,7 @@ int sanity_tests_sd(void) PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); for (i = 0; i < mem_size/page_size; i += 2) mem[i * page_size]++; @@ -457,7 +459,7 @@ int sanity_tests_sd(void) PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, mem_size/(page_size*5), PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); total_pages += ret; @@ -465,7 +467,7 @@ int sanity_tests_sd(void) PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, mem_size/(page_size*5), PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); total_pages += ret; @@ -473,7 +475,7 @@ int sanity_tests_sd(void) PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, mem_size/(page_size*5), PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); total_pages += ret; @@ -515,9 +517,9 @@ int sanity_tests_sd(void) vec_size, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); - if (ret > vec_size) + if ((unsigned long)ret > vec_size) break; 
reads = get_reads(vec, ret); @@ -554,63 +556,63 @@ int sanity_tests_sd(void) ret = pagemap_ioc(mem, 0, vec, vec_size, 0, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ksft_test_result(ret == 0 && walk_end == (long)mem, "Walk_end: Same start and end address\n"); ret = pagemap_ioc(mem, 0, vec, vec_size, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ksft_test_result(ret == 0 && walk_end == (long)mem, "Walk_end: Same start and end with WP\n"); ret = pagemap_ioc(mem, 0, vec, 0, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ksft_test_result(ret == 0 && walk_end == (long)mem, "Walk_end: Same start and end with 0 output buffer\n"); ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ksft_test_result(ret == 1 && walk_end == (long)(mem + mem_size), "Walk_end: Big vec\n"); ret = pagemap_ioc(mem, mem_size, vec, 1, 0, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ksft_test_result(ret == 1 && walk_end == (long)(mem + mem_size), "Walk_end: vec of minimum length\n"); ret = pagemap_ioc(mem, mem_size, vec, 1, 0, vec_size, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ksft_test_result(ret == 1 && walk_end == (long)(mem + mem_size), "Walk_end: Max pages specified\n"); ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, vec_size/2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ksft_test_result(ret == 1 && walk_end == (long)(mem + mem_size/2), "Walk_end: Half max pages\n"); ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, 1, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ksft_test_result(ret == 1 && walk_end == (long)(mem + page_size), "Walk_end: 1 max page\n"); ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, -1, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ksft_test_result(ret == 1 && walk_end == (long)(mem + mem_size), "Walk_end: max pages\n"); @@ -621,49 +623,49 @@ int sanity_tests_sd(void) ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - 
ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); - ksft_test_result(ret == vec_size/2 && walk_end == (long)(mem + mem_size), + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); + ksft_test_result((unsigned long)ret == vec_size/2 && walk_end == (long)(mem + mem_size), "Walk_end sparse: Big vec\n"); ret = pagemap_ioc(mem, mem_size, vec, 1, 0, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ksft_test_result(ret == 1 && walk_end == (long)(mem + page_size * 2), "Walk_end sparse: vec of minimum length\n"); ret = pagemap_ioc(mem, mem_size, vec, 1, 0, vec_size, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ksft_test_result(ret == 1 && walk_end == (long)(mem + page_size * 2), "Walk_end sparse: Max pages specified\n"); ret = pagemap_ioc(mem, mem_size, vec, vec_size/2, 0, vec_size, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); - ksft_test_result(ret == vec_size/2 && walk_end == (long)(mem + mem_size), + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); + ksft_test_result((unsigned long)ret == vec_size/2 && walk_end == (long)(mem + mem_size), "Walk_end sparse: Max pages specified\n"); ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, vec_size, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); - ksft_test_result(ret == vec_size/2 && walk_end == (long)(mem + mem_size), + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); + ksft_test_result((unsigned long)ret == vec_size/2 && walk_end == (long)(mem + mem_size), "Walk_end sparse: Max pages specified\n"); ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, vec_size/2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); - ksft_test_result(ret == vec_size/2 && walk_end == (long)(mem + mem_size), + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); + ksft_test_result((unsigned long)ret == vec_size/2 && walk_end == (long)(mem + mem_size), "Walk_endsparse : Half max pages\n"); ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, 1, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); ksft_test_result(ret == 1 && walk_end == (long)(mem + page_size * 2), "Walk_end: 1 max page\n"); @@ -674,9 +676,10 @@ int sanity_tests_sd(void) return 0; } -int base_tests(char *prefix, char *mem, int mem_size, int skip) +int base_tests(char *prefix, char *mem, unsigned long long mem_size, int skip) { - int vec_size, written; + unsigned long long vec_size; + int written; struct page_region *vec, *vec2; if (skip) { @@ -799,8 +802,8 @@ int hpage_unit_tests(void) char *map; int ret, ret2; size_t num_pages = 10; - int map_size = hpage_size * num_pages; - int vec_size = map_size/page_size; + unsigned long long map_size = hpage_size * num_pages; + unsigned long long vec_size = map_size/page_size; struct page_region *vec, *vec2; vec = malloc(sizeof(struct page_region) 
* vec_size); @@ -1047,7 +1050,8 @@ static void test_simple(void) int sanity_tests(void) { - int mem_size, vec_size, ret, fd, i, buf_size; + unsigned long long mem_size, vec_size; + int ret, fd, i, buf_size; struct page_region *vec; char *mem, *fmem; struct stat sbuf; @@ -1312,7 +1316,9 @@ static ssize_t get_dirty_pages_reset(char *mem, unsigned int count, { struct pm_scan_arg arg = {0}; struct page_region rgns[256]; - int i, j, cnt, ret; + unsigned long long i, j; + long ret; + int cnt; arg.size = sizeof(struct pm_scan_arg); arg.start = (uintptr_t)mem; @@ -1330,7 +1336,7 @@ static ssize_t get_dirty_pages_reset(char *mem, unsigned int count, ksft_exit_fail_msg("ioctl failed\n"); cnt = 0; - for (i = 0; i < ret; ++i) { + for (i = 0; i < (unsigned long)ret; ++i) { if (rgns[i].categories != PAGE_IS_WRITTEN) ksft_exit_fail_msg("wrong flags\n"); @@ -1384,9 +1390,10 @@ void *thread_proc(void *mem) static void transact_test(int page_size) { unsigned int i, count, extra_pages; + unsigned int c; pthread_t th; char *mem; - int ret, c; + int ret; if (pthread_barrier_init(&start_barrier, NULL, nthreads + 1)) ksft_exit_fail_msg("pthread_barrier_init\n"); @@ -1405,9 +1412,9 @@ static void transact_test(int page_size) memset(mem, 0, 0x1000 * nthreads * pages_per_thread); count = get_dirty_pages_reset(mem, nthreads * pages_per_thread, 1, page_size); - ksft_test_result(count > 0, "%s count %d\n", __func__, count); + ksft_test_result(count > 0, "%s count %u\n", __func__, count); count = get_dirty_pages_reset(mem, nthreads * pages_per_thread, 1, page_size); - ksft_test_result(count == 0, "%s count %d\n", __func__, count); + ksft_test_result(count == 0, "%s count %u\n", __func__, count); finish = 0; for (i = 0; i < nthreads; ++i) @@ -1429,7 +1436,7 @@ static void transact_test(int page_size) ksft_exit_fail_msg("pthread_barrier_wait\n"); if (count > nthreads * access_per_thread) - ksft_exit_fail_msg("Too big count %d expected %d, iter %d\n", + ksft_exit_fail_msg("Too big count %u expected %u, iter %u\n", count, nthreads * access_per_thread, i); c = get_dirty_pages_reset(mem, nthreads * pages_per_thread, 1, page_size); @@ -1454,7 +1461,7 @@ static void transact_test(int page_size) * access and application gets page fault again for the same write. 
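* That can only inflate the count, so a count below
* nthreads * access_per_thread means a write went unrecorded.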
*/ if (count < nthreads * access_per_thread) { - ksft_test_result_fail("Lost update, iter %d, %d vs %d.\n", i, count, + ksft_test_result_fail("Lost update, iter %u, %u vs %u.\n", i, count, nthreads * access_per_thread); return; } @@ -1467,15 +1474,16 @@ static void transact_test(int page_size) finish = 1; pthread_barrier_wait(&end_barrier); - ksft_test_result_pass("%s Extra pages %u (%.1lf%%), extra thread faults %d.\n", __func__, + ksft_test_result_pass("%s Extra pages %u (%.1lf%%), extra thread faults %u.\n", __func__, extra_pages, 100.0 * extra_pages / (iter_count * nthreads * access_per_thread), extra_thread_faults); } -int main(int argc, char *argv[]) +int main(int __attribute__((unused)) argc, char *argv[]) { - int mem_size, shmid, buf_size, fd, i, ret; + int shmid, buf_size, fd, i, ret; + unsigned long long mem_size; char *mem, *map, *fmem; struct stat sbuf; diff --git a/tools/testing/selftests/mm/pkey-arm64.h b/tools/testing/selftests/mm/pkey-arm64.h index d9d2100eafc0..8e9685e03c44 100644 --- a/tools/testing/selftests/mm/pkey-arm64.h +++ b/tools/testing/selftests/mm/pkey-arm64.h @@ -30,7 +30,7 @@ #define NR_PKEYS 8 #define NR_RESERVED_PKEYS 1 /* pkey-0 */ -#define PKEY_ALLOW_ALL 0x77777777 +#define PKEY_REG_ALLOW_ALL 0x77777777 #define PKEY_REG_ALLOW_NONE 0x0 #define PKEY_BITS_PER_PKEY 4 @@ -81,11 +81,11 @@ static inline int get_arch_reserved_keys(void) return NR_RESERVED_PKEYS; } -void expect_fault_on_read_execonly_key(void *p1, int pkey) +static inline void expect_fault_on_read_execonly_key(void *p1, int pkey) { } -void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) +static inline void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) { return PTR_ERR_ENOTSUP; } diff --git a/tools/testing/selftests/mm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h index f7cfe163b0ff..f080e97b39be 100644 --- a/tools/testing/selftests/mm/pkey-helpers.h +++ b/tools/testing/selftests/mm/pkey-helpers.h @@ -13,22 +13,22 @@ #include #include +#include + #include "../kselftest.h" /* Define some kernel-like types */ -#define u8 __u8 -#define u16 __u16 -#define u32 __u32 -#define u64 __u64 +typedef __u8 u8; +typedef __u16 u16; +typedef __u32 u32; +typedef __u64 u64; #define PTR_ERR_ENOTSUP ((void *)-ENOTSUP) #ifndef DEBUG_LEVEL #define DEBUG_LEVEL 0 #endif -#define DPRINT_IN_SIGNAL_BUF_SIZE 4096 extern int dprint_in_signal; -extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; extern int test_nr; extern int iteration_nr; @@ -83,17 +83,18 @@ extern void abort_hooks(void); #ifndef noinline # define noinline __attribute__((noinline)) #endif +#ifndef __maybe_unused +# define __maybe_unused __attribute__((__unused__)) +#endif -noinline int read_ptr(int *ptr) -{ - /* Keep GCC from optimizing this away somehow */ - barrier(); - return *ptr; -} - -void expected_pkey_fault(int pkey); int sys_pkey_alloc(unsigned long flags, unsigned long init_val); int sys_pkey_free(unsigned long pkey); +int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, + unsigned long pkey); + +/* For functions called from protection_keys.c only */ +noinline int read_ptr(int *ptr); +void expected_pkey_fault(int pkey); int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, unsigned long pkey); void record_pkey_malloc(void *ptr, long size, int prot); @@ -171,38 +172,6 @@ static inline void write_pkey_reg(u64 pkey_reg) pkey_reg, __read_pkey_reg()); } -/* - * These are technically racy. 
since something could - * change PKEY register between the read and the write. - */ -static inline void __pkey_access_allow(int pkey, int do_allow) -{ - u64 pkey_reg = read_pkey_reg(); - int bit = pkey * 2; - - if (do_allow) - pkey_reg &= (1< +#include + +#include "pkey-helpers.h" + +int sys_pkey_alloc(unsigned long flags, unsigned long init_val) +{ + int ret = syscall(SYS_pkey_alloc, flags, init_val); + dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n", + __func__, flags, init_val, ret, errno); + return ret; +} + +int sys_pkey_free(unsigned long pkey) +{ + int ret = syscall(SYS_pkey_free, pkey); + dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret); + return ret; +} + +int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, + unsigned long pkey) +{ + int sret; + + dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__, + ptr, size, orig_prot, pkey); + + errno = 0; + sret = syscall(__NR_pkey_mprotect, ptr, size, orig_prot, pkey); + if (errno) { + dprintf2("SYS_mprotect_key sret: %d\n", sret); + dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot); + dprintf2("SYS_mprotect_key failed, errno: %d\n", errno); + if (DEBUG_LEVEL >= 2) + perror("SYS_mprotect_pkey"); + } + return sret; +} diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c index 4990f7ab4cb7..28960634044c 100644 --- a/tools/testing/selftests/mm/protection_keys.c +++ b/tools/testing/selftests/mm/protection_keys.c @@ -53,9 +53,15 @@ int test_nr; u64 shadow_pkey_reg; int dprint_in_signal; -char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; -void cat_into_file(char *str, char *file) +noinline int read_ptr(int *ptr) +{ + /* Keep GCC from optimizing this away somehow */ + barrier(); + return *ptr; +} + +static void cat_into_file(char *str, char *file) { int fd = open(file, O_RDWR); int ret; @@ -72,7 +78,7 @@ void cat_into_file(char *str, char *file) } ret = write(fd, str, strlen(str)); - if (ret != strlen(str)) { + if (ret != (signed int)strlen(str)) { perror("write to file failed"); fprintf(stderr, "filename: '%s' str: '%s'\n", file, str); exit(__LINE__); @@ -82,7 +88,7 @@ void cat_into_file(char *str, char *file) #if CONTROL_TRACING > 0 static int warned_tracing; -int tracing_root_ok(void) +static int tracing_root_ok(void) { if (geteuid() != 0) { if (!warned_tracing) @@ -95,7 +101,7 @@ int tracing_root_ok(void) } #endif -void tracing_on(void) +static void tracing_on(void) { #if CONTROL_TRACING > 0 #define TRACEDIR "/sys/kernel/tracing" @@ -119,7 +125,7 @@ void tracing_on(void) #endif } -void tracing_off(void) +static void tracing_off(void) { #if CONTROL_TRACING > 0 if (!tracing_root_ok()) @@ -153,7 +159,7 @@ __attribute__((__aligned__(65536))) #else __attribute__((__aligned__(PAGE_SIZE))) #endif -void lots_o_noops_around_write(int *write_to_me) +static void lots_o_noops_around_write(int *write_to_me) { dprintf3("running %s()\n", __func__); __page_o_noops(); @@ -164,7 +170,7 @@ void lots_o_noops_around_write(int *write_to_me) dprintf3("%s() done\n", __func__); } -void dump_mem(void *dumpme, int len_bytes) +static void dump_mem(void *dumpme, int len_bytes) { char *c = (void *)dumpme; int i; @@ -207,7 +213,7 @@ static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) return 0; } -void pkey_disable_set(int pkey, int flags) +static void pkey_disable_set(int pkey, int flags) { unsigned long syscall_flags = 0; int ret; @@ -245,7 +251,7 @@ void pkey_disable_set(int pkey, int flags) pkey, flags); } -void 
pkey_disable_clear(int pkey, int flags) +static void pkey_disable_clear(int pkey, int flags) { unsigned long syscall_flags = 0; int ret; @@ -271,19 +277,19 @@ void pkey_disable_clear(int pkey, int flags) pkey, read_pkey_reg()); } -void pkey_write_allow(int pkey) +__maybe_unused static void pkey_write_allow(int pkey) { pkey_disable_clear(pkey, PKEY_DISABLE_WRITE); } -void pkey_write_deny(int pkey) +__maybe_unused static void pkey_write_deny(int pkey) { pkey_disable_set(pkey, PKEY_DISABLE_WRITE); } -void pkey_access_allow(int pkey) +__maybe_unused static void pkey_access_allow(int pkey) { pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS); } -void pkey_access_deny(int pkey) +__maybe_unused static void pkey_access_deny(int pkey) { pkey_disable_set(pkey, PKEY_DISABLE_ACCESS); } @@ -301,9 +307,9 @@ static char *si_code_str(int si_code) return "UNKNOWN"; } -int pkey_faults; -int last_si_pkey = -1; -void signal_handler(int signum, siginfo_t *si, void *vucontext) +static int pkey_faults; +static int last_si_pkey = -1; +static void signal_handler(int signum, siginfo_t *si, void *vucontext) { ucontext_t *uctxt = vucontext; int trapno; @@ -390,27 +396,21 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) /* restore access and let the faulting instruction continue */ pkey_access_allow(siginfo_pkey); #elif defined(__aarch64__) - aarch64_write_signal_pkey(uctxt, PKEY_ALLOW_ALL); + aarch64_write_signal_pkey(uctxt, PKEY_REG_ALLOW_ALL); #endif /* arch */ pkey_faults++; dprintf1("<<<<==================================================\n"); dprint_in_signal = 0; } -int wait_all_children(void) -{ - int status; - return waitpid(-1, &status, 0); -} - -void sig_chld(int x) +static void sig_chld(int x) { dprint_in_signal = 1; dprintf2("[%d] SIGCHLD: %d\n", getpid(), x); dprint_in_signal = 0; } -void setup_sigsegv_handler(void) +static void setup_sigsegv_handler(void) { int r, rs; struct sigaction newact; @@ -436,13 +436,13 @@ void setup_sigsegv_handler(void) pkey_assert(r == 0); } -void setup_handlers(void) +static void setup_handlers(void) { signal(SIGCHLD, &sig_chld); setup_sigsegv_handler(); } -pid_t fork_lazy_child(void) +static pid_t fork_lazy_child(void) { pid_t forkret; @@ -460,35 +460,7 @@ pid_t fork_lazy_child(void) return forkret; } -int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, - unsigned long pkey) -{ - int sret; - - dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__, - ptr, size, orig_prot, pkey); - - errno = 0; - sret = syscall(__NR_pkey_mprotect, ptr, size, orig_prot, pkey); - if (errno) { - dprintf2("SYS_mprotect_key sret: %d\n", sret); - dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot); - dprintf2("SYS_mprotect_key failed, errno: %d\n", errno); - if (DEBUG_LEVEL >= 2) - perror("SYS_mprotect_pkey"); - } - return sret; -} - -int sys_pkey_alloc(unsigned long flags, unsigned long init_val) -{ - int ret = syscall(SYS_pkey_alloc, flags, init_val); - dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n", - __func__, flags, init_val, ret, errno); - return ret; -} - -int alloc_pkey(void) +static int alloc_pkey(void) { int ret; unsigned long init_val = 0x0; @@ -534,19 +506,12 @@ int alloc_pkey(void) return ret; } -int sys_pkey_free(unsigned long pkey) -{ - int ret = syscall(SYS_pkey_free, pkey); - dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret); - return ret; -} - /* * I had a bug where pkey bits could be set by mprotect() but * not cleared. This ensures we get lots of random bit sets * and clears on the vma and pte pkey bits. 
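* alloc_random_pkey() does that by allocating every pkey it can, returning
* one picked at random, and freeing the rest.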
*/ -int alloc_random_pkey(void) +static int alloc_random_pkey(void) { int max_nr_pkey_allocs; int ret; @@ -629,13 +594,13 @@ struct pkey_malloc_record { }; struct pkey_malloc_record *pkey_malloc_records; struct pkey_malloc_record *pkey_last_malloc_record; -long nr_pkey_malloc_records; +static long nr_pkey_malloc_records; void record_pkey_malloc(void *ptr, long size, int prot) { - long i; + unsigned long i; struct pkey_malloc_record *rec = NULL; - for (i = 0; i < nr_pkey_malloc_records; i++) { + for (i = 0; i < (unsigned long)nr_pkey_malloc_records; i++) { rec = &pkey_malloc_records[i]; /* find a free record */ if (rec) @@ -667,7 +632,7 @@ void record_pkey_malloc(void *ptr, long size, int prot) nr_pkey_malloc_records++; } -void free_pkey_malloc(void *ptr) +static void free_pkey_malloc(void *ptr) { long i; int ret; @@ -694,8 +659,7 @@ void free_pkey_malloc(void *ptr) pkey_assert(false); } - -void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey) +static void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey) { void *ptr; int ret; @@ -715,7 +679,7 @@ void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey) return ptr; } -void *malloc_pkey_anon_huge(long size, int prot, u16 pkey) +static void *malloc_pkey_anon_huge(long size, int prot, u16 pkey) { int ret; void *ptr; @@ -745,10 +709,10 @@ void *malloc_pkey_anon_huge(long size, int prot, u16 pkey) return ptr; } -int hugetlb_setup_ok; +static int hugetlb_setup_ok; #define SYSFS_FMT_NR_HUGE_PAGES "/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages" #define GET_NR_HUGE_PAGES 10 -void setup_hugetlbfs(void) +static void setup_hugetlbfs(void) { int err; int fd; @@ -796,7 +760,7 @@ void setup_hugetlbfs(void) hugetlb_setup_ok = 1; } -void *malloc_pkey_hugetlb(long size, int prot, u16 pkey) +static void *malloc_pkey_hugetlb(long size, int prot, u16 pkey) { void *ptr; int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB; @@ -817,42 +781,15 @@ void *malloc_pkey_hugetlb(long size, int prot, u16 pkey) return ptr; } -void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey) -{ - void *ptr; - int fd; - - dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, - size, prot, pkey); - pkey_assert(pkey < NR_PKEYS); - fd = open("/dax/foo", O_RDWR); - pkey_assert(fd >= 0); - - ptr = mmap(0, size, prot, MAP_SHARED, fd, 0); - pkey_assert(ptr != (void *)-1); - - mprotect_pkey(ptr, size, prot, pkey); - - record_pkey_malloc(ptr, size, prot); - - dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr); - close(fd); - return ptr; -} - -void *(*pkey_malloc[])(long size, int prot, u16 pkey) = { +static void *(*pkey_malloc[])(long size, int prot, u16 pkey) = { malloc_pkey_with_mprotect, malloc_pkey_with_mprotect_subpage, malloc_pkey_anon_huge, malloc_pkey_hugetlb -/* can not do direct with the pkey_mprotect() API: - malloc_pkey_mmap_direct, - malloc_pkey_mmap_dax, -*/ }; -void *malloc_pkey(long size, int prot, u16 pkey) +static void *malloc_pkey(long size, int prot, u16 pkey) { void *ret; static int malloc_type; @@ -882,7 +819,7 @@ void *malloc_pkey(long size, int prot, u16 pkey) return ret; } -int last_pkey_faults; +static int last_pkey_faults; #define UNKNOWN_PKEY -2 void expected_pkey_fault(int pkey) { @@ -905,7 +842,7 @@ void expected_pkey_fault(int pkey) */ if (__read_pkey_reg() != 0) #elif defined(__aarch64__) - if (__read_pkey_reg() != PKEY_ALLOW_ALL) + if (__read_pkey_reg() != PKEY_REG_ALLOW_ALL) #else if (__read_pkey_reg() != shadow_pkey_reg) #endif /* arch */ @@ -924,24 +861,24 @@ void expected_pkey_fault(int pkey) 
pkey_assert(last_pkey_faults == pkey_faults); \ } while (0) -int test_fds[10] = { -1 }; -int nr_test_fds; -void __save_test_fd(int fd) +static int test_fds[10] = { -1 }; +static int nr_test_fds; +static void __save_test_fd(int fd) { pkey_assert(fd >= 0); - pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds)); + pkey_assert(nr_test_fds < (signed int)ARRAY_SIZE(test_fds)); test_fds[nr_test_fds] = fd; nr_test_fds++; } -int get_test_read_fd(void) +static int get_test_read_fd(void) { int test_fd = open("/etc/passwd", O_RDONLY); __save_test_fd(test_fd); return test_fd; } -void close_test_fds(void) +static void close_test_fds(void) { int i; @@ -954,13 +891,13 @@ void close_test_fds(void) nr_test_fds = 0; } -void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) +static void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) { int i, err; int max_nr_pkey_allocs; int alloced_pkeys[NR_PKEYS]; int nr_alloced = 0; - long size; + unsigned long size; pkey_assert(pkey_last_malloc_record); size = pkey_last_malloc_record->size; @@ -1006,7 +943,7 @@ void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) pkey_assert(!err); } -void test_read_of_write_disabled_region(int *ptr, u16 pkey) +static void test_read_of_write_disabled_region(int *ptr, u16 pkey) { int ptr_contents; @@ -1016,7 +953,7 @@ void test_read_of_write_disabled_region(int *ptr, u16 pkey) dprintf1("*ptr: %d\n", ptr_contents); dprintf1("\n"); } -void test_read_of_access_disabled_region(int *ptr, u16 pkey) +static void test_read_of_access_disabled_region(int *ptr, u16 pkey) { int ptr_contents; @@ -1028,7 +965,7 @@ void test_read_of_access_disabled_region(int *ptr, u16 pkey) expected_pkey_fault(pkey); } -void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr, +static void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr, u16 pkey) { int ptr_contents; @@ -1045,7 +982,7 @@ void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr, expected_pkey_fault(pkey); } -void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr, +static void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr, u16 pkey) { *ptr = __LINE__; @@ -1056,14 +993,14 @@ void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr, expected_pkey_fault(pkey); } -void test_write_of_write_disabled_region(int *ptr, u16 pkey) +static void test_write_of_write_disabled_region(int *ptr, u16 pkey) { dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey); pkey_write_deny(pkey); *ptr = __LINE__; expected_pkey_fault(pkey); } -void test_write_of_access_disabled_region(int *ptr, u16 pkey) +static void test_write_of_access_disabled_region(int *ptr, u16 pkey) { dprintf1("disabling access to PKEY[%02d], doing write\n", pkey); pkey_access_deny(pkey); @@ -1071,7 +1008,7 @@ void test_write_of_access_disabled_region(int *ptr, u16 pkey) expected_pkey_fault(pkey); } -void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr, +static void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr, u16 pkey) { *ptr = __LINE__; @@ -1082,7 +1019,7 @@ void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr, expected_pkey_fault(pkey); } -void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey) +static void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey) { int ret; int test_fd = get_test_read_fd(); @@ -1094,7 +1031,8 @@ void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey) dprintf1("read ret: %d\n", 
ret); pkey_assert(ret); } -void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey) + +static void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey) { int ret; int test_fd = get_test_read_fd(); @@ -1107,7 +1045,7 @@ void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey) pkey_assert(ret); } -void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey) +static void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey) { int pipe_ret, vmsplice_ret; struct iovec iov; @@ -1129,7 +1067,7 @@ void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey) close(pipe_fds[1]); } -void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey) +static void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey) { int ignored = 0xdada; int futex_ret; @@ -1147,7 +1085,7 @@ void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey) } /* Assumes that all pkeys other than 'pkey' are unallocated */ -void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey) +static void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey) { int err; int i; @@ -1170,7 +1108,7 @@ void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey) } /* Assumes that all pkeys other than 'pkey' are unallocated */ -void test_pkey_syscalls_bad_args(int *ptr, u16 pkey) +static void test_pkey_syscalls_bad_args(int *ptr, u16 pkey) { int err; int bad_pkey = NR_PKEYS+99; @@ -1180,7 +1118,7 @@ void test_pkey_syscalls_bad_args(int *ptr, u16 pkey) pkey_assert(err); } -void become_child(void) +static void become_child(void) { pid_t forkret; @@ -1196,7 +1134,7 @@ void become_child(void) } /* Assumes that all pkeys other than 'pkey' are unallocated */ -void test_pkey_alloc_exhaust(int *ptr, u16 pkey) +static void test_pkey_alloc_exhaust(int *ptr, u16 pkey) { int err; int allocated_pkeys[NR_PKEYS] = {0}; @@ -1263,7 +1201,7 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey) } } -void arch_force_pkey_reg_init(void) +static void arch_force_pkey_reg_init(void) { #if defined(__i386__) || defined(__x86_64__) /* arch */ u64 *buf; @@ -1302,7 +1240,7 @@ void arch_force_pkey_reg_init(void) * a long-running test that continually checks the pkey * register. */ -void test_pkey_init_state(int *ptr, u16 pkey) +static void test_pkey_init_state(int *ptr, u16 pkey) { int err; int allocated_pkeys[NR_PKEYS] = {0}; @@ -1340,9 +1278,9 @@ void test_pkey_init_state(int *ptr, u16 pkey) * have to call pkey_alloc() to use it first. Make sure that it * is usable. 
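* The test below moves the last allocation to pkey 0 and then back to its
* original key via mprotect_pkey().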
*/ -void test_mprotect_with_pkey_0(int *ptr, u16 pkey) +static void test_mprotect_with_pkey_0(int *ptr, u16 pkey) { - long size; + unsigned long size; int prot; assert(pkey_last_malloc_record); @@ -1364,7 +1302,7 @@ void test_mprotect_with_pkey_0(int *ptr, u16 pkey) mprotect_pkey(ptr, size, prot, pkey); } -void test_ptrace_of_child(int *ptr, u16 pkey) +static void test_ptrace_of_child(int *ptr, u16 pkey) { __attribute__((__unused__)) int peek_result; pid_t child_pid; @@ -1440,7 +1378,7 @@ void test_ptrace_of_child(int *ptr, u16 pkey) free(plain_ptr_unaligned); } -void *get_pointer_to_instructions(void) +static void *get_pointer_to_instructions(void) { void *p1; @@ -1461,7 +1399,7 @@ void *get_pointer_to_instructions(void) return p1; } -void test_executing_on_unreadable_memory(int *ptr, u16 pkey) +static void test_executing_on_unreadable_memory(int *ptr, u16 pkey) { void *p1; int scratch; @@ -1493,7 +1431,7 @@ void test_executing_on_unreadable_memory(int *ptr, u16 pkey) pkey_assert(!ret); } -void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) +static void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) { void *p1; int scratch; @@ -1542,7 +1480,7 @@ void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) } #if defined(__i386__) || defined(__x86_64__) -void test_ptrace_modifies_pkru(int *ptr, u16 pkey) +static void test_ptrace_modifies_pkru(int *ptr, u16 pkey) { u32 new_pkru; pid_t child; @@ -1590,7 +1528,7 @@ void test_ptrace_modifies_pkru(int *ptr, u16 pkey) pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); xsave = (void *)malloc(xsave_size); - pkey_assert(xsave > 0); + pkey_assert(xsave != NULL); /* Modify the PKRU register directly */ iov.iov_base = xsave; @@ -1665,7 +1603,7 @@ void test_ptrace_modifies_pkru(int *ptr, u16 pkey) #endif #if defined(__aarch64__) -void test_ptrace_modifies_pkru(int *ptr, u16 pkey) +static void test_ptrace_modifies_pkru(int *ptr, u16 pkey) { pid_t child; int status, ret; @@ -1742,7 +1680,7 @@ void test_ptrace_modifies_pkru(int *ptr, u16 pkey) } #endif -void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) +static void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) { int size = PAGE_SIZE; int sret; @@ -1756,7 +1694,7 @@ void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) pkey_assert(sret < 0); } -void (*pkey_tests[])(int *ptr, u16 pkey) = { +static void (*pkey_tests[])(int *ptr, u16 pkey) = { test_read_of_write_disabled_region, test_read_of_access_disabled_region, test_read_of_access_disabled_region_with_page_already_mapped, @@ -1782,12 +1720,12 @@ void (*pkey_tests[])(int *ptr, u16 pkey) = { #endif }; -void run_tests_once(void) +static void run_tests_once(void) { int *ptr; int prot = PROT_READ|PROT_WRITE; - for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) { + for (test_nr = 0; test_nr < (signed int)ARRAY_SIZE(pkey_tests); test_nr++) { int pkey; int orig_pkey_faults = pkey_faults; @@ -1816,7 +1754,7 @@ void run_tests_once(void) iteration_nr++; } -void pkey_setup_shadow(void) +static void pkey_setup_shadow(void) { shadow_pkey_reg = __read_pkey_reg(); } diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 2fc290d9430c..333c468c2699 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -45,6 +45,8 @@ separated by spaces: vmalloc smoke tests - hmm hmm smoke tests +- madv_guard + test madvise(2) MADV_GUARD_INSTALL and MADV_GUARD_REMOVE options - madv_populate test 
memadvise(2) MADV_POPULATE_{READ,WRITE} options - memfd_secret @@ -307,6 +309,7 @@ CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb "$half_ufd_size_MB" 3 CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb-private "$half_ufd_size_MB" 32 CATEGORY="userfaultfd" run_test ${uffd_stress_bin} shmem 20 16 CATEGORY="userfaultfd" run_test ${uffd_stress_bin} shmem-private 20 16 +CATEGORY="userfaultfd" run_test ./uffd-wp-mremap #cleanup echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages @@ -375,6 +378,9 @@ CATEGORY="mremap" run_test ./mremap_dontunmap CATEGORY="hmm" run_test bash ./test_hmm.sh smoke +# MADV_GUARD_INSTALL and MADV_GUARD_REMOVE tests +CATEGORY="madv_guard" run_test ./guard-pages + # MADV_POPULATE_READ and MADV_POPULATE_WRITE tests CATEGORY="madv_populate" run_test ./madv_populate diff --git a/tools/testing/selftests/mm/seal_elf.c b/tools/testing/selftests/mm/seal_elf.c deleted file mode 100644 index d9f8ba8d5050..000000000000 --- a/tools/testing/selftests/mm/seal_elf.c +++ /dev/null @@ -1,137 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include "../kselftest.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include "mseal_helpers.h" - -/* - * define sys_xyx to call syscall directly. - */ -static int sys_mseal(void *start, size_t len) -{ - int sret; - - errno = 0; - sret = syscall(__NR_mseal, start, len, 0); - return sret; -} - -static inline int sys_mprotect(void *ptr, size_t size, unsigned long prot) -{ - int sret; - - errno = 0; - sret = syscall(__NR_mprotect, ptr, size, prot); - return sret; -} - -static bool seal_support(void) -{ - int ret; - void *ptr; - unsigned long page_size = getpagesize(); - - ptr = mmap(NULL, page_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (ptr == (void *) -1) - return false; - - ret = sys_mseal(ptr, page_size); - if (ret < 0) - return false; - - return true; -} - -const char somestr[4096] = {"READONLY"}; - -static void test_seal_elf(void) -{ - int ret; - FILE *maps; - char line[512]; - uintptr_t addr_start, addr_end; - char prot[5]; - char filename[256]; - unsigned long page_size = getpagesize(); - unsigned long long ptr = (unsigned long long) somestr; - char *somestr2 = (char *)somestr; - - /* - * Modify the protection of readonly somestr - */ - if (((unsigned long long)ptr % page_size) != 0) - ptr = (unsigned long long)ptr & ~(page_size - 1); - - ksft_print_msg("somestr = %s\n", somestr); - ksft_print_msg("change protection to rw\n"); - ret = sys_mprotect((void *)ptr, page_size, PROT_READ|PROT_WRITE); - FAIL_TEST_IF_FALSE(!ret); - *somestr2 = 'A'; - ksft_print_msg("somestr is modified to: %s\n", somestr); - ret = sys_mprotect((void *)ptr, page_size, PROT_READ); - FAIL_TEST_IF_FALSE(!ret); - - maps = fopen("/proc/self/maps", "r"); - FAIL_TEST_IF_FALSE(maps); - - /* - * apply sealing to elf binary - */ - while (fgets(line, sizeof(line), maps)) { - if (sscanf(line, "%lx-%lx %4s %*x %*x:%*x %*u %255[^\n]", - &addr_start, &addr_end, prot, filename) == 4) { - if (strlen(filename)) { - /* - * seal the mapping if read only. 
- */ - if (strstr(prot, "r-")) { - ret = sys_mseal((void *)addr_start, addr_end - addr_start); - FAIL_TEST_IF_FALSE(!ret); - ksft_print_msg("sealed: %lx-%lx %s %s\n", - addr_start, addr_end, prot, filename); - if ((uintptr_t) somestr >= addr_start && - (uintptr_t) somestr <= addr_end) - ksft_print_msg("mapping for somestr found\n"); - } - } - } - } - fclose(maps); - - ret = sys_mprotect((void *)ptr, page_size, PROT_READ | PROT_WRITE); - FAIL_TEST_IF_FALSE(ret < 0); - ksft_print_msg("somestr is sealed, mprotect is rejected\n"); - - REPORT_TEST_PASS(); -} - -int main(int argc, char **argv) -{ - bool test_seal = seal_support(); - - ksft_print_header(); - ksft_print_msg("pid=%d\n", getpid()); - - if (!test_seal) - ksft_exit_skip("sealing not supported, check CONFIG_64BIT\n"); - - ksft_set_plan(1); - - test_seal_elf(); - - ksft_finished(); -} diff --git a/tools/testing/selftests/mm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c index bdfa5d085f00..68edb2475ccd 100644 --- a/tools/testing/selftests/mm/soft-dirty.c +++ b/tools/testing/selftests/mm/soft-dirty.c @@ -74,10 +74,10 @@ static void test_vma_reuse(int pagemap_fd, int pagesize) munmap(map2, pagesize); } -static void test_hugepage(int pagemap_fd, int pagesize) +static void test_hugepage(int pagemap_fd) { char *map; - int i, ret; + unsigned int i, ret; size_t hpage_len = read_pmd_pagesize(); if (!hpage_len) @@ -128,7 +128,7 @@ static void test_mprotect(int pagemap_fd, int pagesize, bool anon) { const char *type[] = {"file", "anon"}; const char *fname = "./soft-dirty-test-file"; - int test_fd; + int test_fd = 0; char *map; if (anon) { @@ -187,7 +187,7 @@ static void test_mprotect_file(int pagemap_fd, int pagesize) test_mprotect(pagemap_fd, pagesize, false); } -int main(int argc, char **argv) +int main(void) { int pagemap_fd; int pagesize; @@ -203,7 +203,7 @@ int main(int argc, char **argv) test_simple(pagemap_fd, pagesize); test_vma_reuse(pagemap_fd, pagesize); - test_hugepage(pagemap_fd, pagesize); + test_hugepage(pagemap_fd); test_mprotect_anon(pagemap_fd, pagesize); test_mprotect_file(pagemap_fd, pagesize); diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index eb6d1b9fc362..3d3bc40a268b 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -103,43 +103,33 @@ static char *allocate_zero_filled_hugepage(size_t len) return result; } -static void verify_rss_anon_split_huge_page_all_zeroes(char *one_page, int nr_hpages, size_t len) +static void verify_rss_anon_split_huge_page_all_zeroes(char *one_page, size_t len) { unsigned long rss_anon_before, rss_anon_after; size_t i; - if (!check_huge_anon(one_page, 4, pmd_pagesize)) { - printf("No THP is allocated\n"); - exit(EXIT_FAILURE); - } + if (!check_huge_anon(one_page, 4, pmd_pagesize)) + ksft_exit_fail_msg("No THP is allocated\n"); rss_anon_before = rss_anon(); - if (!rss_anon_before) { - printf("No RssAnon is allocated before split\n"); - exit(EXIT_FAILURE); - } + if (!rss_anon_before) + ksft_exit_fail_msg("No RssAnon is allocated before split\n"); /* split all THPs */ write_debugfs(PID_FMT, getpid(), (uint64_t)one_page, (uint64_t)one_page + len, 0); for (i = 0; i < len; i++) - if (one_page[i] != (char)0) { - printf("%ld byte corrupted\n", i); - exit(EXIT_FAILURE); - } + if (one_page[i] != (char)0) + ksft_exit_fail_msg("%ld byte corrupted\n", i); - if (!check_huge_anon(one_page, 0, pmd_pagesize)) { - printf("Still AnonHugePages not split\n"); - 
exit(EXIT_FAILURE); - } + if (!check_huge_anon(one_page, 0, pmd_pagesize)) + ksft_exit_fail_msg("Still AnonHugePages not split\n"); rss_anon_after = rss_anon(); - if (rss_anon_after >= rss_anon_before) { - printf("Incorrect RssAnon value. Before: %ld After: %ld\n", + if (rss_anon_after >= rss_anon_before) + ksft_exit_fail_msg("Incorrect RssAnon value. Before: %ld After: %ld\n", rss_anon_before, rss_anon_after); - exit(EXIT_FAILURE); - } } void split_pmd_zero_pages(void) @@ -149,12 +139,12 @@ void split_pmd_zero_pages(void) size_t len = nr_hpages * pmd_pagesize; one_page = allocate_zero_filled_hugepage(len); - verify_rss_anon_split_huge_page_all_zeroes(one_page, nr_hpages, len); - printf("Split zero filled huge pages successful\n"); + verify_rss_anon_split_huge_page_all_zeroes(one_page, len); + ksft_test_result_pass("Split zero filled huge pages successful\n"); free(one_page); } -void split_pmd_thp(void) +void split_pmd_thp_to_order(int order) { char *one_page; size_t len = 4 * pmd_pagesize; @@ -174,7 +164,7 @@ void split_pmd_thp(void) /* split all THPs */ write_debugfs(PID_FMT, getpid(), (uint64_t)one_page, - (uint64_t)one_page + len, 0); + (uint64_t)one_page + len, order); for (i = 0; i < len; i++) if (one_page[i] != (char)i) @@ -184,7 +174,7 @@ void split_pmd_thp(void) if (!check_huge_anon(one_page, 0, pmd_pagesize)) ksft_exit_fail_msg("Still AnonHugePages not split\n"); - ksft_test_result_pass("Split huge pages successful\n"); + ksft_test_result_pass("Split huge pages to order %d successful\n", order); free(one_page); } @@ -491,7 +481,7 @@ int main(int argc, char **argv) if (argc > 1) optional_xfs_path = argv[1]; - ksft_set_plan(3+9); + ksft_set_plan(1+9+2+9); pagesize = getpagesize(); pageshift = ffs(pagesize) - 1; @@ -502,7 +492,10 @@ int main(int argc, char **argv) fd_size = 2 * pmd_pagesize; split_pmd_zero_pages(); - split_pmd_thp(); + + for (i = 0; i < 9; i++) + split_pmd_thp_to_order(i); + split_pte_mapped_thp(); split_file_backed_thp(); diff --git a/tools/testing/selftests/mm/thp_settings.c b/tools/testing/selftests/mm/thp_settings.c index 577eaab6266f..ad872af1c81a 100644 --- a/tools/testing/selftests/mm/thp_settings.c +++ b/tools/testing/selftests/mm/thp_settings.c @@ -87,7 +87,7 @@ int write_file(const char *path, const char *buf, size_t buflen) return (unsigned int) numwritten; } -const unsigned long read_num(const char *path) +unsigned long read_num(const char *path) { char buf[21]; @@ -172,7 +172,7 @@ void thp_write_string(const char *name, const char *val) } } -const unsigned long thp_read_num(const char *name) +unsigned long thp_read_num(const char *name) { char path[PATH_MAX]; int ret; diff --git a/tools/testing/selftests/mm/thp_settings.h b/tools/testing/selftests/mm/thp_settings.h index 876235a23460..fc131d23d593 100644 --- a/tools/testing/selftests/mm/thp_settings.h +++ b/tools/testing/selftests/mm/thp_settings.h @@ -64,12 +64,12 @@ struct thp_settings { int read_file(const char *path, char *buf, size_t buflen); int write_file(const char *path, const char *buf, size_t buflen); -const unsigned long read_num(const char *path); +unsigned long read_num(const char *path); void write_num(const char *path, unsigned long num); int thp_read_string(const char *name, const char * const strings[]); void thp_write_string(const char *name, const char *val); -const unsigned long thp_read_num(const char *name); +unsigned long thp_read_num(const char *name); void thp_write_num(const char *name, unsigned long num); void thp_write_settings(struct thp_settings *settings); diff --git 
a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c index e4370b79b62f..515b89ac4eb5 100644 --- a/tools/testing/selftests/mm/thuge-gen.c +++ b/tools/testing/selftests/mm/thuge-gen.c @@ -65,7 +65,7 @@ void show(unsigned long ps) { char buf[100]; - if (ps == getpagesize()) + if ((signed long)ps == getpagesize()) return; ksft_print_msg("%luMB: ", ps >> 20); @@ -106,7 +106,7 @@ unsigned long read_sysfs(int warn, char *fmt, ...) unsigned long read_free(unsigned long ps) { - return read_sysfs(ps != getpagesize(), + return read_sysfs((signed long)ps != getpagesize(), "/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages", ps >> 10); } @@ -126,7 +126,7 @@ void test_mmap(unsigned long size, unsigned flags) after = read_free(size); show(size); - ksft_test_result(size == getpagesize() || (before - after) == NUM_PAGES, + ksft_test_result((signed long)size == getpagesize() || (before - after) == NUM_PAGES, "%s mmap\n", __func__); if (munmap(map, size * NUM_PAGES)) @@ -164,7 +164,7 @@ void test_shmget(unsigned long size, unsigned flags) after = read_free(size); show(size); - ksft_test_result(size == getpagesize() || (before - after) == NUM_PAGES, + ksft_test_result((signed long)size == getpagesize() || (before - after) == NUM_PAGES, "%s: mmap\n", __func__); if (shmdt(map)) ksft_exit_fail_msg("%s: shmdt: %s\n", __func__, strerror(errno)); @@ -173,7 +173,7 @@ void test_shmget(unsigned long size, unsigned flags) void find_pagesizes(void) { unsigned long largest = getpagesize(); - int i; + unsigned int i; glob_t g; glob("/sys/kernel/mm/hugepages/hugepages-*kB", 0, NULL, &g); diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c index 717539eddf98..47bdcb47481a 100644 --- a/tools/testing/selftests/mm/uffd-common.c +++ b/tools/testing/selftests/mm/uffd-common.c @@ -46,7 +46,7 @@ static void anon_release_pages(char *rel_area) err("madvise(MADV_DONTNEED) failed"); } -static int anon_allocate_area(void **alloc_area, bool is_src) +static int anon_allocate_area(void **alloc_area, bool __attribute__((unused)) is_src) { *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); @@ -57,7 +57,9 @@ static int anon_allocate_area(void **alloc_area, bool is_src) return 0; } -static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) +static void noop_alias_mapping(__u64 __attribute__((unused)) *start, + size_t __attribute__((unused)) len, + unsigned long __attribute__((unused)) offset) { } @@ -108,7 +110,8 @@ static int hugetlb_allocate_area(void **alloc_area, bool is_src) return 0; } -static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) +static void hugetlb_alias_mapping(__u64 *start, size_t __attribute__((unused)) len, + unsigned long offset) { if (!map_shared) return; @@ -167,12 +170,13 @@ static int shmem_allocate_area(void **alloc_area, bool is_src) return 0; } -static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) +static void shmem_alias_mapping(__u64 *start, size_t __attribute__((unused)) len, + unsigned long offset) { *start = (unsigned long)area_dst_alias + offset; } -static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages) +static void shmem_check_pmd_mapping(void __attribute__((unused)) *p, int expect_nr_hpages) { if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, read_pmd_pagesize())) @@ -416,7 +420,7 @@ static void continue_range(int ufd, __u64 start, __u64 len, bool wp) ret, (int64_t) 
req.mapped); } -int uffd_read_msg(int ufd, struct uffd_msg *msg) +int uffd_read_msg(struct uffd_msg *msg) { int ret = read(uffd, msg, sizeof(*msg)); @@ -446,7 +450,7 @@ void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args) args->wp_faults++; } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) { uint8_t *area; - int b; + unsigned int b; /* * Minor page faults @@ -537,7 +541,7 @@ void *uffd_poll_thread(void *arg) } if (!(pollfd[0].revents & POLLIN)) err("pollfd[0].revents %d", pollfd[0].revents); - if (uffd_read_msg(uffd, &msg)) + if (uffd_read_msg(&msg)) continue; switch (msg.event) { default: @@ -617,7 +621,7 @@ int __copy_page(int ufd, unsigned long offset, bool retry, bool wp) err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); wake_range(ufd, uffdio_copy.dst, page_size); - } else if (uffdio_copy.copy != page_size) { + } else if (uffdio_copy.copy != (signed long)page_size) { err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); } else { if (test_uffdio_copy_eexist && retry) { @@ -651,7 +655,7 @@ int move_page(int ufd, unsigned long offset, unsigned long len) err("UFFDIO_MOVE error: %"PRId64, (int64_t)uffdio_move.move); wake_range(ufd, uffdio_move.dst, len); - } else if (uffdio_move.move != len) { + } else if (uffdio_move.move != (signed long)len) { err("UFFDIO_MOVE error: %"PRId64, (int64_t)uffdio_move.move); } else return 1; diff --git a/tools/testing/selftests/mm/uffd-common.h b/tools/testing/selftests/mm/uffd-common.h index a70ae10b5f62..4a5d5b37107c 100644 --- a/tools/testing/selftests/mm/uffd-common.h +++ b/tools/testing/selftests/mm/uffd-common.h @@ -117,7 +117,7 @@ void uffd_stats_report(struct uffd_args *args, int n_cpus); int uffd_test_ctx_init(uint64_t features, const char **errmsg); void uffd_test_ctx_clear(void); int userfaultfd_open(uint64_t *features); -int uffd_read_msg(int ufd, struct uffd_msg *msg); +int uffd_read_msg(struct uffd_msg *msg); void wp_range(int ufd, __u64 start, __u64 len, bool wp); void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args); int __copy_page(int ufd, unsigned long offset, bool retry, bool wp); diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c index a4b83280998a..5509ec32c329 100644 --- a/tools/testing/selftests/mm/uffd-stress.c +++ b/tools/testing/selftests/mm/uffd-stress.c @@ -77,7 +77,7 @@ static void usage(void) static void uffd_stats_reset(struct uffd_args *args, unsigned long n_cpus) { - int i; + unsigned int i; for (i = 0; i < n_cpus; i++) { args[i].cpu = i; @@ -136,7 +136,7 @@ static void *uffd_read_thread(void *arg) /* from here cancellation is ok */ for (;;) { - if (uffd_read_msg(uffd, &msg)) + if (uffd_read_msg(&msg)) continue; uffd_handle_page_fault(&msg, args); } diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index a2e71b1636e7..c3d59ec75404 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -244,7 +244,7 @@ static void *fork_event_consumer(void *data) ready_for_fork = true; /* Read until a full msg received */ - while (uffd_read_msg(args->parent_uffd, &msg)); + while (uffd_read_msg(&msg)); if (msg.event != UFFD_EVENT_FORK) err("wrong message: %u\n", msg.event); @@ -357,7 +357,7 @@ static int pagemap_test_fork(int uffd, bool with_event, bool test_pin) return result; } -static void uffd_wp_unpopulated_test(uffd_test_args_t *args) +static void uffd_wp_unpopulated_test(uffd_test_args_t 
__attribute__((unused)) *args) { uint64_t value; int pagemap_fd; @@ -483,8 +483,7 @@ static void uffd_wp_fork_with_event_test(uffd_test_args_t *args) uffd_wp_fork_test_common(args, true); } -static void uffd_wp_fork_pin_test_common(uffd_test_args_t *args, - bool with_event) +static void uffd_wp_fork_pin_test_common(bool with_event) { int pagemap_fd; pin_args pin_args = {}; @@ -535,14 +534,14 @@ out: close(pagemap_fd); } -static void uffd_wp_fork_pin_test(uffd_test_args_t *args) +static void uffd_wp_fork_pin_test(uffd_test_args_t __attribute__((unused)) *args) { - uffd_wp_fork_pin_test_common(args, false); + uffd_wp_fork_pin_test_common(false); } -static void uffd_wp_fork_pin_with_event_test(uffd_test_args_t *args) +static void uffd_wp_fork_pin_with_event_test(uffd_test_args_t __attribute__((unused)) *args) { - uffd_wp_fork_pin_test_common(args, true); + uffd_wp_fork_pin_test_common(true); } static void check_memory_contents(char *p) @@ -627,24 +626,25 @@ static void uffd_minor_test_common(bool test_collapse, bool test_wp) uffd_test_pass(); } -void uffd_minor_test(uffd_test_args_t *args) +void uffd_minor_test(uffd_test_args_t __attribute__((unused)) *args) { uffd_minor_test_common(false, false); } -void uffd_minor_wp_test(uffd_test_args_t *args) +void uffd_minor_wp_test(uffd_test_args_t __attribute__((unused)) *args) { uffd_minor_test_common(false, true); } -void uffd_minor_collapse_test(uffd_test_args_t *args) +void uffd_minor_collapse_test(uffd_test_args_t __attribute__((unused)) *args) { uffd_minor_test_common(true, false); } static sigjmp_buf jbuf, *sigbuf; -static void sighndl(int sig, siginfo_t *siginfo, void *ptr) +static void sighndl(int sig, siginfo_t __attribute__((unused)) *siginfo, + void __attribute__((unused)) *ptr) { if (sig == SIGBUS) { if (sigbuf) @@ -820,12 +820,12 @@ static void uffd_sigbus_test_common(bool wp) uffd_test_pass(); } -static void uffd_sigbus_test(uffd_test_args_t *args) +static void uffd_sigbus_test(uffd_test_args_t __attribute__((unused)) *args) { uffd_sigbus_test_common(false); } -static void uffd_sigbus_wp_test(uffd_test_args_t *args) +static void uffd_sigbus_wp_test(uffd_test_args_t __attribute__((unused)) *args) { uffd_sigbus_test_common(true); } @@ -873,12 +873,12 @@ static void uffd_events_test_common(bool wp) uffd_test_pass(); } -static void uffd_events_test(uffd_test_args_t *args) +static void uffd_events_test(uffd_test_args_t __attribute__((unused)) *args) { uffd_events_test_common(false); } -static void uffd_events_wp_test(uffd_test_args_t *args) +static void uffd_events_wp_test(uffd_test_args_t __attribute__((unused)) *args) { uffd_events_test_common(true); } @@ -917,7 +917,7 @@ static bool do_uffdio_zeropage(int ufd, bool has_zeropage) else if (res != -EINVAL) err("UFFDIO_ZEROPAGE not -EINVAL"); } else if (has_zeropage) { - if (res != page_size) + if (res != (signed long)page_size) err("UFFDIO_ZEROPAGE unexpected size"); else retry_uffdio_zeropage(ufd, &uffdio_zeropage); @@ -946,10 +946,10 @@ uffd_register_detect_zeropage(int uffd, void *addr, uint64_t len) } /* exercise UFFDIO_ZEROPAGE */ -static void uffd_zeropage_test(uffd_test_args_t *args) +static void uffd_zeropage_test(uffd_test_args_t __attribute__((unused)) *args) { bool has_zeropage; - int i; + unsigned int i; has_zeropage = uffd_register_detect_zeropage(uffd, area_dst, page_size); if (area_dst_alias) @@ -997,12 +997,12 @@ static void do_uffdio_poison(int uffd, unsigned long offset) if (ret) err("UFFDIO_POISON error: %"PRId64, (int64_t)res); - else if (res != page_size) + else if (res 
!= (signed long)page_size) err("UFFDIO_POISON unexpected size: %"PRId64, (int64_t)res); } static void uffd_poison_handle_fault( - struct uffd_msg *msg, struct uffd_args *args) + struct uffd_msg *msg, struct uffd_args __attribute__((unused)) *args) { unsigned long offset; @@ -1023,7 +1023,7 @@ static void uffd_poison_handle_fault( do_uffdio_poison(uffd, offset); } -static void uffd_poison_test(uffd_test_args_t *targs) +static void uffd_poison_test(uffd_test_args_t __attribute__((unused)) *targs) { pthread_t uffd_mon; char c; @@ -1114,7 +1114,7 @@ static void uffd_move_pmd_handle_fault(struct uffd_msg *msg, } static void -uffd_move_test_common(uffd_test_args_t *targs, unsigned long chunk_size, +uffd_move_test_common(unsigned long chunk_size, void (*handle_fault)(struct uffd_msg *msg, struct uffd_args *args)) { unsigned long nr; @@ -1122,7 +1122,7 @@ uffd_move_test_common(uffd_test_args_t *targs, unsigned long chunk_size, char c; unsigned long long count; struct uffd_args args = { 0 }; - char *orig_area_src, *orig_area_dst; + char *orig_area_src = NULL, *orig_area_dst = NULL; unsigned long step_size, step_count; unsigned long src_offs = 0; unsigned long dst_offs = 0; @@ -1190,7 +1190,7 @@ uffd_move_test_common(uffd_test_args_t *targs, unsigned long chunk_size, nr, count, count_verify[src_offs + nr + i]); } } - if (step_size > page_size) { + if (chunk_size > page_size) { area_src = orig_area_src; area_dst = orig_area_dst; } @@ -1206,24 +1206,24 @@ uffd_move_test_common(uffd_test_args_t *targs, unsigned long chunk_size, uffd_test_pass(); } -static void uffd_move_test(uffd_test_args_t *targs) +static void uffd_move_test(uffd_test_args_t __attribute__((unused)) *targs) { - uffd_move_test_common(targs, page_size, uffd_move_handle_fault); + uffd_move_test_common(page_size, uffd_move_handle_fault); } -static void uffd_move_pmd_test(uffd_test_args_t *targs) +static void uffd_move_pmd_test(uffd_test_args_t __attribute__((unused)) *targs) { if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE)) err("madvise(MADV_HUGEPAGE) failure"); - uffd_move_test_common(targs, read_pmd_pagesize(), + uffd_move_test_common(read_pmd_pagesize(), uffd_move_pmd_handle_fault); } -static void uffd_move_pmd_split_test(uffd_test_args_t *targs) +static void uffd_move_pmd_split_test(uffd_test_args_t __attribute__((unused)) *targs) { if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE)) err("madvise(MADV_NOHUGEPAGE) failure"); - uffd_move_test_common(targs, read_pmd_pagesize(), + uffd_move_test_common(read_pmd_pagesize(), uffd_move_pmd_handle_fault); } diff --git a/tools/testing/selftests/mm/uffd-wp-mremap.c b/tools/testing/selftests/mm/uffd-wp-mremap.c new file mode 100644 index 000000000000..f548b1e1f197 --- /dev/null +++ b/tools/testing/selftests/mm/uffd-wp-mremap.c @@ -0,0 +1,380 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include "../kselftest.h" +#include "thp_settings.h" +#include "uffd-common.h" + +static int pagemap_fd; +static size_t pagesize; +static int nr_pagesizes = 1; +static int nr_thpsizes; +static size_t thpsizes[20]; +static int nr_hugetlbsizes; +static size_t hugetlbsizes[10]; + +static int sz2ord(size_t size) +{ + return __builtin_ctzll(size / pagesize); +} + +static int detect_thp_sizes(size_t sizes[], int max) +{ + int count = 0; + unsigned long orders; + size_t kb; + int i; + + /* thp not supported at all. 
*/ + if (!read_pmd_pagesize()) + return 0; + + orders = thp_supported_orders(); + + for (i = 0; orders && count < max; i++) { + if (!(orders & (1UL << i))) + continue; + orders &= ~(1UL << i); + kb = (pagesize >> 10) << i; + sizes[count++] = kb * 1024; + ksft_print_msg("[INFO] detected THP size: %zu KiB\n", kb); + } + + return count; +} + +static void *mmap_aligned(size_t size, int prot, int flags) +{ + size_t mmap_size = size * 2; + char *mmap_mem, *mem; + + mmap_mem = mmap(NULL, mmap_size, prot, flags, -1, 0); + if (mmap_mem == MAP_FAILED) + return mmap_mem; + + mem = (char *)(((uintptr_t)mmap_mem + size - 1) & ~(size - 1)); + munmap(mmap_mem, mem - mmap_mem); + munmap(mem + size, mmap_mem + mmap_size - mem - size); + + return mem; +} + +static void *alloc_one_folio(size_t size, bool private, bool hugetlb) +{ + bool thp = !hugetlb && size > pagesize; + int flags = MAP_ANONYMOUS; + int prot = PROT_READ | PROT_WRITE; + char *mem, *addr; + + assert((size & (size - 1)) == 0); + + if (private) + flags |= MAP_PRIVATE; + else + flags |= MAP_SHARED; + + /* + * For THP, we must explicitly enable the THP size, allocate twice the + * required space then manually align. + */ + if (thp) { + struct thp_settings settings = *thp_current_settings(); + + if (private) + settings.hugepages[sz2ord(size)].enabled = THP_ALWAYS; + else + settings.shmem_hugepages[sz2ord(size)].enabled = SHMEM_ALWAYS; + + thp_push_settings(&settings); + + mem = mmap_aligned(size, prot, flags); + } else { + if (hugetlb) { + flags |= MAP_HUGETLB; + flags |= __builtin_ctzll(size) << MAP_HUGE_SHIFT; + } + + mem = mmap(NULL, size, prot, flags, -1, 0); + } + + if (mem == MAP_FAILED) { + mem = NULL; + goto out; + } + + assert(((uintptr_t)mem & (size - 1)) == 0); + + /* + * Populate the folio by writing the first byte and check that all pages + * are populated. Finally set the whole thing to non-zero data to avoid + * kernel from mapping it back to the zero page. + */ + mem[0] = 1; + for (addr = mem; addr < mem + size; addr += pagesize) { + if (!pagemap_is_populated(pagemap_fd, addr)) { + munmap(mem, size); + mem = NULL; + goto out; + } + } + memset(mem, 1, size); +out: + if (thp) + thp_pop_settings(); + + return mem; +} + +static bool check_uffd_wp_state(void *mem, size_t size, bool expect) +{ + uint64_t pte; + void *addr; + + for (addr = mem; addr < mem + size; addr += pagesize) { + pte = pagemap_get_entry(pagemap_fd, addr); + if (!!(pte & PM_UFFD_WP) != expect) { + ksft_test_result_fail("uffd-wp not %s for pte %lu!\n", + expect ? "set" : "clear", + (addr - mem) / pagesize); + return false; + } + } + + return true; +} + +static bool range_is_swapped(void *addr, size_t size) +{ + for (; size; addr += pagesize, size -= pagesize) + if (!pagemap_is_swapped(pagemap_fd, addr)) + return false; + return true; +} + +static void test_one_folio(size_t size, bool private, bool swapout, bool hugetlb) +{ + struct uffdio_writeprotect wp_prms; + uint64_t features = 0; + void *addr = NULL; + void *mem = NULL; + + assert(!(hugetlb && swapout)); + + ksft_print_msg("[RUN] %s(size=%zu, private=%s, swapout=%s, hugetlb=%s)\n", + __func__, + size, + private ? "true" : "false", + swapout ? "true" : "false", + hugetlb ? "true" : "false"); + + /* Allocate a folio of required size and type. */ + mem = alloc_one_folio(size, private, hugetlb); + if (!mem) { + ksft_test_result_fail("alloc_one_folio() failed\n"); + goto out; + } + + /* Register range for uffd-wp. 
*/ + if (userfaultfd_open(&features)) { + ksft_test_result_fail("userfaultfd_open() failed\n"); + goto out; + } + if (uffd_register(uffd, mem, size, false, true, false)) { + ksft_test_result_fail("uffd_register() failed\n"); + goto out; + } + wp_prms.mode = UFFDIO_WRITEPROTECT_MODE_WP; + wp_prms.range.start = (uintptr_t)mem; + wp_prms.range.len = size; + if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp_prms)) { + ksft_test_result_fail("ioctl(UFFDIO_WRITEPROTECT) failed\n"); + goto out; + } + + if (swapout) { + madvise(mem, size, MADV_PAGEOUT); + if (!range_is_swapped(mem, size)) { + ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n"); + goto out; + } + } + + /* Check that uffd-wp is set for all PTEs in range. */ + if (!check_uffd_wp_state(mem, size, true)) + goto out; + + /* + * Move the mapping to a new, aligned location. Since + * UFFD_FEATURE_EVENT_REMAP is not set, we expect the uffd-wp bit for + * each PTE to be cleared in the new mapping. + */ + addr = mmap_aligned(size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS); + if (addr == MAP_FAILED) { + ksft_test_result_fail("mmap_aligned() failed\n"); + goto out; + } + if (mremap(mem, size, size, MREMAP_FIXED | MREMAP_MAYMOVE, addr) == MAP_FAILED) { + ksft_test_result_fail("mremap() failed\n"); + munmap(addr, size); + goto out; + } + mem = addr; + + /* Check that uffd-wp is cleared for all PTEs in range. */ + if (!check_uffd_wp_state(mem, size, false)) + goto out; + + ksft_test_result_pass("%s(size=%zu, private=%s, swapout=%s, hugetlb=%s)\n", + __func__, + size, + private ? "true" : "false", + swapout ? "true" : "false", + hugetlb ? "true" : "false"); +out: + if (mem) + munmap(mem, size); + if (uffd >= 0) { + close(uffd); + uffd = -1; + } +} + +struct testcase { + size_t *sizes; + int *nr_sizes; + bool private; + bool swapout; + bool hugetlb; +}; + +static const struct testcase testcases[] = { + /* base pages. */ + { + .sizes = &pagesize, + .nr_sizes = &nr_pagesizes, + .private = false, + .swapout = false, + .hugetlb = false, + }, + { + .sizes = &pagesize, + .nr_sizes = &nr_pagesizes, + .private = true, + .swapout = false, + .hugetlb = false, + }, + { + .sizes = &pagesize, + .nr_sizes = &nr_pagesizes, + .private = false, + .swapout = true, + .hugetlb = false, + }, + { + .sizes = &pagesize, + .nr_sizes = &nr_pagesizes, + .private = true, + .swapout = true, + .hugetlb = false, + }, + + /* thp. */ + { + .sizes = thpsizes, + .nr_sizes = &nr_thpsizes, + .private = false, + .swapout = false, + .hugetlb = false, + }, + { + .sizes = thpsizes, + .nr_sizes = &nr_thpsizes, + .private = true, + .swapout = false, + .hugetlb = false, + }, + { + .sizes = thpsizes, + .nr_sizes = &nr_thpsizes, + .private = false, + .swapout = true, + .hugetlb = false, + }, + { + .sizes = thpsizes, + .nr_sizes = &nr_thpsizes, + .private = true, + .swapout = true, + .hugetlb = false, + }, + + /* hugetlb. */ + { + .sizes = hugetlbsizes, + .nr_sizes = &nr_hugetlbsizes, + .private = false, + .swapout = false, + .hugetlb = true, + }, + { + .sizes = hugetlbsizes, + .nr_sizes = &nr_hugetlbsizes, + .private = true, + .swapout = false, + .hugetlb = true, + }, +}; + +int main(void) +{ + struct thp_settings settings; + int i, j, plan = 0; + + pagesize = getpagesize(); + nr_thpsizes = detect_thp_sizes(thpsizes, ARRAY_SIZE(thpsizes)); + nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes, + ARRAY_SIZE(hugetlbsizes)); + + /* If THP is supported, save THP settings and initially disable THP. 
*/ + if (nr_thpsizes) { + thp_save_settings(); + thp_read_settings(&settings); + for (i = 0; i < NR_ORDERS; i++) { + settings.hugepages[i].enabled = THP_NEVER; + settings.shmem_hugepages[i].enabled = SHMEM_NEVER; + } + thp_push_settings(&settings); + } + + for (i = 0; i < ARRAY_SIZE(testcases); i++) + plan += *testcases[i].nr_sizes; + ksft_set_plan(plan); + + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + if (pagemap_fd < 0) + ksft_exit_fail_msg("opening pagemap failed\n"); + + for (i = 0; i < ARRAY_SIZE(testcases); i++) { + const struct testcase *tc = &testcases[i]; + + for (j = 0; j < *tc->nr_sizes; j++) + test_one_folio(tc->sizes[j], tc->private, tc->swapout, + tc->hugetlb); + } + + /* If THP is supported, restore original THP settings. */ + if (nr_thpsizes) + thp_restore_settings(); + + i = ksft_get_fail_cnt(); + if (i) + ksft_exit_fail_msg("%d out of %d tests failed\n", + i, ksft_test_num()); + ksft_exit_pass(); +} diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c index 484f82c7b7c8..6e4269b9b54d 100644 --- a/tools/testing/selftests/mm/virtual_address_range.c +++ b/tools/testing/selftests/mm/virtual_address_range.c @@ -160,7 +160,7 @@ static int validate_complete_va_space(void) return 0; } -int main(int argc, char *argv[]) +int main(void) { char *ptr[NR_CHUNKS_LOW]; char **hptr; diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index d8d0cf04bb57..7519c9a892f0 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -138,7 +138,7 @@ void clear_softdirty(void) ksft_exit_fail_msg("opening clear_refs failed\n"); ret = write(fd, ctrl, strlen(ctrl)); close(fd); - if (ret != strlen(ctrl)) + if (ret != (signed int)strlen(ctrl)) ksft_exit_fail_msg("writing clear_refs failed\n"); } diff --git a/tools/testing/selftests/mm/write_to_hugetlbfs.c b/tools/testing/selftests/mm/write_to_hugetlbfs.c index 1289d311efd7..34c91f7e6128 100644 --- a/tools/testing/selftests/mm/write_to_hugetlbfs.c +++ b/tools/testing/selftests/mm/write_to_hugetlbfs.c @@ -89,7 +89,7 @@ int main(int argc, char **argv) size = atoi(optarg); break; case 'p': - strncpy(path, optarg, sizeof(path)); + strncpy(path, optarg, sizeof(path) - 1); break; case 'm': if (atoi(optarg) >= MAX_METHOD) { diff --git a/tools/testing/vma/linux/atomic.h b/tools/testing/vma/linux/atomic.h index 3e1b6adc027b..788c597c4fde 100644 --- a/tools/testing/vma/linux/atomic.h +++ b/tools/testing/vma/linux/atomic.h @@ -9,4 +9,9 @@ #define atomic_set(x, y) uatomic_set(x, y) #define U8_MAX UCHAR_MAX +#ifndef atomic_cmpxchg_relaxed +#define atomic_cmpxchg_relaxed uatomic_cmpxchg +#define atomic_cmpxchg_release uatomic_cmpxchg +#endif /* atomic_cmpxchg_relaxed */ + #endif /* _LINUX_ATOMIC_H */ diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index 8fab5e13c7c3..04ab45e27fb8 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -18,6 +18,12 @@ static bool fail_prealloc; #define vma_iter_prealloc(vmi, vma) \ (fail_prealloc ? -ENOMEM : mas_preallocate(&(vmi)->mas, (vma), GFP_KERNEL)) +#define CONFIG_DEFAULT_MMAP_MIN_ADDR 65536 + +unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; +unsigned long dac_mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; +unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT; [...] - vma->vm_lock_seq = -1; + vma->vm_lock_seq = UINT_MAX; return vma; } @@ -214,7 +225,7 @@ static bool vma_write_started(struct vm_area_struct *vma) int seq = vma->vm_lock_seq; /* We reset after each check.
*/ - vma->vm_lock_seq = -1; + vma->vm_lock_seq = UINT_MAX; /* The vma_start_write() stub simply increments this value. */ return seq > -1; @@ -1563,6 +1574,57 @@ static bool test_expand_only_mode(void) return true; } +static bool test_mmap_region_basic(void) +{ + struct mm_struct mm = {}; + unsigned long addr; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, &mm, 0); + + current->mm = &mm; + + /* Map at 0x300000, length 0x3000. */ + addr = __mmap_region(NULL, 0x300000, 0x3000, + VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, + 0x300, NULL); + ASSERT_EQ(addr, 0x300000); + + /* Map at 0x250000, length 0x3000. */ + addr = __mmap_region(NULL, 0x250000, 0x3000, + VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, + 0x250, NULL); + ASSERT_EQ(addr, 0x250000); + + /* Map at 0x303000, merging to 0x300000 of length 0x6000. */ + addr = __mmap_region(NULL, 0x303000, 0x3000, + VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, + 0x303, NULL); + ASSERT_EQ(addr, 0x303000); + + /* Map at 0x24d000, merging to 0x250000 of length 0x6000. */ + addr = __mmap_region(NULL, 0x24d000, 0x3000, + VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, + 0x24d, NULL); + ASSERT_EQ(addr, 0x24d000); + + ASSERT_EQ(mm.map_count, 2); + + for_each_vma(vmi, vma) { + if (vma->vm_start == 0x300000) { + ASSERT_EQ(vma->vm_end, 0x306000); + ASSERT_EQ(vma->vm_pgoff, 0x300); + } else if (vma->vm_start == 0x24d000) { + ASSERT_EQ(vma->vm_end, 0x253000); + ASSERT_EQ(vma->vm_pgoff, 0x24d); + } else { + ASSERT_FALSE(true); + } + } + + cleanup_mm(&mm, &vmi); + return true; +} + int main(void) { int num_tests = 0, num_fail = 0; @@ -1596,6 +1658,8 @@ int main(void) TEST(copy_vma); TEST(expand_only_mode); + TEST(mmap_region_basic); + #undef TEST printf("%d tests run, %d passed, %d failed.\n", diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index e76ff579e1fd..49a85ce0d45a 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -25,13 +25,24 @@ #include #include #include -#include +#include + +extern unsigned long stack_guard_gap; +#ifdef CONFIG_MMU +extern unsigned long mmap_min_addr; +extern unsigned long dac_mmap_min_addr; +#else +#define mmap_min_addr 0UL +#define dac_mmap_min_addr 0UL +#endif #define VM_WARN_ON(_expr) (WARN_ON(_expr)) #define VM_WARN_ON_ONCE(_expr) (WARN_ON_ONCE(_expr)) #define VM_BUG_ON(_expr) (BUG_ON(_expr)) #define VM_BUG_ON_VMA(_expr, _vma) (BUG_ON(_expr)) +#define MMF_HAS_MDWE 28 + #define VM_NONE 0x00000000 #define VM_READ 0x00000001 #define VM_WRITE 0x00000002 @@ -39,6 +50,7 @@ #define VM_SHARED 0x00000008 #define VM_MAYREAD 0x00000010 #define VM_MAYWRITE 0x00000020 +#define VM_MAYEXEC 0x00000040 #define VM_GROWSDOWN 0x00000100 #define VM_PFNMAP 0x00000400 #define VM_LOCKED 0x00002000 @@ -51,6 +63,8 @@ #define VM_STACK VM_GROWSDOWN #define VM_SHADOW_STACK VM_NONE #define VM_SOFTDIRTY 0 +#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ +#define VM_GROWSUP VM_NONE #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) @@ -58,6 +72,20 @@ /* This mask represents all the VMA flag bits used by mlock */ #define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) +#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? 
VM_EXEC : 0) + +#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) + +#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC + +#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) + +#define RLIMIT_STACK 3 /* max stack size */ +#define RLIMIT_MEMLOCK 8 /* max locked-in-memory address space */ + +#define CAP_IPC_LOCK 14 + #ifdef CONFIG_64BIT /* VM is sealed, in vm_flags */ #define VM_SEALED _BITUL(63) @@ -106,10 +134,6 @@ typedef __bitwise unsigned int vm_fault_t; */ #define pr_warn_once pr_err -typedef struct refcount_struct { - atomic_t refs; -} refcount_t; - struct kref { refcount_t refcount; }; @@ -122,10 +146,22 @@ enum { TASK_COMM_LEN = 16, }; +/* + * Flags for bug emulation. + * + * These occupy the top three bytes. + */ +enum { + READ_IMPLIES_EXEC = 0x0400000, +}; + struct task_struct { char comm[TASK_COMM_LEN]; pid_t pid; struct mm_struct *mm; + + /* Used for emulating ABI behavior of previous Linux versions: */ + unsigned int personality; }; struct task_struct *get_current(void); @@ -186,17 +222,18 @@ struct mm_struct { unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ unsigned long stack_vm; /* VM_STACK */ -}; -struct vma_lock { - struct rw_semaphore lock; -}; + unsigned long def_flags; + unsigned long flags; /* Must use atomic bitops to access */ +}; struct file { struct address_space *f_mapping; }; +#define VMA_LOCK_OFFSET 0x40000000 + struct vm_area_struct { /* The first cache line has the info for VMA tree walking. */ @@ -224,16 +261,13 @@ struct vm_area_struct { }; #ifdef CONFIG_PER_VMA_LOCK - /* Flag to indicate areas detached from the mm->mm_mt tree */ - bool detached; - /* * Can only be written (using WRITE_ONCE()) while holding both: * - mmap_lock (in write mode) - * - vm_lock->lock (in write mode) + * - vm_refcnt bit at VMA_LOCK_OFFSET is set * Can be read reliably while holding one of: * - mmap_lock (in read or write mode) - * - vm_lock->lock (in read or write mode) + * - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout * while holding nothing (except RCU to keep the VMA struct allocated). * @@ -241,8 +275,7 @@ struct vm_area_struct { * counter reuse can only lead to occasional unnecessary use of the * slowpath. */ - int vm_lock_seq; - struct vma_lock *vm_lock; + unsigned int vm_lock_seq; #endif /* @@ -295,6 +328,10 @@ struct vm_area_struct { struct vma_numab_state *numab_state; /* NUMA Balancing state */ #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; +#ifdef CONFIG_PER_VMA_LOCK + /* Unstable RCU readers are allowed to read this. 
*/ + refcount_t vm_refcnt; +#endif } __randomize_layout; struct vm_fault {}; @@ -373,6 +410,17 @@ struct vm_operations_struct { unsigned long addr); }; +struct vm_unmapped_area_info { +#define VM_UNMAPPED_AREA_TOPDOWN 1 + unsigned long flags; + unsigned long length; + unsigned long low_limit; + unsigned long high_limit; + unsigned long align_mask; + unsigned long align_offset; + unsigned long start_gap; +}; + static inline void vma_iter_invalidate(struct vma_iterator *vmi) { mas_pause(&vmi->mas); @@ -408,37 +456,54 @@ static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) return mas_find(&vmi->mas, ULONG_MAX); } -static inline bool vma_lock_alloc(struct vm_area_struct *vma) +/* + * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these + * assertions should be made either under mmap_write_lock or when the object + * has been isolated under mmap_write_lock, ensuring no competing writers. + */ +static inline void vma_assert_attached(struct vm_area_struct *vma) { - vma->vm_lock = calloc(1, sizeof(struct vma_lock)); + VM_BUG_ON_VMA(!refcount_read(&vma->vm_refcnt), vma); +} - if (!vma->vm_lock) - return false; - - init_rwsem(&vma->vm_lock->lock); - vma->vm_lock_seq = -1; - - return true; +static inline void vma_assert_detached(struct vm_area_struct *vma) +{ + VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt), vma); } static inline void vma_assert_write_locked(struct vm_area_struct *); -static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) +static inline void vma_mark_attached(struct vm_area_struct *vma) { - /* When detaching vma should be write-locked */ - if (detached) - vma_assert_write_locked(vma); - vma->detached = detached; + vma_assert_write_locked(vma); + vma_assert_detached(vma); + refcount_set(&vma->vm_refcnt, 1); +} + +static inline void vma_mark_detached(struct vm_area_struct *vma) +{ + vma_assert_write_locked(vma); + vma_assert_attached(vma); + + /* We are the only writer, so no need to use vma_refcount_put(). */ + if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { + /* + * Reader must have temporarily raised vm_refcnt but it will + * drop it without using the vma since vma is write-locked. 
+ */ + } } extern const struct vm_operations_struct vma_dummy_vm_ops; +extern unsigned long rlimit(unsigned int limit); + static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); - vma_mark_detached(vma, false); + vma->vm_lock_seq = UINT_MAX; } static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) @@ -449,10 +514,6 @@ static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) return NULL; vma_init(vma, mm); - if (!vma_lock_alloc(vma)) { - free(vma); - return NULL; - } return vma; } @@ -465,10 +526,8 @@ static inline struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) return NULL; memcpy(new, orig, sizeof(*new)); - if (!vma_lock_alloc(new)) { - free(new); - return NULL; - } + refcount_set(&new->vm_refcnt, 0); + new->vm_lock_seq = UINT_MAX; INIT_LIST_HEAD(&new->anon_vma_chain); return new; @@ -638,20 +697,9 @@ static inline void mpol_put(struct mempolicy *) { } -static inline void vma_lock_free(struct vm_area_struct *vma) -{ - free(vma->vm_lock); -} - -static inline void __vm_area_free(struct vm_area_struct *vma) -{ - vma_lock_free(vma); - free(vma); -} - static inline void vm_area_free(struct vm_area_struct *vma) { - __vm_area_free(vma); + free(vma); } static inline void lru_add_drain(void) @@ -853,6 +901,11 @@ static inline void mmap_write_unlock(struct mm_struct *) { } +static inline int mmap_write_lock_killable(struct mm_struct *) +{ + return 0; +} + static inline bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end) @@ -938,7 +991,7 @@ static inline bool is_file_hugepages(struct file *) static inline int security_vm_enough_memory_mm(struct mm_struct *, long) { - return true; + return 0; } static inline bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long) @@ -1033,4 +1086,159 @@ static inline int mmap_file(struct file *, struct vm_area_struct *) return 0; } +static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_GROWSDOWN) + return stack_guard_gap; + + /* See reasoning around the VM_SHADOW_STACK definition */ + if (vma->vm_flags & VM_SHADOW_STACK) + return PAGE_SIZE; + + return 0; +} + +static inline unsigned long vm_start_gap(struct vm_area_struct *vma) +{ + unsigned long gap = stack_guard_start_gap(vma); + unsigned long vm_start = vma->vm_start; + + vm_start -= gap; + if (vm_start > vma->vm_start) + vm_start = 0; + return vm_start; +} + +static inline unsigned long vm_end_gap(struct vm_area_struct *vma) +{ + unsigned long vm_end = vma->vm_end; + + if (vma->vm_flags & VM_GROWSUP) { + vm_end += stack_guard_gap; + if (vm_end < vma->vm_end) + vm_end = -PAGE_SIZE; + } + return vm_end; +} + +static inline int is_hugepage_only_range(struct mm_struct *mm, + unsigned long addr, unsigned long len) +{ + return 0; +} + +static inline bool vma_is_accessible(struct vm_area_struct *vma) +{ + return vma->vm_flags & VM_ACCESS_FLAGS; +} + +static inline bool capable(int cap) +{ + return true; +} + +static inline bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, + unsigned long bytes) +{ + unsigned long locked_pages, limit_pages; + + if (!(flags & VM_LOCKED) || capable(CAP_IPC_LOCK)) + return true; + + locked_pages = bytes >> PAGE_SHIFT; + locked_pages += mm->locked_vm; + + limit_pages = rlimit(RLIMIT_MEMLOCK); + limit_pages >>= PAGE_SHIFT; + + return locked_pages <= limit_pages; +} + +static inline int 
__anon_vma_prepare(struct vm_area_struct *vma) +{ + struct anon_vma *anon_vma = calloc(1, sizeof(struct anon_vma)); + + if (!anon_vma) + return -ENOMEM; + + anon_vma->root = anon_vma; + vma->anon_vma = anon_vma; + + return 0; +} + +static inline int anon_vma_prepare(struct vm_area_struct *vma) +{ + if (likely(vma->anon_vma)) + return 0; + + return __anon_vma_prepare(vma); +} + +static inline void userfaultfd_unmap_complete(struct mm_struct *mm, + struct list_head *uf) +{ +} + +/* + * Denies creating a writable executable mapping or gaining executable permissions. + * + * This denies the following: + * + * a) mmap(PROT_WRITE | PROT_EXEC) + * + * b) mmap(PROT_WRITE) + * mprotect(PROT_EXEC) + * + * c) mmap(PROT_WRITE) + * mprotect(PROT_READ) + * mprotect(PROT_EXEC) + * + * But allows the following: + * + * d) mmap(PROT_READ | PROT_EXEC) + * mmap(PROT_READ | PROT_EXEC | PROT_BTI) + * + * This is only applicable if the user has set the Memory-Deny-Write-Execute + * (MDWE) protection mask for the current process. + * + * @old specifies the VMA flags the VMA originally possessed, and @new the ones + * we propose to set. + * + * Return: false if proposed change is OK, true if not ok and should be denied. + */ +static inline bool map_deny_write_exec(unsigned long old, unsigned long new) +{ + /* If MDWE is disabled, we have nothing to deny. */ + if (!test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) + return false; + + /* If the new VMA is not executable, we have nothing to deny. */ + if (!(new & VM_EXEC)) + return false; + + /* Under MDWE we do not accept newly writably executable VMAs... */ + if (new & VM_WRITE) + return true; + + /* ...nor previously non-executable VMAs becoming executable. */ + if (!(old & VM_EXEC)) + return true; + + return false; +} + +static inline int mapping_map_writable(struct address_space *mapping) +{ + int c = atomic_read(&mapping->i_mmap_writable); + + /* Derived from the raw_atomic_inc_unless_negative() implementation. */ + do { + if (c < 0) + return -EPERM; + } while (!__sync_bool_compare_and_swap(&mapping->i_mmap_writable, c, c+1)); + + return 0; +} + #endif /* __MM_VMA_INTERNAL_H */
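
A few notes on the interfaces the tests above exercise follow; the sketches below are illustrative commentary, not part of the diff, and any helper names they introduce are hypothetical.

The split_pmd_thp_to_order() loop drives the split_huge_pages debugfs file; on x86-64 with 4 KiB base pages a PMD maps 2^9 pages, which is why the test sweeps new orders 0 through 8. A minimal stand-alone version of that write, assuming the "<pid>,<vaddr_start>,<vaddr_end>[,<new_order>]" format described in Documentation/admin-guide/mm/transhuge.rst, CONFIG_DEBUG_FS, and root privileges:

	#include <stdio.h>

	/*
	 * Ask the kernel to split THPs mapped in [start, end) of process
	 * <pid> down to <new_order>; returns 0 on success, -1 otherwise.
	 */
	int split_huge_pages_in_range(int pid, unsigned long start,
				      unsigned long end, int new_order)
	{
		FILE *f = fopen("/sys/kernel/debug/split_huge_pages", "w");
		int ret = -1;

		if (!f)
			return ret;
		if (fprintf(f, "%d,0x%lx,0x%lx,%d",
			    pid, start, end, new_order) > 0)
			ret = 0;
		fclose(f);
		return ret;
	}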
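The (signed long) casts added in thuge-gen.c and uffd-common.c, like the int to unsigned int loop-counter changes, read as compiler-warning cleanups: getpagesize() returns a plain int, so comparing it against an unsigned long size trips -Wsign-compare. A tiny sketch of the pattern being silenced (the motivation is presumed; the helper is hypothetical):

	#include <unistd.h>

	/*
	 * int vs. unsigned long comparison warns under -Wsign-compare;
	 * casting the size to signed long makes both operands signed.
	 */
	int is_base_page_size(unsigned long ps)
	{
		return (signed long)ps == getpagesize();
	}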
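alloc_one_folio() in the new uffd-wp-mremap.c leans on mmap_aligned(), which uses the classic over-allocate-and-trim idiom: map twice the required size, round up to the first aligned address inside that window, then unmap the slack on both sides. A stand-alone version of the same idiom (with explicit guards against zero-length munmap(), which the in-tree helper skips):

	#include <stdint.h>
	#include <stdio.h>
	#include <sys/mman.h>

	/* Return a size-aligned anonymous mapping of `size` bytes;
	 * `size` must be a power of two. */
	void *mmap_aligned_demo(size_t size)
	{
		size_t mmap_size = size * 2;
		char *mmap_mem, *mem;

		mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (mmap_mem == MAP_FAILED)
			return NULL;

		/* First size-aligned address inside the doubled window. */
		mem = (char *)(((uintptr_t)mmap_mem + size - 1) & ~(size - 1));

		/* Trim the unaligned head and the leftover tail. */
		if (mem != mmap_mem)
			munmap(mmap_mem, mem - mmap_mem);
		if (mem + size != mmap_mem + mmap_size)
			munmap(mem + size, mmap_mem + mmap_size - (mem + size));

		return mem;
	}

	int main(void)
	{
		printf("2MiB-aligned mapping at %p\n",
		       mmap_aligned_demo(2UL << 20));
		return 0;
	}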
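check_uffd_wp_state() verifies the per-PTE uffd-wp bit that the kernel exports through /proc/<pid>/pagemap, one 64-bit entry per page. A self-contained sketch of that probe, assuming the bit layout in Documentation/admin-guide/mm/pagemap.rst (uffd-wp is bit 57 there); the helper name is illustrative:

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	#define PM_ENTRY_BYTES	8
	#define PM_UFFD_WP_BIT	57	/* per pagemap.rst */

	/* Return 1 if the page backing `addr` is uffd-wp'd, 0 if not,
	 * -1 on read failure. */
	int page_is_uffd_wp(int pagemap_fd, void *addr)
	{
		uint64_t ent;
		size_t pagesize = getpagesize();
		off_t off = (uintptr_t)addr / pagesize * PM_ENTRY_BYTES;

		if (pread(pagemap_fd, &ent, sizeof(ent), off) != sizeof(ent))
			return -1;
		return (ent >> PM_UFFD_WP_BIT) & 1;
	}

	int main(void)
	{
		int fd = open("/proc/self/pagemap", O_RDONLY);
		int dummy = 0;

		printf("uffd-wp(&dummy) = %d\n", page_is_uffd_wp(fd, &dummy));
		close(fd);
		return 0;
	}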
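The stack_guard_gap stub added to vma.c defaults to 256 pages, matching the kernel's usual default. vm_start_gap() subtracts that gap from a VM_GROWSDOWN vma's start (clamping to 0 on underflow), while VM_SHADOW_STACK vmas get a single guard page and everything else gets none. A worked example assuming 4 KiB pages:

	#include <stdio.h>

	int main(void)
	{
		unsigned long page_shift = 12;		   /* assume 4 KiB pages */
		unsigned long gap = 256UL << page_shift;   /* default stack_guard_gap */
		unsigned long vm_start = 0x7f0000200000UL; /* a VM_GROWSDOWN vma */

		/* prints gap 0x100000 (1 MiB), effective start 0x7f0000100000 */
		printf("guard gap %#lx, effective start %#lx\n",
		       gap, vm_start - gap);
		return 0;
	}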
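map_deny_write_exec() encodes the Memory-Deny-Write-Execute matrix spelled out in its comment: deny anything that ends up writable and executable, deny previously non-executable memory gaining exec, allow exec-to-exec transitions. A few spot checks, assuming this is compiled against the harness's vma_internal.h stubs with MMF_HAS_MDWE set on current->mm (a usage sketch, not part of the test suite):

	#include <assert.h>

	void mdwe_spot_checks(void)
	{
		/* a) mmap(PROT_WRITE | PROT_EXEC): denied outright. */
		assert(map_deny_write_exec(VM_NONE, VM_WRITE | VM_EXEC));
		/* b)/c) non-executable memory gaining exec: denied. */
		assert(map_deny_write_exec(VM_READ | VM_WRITE,
					   VM_READ | VM_EXEC));
		/* d) exec-to-exec transition (e.g. adding PROT_BTI): allowed. */
		assert(!map_deny_write_exec(VM_READ | VM_EXEC,
					    VM_READ | VM_EXEC));
		/* Mappings that never become executable are not MDWE's concern. */
		assert(!map_deny_write_exec(VM_NONE, VM_READ | VM_WRITE));
	}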